In [3]:
# General import
import os
import shutil

# 3rd-party import
from langchain_community.document_loaders import DirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma

# Local import

In [4]:
# Relative path of the directory that load_doc() scans for "*.md" files.
DATA_PATH = "../data"

def load_doc():
    """Load the files matching ``*.md`` under ``DATA_PATH``.

    Returns:
        The result of ``DirectoryLoader.load()`` for the matched files.
    """
    return DirectoryLoader(DATA_PATH, glob="*.md").load()

In [None]:
def split_txt(
    doc: list[Document],
    chunk_size: int = 1000,
    chunk_overlap: int = 500,
):
    """Split documents into overlapping chunks and print a short summary.

    Args:
        doc: Documents to split.
        chunk_size: Value passed to the splitter as ``chunk_size``; lengths
            are measured with ``len``.
        chunk_overlap: Value passed to the splitter as ``chunk_overlap``.

    Returns:
        The chunks returned by ``split_documents``.
    """
    txt_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        add_start_index=True,
    )

    chunks = txt_splitter.split_documents(doc)
    print(f"Splitted {len(doc)} documents into {len(chunks)} chunks.")

    # Preview a single chunk as a sanity check. Index 3 is used when it
    # exists; with fewer chunks the last one is shown, and with none the
    # preview is skipped instead of raising IndexError.
    if chunks:
        sample = chunks[min(3, len(chunks) - 1)]
        print(sample.page_content)
        print(sample.metadata)

    return chunks

In [None]:
# Directory used for the persisted Chroma store; save_to_chroma() removes it if it already exists.
CHROMA_PATH = "chroma"

def save_to_chroma(chunks: list[Document]):
    """Rebuild the Chroma store at ``CHROMA_PATH`` from ``chunks``.

    Any existing directory at ``CHROMA_PATH`` is deleted first, so the store
    only ever contains the chunks passed to this call.

    Args:
        chunks: Documents to embed and store.

    Returns:
        The ``Chroma`` instance created by ``Chroma.from_documents``.
    """
    # Remove the previous store so entries from an earlier run do not linger.
    if os.path.exists(CHROMA_PATH):
        shutil.rmtree(CHROMA_PATH)

    # This call must live inside the function: at cell level `chunks` is not
    # defined, and the function would otherwise only delete the directory.
    # NOTE(review): OpenAIEmbeddings() presumably needs OpenAI credentials
    # configured in the environment -- confirm before running.
    db = Chroma.from_documents(
        chunks, OpenAIEmbeddings(), persist_directory=CHROMA_PATH
    )
    return db


In [None]:
def generate_data_score():
    """Run the pipeline: load documents, split them, then save the chunks to Chroma."""
    save_to_chroma(split_txt(load_doc()))


In [None]:
def main():
    """Entry point: delegates to ``generate_data_score()``."""
    generate_data_score()