In [1]:
import os
import shutil

from langchain.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores.chroma import Chroma
from langchain.embeddings import HuggingFaceEmbeddings

In [2]:
# Notebook-wide configuration; edit these rather than the cells below.

# Model name handed to HuggingFaceEmbeddings when embedding the chunks.
EMBEDDING_MODEL = "sentence-transformers/all-mpnet-base-v2"

# Directory where the Chroma vector store is persisted.
CHROMA_PATH = "chroma"

# Directory scanned by DirectoryLoader for the source documents.
DATA_PATH = "documents/books"

In [3]:
# Load every file directly under DATA_PATH whose name matches *.txt.
loader = DirectoryLoader(path=DATA_PATH, glob="*.txt")
docs = loader.load()

In [4]:
# How many documents were loaded into `docs`.
len(docs)

8

In [5]:
# Split the loaded documents into overlapping chunks for embedding.
# NOTE(review): 2000 characters per chunk may exceed the embedding model's
# maximum input length, in which case the tail of each chunk would be dropped
# at embedding time -- confirm against the model card before tuning.
splitter_options = {
    "chunk_size": 2000,       # upper bound on chunk length
    "chunk_overlap": 100,     # overlap shared by neighbouring chunks
    "length_function": len,   # lengths are measured with len(), i.e. in characters
    "add_start_index": True,  # also record where each chunk starts
}
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)
chunks = text_splitter.split_documents(docs)

In [6]:
# How many chunks the splitter produced from `docs`.
len(chunks)

7042

In [7]:
# Peek at a single chunk (index 10) to sanity-check the split.
chunks[10]

Document(page_content='Thus go they over through the umber’d wave, And ever they on the opposing bank Be landed, on this side another throng Still gathers. “Son,” thus spake the courteous guide, “Those, who die subject to the wrath of God, All here together come from every clime, And to o’erpass the river are not loth: For so heaven’s justice goads them on, that fear Is turn’d into desire. Hence ne’er hath past Good spirit. If of thee Charon complain, Now mayst thou know the import of his words.”\n\nThis said, the gloomy region trembling shook So terribly, that yet with clammy dews Fear chills my brow. The sad earth gave a blast, That, lightening, shot forth a vermilion flame, Which all my senses conquer’d quite, and I Down dropp’d, as one with sudden slumber seiz’d.\n\nCANTO IV\n\nBroke the deep slumber in my brain a crash Of heavy thunder, that I shook myself, As one by main force rous’d. Risen upright, My rested eyes I mov’d around, and search’d With fixed ken to know what place it 

In [8]:
# Build the Chroma vector store once and reuse it on later runs.
#
# `embeddings` and `db` are defined on every execution path, so cells that
# query the store still work after Restart & Run All when CHROMA_PATH already
# exists. Previously both names were only bound on the very first run.
embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)

if os.path.exists(CHROMA_PATH):
    # Reopen the store persisted by an earlier run instead of re-embedding
    # every chunk. Delete the CHROMA_PATH directory to force a rebuild, e.g.
    # after changing the documents, the splitter settings or EMBEDDING_MODEL.
    db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embeddings)
else:
    # First run: embed all chunks and write the store to disk.
    db = Chroma.from_documents(
        chunks, embeddings, persist_directory=CHROMA_PATH
    )
    db.persist()