In [None]:
'''
Save a .txt file as a Chroma vector database, with embeddings generated by all-MiniLM-L6-v2.
'''
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.vectorstores import Chroma

In [None]:
'''
Base name of the source text, without the extension.
history.txt: The Beginner's American History by D. H. Montgomery
'''
file_name: str = "history"

In [None]:
'''
Build the two helpers used to prepare the text.

text_splitter: configured with a chunk size of 500 characters and an overlap of
100 characters between neighbouring chunks, so that valuable information sitting
on a chunk boundary is not split between two chunks.

embeddings_model: sentence-transformers model used to embed the chunks.
Embeddings Model: https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2
'''
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)

embeddings_model = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

In [None]:
'''
Read the source text and create LangChain documents for the chunks as they are split.
Since it seems faster to load data into a ChromaDB piecewise, split
the list of docs into a list of lists with at most 1000 docs each.

Produces:
    source      - the full text of <file_name>.txt
    docs        - every document created by text_splitter from the source text
    docs_len    - the total number of documents (printed for reference)
    docs_split  - docs cut into consecutive sublists of sublist_length documents
                  (the last sublist may be shorter)
'''
# Pass the encoding explicitly: without it, open() falls back to the platform's
# default encoding, so the same file can decode differently (or fail) per machine.
# NOTE(review): assumes the .txt file is UTF-8 encoded - confirm for your source file.
with open(file_name + '.txt', 'r', encoding='utf-8') as source_file:
    source = source_file.read()

docs = text_splitter.create_documents([source])
docs_len = len(docs)
print(docs_len)
sublist_length = 1000
docs_split = [docs[i:i + sublist_length] for i in range(0, docs_len, sublist_length)]

In [None]:
'''
Load the docs into the vectorstore and persist it (create files
which can be accessed later instead of keeping it all in memory).
The store is written to the directory <file_name>_db.

The store is created from the first batch of docs, and every later batch is
added to that same store. Progress is printed after each batch has been loaded.

Note that if this code is run again, it'll add to the existing vectorstore instead of replacing it,
which can create duplicates or unintentionally retain old data. To avoid that within one kernel
session, the load is skipped when a `vectordb` variable already exists.
If you want to create a different vectorstore after creating one, either remove the existing one or
change the name for the new one. It is also smart to restart the python kernel to clear
your old vectorstore from your memory.
'''
completed_docs = 0
if 'vectordb' in globals():
    # Make the skip visible instead of silently doing nothing.
    print('vectordb already exists in this kernel; skipping load.')
else:
    # doc_batch (not `docs`) so the full `docs` list from the previous cell is not overwritten.
    for batch_index, doc_batch in enumerate(docs_split):
        if batch_index == 0:
            # The first batch creates the store (and the vectordb variable).
            vectordb = Chroma.from_documents(
                documents=doc_batch,
                embedding=embeddings_model,
                persist_directory=file_name + '_db',
            )
        else:
            # Later batches go into the same store rather than building a new one each time.
            vectordb.add_documents(doc_batch)
        completed_docs += len(doc_batch)
        print(str(completed_docs) + ' / ' + str(docs_len))
    # NOTE(review): depending on the installed chromadb version, an explicit
    # vectordb.persist() may be needed to flush the store to disk - confirm.