## Indexing: Creating a Chroma Vectorstore (Storing)

In [2]:
# Load the `dotenv` IPython extension, then invoke its `%dotenv` line magic.
# NOTE(review): presumably this reads a local .env file to supply the OpenAI
# API key needed by the embeddings cell further down — confirm.
%load_ext dotenv
%dotenv

In [3]:
from langchain_community.document_loaders import Docx2txtLoader
from langchain_community.vectorstores import Chroma
from langchain_text_splitters.markdown import MarkdownHeaderTextSplitter
from langchain_text_splitters.character import CharacterTextSplitter
from langchain_openai.embeddings import OpenAIEmbeddings
import numpy as np

In [4]:
# Read the Word document; only the first loaded Document is used below.
loader_docx = Docx2txtLoader("Introduction_to_Data_and_Data_Science_2.docx")
pages = loader_docx.load()

# Pass 1: split on "#" and "##" markdown headers, labelled
# "Course Title" and "Lecture Title" respectively.
md_splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=[
        ("#", "Course Title"),
        ("##", "Lecture Title"),
    ]
)
pages_md_split = md_splitter.split_text(pages[0].page_content)

# Normalise whitespace in place: every run of whitespace (newlines included)
# collapses to a single space.
for section in pages_md_split:
    section.page_content = " ".join(section.page_content.split())

# Pass 2: split each header section further on "." using
# chunk_size=500 and chunk_overlap=50.
char_splitter = CharacterTextSplitter(
    separator=".",
    chunk_size=500,
    chunk_overlap=50,
)
pages_char_split = char_splitter.split_documents(pages_md_split)

In [5]:
# Number of chunks produced by the character-level split (shown as the cell output).
len(pages_char_split)

20

In [6]:
# Embedding client used both to build the vectorstore and to reopen it later.
embedding = OpenAIEmbeddings(
    model="text-embedding-ada-002",
)

In [7]:
# Embed the chunks and build a Chroma vectorstore backed by the given directory.
# NOTE(review): re-running this cell against an existing directory likely adds
# the same chunks again (duplicates) — confirm, and clear the directory or skip
# this cell when the store already exists.
vectorstore = Chroma.from_documents(
    documents=pages_char_split,
    embedding=embedding,
    persist_directory="./intro-to-ds-lectures",
)

In [None]:
# Open a Chroma vectorstore on the same directory, using the same embedding
# object as its embedding function.
vectorstore_from_directory = Chroma(
    persist_directory="./intro-to-ds-lectures",
    embedding_function=embedding,
)