## Creating Chroma Vectorstore

In [7]:
from configs import API_KEY, DEFAULT_MODEL
from langchain_community.document_loaders import Docx2txtLoader
from langchain_text_splitters.markdown import MarkdownHeaderTextSplitter
from langchain_text_splitters.character import CharacterTextSplitter
from langchain_openai.embeddings import OpenAIEmbeddings
import os
import openai
import numpy as np
from langchain_community.vectorstores import Chroma

In [2]:
# Publish the key through the process environment (presumably where the
# LangChain OpenAI wrappers look for it -- confirm), then hand the same value
# to the openai module.
os.environ["OPENAI_API_KEY"] = API_KEY
openai.api_key = os.environ["OPENAI_API_KEY"]

In [3]:
# Stage 1: read the course transcript from the Word document. Only the first
# loaded document is used below.
loader_docx = Docx2txtLoader("data/Introduction_to_Data_and_Data_Science_1.docx")
pages_docx = loader_docx.load()

# Stage 2: cut the text at markdown headers. The second element of each pair is
# the metadata key under which the matching header text is stored.
markdown_header_text_splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=[
        ('#', 'Course Title'),
        ('##', 'Lecture Title'),
    ]
)
pages_markdown_split = markdown_header_text_splitter.split_text(pages_docx[0].page_content)

# Stage 3: collapse every run of whitespace (including newlines) in each
# section into a single space. The documents are modified in place.
for section in pages_markdown_split:
    section.page_content = ' '.join(section.page_content.split())

# Stage 4: cut each section into chunks of at most 500 characters with a
# 50-character overlap. The empty separator lets a chunk boundary fall at any
# character, so chunks may start or end mid-word.
character_splitter = CharacterTextSplitter(
    separator='',
    chunk_size=500,
    chunk_overlap=50,
)
pages_md_char_split = character_splitter.split_documents(pages_markdown_split)

In [4]:
# Embedding client shared by the cells that build and reload the vector store.
embedding = OpenAIEmbeddings(
    model='text-embedding-ada-002',
)

In [5]:
# Number of chunks produced by the header split followed by the character split.
len(pages_md_char_split)

19

In [9]:
# Build the persisted Chroma store idempotently.
#
# Chroma.from_documents adds to whatever collection already lives in
# persist_directory, so calling it on every run of this cell would re-embed all
# chunks (a paid API call) and store them again as duplicates. Instead, open
# the persisted store and embed the chunks only when it is still empty.
#
# NOTE: an already-populated store is reused as-is. Delete the directory to
# force a rebuild after changing the source document or the splitter settings.
vectorstore = Chroma(persist_directory = "./into-to-DS-vectorstore",
                     embedding_function = embedding)

# limit = 1 is enough to tell an empty collection from a populated one.
if not vectorstore.get(limit = 1)["ids"]:
    vectorstore.add_documents(documents = pages_md_char_split)

In [None]:
# Re-open the store persisted on disk. No documents are passed here, only the
# directory and the embedding function.
vectorstore_from_dir = Chroma(
    persist_directory = "./into-to-DS-vectorstore",
    embedding_function = embedding,
)

In [11]:
# Metadata of the first chunk; its keys are the header labels passed to
# MarkdownHeaderTextSplitter ('Course Title', 'Lecture Title').
pages_md_char_split[0].metadata

{'Course Title': 'Introduction to Data and Data Science',
 'Lecture Title': 'Analysis vs Analytics'}

In [13]:
# Text of the last chunk.
pages_md_char_split[-1].page_content

'retical preparation is strong enough, you will find yourself restricted by software. Knowing a programming language such as R and Python, gives you the freedom to create specific, ad-hoc tools for each project you are working on. Great! We hope we gave you a good idea about the level of applicability of the most frequently used programming and software tools in the field of data science. Thank you for watching!'

In [15]:
# Same chunk addressed by explicit index: with the 19 chunks counted earlier,
# index 18 is the last one. The hard-coded index breaks if the chunk count changes.
pages_md_char_split[18].page_content

'retical preparation is strong enough, you will find yourself restricted by software. Knowing a programming language such as R and Python, gives you the freedom to create specific, ad-hoc tools for each project you are working on. Great! We hope we gave you a good idea about the level of applicability of the most frequently used programming and software tools in the field of data science. Thank you for watching!'