In [3]:
from langchain_chroma import Chroma
from langchain_community.document_loaders import TextLoader
from langchain_community.embeddings import OllamaEmbeddings 
from langchain_text_splitters import RecursiveCharacterTextSplitter 

In [4]:
# Read speech.txt (path is relative to the working directory); load() yields a
# list of Document objects, which is echoed below for inspection.
loader = TextLoader(file_path="speech.txt")
data = loader.load()
data

[Document(metadata={'source': 'speech.txt'}, page_content='My parents impressed on me the value of that you work hard for what you want in life. \nThat your word is your bond and you do what you say and keep your promise. \nThat you treat people with respect. \nShow the values and morals in in the daily life. \nThat is the lesson that we continue to pass on to our son.\nWe need to pass those lessons on to the many generations to follow. \n[Cheering] Because we want our children in these nations to know that the only limit to your achievement is the strength of your dreams and your willingness to work for them.\nNow is the winter of our discontent\n  Made glorious summer by this sun of York;\n  And all the clouds that lour\'d upon our house\n  In the deep bosom of the ocean buried.\n  Now are our brows bound with victorious wreaths;\n  Our bruised arms hung up for monuments;\n  Our stern alarums changed to merry meetings,\n  Our dreadful marches to delightful measures.\n  Grim-visaged w

In [5]:
# Cut the loaded documents into chunks using chunk_size=500 with no overlap
# between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=0,
    chunk_size=500,
)
split = text_splitter.split_documents(documents=data)

In [8]:
# Ollama embedding client configured for the "nomic-embed-text" model.
# NOTE(review): temperature is a sampling option and presumably has no effect
# on embedding vectors — confirm whether model_kwargs is needed here.
embedding = OllamaEmbeddings(
    model="nomic-embed-text",
    model_kwargs={"temperature": 0.1},
)

In [11]:
# Embed every chunk and index it in a Chroma store (no persist_directory, so
# nothing is written to disk by this cell).
# NOTE(review): re-running this cell in the same kernel may append the chunks
# again to the same default collection — verify if search results show duplicates.
vector_db = Chroma.from_documents(split, embedding)


In [12]:
# Echo the store object to confirm it was created (only its repr is shown).
vector_db

<langchain_chroma.vectorstores.Chroma at 0x1204e7f70>

In [14]:
# Ask a question, fetch the three closest chunks from the store, and display
# the text of the top-ranked one.
query = "What is the main topic of the speech?"
docs = vector_db.similarity_search(query=query, k=3)
docs[0].page_content

'🛠️ When to Use What?\nUse Case\tchunk_size\tchunk_overlap\nTiny docs\tsmall (100–200)\t10–20\nLong speeches/books\tbig (500–1000)\t50–100\nQA over documents\tmedium (300–600)\t50\nSentence-sensitive tasks\tsmall\tmedium\n\n🧁 TL;DR – Key Takeaways\nchunk_size = how big each text piece is\n\nchunk_overlap = how much text is shared between chunks\n\nSmall chunks: easier to process, but may lose context\n\nBig chunks: preserve meaning, but take more memory\n\nOverlap: preserves flow between chunks'

In [15]:
# Build the index again, this time persisted on disk under ./chroma_db.
# Stable, position-based ids make this cell idempotent: entries are written by
# id, so a re-run overwrites the existing chunks instead of appending the same
# chunks again under fresh random ids (which would crowd top-k results with
# duplicates).
# NOTE(review): if the number of chunks shrinks between runs, ids beyond the new
# length stay in ./chroma_db — delete the directory to rebuild from scratch.
chunk_ids = [f"speech-chunk-{i}" for i in range(len(split))]
vector_db = Chroma.from_documents(
    documents=split,
    embedding=embedding,
    ids=chunk_ids,
    persist_directory="./chroma_db",
)