In [1]:
!pip install langchain chromadb openai tiktoken pypdf langchain_openai langchain-community langchain-huggingface huggingface-hub

INFO: pip is looking at multiple versions of langchain-community to determine which version is compatible with other requirements. This could take a while.
Collecting langchain-community
  Using cached langchain_community-0.4-py3-none-any.whl.metadata (3.0 kB)
  Using cached langchain_community-0.3.31-py3-none-any.whl.metadata (3.0 kB)
Using cached langchain_community-0.3.31-py3-none-any.whl (2.5 MB)
Installing collected packages: langchain-community
  Attempting uninstall: langchain-community
    Found existing installation: langchain-community 0.4.1
    Uninstalling langchain-community-0.4.1:
      Successfully uninstalled langchain-community-0.4.1
Successfully installed langchain-community-0.3.31


ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
langchain-experimental 0.4.1 requires langchain-community<1.0.0,>=0.4.0, but you have langchain-community 0.3.31 which is incompatible.
langchain-experimental 0.4.1 requires langchain-core<2.0.0,>=1.0.0, but you have langchain-core 0.3.80 which is incompatible.

[notice] A new release of pip is available: 25.1.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from langchain.schema import Document

# create Langchain document for ipl players

doc1= Document(
  page_content="Rohit Sharma is one of the most successful players in IPL history. He is known for his elegant batting style and excellent leadership, having led Mumbai Indians to multiple IPL titles.",
  metadata={"team":"Mumbai Indians (MI)"}
)
doc2= Document(
  page_content="MS Dhoni is famous for his calm nature and sharp decision-making. He is a legendary captain and wicketkeeper who has played a major role in Chennai Super Kings’ success over the years.",
  metadata={"team": "Chennai Super Kings (CSK)"}
)
doc3= Document(
  page_content="Virat Kohli is known for his aggressive batting and consistency. He is one of the highest run-scorers in IPL history and a key player for Royal Challengers Bengaluru.",
  metadata={"team": "Royal Challengers Bengaluru (RCB)"}
)
doc4= Document(
  page_content="Andre Russell is a powerful all-rounder known for his explosive batting and fast bowling. He can change the match quickly and is one of KKR’s most impactful players.",
  metadata={"team": "Kolkata Knight Riders (KKR)"}
)
doc5= Document(
  page_content="Sanju Samson is known for his stylish batting and wicketkeeping skills. As a captain and batter, he plays an important role in leading Rajasthan Royals and supporting young talent.",
  metadata={"team": "Rajasthan Royals (RR)"}
)

In [3]:
docs= [doc1, doc2, doc3, doc4, doc5]

embeddings= HuggingFaceEmbeddings(model_name="BAAI/bge-small-en-v1.5")

vector_store= Chroma(
  embedding_function= embeddings,
  persist_directory= 'chroma_db', 
  collection_name='sample'
)

# persist_directory means at which location you want to store this database
# as knew we make collection in chroma_db (go to about chrma pic) so for that we provide collection_name

  vector_store= Chroma(


In [4]:
## now to see how to add documents in vector database
# so using add_documents we can add docs (any number of doc) and also nice thing is that
# it assigns a unique id to each doc

vector_store.add_documents(docs)

['133b6a3e-d6ff-43e4-ba16-9b5c3e0bbd0c',
 'b0196e1d-0b31-4548-b910-5ba543d01e90',
 '7eb53694-376e-4444-890d-ca85f6d2dfc7',
 '58e5f761-c6bc-4a87-b59d-6f17ecad4be4',
 '03ec6884-98c9-463f-a580-0503994f43a3']

In [5]:
# if you want to see how many documents in our vector db so for that we use **get** fn
# and inside get just provide what you would like to see --> eg embedding, documents, metadatas

vector_store.get(include=['embeddings', 'documents', 'metadatas'])

{'ids': ['133b6a3e-d6ff-43e4-ba16-9b5c3e0bbd0c',
  'b0196e1d-0b31-4548-b910-5ba543d01e90',
  '7eb53694-376e-4444-890d-ca85f6d2dfc7',
  '58e5f761-c6bc-4a87-b59d-6f17ecad4be4',
  '03ec6884-98c9-463f-a580-0503994f43a3'],
 'embeddings': array([[-0.01868994,  0.00857278, -0.00871579, ..., -0.0093652 ,
          0.04480161, -0.01004053],
        [-0.04946323,  0.07057964, -0.01378085, ...,  0.01947097,
          0.0332321 , -0.00038092],
        [-0.01638   ,  0.0457838 , -0.05210069, ..., -0.02686616,
          0.01718367, -0.00215131],
        [-0.04834907,  0.03546508, -0.02737281, ..., -0.00107106,
          0.05774703,  0.04262868],
        [-0.02141571,  0.05108638, -0.01203347, ..., -0.03016481,
          0.09216356,  0.03206361]], shape=(5, 384)),
 'documents': ['Rohit Sharma is one of the most successful players in IPL history. He is known for his elegant batting style and excellent leadership, having led Mumbai Indians to multiple IPL titles.',
  'MS Dhoni is famous for his calm na

In [6]:
## search documents -->for that we have a fn called similarity_search

vector_store.similarity_search(
  query= "Who among these are a bowler?",
  k=2
) 

# k means how many similar documents or objects you want to show 


[Document(metadata={'team': 'Kolkata Knight Riders (KKR)'}, page_content='Andre Russell is a powerful all-rounder known for his explosive batting and fast bowling. He can change the match quickly and is one of KKR’s most impactful players.'),
 Document(metadata={'team': 'Royal Challengers Bengaluru (RCB)'}, page_content='Virat Kohli is known for his aggressive batting and consistency. He is one of the highest run-scorers in IPL history and a key player for Royal Challengers Bengaluru.')]

In [None]:
## if you also want to see similarity score

vector_store.similarity_search_with_score(
  query="Who among these are a bowler?",
  k=2
)

## low score indicate good similarity and high bad similary bz it is distance bw query and these docs


[(Document(metadata={'team': 'Kolkata Knight Riders (KKR)'}, page_content='Andre Russell is a powerful all-rounder known for his explosive batting and fast bowling. He can change the match quickly and is one of KKR’s most impactful players.'),
  0.7028247117996216),
 (Document(metadata={'team': 'Royal Challengers Bengaluru (RCB)'}, page_content='Virat Kohli is known for his aggressive batting and consistency. He is one of the highest run-scorers in IPL history and a key player for Royal Challengers Bengaluru.'),
  0.7636445760726929)]

In [None]:
## if you want you can filter on meta data
## like if you want to know how many players are from Chennai Super Kings (CSK)


vector_store.similarity_search_with_score(
  query="",
  filter={"team":"Chennai Super Kings (CSK)"}
)

# it will return all related meta data docs 

[(Document(metadata={'team': 'Chennai Super Kings (CSK)'}, page_content='MS Dhoni is famous for his calm nature and sharp decision-making. He is a legendary captain and wicketkeeper who has played a major role in Chennai Super Kings’ success over the years.'),
  1.0746990442276)]

In [13]:
## now if you want to update the database or document

# this is basically new doc
updated_doc1= Document(
  page_content="Virat Kohli is known for his fitness and discipline, and he has inspired many cricketers to focus on physical training. He is also admired for his strong work ethic and leadership mindset.",
  metadata={"team":"Royal Challengers Bengaluru (RCB)"}
)

# now i want to update exsiting one by this one

vector_store.update_document(document_id="7eb53694-376e-4444-890d-ca85f6d2dfc7", document=updated_doc1)



In [14]:
## now try to is it updated or not

vector_store.get(include=['embeddings', 'documents', 'metadatas'])

{'ids': ['133b6a3e-d6ff-43e4-ba16-9b5c3e0bbd0c',
  'b0196e1d-0b31-4548-b910-5ba543d01e90',
  '7eb53694-376e-4444-890d-ca85f6d2dfc7',
  '58e5f761-c6bc-4a87-b59d-6f17ecad4be4',
  '03ec6884-98c9-463f-a580-0503994f43a3'],
 'embeddings': array([[-0.01868994,  0.00857278, -0.00871579, ..., -0.0093652 ,
          0.04480161, -0.01004053],
        [-0.04946323,  0.07057964, -0.01378085, ...,  0.01947097,
          0.0332321 , -0.00038092],
        [ 0.00511134,  0.09163809, -0.02952226, ...,  0.00293893,
          0.0321112 ,  0.02601057],
        [-0.04834907,  0.03546508, -0.02737281, ..., -0.00107106,
          0.05774703,  0.04262868],
        [-0.02141571,  0.05108638, -0.01203347, ..., -0.03016481,
          0.09216356,  0.03206361]], shape=(5, 384)),
 'documents': ['Rohit Sharma is one of the most successful players in IPL history. He is known for his elegant batting style and excellent leadership, having led Mumbai Indians to multiple IPL titles.',
  'MS Dhoni is famous for his calm na

In [15]:
## delete document

vector_store.delete(ids=["03ec6884-98c9-463f-a580-0503994f43a3"])


In [16]:
vector_store.get(include=['embeddings', 'documents', 'metadatas'])

{'ids': ['133b6a3e-d6ff-43e4-ba16-9b5c3e0bbd0c',
  'b0196e1d-0b31-4548-b910-5ba543d01e90',
  '7eb53694-376e-4444-890d-ca85f6d2dfc7',
  '58e5f761-c6bc-4a87-b59d-6f17ecad4be4'],
 'embeddings': array([[-0.01868994,  0.00857278, -0.00871579, ..., -0.0093652 ,
          0.04480161, -0.01004053],
        [-0.04946323,  0.07057964, -0.01378085, ...,  0.01947097,
          0.0332321 , -0.00038092],
        [ 0.00511134,  0.09163809, -0.02952226, ...,  0.00293893,
          0.0321112 ,  0.02601057],
        [-0.04834907,  0.03546508, -0.02737281, ..., -0.00107106,
          0.05774703,  0.04262868]], shape=(4, 384)),
 'documents': ['Rohit Sharma is one of the most successful players in IPL history. He is known for his elegant batting style and excellent leadership, having led Mumbai Indians to multiple IPL titles.',
  'MS Dhoni is famous for his calm nature and sharp decision-making. He is a legendary captain and wicketkeeper who has played a major role in Chennai Super Kings’ success over the 