In [1]:
from langchain_chroma import Chroma
from langchain_ollama import OllamaEmbeddings
from langchain_core.documents import Document

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Sample corpus for the demo: each (text, metadata) pair below becomes one
# Document, in the order listed.
documents = [
    Document(page_content=text, metadata=meta)
    for text, meta in (
        (
            "Zinedine Zidane was a legendary midfielder who played for France and won the 1998 FIFA World Cup.",
            {"topic": "football", "country": "France"},
        ),
        (
            "Thierry Henry is France's all-time leading goal scorer and a key figure in international football.",
            {"topic": "football", "country": "France"},
        ),
        (
            "Manuel Neuer revolutionized the role of the goalkeeper with his sweeper-keeper style for Germany.",
            {"topic": "football", "country": "Germany"},
        ),
        (
            "Lionel Messi led Argentina to victory in the 2022 FIFA World Cup and is regarded as one of the greatest players ever.",
            {"topic": "football", "country": "Argentina"},
        ),
        (
            "Machine learning enables computers to learn patterns from data without being explicitly programmed.",
            {"topic": "machine_learning"},
        ),
        (
            "Neural networks are inspired by the human brain and consist of layers of interconnected neurons.",
            {"topic": "deep_learning"},
        ),
        (
            "Transformers use self-attention mechanisms to process sequences in parallel, improving NLP performance.",
            {"topic": "nlp", "model": "transformer"},
        ),
        (
            "FAISS is a library developed by Meta for efficient similarity search and clustering of dense vectors.",
            {"topic": "vector_database"},
        ),
        (
            "Pinecone is a managed vector database designed for scalable similarity search in production systems.",
            {"topic": "vector_database"},
        ),
        (
            "Retrieval-Augmented Generation combines information retrieval with language models to produce grounded answers.",
            {"topic": "rag"},
        ),
    )
]

In [3]:
# Embedding model: the "all-minilm" model, accessed through Ollama.
embedding_model = OllamaEmbeddings(
    model="all-minilm"
)

# Give every document a stable, position-based ID. Without explicit IDs each
# run of this cell stores the documents again under new random IDs, so the
# persisted collection fills with duplicates and the retriever starts
# returning the same text several times. With fixed IDs a re-run overwrites
# the existing entries instead.
# NOTE: copies stored by earlier runs (under random IDs) stay in
# "dummy_docs_db"; delete that directory once to start from a clean collection.
document_ids = [f"sample-{index}" for index in range(len(documents))]

# Build a Chroma vector store persisted on disk and index the documents in it.
vector_store = Chroma.from_documents(
    documents=documents,
    embedding=embedding_model,
    ids=document_ids,
    persist_directory="dummy_docs_db",
    collection_name="sample"
)

In [4]:
# Expose the vector store through the retriever interface; each query
# returns the 2 most similar documents.
retriever = vector_store.as_retriever(
    search_kwargs={"k": 2},
)

In [6]:
# Send a user query to the retriever, which searches the vector store
# and returns the top-k most similar documents.
query = "Which library is developed by Meta?"
result = retriever.invoke(query)

# Show each retrieved document: a 1-based label followed by its text.
for position, doc in enumerate(result, start=1):
    print(f"Doc {position}")
    print(doc.page_content)

Doc 1
FAISS is a library developed by Meta for efficient similarity search and clustering of dense vectors.
Doc 2
Retrieval-Augmented Generation combines information retrieval with language models to produce grounded answers.
