In [None]:
!pip install chromadb

In [None]:
from pydantic import BaseModel
from typing import List
import chromadb
from chromadb.config import Settings

class DocumentEntry(BaseModel):
    """A text snippet together with its identifier and a label for its origin.

    Instances of this model are the in-memory records that this notebook
    indexes into the Chroma collection and hands back from queries.
    """

    # Identifier passed to Chroma as the record id.
    id: str
    # The text content that is indexed and searched.
    document: str
    # Where the text comes from; stored in Chroma under the "source" metadata key.
    source: str

# Open (or create) the on-disk Chroma store, with anonymized telemetry turned off.
client = chromadb.PersistentClient(path="/content/chroma_db", settings=Settings(anonymized_telemetry=False))
# The store persists across kernel restarts, so create_collection() would raise
# on every run after the first one. get_or_create_collection() keeps this cell
# re-runnable (Restart Kernel -> Run All) while behaving the same on a fresh store.
collection = client.get_or_create_collection("numericite-demo")

# Demo corpus: each row is (id, text, source) and becomes one DocumentEntry,
# in the order listed here.
documents = [
    DocumentEntry(id=entry_id, document=text, source=origin)
    for entry_id, text, origin in (
        ("id1", "The quick brown fox jumps over the lazy dog.", "proverb"),
        ("id2", "E=mc² is a famous equation in physics.", "physics textbook"),
        ("id3", "To be or not to be, that is the question.", "Shakespeare's Hamlet"),
        ("id4", "Python is a versatile programming language.", "programming guide"),
        ("id5", "The capital of France is Paris.", "geography book"),
        ("id6", "Photosynthesis is the process by which plants make their food.", "biology textbook"),
        ("id7", "The Mona Lisa is a renowned painting by Leonardo da Vinci.", "art history book"),
        ("id8", "Mount Everest is the highest mountain in the world.", "geography book"),
        ("id9", "The theory of evolution was proposed by Charles Darwin.", "biology textbook"),
        ("id10", "In 1492, Columbus sailed the ocean blue.", "history book"),
    )
]

# Index every entry: the text is embedded, and the source is kept as metadata.
# upsert() is used instead of add() because the collection lives in a persistent
# on-disk store: per the Chroma docs, add() leaves records whose ids already
# exist untouched, so an edited entry would drift out of sync with the in-memory
# `documents` list. upsert() inserts new ids and refreshes existing ones, and on
# an empty collection it behaves exactly like add().
collection.upsert(
    ids=[doc.id for doc in documents],
    documents=[doc.document for doc in documents],
    metadatas=[{"source": doc.source} for doc in documents],
)

/root/.cache/chroma/onnx_models/all-MiniLM-L6-v2/onnx.tar.gz: 100%|██████████| 79.3M/79.3M [00:02<00:00, 37.2MiB/s]


In [None]:
from typing import List, Tuple
def query(query_text: str, n_results: int = 1) -> List[Tuple[DocumentEntry, float]]:
    """Return the entries of ``documents`` that are closest to ``query_text``.

    The text is sent to the module-level Chroma ``collection``; the ids that
    come back are mapped onto the in-memory ``documents`` list.

    Args:
        query_text: Free-text query to search the collection with.
        n_results: Number of nearest neighbours to request from Chroma.

    Returns:
        A list of ``(entry, distance)`` tuples sorted by ascending distance.
        The number is the distance reported by Chroma, so a smaller value
        means a closer match. Ids returned by Chroma that have no matching
        entry in ``documents`` are left out.
    """
    results = collection.query(
        query_texts=[query_text],
        n_results=n_results,
        # Ids are always part of the result; the document text is not needed
        # here because entries are looked up in the in-memory list instead.
        include=["distances"],
    )

    # A single query text was sent, so only the first result row is relevant.
    # `or [[]]` also covers a key that is present but set to None.
    retrieved_ids = (results.get("ids") or [[]])[0]
    retrieved_distances = (results.get("distances") or [[]])[0]

    # One dict lookup per entry instead of a list membership test plus
    # list.index() scan for every document.
    distance_by_id = dict(zip(retrieved_ids, retrieved_distances))

    matches = [
        (doc, distance_by_id[doc.id])
        for doc in documents
        if doc.id in distance_by_id
    ]

    # Closest first (ascending distance); the sort is stable, so ties keep
    # the order of `documents`.
    matches.sort(key=lambda pair: pair[1])

    return matches

# Demo: show the three nearest entries for a sample query.
# NOTE: the printed "Similarity Score" is the distance returned by query(),
# so the best match is the one with the smallest number.
for doc, score in query("Discovery of america", 3):
    print(f"Document: {doc}", f"Similarity Score: {score}", "---", sep="\n")

Document: id='id10' document='In 1492, Columbus sailed the ocean blue.' source='history book'
Similarity Score: 1.2723906923882786
---
Document: id='id9' document='The theory of evolution was proposed by Charles Darwin.' source='biology textbook'
Similarity Score: 1.3659229855718953
---
Document: id='id7' document='The Mona Lisa is a renowned painting by Leonardo da Vinci.' source='art history book'
Similarity Score: 1.7510504626183678
---
