In [1]:
import chromadb
from langchain_openai import OpenAIEmbeddings
from pyprojroot import here

### Vector DB connection

In [2]:
# Name of the Chroma collection used by the rest of this notebook.
collection_name = "test"

# On-disk Chroma client; the database directory is resolved relative to the
# project root via `here`.
chroma_client = chromadb.PersistentClient(
    path=str(here("data/db/chroma")),
)

In [3]:
# Open the collection, creating it on the first run.
# `get_or_create_collection` does both in one call, so no bare `except:` is
# needed; unrelated failures (bad path, permissions, invalid name) now surface
# instead of being mistaken for "collection already exists".
collection = chroma_client.get_or_create_collection(name=collection_name)

### Embedding model client

In [4]:
embedding_client = OpenAIEmbeddings(
    model="text-embedding-3-small",
    # The `text-embedding-3` family of models lets you choose the size of
    # the returned embeddings. Uncomment the line below to request
    # 1024-dimensional vectors.
    # dimensions=1024
)

In [5]:
# Raw texts to index; each one becomes a record in the collection.
text_to_embed = ["Something about movies", "Something about tv shows"]

# Parallel lists, one entry per text and all in the same order.
docs = list(text_to_embed)
ids = [f"text-{i}" for i in range(len(docs))]
metadatas = [{"source": "test"} for _ in docs]  # a separate dict per record

# Embed all documents in a single batched call rather than issuing one
# `embed_query` request per text; the result is one vector per document,
# in the same order as `docs`.
embeddings = embedding_client.embed_documents(docs)
    

In [6]:
# Print each of the parallel lists in turn: ids, documents, metadata, vectors.
for values in (ids, docs, metadatas, embeddings):
    print(values)


['text-0', 'text-1']
['Something about movies', 'Something about tv shows']
[{'source': 'test'}, {'source': 'test'}]
[[-0.021893123164772987, 0.020016569644212723, -0.056063853204250336, 0.02955935336649418, 0.008713608607649803, -0.020467525348067284, 0.026155373081564903, 0.024104256182909012, 0.005906051956117153, -0.007138903718441725, 0.012284878641366959, -0.03639640659093857, -0.053008995950222015, -0.0024420651607215405, -0.016801699995994568, 0.001907465630210936, -0.012750380672514439, 0.0011064755963161588, 0.03313789516687393, -0.006320639047771692, 0.04972139373421669, 0.06831236183643341, -0.04518274962902069, -0.005833317060023546, -0.008669967763125896, 0.009680978953838348, -0.01333225704729557, 0.04864491894841194, 0.03660006448626518, 0.019056472927331924, 0.017019903287291527, -0.03435983881354332, -0.0022129511926323175, -0.03299242630600929, -0.06278453022241592, 0.00925911869853735, 0.0027639158070087433, 0.004705930594354868, 0.04841217026114464, 0.0009437318076

### Add to collection

In [7]:
# Write the records into the collection.
# `upsert` is used instead of `add` because the database is persistent and the
# ids are fixed ("text-0", "text-1", ...): re-running the notebook updates the
# existing records rather than trying to insert the same ids a second time.
collection.upsert(
    ids=ids,
    embeddings=embeddings,
    documents=docs,  # Optional
    metadatas=metadatas,  # Optional
)


In [8]:
# Display the metadata list that was stored alongside the documents.
metadatas

[{'source': 'test'}, {'source': 'test'}]

In [9]:
# Display the document texts that were stored in the collection.
docs

['Something about movies', 'Something about tv shows']

In [10]:
# Fetch one record by id, asking only for its document text and metadata.
first_record = collection.get(
    ids=["text-0"],
    include=["documents", "metadatas"],
)
print(first_record)

{'ids': ['text-0'], 'embeddings': None, 'documents': ['Something about movies'], 'uris': None, 'data': None, 'metadatas': [{'source': 'test'}], 'included': [<IncludeEnum.documents: 'documents'>, <IncludeEnum.metadatas: 'metadatas'>]}


In [11]:
# Report how many records the collection currently holds.
record_count = collection.count()
print("Length of collection:", record_count)

Length of collection: 2


### Similarity search

In [12]:
# Natural-language question to search with.
question = "Can you tell me something about tv shows?"
# Embed the question with the same client that embedded the stored documents.
question_embedding = embedding_client.embed_query(question)

In [13]:
# Look up the single stored record closest to the question embedding.
# The embedding is passed as a batch containing one query vector.
results = collection.query(
    query_embeddings=[question_embedding],
    n_results=1,  # top_k
)

results

{'ids': [['text-1']],
 'embeddings': None,
 'documents': [['Something about tv shows']],
 'uris': None,
 'data': None,
 'metadatas': [[{'source': 'test'}]],
 'distances': [[0.48519167589565626]],
 'included': [<IncludeEnum.distances: 'distances'>,
  <IncludeEnum.documents: 'documents'>,
  <IncludeEnum.metadatas: 'metadatas'>]}

In [14]:
# Second question, this time about movies.
question = "Can you tell me something about movies?"
# Embed the question with the same client that embedded the stored documents.
question_embedding = embedding_client.embed_query(question)

In [15]:
# Look up the single stored record closest to the question embedding.
# The embedding is passed as a batch containing one query vector.
results = collection.query(
    query_embeddings=[question_embedding],
    n_results=1,  # top_k
)

results

{'ids': [['text-0']],
 'embeddings': None,
 'documents': [['Something about movies']],
 'uris': None,
 'data': None,
 'metadatas': [[{'source': 'test'}]],
 'distances': [[0.5451702138507617]],
 'included': [<IncludeEnum.distances: 'distances'>,
  <IncludeEnum.documents: 'documents'>,
  <IncludeEnum.metadatas: 'metadatas'>]}