In [3]:
import os

import chromadb
import numpy as np
from chromadb.utils import embedding_functions
from dotenv import load_dotenv
from langchain_chroma import Chroma
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings

In [4]:
# Toy corpus: five tech-news headlines reused by the cells below.
# (Headline text is kept verbatim, including the "Battlmage" spelling.)
sentence_list = [
    "Meta drops multimodal Llama 3.2 — here's why it's such a big deal",
    "Chip giant Nvidia acquires OctoAI, a Seattle startup that helps companies run AI models",
    "Google is bringing Gemini to all older Pixel Buds",
    "The first Intel Battlmage GPU benchmarks have leaked",
    "Dell partners with Nvidia to accelerate AI adoption in telecoms",
]
# One identifier per headline, numbered from 1 in corpus order: "id1" ... "id5".
ids = [f"id{position}" for position, _ in enumerate(sentence_list, start=1)]

**Creating a collection**

In [5]:
# Default Chroma client; per the comment in the next cell, this one does not persist to disk.
# Note: the next cell rebinds `chroma_client`, so this instance is not used afterwards.
chroma_client = chromadb.Client()

In [7]:
# To persist to disk, use a PersistentClient pointed at a storage path
# (here the relative path "chromadb/"). This rebinds `chroma_client`,
# replacing the client created in the previous cell.
chroma_client = chromadb.PersistentClient(path="chromadb/")

In [8]:
# Fetch the "udacity" collection, creating it on first use.
# Because `chroma_client` persists to disk, a plain create_collection() fails
# when the collection is left over from an earlier (e.g. interrupted) run,
# which breaks Restart Kernel -> Run All; get_or_create_collection() is
# identical on a fresh store and keeps this cell re-runnable.
collection = chroma_client.get_or_create_collection(name="udacity")

In [9]:
# No embedding function was passed when the collection was created, so Chroma
# embeds these documents with its default model (Sentence Transformers
# all-MiniLM-L6-v2).
collection.add(ids=ids, documents=sentence_list)

/Users/amundle/.cache/chroma/onnx_models/all-MiniLM-L6-v2/onnx.tar.gz: 100%|██████████| 79.3M/79.3M [00:01<00:00, 52.1MiB/s]


In [10]:
# Inspect which embedding function is attached to the collection.
# NOTE(review): `_embedding_function` is underscore-prefixed (private by
# convention), so it may not be stable across chromadb versions.
collection._embedding_function

<chromadb.api.types.DefaultEmbeddingFunction at 0x112244650>

In [11]:
# Number of records currently stored in the collection.
collection.count()

5

In [12]:
# Preview the first 2 records of the collection; the output includes their ids
# and raw embedding vectors.
collection.peek(2)

{'ids': ['id1', 'id2'],
 'embeddings': array([[ 6.06655143e-02, -3.51322480e-02,  6.06437288e-02,
         -5.11925854e-02,  1.13580197e-01, -1.88892763e-02,
         -2.68528331e-02,  5.48633784e-02,  3.23644504e-02,
          5.42442985e-02, -4.04198468e-02, -1.90558676e-02,
         -5.97919673e-02,  2.56032404e-02,  8.48459899e-02,
          4.12196927e-02,  3.95206176e-02, -4.00091521e-02,
         -7.66606703e-02,  2.78292280e-02,  5.38355187e-02,
         -1.35247353e-02,  9.65649709e-02, -3.04361507e-02,
          6.61455886e-03,  7.21730739e-02, -9.53866169e-02,
         -2.75959335e-02,  7.86793791e-03, -6.68519735e-02,
         -1.27341859e-02,  1.21338010e-01, -6.66138083e-02,
         -3.28670517e-02, -6.49284273e-02, -1.61901880e-02,
         -3.32960160e-03,  8.04080665e-02, -3.84503491e-02,
          1.37278825e-04,  3.72596853e-03,  4.83831167e-02,
         -3.65696087e-06, -4.51370180e-02, -1.37449345e-02,
         -7.15254620e-02,  1.01806317e-02, -4.23029736e-02,
  

In [13]:
# Semantic search: embed the query text "gadget" and return the 2 closest
# headlines along with their metadata, text and distance to the query.
collection.query(
    n_results=2,
    query_texts=["gadget"],
    include=["metadatas", "documents", "distances"],
)

{'ids': [['id3', 'id1']],
 'embeddings': None,
 'documents': [['Google is bringing Gemini to all older Pixel Buds',
   "Meta drops multimodal Llama 3.2 — here's why it's such a big deal"]],
 'uris': None,
 'included': ['metadatas', 'documents', 'distances'],
 'data': None,
 'metadatas': [[None, None]],
 'distances': [[1.5251753330230713, 1.7548508644104004]]}

**Choosing other models**

In [14]:
# Build a Sentence Transformers embedding function backed by the
# "all-mpnet-base-v2" model, as an alternative to Chroma's default model.
embeddings_fn = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name="all-mpnet-base-v2"
)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [15]:
# Call the embedding function directly on the headlines (no collection involved).
embeddings = embeddings_fn(sentence_list)
# Sanity check: one embedding per input headline.
len(embeddings)

5

In [16]:
# Dot product between the embeddings of the two Nvidia-related headlines
# (indices 1 and 4), used here as a similarity score, followed by the two
# headlines themselves for reference.
print(np.dot(embeddings[1], embeddings[4]))
print(sentence_list[1], sentence_list[4], sep="\n")

0.5583155
Chip giant Nvidia acquires OctoAI, a Seattle startup that helps companies run AI models
Dell partners with Nvidia to accelerate AI adoption in telecoms


In [30]:
# Load variables from a local .env file so the API key never appears in the
# notebook (`load_dotenv` is imported in the notebook's first, imports cell).
load_dotenv()

# OpenAI embedding function routed through the Vocareum endpoint.
# NOTE(review): os.getenv returns None when VOCAREUM_OPENAI_API_KEY is unset;
# make sure the .env file (or the environment) defines it.
embeddings_fn = embedding_functions.OpenAIEmbeddingFunction(
    api_key=os.getenv("VOCAREUM_OPENAI_API_KEY"),
    api_base="https://openai.vocareum.com/v1",
    model_name="text-embedding-ada-002",
)

In [31]:
# Exploratory check: list every attribute of the embedding function whose name
# contains "model" (case-insensitive), together with its current value.
for attr in filter(lambda name: "model" in name.lower(), dir(embeddings_fn)):
    print(f"{attr}: {getattr(embeddings_fn, attr, 'N/A')}")

model_name: text-embedding-ada-002


In [32]:
# Confirm which model the embedding function is configured with.
embeddings_fn.model_name

'text-embedding-ada-002'

In [33]:
# Start over: drop the existing "udacity" collection, then recreate it under
# the same name with the OpenAI embedding function attached.
chroma_client.delete_collection(name="udacity")
collection = chroma_client.create_collection(
    embedding_function=embeddings_fn,
    name="udacity",
)

In [34]:
# Insert the headlines into the recreated collection; they are embedded with
# the embedding function the collection was created with.
collection.add(ids=ids, documents=sentence_list)

In [35]:
# Check that the recreated collection carries the OpenAI embedding function.
# NOTE(review): `_embedding_function` is underscore-prefixed (private by
# convention), so it may not be stable across chromadb versions.
collection._embedding_function

<chromadb.utils.embedding_functions.openai_embedding_function.OpenAIEmbeddingFunction at 0x128eced50>

In [36]:
# Run the "gadget" query again, now against the collection embedded with the
# OpenAI model, returning the 2 closest headlines with metadata, text and
# distance.
collection.query(
    n_results=2,
    query_texts=["gadget"],
    include=["metadatas", "documents", "distances"],
)

{'ids': [['id3', 'id4']],
 'embeddings': None,
 'documents': [['Google is bringing Gemini to all older Pixel Buds',
   'The first Intel Battlmage GPU benchmarks have leaked']],
 'uris': None,
 'included': ['metadatas', 'documents', 'distances'],
 'data': None,
 'metadatas': [[None, None]],
 'distances': [[0.46601054072380066, 0.48678600788116455]]}

**Using with LangChain**

In [37]:
# Remove the "udacity" collection from `chroma_client` before the LangChain
# section, which uses the same collection name.
chroma_client.delete_collection(name="udacity")

In [50]:
# Re-load .env so this section also works when run on its own
# (`load_dotenv` is imported in the notebook's first, imports cell).
load_dotenv()

# LangChain wrapper around a Chroma collection named "udacity"; texts are
# embedded with text-embedding-ada-002 through the Vocareum endpoint.
# Mind the keyword names: langchain_openai's OpenAIEmbeddings is configured
# with `base_url` / `model` here, whereas chromadb's OpenAIEmbeddingFunction
# used earlier in the notebook takes `api_base` / `model_name`.
# NOTE(review): no client or persist directory is passed, so this store is
# presumably separate from the on-disk `chroma_client` — confirm.
vector_store = Chroma(
    collection_name="udacity",
    embedding_function=OpenAIEmbeddings(
        api_key=os.getenv("VOCAREUM_OPENAI_API_KEY"),
        base_url="https://openai.vocareum.com/v1",
        model="text-embedding-ada-002",
    ),
)

In [51]:
# The five headlines as LangChain Documents, each tagged with the company it
# is about and a short topic label (rows are: headline, company, topic).
documents = [
    Document(page_content=headline, metadata={"company": company, "topic": topic})
    for headline, company, topic in (
        (
            "Meta drops multimodal Llama 3.2 — here's why it's such a big deal",
            "Meta",
            "llama",
        ),
        (
            "Chip giant Nvidia acquires OctoAI, a Seattle startup that helps companies run AI models",
            "Nvidia",
            "acquisition",
        ),
        (
            "Google is bringing Gemini to all older Pixel Buds",
            "Google",
            "gemini",
        ),
        (
            "The first Intel Battlmage GPU benchmarks have leaked",
            "Intel",
            "gpu",
        ),
        (
            "Dell partners with Nvidia to accelerate AI adoption in telecoms",
            "Dell",
            "partnership",
        ),
    )
]

In [52]:
# Embed and store the documents under the explicit ids; the call returns the
# list of ids it stored.
vector_store.add_documents(documents=documents, ids=ids)

['id1', 'id2', 'id3', 'id4', 'id5']

In [53]:
# Retrieve the 2 documents closest to the query "gpu" and print each hit's
# text, score and metadata.
# NOTE(review): hits come back in ascending score order, so the score looks
# like a distance (lower = more similar) — confirm against the LangChain docs.
results = vector_store.similarity_search_with_score(query="gpu", k=2)
for doc, score in results:
    print(
        f"-> {doc.page_content}\n"
        f"   [Score={score:.2f}]\n"
        f"   [{doc.metadata}]\n\n"
    )

-> The first Intel Battlmage GPU benchmarks have leaked
   [Score=0.35]
   [{'topic': 'gpu', 'company': 'Intel'}]


-> Chip giant Nvidia acquires OctoAI, a Seattle startup that helps companies run AI models
   [Score=0.41]
   [{'company': 'Nvidia', 'topic': 'acquisition'}]


