# Query OLS Embeddings with ChromaDB

This notebook demonstrates fast similarity search using ChromaDB.

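**Prerequisites:** Run `migrate_to_chromadb.py` first to create the ChromaDB collection (this notebook reads the `ols_embeddings` collection from `./chroma_db`), and make sure `OPENAI_API_KEY` is set in `../.env` or the environment so the query text can be embedded.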

In [1]:
import openai
import chromadb
import os
from dotenv import load_dotenv
from chromadb.config import Settings

In [2]:
# Load ../.env, then hand the OpenAI key from the environment to the SDK
load_dotenv(dotenv_path='../.env')
openai.api_key = os.environ.get("OPENAI_API_KEY")

# Fail fast when the key is missing or blank, before any API call is attempted
if openai.api_key is None or openai.api_key == "":
    raise ValueError("OPENAI_API_KEY not found in .env file or environment variables.")

In [3]:
# Open the on-disk ChromaDB store (anonymized telemetry disabled) and fetch
# the existing collection; get_collection does not create it.
chroma_settings = Settings(anonymized_telemetry=False)
client = chromadb.PersistentClient(path="./chroma_db", settings=chroma_settings)
collection = client.get_collection(name="ols_embeddings")

# Report how many embeddings the collection currently holds
n_embeddings = collection.count()
print(f"Collection loaded: {n_embeddings:,} embeddings")

Collection loaded: 10,000 embeddings


In [4]:
# Your search query
query = "agricultural soil"

# Embed the query text with OpenAI.
# NOTE(review): this presumably has to be the same model that produced the
# stored embeddings -- confirm against migrate_to_chromadb.py.
embedding_request = {"model": "text-embedding-3-small", "input": query}
response = openai.embeddings.create(**embedding_request)

# A single input string was sent, so the first data entry carries its vector
query_embedding = response.data[0].embedding
print("Generated embedding for: '{}'".format(query))

Generated embedding for: 'agricultural soil'


In [5]:
# Nearest-neighbour search over the whole collection (no metadata filter)
results = collection.query(
    query_embeddings=[query_embedding],
    n_results=10,  # Top 10 results
    include=["documents", "metadatas", "distances"],
)

# Every result field is a list with one entry per query embedding; a single
# embedding was sent, so index 0 holds its hits.
hit_ids = results['ids'][0]
hit_distances = results['distances'][0]
hit_metadatas = results['metadatas'][0]
hit_documents = results['documents'][0]

print("\nTop 10 most similar terms (across ALL ontologies):\n")
for iri, distance, metadata, document in zip(hit_ids, hit_distances, hit_metadatas, hit_documents):
    ontology = metadata['ontologyId']

    # The raw distance is printed unchanged (smaller = closer); it is NOT
    # converted to a similarity score here.
    # NOTE(review): which metric this distance represents depends on how the
    # collection was created -- confirm in migrate_to_chromadb.py.
    print(f"{distance:.4f} | {ontology} | {iri}")
    # The document preview is cut to 150 characters; "..." is always appended
    print(f"  {document[:150]}...\n")


Top 10 most similar terms (across ALL ontologies):

0.6340 | agro | agro:http://purl.obolibrary.org/obo/ENVO_00005755
  field soil...

0.7575 | agro | agro:http://purl.obolibrary.org/obo/ENVO_0010003
  agricultural environmental material...

0.7877 | agro | agro:http://purl.obolibrary.org/obo/ENVO_00003914
  chalk soil...

0.7896 | agro | agro:http://purl.obolibrary.org/obo/ENVO_00000519
  agricultural terrace...

0.8486 | agro | agro:http://purl.obolibrary.org/obo/ENVO_00005761
  meadow soil...

0.8486 | agro | agro:http://purl.obolibrary.org/obo/ENVO_00005749
  farm soil; A portion of soil which is part of a cropland or a rangeland biome....

0.8925 | bco | bco:http://purl.obolibrary.org/obo/PCO_0000030
  agricultural household; A household in which the majority of the income of its members is derived from agricultural activities....

0.9123 | agro | agro:http://purl.obolibrary.org/obo/ENVO_00005786
  upland soil...

0.9361 | agro | agro:http://purl.obolibrary.org/obo/AGRO_00002071


In [6]:
# Filter by specific ontology (e.g., only ENVO terms)
# `where` compares against the stored `ontologyId` metadata value for equality,
# so the value must be spelled exactly as it was written at ingestion time.
results_envo = collection.query(
    query_embeddings=[query_embedding],
    n_results=10,
    where={"ontologyId": "envo"},  # Filter condition
    include=["documents", "metadatas", "distances"]
)

# One query embedding was sent, so index 0 holds its hits.
envo_ids = results_envo['ids'][0]
envo_distances = results_envo['distances'][0]
envo_documents = results_envo['documents'][0]

print(f"\nTop 10 most similar ENVO terms:\n")
if not envo_ids:
    # Without this notice an unmatched filter leaves only the header on screen,
    # which is indistinguishable from a cell that failed to render its output.
    print('No results: no embeddings in this collection have ontologyId "envo".')
for iri, distance, document in zip(envo_ids, envo_distances, envo_documents):
    # Raw distance as returned by ChromaDB (smaller = closer)
    print(f"{distance:.4f} | {iri}")
    # The document preview is cut to 150 characters
    print(f"  {document[:150]}...\n")


Top 10 most similar ENVO terms:



In [7]:
# Filter by multiple ontologies
# `$in` keeps a record when its `ontologyId` metadata equals any listed value.
results_multi = collection.query(
    query_embeddings=[query_embedding],
    n_results=20,
    where={"ontologyId": {"$in": ["envo", "go", "mondo"]}},  # Multiple ontologies
    include=["documents", "metadatas", "distances"]
)

# One query embedding was sent, so index 0 holds its hits.
multi_ids = results_multi['ids'][0]
multi_distances = results_multi['distances'][0]
multi_metadatas = results_multi['metadatas'][0]
multi_documents = results_multi['documents'][0]

print(f"\nTop 20 results from ENVO, GO, and MONDO:\n")
if not multi_ids:
    # Without this notice an unmatched filter leaves only the header on screen,
    # which is indistinguishable from a cell that failed to render its output.
    print('No results: no embeddings in this collection have ontologyId "envo", "go" or "mondo".')
for iri, distance, metadata, document in zip(multi_ids, multi_distances, multi_metadatas, multi_documents):
    ontology = metadata['ontologyId']

    # Raw distance as returned by ChromaDB (smaller = closer)
    print(f"{distance:.4f} | {ontology} | {iri}")
    # The document preview is cut to 100 characters in this listing
    print(f"  {document[:100]}...\n")


Top 20 results from ENVO, GO, and MONDO:

