# Exploring OLS Embeddings

This notebook demonstrates how to connect to the OLS embeddings database, generate an embedding for a query using the OpenAI API, and find the most similar terms in the database by ranking the stored ENVO term embeddings by their cosine similarity to the query embedding.

In [27]:
import openai
import sqlite3
import numpy as np
import json
import os
from dotenv import load_dotenv

In [28]:
# Read settings from the .env file located one directory above this notebook
load_dotenv(dotenv_path='../.env')

# Hand the OpenAI client its key, taken from the process environment
openai.api_key = os.environ.get("OPENAI_API_KEY")

# Stop right away when no key is available (missing or empty) rather than
# failing later on the first API call
if not openai.api_key:
    raise ValueError("OPENAI_API_KEY not found in .env file or environment variables.")

In [29]:
# Text to look up among the stored term embeddings
query = "agricultural soil"

# Embed the query with the SAME model as OLS so the vectors are comparable
response = openai.embeddings.create(input=query, model="text-embedding-3-small")

# Only one input was sent, so the first (and only) item carries its vector
# (a list of floats; 1536 of them according to the original note)
query_embedding = response.data[0].embedding
print(f"Generated embedding for query: '{query}'")

Generated embedding for query: 'agricultural soil'


In [30]:
def cosine_similarity(vec1, vec2):
    """Calculates the cosine similarity between two vectors.

    Args:
        vec1: First vector (sequence of numbers or NumPy array).
        vec2: Second vector, with the same length as ``vec1``.

    Returns:
        ``dot(vec1, vec2) / (norm(vec1) * norm(vec2))``. When either vector
        has zero norm the similarity is undefined; ``0.0`` is returned in
        that case instead of NaN (with a RuntimeWarning), so downstream
        sorting of scores stays well-defined.
    """
    # Convert once so plain Python lists are not re-converted by every NumPy call.
    a = np.asarray(vec1)
    b = np.asarray(vec2)

    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0:
        # At least one vector is all zeros: avoid the 0/0 division.
        return 0.0
    return np.dot(a, b) / denominator

In [31]:
# Path to the embeddings database. Set OLS_EMBEDDINGS_DB in the environment to
# point somewhere else; the default is the original local path.
db_path = os.getenv(
    "OLS_EMBEDDINGS_DB",
    '/media/mark/37d25dfc-7ae2-4e0e-a60a-05fd808cc0ba/stopgap/embeddings.db',
)

# sqlite3.connect() silently creates an empty database when the file is missing,
# which would only surface later as a confusing "no such table" error.
if not os.path.isfile(db_path):
    raise FileNotFoundError(f"Embeddings database not found: {db_path}")

# Search a subset of embeddings (one ontology) rather than the whole table.
# WARNING: Searching the entire 9.5M rows without an index will be very slow.
target_ontology = 'envo'

# Connect to the SQLite database
conn = sqlite3.connect(db_path)
results = []
try:
    cursor = conn.cursor()

    print("Fetching embeddings from the database... (this might take a moment)")
    # Bound parameter instead of a value spliced into the SQL text.
    cursor.execute(
        """SELECT ontologyId, iri, document, embeddings
           FROM embeddings
           WHERE ontologyId = ?""",
        (target_ontology,),
    )

    # Iterate the cursor directly so rows are processed as they are read,
    # instead of materialising the whole result set with fetchall().
    for ontology_id, iri, document, embedding_json in cursor:
        embedding_data = json.loads(embedding_json)

        # Handle different embedding storage formats
        if isinstance(embedding_data, dict):
            # If it's a dict, extract the embedding array (common keys:
            # 'embedding', 'values', 'data'); otherwise fall back to the
            # first value stored in the dict.
            stored_embedding = (
                embedding_data.get('embedding')
                or embedding_data.get('values')
                or embedding_data.get('data')
                or list(embedding_data.values())[0]
            )
        else:
            # Otherwise the decoded JSON is used as the vector itself.
            stored_embedding = embedding_data

        similarity = cosine_similarity(query_embedding, stored_embedding)
        results.append((similarity, ontology_id, iri, document))
finally:
    # Release the database handle even if a row fails to parse or score.
    conn.close()

print(f"Calculated similarity for {len(results)} embeddings.")

Fetching embeddings from the database... (this might take a moment)
Calculated similarity for 7365 embeddings.


In [32]:
# Order the scored terms in place, highest similarity first
results.sort(key=lambda scored: scored[0], reverse=True)

# Report the ten best matches: score, term IRI, and the first 100 characters
# of the term's document text
print("\nTop 10 most similar terms in ENVO:")
top_matches = results[:10]
for score, _ontology, term_iri, text in top_matches:
    preview = text[:100]
    print(f"{score:.4f} | {term_iri} | {preview}...")


Top 10 most similar terms in ENVO:
0.7442 | http://purl.obolibrary.org/obo/ENVO_00002259 | agricultural soil; Soil which is part of an ecosystem used for agricultural activities....
0.6317 | http://purl.obolibrary.org/obo/ENVO_00005755 | field soil; Soil which is part of an agricultural field....
0.6254 | http://purl.obolibrary.org/obo/ENVO_00005742 | arable soil; Soil which is capable of supporting the growth of crops....
0.5824 | http://purl.obolibrary.org/obo/CHEBI_33286 | agrochemical...
0.5757 | http://purl.obolibrary.org/obo/ENVO_00005749 | farm soil; A portion of soil which is part of a cropland or a rangeland biome....
0.5616 | http://purl.obolibrary.org/obo/ENVO_00000519 | agricultural terrace; A terrace which is used for agricultural activities....
0.5576 | http://purl.obolibrary.org/obo/ENVO_03000133 | agricultural potting mixture; potting soil; An agricultural environmental material which 1) is compo...
0.5494 | http://purl.obolibrary.org/obo/ENVO_00003884 | farmyard manur