In [1]:
import requests
import json
import os
import pandas as pd
import time
from pathlib import Path
from dotenv import load_dotenv
from collections import Counter
import Levenshtein

In [2]:
metpo_df = pd.read_csv("../data/metpo_terms/metpo_sample_labels.tsv", sep="\t", names=["metpo_id", "metpo_label"])
output_file = "../data/ontology_assessments/phase1_raw_results.tsv"
hq_file = "../data/ontology_assessments/phase1_high_quality_matches.tsv"
summary_stats_json = "../data/ontology_assessments/phase1_summary_stats.json"

In [27]:
requested_match_count = 75  # Number of results to request per API call
high_quality_similarity_ratio = 0.5

In [3]:
# Load environment variables
env_path = Path("..") / ".env"
load_dotenv(dotenv_path=env_path)

BIOPORTAL_API_KEY = os.getenv("BIOPORTAL_API_KEY")

if not BIOPORTAL_API_KEY:
    print("⚠️  Warning: BIOPORTAL_API_KEY not set")
else:
    print("✓ BioPortal API key loaded")

✓ BioPortal API key loaded


In [4]:
# Define search functions
def search_ols(label, rows=requested_match_count):
    """Search OLS4 for a label"""
    url = "https://www.ebi.ac.uk/ols4/api/search"
    params = {"q": label, "type": "class", "rows": rows}

    try:
        response = requests.get(url, params=params, timeout=10)
        if response.status_code == 200:
            data = response.json()
            docs = data.get("response", {}).get("docs", [])
            return [{"label": d.get("label"),
                     "iri": d.get("iri"),
                     "ontology": d.get("ontology_name"),
                     "definition": d.get("description", [""])[0] if d.get("description") else ""}
                    for d in docs]
        return []
    except Exception as e:
        print(f"OLS error for '{label}': {e}")
        return []

In [5]:
def search_bioportal(label, api_key, pagesize=requested_match_count):
    """Search BioPortal for a label"""
    url = "https://data.bioontology.org/search"
    params = {"q": label, "pagesize": pagesize}
    headers = {}
    
    if api_key:
        headers["Authorization"] = f"apikey token={api_key}"
    
    try:
        response = requests.get(url, params=params, headers=headers, timeout=10)
        if response.status_code == 200:
            data = response.json()
            collection = data.get("collection", [])
            results = []
            for item in collection:
                ontology_url = item.get("links", {}).get("ontology", "")
                ontology = ontology_url.split("/")[-1] if ontology_url else "unknown"
                
                results.append({
                    "label": item.get("prefLabel"),
                    "iri": item.get("@id"),
                    "ontology": ontology,
                    "definition": item.get("definition", [""])[0] if isinstance(item.get("definition"), list) else item.get("definition", "")
                })
            return results
        return []
    except Exception as e:
        print(f"BioPortal error for '{label}': {e}")
        return []

In [6]:
# Load METPO sample labels (20% sample = 50 classes)
print(f"\nLoaded {len(metpo_df)} METPO labels (20% sample)")
print(f"Estimated runtime: ~{len(metpo_df) * 2} seconds (~{len(metpo_df) * 2 / 60:.1f} minutes)")


Loaded 50 METPO labels (20% sample)
Estimated runtime: ~100 seconds (~1.7 minutes)


In [7]:
metpo_df

Unnamed: 0,metpo_id,metpo_label
0,METPO:1000678,oval shaped
1,METPO:1000481,NaCl delta mid2
2,METPO:1001002,growth temperature observation
3,METPO:1000802,Anaerobic respiration
4,METPO:1000883,cell length very small
5,METPO:1000664,organoheterotrophic
6,METPO:1001023,pH observation
7,METPO:1000476,pH delta mid2
8,METPO:1000614,psychrophilic
9,METPO:1000304,temperature optimum


In [8]:
# Run batch search
all_results = []

In [9]:
for idx, row in metpo_df.iterrows():
    metpo_id = row["metpo_id"]
    metpo_label = row["metpo_label"]
    
    print(f"\n[{idx+1}/{len(metpo_df)}] Searching: {metpo_label}")
    
    # Search OLS
    ols_results = search_ols(metpo_label)
    print(f"  OLS: {len(ols_results)} results")
    
    for result in ols_results:
        all_results.append({
            "metpo_id": metpo_id,
            "metpo_label": metpo_label,
            "source": "OLS",
            "match_label": result["label"],
            "match_iri": result["iri"],
            "match_ontology": result["ontology"],
            "match_definition": result["definition"]
        })
    
    # Search BioPortal
    bp_results = search_bioportal(metpo_label, BIOPORTAL_API_KEY)
    print(f"  BioPortal: {len(bp_results)} results")
    
    for result in bp_results:
        all_results.append({
            "metpo_id": metpo_id,
            "metpo_label": metpo_label,
            "source": "BioPortal",
            "match_label": result["label"],
            "match_iri": result["iri"],
            "match_ontology": result["ontology"],
            "match_definition": result["definition"]
        })
    
    # Rate limiting
    time.sleep(0.5)

print(f"\n✓ Complete: {len(all_results)} total results")


[1/50] Searching: oval shaped
  OLS: 75 results
  BioPortal: 75 results

[2/50] Searching: NaCl delta mid2
  OLS: 75 results
  BioPortal: 75 results

[3/50] Searching: growth temperature observation
  OLS: 75 results
  BioPortal: 75 results

[4/50] Searching: Anaerobic respiration
  OLS: 75 results
  BioPortal: 75 results

[5/50] Searching: cell length very small
  OLS: 75 results
  BioPortal: 75 results

[6/50] Searching: organoheterotrophic
  OLS: 0 results
  BioPortal: 2 results

[7/50] Searching: pH observation
  OLS: 75 results
  BioPortal: 75 results

[8/50] Searching: pH delta mid2
  OLS: 75 results
  BioPortal: 75 results

[9/50] Searching: psychrophilic
  OLS: 30 results
  BioPortal: 13 results

[10/50] Searching: temperature optimum
  OLS: 75 results
  BioPortal: 75 results

[11/50] Searching: dumbbell shaped
  OLS: 75 results
  BioPortal: 75 results

[12/50] Searching: NaCl range observation
  OLS: 75 results
  BioPortal: 75 results

[13/50] Searching: carotenoid pigmentati

In [46]:
# Convert to DataFrame and calculate string distances
results_df = pd.DataFrame(all_results)

In [47]:
results_df = results_df[~results_df["match_iri"].str.contains("metpo", case=False, na=False)]
print(f"After removing self-matches: {len(results_df)} results")


After removing self-matches: 5527 results


In [48]:
# Calculate Levenshtein distance and normalized similarity
results_df["levenshtein_distance"] = results_df.apply(
    lambda row: Levenshtein.distance(
        str(row["metpo_label"]).lower(),
        str(row["match_label"]).lower()
    ) if pd.notna(row["match_label"]) else None,
    axis=1
)

In [49]:
# Normalized similarity ratio (0-1, where 1 is exact match)
results_df["similarity_ratio"] = results_df.apply(
    lambda row: Levenshtein.ratio(
        str(row["metpo_label"]).lower(),
        str(row["match_label"]).lower()
    ) if pd.notna(row["match_label"]) else None,
    axis=1
)

In [50]:
print(f"Results shape: {results_df.shape}")
print(f"\nSimilarity statistics:")

Results shape: (5527, 9)

Similarity statistics:


In [51]:
results_df["similarity_ratio"].describe()

count    5527.000000
mean        0.527694
std         0.195134
min         0.000000
25%         0.391304
50%         0.512821
75%         0.652174
max         1.000000
Name: similarity_ratio, dtype: float64

In [52]:
results_df

Unnamed: 0,metpo_id,metpo_label,source,match_label,match_iri,match_ontology,match_definition,levenshtein_distance,similarity_ratio
0,METPO:1000678,oval shaped,OLS,Oval shaped lesion,http://snomed.info/id/129734009,snomed,,7,0.758621
1,METPO:1000678,oval shaped,OLS,Oval,http://purl.obolibrary.org/obo/NCIT_C48345,ncit,The shape of either an oval or an ellipse. Se...,7,0.533333
2,METPO:1000678,oval shaped,OLS,Oval macrocyte,http://snomed.info/id/117170009,snomed,,9,0.560000
3,METPO:1000678,oval shaped,OLS,oval nucleus,http://purl.obolibrary.org/obo/EMAPA_37667,emapa,,6,0.521739
4,METPO:1000678,oval shaped,OLS,oval nucleus,http://purl.obolibrary.org/obo/MA_0000929,ma,,6,0.521739
...,...,...,...,...,...,...,...,...,...
5928,METPO:1000480,NaCl delta mid1,BioPortal,"MID1, 3-BP DEL, MET438",http://purl.bioontology.org/ontology/OMIM/3005...,OMIM,,17,0.324324
5929,METPO:1000480,NaCl delta mid1,BioPortal,Mid1-interacting protein 1 (human),http://purl.obolibrary.org/obo/PR_Q9NPA3,PR,A Mid1-interacting protein 1 that is encoded i...,27,0.285714
5930,METPO:1000480,NaCl delta mid1,BioPortal,Mid1-interacting protein 1 (rat),http://purl.obolibrary.org/obo/PR_Q6P7D5,PR,A Mid1-interacting protein 1 that is encoded i...,26,0.297872
5931,METPO:1000480,NaCl delta mid1,BioPortal,Mid1-interacting protein 1 (mouse),http://purl.obolibrary.org/obo/PR_Q9CQ20,PR,A Mid1-interacting protein 1 that is encoded i...,27,0.285714


In [53]:
# Save raw results with all columns
results_df.to_csv(output_file, sep="\t", index=False)
print(f"✓ Saved full results to {output_file}")

✓ Saved full results to ../data/ontology_assessments/phase1_raw_results.tsv


In [54]:
# Also save high-quality matches separately
high_quality = results_df[results_df["similarity_ratio"] >= 0.5]

In [55]:
high_quality.to_csv(hq_file, sep="\t", index=False)
print(f"✓ Saved high-quality matches (similarity ≥ 0.5) to {hq_file}")

✓ Saved high-quality matches (similarity ≥ 0.5) to ../data/ontology_assessments/phase1_high_quality_matches.tsv


In [56]:
# Save summary statistics
summary_stats = {
    "total_metpo_terms": len(metpo_df),
    "total_results": len(results_df),
    "high_quality_results": len(high_quality),
    "unique_ontologies": results_df["match_ontology"].nunique(),
    "avg_similarity": results_df["similarity_ratio"].mean(),
    "median_similarity": results_df["similarity_ratio"].median()
}

In [57]:
with open(summary_stats_json, "w") as f:
    json.dump(summary_stats, f, indent=2)
print(f"✓ Saved summary statistics to phase1_summary_stats.json")

✓ Saved summary statistics to phase1_summary_stats.json


## Analysis: Ontology Frequency

In [58]:
# Count ontologies
ontology_counts = results_df["match_ontology"].value_counts()
print("Top 20 ontologies by result count:\n")


Top 20 ontologies by result count:



In [59]:
ontology_counts

match_ontology
ncbitaxon    481
snomed       462
ncit         346
SNOMEDCT     160
pato         132
            ... 
MFOMD          1
FGNHNS         1
VEO            1
JFO            1
VBO            1
Name: count, Length: 472, dtype: int64

In [60]:
# Break down by source
print("\n=== OLS Ontologies ===")
ols_df = results_df[results_df["source"] == "OLS"]



=== OLS Ontologies ===


In [61]:
ols_df["match_ontology"].value_counts()

match_ontology
ncbitaxon    481
snomed       462
ncit         346
pato         132
mesh         131
            ... 
slso           1
probonto       1
eupath         1
mf             1
vbo            1
Name: count, Length: 145, dtype: int64

In [62]:
print("\n=== BioPortal Ontologies ===")
bp_df = results_df[results_df["source"] == "BioPortal"]


=== BioPortal Ontologies ===


In [63]:
bp_df["match_ontology"].value_counts()

match_ontology
SNOMEDCT    160
OCHV        118
RCD          87
IOBC         82
RH-MESH      76
           ... 
MF            1
FGNHNS        1
MIRO          1
JFO           1
VBO           1
Name: count, Length: 327, dtype: int64

## Analysis: Label Similarity

In [64]:
# Analyze ontology distribution by similarity quality
print(f"\n=== Phase 2 Filter Recommendations ===")

# Filter to high-quality matches (similarity > 0.5)
high_quality = results_df[results_df["similarity_ratio"] >= high_quality_similarity_ratio]
print(f"\nHigh-quality matches (similarity ≥ 0.5): {len(high_quality)} / {len(results_df)} ({len(high_quality)/len(results_df)*100:.1f}%)")


=== Phase 2 Filter Recommendations ===

High-quality matches (similarity ≥ 0.5): 3036 / 5527 (54.9%)


In [65]:
# Top ontologies by high-quality match count
print(f"\nTop ontologies by HIGH-QUALITY match count:")
hq_ontology_counts = high_quality["match_ontology"].value_counts()



Top ontologies by HIGH-QUALITY match count:


In [66]:
hq_ontology_counts

match_ontology
snomed       255
ncit         156
ncbitaxon    122
pato         114
flopo        105
            ... 
HAAURAADO      1
emapa          1
JFO            1
VBO            1
NMR            1
Name: count, Length: 392, dtype: int64

In [67]:
# Compare to all matches
print(f"\nTop ontologies by TOTAL match count (including low similarity):")


Top ontologies by TOTAL match count (including low similarity):


In [68]:
ontology_counts

match_ontology
ncbitaxon    481
snomed       462
ncit         346
SNOMEDCT     160
pato         132
            ... 
MFOMD          1
FGNHNS         1
VEO            1
JFO            1
VBO            1
Name: count, Length: 472, dtype: int64

In [69]:
# Show cumulative coverage for high-quality matches
cumulative_pct = (hq_ontology_counts.cumsum() / hq_ontology_counts.sum() * 100)
print(f"\nCumulative coverage of high-quality matches:")
if len(cumulative_pct) >= 5:
    print(f"  Top 5 ontologies: {cumulative_pct.iloc[4]:.1f}%")
if len(cumulative_pct) >= 10:
    print(f"  Top 10 ontologies: {cumulative_pct.iloc[9]:.1f}%")
if len(cumulative_pct) >= 20:
    print(f"  Top 20 ontologies: {cumulative_pct.iloc[19]:.1f}%")


Cumulative coverage of high-quality matches:
  Top 5 ontologies: 24.8%
  Top 10 ontologies: 37.0%
  Top 20 ontologies: 51.2%


In [70]:
# Show average similarity by ontology
print(f"\nAverage similarity ratio by ontology (top 20):")
ontology_avg_sim = results_df.groupby("match_ontology")["similarity_ratio"].agg(["mean", "count"]).sort_values("mean", ascending=False)


Average similarity ratio by ontology (top 20):


In [71]:
ontology_avg_sim

Unnamed: 0_level_0,mean,count
match_ontology,Unnamed: 1_level_1,Unnamed: 2_level_1
wikipathways,1.000000,5
sbo,1.000000,1
ro,1.000000,1
pso,1.000000,1
COVID-19,1.000000,1
...,...,...
ONTOTOXNUC,0.266667,1
chiro,0.260870,2
ICD10CM,0.258718,12
poro,0.220238,2


## Analysis: Coverage by METPO Term

In [72]:
# How many results per METPO term?
coverage = results_df.groupby("metpo_label").agg({
    "match_label": "count",
    "match_ontology": lambda x: x.nunique()
}).rename(columns={"match_label": "total_results", "match_ontology": "unique_ontologies"})

coverage = coverage.sort_values("total_results", ascending=False)
print("Results per METPO term:\n")


Results per METPO term:



In [73]:
coverage

Unnamed: 0_level_0,total_results,unique_ontologies
metpo_label,Unnamed: 1_level_1,Unnamed: 2_level_1
phenotype,150,115
carotenoid pigmentation,149,62
dumbbell shaped,149,40
Oxidative phosphorylation,149,69
tailed shaped,149,35
motility,149,42
oval shaped,149,55
gliding,149,43
Anaerobic respiration,149,60
triangular shaped,149,26


## Recommendations for Phase 2 Filters

In [74]:
# Recommended ontology whitelist (top ontologies of interest)
target_ontologies = ["omp", "pato", "mco", "envo", "chebi", "go", "ncbitaxon"]
target_results = results_df[results_df["match_ontology"].str.lower().isin(target_ontologies)]

In [75]:
print(f"\n=== Phase 2 Recommendations ===")
print(f"\nTarget ontologies: {', '.join(target_ontologies)}")
print(f"Results from target ontologies: {len(target_results)} / {len(results_df)} ({len(target_results)/len(results_df)*100:.1f}%)")
print(f"\nTarget ontology breakdown:")



=== Phase 2 Recommendations ===

Target ontologies: omp, pato, mco, envo, chebi, go, ncbitaxon
Results from target ontologies: 905 / 5527 (16.4%)

Target ontology breakdown:


In [76]:
target_results["match_ontology"].value_counts()

match_ontology
ncbitaxon    481
pato         132
go            64
omp           61
OMP           47
mco           31
envo          24
chebi         18
PATO          13
GO             9
CHEBI          8
ENVO           8
NCBITAXON      7
MCO            2
Name: count, dtype: int64

In [77]:
exact_matches = results_df[results_df["similarity_ratio"] == 1.0]

In [78]:
# Summary statistics
print("\n=== Summary Statistics ===")
print(f"Total METPO terms searched: {metpo_df.shape[0]}")
print(f"Total results found: {len(results_df)}")
print(f"Average results per term: {len(results_df) / metpo_df.shape[0]:.1f}")
print(f"Unique ontologies found: {results_df['match_ontology'].nunique()}")
print(f"Terms with exact matches: {exact_matches['metpo_label'].nunique()}")
print(f"Terms with NO results: {metpo_df.shape[0] - coverage.shape[0]}")


=== Summary Statistics ===
Total METPO terms searched: 50
Total results found: 5527
Average results per term: 110.5
Unique ontologies found: 472
Terms with exact matches: 19
Terms with NO results: 1


In [79]:
exact_matches

Unnamed: 0,metpo_id,metpo_label,source,match_label,match_iri,match_ontology,match_definition,levenshtein_distance,similarity_ratio
450,METPO:1000802,Anaerobic respiration,OLS,anaerobic respiration,http://purl.obolibrary.org/obo/GO_0009061,go,The enzymatic release of energy from inorganic...,0,1.0
451,METPO:1000802,Anaerobic respiration,OLS,anaerobic respiration,http://purl.obolibrary.org/obo/GO_0009061,gaz,,0,1.0
452,METPO:1000802,Anaerobic respiration,OLS,anaerobic respiration,http://purl.obolibrary.org/obo/GO_0009061,envo,,0,1.0
453,METPO:1000802,Anaerobic respiration,OLS,anaerobic respiration,http://purl.obolibrary.org/obo/GO_0009061,ecocore,The enzymatic release of energy from inorganic...,0,1.0
454,METPO:1000802,Anaerobic respiration,OLS,anaerobic respiration,http://purl.obolibrary.org/obo/GO_0009061,oba,,0,1.0
...,...,...,...,...,...,...,...,...,...
5537,METPO:1000803,Oxidative phosphorylation,BioPortal,oxidative phosphorylation,http://purl.obolibrary.org/obo/GO_0006119,PHAGE,The phosphorylation of ADP to ATP that accompa...,0,1.0
5538,METPO:1000803,Oxidative phosphorylation,BioPortal,oxidative phosphorylation,http://purl.obolibrary.org/obo/GO_0006119,COVID-19,The phosphorylation of ADP to ATP that accompa...,0,1.0
5539,METPO:1000803,Oxidative phosphorylation,BioPortal,Oxidative phosphorylation,http://purl.obolibrary.org/obo/IEV_0001431,PTS,"""The phosphorylation of ADP to ATP that accomp...",0,1.0
5725,METPO:1000615,mesophilic,OLS,mesophilic,http://purl.obolibrary.org/obo/MICRO_0000111,micro,,0,1.0
