import requests
import json
import os
import pandas as pd
import time
from pathlib import Path
from dotenv import load_dotenv
from collections import Counter
import Levenshtein

# Load environment variables
env_path = Path("..") / ".env"
load_dotenv(dotenv_path=env_path)

BIOPORTAL_API_KEY = os.getenv("BIOPORTAL_API_KEY")

if not BIOPORTAL_API_KEY:
    print("⚠️  Warning: BIOPORTAL_API_KEY not set")
else:
    print("✓ BioPortal API key loaded")

In [1]:
import requests
import json
import os
import pandas as pd
import time
from pathlib import Path
from dotenv import load_dotenv
from collections import Counter
import Levenshtein

# Load environment variables
env_path = Path("..") / ".env"
load_dotenv(dotenv_path=env_path)

BIOPORTAL_API_KEY = os.getenv("BIOPORTAL_API_KEY")

if not BIOPORTAL_API_KEY:
    print("⚠️  Warning: BIOPORTAL_API_KEY not set")
else:
    print("✓ BioPortal API key loaded")

# Define search functions
def search_ols(label, rows=75):
    """Search OLS4 for a label"""
    url = "https://www.ebi.ac.uk/ols4/api/search"
    params = {"q": label, "type": "class", "rows": rows}
    
    try:
        response = requests.get(url, params=params, timeout=10)
        if response.status_code == 200:
            data = response.json()
            docs = data.get("response", {}).get("docs", [])
            return [{"label": d.get("label"), 
                     "iri": d.get("iri"), 
                     "ontology": d.get("ontology_name"),
                     "definition": d.get("description", [""])[0] if d.get("description") else ""} 
                    for d in docs]
        return []
    except Exception as e:
        print(f"OLS error for '{label}': {e}")
        return []

def search_bioportal(label, api_key, pagesize=75):
    """Search BioPortal for a label"""
    url = "https://data.bioontology.org/search"
    params = {"q": label, "pagesize": pagesize}
    headers = {}
    
    if api_key:
        headers["Authorization"] = f"apikey token={api_key}"
    
    try:
        response = requests.get(url, params=params, headers=headers, timeout=10)
        if response.status_code == 200:
            data = response.json()
            collection = data.get("collection", [])
            results = []
            for item in collection:
                ontology_url = item.get("links", {}).get("ontology", "")
                ontology = ontology_url.split("/")[-1] if ontology_url else "unknown"
                
                results.append({
                    "label": item.get("prefLabel"),
                    "iri": item.get("@id"),
                    "ontology": ontology,
                    "definition": item.get("definition", [""])[0] if isinstance(item.get("definition"), list) else item.get("definition", "")
                })
            return results
        return []
    except Exception as e:
        print(f"BioPortal error for '{label}': {e}")
        return []

# Load METPO sample labels (20% sample = 50 classes)
metpo_df = pd.read_csv("metpo_sample_labels.tsv", sep="\t", names=["metpo_id", "metpo_label"])
print(f"\nLoaded {len(metpo_df)} METPO labels (20% sample)")
print(f"Estimated runtime: ~{len(metpo_df) * 2} seconds (~{len(metpo_df) * 2 / 60:.1f} minutes)")
print("\nSample:")
print(metpo_df.head(10))

✓ BioPortal API key loaded

Loaded 50 METPO labels (20% sample)
Estimated runtime: ~100 seconds (~1.7 minutes)

Sample:
        metpo_id                     metpo_label
0  METPO:1000678                     oval shaped
1  METPO:1000481                 NaCl delta mid2
2  METPO:1001002  growth temperature observation
3  METPO:1000802           Anaerobic respiration
4  METPO:1000883          cell length very small
5  METPO:1000664             organoheterotrophic
6  METPO:1001023                  pH observation
7  METPO:1000476                   pH delta mid2
8  METPO:1000614                   psychrophilic
9  METPO:1000304             temperature optimum


In [2]:
# Run batch search with rate limiting
all_results = []

for idx, row in metpo_df.iterrows():
    metpo_id = row["metpo_id"]
    metpo_label = row["metpo_label"]
    
    print(f"\n[{idx+1}/{len(metpo_df)}] Searching: {metpo_label}")
    
    # Search OLS
    ols_results = search_ols(metpo_label)
    print(f"  OLS: {len(ols_results)} results")
    
    for result in ols_results:
        all_results.append({
            "metpo_id": metpo_id,
            "metpo_label": metpo_label,
            "source": "OLS",
            "match_label": result["label"],
            "match_iri": result["iri"],
            "match_ontology": result["ontology"],
            "match_definition": result["definition"]
        })
    
    # Rate limiting between API calls
    time.sleep(1)
    
    # Search BioPortal
    bp_results = search_bioportal(metpo_label, BIOPORTAL_API_KEY)
    print(f"  BioPortal: {len(bp_results)} results")
    
    for result in bp_results:
        all_results.append({
            "metpo_id": metpo_id,
            "metpo_label": metpo_label,
            "source": "BioPortal",
            "match_label": result["label"],
            "match_iri": result["iri"],
            "match_ontology": result["ontology"],
            "match_definition": result["definition"]
        })
    
    # Rate limiting between METPO terms
    time.sleep(1)

print(f"\n✓ Complete: {len(all_results)} total results")


[1/50] Searching: oval shaped
  OLS: 75 results
  BioPortal: 75 results

[2/50] Searching: NaCl delta mid2
  OLS: 75 results
  BioPortal: 75 results

[3/50] Searching: growth temperature observation
  OLS: 75 results
  BioPortal: 75 results

[4/50] Searching: Anaerobic respiration
  OLS: 75 results
  BioPortal: 75 results

[5/50] Searching: cell length very small
  OLS: 75 results
  BioPortal: 75 results

[6/50] Searching: organoheterotrophic
  OLS: 0 results
  BioPortal: 2 results

[7/50] Searching: pH observation
  OLS: 75 results
  BioPortal: 75 results

[8/50] Searching: pH delta mid2
  OLS: 75 results
  BioPortal: 75 results

[9/50] Searching: psychrophilic
  OLS: 30 results
  BioPortal: 13 results

[10/50] Searching: temperature optimum
  OLS: 75 results
  BioPortal: 75 results

[11/50] Searching: dumbbell shaped
  OLS: 75 results
  BioPortal: 75 results

[12/50] Searching: NaCl range observation
  OLS: 75 results
  BioPortal: 75 results

[13/50] Searching: carotenoid pigmentati

In [3]:
def search_ols(label, rows=75):
    """Search OLS4 for a label"""
    url = "https://www.ebi.ac.uk/ols4/api/search"
    params = {"q": label, "type": "class", "rows": rows}  # Get up to 75 results
    
    try:
        response = requests.get(url, params=params, timeout=10)
        if response.status_code == 200:
            data = response.json()
            docs = data.get("response", {}).get("docs", [])
            return [{"label": d.get("label"), 
                     "iri": d.get("iri"), 
                     "ontology": d.get("ontology_name"),
                     "definition": d.get("description", [""])[0] if d.get("description") else ""} 
                    for d in docs]
        return []
    except Exception as e:
        print(f"OLS error for '{label}': {e}")
        return []

def search_bioportal(label, api_key, pagesize=75):
    """Search BioPortal for a label"""
    url = "https://data.bioontology.org/search"
    params = {"q": label, "pagesize": pagesize}  # Get up to 75 results per page
    headers = {}
    
    if api_key:
        headers["Authorization"] = f"apikey token={api_key}"
    
    try:
        response = requests.get(url, params=params, headers=headers, timeout=10)
        if response.status_code == 200:
            data = response.json()
            collection = data.get("collection", [])
            results = []
            for item in collection:
                # Extract ontology acronym from links
                ontology_url = item.get("links", {}).get("ontology", "")
                ontology = ontology_url.split("/")[-1] if ontology_url else "unknown"
                
                results.append({
                    "label": item.get("prefLabel"),
                    "iri": item.get("@id"),
                    "ontology": ontology,
                    "definition": item.get("definition", [""])[0] if isinstance(item.get("definition"), list) else item.get("definition", "")
                })
            return results
        return []
    except Exception as e:
        print(f"BioPortal error for '{label}': {e}")
        return []

In [4]:
# Run batch search
all_results = []

for idx, row in metpo_df.iterrows():
    metpo_id = row["metpo_id"]
    metpo_label = row["metpo_label"]
    
    print(f"\n[{idx+1}/{len(metpo_df)}] Searching: {metpo_label}")
    
    # Search OLS
    ols_results = search_ols(metpo_label)
    print(f"  OLS: {len(ols_results)} results")
    
    for result in ols_results:
        all_results.append({
            "metpo_id": metpo_id,
            "metpo_label": metpo_label,
            "source": "OLS",
            "match_label": result["label"],
            "match_iri": result["iri"],
            "match_ontology": result["ontology"],
            "match_definition": result["definition"]
        })
    
    # Search BioPortal
    bp_results = search_bioportal(metpo_label, BIOPORTAL_API_KEY)
    print(f"  BioPortal: {len(bp_results)} results")
    
    for result in bp_results:
        all_results.append({
            "metpo_id": metpo_id,
            "metpo_label": metpo_label,
            "source": "BioPortal",
            "match_label": result["label"],
            "match_iri": result["iri"],
            "match_ontology": result["ontology"],
            "match_definition": result["definition"]
        })
    
    # Rate limiting
    time.sleep(0.5)

print(f"\n✓ Complete: {len(all_results)} total results")


[1/50] Searching: oval shaped
  OLS: 75 results
  BioPortal: 75 results

[2/50] Searching: NaCl delta mid2
  OLS: 75 results
  BioPortal: 75 results

[3/50] Searching: growth temperature observation
  OLS: 75 results
  BioPortal: 75 results

[4/50] Searching: Anaerobic respiration
  OLS: 75 results
  BioPortal: 75 results

[5/50] Searching: cell length very small
  OLS: 75 results
  BioPortal: 75 results

[6/50] Searching: organoheterotrophic
  OLS: 0 results
  BioPortal: 2 results

[7/50] Searching: pH observation
  OLS: 75 results
  BioPortal: 75 results

[8/50] Searching: pH delta mid2
  OLS: 75 results
  BioPortal: 75 results

[9/50] Searching: psychrophilic
  OLS: 30 results
  BioPortal: 13 results

[10/50] Searching: temperature optimum
  OLS: 75 results
  BioPortal: 75 results

[11/50] Searching: dumbbell shaped
  OLS: 75 results
  BioPortal: 75 results

[12/50] Searching: NaCl range observation
  OLS: 75 results
  BioPortal: 75 results

[13/50] Searching: carotenoid pigmentati

In [5]:
# Convert to DataFrame and calculate string distances
results_df = pd.DataFrame(all_results)

# Calculate Levenshtein distance and normalized similarity
results_df["levenshtein_distance"] = results_df.apply(
    lambda row: Levenshtein.distance(
        str(row["metpo_label"]).lower(), 
        str(row["match_label"]).lower()
    ) if pd.notna(row["match_label"]) else None,
    axis=1
)

# Normalized similarity ratio (0-1, where 1 is exact match)
results_df["similarity_ratio"] = results_df.apply(
    lambda row: Levenshtein.ratio(
        str(row["metpo_label"]).lower(), 
        str(row["match_label"]).lower()
    ) if pd.notna(row["match_label"]) else None,
    axis=1
)

print(f"Results shape: {results_df.shape}")
print(f"\nSimilarity statistics:")
print(results_df["similarity_ratio"].describe())
results_df.head()

Results shape: (5934, 9)

Similarity statistics:
count    5934.000000
mean        0.538369
std         0.197966
min         0.000000
25%         0.400000
50%         0.521739
75%         0.666667
max         1.000000
Name: similarity_ratio, dtype: float64


Unnamed: 0,metpo_id,metpo_label,source,match_label,match_iri,match_ontology,match_definition,levenshtein_distance,similarity_ratio
0,METPO:1000678,oval shaped,OLS,Oval shaped lesion,http://snomed.info/id/129734009,snomed,,7,0.758621
1,METPO:1000678,oval shaped,OLS,Oval,http://purl.obolibrary.org/obo/NCIT_C48345,ncit,The shape of either an oval or an ellipse. Se...,7,0.533333
2,METPO:1000678,oval shaped,OLS,Oval macrocyte,http://snomed.info/id/117170009,snomed,,9,0.56
3,METPO:1000678,oval shaped,OLS,oval nucleus,http://purl.obolibrary.org/obo/EMAPA_37667,emapa,,6,0.521739
4,METPO:1000678,oval shaped,OLS,oval nucleus,http://purl.obolibrary.org/obo/MA_0000929,ma,,6,0.521739


In [6]:
# Save raw results with all columns
output_file = "phase1_raw_results.tsv"
results_df.to_csv(output_file, sep="\t", index=False)
print(f"✓ Saved full results to {output_file}")

# Also save high-quality matches separately
high_quality = results_df[results_df["similarity_ratio"] >= 0.5]
hq_file = "phase1_high_quality_matches.tsv"
high_quality.to_csv(hq_file, sep="\t", index=False)
print(f"✓ Saved high-quality matches (similarity ≥ 0.5) to {hq_file}")

# Save summary statistics
summary_stats = {
    "total_metpo_terms": len(metpo_df),
    "total_results": len(results_df),
    "high_quality_results": len(high_quality),
    "unique_ontologies": results_df["match_ontology"].nunique(),
    "avg_similarity": results_df["similarity_ratio"].mean(),
    "median_similarity": results_df["similarity_ratio"].median()
}
with open("phase1_summary_stats.json", "w") as f:
    json.dump(summary_stats, f, indent=2)
print(f"✓ Saved summary statistics to phase1_summary_stats.json")

✓ Saved full results to phase1_raw_results.tsv
✓ Saved high-quality matches (similarity ≥ 0.5) to phase1_high_quality_matches.tsv
✓ Saved summary statistics to phase1_summary_stats.json


## Analysis: Ontology Frequency

In [7]:
# Count ontologies
ontology_counts = results_df["match_ontology"].value_counts()
print("Top 20 ontologies by result count:\n")
print(ontology_counts.head(20))

Top 20 ontologies by result count:

match_ontology
ncbitaxon    481
snomed       475
METPO        408
ncit         349
SNOMEDCT     160
pato         145
mesh         132
micro        130
OCHV         118
flopo        106
afo           95
RCD           87
IOBC          82
RH-MESH       76
FLOPO         72
NCIT          71
OMIM          66
upheno        64
go            64
omit          63
Name: count, dtype: int64


In [8]:
# Break down by source
print("\n=== OLS Ontologies ===")
ols_df = results_df[results_df["source"] == "OLS"]
print(ols_df["match_ontology"].value_counts().head(20))

print("\n=== BioPortal Ontologies ===")
bp_df = results_df[results_df["source"] == "BioPortal"]
print(bp_df["match_ontology"].value_counts().head(20))


=== OLS Ontologies ===
match_ontology
ncbitaxon    481
snomed       475
ncit         349
pato         145
mesh         132
micro        130
flopo        106
afo           95
upheno        64
go            64
omit          63
omp           61
doid          51
pr            46
oba           39
fypo          31
mco           30
cmo           28
efo           28
hp            23
Name: count, dtype: int64

=== BioPortal Ontologies ===
match_ontology
METPO        408
SNOMEDCT     160
OCHV         118
RCD           87
IOBC          82
RH-MESH       76
FLOPO         72
NCIT          71
OMIM          66
LOINC         49
BERO          48
OMP           47
UPHENO        46
MESH          43
PMAPP-PMO     41
PR            36
HP            33
CCO           31
MPO           31
D3O           28
Name: count, dtype: int64


## Analysis: Label Similarity

In [9]:
# Analyze ontology distribution by similarity quality
print(f"\n=== Phase 2 Filter Recommendations ===\")

# Filter to high-quality matches (similarity > 0.5)
high_quality = results_df[results_df["similarity_ratio"] >= 0.5]
print(f"\nHigh-quality matches (similarity ≥ 0.5): {len(high_quality)} / {len(results_df)} ({len(high_quality)/len(results_df)*100:.1f}%)")

# Top ontologies by high-quality match count
print(f"\nTop ontologies by HIGH-QUALITY match count:")
hq_ontology_counts = high_quality["match_ontology"].value_counts()
print(hq_ontology_counts.head(20))

# Compare to all matches
print(f"\nTop ontologies by TOTAL match count (including low similarity):")
print(ontology_counts.head(20))

# Show cumulative coverage for high-quality matches
cumulative_pct = (hq_ontology_counts.cumsum() / hq_ontology_counts.sum() * 100)
print(f"\nCumulative coverage of high-quality matches:")
if len(cumulative_pct) >= 5:
    print(f"  Top 5 ontologies: {cumulative_pct.iloc[4]:.1f}%")
if len(cumulative_pct) >= 10:
    print(f"  Top 10 ontologies: {cumulative_pct.iloc[9]:.1f}%")
if len(cumulative_pct) >= 20:
    print(f"  Top 20 ontologies: {cumulative_pct.iloc[19]:.1f}%")

# Show average similarity by ontology
print(f"\nAverage similarity ratio by ontology (top 20):")
ontology_avg_sim = results_df.groupby("match_ontology")["similarity_ratio"].agg(["mean", "count"]).sort_values("mean", ascending=False)
print(ontology_avg_sim.head(20))

SyntaxError: unterminated string literal (detected at line 2) (1663184465.py, line 2)

In [None]:
# Show exact matches by ontology
print("\nExact matches by ontology:")
print(exact_matches["match_ontology"].value_counts())

## Analysis: Coverage by METPO Term

In [None]:
# How many results per METPO term?
coverage = results_df.groupby("metpo_label").agg({
    "match_label": "count",
    "match_ontology": lambda x: x.nunique()
}).rename(columns={"match_label": "total_results", "match_ontology": "unique_ontologies"})

coverage = coverage.sort_values("total_results", ascending=False)
print("Results per METPO term:\n")
print(coverage)

## Recommendations for Phase 2 Filters

In [None]:
# Recommended ontology whitelist (top ontologies of interest)
target_ontologies = ["omp", "pato", "mco", "envo", "chebi", "go", "ncbitaxon"]
target_results = results_df[results_df["match_ontology"].str.lower().isin(target_ontologies)]

print(f"\n=== Phase 2 Recommendations ===")
print(f"\nTarget ontologies: {', '.join(target_ontologies)}")
print(f"Results from target ontologies: {len(target_results)} / {len(results_df)} ({len(target_results)/len(results_df)*100:.1f}%)")
print(f"\nTarget ontology breakdown:")
print(target_results["match_ontology"].value_counts())

In [None]:
# Summary statistics
print("\n=== Summary Statistics ===")
print(f"Total METPO terms searched: {metpo_df.shape[0]}")
print(f"Total results found: {len(results_df)}")
print(f"Average results per term: {len(results_df) / metpo_df.shape[0]:.1f}")
print(f"Unique ontologies found: {results_df['match_ontology'].nunique()}")
print(f"Terms with exact matches: {exact_matches['metpo_label'].nunique()}")
print(f"Terms with NO results: {metpo_df.shape[0] - coverage.shape[0]}")