# World Health Organization to MONDO Matching #

In [None]:
!pip install pydantic-ai

In [None]:
import asyncio
import functools
import json
import os
from typing import Optional, Literal, List

import pandas as pd
import requests
from dotenv import load_dotenv
from oaklib import get_adapter
from oaklib.datamodels.search import SearchConfiguration
from pydantic import BaseModel, ValidationError, Field
from pydantic_ai import Agent, RunContext, Tool
from tqdm import tqdm

# Populate the process environment (presumably from a local .env file) before
# the key is read.
load_dotenv()

# Stays None when the variable is not defined.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

In [None]:
# Ontology adapters obtained through the "ontobee" selector.
# `mondo` backs is_human_disease() and search_mondo(); `stato` backs search_stato().
mondo = get_adapter("ontobee:mondo")
stato = get_adapter("ontobee:stato")

In [None]:
# MONDO CURIE used as the root of the human-disease branch: is_human_disease()
# accepts a term only when this CURIE appears among the term's ancestors.
# NOTE(review): the label of MONDO:0700096 is not shown here — confirm it is "human disease".
HUMAN_DISEASE_ROOT = "MONDO:0700096"

@functools.lru_cache(maxsize=None)
def is_human_disease(curie: str) -> bool:
    """Return True when *curie* has ``HUMAN_DISEASE_ROOT`` among its ancestors.

    The answer is memoised per CURIE: ``search_mondo`` calls this once for
    every hit, and repeated searches (e.g. retries with synonyms) tend to
    return overlapping hits, so caching avoids asking the adapter for the
    same ancestor set again.

    Args:
        curie: Identifier of the MONDO term to test.

    Returns:
        True if ``HUMAN_DISEASE_ROOT`` is reported by ``mondo.ancestors(curie)``,
        otherwise False.
    """
    # Stop at the first match instead of materialising every ancestor.
    return any(
        ancestor == HUMAN_DISEASE_ROOT for ancestor in mondo.ancestors(curie)
    )

def search_mondo(label: str) -> List[dict]:
    """Search the MONDO Ontology for disease identifiers."""
    # NOTE(review): this function is handed to the agent via tools=[...]; the
    # docstring above is presumably what the model sees as the tool
    # description, so it is kept verbatim — change its wording deliberately.
    #
    # Runs a partial-match search for `label`, drops hits that
    # is_human_disease() rejects, and returns one dict per remaining hit with
    # the keys "id", "label" and "definition".
    partial_match = SearchConfiguration(is_partial=True)
    hits = list(mondo.basic_search(label, partial_match))
    return [
        {
            "id": hit,
            "label": mondo.label(hit),
            "definition": mondo.definition(hit),
        }
        for hit in hits
        if is_human_disease(hit)
    ]

def search_stato(label: str) -> List[dict]:
    """Search the STATO Ontology for Prevalence, Incidence, or Count identifiers."""
    # NOTE(review): this function is handed to the agent via tools=[...]; the
    # docstring above is presumably what the model sees as the tool
    # description, so it is kept verbatim — change its wording deliberately.
    #
    # Runs the adapter's default search for `label` (no filtering) and returns
    # one dict per hit with the keys "id", "label" and "definition".
    matches = list(stato.basic_search(label))
    return [
        {
            "id": match,
            "label": stato.label(match),
            "definition": stato.definition(match),
        }
        for match in matches
    ]

In [None]:
# Data schema
# One annotated WHO indicator; the agent's output type is a list of these.
# NOTE(review): documented with `#` comments rather than a class docstring
# because pydantic may surface a class docstring in the generated schema that
# is sent to the model — confirm before converting these to a docstring.
class WHOAnnotation(BaseModel):
    # Indicator text; the only required field.
    IndicatorName: str
    # Identifier and label of the matched MONDO term; None when nothing matched.
    MONDO_ID: Optional[str] = None
    MONDO_Label: Optional[str] = None
    # Identifier and label of the matched STATO term; None when nothing matched.
    STATO_ID: Optional[str] = None
    STATO_Label: Optional[str] = None
    # Denominator taken from the indicator text (the prompt's example:
    # "per 100,000" -> 100000); None when there is none.
    Denominator: Optional[int] = None

In [None]:
# System prompt for the annotation agent. The text (including its leading
# newline and four-space indentation) is sent as written, so edit it with care.
PROP_AGENT_PROMPT = """
    You are an expert biocurator familiar with the MONDO Disease Ontology and disease terminology. Your task is to help curate ontology terms for given World Health Organization proportion estimates.
    The input data is a list of World Health Organization indicators each known as the 'IndicatorName', which has information about whether a statistic is either a prevalence, incidence, or count, and what the disease or characteristic it describes.

    After reading the IndicatorName column (one row at a time), use your function calling ability to fill out a TSV with the following six columns:
    1. IndicatorName -> leave the same as is in the input TSV
    2. MONDO ID -> if a disease or phenotypic entity is contained in the IndicatorName value, use the search_mondo function to search for the disease and return the MONDO ID. If at first you cannot find one, search again with various synonyms. If there is no disease or phenotypic entity in IndicatorName, or a match in MONDO, leave this field blank.
    3. MONDO Label -> if a disease or phenotypic entity is contained in the IndicatorName value, use the search_mondo function to search for the disease and return the MONDO Label. If at first you cannot find one, search again with various synonyms. If there is no disease or phenotypic entity in IndicatorName, or a match in MONDO, leave this field blank.
    4. STATO ID -> if contained in the IndicatorName value, use the search_stato function to search for one of 'prevalence, incidence, or count' and return the STATO ID
    5. STATO Label -> if contained in the IndicatorName value, use the search_stato function to search for one of 'prevalence, incidence, or count' and return the STATO Label
    6. Denominator -> strip the denominator from the IndicatorName value. For example, if you see 'per 100,000' input 100000 or if you see 'per 1000' input 1000 into the Denominator field. If there is not a feasible denominator value, leave this field blank.

    IMPORTANT: The ontology information gathered from search_mondo and search_stato are the source of truth about these ontologies. Do not create a new term or ID if a match does not exist, just leave the Ontology ID and Label fields blank!
    """

# Annotation agent: given WHO indicator names it may call the two ontology
# search tools and must answer with a list of WHOAnnotation records.
prop_agent = Agent(
    "openai:gpt-4.1",
    tools=[search_mondo, search_stato],
    system_prompt=PROP_AGENT_PROMPT,
    output_type=List[WHOAnnotation],
)

In [None]:
# Load the tab-separated WHO table (read_table defaults to a tab separator).
who_df = pd.read_table("who_incidence_data.tsv")

# Distinct indicator names with missing values removed.
unique_indicators = who_df["IndicatorName"].dropna().unique()
print(unique_indicators)