In [68]:
import pystow
import rich
import bioontologies
from bioontologies.obograph import _clean_uri
import bioregistry
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
from matplotlib_venn import venn2
from dataclasses import dataclass
import json
from functools import lru_cache
import biomappings
from tabulate import tabulate
from IPython.display import HTML
import time
import pandas as pd
from collections import defaultdict

In [35]:
EVALUATION = pystow.module("biomappings", "evaluation")
SIDER_URL = "http://sideeffects.embl.de/media/download/meddra_all_se.tsv.gz"
CTD_URL = "https://ctdbase.org/reports/CTD_chem_gene_ixns.tsv.gz"


@dataclass
class Result:
    dataset: str
    source: str
    target: str
    total: int
    missing: int
    missing_biomappings: int

    @classmethod
    def from_dicts(
        cls,
        dataset,
        source,
        target,
        datasource_identifiers,
        ontology_external_identifiers,
        biomappings_external_identifiers,
    ):
        return Result(
            dataset=dataset,
            source=source,
            target=target,
            total=len(datasource_identifiers),
            missing=len(datasource_identifiers - ontology_external_identifiers),
            missing_biomappings=len(
                datasource_identifiers
                - ontology_external_identifiers
                - biomappings_external_identifiers
            ),
        )


evaluation_results = []

In [56]:
biomappings_dd = defaultdict(lambda: defaultdict(dict))
for mapping in biomappings.load_mappings():
    source_prefix = mapping["source prefix"]
    source_resource = bioregistry.get_resource(source_prefix)
    source_id = source_resource.standardize_identifier(mapping["source identifier"])
    target_prefix = mapping["target prefix"]
    target_resource = bioregistry.get_resource(target_prefix)
    target_id = target_resource.standardize_identifier(mapping["target identifier"])
    biomappings_dd[source_prefix][target_prefix][source_id] = target_id
    biomappings_dd[target_prefix][source_prefix][target_id] = source_id
biomappings_dd = {k: dict(v) for k, v in biomappings_dd.items()}

{'agrovoc': {'agro': {'0619dd9e': '00000137',
   '0a9fbc47': '00000456',
   '1012': '00000066',
   '10457': '00000116',
   '1101': '00000040',
   '12679': '00000220',
   '13452': '00000093',
   '13811': '00000229',
   '1410249722990': '00000329',
   '155': '00000037',
   '15970': '00000047',
   '16097': '00000580',
   '16118': '00000154',
   '16650': '00002009',
   '1775': '00000473',
   '17857': '00000321',
   '1882e93f': '00000365',
   '1972': '00000325',
   '200': '00000197',
   '2019': '00000393',
   '207': '00000584',
   '2151': '00000046',
   '2386': '00000396',
   '24001': '00000540',
   '24065': '00010132',
   '24262': '00002042',
   '24947': '00000204',
   '24975': '00020001',
   '25021': '00000272',
   '2502372e': '00000331',
   '25206': '00000112',
   '25243': '00000426',
   '25265': '00000520',
   '25305': '00000589',
   '25354': '00000588',
   '25356': '00000055',
   '25674': '00000463',
   '25695': '00000129',
   '25720': '00000412',
   '25763': '00000540',
   '25779': '0

In [70]:
print(time.asctime())
g = get_graph("chebi", "http://purl.obolibrary.org/obo/chebi.owl")
print(time.asctime())
print(len(g.nodes), len(g.edges))

Thu Aug  4 15:51:03 2022
Thu Aug  4 15:51:03 2022
180322 322739


In [71]:
@lru_cache
def get_graph(prefix: str, graph_uri=None):
    parse_results = bioontologies.get_obograph_by_prefix(prefix)
    if len(parse_results.graph_document.graphs) == 1:
        return parse_results.graph_document.graphs[0]
    if graph_uri is None:
        uris = sorted(graph.id for graph in parse_results.graph_document.graphs)
        raise ValueError(f"need a graph_uri for {prefix} since it has multiple graphs: {uris}.")
    return next(
        graph 
        for graph in parse_results.graph_document.graphs 
        if graph.id == graph_uri
    )
    
def get_primary_mappings(prefix: str, graph_uri: str, external_prefix: str) -> set[str]:
    cache_path = EVALUATION.join("mappings", name=f"{prefix}_{external_prefix}.json")
    if cache_path.is_file():
        return json.loads(cache_path.read_text())

    graph = get_graph(prefix, graph_uri)
    rv = {}
    for node in tqdm(graph.nodes, unit="node", unit_scale=True, desc=f"Extracting {external_prefix} from {prefix}"):
        for xref in node.xrefs:
            xref_prefix, xref_identifier = bioregistry.parse_curie(xref.val)
            if xref_prefix != external_prefix:
                continue
            rv[xref_identifier] = _clean_uri(node.id, keep_invalid=False)
    
    cache_path.write_text(json.dumps(rv, indent=2, sort_keys=True))
    return graph.version, rv

config = [
    ("doid", "umls", "http://purl.obolibrary.org/obo/doid.owl"),
    ("doid", "mesh", "http://purl.obolibrary.org/obo/doid.owl"),
    ("doid", "mondo", "http://purl.obolibrary.org/obo/doid.owl"),
    ("doid", "efo", "http://purl.obolibrary.org/obo/doid.owl"),

    ("mondo", "umls", "http://purl.obolibrary.org/obo/mondo.owl"),
    ("mondo", "mesh", "http://purl.obolibrary.org/obo/mondo.owl"),
    ("mondo", "doid", "http://purl.obolibrary.org/obo/mondo.owl"),
    ("mondo", "efo", "http://purl.obolibrary.org/obo/mondo.owl"),

    ("efo", "mesh", "http://www.ebi.ac.uk/efo/efo.obo"),
    ("efo", "doid", "http://www.ebi.ac.uk/efo/efo.obo"),

    ("hp", "mesh", "http://purl.obolibrary.org/obo/hp.owl"),

    ("go", "mesh", "http://purl.obolibrary.org/obo/go.owl"),
    ("go", "reactome", "http://purl.obolibrary.org/obo/go.owl"),
    ("go", "wikipathways", "http://purl.obolibrary.org/obo/go.owl"),

    ("uberon", "mesh", "http://purl.obolibrary.org/obo/uberon.owl"),

    ("cl", "mesh", "http://purl.obolibrary.org/obo/cl.owl"),
    
    ("chebi", "mesh", "http://purl.obolibrary.org/obo/chebi.owl"),
    ("chebi", "ncit", "http://purl.obolibrary.org/obo/chebi.owl"),
]

primary_dd = defaultdict(dict)
summary_rows = []
for prefix, external, uri in config:
    version, primary =  = get_primary_mappings(prefix, uri, external)
    primary_dd[external][prefix] = primary
    n_primary = len(primary)
    
    bm = biomappings_dd.get(external, {}).get(prefix, {})
    n_biomappings = len(bm)
    n_total = len(set(primary).union(bm))
    gain = 100 * n_biomappings / n_primary if n_primary else None
    
    summary_rows.append((
        prefix, 
        version,
        external, 
        n_primary,
        n_biomappings,
        n_total,
        gain,
    ))

pd.DataFrame(
    summary_rows,
    columns=[
        "resource", "version", "external", 
        "primary_xrefs", "biomappings_xrefs", 
        "total_xrefs", "percentage_gain",
    ],
)

Extracting mesh from hp:   0%|          | 0.00/31.6k [00:00<?, ?node/s]

Extracting mesh from chebi:   0%|          | 0.00/180k [00:00<?, ?node/s]

Extracting ncit from chebi:   0%|          | 0.00/180k [00:00<?, ?node/s]

Unnamed: 0,resource,external,primary_xrefs,biomappings_xrefs,total_xrefs,percentage_gain
0,doid,umls,6852,121,6912,1.765908
1,doid,mesh,3249,1306,4046,40.196984
2,doid,mondo,0,0,0,
3,doid,efo,131,61,192,46.564885
4,mondo,umls,17886,0,17886,0.0
5,mondo,mesh,8295,206,8314,2.483424
6,mondo,doid,9886,0,9886,0.0
7,mondo,efo,2867,0,2867,0.0
8,efo,mesh,7703,96,7778,1.246268
9,efo,doid,5623,61,5674,1.08483


# CTD


In [40]:
"""
ChemicalName
ChemicalID (MeSH identifier)
CasRN (CAS Registry Number, if available)
GeneSymbol
GeneID (NCBI Gene identifier)
GeneForms ('|'-delimited list)
Organism (scientific name)
OrganismID (NCBI Taxonomy identifier)
Interaction
InteractionActions ('|'-delimited list)
PubMedIDs ('|'-delimited list)
"""

ctd_header = [
    "chemical_name",
    "chemical_mesh_id",
    "chemical_cas",
    "gene_symbol",
    "gene_ncbigene_id",
    "gene_forms",
    "organism_name",
    "organism_ncbitaxon_id",
    "interaction",
    "pubmed_ids",
]
ctd_df = EVALUATION.ensure_csv(
    url=CTD_URL,
    read_csv_kwargs=dict(sep="\t", comment="#", header=None, dtype=str, keep_default_na=False),
)
ctd_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,10074-G5,C534883,,AR,367,protein,Homo sapiens,9606,10074-G5 affects the reaction [MYC protein res...,affects^reaction|increases^expression,32184358
1,10074-G5,C534883,,AR,367,protein,Homo sapiens,9606,10074-G5 inhibits the reaction [EPHB2 protein ...,decreases^reaction|increases^expression,32184358
2,10074-G5,C534883,,AR,367,protein,Homo sapiens,9606,10074-G5 results in decreased expression of AR...,decreases^expression,32184358
3,10074-G5,C534883,,AR,367,protein,Homo sapiens,9606,10074-G5 results in decreased expression of AR...,decreases^expression,32184358
4,10074-G5,C534883,,EPHB2,2048,protein,Homo sapiens,9606,10074-G5 inhibits the reaction [EPHB2 protein ...,decreases^reaction|increases^expression,32184358


# SIDER Side Effects

In [2]:
SIDE_EFFECTS_HEADER = [
    "STITCH_FLAT_ID",
    "STITCH_STEREO_ID",
    "UMLS CUI from Label",
    "MedDRA Concept Type",
    "UMLS CUI from MedDRA",
    "MedDRA Concept name",
]

side_effects_df = EVALUATION.ensure_csv(
    url=SIDER_URL,
    read_csv_kwargs=dict(
        dtype=str,
        header=None,
        names=SIDE_EFFECTS_HEADER,
    ),
)
side_effects_df

Unnamed: 0,STITCH_FLAT_ID,STITCH_STEREO_ID,UMLS CUI from Label,MedDRA Concept Type,UMLS CUI from MedDRA,MedDRA Concept name
0,CID100000085,CID000010917,C0000729,LLT,C0000729,Abdominal cramps
1,CID100000085,CID000010917,C0000729,PT,C0000737,Abdominal pain
2,CID100000085,CID000010917,C0000737,LLT,C0000737,Abdominal pain
3,CID100000085,CID000010917,C0000737,PT,C0687713,Gastrointestinal pain
4,CID100000085,CID000010917,C0000737,PT,C0000737,Abdominal pain
...,...,...,...,...,...,...
309844,CID171306834,CID071306834,C3203358,PT,C1145670,Respiratory failure
309845,CID171306834,CID071306834,C3665386,LLT,C3665386,Abnormal vision
309846,CID171306834,CID071306834,C3665386,PT,C3665347,Visual impairment
309847,CID171306834,CID071306834,C3665596,LLT,C3665596,Warts


In [32]:
datasource_identifiers = set(side_effects_df["UMLS CUI from MedDRA"].unique())
ontology_external_identifiers = set(get_incoming_xrefs(graph, "umls"))
biomappings_external_identifiers = set(biomappings_dd["umls"]["doid"])

  0%|          | 0/13554 [00:00<?, ?it/s]

In [33]:
n_missing = len(datasource_identifiers - ontology_external_identifiers)
percentage_missing = n_missing / len(datasource_identifiers)

n_missing_biomappings = len(
    datasource_identifiers - ontology_external_identifiers - biomappings_external_identifiers
)
percentage_missing_biomappings = n_missing_biomappings / len(datasource_identifiers)

print(
    f"""\
Total:                      {len(datasource_identifiers):,}
Missing w/ DOID:            {n_missing:,} ({percentage_missing:.2%})
Missing w/ DOID+Biomappings {n_missing_biomappings:,} ({percentage_missing_biomappings:.2%})"""
)

result = Result.from_dicts(
    dataset="sider",
    source="umls",
    target="doid",
    datasource_identifiers=datasource_identifiers,
    ontology_external_identifiers=ontology_external_identifiers,
    biomappings_external_identifiers=biomappings_external_identifiers,
)

evaluation_results.append(result)

Total:                      6,061
Missing w/ DOID:            4,896 (80.78%)
Missing w/ DOID+Biomappings 4,890 (80.68%)
