This notebook assesses the added value Biomappings gives to several resources.

In [1]:
import json
import sys
import time
from collections import Counter, defaultdict
from dataclasses import dataclass
from functools import lru_cache
from textwrap import dedent

import bioontologies
import bioregistry
import bioversions
import matplotlib.pyplot as plt
import pandas as pd
import pyobo
import pystow
from IPython.display import HTML
from matplotlib_venn import venn2
from tabulate import tabulate
from tqdm.auto import tqdm

import biomappings
from biomappings.paper_analysis import (
    EVALUATION,
    Result,
    get_graph,
    get_non_obo_mappings,
    get_obo_mappings,
    get_primary_mappings,
    index_mappings,
)

In [2]:
print(sys.version)
print(time.asctime())

3.10.8 (main, Oct 13 2022, 10:17:43) [Clang 14.0.0 (clang-1400.0.29.102)]
Mon Oct 31 15:20:43 2022


In [3]:
evaluation_results = []

# Importing Mappings

## Biomappings

Manually curated mappings from Biomappings

In [4]:
biomappings_dd = index_mappings(
    biomappings.load_mappings(),
    path=EVALUATION.join(name="positive_mapping_index.pkl"),
)

Predicted mappings from Biomappings

In [5]:
biomappings_predictions_dd = index_mappings(
    biomappings.load_predictions(),
    path=EVALUATION.join(name="predicted_mapping_index.pkl"),
)

## Primary Mappings

Get primary mappings from 1) OBO ontologies that can be parsed with ROBOT and 2) other resoruces, via PyOBO.

In [6]:
# Primary mappings from OBO and other sources are going in here
primary_dd = defaultdict(dict)
summary_rows = []

summary_rows.extend(get_obo_mappings(primary_dd, biomappings_dd))
summary_rows.extend(get_non_obo_mappings(primary_dd, biomappings_dd))



## Summary

In [7]:
summary_df = pd.DataFrame(
    summary_rows,
    columns=[
        "resource",
        "version",
        "external",
        "primary_xrefs",
        "biomappings_xrefs",
        "total_xrefs",
        "percentage_gain",
    ],
)
pd.option_context("display.max_rows", summary_df.shape[0])
summary_df

Unnamed: 0,resource,version,external,primary_xrefs,biomappings_xrefs,total_xrefs,percentage_gain
0,doid,2022-07-27,umls,6852,246,7037,3.6
1,doid,2022-07-27,mesh,3249,2905,5643,89.4
2,doid,2022-07-27,mondo,0,0,0,-
3,doid,2022-07-27,efo,131,126,257,96.2
4,mondo,2022-08-01,umls,16751,0,16751,0.0
5,mondo,2022-08-01,mesh,8114,414,8344,5.1
6,mondo,2022-08-01,doid,9886,0,9886,0.0
7,mondo,2022-08-01,efo,2862,0,2862,0.0
8,efo,3.43.0,mesh,0,217,217,inf
9,efo,3.43.0,doid,0,126,126,inf


# Assess Impact on Mapping Hetionet Datasources

## CTD Chemical-Gene Interactions

In [8]:
CTD_CHEMICAL_GENE_URL = "https://ctdbase.org/reports/CTD_chem_gene_ixns.tsv.gz"
ctd_header = [
    "chemical_name",
    "chemical_mesh_id",
    "chemical_cas",
    "gene_symbol",
    "gene_ncbigene_id",
    "gene_forms",
    "organism_name",
    "organism_ncbitaxon_id",
    "evidence",
    "interaction",
    "pubmed_ids",
]
ctd_gene_chemical_df = EVALUATION.ensure_csv(
    url=CTD_CHEMICAL_GENE_URL,
    read_csv_kwargs=dict(
        sep="\t",
        comment="#",
        header=None,
        dtype=str,
        keep_default_na=False,
        usecols=[1],
        squeeze=True,
    ),
)
ctd_gene_chemical_df.head()

0    C534883
1    C534883
2    C534883
3    C534883
4    C534883
Name: 1, dtype: object

In [9]:
result = Result.make(
    dataset="ctd-chemical-gene",
    source="mesh",
    target="chebi",
    datasource_identifiers=set(ctd_gene_chemical_df.tolist()),
    primary=primary_dd,
    secondary=biomappings_dd,
    tertiary=biomappings_predictions_dd,
)
result.print()
evaluation_results.append(result)

Missing                        Unmappable to chebi    % Unmappable
-----------------------------  ---------------------  --------------
Total in ctd-chemical-gene     14,337
Missing w/ mesh                14,337                 100.00%
Missing w/ mesh + BM.          12,884                 89.9%
Missing w/ mesh + BM. + Pred.  9,086                  63.4%


## CTD Chemical-Diseases

In [10]:
CTD_CHEMICAL_DISEASES_URL = "https://ctdbase.org/reports/CTD_chemicals_diseases.tsv.gz"
"""
    ChemicalName
    ChemicalID (MeSH identifier)
    CasRN (CAS Registry Number, if available)
    DiseaseName
    DiseaseID (MeSH or OMIM identifier)
    DirectEvidence ('|'-delimited list)
    InferenceGeneSymbol
    InferenceScore
    OmimIDs ('|'-delimited list)
    PubMedIDs ('|'-delimited list)
"""
ctd_chemical_diseases_df = EVALUATION.ensure_csv(
    url=CTD_CHEMICAL_DISEASES_URL,
    read_csv_kwargs=dict(
        sep="\t",
        comment="#",
        header=None,
        dtype=str,
        keep_default_na=False,
        usecols=[4],
        squeeze=True,
    ),
)
ctd_chemical_diseases_df.head()

0       MESH:D054198
1       MESH:D000230
2    MESH:D000077192
3       MESH:D000505
4       MESH:D013734
Name: 4, dtype: object

In [11]:
ctd_chemical_diseases_mesh = {
    x.split(":")[1] for x in ctd_chemical_diseases_df.tolist() if x.startswith("MESH")
}
ctd_chemical_diseases_omim = {
    x.split(":")[1] for x in ctd_chemical_diseases_df.tolist() if x.startswith("OMIM")
}

In [12]:
result = Result.make(
    dataset="ctd-gene-disease",
    source="mesh",
    target="doid",
    datasource_identifiers=ctd_chemical_diseases_mesh,
    primary=primary_dd,
    secondary=biomappings_dd,
    tertiary=biomappings_predictions_dd,
)
result.print()
evaluation_results.append(result)

Missing                        Unmappable to doid    % Unmappable
-----------------------------  --------------------  --------------
Total in ctd-gene-disease      5,821
Missing w/ mesh                3,342                 57.41%
Missing w/ mesh + BM.          2,675                 46.0%
Missing w/ mesh + BM. + Pred.  2,632                 45.2%


In [13]:
result = Result.make(
    dataset="ctd-gene-disease",
    source="mesh",
    target="mondo",
    datasource_identifiers=ctd_chemical_diseases_mesh,
    primary=primary_dd,
    secondary=biomappings_dd,
    tertiary=biomappings_predictions_dd,
)
result.print()
evaluation_results.append(result)

Missing                        Unmappable to mondo    % Unmappable
-----------------------------  ---------------------  --------------
Total in ctd-gene-disease      5,821
Missing w/ mesh                1,518                  26.08%
Missing w/ mesh + BM.          1,502                  25.8%
Missing w/ mesh + BM. + Pred.  1,458                  25.0%


# SIDER Side Effects

In [14]:
SIDER_URL = "http://sideeffects.embl.de/media/download/meddra_all_se.tsv.gz"

SIDE_EFFECTS_HEADER = [
    "STITCH_FLAT_ID",
    "STITCH_STEREO_ID",
    "UMLS CUI from Label",
    "MedDRA Concept Type",
    "UMLS CUI from MedDRA",
    "MedDRA Concept name",
]

side_effects_df = EVALUATION.ensure_csv(
    url=SIDER_URL,
    read_csv_kwargs=dict(
        dtype=str,
        header=None,
        names=SIDE_EFFECTS_HEADER,
    ),
)
side_effects_df

Unnamed: 0,STITCH_FLAT_ID,STITCH_STEREO_ID,UMLS CUI from Label,MedDRA Concept Type,UMLS CUI from MedDRA,MedDRA Concept name
0,CID100000085,CID000010917,C0000729,LLT,C0000729,Abdominal cramps
1,CID100000085,CID000010917,C0000729,PT,C0000737,Abdominal pain
2,CID100000085,CID000010917,C0000737,LLT,C0000737,Abdominal pain
3,CID100000085,CID000010917,C0000737,PT,C0687713,Gastrointestinal pain
4,CID100000085,CID000010917,C0000737,PT,C0000737,Abdominal pain
...,...,...,...,...,...,...
309844,CID171306834,CID071306834,C3203358,PT,C1145670,Respiratory failure
309845,CID171306834,CID071306834,C3665386,LLT,C3665386,Abnormal vision
309846,CID171306834,CID071306834,C3665386,PT,C3665347,Visual impairment
309847,CID171306834,CID071306834,C3665596,LLT,C3665596,Warts


In [15]:
result = Result.make(
    dataset="sider",
    source="umls",
    target="doid",
    datasource_identifiers=set(side_effects_df["UMLS CUI from Label"].unique()),
    primary=primary_dd,
    secondary=biomappings_dd,
    tertiary=biomappings_predictions_dd,
)
result.print()
evaluation_results.append(result)

Missing                        Unmappable to doid    % Unmappable
-----------------------------  --------------------  --------------
Total in sider                 5,868
Missing w/ umls                4,730                 80.61%
Missing w/ umls + BM.          4,724                 80.5%
Missing w/ umls + BM. + Pred.  4,619                 78.7%


## CCLE Achilles Cell Lines

In [16]:
# See https://depmap.org/portal/download/
# CCLE_ACHILLES_URL = "https://ndownloader.figshare.com/files/35020903"
# unfortuntaly, this is set up to thwart automated downloading, so this file is included in this directory.

ccle_achilles_df = pd.read_csv("sample_info.csv")
ccle_achilles_df.head()

Unnamed: 0,DepMap_ID,cell_line_name,stripped_cell_line_name,CCLE_Name,alias,COSMICID,sex,source,RRID,WTSI_Master_Cell_ID,...,lineage_sub_subtype,lineage_molecular_subtype,default_growth_pattern,model_manipulation,model_manipulation_details,patient_id,parent_depmap_id,Cellosaurus_NCIt_disease,Cellosaurus_NCIt_id,Cellosaurus_issues
0,ACH-000016,SLR 21,SLR21,SLR21_KIDNEY,,,,Academic lab,CVCL_V607,,...,,,,,,PT-JnARLB,,Clear cell renal cell carcinoma,C4033,
1,ACH-000032,MHH-CALL-3,MHHCALL3,MHHCALL3_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,,,Female,DSMZ,CVCL_0089,,...,b_cell,,,,,PT-p2KOyI,,Childhood B acute lymphoblastic leukemia,C9140,
2,ACH-000033,NCI-H1819,NCIH1819,NCIH1819_LUNG,,,Female,Academic lab,CVCL_1497,,...,NSCLC_adenocarcinoma,,,,,PT-9p1WQv,,Lung adenocarcinoma,C3512,
3,ACH-000043,Hs 895.T,HS895T,HS895T_FIBROBLAST,,,Female,ATCC,CVCL_0993,,...,,,2D: adherent,,,PT-rTUVZQ,,Melanoma,C3224,
4,ACH-000049,HEK TE,HEKTE,HEKTE_KIDNEY,,,,Academic lab,CVCL_WS59,,...,,,,immortalized,,PT-qWYYgr,,,,No information is available about this cell li...


In [17]:
result = Result.make(
    dataset="ccle-achilles",
    source="ccle",
    target="efo",
    datasource_identifiers=set(ccle_achilles_df["CCLE_Name"].unique()),
    primary=primary_dd,
    secondary=biomappings_dd,
    tertiary=biomappings_predictions_dd,
)
result.print()
evaluation_results.append(result)

Missing                        Unmappable to efo    % Unmappable
-----------------------------  -------------------  --------------
Total in ccle-achilles         1,837
Missing w/ ccle                1,837                100.00%
Missing w/ ccle + BM.          1,326                72.2%
Missing w/ ccle + BM. + Pred.  1,270                69.1%


## Rhea

In [18]:
RHEA_URL = "https://ftp.expasy.org/databases/rhea/tsv/chebiId%5Fname.tsv"
rhea_chebi_ids = {
    curie.removeprefix("CHEBI:")
    for curie in pd.read_csv(RHEA_URL, sep="\t", header=None, usecols=[0], squeeze=True)
    if curie.startswith("CHEBI:")
}
result = Result.make(
    dataset="rhea",
    source="chebi",
    target="mesh",
    datasource_identifiers=rhea_chebi_ids,
    primary=primary_dd,
    secondary=biomappings_dd,
    tertiary=biomappings_predictions_dd,
)
result.print()
evaluation_results.append(result)

Missing                         Unmappable to mesh    % Unmappable
------------------------------  --------------------  --------------
Total in rhea                   10,812
Missing w/ chebi                10,812                100.00%
Missing w/ chebi + BM.          10,684                98.8%
Missing w/ chebi + BM. + Pred.  9,861                 91.2%


## Summary

In [19]:
evaluation_df_rows = []
for result in evaluation_results:
    evaluation_df_rows.append(
        (
            result.dataset,
            result.source,
            result.target,
            result.total,
            result.missing,
            round(100 * result.missing / result.total, 1),
            result.missing_biomappings,
            round(100 * result.missing_biomappings / result.total, 1),
            round(100 * (result.missing - result.missing_biomappings) / result.total, 1),
            result.missing_predictions,
            round(100 * result.missing_predictions / result.total, 1),
            round(100 * (result.missing - result.missing_predictions) / result.total, 1),
        )
    )
pd.DataFrame(
    evaluation_df_rows,
    columns=[
        "dataset",
        "source",
        "target",
        "total",
        "missing_w_primary",
        "m1 (%)",
        "missing_w_curations",
        "m2 (%)",
        "m2d",
        "missing_w_predictions",
        "m3 (%)",
        "m3d",
    ],
)

Unnamed: 0,dataset,source,target,total,missing_w_primary,m1 (%),missing_w_curations,m2 (%),m2d,missing_w_predictions,m3 (%),m3d
0,ctd-chemical-gene,mesh,chebi,14337,14337,100.0,12884,89.9,10.1,9086,63.4,36.6
1,ctd-gene-disease,mesh,doid,5821,3342,57.4,2675,46.0,11.5,2632,45.2,12.2
2,ctd-gene-disease,mesh,mondo,5821,1518,26.1,1502,25.8,0.3,1458,25.0,1.0
3,sider,umls,doid,5868,4730,80.6,4724,80.5,0.1,4619,78.7,1.9
4,ccle-achilles,ccle,efo,1837,1837,100.0,1326,72.2,27.8,1270,69.1,30.9
5,rhea,chebi,mesh,10812,10812,100.0,10684,98.8,1.2,9861,91.2,8.8
