This notebook assesses the added value Biomappings gives to several resources.

In [1]:
import json
import sys
import time
from collections import Counter, defaultdict
from dataclasses import dataclass
from functools import lru_cache
from textwrap import dedent

import bioontologies
import bioregistry
import bioversions
import matplotlib.pyplot as plt
import pandas as pd
import pyobo
import pystow
from IPython.display import HTML
from matplotlib_venn import venn2
from tabulate import tabulate
from tqdm.auto import tqdm

import biomappings
from biomappings.paper_analysis import (
    EVALUATION,
    Result,
    get_non_obo_mappings,
    get_obo_mappings,
    get_primary_mappings,
    index_mappings,
)

In [2]:
print(sys.version)
print(time.asctime())

3.10.8 (main, Oct 13 2022, 10:17:43) [Clang 14.0.0 (clang-1400.0.29.102)]
Mon Oct 31 16:52:10 2022


In [3]:
evaluation_results = []

# Importing Mappings

## Biomappings

Manually curated mappings from Biomappings

In [4]:
biomappings_dd = index_mappings(
    biomappings.load_mappings(),
    path=EVALUATION.join(name="positive_mapping_index.pkl"),
)

Predicted mappings from Biomappings

In [5]:
biomappings_predictions_dd = index_mappings(
    biomappings.load_predictions(),
    path=EVALUATION.join(name="predicted_mapping_index.pkl"),
)

## Primary Mappings

Get primary mappings from 1) OBO ontologies that can be parsed with ROBOT and 2) other resoruces, via PyOBO.

In [6]:
# Primary mappings from OBO and other sources are going in here
primary_dd = defaultdict(dict)
summary_rows = []

summary_rows.extend(get_obo_mappings(primary_dd, biomappings_dd))
summary_rows.extend(get_non_obo_mappings(primary_dd, biomappings_dd))

Extracting umls from doid:   0%|          | 0.00/13.7k [00:00<?, ?node/s]

could not parse OBO internal relation: http://purl.obolibrary.org/obo/doid#DO_rare_slim
could not parse OBO internal relation: http://purl.obolibrary.org/obo/doid#DO_AGR_slim
could not parse OBO internal relation: http://purl.obolibrary.org/obo/doid#DO_CFDE_slim
could not parse OBO internal relation: http://purl.obolibrary.org/obo/doid#gram-negative_bacterial_infectious_disease
could not parse OBO internal relation: http://purl.obolibrary.org/obo/doid#DO_IEDB_slim
could not parse OBO internal relation: http://purl.obolibrary.org/obo/doid#DO_GXD_slim
could not parse OBO internal relation: http://purl.obolibrary.org/obo/doid#zoonotic_infectious_disease
could not parse OBO internal relation: http://purl.obolibrary.org/obo/doid#tick-borne_infectious_disease
could not parse OBO internal relation: http://purl.obolibrary.org/obo/doid#TopNodes_DOcancerslim
could not parse OBO internal relation: http://purl.obolibrary.org/obo/doid#DO_FlyBase_slim
could not parse OBO internal relation: http://pu

Extracting umls from doid:   0%|          | 0.00/372 [00:00<?, ?node/s]

Extracting umls from doid:   0%|          | 0.00/185 [00:00<?, ?node/s]

Extracting umls from doid:   0%|          | 0.00/27.0 [00:00<?, ?node/s]

Extracting umls from doid:   0%|          | 0.00/16.0 [00:00<?, ?node/s]

Extracting umls from doid:   0%|          | 0.00/59.0 [00:00<?, ?node/s]

Extracting umls from doid:   0%|          | 0.00/29.0 [00:00<?, ?node/s]

Extracting umls from doid:   0%|          | 0.00/447 [00:00<?, ?node/s]

Extracting umls from doid:   0%|          | 0.00/1.14k [00:00<?, ?node/s]

Extracting umls from doid:   0%|          | 0.00/202 [00:00<?, ?node/s]

Extracting umls from doid:   0%|          | 0.00/40.0 [00:00<?, ?node/s]

Extracting umls from doid:   0%|          | 0.00/247 [00:00<?, ?node/s]

Extracting umls from doid:   0%|          | 0.00/905 [00:00<?, ?node/s]

Extracting umls from doid:   0%|          | 0.00/39.0 [00:00<?, ?node/s]

Extracting umls from doid:   0%|          | 0.00/791 [00:00<?, ?node/s]

Extracting umls from doid:   0%|          | 0.00/10.0 [00:00<?, ?node/s]

could not parse OBO internal relation: http://purl.obolibrary.org/obo/doid#chebi
could not parse OBO internal relation: http://purl.obolibrary.org/obo/doid#sequence


Extracting mesh from doid:   0%|          | 0.00/13.7k [00:00<?, ?node/s]

Extracting mesh from doid:   0%|          | 0.00/372 [00:00<?, ?node/s]

Extracting mesh from doid:   0%|          | 0.00/185 [00:00<?, ?node/s]

Extracting mesh from doid:   0%|          | 0.00/27.0 [00:00<?, ?node/s]

Extracting mesh from doid:   0%|          | 0.00/16.0 [00:00<?, ?node/s]

Extracting mesh from doid:   0%|          | 0.00/59.0 [00:00<?, ?node/s]

Extracting mesh from doid:   0%|          | 0.00/29.0 [00:00<?, ?node/s]

Extracting mesh from doid:   0%|          | 0.00/447 [00:00<?, ?node/s]

Extracting mesh from doid:   0%|          | 0.00/1.14k [00:00<?, ?node/s]

Extracting mesh from doid:   0%|          | 0.00/202 [00:00<?, ?node/s]

Extracting mesh from doid:   0%|          | 0.00/40.0 [00:00<?, ?node/s]

Extracting mesh from doid:   0%|          | 0.00/247 [00:00<?, ?node/s]

Extracting mesh from doid:   0%|          | 0.00/905 [00:00<?, ?node/s]

Extracting mesh from doid:   0%|          | 0.00/39.0 [00:00<?, ?node/s]

Extracting mesh from doid:   0%|          | 0.00/791 [00:00<?, ?node/s]

Extracting mesh from doid:   0%|          | 0.00/10.0 [00:00<?, ?node/s]

Extracting mondo from doid:   0%|          | 0.00/13.7k [00:00<?, ?node/s]

Extracting mondo from doid:   0%|          | 0.00/372 [00:00<?, ?node/s]

Extracting mondo from doid:   0%|          | 0.00/185 [00:00<?, ?node/s]

Extracting mondo from doid:   0%|          | 0.00/27.0 [00:00<?, ?node/s]

Extracting mondo from doid:   0%|          | 0.00/16.0 [00:00<?, ?node/s]

Extracting mondo from doid:   0%|          | 0.00/59.0 [00:00<?, ?node/s]

Extracting mondo from doid:   0%|          | 0.00/29.0 [00:00<?, ?node/s]

Extracting mondo from doid:   0%|          | 0.00/447 [00:00<?, ?node/s]

Extracting mondo from doid:   0%|          | 0.00/1.14k [00:00<?, ?node/s]

Extracting mondo from doid:   0%|          | 0.00/202 [00:00<?, ?node/s]

Extracting mondo from doid:   0%|          | 0.00/40.0 [00:00<?, ?node/s]

Extracting mondo from doid:   0%|          | 0.00/247 [00:00<?, ?node/s]

Extracting mondo from doid:   0%|          | 0.00/905 [00:00<?, ?node/s]

Extracting mondo from doid:   0%|          | 0.00/39.0 [00:00<?, ?node/s]

Extracting mondo from doid:   0%|          | 0.00/791 [00:00<?, ?node/s]

Extracting mondo from doid:   0%|          | 0.00/10.0 [00:00<?, ?node/s]

Extracting efo from doid:   0%|          | 0.00/13.7k [00:00<?, ?node/s]

Extracting efo from doid:   0%|          | 0.00/372 [00:00<?, ?node/s]

Extracting efo from doid:   0%|          | 0.00/185 [00:00<?, ?node/s]

Extracting efo from doid:   0%|          | 0.00/27.0 [00:00<?, ?node/s]

Extracting efo from doid:   0%|          | 0.00/16.0 [00:00<?, ?node/s]

Extracting efo from doid:   0%|          | 0.00/59.0 [00:00<?, ?node/s]

Extracting efo from doid:   0%|          | 0.00/29.0 [00:00<?, ?node/s]

Extracting efo from doid:   0%|          | 0.00/447 [00:00<?, ?node/s]

Extracting efo from doid:   0%|          | 0.00/1.14k [00:00<?, ?node/s]

Extracting efo from doid:   0%|          | 0.00/202 [00:00<?, ?node/s]

Extracting efo from doid:   0%|          | 0.00/40.0 [00:00<?, ?node/s]

Extracting efo from doid:   0%|          | 0.00/247 [00:00<?, ?node/s]

Extracting efo from doid:   0%|          | 0.00/905 [00:00<?, ?node/s]

Extracting efo from doid:   0%|          | 0.00/39.0 [00:00<?, ?node/s]

Extracting efo from doid:   0%|          | 0.00/791 [00:00<?, ?node/s]

Extracting efo from doid:   0%|          | 0.00/10.0 [00:00<?, ?node/s]

Extracting umls from mondo:   0%|          | 0.00/45.0k [00:00<?, ?node/s]

could not parse OBO internal relation: http://purl.obolibrary.org/obo/mondo#has_onset_during_or_after
could not parse OBO internal relation: http://purl.obolibrary.org/obo/exo#interacts_with_an_exposure_stressor_via
could not parse OBO internal relation: http://purl.obolibrary.org/obo/mondo#disease_triggers
could not parse OBO internal relation: http://purl.obolibrary.org/obo/mondo#disease_causes_feature
could not parse OBO internal relation: http://purl.obolibrary.org/obo/mondo#has_onset
could not parse OBO internal relation: http://purl.obolibrary.org/obo/mondo#disease_has_basis_in_development_of
could not parse OBO internal relation: http://purl.obolibrary.org/obo/mondo#AMBIGUOUS
could not parse OBO internal relation: http://purl.obolibrary.org/obo/mondo#has_onset_before
could not parse OBO internal relation: http://purl.obolibrary.org/obo/cl#has_completed
could not parse OBO internal relation: http://purl.obolibrary.org/obo/mondo#has_exclusion_reason
could not parse OBO internal re

Extracting mesh from mondo:   0%|          | 0.00/45.0k [00:00<?, ?node/s]

Extracting doid from mondo:   0%|          | 0.00/45.0k [00:00<?, ?node/s]

Extracting efo from mondo:   0%|          | 0.00/45.0k [00:00<?, ?node/s]

Extracting mesh from efo:   0%|          | 0.00/38.0k [00:00<?, ?node/s]

could not parse OBO internal relation: http://purl.obolibrary.org/obo/uberon#branch_of
could not parse OBO internal relation: http://purl.obolibrary.org/obo/uberon#transitively_distally_connected_to
could not parse OBO internal relation: http://purl.obolibrary.org/obo/chebi#BRAND_NAME
could not parse OBO internal relation: http://purl.obolibrary.org/obo/uberon#channel_for
could not parse OBO internal relation: http://purl.obolibrary.org/obo/uberon#existence_starts_after
could not parse OBO internal relation: http://purl.obolibrary.org/obo/chebi#INN
could not parse OBO internal relation: http://purl.obolibrary.org/obo/uberon#transitively_anteriorly_connected_to
could not parse OBO internal relation: http://purl.obolibrary.org/obo/uberon#enclosed_by
could not parse OBO internal relation: http://purl.obolibrary.org/obo/uberon#existence_starts_or_ends_during
could not parse OBO internal relation: http://purl.obolibrary.org/obo/uberon#proximally_connected_to
could not parse OBO internal rel

Extracting doid from efo:   0%|          | 0.00/38.0k [00:00<?, ?node/s]

Extracting cl from efo:   0%|          | 0.00/38.0k [00:00<?, ?node/s]

Extracting ccle from efo:   0%|          | 0.00/38.0k [00:00<?, ?node/s]



Extracting mesh from hp:   0%|          | 0.00/31.7k [00:00<?, ?node/s]

could not parse OBO internal relation: http://purl.obolibrary.org/obo/uberon/core#transitively_anteriorly_connected_to
could not parse OBO internal relation: http://purl.obolibrary.org/obo/uberon/core#proximally_connected_to
could not parse OBO internal relation: http://purl.obolibrary.org/obo/uberon/core#transitively_connected_to
could not parse OBO internal relation: http://purl.obolibrary.org/obo/uberon/core#distally_connected_to
could not parse OBO internal relation: http://purl.obolibrary.org/obo/uberon/core#channel_for
could not parse OBO internal relation: http://purl.obolibrary.org/obo/uberon/core#subdivision_of
could not parse OBO internal relation: http://purl.obolibrary.org/obo/hp#layperson
could not parse OBO internal relation: http://purl.obolibrary.org/obo/hp#plural_form
could not parse OBO internal relation: http://purl.obolibrary.org/obo/uberon/core#trunk_part_of
could not parse OBO internal relation: http://purl.obolibrary.org/obo/uberon/core#transitively_proximally_co

Extracting mesh from go:   0%|          | 0.00/51.1k [00:00<?, ?node/s]

could not parse OBO internal relation: http://purl.obolibrary.org/obo/go#systematic_synonym
could not parse OBO internal relation: http://purl.obolibrary.org/obo/go#syngo_official_label


Extracting reactome from go:   0%|          | 0.00/51.1k [00:00<?, ?node/s]

Extracting wikipathways from go:   0%|          | 0.00/51.1k [00:00<?, ?node/s]



Extracting mesh from uberon:   0%|          | 0.00/21.4k [00:00<?, ?node/s]

could not parse OBO internal relation: http://purl.obolibrary.org/obo/uberon/core#SYSTEMATIC
could not parse OBO internal relation: http://purl.obolibrary.org/obo/uberon/core#DEVELOPMENTAL
could not parse OBO internal relation: http://purl.obolibrary.org/obo/uberon/core#PENDING_REVIEW
could not parse OBO internal relation: http://purl.obolibrary.org/obo/bspo#vertebrate
could not parse OBO internal relation: http://purl.obolibrary.org/obo/uberon/core#BRAIN_NAME_ABV
could not parse OBO internal relation: http://purl.obolibrary.org/obo/uberon/core#ABBREVIATION
could not parse OBO internal relation: http://purl.obolibrary.org/obo/cl#HUMAN_PREFERRED
could not parse OBO internal relation: http://purl.obolibrary.org/obo/uberon/core#MISSPELLING
could not parse OBO internal relation: http://purl.obolibrary.org/obo/cl#abbreviation
could not parse OBO internal relation: http://purl.obolibrary.org/obo/uberon/core#NON_MAMMAL
could not parse OBO internal relation: http://purl.obolibrary.org/obo/uber

Extracting efo from cl:   0%|          | 0.00/17.3k [00:00<?, ?node/s]

could not parse OBO internal relation: http://purl.obolibrary.org/obo/uberon#PLURAL
could not parse OBO internal relation: http://purl.obolibrary.org/obo/uberon#HUMAN_PREFERRED
could not parse OBO internal relation: http://purl.obolibrary.org/obo/uberon#LATIN


Extracting mesh from cl:   0%|          | 0.00/17.3k [00:00<?, ?node/s]

Extracting mesh from chebi:   0%|          | 0.00/182k [00:00<?, ?node/s]

could not parse OBO internal relation: http://purl.obolibrary.org/obo/chebi#has_major_microspecies_at_pH_7_3


Extracting ncit from chebi:   0%|          | 0.00/182k [00:00<?, ?node/s]



## Summary

In [20]:
summary_df = pd.DataFrame(
    summary_rows,
    columns=[
        "resource",
        "version",
        "external",
        "primary_xrefs",
        "biomappings_xrefs",
        "total_xrefs",
        "percentage_gain",
    ],
)
pd.option_context("display.max_rows", summary_df.shape[0])
summary_df

Unnamed: 0,resource,version,external,primary_xrefs,biomappings_xrefs,total_xrefs,percentage_gain
0,doid,2022-09-29,umls,0,246,246,inf
1,doid,2022-09-29,mesh,0,2905,2905,inf
2,doid,2022-09-29,efo,0,126,126,inf
3,mondo,2022-10-11,umls,16771,0,16771,0.0
4,mondo,2022-10-11,mesh,7982,414,8396,5.2
5,mondo,2022-10-11,doid,9894,0,9894,0.0
6,mondo,2022-10-11,efo,2865,0,2865,0.0
7,efo,3.47.0,mesh,2429,217,2646,8.9
8,efo,3.47.0,doid,2300,126,2426,5.5
9,efo,3.47.0,cl,11,0,11,0.0


# Assess Impact on Mapping Hetionet Datasources

## CTD Chemical-Gene Interactions

In [8]:
CTD_CHEMICAL_GENE_URL = "https://ctdbase.org/reports/CTD_chem_gene_ixns.tsv.gz"
ctd_header = [
    "chemical_name",
    "chemical_mesh_id",
    "chemical_cas",
    "gene_symbol",
    "gene_ncbigene_id",
    "gene_forms",
    "organism_name",
    "organism_ncbitaxon_id",
    "evidence",
    "interaction",
    "pubmed_ids",
]
ctd_gene_chemical_df = EVALUATION.ensure_csv(
    url=CTD_CHEMICAL_GENE_URL,
    read_csv_kwargs=dict(
        sep="\t",
        comment="#",
        header=None,
        dtype=str,
        keep_default_na=False,
        usecols=[1],
        squeeze=True,
    ),
)
ctd_gene_chemical_df.head()

0    C534883
1    C534883
2    C534883
3    C534883
4    C534883
Name: 1, dtype: object

In [9]:
# TODO consider that CTD provides CAS identifiers - mesh
# provides CAS ids, which can also be mapped to ChEBI

result = Result.make(
    dataset="ctd-chemical-gene",
    source="mesh",
    target="chebi",
    datasource_identifiers=set(ctd_gene_chemical_df.tolist()),
    primary=primary_dd,
    secondary=biomappings_dd,
    tertiary=biomappings_predictions_dd,
)
result.print()
evaluation_results.append(result)

Missing                        Unmappable to chebi    % Unmappable
-----------------------------  ---------------------  --------------
Total in ctd-chemical-gene     14,337
Missing w/ mesh                14,337                 100.00%
Missing w/ mesh + BM.          12,884                 89.9%
Missing w/ mesh + BM. + Pred.  9,086                  63.4%


## CTD Chemical-Diseases

In [10]:
CTD_CHEMICAL_DISEASES_URL = "https://ctdbase.org/reports/CTD_chemicals_diseases.tsv.gz"
"""
    ChemicalName
    ChemicalID (MeSH identifier)
    CasRN (CAS Registry Number, if available)
    DiseaseName
    DiseaseID (MeSH or OMIM identifier)
    DirectEvidence ('|'-delimited list)
    InferenceGeneSymbol
    InferenceScore
    OmimIDs ('|'-delimited list)
    PubMedIDs ('|'-delimited list)
"""
ctd_chemical_diseases_df = EVALUATION.ensure_csv(
    url=CTD_CHEMICAL_DISEASES_URL,
    read_csv_kwargs=dict(
        sep="\t",
        comment="#",
        header=None,
        dtype=str,
        keep_default_na=False,
        usecols=[4],
        squeeze=True,
    ),
)
ctd_chemical_diseases_df.head()

0       MESH:D054198
1       MESH:D000230
2    MESH:D000077192
3       MESH:D000505
4       MESH:D013734
Name: 4, dtype: object

In [11]:
ctd_chemical_diseases_mesh = {
    x.split(":")[1] for x in ctd_chemical_diseases_df.tolist() if x.startswith("MESH")
}
ctd_chemical_diseases_omim = {
    x.split(":")[1] for x in ctd_chemical_diseases_df.tolist() if x.startswith("OMIM")
}

In [12]:
result = Result.make(
    dataset="ctd-gene-disease",
    source="mesh",
    target="doid",
    datasource_identifiers=ctd_chemical_diseases_mesh,
    primary=primary_dd,
    secondary=biomappings_dd,
    tertiary=biomappings_predictions_dd,
)
result.print()
evaluation_results.append(result)

Missing                        Unmappable to doid    % Unmappable
-----------------------------  --------------------  --------------
Total in ctd-gene-disease      5,821
Missing w/ mesh                5,821                 100.00%
Missing w/ mesh + BM.          4,751                 81.6%
Missing w/ mesh + BM. + Pred.  4,706                 80.8%


In [13]:
result = Result.make(
    dataset="ctd-gene-disease",
    source="mesh",
    target="mondo",
    datasource_identifiers=ctd_chemical_diseases_mesh,
    primary=primary_dd,
    secondary=biomappings_dd,
    tertiary=biomappings_predictions_dd,
)
result.print()
evaluation_results.append(result)

Missing                        Unmappable to mondo    % Unmappable
-----------------------------  ---------------------  --------------
Total in ctd-gene-disease      5,821
Missing w/ mesh                1,472                  25.29%
Missing w/ mesh + BM.          1,461                  25.1%
Missing w/ mesh + BM. + Pred.  1,418                  24.4%


# SIDER Side Effects

In [14]:
SIDER_URL = "http://sideeffects.embl.de/media/download/meddra_all_se.tsv.gz"

SIDE_EFFECTS_HEADER = [
    "STITCH_FLAT_ID",
    "STITCH_STEREO_ID",
    "UMLS CUI from Label",
    "MedDRA Concept Type",
    "UMLS CUI from MedDRA",
    "MedDRA Concept name",
]

side_effects_df = EVALUATION.ensure_csv(
    url=SIDER_URL,
    read_csv_kwargs=dict(
        dtype=str,
        header=None,
        names=SIDE_EFFECTS_HEADER,
    ),
)
side_effects_df

Unnamed: 0,STITCH_FLAT_ID,STITCH_STEREO_ID,UMLS CUI from Label,MedDRA Concept Type,UMLS CUI from MedDRA,MedDRA Concept name
0,CID100000085,CID000010917,C0000729,LLT,C0000729,Abdominal cramps
1,CID100000085,CID000010917,C0000729,PT,C0000737,Abdominal pain
2,CID100000085,CID000010917,C0000737,LLT,C0000737,Abdominal pain
3,CID100000085,CID000010917,C0000737,PT,C0687713,Gastrointestinal pain
4,CID100000085,CID000010917,C0000737,PT,C0000737,Abdominal pain
...,...,...,...,...,...,...
309844,CID171306834,CID071306834,C3203358,PT,C1145670,Respiratory failure
309845,CID171306834,CID071306834,C3665386,LLT,C3665386,Abnormal vision
309846,CID171306834,CID071306834,C3665386,PT,C3665347,Visual impairment
309847,CID171306834,CID071306834,C3665596,LLT,C3665596,Warts


In [15]:
result = Result.make(
    dataset="sider",
    source="umls",
    target="doid",
    datasource_identifiers=set(side_effects_df["UMLS CUI from Label"].unique()),
    primary=primary_dd,
    secondary=biomappings_dd,
    tertiary=biomappings_predictions_dd,
)
result.print()
evaluation_results.append(result)

Missing                        Unmappable to doid    % Unmappable
-----------------------------  --------------------  --------------
Total in sider                 5,868
Missing w/ umls                5,868                 100.00%
Missing w/ umls + BM.          5,855                 99.8%
Missing w/ umls + BM. + Pred.  5,726                 97.6%


## CCLE Achilles Cell Lines

In [16]:
# See https://depmap.org/portal/download/
# CCLE_ACHILLES_URL = "https://ndownloader.figshare.com/files/35020903"
# unfortuntaly, this is set up to thwart automated downloading, so this file is included in this directory.

ccle_achilles_df = pd.read_csv("sample_info.csv")
ccle_achilles_df.head()

Unnamed: 0,DepMap_ID,cell_line_name,stripped_cell_line_name,CCLE_Name,alias,COSMICID,sex,source,RRID,WTSI_Master_Cell_ID,...,lineage_sub_subtype,lineage_molecular_subtype,default_growth_pattern,model_manipulation,model_manipulation_details,patient_id,parent_depmap_id,Cellosaurus_NCIt_disease,Cellosaurus_NCIt_id,Cellosaurus_issues
0,ACH-000016,SLR 21,SLR21,SLR21_KIDNEY,,,,Academic lab,CVCL_V607,,...,,,,,,PT-JnARLB,,Clear cell renal cell carcinoma,C4033,
1,ACH-000032,MHH-CALL-3,MHHCALL3,MHHCALL3_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,,,Female,DSMZ,CVCL_0089,,...,b_cell,,,,,PT-p2KOyI,,Childhood B acute lymphoblastic leukemia,C9140,
2,ACH-000033,NCI-H1819,NCIH1819,NCIH1819_LUNG,,,Female,Academic lab,CVCL_1497,,...,NSCLC_adenocarcinoma,,,,,PT-9p1WQv,,Lung adenocarcinoma,C3512,
3,ACH-000043,Hs 895.T,HS895T,HS895T_FIBROBLAST,,,Female,ATCC,CVCL_0993,,...,,,2D: adherent,,,PT-rTUVZQ,,Melanoma,C3224,
4,ACH-000049,HEK TE,HEKTE,HEKTE_KIDNEY,,,,Academic lab,CVCL_WS59,,...,,,,immortalized,,PT-qWYYgr,,,,No information is available about this cell li...


In [17]:
result = Result.make(
    dataset="ccle-achilles",
    source="ccle",
    target="efo",
    datasource_identifiers=set(ccle_achilles_df["CCLE_Name"].unique()),
    primary=primary_dd,
    secondary=biomappings_dd,
    tertiary=biomappings_predictions_dd,
)
result.print()
evaluation_results.append(result)

Missing                        Unmappable to efo    % Unmappable
-----------------------------  -------------------  --------------
Total in ccle-achilles         1,837
Missing w/ ccle                1,837                100.00%
Missing w/ ccle + BM.          1,326                72.2%
Missing w/ ccle + BM. + Pred.  1,270                69.1%


## Rhea

In [18]:
RHEA_URL = "https://ftp.expasy.org/databases/rhea/tsv/chebiId%5Fname.tsv"
rhea_chebi_ids = {
    curie.removeprefix("CHEBI:")
    for curie in pd.read_csv(RHEA_URL, sep="\t", header=None, usecols=[0], squeeze=True)
    if curie.startswith("CHEBI:")
}
result = Result.make(
    dataset="rhea",
    source="chebi",
    target="mesh",
    datasource_identifiers=rhea_chebi_ids,
    primary=primary_dd,
    secondary=biomappings_dd,
    tertiary=biomappings_predictions_dd,
)
result.print()
evaluation_results.append(result)

Missing                         Unmappable to mesh    % Unmappable
------------------------------  --------------------  --------------
Total in rhea                   10,812
Missing w/ chebi                10,812                100.00%
Missing w/ chebi + BM.          10,684                98.8%
Missing w/ chebi + BM. + Pred.  9,861                 91.2%


## Summary

In [19]:
evaluation_df_rows = []
for result in evaluation_results:
    evaluation_df_rows.append(
        (
            result.dataset,
            result.source,
            result.target,
            result.total,
            result.missing,
            round(100 * result.missing / result.total, 1),
            result.missing_biomappings,
            round(100 * result.missing_biomappings / result.total, 1),
            round(100 * (result.missing - result.missing_biomappings) / result.total, 1),
            result.missing_predictions,
            round(100 * result.missing_predictions / result.total, 1),
            round(100 * (result.missing - result.missing_predictions) / result.total, 1),
        )
    )
pd.DataFrame(
    evaluation_df_rows,
    columns=[
        "dataset",
        "source",
        "target",
        "total",
        "missing_w_primary",
        "m1 (%)",
        "missing_w_curations",
        "m2 (%)",
        "m2d",
        "missing_w_predictions",
        "m3 (%)",
        "m3d",
    ],
)

Unnamed: 0,dataset,source,target,total,missing_w_primary,m1 (%),missing_w_curations,m2 (%),m2d,missing_w_predictions,m3 (%),m3d
0,ctd-chemical-gene,mesh,chebi,14337,14337,100.0,12884,89.9,10.1,9086,63.4,36.6
1,ctd-gene-disease,mesh,doid,5821,5821,100.0,4751,81.6,18.4,4706,80.8,19.2
2,ctd-gene-disease,mesh,mondo,5821,1472,25.3,1461,25.1,0.2,1418,24.4,0.9
3,sider,umls,doid,5868,5868,100.0,5855,99.8,0.2,5726,97.6,2.4
4,ccle-achilles,ccle,efo,1837,1837,100.0,1326,72.2,27.8,1270,69.1,30.9
5,rhea,chebi,mesh,10812,10812,100.0,10684,98.8,1.2,9861,91.2,8.8
