# going to analyze the results of logmap matching
## load in data frames and packages

In [1]:
import polars as pl
from mapnet.utils import format_mappings, load_known_mappings_df, load_biomappings_df, get_novel_mappings
import obonet
from indra.databases import mesh_client
import os 

load the known mappings if they are not already present. 

In [None]:
if os.path.exists("../../knownMaps/doid_to_mesh_provided_maps.tsv"):
    from mapnet.bertmap.example import data_setup
    os.chdir('../..')
    data_setup()
    os.chdir("./mapnet/logmap-matcher/")


Please enter the maximum memory located to JVM [8g]:


INFO: [2025-04-15 16:10:53] deeponto - 16g maximum memory allocated to JVM.
INFO: [2025-04-15 16:10:53] deeponto - JVM started successfully.
INFO: [2025-04-15 16:10:55] datasets - PyTorch version 2.6.0 available.
INFO: [2025-04-15 16:10:55] datasets - Polars version 1.27.1 available.


filtering supplemental terms 


INFO: [2025-04-15 16:13:15] root - None pruning algorithm has been applied.
INFO: [2025-04-15 16:13:15] root - Save the pruned ontology file to resources/mesh_primary.owl.


In [2]:
def permissive_map(x):
    """a function to get the name of a given DOID entity, will return 'NO NAME FOUND', if the entity does not exist."""
    try:
        return g.nodes[x]["name"]
    except:
        return "NO NAME FOUND"


g = obonet.read_obo(
    "https://raw.githubusercontent.com/DiseaseOntology/"
    "HumanDiseaseOntology/main/src/ontology/HumanDO.obo"
)
maps_path = "../../mapnet/logmap-matcher/output/logmap2_mappings.tsv"
raw_maps = pl.read_csv(
    maps_path,
    separator="\t",
    has_header=False,
    new_columns=['SrcEntity', 'TgtEntity', 'Score'],
)
predicted_maps = format_mappings(
    df=raw_maps,
    source_prefix="DOID",
    target_prefix="MESH",
    matching_source="LogMap",
        source_name_func=permissive_map,
        target_name_func=mesh_client.get_mesh_name,
)

known_mappings = load_known_mappings_df(known_mappings_path="../../knownMaps/doid_to_mesh_provided_maps.tsv",
        source_name_func=permissive_map,
        target_name_func=mesh_client.get_mesh_name,)

biomappings_maps = load_biomappings_df(
    source_prefix="DOID",
    target_prefix="MESH",
)
evidence = known_mappings.vstack(biomappings_maps)



INFO: [2025-04-15 16:14:48] root - Will decode content from https://raw.githubusercontent.com/DiseaseOntology/HumanDiseaseOntology/main/src/ontology/HumanDO.obo using utf-8 charset.


## inspect number of cases where a predicted map exactly matches a map we have seen in the evidence table

In [3]:
predictions_in_known_maps = predicted_maps.join(evidence, on=['source identifier', 'target identifier'], how='inner')
print(f'out of {len(predicted_maps)} total mappings, there are {len(predictions_in_known_maps)} that is {len(predictions_in_known_maps)/len(predicted_maps)}')
predictions_in_known_maps.head()

out of 3107 total mappings, there are 2623 that is 0.8442227228838107


source prefix,source identifier,source name,relation,target prefix,target identifier,target name,type,confidence,source,source name_right,target name_right
str,str,str,str,str,str,str,str,f64,str,str,str
"""DOID""","""DOID:0001816""","""angiosarcoma""","""skos:exactMatch""","""MESH""","""D006394""","""Hemangiosarcoma""","""semapv:SemanticSimilarityThres…",0.86,"""LogMap""","""angiosarcoma""","""Hemangiosarcoma"""
"""DOID""","""DOID:0014667""","""disease of metabolism""","""skos:exactMatch""","""MESH""","""D008659""","""Metabolic Diseases""","""semapv:SemanticSimilarityThres…",0.78,"""LogMap""","""disease of metabolism""","""Metabolic Diseases"""
"""DOID""","""DOID:0040091""","""autoimmune pancreatitis""","""skos:exactMatch""","""MESH""","""D000081012""","""Autoimmune Pancreatitis""","""semapv:SemanticSimilarityThres…",0.77,"""LogMap""","""autoimmune pancreatitis""","""Autoimmune Pancreatitis"""
"""DOID""","""DOID:0050052""","""Rocky Mountain spotted fever""","""skos:exactMatch""","""MESH""","""D012373""","""Rocky Mountain Spotted Fever""","""semapv:SemanticSimilarityThres…",0.61,"""LogMap""","""Rocky Mountain spotted fever""","""Rocky Mountain Spotted Fever"""
"""DOID""","""DOID:0050061""","""erysipeloid""","""skos:exactMatch""","""MESH""","""D004887""","""Erysipeloid""","""semapv:SemanticSimilarityThres…",0.79,"""LogMap""","""erysipeloid""","""Erysipeloid"""


## Now lets look at the novel mappings, here we are defining a novel mapping as a map that involves classes that have no maps to them in the evidence

In [4]:
novel_predictions = predicted_maps.join(evidence, on=['target identifier'], how='anti')
novel_predictions = novel_predictions.join(evidence, on=['source identifier'], how='anti')

print(f'out of {len(predicted_maps)} total mappings, there are {len(novel_predictions)} that is {len(novel_predictions)/len(predicted_maps)}')
novel_predictions.head()

out of 3107 total mappings, there are 148 that is 0.04763437399420663


source prefix,source identifier,source name,relation,target prefix,target identifier,target name,type,confidence,source
str,str,str,str,str,str,str,str,f64,str
"""DOID""","""DOID:12369""","""prolapse of urethra""","""skos:exactMatch""","""MESH""","""D011391""","""Prolapse""","""semapv:SemanticSimilarityThres…",0.54,"""LogMap"""
"""DOID""","""DOID:9771""","""transient neonatal thrombocyto…","""skos:exactMatch""","""MESH""","""D054098""","""Thrombocytopenia, Neonatal All…","""semapv:SemanticSimilarityThres…",0.57,"""LogMap"""
"""DOID""","""DOID:1725""","""peritoneum cancer""","""skos:exactMatch""","""MESH""","""D010534""","""Peritoneal Neoplasms""","""semapv:SemanticSimilarityThres…",0.65,"""LogMap"""
"""DOID""","""DOID:0070327""","""melanoma in congenital melanoc…","""skos:exactMatch""","""MESH""","""D009508""","""Nevus, Pigmented""","""semapv:SemanticSimilarityThres…",0.56,"""LogMap"""
"""DOID""","""DOID:1248""","""ocular hyperemia""","""skos:exactMatch""","""MESH""","""D006940""","""Hyperemia""","""semapv:SemanticSimilarityThres…",0.62,"""LogMap"""


## inspect number of cases where a class is mapped incorrectly. That is either the source or target class is mapped to a different class in our evidence table.

In [5]:
targets_mapped_wrong = predicted_maps.join(evidence, on=['target identifier'], how='inner')
targets_mapped_wrong = targets_mapped_wrong.join(evidence, on=['source identifier', 'source name'], how='anti')

print(f'out of {len(predicted_maps)} total mappings, there are {len(targets_mapped_wrong)} that is {len(targets_mapped_wrong)/len(predicted_maps)}')
targets_mapped_wrong.select(
    pl.col('target name'),
    pl.col("source name").alias("predicted source name"),
    pl.col("source name_right").alias("true source name"),
).head()

out of 3107 total mappings, there are 252 that is 0.08110717734148697


target name,predicted source name,true source name
str,str,str
"""Lymphohistiocytosis, Hemophago…","""familial hemophagocytic lympho…","""hemophagocytic lymphohistiocyt…"
"""Kartagener Syndrome""","""primary ciliary dyskinesia 1""","""Kartagener syndrome"""
"""Idiopathic Pulmonary Fibrosis""","""interstitial lung disease 1""","""idiopathic pulmonary fibrosis"""
"""Encephalitis, Tick-Borne""","""Powassan encephalitis""","""tick-borne encephalitis"""
"""Andersen Syndrome""","""long QT syndrome 8""","""Andersen-Tawil syndrome"""


In [6]:
sources_mapped_wrong = predicted_maps.join(evidence, on=['source identifier'], how='inner')
sources_mapped_wrong = sources_mapped_wrong.join(evidence, on=['target identifier', 'target name'], how='anti')

print(f'out of {len(predicted_maps)} total mappings, there are {len(sources_mapped_wrong)} that is {len(sources_mapped_wrong)/len(predicted_maps)}')
sources_mapped_wrong.head()
sources_mapped_wrong.select(
    pl.col('source name'),
    pl.col("target name").alias("predicted target name"),
    pl.col("target name_right").alias("true target name"),
).head()

out of 3107 total mappings, there are 179 that is 0.057611844222722884


source name,predicted target name,true target name
str,str,str
"""nonepidermolytic palmoplantar …","""Keratoderma, Palmoplantar, Dif…","""Palmoplantar Keratoderma, None…"
"""X-linked hypophosphatemic rick…","""Rickets, Hypophosphatemic""","""Familial Hypophosphatemic Rick…"
"""intestinal schistosomiasis""","""Schistosomiasis mansoni""","""Schistosomiasis japonica"""
"""paraganglioma""","""Paraganglioma, Extra-Adrenal""","""Paraganglioma"""
"""generalized dystonia""","""Dystonic Disorders""","""Dystonia 12"""


## I want to compare the BERTmap and LogMap results. Unfortunately, the original BERTmap results were lost when I gave back my old work computer. I am re-running the code now.