# Mapping Clinical Trials to ChEBI

This notebook assesses the impact of the mappings between MeSH and ChEBI through the lens of clinical trial data from ClinicalTrials.gov. Note that this notebook is rather difficult to re-run because the clinical trials data cannot easily be downloaded in bulk.

In [1]:
from collections import defaultdict

import matplotlib.pyplot as plt
import pandas
import pystow
import seaborn as sns
from indra_cogex.sources.clinicaltrials import get_correct_mesh_id

from biomappings import load_mappings

In [2]:
# Resolve the bulk clinical trials export inside the pystow-managed directory.
clinical_trials_path = pystow.join(
    "indra", "cogex", "clinicaltrials", name="clinical_trials.csv.gz"
)

# The first 10 lines are skipped before parsing -- presumably a preamble that
# precedes the real header row (TODO confirm against the export format).
# The "Rank" column is discarded right away since nothing below reads it.
df = pandas.read_csv(clinical_trials_path, skiprows=10).drop(columns=["Rank"])

Fix errors in data due to incorrect encoding of MeSH identifiers (both syntax and mismatch with labels for interventions/conditions).

In [3]:
def _collect_mesh_ids(trials, id_column, term_column):
    """Group corrected MeSH identifiers by clinical trial.

    :param trials: Data frame with an ``NCTId`` column plus the two columns named below.
    :param id_column: Name of the column holding ``|``-delimited MeSH identifiers.
    :param term_column: Name of the column holding ``|``-delimited MeSH terms,
        paired positionally with the identifiers in ``id_column``.
    :returns: A ``defaultdict(list)`` mapping each NCT identifier to the values
        returned by ``get_correct_mesh_id``, in row order. A trial only gets a
        key if at least one identifier was kept.
    """
    collected = defaultdict(list)
    for nct_id, raw_ids, raw_terms in zip(
        trials["NCTId"], trials[id_column], trials[term_column]
    ):
        # Both cells are split below, so a missing value in either one means
        # there is nothing to pair up for this trial.
        if pandas.isna(raw_ids) or pandas.isna(raw_terms):
            continue
        # NOTE(review): zip() stops at the shorter list, so an identifier without
        # a matching term (or vice versa) is silently dropped.
        for mesh_id, mesh_term in zip(raw_ids.split("|"), raw_terms.split("|")):
            fixed_mesh_id = get_correct_mesh_id(mesh_id, mesh_term)
            # A falsy result means no usable identifier came back; skip the entry.
            if fixed_mesh_id:
                collected[nct_id].append(fixed_mesh_id)
    return collected


conditions = _collect_mesh_ids(df, "ConditionMeshId", "ConditionMeshTerm")
interventions = _collect_mesh_ids(df, "InterventionMeshId", "InterventionMeshTerm")

In [4]:
# Flatten the per-trial lists once so that the total and the distinct count
# for each annotation type are derived from the same sequence.
all_condition_ids = [
    mesh_id for trial_mesh_ids in conditions.values() for mesh_id in trial_mesh_ids
]
all_intervention_ids = [
    mesh_id for trial_mesh_ids in interventions.values() for mesh_id in trial_mesh_ids
]

n_conditions = len(all_condition_ids)
n_unique_conditions = len(set(all_condition_ids))
n_interventions = len(all_intervention_ids)
n_unique_interventions = len(set(all_intervention_ids))

print(f"Of {n_conditions:,} conditions, {n_unique_conditions:,} are unique")
print(f"Of {n_interventions:,} intervention, {n_unique_interventions:,} are unique")

Of 721,997 conditions, 4,181 are unique
Of 279,610 intervention, 3,614 are unique


In [5]:
# Index of MeSH identifier -> ChEBI identifier, built from mappings recorded in
# either direction (mesh -> chebi or chebi -> mesh).
# NOTE(review): a MeSH identifier that occurs in several mappings keeps only the
# last ChEBI identifier seen, since later assignments overwrite earlier ones.
mesh_chebi_mappings = {}

for mapping in load_mappings():
    direction = (mapping["source prefix"], mapping["target prefix"])
    if direction == ("mesh", "chebi"):
        mesh_id, chebi_id = mapping["source identifier"], mapping["target identifier"]
    elif direction == ("chebi", "mesh"):
        mesh_id, chebi_id = mapping["target identifier"], mapping["source identifier"]
    else:
        # Mappings between any other pair of resources are irrelevant here.
        continue
    mesh_chebi_mappings[mesh_id] = chebi_id

len(mesh_chebi_mappings)

2663

In [6]:
# NOTE(review): these two lists are created here but never filled in this cell;
# presumably a later cell populates or plots them -- confirm before removing.
absolute_distribution = []
relative_distribution = []

# Per-trial tallies of how many intervention MeSH identifiers map to ChEBI.
all_mappable = some_mappable = none_mappable = 0
n_trials = len(interventions)
unique_chemicals = set()

for trial, mesh_ids in interventions.items():
    # Look up every intervention and keep only the ones that have a ChEBI mapping.
    mapped_chebi_ids = [
        chebi_id for chebi_id in map(mesh_chebi_mappings.get, mesh_ids) if chebi_id
    ]
    unique_chemicals.update(mapped_chebi_ids)
    n_mappable = len(mapped_chebi_ids)

    if n_mappable == len(mesh_ids):
        all_mappable += 1
    elif n_mappable:
        some_mappable += 1
    else:
        none_mappable += 1

# NOTE(review): the last line of the report divides a count of distinct ChEBI
# identifiers by the number of MeSH keys in the mapping index.
print(
    f"""\
{all_mappable:,}/{n_trials:,} ({all_mappable/n_trials:.1%}) trials were fully mapped
{some_mappable:,}/{n_trials:,} ({some_mappable/n_trials:.1%}) trials were only partially mapped
{all_mappable + some_mappable:,}/{n_trials:,} ({(all_mappable + some_mappable)/n_trials:.1%}) trials were either partially or fully mapped
{none_mappable:,}/{n_trials:,} ({none_mappable/n_trials:.1%}) trials were unmapped
{len(unique_chemicals):,}/{len(mesh_chebi_mappings):,} ({len(unique_chemicals)/len(mesh_chebi_mappings):.1%}) ChEBI mappings were used
"""
)

66,838/142,213 (47.0%) trials were fully mapped
33,427/142,213 (23.5%) trials were only partially mapped
100,265/142,213 (70.5%) trials were either partially or fully mapped
41,948/142,213 (29.5%) trials were unmapped
987/2,663 (37.1%) ChEBI mappings were used

