In [3]:
import pandas
import pystow
df = pandas.read_csv(pystow.join('indra', 'cogex', 'clinicaltrials', name='clinical_trials.csv.gz'),
                     skiprows=10)

In [4]:
df.head()

Unnamed: 0,Rank,NCTId,BriefTitle,Condition,ConditionMeshTerm,ConditionMeshId,InterventionName,InterventionType,InterventionMeshTerm,InterventionMeshId
0,1,NCT05477004,Observational Study of Ketamine Infusions for ...,Chronic Pain,Chronic Pain,D000059350,Ketamine,Drug,Ketamine,D000007649
1,2,NCT05476991,EVALUATION OF LOW DOSE COLCHICINE AND TICAGREL...,"Stroke|Stroke, Ischemic|Atherosclerosis|Myocar...",Stroke|Ischemic Stroke|Cerebral Infarction|Myo...,D000020521|D000083242|D000002544|D000009203|D0...,Colchicine 0.5 MG|Ticagrelor 90mg|Aspirin 75-3...,Drug|Drug|Drug,Aspirin|Colchicine|Ticagrelor,D000001241|D000003078|D000077486
2,3,NCT05476978,Artificial Intelligence in EUS for Diagnosing ...,"Pancreatic Ductal Adenocarcinoma|Pancreatitis,...",Neuroendocrine Tumors|Pancreatitis|Autoimmune ...,D000018358|D000010195|D000081012|D000050500,EUS-AI model,Diagnostic Test,,
3,4,NCT05476965,Selected De-escalation Radiotherapy for Postop...,Head and Neck Squamous Cell Carcinoma|Radiothe...,"Carcinoma|Carcinoma, Squamous Cell|Squamous Ce...",D000002277|D000002294|D000077195,induction therapy; surgery; radiotherapy,Combination Product,,
4,5,NCT05476952,Lean Body Weight-adjusted Rocuronium Dose and ...,There is no Consensus on Use for Use in Intuba...,Body Weight,D000001835,he patients who are administered rocuronium ac...,Procedure,Rocuronium,D000077123


In [5]:
from indra_cogex.sources.clinicaltrials import get_correct_mesh_id

conditions = []
interventions = []
for row in df.itertuples():
    if not pandas.isna(row.ConditionMeshTerm):
        for mesh_id, mesh_term in zip(row.ConditionMeshId.split('|'),
                                      row.ConditionMeshTerm.split('|')):
            fixed_mesh_id = get_correct_mesh_id(mesh_id, mesh_term)
            if not fixed_mesh_id:
                continue
            conditions.append(fixed_mesh_id)
    if not pandas.isna(row.InterventionMeshTerm):
        for mesh_id, mesh_term in zip(row.InterventionMeshId.split('|'),
                                      row.InterventionMeshTerm.split('|')):
            fixed_mesh_id = get_correct_mesh_id(mesh_id, mesh_term)
            if not fixed_mesh_id:
                continue
            interventions.append(fixed_mesh_id)

In [6]:
print('Total condition instances: %d out of which %d are unique' % (len(conditions), len(set(conditions))))
print('Total intervention instances: %d out of which %d are unique' % (len(interventions), len(set(interventions))))

Total condition instances: 721997 out of which 4181 are unique
Total intervention instances: 279610 out of which 3614 are unique


In [8]:
from biomappings import load_mappings, load_predictions
mappings = load_mappings()
predictions = load_predictions()

In [9]:
# Get MeSH mappings and make sure order is canonicalized
mesh_mappings = [
    (mapping['source prefix'], mapping['source identifier'], mapping['target prefix'], mapping['target identifier'])
    if mapping['source prefix'] == 'mesh' else
    (mapping['target prefix'], mapping['target identifier'], mapping['source prefix'], mapping['source identifier'])
    for mapping in mappings
    if mapping['relation'] == 'skos:exactMatch'
    and (mapping['source prefix'] == 'mesh'
         or mapping['target prefix'] == 'mesh')
]

In [10]:
mesh_predictions = [
    (mapping['source prefix'], mapping['source identifier'], mapping['target prefix'], mapping['target identifier'])
    if mapping['source prefix'] == 'mesh' else
    (mapping['target prefix'], mapping['target identifier'], mapping['source prefix'], mapping['source identifier'])
    for mapping in predictions
    if mapping['relation'] == 'skos:exactMatch'
    and (mapping['source prefix'] == 'mesh'
         or mapping['target prefix'] == 'mesh')
]

In [11]:
# Some sanity checks
assert len(mesh_mappings) == len(set(mesh_mappings))
assert len(mesh_predictions) == len(set(mesh_predictions))
assert not set(mesh_mappings) & set(mesh_predictions)

In [12]:
print('We have a total of %d curated MeSH mappings and %d uncurated predictions.' %
      (len(mesh_mappings), len(mesh_predictions)))

We have a total of 6179 curated MeSH mappings and 37287 uncurated predictions.


Curated mappings are to the following prefixes:

In [13]:
Counter([mapping[2] for mapping in mesh_mappings]).most_common()

[('chebi', 2466),
 ('doid', 1454),
 ('ncit', 799),
 ('go', 404),
 ('uniprot', 378),
 ('mondo', 206),
 ('uberon', 131),
 ('hgnc', 102),
 ('efo', 97),
 ('cl', 81),
 ('kegg.pathway', 25),
 ('pubchem.compound', 17),
 ('hp', 16),
 ('umls', 2),
 ('pfam', 1)]

Uncurated predictions are to the following prefixes:

In [15]:
Counter([mapping[2] for mapping in mesh_predictions]).most_common()

[('chebi', 12300),
 ('ncit', 11844),
 ('uniprot', 10970),
 ('hgnc', 1375),
 ('hp', 356),
 ('efo', 188),
 ('mondo', 81),
 ('uberon', 68),
 ('doid', 61),
 ('go', 44)]

In [16]:
trial_sets = {
    'interventions': Counter(interventions),
    'conditions': Counter(conditions)
}

biomappings_sets = {
    'mappings': mesh_mappings,
    'predictions': mesh_predictions
    }

biomappings_cnt = {
    'mappings': {m[1] for m in mesh_mappings},
    'predictions': {m[1] for m in mesh_predictions}
}

ranked_curations = {}
p80 = {}
import numpy as np
for trial_set, mesh_id_cnt in trial_sets.items():
    print(trial_set)
    print('-------')
    mapping_overlap = set(mesh_id_cnt) & biomappings_cnt['mappings']
    prediction_overlap = set(mesh_id_cnt) & biomappings_cnt['predictions']
    
    ids_to_curate = prediction_overlap - biomappings_cnt['mappings']
    print(
        ('There are %d MeSH IDs out of which %d have mappings and %d predictions. '
         'Out of these %d IDs don\'t have any curated mappings.') % 
          (len(mesh_id_cnt),
           len(mapping_overlap),
           len(prediction_overlap),
           len(ids_to_curate)
          )
    )
    ranked_curations[trial_set] = sorted([(k, v) for k, v in mesh_id_cnt.items()
                                          if k in ids_to_curate],
                                         key=lambda x: x[1],
                                         reverse=True)
    cumsum_curations = np.cumsum([c[1] for c in ranked_curations[trial_set]])
    print('If all %d predictions were curated, we could have mappings for a max of %d additional instances.'
          % (len(ids_to_curate), cumsum_curations[-1]))
    
    p80[trial_set] = sum(cumsum_curations < cumsum_curations[-1]*0.8)
    print('We can cover 80%% of these instances by curating just %d predictions.' % p80[trial_set])
    print()

interventions
-------
There are 3614 MeSH IDs out of which 853 have mappings and 1928 predictions. Out of these 1663 IDs don't have any curated mappings.
If all 1663 predictions were curated, we could have mappings for a max of 150512 additional instances.
We can cover 80% of these instances by curating just 282 predictions.

conditions
-------
There are 4181 MeSH IDs out of which 488 have mappings and 2153 predictions. Out of these 2021 IDs don't have any curated mappings.
If all 2021 predictions were curated, we could have mappings for a max of 359848 additional instances.
We can cover 80% of these instances by curating just 278 predictions.



In [17]:
mesh_ids_to_curate = set()
for trial_set, ranked_curs in ranked_curations.items():
    mesh_ids_to_curate |= {c[0] for c in ranked_curs[:p80[trial_set]]}

In [18]:
print('We need to curate predictions for these %d MeSH IDs:' % len(mesh_ids_to_curate))
print(sorted(mesh_ids_to_curate))

We need to curate predictions for these 560 MeSH IDs:
['C000604908', 'C005900', 'C008958', 'C009022', 'C018038', 'C021322', 'C021650', 'C024352', 'C030262', 'C033563', 'C034759', 'C042382', 'C042705', 'C055085', 'C056516', 'C068538', 'C081222', 'C082598', 'C093875', 'C100416', 'C485206', 'C512542', 'C520025', 'C522181', 'C527517', 'C530716', 'C533178', 'C543333', 'C545685', 'C548400', 'C549068', 'C562325', 'C570240', 'D000022', 'D000067877', 'D000068256', 'D000068258', 'D000068437', 'D000068736', 'D000068800', 'D000068817', 'D000068818', 'D000068877', 'D000068878', 'D000068879', 'D000068896', 'D000068900', 'D000069036', 'D000069057', 'D000069444', 'D000069448', 'D000069461', 'D000069549', 'D000069579', 'D000069594', 'D000070636', 'D000070642', 'D000072836', 'D000074323', 'D000074324', 'D000077143', 'D000077190', 'D000077209', 'D000077212', 'D000077216', 'D000077237', 'D000077270', 'D000077274', 'D000077544', 'D000077555', 'D000077594', 'D000079963', 'D000082', 'D000111', 'D000152', 'D0

In [20]:
ranked_curations['conditions']

[('D003920', 9147),
 ('D013577', 8760),
 ('D007938', 5613),
 ('D008223', 5505),
 ('D006973', 5482),
 ('D004194', 5012),
 ('D020521', 4627),
 ('D003324', 4534),
 ('D003866', 4507),
 ('D014947', 4194),
 ('D007674', 3504),
 ('D001168', 3366),
 ('D006505', 3340),
 ('D006331', 3311),
 ('D001249', 3212),
 ('D009765', 3178),
 ('D010300', 3139),
 ('D010149', 3116),
 ('D012598', 3110),
 ('D012559', 2915),
 ('D009101', 2915),
 ('D017202', 2908),
 ('D003922', 2733),
 ('D006506', 2715),
 ('D009103', 2600),
 ('D006470', 2578),
 ('D007238', 2482),
 ('D009362', 2477),
 ('D054219', 2467),
 ('D008545', 2433),
 ('D007511', 2389),
 ('D001172', 2339),
 ('D009203', 2337),
 ('D006526', 2276),
 ('D051436', 2198),
 ('D050177', 2155),
 ('D007945', 2054),
 ('D007246', 2018),
 ('D000230', 2005),
 ('D014456', 1985),
 ('D017116', 1925),
 ('D050723', 1923),
 ('D047928', 1903),
 ('D001930', 1840),
 ('D001523', 1760),
 ('D007676', 1703),
 ('D011014', 1688),
 ('D000163', 1648),
 ('D011565', 1576),
 ('D000077216', 1555