In [1]:
import csv
from collections import Counter
from therapy import PROJECT_ROOT
from therapy.normalizers import Wikidata
import json

In [2]:
drug_claim_records = list()
with open(PROJECT_ROOT / 'data' / 'drugs.tsv', 'r') as f:
    reader = csv.DictReader(f, delimiter='\t')
    for record in reader:
        if record['drug_claim_source'] == 'PubChem':
            continue
        drug_claim_records.append(record)
    

In [3]:
drug_claim_records[0]

{'drug_claim_name': 'DCL001024',
 'drug_name': 'OLEIC ACID',
 'chembl_id': 'CHEMBL8659',
 'drug_claim_source': 'TTD'}

In [4]:
w = Wikidata()

In [13]:
c = Counter()
match = list()
chembl_discordant = list()
discordant = list()
no_match = list()
ambiguous = list()
for record in drug_claim_records:
    resp1 = w.normalize(record['drug_claim_name'])
    record['dcn_w'] = resp1
    resp2 = w.normalize(record['drug_name'])
    record['dn_w'] = resp2
    
    if resp1.match_type is None or resp1.match_type == 'ambiguous':
        if resp2.match_type is None:
            t = resp1.match_type
        else:
            t = resp2.match_type
    elif resp2.match_type is None or resp2.match_type == 'ambiguous':
        if resp1.match_type is None:
            t = resp2.match_type
        else:
            t = resp1.match_type
    else:
        label1 = resp1.normalized_label
        label2 = resp2.normalized_label
        if label1 == label2:
            t = 'concordant'
        else:
            t = 'discordant'
    if t == 'discordant':
        discordant.append(record)
    elif t is None:
        no_match.append(record)
    elif t == 'ambiguous':
        ambiguous.append(record)
    else:
        if record['chembl_id'] and record['chembl_id'] in resp1.aliases.get('chembl', set()):
            match.append(record)
        else:
            chembl_discordant.append(record)
            t = 'chembl-discordant'
    c[t] += 1
c

Counter({None: 11071,
         'concordant': 4175,
         'chembl-discordant': 3628,
         'ambiguous': 172,
         'case-insensitive-match': 25,
         'discordant': 145,
         'match': 49})

In [22]:
chembl_discordant_counts = Counter([x['drug_claim_source'] for x in chembl_discordant])
no_match_counts = Counter([x['drug_claim_source'] for x in no_match])
total_counts = Counter([x['drug_claim_source'] for x in drug_claim_records])

In [23]:
for k in chembl_discordant_counts:
    v = chembl_discordant_counts[k]
    s = total_counts[k]
    print(f'{k}: {v} / {s} ({v/s*100:.1f}%)')

TdgClinicalTrial: 153 / 2419 (6.3%)
GuideToPharmacologyInteractions: 1634 / 6495 (25.2%)
CGI: 24 / 155 (15.5%)
TTD: 1226 / 2720 (45.1%)
ChemblInteractions: 145 / 2716 (5.3%)
MyCancerGenome: 19 / 252 (7.5%)
NCI: 191 / 1519 (12.6%)
ClearityFoundationClinicalTrial: 7 / 115 (6.1%)
OncoKB: 15 / 99 (15.2%)
TALC: 18 / 274 (6.6%)
CKB: 47 / 733 (6.4%)
CIViC: 30 / 280 (10.7%)
CancerCommons: 8 / 80 (10.0%)
TEND: 76 / 989 (7.7%)
FDA: 20 / 204 (9.8%)
MyCancerGenomeClinicalTrial: 10 / 112 (8.9%)
ClearityFoundationBiomarkers: 5 / 64 (7.8%)


The above clearly demonstrates that a few resources are big outliers here: TTD and GtP. Reviewing / cleaning up the import process for those may help considerably with ChEMBL alignment.

In [15]:
chembl_discordant[0]

{'drug_claim_name': 'PEMETREXED',
 'drug_name': 'PEMETREXED (CHEMBL1201258)',
 'chembl_id': 'CHEMBL1201258',
 'drug_claim_source': 'TdgClinicalTrial',
 'dcn_w': NormalizerResponse(input_term='PEMETREXED', normalized_label={'pemetrexed'}, aliases={'item': {'http://www.wikidata.org/entity/Q415220'}, 'itemLabel': {'pemetrexed'}, 'casRegistry': {'137281-23-3'}, 'pubchemCompound': {'446556'}, 'chembl': {'CHEMBL225072'}, 'rxnorm': {'68446'}, 'drugbank': {'00642'}, 'altLabel': {'Pemetrexed', 'Alimta®', 'LY-231514'}}, match_type='case-insensitive-match'),
 'dn_w': NormalizerResponse(input_term='PEMETREXED (CHEMBL1201258)', normalized_label=None, aliases={}, match_type=None)}

Investigating this above record, I found that both ChEMBL IDs are valid, but distinct stereoisoforms. The Wikidata ID was the only one listed on the medication entry, and this is in fact the stereoisoform that has made it to approved usage.

In [25]:
for k in no_match_counts:
    v = no_match_counts[k]
    s = total_counts[k]
    print(f'{k}: {v} / {s} ({v/s*100:.1f}%)')

TTD: 1465 / 2720 (53.9%)
GuideToPharmacologyInteractions: 4528 / 6495 (69.7%)
NCI: 810 / 1519 (53.3%)
TdgClinicalTrial: 1182 / 2419 (48.9%)
CKB: 521 / 733 (71.1%)
ChemblInteractions: 1851 / 2716 (68.2%)
TALC: 172 / 274 (62.8%)
MyCancerGenomeClinicalTrial: 39 / 112 (34.8%)
CGI: 33 / 155 (21.3%)
ClearityFoundationBiomarkers: 20 / 64 (31.2%)
OncoKB: 23 / 99 (23.2%)
TEND: 59 / 989 (6.0%)
MyCancerGenome: 120 / 252 (47.6%)
ClearityFoundationClinicalTrial: 59 / 115 (51.3%)
FDA: 19 / 204 (9.3%)
CIViC: 130 / 280 (46.4%)
CancerCommons: 37 / 80 (46.2%)
DoCM: 3 / 39 (7.7%)


Across the board, we are missing big swaths of terms. Not great. However, this is currently only items in Wikidata that are an instance of medication (or a medication subclass). There are many chemicals in Wikidata that are not labeled as "medication", and including those as a secondary normalization step will likely improve these values.