In [23]:
import csv
from collections import Counter
from operator import attrgetter
from therapy import PROJECT_ROOT
from therapy.normalizers import Wikidata
import json

In [24]:
# Read the tab-separated drug table into a list of row dicts, dropping every
# row whose 'drug_claim_source' column is 'PubChem'.
# newline='' hands line-ending handling to the csv module, as its documentation
# requires; without it, quoted fields containing line breaks can be misparsed.
with open(PROJECT_ROOT / 'data' / 'drugs.tsv', 'r', newline='') as f:
    reader = csv.DictReader(f, delimiter='\t')
    drug_claim_records = [
        record for record in reader
        if record['drug_claim_source'] != 'PubChem'
    ]
    

In [25]:
# Display the first retained row to check which columns the table provides.
drug_claim_records[0]

{'drug_claim_name': 'DCL001024',
 'drug_name': 'OLEIC ACID',
 'chembl_id': 'CHEMBL8659',
 'drug_claim_source': 'TTD'}

In [26]:
# Build the Wikidata normalizer once; its normalize() method is called for
# every record in the classification loop.
w = Wikidata()

In [27]:
# Tally of the label assigned to each record, plus one list of records per
# outcome category.
c = Counter()
match = list()
chembl_discordant = list()  # initialised here but not populated by this cell
discordant = list()
no_match = list()
ambiguous = list()


def _classify_match(resp1, resp2):
    """Reduce the two normalizer responses for one record to a single label.

    Args:
        resp1: Response from normalizing the record's 'drug_claim_name'.
        resp2: Response from normalizing the record's 'drug_name'.

    Returns:
        * ``resp2.match_type`` when ``resp1`` is unmatched (``None``) or
          'ambiguous' and ``resp2`` has any non-``None`` match type;
        * ``resp1.match_type`` (``None`` or 'ambiguous') when ``resp1`` is
          unmatched/ambiguous and ``resp2`` is unmatched;
        * ``resp1.match_type`` when only ``resp1`` resolved to a definite match;
        * 'concordant' / 'discordant' when both resolved, depending on whether
          their single therapy records share a ``concept_identifier``.

    Raises:
        AssertionError: if both responses resolved but either one does not
            carry exactly one therapy record.
    """
    type1 = resp1.match_type
    type2 = resp2.match_type

    if type1 is None or type1 == 'ambiguous':
        # Fall back on the drug_name result whenever it says anything at all.
        return type1 if type2 is None else type2

    if type2 is None or type2 == 'ambiguous':
        # type1 is a definite match here, so it is always the label to report.
        return type1

    assert len(resp1.therapy_records) == 1
    assert len(resp2.therapy_records) == 1
    id1 = resp1.therapy_records[0].concept_identifier
    id2 = resp2.therapy_records[0].concept_identifier
    return 'concordant' if id1 == id2 else 'discordant'


# Labels with a dedicated list; every other label (a definite match type or
# 'concordant') is collected in `match`.
buckets = {'discordant': discordant, None: no_match, 'ambiguous': ambiguous}

for record in drug_claim_records:
    # Keep both raw responses on the record so later cells can inspect them.
    resp1 = w.normalize(record['drug_claim_name'])
    record['dcn_w'] = resp1
    resp2 = w.normalize(record['drug_name'])
    record['dn_w'] = resp2

    t = _classify_match(resp1, resp2)
    buckets.get(t, match).append(record)
    c[t] += 1
c

Counter({None: 11063,
         'concordant': 4426,
         'case-insensitive-match': 2625,
         'match': 830,
         'ambiguous': 176,
         'discordant': 145})

In [31]:
# Count the unmatched records per originating claim source.
c = Counter(unmatched['drug_claim_source'] for unmatched in no_match)

In [32]:
# Display the current contents of the counter `c`.
c

Counter({'TTD': 1464,
         'GuideToPharmacologyInteractions': 4528,
         'NCI': 805,
         'TdgClinicalTrial': 1182,
         'CKB': 521,
         'ChemblInteractions': 1850,
         'TALC': 172,
         'MyCancerGenomeClinicalTrial': 39,
         'CGI': 33,
         'ClearityFoundationBiomarkers': 20,
         'OncoKB': 23,
         'TEND': 59,
         'MyCancerGenome': 120,
         'ClearityFoundationClinicalTrial': 59,
         'FDA': 18,
         'CIViC': 130,
         'CancerCommons': 37,
         'DoCM': 3})