In [80]:
from collections import Counter
import sys

# Add the parent directory to the module search path (presumably so the local
# `filter_clinvar_xml` module imported in the next cell resolves — TODO confirm).
sys.path.append('..')

In [81]:
from filter_clinvar_xml import filter_xml, pprint, iterate_cvs_from_xml
from eva_cttv_pipeline.clinvar_xml_io.clinvar_xml_io import *

In [78]:
import pandas as pd
import matplotlib.pyplot as plt

# Never truncate cell contents when rendering frames (the trait-name lists are long).
pd.set_option('display.max_colwidth', None)
# When a frame is too long and gets truncated for display, still show 100 rows.
pd.set_option('display.min_rows', 100)

In [148]:
import csv
import json
from collections import Counter, defaultdict

## Part 1: ClinVar

In [2]:
# Input XML read by ClinVarDataset / iterate_cvs_from_xml below (gzipped, going by the extension).
# NOTE(review): absolute, machine-specific path — adjust before re-running elsewhere.
drug_xml = '/home/april/projects/opentargets/drug-response.xml.gz'

In [33]:
def is_pgkb(raw_cvs_xml):
    """Return True if any submission ID on this record names PharmGKB as submitter.

    Looks up every `./ClinVarAssertion/ClinVarSubmissionID` element under
    `raw_cvs_xml` and checks its `submitter` attribute.
    """
    submission_ids = find_elements(raw_cvs_xml, './ClinVarAssertion/ClinVarSubmissionID')
    return any(
        submission_id.attrib.get('submitter') == 'PharmGKB'
        for submission_id in submission_ids
    )

In [147]:
# Build one summary dict per record that has both a measure and at least one
# validly named trait.
# NOTE(review): zip() assumes ClinVarDataset and iterate_cvs_from_xml yield the
# same records in the same order for this file — confirm.
results = []
paired_records = zip(ClinVarDataset(drug_xml), iterate_cvs_from_xml(drug_xml))
for record, raw_cvs_xml in paired_records:
    if record.measure and record.traits_with_valid_names:
        summary = {
            'rcv': record.accession,
            'variant': record.measure.vcf_full_coords,
        }
        # HGVS is optional: fall back to None when no preferred current HGVS exists.
        preferred_hgvs = record.measure.preferred_current_hgvs
        summary['hgvs'] = preferred_hgvs.text if preferred_hgvs else None
        # Trait names are normalised (stripped, lower-cased) so they can be tallied later.
        summary['all_traits'] = [
            trait_name.strip().lower()
            for trait in record.traits_with_valid_names
            for trait_name in trait.all_valid_names
        ]
        summary['preferred_names'] = [
            trait.preferred_or_other_valid_name.strip().lower()
            for trait in record.traits_with_valid_names
        ]
        summary['is_pgkb'] = is_pgkb(raw_cvs_xml)
        results.append(summary)

In [149]:
def _tally_names(records, names_key):
    """Count occurrences of each name under `names_key` across `records`.

    Returns a `(counts, pgkb_flags)` pair: `counts` maps each name to the number
    of times it appears, and `pgkb_flags` maps each name to True if at least one
    record carrying it has a truthy `is_pgkb`.
    """
    counts = Counter()
    pgkb_flags = defaultdict(bool)
    for record_summary in records:
        for name in record_summary[names_key]:
            counts[name] += 1
            pgkb_flags[name] |= record_summary['is_pgkb']
    return counts, pgkb_flags


# Tally every valid trait name, then only the preferred names.
trait_counts, trait_pgkb = _tally_names(results, 'all_traits')
preferred_counts, preferred_pgkb = _tally_names(results, 'preferred_names')

In [150]:
# Write one row per preferred trait name: name, record count, PharmGKB flag.
# csv.writer doubles any double quote embedded in a trait name, which the
# previous hand-built f-string row did not, so such names no longer corrupt the
# file. QUOTE_NONNUMERIC quotes the name and leaves the count and the boolean
# flag bare, keeping the `"name",count,True` row layout.
# newline='' plus an explicit '\n' terminator follows the csv module's guidance
# for file objects and keeps plain '\n' line endings.
with open('/home/april/projects/opentargets/drug-response/preferred-traits.csv', 'w', newline='') as output_file:
    writer = csv.writer(output_file, quoting=csv.QUOTE_NONNUMERIC, lineterminator='\n')
    for name, count in preferred_counts.items():
        writer.writerow([name, count, preferred_pgkb[name]])

In [145]:
# Write one row per trait name (all valid names): name, record count, PharmGKB flag.
# csv.writer doubles any double quote embedded in a trait name, which the
# previous hand-built f-string row did not, so such names no longer corrupt the
# file. QUOTE_NONNUMERIC quotes the name and leaves the count and the boolean
# flag bare, keeping the `"name",count,True` row layout.
# newline='' plus an explicit '\n' terminator follows the csv module's guidance
# for file objects and keeps plain '\n' line endings.
with open('/home/april/projects/opentargets/drug-response/traits.csv', 'w', newline='') as output_file:
    writer = csv.writer(output_file, quoting=csv.QUOTE_NONNUMERIC, lineterminator='\n')
    for name, count in trait_counts.items():
        writer.writerow([name, count, trait_pgkb[name]])

In [151]:
# One row per record summary, keyed by RCV accession.
df = pd.DataFrame(results).set_index('rcv')

In [152]:
# Display the per-record summary table (indexed by RCV accession).
df

Unnamed: 0_level_0,variant,hgvs,all_traits,preferred_names,is_pgkb
rcv,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
RCV000211201,10_94761900_C_A,NC_000010.11:g.94761900C>A,"[clopidogrel response - dosage, efficacy, toxicity/adr]","[clopidogrel response - dosage, efficacy, toxicity/adr]",True
RCV000211253,10_94761900_C_A,NC_000010.11:g.94761900C>A,[citalopram response - metabolism/pk],[citalopram response - metabolism/pk],True
RCV000211375,10_94761900_C_A,NC_000010.11:g.94761900C>A,[escitalopram response - metabolism/pk],[escitalopram response - metabolism/pk],True
RCV000000759,8_18400344_T_C,NC_000008.11:g.18400344T>C,"[acetylation, slow, inh inactivation, slow, isoniazid inactivation, slow, slow acetylator phenotype, slow acetylator due to n-acetyltransferase enzyme variant]",[slow acetylator due to n-acetyltransferase enzyme variant],False
RCV000000761,8_18400860_G_A,NC_000008.11:g.18400860G>A,"[acetylation, slow, inh inactivation, slow, isoniazid inactivation, slow, slow acetylator phenotype, slow acetylator due to n-acetyltransferase enzyme variant]",[slow acetylator due to n-acetyltransferase enzyme variant],False
RCV000000762,8_18400194_G_A,NC_000008.11:g.18400194G>A,"[acetylation, slow, inh inactivation, slow, isoniazid inactivation, slow, slow acetylator phenotype, slow acetylator due to n-acetyltransferase enzyme variant]",[slow acetylator due to n-acetyltransferase enzyme variant],False
RCV000002294,16_31091243_A_C,NC_000016.10:g.31091243A>C,"[coumarin sensitivity, coumarin, poor metabolism of, coumadin response, coumarin resistance, cyp2a6, v1, warfarin resistance, warfarin response, warfarin sensitivity]",[warfarin response],False
RCV000002738,7_44540234_C_A,NC_000007.14:g.44540234C>A,"[ezetimibe, nonresponse to, ezetimibe response]",[ezetimibe response],False
RCV000002739,7_44516100_A_T,NC_000007.14:g.44516100A>T,"[ezetimibe, nonresponse to, ezetimibe response]",[ezetimibe response],False
RCV000003038,7_141973545_C_G,NC_000007.14:g.141973545C>G,[phenylthiocarbamide tasting],[phenylthiocarbamide tasting],False


In [154]:
# Trait names that occur among all valid names but never as a preferred name.
trait_counts.keys() - preferred_counts.keys()

{'1-deoxygalactonojirimycin response',
 '5-fluorouracil toxicity',
 '5-fluorouracil toxicity, included',
 '5-fu response',
 'acetylation, slow',
 'all trans retinoic acid response',
 'ansaid response',
 'anti-pdl1 response',
 'atra response',
 'austedo response',
 'azasan response',
 'camptosar response',
 'carcinoma, squamous cell of head and neck',
 'clopidogrel, poor metabolism of',
 'codeine, ultrarapid metabolism of',
 'corticosteroid response',
 'coumadin response',
 'coumarin resistance',
 'coumarin sensitivity',
 'coumarin, poor metabolism of',
 'cyp2a6, v1',
 'deafness, streptomycin-induced',
 'dilantin response',
 'dopamine agonist response',
 'efavirenz central nervous system toxicity, susceptibility to',
 'efavirenz, poor metabolism of',
 'ezetimibe, nonresponse to',
 'feldene response',
 'galafold response',
 'gemzar response',
 'gentamicin sulfate response',
 'gleevec response',
 'glipizide poor metabolizer',
 'head and neck squamous cell carcinoma',
 'hypersensitivity sy

In [112]:
# Read the evidence strings file; each line is parsed as its own JSON document.
with open('/home/april/projects/opentargets/drug-response/evidence_strings/evidence_strings.json') as input_file:
    evidence_strs = list(input_file)

evidence = [json.loads(line.strip()) for line in evidence_strs]

In [168]:
# Evidence whose source trait mentions fluorouracil but whose target gene is
# anything other than ENSG00000188641 (the target on the fluorouracil examples
# in this notebook).
f_evidence = [
    ev
    for ev in evidence
    if 'fluorouracil' in ev['diseaseFromSource'].lower()
    and ev['targetFromSourceId'] != 'ENSG00000188641'
]

In [169]:
# Show the matches; an empty list means every fluorouracil evidence string targets ENSG00000188641.
f_evidence

[]

In [172]:
# Evidence whose source trait mentions ivacaftor (case-insensitive).
i_evidence = [
    ev
    for ev in evidence
    if 'ivacaftor' in ev['diseaseFromSource'].lower()
]

In [173]:
# Show the ivacaftor evidence strings.
i_evidence

[{'alleleOrigins': ['germline'],
  'datasourceId': 'eva',
  'datatypeId': 'genetic_association',
  'clinicalSignificances': ['drug response'],
  'confidence': 'reviewed by expert panel',
  'studyId': 'RCV000211129',
  'targetFromSourceId': 'ENSG00000001626',
  'variantFunctionalConsequenceId': 'SO_0001583',
  'variantId': '7_117587801_T_G',
  'variantRsId': 'rs121909005',
  'cohortPhenotypes': ['ivacaftor response - Efficacy'],
  'diseaseFromSource': 'ivacaftor response - Efficacy',
  'diseaseFromSourceId': 'CN236562',
  'diseaseFromSourceMappedId': 'EFO_0009166',
  'variantHgvsId': 'NC_000007.14:g.117587801T>G'},
 {'alleleOrigins': ['germline'],
  'datasourceId': 'eva',
  'datatypeId': 'genetic_association',
  'clinicalSignificances': ['drug response'],
  'confidence': 'reviewed by expert panel',
  'studyId': 'RCV000211150',
  'targetFromSourceId': 'ENSG00000001626',
  'variantFunctionalConsequenceId': 'SO_0001583',
  'variantId': '7_117642451_G_A',
  'variantRsId': 'rs267606723',
  '

In [155]:
# Inspect the first evidence string to see which fields are present.
evidence[0]

{'alleleOrigins': ['germline'],
 'datasourceId': 'eva',
 'datatypeId': 'genetic_association',
 'clinicalSignificances': ['drug response'],
 'confidence': 'reviewed by expert panel',
 'studyId': 'RCV000211201',
 'targetFromSourceId': 'ENSG00000165841',
 'variantFunctionalConsequenceId': 'SO_0001631',
 'variantId': '10_94761900_C_A',
 'variantRsId': 'rs12248560',
 'cohortPhenotypes': ['clopidogrel response - Dosage, Efficacy, Toxicity/ADR'],
 'diseaseFromSource': 'clopidogrel response - Dosage, Efficacy, Toxicity/ADR',
 'diseaseFromSourceId': 'CN236507',
 'variantHgvsId': 'NC_000010.11:g.94761900C>A'}

#### 1. Single specific category

a)
```
{'alleleOrigins': ['germline'],
  'datasourceId': 'eva',
  'datatypeId': 'genetic_association',
  'clinicalSignificances': ['drug response'],
  'confidence': 'reviewed by expert panel',
  'studyId': 'RCV001787916',
  'targetFromSourceId': 'ENSG00000188641',
  'variantFunctionalConsequenceId': 'SO_0001583',
  'variantId': '1_97699535_T_C',
  'variantRsId': 'rs2297595',
  'cohortPhenotypes': ['fluorouracil response - Toxicity'],
  'diseaseFromSource': 'fluorouracil response - Toxicity',
  'variantHgvsId': 'NC_000001.11:g.97699535T>C'}
```

Proposal:
- add `drugFromSource`, `drugResponse` fields with appropriate ontology terms / ids:
```
'drugFromSource': 'CHEMBL185',
'drugResponse': 'EFO:0011061'
```
- remove `diseaseFromSource` (?) - it's not meaningful in this case

b)

```
{'alleleOrigins': ['germline'],
  'datasourceId': 'eva',
  'datatypeId': 'genetic_association',
  'clinicalSignificances': ['pathogenic'],
  'confidence': 'no assertion criteria provided',
  'literature': ['1648430', '9010022'],
  'studyId': 'RCV000030869',
  'targetFromSourceId': 'ENSG00000188641',
  'variantFunctionalConsequenceId': 'SO_0001583',
  'variantId': '1_97079133_T_A',
  'variantRsId': 'rs72547602',
  'cohortPhenotypes': ['5-FU response',
   '5-fluorouracil toxicity',
   '5-fluorouracil toxicity, included',
   'Fluorouracil response'],
  'diseaseFromSource': 'Fluorouracil response',
  'diseaseFromSourceId': 'CN077983',
  'variantHgvsId': 'NC_000001.11:g.97079133T>A'}
 ```

#### 2. Multiple specific categories

a)
```
{'alleleOrigins': ['germline'],
  'datasourceId': 'eva',
  'datatypeId': 'genetic_association',
  'clinicalSignificances': ['drug response'],
  'confidence': 'reviewed by expert panel',
  'studyId': 'RCV000211201',
  'targetFromSourceId': 'ENSG00000165841',
  'variantFunctionalConsequenceId': 'SO_0001631',
  'variantId': '10_94761900_C_A',
  'variantRsId': 'rs12248560',
  'cohortPhenotypes': ['clopidogrel response - Dosage, Efficacy, Toxicity/ADR'],
  'diseaseFromSource': 'clopidogrel response - Dosage, Efficacy, Toxicity/ADR',
  'diseaseFromSourceId': 'CN236507',
  'variantHgvsId': 'NC_000010.11:g.94761900C>A'}
  ```
  
Proposal:
- explode evidence along drug response, i.e. this would generate three evidence strings

b)
```
{'alleleOrigins': ['germline'],
  'datasourceId': 'eva',
  'datatypeId': 'genetic_association',
  'clinicalSignificances': ['drug response'],
  'confidence': 'no assertion criteria provided',
  'literature': ['23418033'],
  'studyId': 'RCV000106297',
  'targetFromSourceId': 'ENSG00000197408',
  'variantFunctionalConsequenceId': 'SO_0001583',
  'variantId': '19_41006968_T_G',
  'variantRsId': 'rs373489637',
  'cohortPhenotypes': ['Efavirenz central nervous system toxicity, susceptibility to',
   'Efavirenz response',
   'Efavirenz, poor metabolism of'],
  'diseaseFromSource': 'Efavirenz response',
  'diseaseFromSourceId': 'C3281153',
  'diseaseFromSourceMappedId': 'EFO_0006904',
  'variantHgvsId': 'NC_000019.10:g.41006968T>G'}
  ```

#### 3. "Other" or otherwise vague categories

a)
```
{'alleleOrigins': ['germline'],
  'datasourceId': 'eva',
  'datatypeId': 'genetic_association',
  'clinicalSignificances': ['drug response'],
  'confidence': 'reviewed by expert panel',
  'studyId': 'RCV001788460',
  'targetFromSourceId': 'ENSG00000188641',
  'variantFunctionalConsequenceId': 'SO_0001587',
  'variantId': '1_97573943_C_A',
  'variantRsId': 'rs78060119',
  'cohortPhenotypes': ['fluorouracil response - Other'],
  'diseaseFromSource': 'fluorouracil response - Other',
  'variantHgvsId': 'NC_000001.11:g.97573943C>A'}
```

Proposal:
- use a generic "response to drug" EFO term in `drugResponse` field
- alternatives: use a specific child term ("response to fluorouracil"), omit the field entirely, omit the evidence entirely

b)
```
{'alleleOrigins': ['somatic'],
  'datasourceId': 'eva_somatic',
  'datatypeId': 'somatic_mutation',
  'clinicalSignificances': ['drug response'],
  'confidence': 'no assertion criteria provided',
  'studyId': 'RCV000225084',
  'targetFromSourceId': 'ENSG00000137672',
  'variantFunctionalConsequenceId': 'SO_0001583',
  'variantId': '11_101469456_G_A',
  'variantRsId': 'rs878853187',
  'cohortPhenotypes': ['Prednisolone response'],
  'diseaseFromSource': 'Prednisolone response',
  'diseaseFromSourceId': 'CN236771',
  'diseaseFromSourceMappedId': 'EFO_0009168',
  'variantHgvsId': 'NC_000011.10:g.101469456G>A'}
  ```
 
#### 4. Other records

Do some checks to ensure we don't break or otherwise confuse our existing disease records
```
{
    "alleleOrigins": ["somatic"], 
    "datasourceId": "eva_somatic",
    "datatypeId": "somatic_mutation",
    "clinicalSignificances": ["pathogenic"],
    "confidence": "no assertion criteria provided",
    "literature": ["10097140"],
    "studyId": "RCV000004467",
    "targetFromSourceId": "ENSG00000061337",
    "variantFunctionalConsequenceId": "SO_0001583",
    "variantId": "8_20255097_A_G",
    "variantRsId": "rs28937897",
    "cohortPhenotypes": ["Esophageal squamous cell carcinoma, somatic"],
    "diseaseFromSource": "Esophageal squamous cell carcinoma, somatic",
    "diseaseFromSourceId": "C4016881",
    "diseaseFromSourceMappedId": "EFO_0005922"
}
```

## Questions / comments

Comparing this proposal with the "background trait" focused one:
* overall I'm less worried about this one - we can more directly report what's in the data without needing to make as many inferences regarding gene/disease/drug connections
    * "Phenotypes would also be useful but I think sometimes this is the disease of the patient rather than the drug response phenotype?"
    * this one is also conceivable to do without parsing SCVs or using PGKB data (at least some parts)
* I don't think we can get other annotations more specific to pharmacogenetics from ClinVar
    * in particular direct drug target, prescribing information, drug label, drug-centred pathway
* Similarly if we want the background trait, this is only rarely present in ClinVar and then only in the SCVs as previously discussed
    
Focusing on phenotype category (toxicity etc.)
* If we only rely on PGKB we can use their categorisation with virtually no curation (I think), as long as we have an EFO term to map to
* If we include ClinVar we will need manual curation to extract drug + phenotype category from trait name, and we need to assess if this is at all feasible and/or worthwhile
    * See [spreadsheet](https://docs.google.com/spreadsheets/d/1HuOPMFoqTGWXimh5a2C-5FZU-CJpCNcuL5y9g9N_1is/edit?usp=sharing) as well as notes on trait names below.

Note on trait names: we provide them all in the evidence (`cohortPhenotypes`) but only preferred names are exported for manual curation and thus will appear in `diseaseFromSource`.  Linking at the end is done by looking at all trait names though.

Previous analysis focused exclusively on preferred names.

On inspection I found that many records had more precise drug response info *not* as the preferred name, so just to get an idea of the scope I output all the trait names for all drug response records from June 2022, which explains the duplicate counts in the spreadsheet.

#### From meeting with Tim

* background trait - need or not?
    * if so need to use SCV or PGKB
    * if not we just should annotate things that we already provide and agree on model, EFO terms, etc
* example one
    * drug = fluorouracil, present in ClinVar trait name, could be annotated by us
    * gene = DPYD, correctly annotated by us
    * category = toxicity, present in ClinVar trait name, could be annotated by us
    * background trait = neoplasms, present in ClinVar SCV
* example two
    * drug = ivacaftor, present
    * gene = CFTR, correctly annotated by us
    * category = efficacy, present
    * background trait = cystic fibrosis, present
* drug label & pathway - not present in ClinVar

## Part 2: PharmGKB

Questions to address:
* Can we get functional consequences (e.g. via PharmVar)?
* Where do we get some of these other annotations, e.g. "testing required", "direct drug target"?
* example evidence generated from PGKB data

In [157]:
import os

In [158]:
# Load the PharmGKB clinical annotations table (tab-separated).
# NOTE(review): absolute, machine-specific root path.
pharmgkb_root = '/home/april/projects/opentargets/pharmgkb'
clinical_annotations_path = os.path.join(pharmgkb_root, 'clinical', 'clinical_annotations.tsv')
clinical_annotations = pd.read_csv(clinical_annotations_path, sep='\t')

In [160]:
# Clinical annotations whose Gene column is exactly DPYD.
clinical_annotations.loc[clinical_annotations['Gene'] == 'DPYD']

Unnamed: 0,Clinical Annotation ID,Variant/Haplotypes,Gene,Level of Evidence,Level Override,Level Modifiers,Score,Phenotype Category,PMID Count,Evidence Count,Drug(s),Phenotype(s),Latest History Date (YYYY-MM-DD),URL,Specialty Population
548,1183679833,rs17376848,DPYD,1A,,Tier 1 VIP,94.5,Toxicity,11,13,fluorouracil,Neoplasms,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/1183679833,
747,1183679818,rs1760217,DPYD,3,,Tier 1 VIP,2.5,Efficacy,1,1,antineoplastic agents,Pancreatic Neoplasms,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/1183679818,
1703,1449575633,rs188052243,DPYD,3,,Rare Variant; Tier 1 VIP,0.0,Other,2,2,fluorouracil,,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/1449575633,
1705,1449575668,rs748620513,DPYD,3,,Rare Variant; Tier 1 VIP,0.0,Other,1,2,fluorouracil,,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/1449575668,
1706,1449575690,rs143879757,DPYD,3,,Rare Variant; Tier 1 VIP,0.0,Other,1,2,fluorouracil,,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/1449575690,
1707,1449575696,rs1212037891,DPYD,3,,Rare Variant; Tier 1 VIP,0.0,Other,1,1,fluorouracil,,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/1449575696,
1709,1449575656,rs59086055,DPYD,1A,,Rare Variant; Tier 1 VIP,100.0,Other,2,3,fluorouracil,,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/1449575656,
1710,1449575662,rs148994843,DPYD,1A,,Rare Variant; Tier 1 VIP,100.0,Other,2,3,fluorouracil,,2021-05-25,https://www.pharmgkb.org/clinicalAnnotation/1449575662,
1758,981201535,rs2297595,DPYD,1A,Yes: Assigned level 1A because variant is included in CPIC guideline,Tier 1 VIP,75.5,Toxicity,17,19,fluorouracil,Neoplasms,2021-03-29,https://www.pharmgkb.org/clinicalAnnotation/981201535,
1759,981201981,rs1801265,DPYD,1A,,Tier 1 VIP,93.25,Toxicity,17,19,fluorouracil,Neoplasms,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/981201981,Pediatric


In [163]:
# Clinical annotations whose variant is not given as an rsID (e.g. star alleles, HLA alleles).
# Match `rs` followed by digits instead of the bare substring 'rs', so a
# haplotype name that merely contains those two letters is not mistaken for an
# rsID. na=False treats a missing Variant/Haplotypes value as "no rsID"; without
# it str.contains returns NaN for that row and the `~` negation raises.
clinical_annotations[
    ~clinical_annotations['Variant/Haplotypes'].str.contains(r'rs\d+', regex=True, na=False)
]

Unnamed: 0,Clinical Annotation ID,Variant/Haplotypes,Gene,Level of Evidence,Level Override,Level Modifiers,Score,Phenotype Category,PMID Count,Evidence Count,Drug(s),Phenotype(s),Latest History Date (YYYY-MM-DD),URL,Specialty Population
7,981419266,HLA-B*15:02:01,HLA-B,1A,,Tier 1 VIP,315.7500,Toxicity,18,23,phenytoin,"drug reaction with eosinophilia and systemic symptoms;Epidermal Necrolysis, Toxic;severe cutaneous adverse reactions;Stevens-Johnson Syndrome",2022-06-22,https://www.pharmgkb.org/clinicalAnnotation/981419266,Pediatric
8,1451259580,"CYP2D6*1, CYP2D6*1xN, CYP2D6*2, CYP2D6*3, CYP2D6*4, CYP2D6*5, CYP2D6*6, CYP2D6*10, CYP2D6*41",CYP2D6,1A,,Tier 1 VIP,211.3750,Toxicity,6,9,amitriptyline,Depressive Disorder,2021-04-23,https://www.pharmgkb.org/clinicalAnnotation/1451259580,
9,1451265560,"CYP2D6*1, CYP2D6*1xN, CYP2D6*2xN, CYP2D6*4, CYP2D6*5",CYP2D6,1A,,Tier 1 VIP,208.1250,Dosage,3,6,imipramine,Depressive Disorder,2021-04-23,https://www.pharmgkb.org/clinicalAnnotation/1451265560,
10,1451288200,"CYP2D6*1, CYP2D6*1xN, CYP2D6*2, CYP2D6*2xN",CYP2D6,1A,,Tier 1 VIP,203.6250,Toxicity,8,11,codeine,Pain,2021-04-23,https://www.pharmgkb.org/clinicalAnnotation/1451288200,Pediatric
11,1451282240,"CYP2D6*1, CYP2D6*3, CYP2D6*4, CYP2D6*4xN, CYP2D6*5, CYP2D6*6",CYP2D6,1A,,Tier 1 VIP,201.7500,Toxicity,2,4,atomoxetine,Attention Deficit Disorder with Hyperactivity,2021-04-29,https://www.pharmgkb.org/clinicalAnnotation/1451282240,
12,1451285240,"CYP2D6*1, CYP2D6*2, CYP2D6*3, CYP2D6*4, CYP2D6*5, CYP2D6*6, CYP2D6*7, CYP2D6*9, CYP2D6*10, CYP2D6*10x2, CYP2D6*11, CYP2D6*17, CYP2D6*21, CYP2D6*36, CYP2D6*41",CYP2D6,1A,,Tier 1 VIP,226.5625,Efficacy,56,143,tamoxifen,Breast Neoplasms,2021-04-29,https://www.pharmgkb.org/clinicalAnnotation/1451285240,
19,1451340300,"UGT1A3*1, UGT1A3*2, UGT1A3*3",UGT1A3,3,,,4.0000,Metabolism/PK,1,2,telmisartan,,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/1451340300,
27,655386913,"CYP2C19*1, CYP2C19*17",CYP2C19,3,,Tier 1 VIP,6.0000,Toxicity,15,16,clopidogrel,Acute coronary syndrome;Coronary Artery Disease;Hemorrhage;Myocardial Infarction,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/655386913,
37,1451240880,HLA-B*15:02:01,HLA-B,3,,Tier 1 VIP,4.0000,Toxicity,1,2,dapsone,severe cutaneous adverse reactions;Stevens-Johnson Syndrome,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/1451240880,
44,1451350801,HLA-B*55:01:01,HLA-B,3,,Tier 1 VIP,2.5000,Toxicity,1,1,penicillin g;penicillin v,Drug Hypersensitivity,2021-03-24,https://www.pharmgkb.org/clinicalAnnotation/1451350801,
