In [1]:
import os
import sys
from collections import Counter

sys.path.append('..')

In [2]:
from filter_clinvar_xml import filter_xml, pprint, iterate_cvs_from_xml
from clinvar_xml_io.clinvar_xml_io import *

In [3]:
# Location of the drug-response XML file (gzip-compressed, judging by its extension).
# Set the DRUG_RESPONSE_XML environment variable to point at a local copy;
# when it is unset, the original author's path is used unchanged.
drug_xml = os.environ.get(
    'DRUG_RESPONSE_XML',
    '/home/april/projects/opentargets/drug-response.xml.gz',
)

In [4]:
# Wrap the drug-response XML file in a ClinVarDataset; a later cell iterates over
# it and reads `accession` and `trait_set` from each yielded record.
dataset = ClinVarDataset(drug_xml)

## Part 1: Data from ClinVar

Questions to address:

* Can we reliably get the background trait, i.e. the disease that the drug acts on?
* How many records are explicitly reporting efficacy phenotypes?

Example [RCV000211201](https://www.ncbi.nlm.nih.gov/clinvar/RCV000211201/) - contains a trait relationship between drug and disease, but only in the SCV record, not in the RCV record. (Note also that there is only one SCV for this RCV.)

**SCV:**

```
<TraitSet Type="DrugResponse">
  <Trait Type="DrugResponse">
    <Name>
      <ElementValue Type="Preferred">clopidogrel response - Dosage, Efficacy, Toxicity/ADR</ElementValue>
    </Name>
    <TraitRelationship Type="DrugResponseAndDisease">
      <Name>
        <ElementValue Type="Preferred">Acute coronary syndrome</ElementValue>
      </Name>
    </TraitRelationship>
    <TraitRelationship Type="DrugResponseAndDisease">
      <Name>
        <ElementValue Type="Preferred">Coronary Artery Disease</ElementValue>
      </Name>
    </TraitRelationship>
    <TraitRelationship Type="DrugResponseAndDisease">
      <Name>
        <ElementValue Type="Preferred">Myocardial Infarction</ElementValue>
      </Name>
    </TraitRelationship>
  </Trait>
</TraitSet>
```

**RCV:**
```
<TraitSet Type="DrugResponse" ID="26824">
  <Trait ID="35423" Type="DrugResponse">
    <Name>
      <ElementValue Type="Preferred">clopidogrel response - Dosage, Efficacy, Toxicity/ADR</ElementValue>
    </Name>
    <XRef ID="CN236507" DB="MedGen" />
  </Trait>
</TraitSet>
```

In [19]:
# Scan the RCV-level records for any sign of a drug/disease link
for rcv in dataset:
    # Trait sets with several members: none of them pairs a drug with a disease
    if len(rcv.trait_set) > 1:
        print(rcv.accession)
        trait_names = [member.preferred_or_other_valid_name for member in rcv.trait_set]
        print(trait_names)
    # Individual traits: none in the RCV carries a relationship element
    for rcv_trait in rcv.trait_set:
        if not find_elements(rcv_trait.trait_xml, './TraitRelationship'):
            continue
        print(rcv.accession)
        pprint(rcv_trait.trait_xml)

RCV001824998
['Cabozantinib resistance', 'Entrectinib resistance', 'Larotrectinib resistance', 'Repotrectinib resistance', 'Selitrectinib resistance']


In [49]:
def get_name(x):
    """Return the name of the trait-like XML element ``x``.

    The element is wrapped in a ``ClinVarTrait`` (with ``None`` as the second
    constructor argument) and its ``preferred_or_other_valid_name`` is returned.
    """
    wrapped = ClinVarTrait(x, None)
    return wrapped.preferred_or_other_valid_name


def is_pgkb(raw_cvs_xml):
    """Tell whether any submission in the record names PharmGKB as submitter.

    Inspects the ``submitter`` attribute of every element matched by
    ``./ClinVarAssertion/ClinVarSubmissionID`` under ``raw_cvs_xml`` and
    returns ``True`` if at least one equals ``'PharmGKB'``, else ``False``.
    """
    submission_ids = find_elements(raw_cvs_xml, './ClinVarAssertion/ClinVarSubmissionID')
    return any(sid.attrib.get('submitter') == 'PharmGKB' for sid in submission_ids)

In [59]:
# Check whether all the SCV records have this kind of information.
#
# n          - number of records yielded by iterate_cvs_from_xml
# count_all  - DrugResponse traits with at least one DrugResponseAndDisease relationship
# count_pgkb - the subset of count_all found in records that is_pgkb() accepts
#
# NOTE: both tallies are incremented once per matching Trait element, so a record
# holding several matching traits contributes more than once.
n = 0
count_all = 0
count_pgkb = 0
for raw_cvs_xml in iterate_cvs_from_xml(drug_xml):
    n += 1
    elts = find_elements(raw_cvs_xml, './ClinVarAssertion/TraitSet/Trait')
    for e in elts:
        if e.attrib['Type'] != 'DrugResponse':
            continue
        # Names of the diseases this drug-response trait is related to
        background_traits = [
            get_name(r)
            for r in find_elements(e, './TraitRelationship')
            if r.attrib['Type'] == 'DrugResponseAndDisease'
        ]
        if not background_traits:
            continue
        count_all += 1
        # Resolve the trait name once, and only for traits that are reported
        name = get_name(e)
        if is_pgkb(raw_cvs_xml):
            count_pgkb += 1
            # A leading '*' marks lines coming from PharmGKB records
            print(f'*{name} => {background_traits}')
        else:
            print(f'{name} => {background_traits}')

*clopidogrel response - Dosage, Efficacy, Toxicity/ADR => ['Acute coronary syndrome', 'Coronary Artery Disease', 'Myocardial Infarction']
efavirenz response => ['HIV']
efavirenz response => ['HIV']
efavirenz response => ['HIV']
Azathioprine intolerance => ['myasthenia gravis']
all trans retinoic acid (ATRA) response => ['Acute promyelocytic leukemia']
all trans retinoic acid (ATRA) response => ['Acute promyelocytic leukemia']
Pazopanib response => ['malignant granular cell tumor']
deoxygalactonojirimycin response => ['Fabry disease']
deoxygalactonojirimycin response => ['Fabry disease']
deoxygalactonojirimycin response => ['Fabry disease']
deoxygalactonojirimycin response => ['Fabry disease']
deoxygalactonojirimycin response => ['Fabry disease']
deoxygalactonojirimycin response => ['Fabry disease']
deoxygalactonojirimycin response => ['Fabry disease']
deoxygalactonojirimycin response => ['Fabry disease']
deoxygalactonojirimycin response => ['Fabry disease']
deoxygalactonojirimycin resp

Vemurafenib-Cobimetinib Response => ['Melanoma']
Trametinib-Dabrafenib Response => ['Melanoma']
Deutetrabenazine response => ['Chorea', 'Huntington disease', 'Tardive dyskinesia']
Deutetrabenazine response => ['Chorea', 'Huntington disease', 'Tardive dyskinesia']
Deutetrabenazine response => ['Chorea', 'Huntington disease', 'Tardive dyskinesia']
Tamoxifen response => ['Breast cancer']
Tamoxifen response => ['Breast cancer']
Tamoxifen response => ['Breast cancer']
Tamoxifen response => ['Breast cancer']
Corticosteroid response => ['Focal segmental glomerulosclerosis']
Corticosteroid response => ['Focal segmental glomerulosclerosis']
Corticosteroid response => ['Focal segmental glomerulosclerosis']
Corticosteroid response => ['Focal segmental glomerulosclerosis']
Corticosteroid response => ['Minimal change disease']
Corticosteroid response => ['Minimal Change disease']
Corticosteroid response => ['Minimal change disease']
Corticosteroid response => ['Focal segmental glomerulosclerosis']


*methoxyflurane response - Toxicity => ['Malignant Hyperthermia']
*sevoflurane response - Toxicity => ['Malignant Hyperthermia']
*succinylcholine response - Toxicity => ['Malignant Hyperthermia']
*desflurane response - Toxicity => ['Malignant Hyperthermia']
*enflurane response - Toxicity => ['Malignant Hyperthermia']
*halothane response - Toxicity => ['Malignant Hyperthermia']
*isoflurane response - Toxicity => ['Malignant Hyperthermia']
*methoxyflurane response - Toxicity => ['Malignant Hyperthermia']
*sevoflurane response - Toxicity => ['Malignant Hyperthermia']
*succinylcholine response - Toxicity => ['Malignant Hyperthermia']
*gefitinib response - Efficacy => ['Carcinoma, Non-Small-Cell Lung', 'Drug Resistance']
*nicotine response - Toxicity => ['Tobacco Use Disorder']
*hydrochlorothiazide response - Efficacy => ['Essential hypertension', 'Hypertension']
*atorvastatin response - Toxicity => ['statin-related myopathy']
*desflurane response - Toxicity => ['Malignant Hyperthermia']
*e

In [60]:
# Summarise the relationship scan above: n is the number of records iterated,
# count_all / count_pgkb are its tallies (one increment per matching trait element).
print(f'Out of {n} records, found {count_all} with drug response & disease relationship ({count_pgkb} from PharmGKB).')

Out of 4970 records, found 576 with drug response & disease relationship (361 from PharmGKB).


In [61]:
# Tally DrugResponse traits whose name mentions "efficacy" (case-insensitive);
# lines printed with a leading '*' come from records that is_pgkb() accepts.
count_all = 0
count_pgkb = 0
for raw_cvs_xml in iterate_cvs_from_xml(drug_xml):
    for trait_elt in find_elements(raw_cvs_xml, './ClinVarAssertion/TraitSet/Trait'):
        if trait_elt.attrib['Type'] != 'DrugResponse':
            continue
        name = get_name(trait_elt)
        # Skip traits with no usable name or without the keyword
        if not name or 'efficacy' not in name.lower():
            continue
        count_all += 1
        if not is_pgkb(raw_cvs_xml):
            print(name)
            continue
        count_pgkb += 1
        print(f'*{name}')

*clopidogrel response - Dosage, Efficacy, Toxicity/ADR
*ivacaftor response - Efficacy
*ivacaftor response - Efficacy
*clopidogrel response - Efficacy
*interferons, peginterferon alfa-2a, peginterferon alfa-2b and ribavirin response - Efficacy
*ivacaftor response - Efficacy
*ivacaftor response - Efficacy
*ivacaftor response - Efficacy
*peginterferon alfa-2a, peginterferon alfa-2b, ribavirin and telaprevir response - Efficacy
*methotrexate response - Efficacy
*ivacaftor response - Efficacy
*captopril response - Efficacy
*peginterferon alfa-2b response - Efficacy
*ribavirin response - Efficacy
*peginterferon alfa-2a response - Efficacy
*ivacaftor response - Efficacy
*ivacaftor response - Efficacy
*ivacaftor response - Efficacy
*ivacaftor response - Efficacy
*ivacaftor response - Efficacy
*ivacaftor response - Efficacy
*ivacaftor response - Efficacy
*ivacaftor response - Efficacy
*ivacaftor response - Efficacy
*ivacaftor response - Efficacy
*pravastatin response - Efficacy
*methotrexate re

In [62]:
# Summarise the efficacy scan above. n is not recomputed by that scan: it still
# holds the record count from the earlier relationship scan, which must run first.
print(f'Out of {n} records, found {count_all} with efficacy phenotype ({count_pgkb} from PharmGKB).')

Out of 4970 records, found 54 with efficacy phenotype (54 from PharmGKB).


### Thoughts

* Is it worth starting to parse SCV for drug response / disease trait relationships?
    * Might be relatively straightforward to do in this restricted case
    * Opens up a can of worms, e.g. what happens if SCVs don't agree?  Do we end up redoing the work of aggregation?
* Why does ClinVar exclude this info from the RCV anyway?
* Counts summary: **4970** drug response records
    * **401** with PharmGKB submission (previous notebook)
    * **576** with drug response & disease relationship (in SCV only)
        * Of these, **361** from PharmGKB
    * **54** with explicit efficacy phenotype, all from PharmGKB

## Part 2: PharmGKB data

Compare this with what PharmGKB submissions contain in ClinVar.

New data model extracted from PharmGKB clinical annotations download file:
* The trait in the evidence will be PharmGKB's “Phenotypes”
* The drug will be extracted from PharmGKB's “Drugs”
* The target will be the target associated with the variant, PharmGKB's “Gene”
* Filter rows for those whose category is `Efficacy` and that have associated `Phenotypes`. The remaining drug responses will be dropped in the first stage.
