In [10]:
from collections import Counter
import json
import sys
import os
import re

import pandas as pd
import matplotlib.pyplot as plt

# Display settings: never truncate cell text, and show at least 100 rows when a frame is truncated
pd.options.display.max_colwidth = None
pd.options.display.min_rows = 100

In [11]:
# Root directory of the local PharmGKB download. Set the PHARMGKB_ROOT environment
# variable to run this notebook on another machine; the original path is the default.
pharmgkb_root = os.environ.get('PHARMGKB_ROOT', '/home/april/projects/opentargets/pharmgkb')
# Clinical annotations table; the 'Clinical Annotation ID' and 'Variant/Haplotypes'
# columns are the ones used in the cells below.
clinical_annotations = pd.read_csv(os.path.join(pharmgkb_root, 'clinical', 'clinical_annotations.tsv'), sep='\t')

In [12]:
# Variants table (tab-separated); looked up by 'Variant Name' further down
variants = pd.read_table(
    os.path.join(pharmgkb_root, 'variants', 'variants.tsv'),
)

In [13]:
# Per-genotype/allele rows of each clinical annotation (tab-separated)
clinical_ann_alleles = pd.read_table(
    os.path.join(pharmgkb_root, 'clinical', 'clinical_ann_alleles.tsv'),
)

In [14]:
# Keep only the annotations whose 'Variant/Haplotypes' value contains the literal text 'rs'.
# na=False treats missing values as non-matching (a NaN in the mask would make the
# boolean indexing raise); regex=False because 'rs' is a plain substring, not a pattern.
ca_with_rs = clinical_annotations[
    clinical_annotations['Variant/Haplotypes'].str.contains('rs', na=False, regex=False)
]

In [21]:
# Allele-level rows restricted to the clinical annotations kept in ca_with_rs
calleles_with_rs = clinical_ann_alleles.loc[
    clinical_ann_alleles['Clinical Annotation ID'].isin(ca_with_rs['Clinical Annotation ID']),
    :,
]

### Option 1

* Use `clinical_ann_alleles.tsv` but only the column with alleles, not the free text.
* Use this to get the alleles relevant to the given clinical annotation, then resolve which is ref/alt via... (allele frequencies?  order in the column? something else?)
    * Ensembl allele string - assuming first is always ref (?)
* Get chrom & pos from NCBI/Ensembl or `variants.tsv`
* Won't work for a few rsID records (5013 annotations in total > 4416 with an rsID > 4350 with at most 2 alleles)

In [22]:
# Total number of rows loaded from clinical_annotations.tsv
len(clinical_annotations)

5013

In [23]:
# Number of clinical annotations whose 'Variant/Haplotypes' value contains 'rs'
len(ca_with_rs)

4416

In [24]:
# Classify every rsID-based clinical annotation by the alleles its genotype rows mention:
#   results_not_snps     - some genotype is neither a single character, a two-character
#                          genotype, nor a simple "X/Y" allele pair
#   results_2_alleles    - at most two distinct alleles (so we can probably determine ref)
#   results_more_alleles - more than two distinct alleles
results_2_alleles = set()
results_more_alleles = set()
results_not_snps = set()

# Group the genotype strings once, instead of re-filtering the whole allele table
# for every annotation ID inside the loop.
genotypes_by_annotation = (
    calleles_with_rs.groupby('Clinical Annotation ID')['Genotype/Allele'].apply(list).to_dict()
)
# "Simple" allele pair such as A/G, del/T or GT/del (anchored at the start only, as re.match is)
simple_allele_pair = re.compile('([ACGT]+|del)/([ACGT]+|del)', re.IGNORECASE)

# Check whether only 2 alleles involved - if so we can determine which is ref (probably)
for x in ca_with_rs['Clinical Annotation ID']:
    alleles = set()
    not_snp = False
    # An annotation without any allele rows keeps an empty set and lands in results_2_alleles
    for ga in genotypes_by_annotation.get(x, []):
        # x chrom: a single allele on its own
        if len(ga) == 1:
            alleles.add(ga)
            continue
        # snp: two-character genotype, one allele per character
        if len(ga) == 2:
            alleles.add(ga[0])
            alleles.add(ga[1])
            continue
        # "simple" allele
        m = simple_allele_pair.match(ga)
        if not m:
            # Anything else (e.g. repeat notation) marks the whole annotation as not a SNP
            not_snp = True
            break
        alleles.add(m.group(1))
        alleles.add(m.group(2))
    if not_snp:
        results_not_snps.add(x)
    elif len(alleles) <= 2:
        results_2_alleles.add(x)
    else:
        results_more_alleles.add(x)

In [25]:
# Annotations whose genotypes mention at most two distinct alleles
len(results_2_alleles)

4350

In [26]:
# Annotations whose genotypes mention more than two distinct alleles
len(results_more_alleles)

51

In [27]:
# Annotations with at least one genotype that did not parse as a simple allele or allele pair
len(results_not_snps)

15

In [28]:
# Inspect the allele rows of the annotations collected in results_not_snps
calleles_with_rs[calleles_with_rs['Clinical Annotation ID'].isin(results_not_snps)]

Unnamed: 0,Clinical Annotation ID,Genotype/Allele,Annotation Text,Allele Function
2303,1183617783,(CCCACCCGA)9/(CCCACCCGA)9,Patients with the (CCCACCCGA)9/(CCCACCCGA)9 genotype and depression who are treated with antidepressants 1) may have a reduced risk of adverse drug reactions after switching treatment for the second time 2) may be less likely to have a lack of a response to treatment as compared to patients with the (CCCACCCGA)12/(CCCACCCGA)12 genotype. Other genetic and clinical factors may also influence a patient's response to treatment.,
2304,1183617783,(CCCACCCGA)10/(CCCACCCGA)9,Patients with the (CCCACCCGA)10/(CCCACCCGA)9 genotype and depression who are treated with antidepressants 1) may have a reduced risk of adverse drug reactions after switching treatment for the second time 2) may be less likely to have a lack of a response to treatment as compared to patients with the (CCCACCCGA)12/(CCCACCCGA)12 genotype. Other genetic and clinical factors may also influence a patient's response to treatment.,
2305,1183617783,(CCCACCCGA)10/(CCCACCCGA)10,Patients with the (CCCACCCGA)10/(CCCACCCGA)10 genotype and depression who are treated with antidepressants 1) may have a reduced risk of adverse drug reactions after switching treatment for the second time 2) may be less likely to have a lack of a response to treatment as compared to patients with the (CCCACCCGA)12/(CCCACCCGA)12 genotype. Other genetic and clinical factors may also influence a patient's response to treatment.,
2306,1183617783,(CCCACCCGA)12/(CCCACCCGA)9,Patients with the (CCCACCCGA)12/(CCCACCCGA)9 genotype and depression who are treated with antidepressants 1) may have a reduced risk of adverse drug reactions after switching treatment for the second time 2) may be less likely to have a lack of a response to treatment as compared to patients with the (CCCACCCGA)12/(CCCACCCGA)12 genotype. Other genetic and clinical factors may also influence a patient's response to treatment.,
2307,1183617783,(CCCACCCGA)12/(CCCACCCGA)10,Patients with the (CCCACCCGA)12/(CCCACCCGA)10 genotype and depression who are treated with antidepressants 1) may have a reduced risk of adverse drug reactions after switching treatment for the second time 2) may be less likely to have a lack of a response to treatment as compared to patients with the (CCCACCCGA)12/(CCCACCCGA)12 genotype. Other genetic and clinical factors may also influence a patient's response to treatment.,
2308,1183617783,(CCCACCCGA)12/(CCCACCCGA)12,Patients with the (CCCACCCGA)12/(CCCACCCGA)12 genotype and depression who are treated with antidepressants 1) may have an increased risk of adverse drug reactions after switching treatment for the second time 2) may be more likely to have a lack of a response to treatment as compared to other genotypes. Other genetic and clinical factors may also influence a patient's response to treatment.,
2732,1183615328,(CCGCGCCACTTGGCCTGCCTCCGTCCCG)2/(CCGCGCCACTTGGCCTGCCTCCGTCCCG)2,Patients with the rs45445694 2R/2R genotype ((CCGCGCCACTTGGCCTGCCTCCGTCCCG)2/(CCGCGCCACTTGGCCTGCCTCCGTCCCG)2) and colorectal cancer may have a decreased risk of asthenia when treated with irinotecan and raltitrexed as compared to patients with the 2R/3R or 3R/3R genotype. Other genetic and clinical factors may also influence risk of asthenia.,
2733,1183615328,(CCGCGCCACTTGGCCTGCCTCCGTCCCG)2/(CCGCGCCACTTGGCCTGCCTCCGTCCCG)3,Patients with the rs45445694 2R/3R genotype ((CCGCGCCACTTGGCCTGCCTCCGTCCCG)2/(CCGCGCCACTTGGCCTGCCTCCGTCCCG)3) and colorectal cancer may have an increased risk of asthenia when treated with irinotecan and raltitrexed as compared to patients with the 2R/2R genotype. Other genetic and clinical factors may also influence risk of asthenia.,
2734,1183615328,(CCGCGCCACTTGGCCTGCCTCCGTCCCG)3/(CCGCGCCACTTGGCCTGCCTCCGTCCCG)3,Patients with the rs45445694 3R/3R genotype ((CCGCGCCACTTGGCCTGCCTCCGTCCCG)3/(CCGCGCCACTTGGCCTGCCTCCGTCCCG)3) and colorectal cancer may have an increased risk of asthenia when treated with irinotecan and raltitrexed as compared to patients with the 2R/2R genotype. Other genetic and clinical factors may also influence risk of asthenia.,
3134,1183617765,(CCCACCCGA)9/(CCCACCCGA)9,"Patients with the (CCCACCCGA)9/(CCCACCCGA)9 genotype and depression who are treated with fluoxetine may be less likely to respond to treatment as compared to patients with the (CCCACCCGA)12/(CCCACCCGA)12 genotype. However, a contradictory finding reports no association of the (CCCACCCGA)12 allele and response. Other genetic and clinical factors may also influence a patient's response to treatment.",


In [29]:
# Inspect the allele rows of the annotations collected in results_more_alleles
calleles_with_rs[calleles_with_rs['Clinical Annotation ID'].isin(results_more_alleles)]

Unnamed: 0,Clinical Annotation ID,Genotype/Allele,Annotation Text,Allele Function
160,1450375701,AA,"Patients with the AA genotype may have a decreased response to allopurinol as compared to patients with the AC, CC or CT genotypes. Other genetic and clinical factors may also affect a patient's response to allopurinol.",
161,1450375701,AC,"Patients with the AC genotype may have an increased response to allopurinol as compared to patients with the AA, AT or TT genotypes. Other genetic and clinical factors may also affect a patient's response to allopurinol.",
162,1450375701,AT,"Patients with the AT genotype may have a decreased response to allopurinol as compared to patients with the AC, CC or CT genotypes. Other genetic and clinical factors may also affect a patient's response to allopurinol.",
163,1450375701,CC,"Patients with the CC genotype may have an increased response to allopurinol as compared to patients with the AA, AT or TT genotypes. Other genetic and clinical factors may also affect a patient's response to allopurinol.",
164,1450375701,CT,"Patients with the CT genotype may have an increased response to allopurinol as compared to patients with the AA, AT or TT genotypes. Other genetic and clinical factors may also affect a patient's response to allopurinol.",
165,1450375701,TT,"Patients with the TT genotype may have a decreased response to allopurinol as compared to patients with the AC, CC or CT genotypes. Other genetic and clinical factors may also affect a patient's response to allopurinol.",
757,1449004745,AA,Patients with Mesothelioma and the AA genotype may have worse overall and progression-free survival when treated with cisplatin and gemcitabine as compared to patients with the CC genotype. Other clinical and genetic factors may also influence response to cisplatin and gemcitabine in patients with mesothelioma.,
758,1449004745,AG,Patients with Mesothelioma and the AC genotype may have worse overall and progression-free survival when treated with cisplatin and gemcitabine as compared to patients with the CC genotype. Other clinical and genetic factors may also influence response to cisplatin and gemcitabine in patients with mesothelioma.,
759,1449004745,CC,Patients with Mesothelioma and the CC genotype may have improved overall and progression-free survival when treated with cisplatin and gemcitabine as compared to patients with the AA and AC genotype. Other clinical and genetic factors may also influence response to cisplatin and gemcitabine in patients with mesothelioma.,
799,1449645498,AA,"Patients with the AA genotype may have an increased rate of sulfation of acetaminophen as compared to patients with the CC, CT, or TT genotypes. Other genetic and clinical factors may also affect acetaminophen sulfation in patients.",


Examples of how to get from clinical annotation to `chr_pos_ref_alt`

In [31]:
import requests

In [62]:
def get_coordinates_for_rs(rsid):
    """Look up the GRCh38 location and alleles of an rsID through the Ensembl REST API.

    Parameters
    ----------
    rsid : str
        Variant identifier, with or without the leading ``rs``.

    Returns
    -------
    tuple
        ``(chrom, pos, ref, alts)`` taken from the first GRCh38 mapping in the response:
        ``seq_region_name``, ``start``, the first entry of ``allele_string`` and the list
        of remaining entries. Returns ``(None, None, None, None)`` when the response has
        no ``mappings`` key or none of the mappings is on GRCh38.

    Notes
    -----
    The first allele of the Ensembl allele string is assumed to be the reference.
    """
    # can do this in bulk too probably
    if not rsid.startswith('rs'):
        rsid = f'rs{rsid}'
    ensembl_url = f'https://rest.ensembl.org/variation/human/{rsid}?content-type=application/json'
    # Bounded wait so a stalled connection cannot hang the notebook indefinitely
    resp = requests.get(ensembl_url, timeout=30)
    data = resp.json()
    mappings = data.get('mappings') if isinstance(data, dict) else None
    # Select a GRCh38 mapping explicitly; never fall back to a mapping on another
    # assembly, and cope with an empty mappings list.
    grch38 = next(
        (mapping for mapping in mappings or [] if mapping.get('assembly_name') == 'GRCh38'),
        None,
    )
    if grch38 is None:
        return None, None, None, None
    alleles = grch38['allele_string'].split('/')
    return grch38['seq_region_name'], grch38['start'], alleles[0], alleles[1:]

In [63]:
def _alleles_in_genotypes(genotypes):
    """Collect the distinct alleles named by an iterable of genotype strings.

    Parsing stops at the first string that is neither a single character, a
    two-character genotype, nor a simple ``X/Y`` pair; alleles gathered up to
    that point are still returned.
    """
    alleles = set()
    for ga in genotypes:
        if len(ga) == 1:
            # x chrom: a single allele on its own
            alleles.add(ga)
        elif len(ga) == 2:
            # snp: one allele per character
            alleles.update(ga)
        else:
            # "simple" allele
            m = re.match('([ACGT]+|del)/([ACGT]+|del)', ga, re.IGNORECASE)
            if not m:
                break
            alleles.update(m.groups())
    return alleles


def get_coordinates_for_clinical_annotation(caid):
    """Build a ``chrom_pos_ref_alt`` identifier for a clinical annotation.

    Parameters
    ----------
    caid
        Value of 'Clinical Annotation ID' to look up in ``ca_with_rs``.

    Returns
    -------
    str or None
        ``'{chrom}_{pos}_{ref}_{alt}'`` using the coordinates returned by
        ``get_coordinates_for_rs``. With a single alt allele that allele is used.
        With several, the first alt (in the order returned by ``get_coordinates_for_rs``)
        that also appears among the annotation's genotype alleles is used, so the result
        does not depend on set iteration order. ``None`` is returned when the annotation
        is unknown, the coordinates cannot be resolved, or no alt allele is mentioned.
    """
    rs_values = ca_with_rs.loc[ca_with_rs['Clinical Annotation ID'] == caid, 'Variant/Haplotypes']
    if rs_values.empty:
        return None
    chrom, pos, ref, alts = get_coordinates_for_rs(rs_values.iloc[0])
    if not chrom or not pos or not ref or not alts:
        return None
    if len(alts) == 1:
        return f'{chrom}_{pos}_{ref}_{alts[0]}'
    # If multiple alts, check what is referred to in the clinical alleles table
    genotypes = calleles_with_rs.loc[calleles_with_rs['Clinical Annotation ID'] == caid, 'Genotype/Allele']
    mentioned = _alleles_in_genotypes(genotypes)
    # NOTE(review): genotype rows may spell deletions as 'del'; confirm how the allele
    # strings from get_coordinates_for_rs encode them, otherwise they can never match here.
    for alt in alts:
        if alt in mentioned:
            return f'{chrom}_{pos}_{ref}_{alt}'
    return None

In [67]:
# Spot-check the mapping on a random sample of annotations (one Ensembl request each).
# The sample is seeded so a re-run prints the same annotations, and capped at the
# number of available rows so a small input does not make sample() raise.
sampled_caids = ca_with_rs['Clinical Annotation ID'].sample(n=min(50, len(ca_with_rs)), random_state=0)
for caid in sampled_caids:
    print(f'https://www.pharmgkb.org/clinicalAnnotation/{caid} => {get_coordinates_for_clinical_annotation(caid)}')

https://www.pharmgkb.org/clinicalAnnotation/1449168385 => 3_23132388_G_A
https://www.pharmgkb.org/clinicalAnnotation/637880534 => 7_87600124_T_C
https://www.pharmgkb.org/clinicalAnnotation/1184472900 => 4_70993476_C_G
https://www.pharmgkb.org/clinicalAnnotation/1444705191 => 12_2207029_T_A
https://www.pharmgkb.org/clinicalAnnotation/1448531661 => 16_15995584_A_G
https://www.pharmgkb.org/clinicalAnnotation/1185000052 => 7_99672916_T_C
https://www.pharmgkb.org/clinicalAnnotation/1448427972 => 9_84673278_G_T
https://www.pharmgkb.org/clinicalAnnotation/1449718283 => 10_99844450_C_T
https://www.pharmgkb.org/clinicalAnnotation/1447982818 => 1_39659467_A_G
https://www.pharmgkb.org/clinicalAnnotation/981237959 => 19_45351661_T_G
https://www.pharmgkb.org/clinicalAnnotation/1450936296 => 16_28608717_G_A
https://www.pharmgkb.org/clinicalAnnotation/1448522748 => 1_153348279_C_T
https://www.pharmgkb.org/clinicalAnnotation/1183631554 => 18_63312127_C_T
https://www.pharmgkb.org/clinicalAnnotation/144

### Option 2

* Use `variants.tsv` for chrom & pos, otherwise parse from HGVS
* Which HGVS expression to use is the main issue; it is also weirdly indirect considering how well-defined rsIDs usually are
* Might generalise better to structural variants but likely not star alleles

In [62]:
# Look up the variants.tsv rows of two example rsIDs by 'Variant Name'
variants[variants['Variant Name'].isin(('rs75527207', 'rs45445694'))]

Unnamed: 0,Variant ID,Variant Name,Gene IDs,Gene Symbols,Location,Variant Annotation count,Clinical Annotation count,Level 1/2 Clinical Annotation count,Guideline Annotation count,Label Annotation count,Synonyms
4437,PA166155331,rs45445694,"PA134956204,PA359","C18orf56,TYMS",NC_000018.10:657646_657712,87,7,0,0,1,"NC_000018.9:g.657657GGCCTGCCTCCGTCCCGCCGCGCCACTT[8], NG_028255.1:g.5054GGCCTGCCTCCGTCCCGCCGCGCCACTT[7], NC_000018.9:g.657646_657673CCGCGCCACTTGGCCTGCCTCCGTCCCG[2][3][4][7][8][9], NG_028255.1:g.5043_5109=, NG_028255.1:g.5054GGCCTGCCTCCGTCCCGCCGCGCCACTT[1], NC_000018.9:g.657646_657712=, NC_000018.9:g.657657GGCCTGCCTCCGTCCCGCCGCGCCACTT[4], NG_028255.1:g.5054GGCCTGCCTCCGTCCCGCCGCGCCACTT[3], TSER, NC_000018.10:g.657657GGCCTGCCTCCGTCCCGCCGCGCCACTT[7], NC_000018.10:g.657657GGCCTGCCTCCGTCCCGCCGCGCCACTT[1], NC_000018.10:g.657657GGCCTGCCTCCGTCCCGCCGCGCCACTT[3], XM_005258137.1:c.-97_-70CCGCGCCACTTGGCCTGCCTCCGTCCCG[2][3][4][7][8][9], NC_000018.10:g.657657_657712del, NC_000018.10:g.657646_657712=, NC_000018.9:g.657657GGCCTGCCTCCGTCCCGCCGCGCCACTT[9], NG_028255.1:g.5054GGCCTGCCTCCGTCCCGCCGCGCCACTT[9], NC_000018.10:g.657646_657673CCGCGCCACTTGGCCTGCCTCCGTCCCG[2][3][4][7][8][9], NG_028255.1:g.5054GGCCTGCCTCCGTCCCGCCGCGCCACTT[4], XM_005258138.1:c.-97_-70CCGCGCCACTTGGCCTGCCTCCGTCCCG[2][3][4][7][8][9], NC_000018.9:g.657657GGCCTGCCTCCGTCCCGCCGCGCCACTT[7], 45445694, NC_000018.9:g.657657GGCCTGCCTCCGTCCCGCCGCGCCACTT[1], NC_000018.9:g.657657GGCCTGCCTCCGTCCCGCCGCGCCACTT[3], NM_001071.2:c.-97_-70CCGCGCCACTTGGCCTGCCTCCGTCCCG[2][3][4][7][8][9], NC_000018.10:g.657657GGCCTGCCTCCGTCCCGCCGCGCCACTT[8], NG_028255.1:g.5054_5109del, NG_028255.1:g.5043_5070CCGCGCCACTTGGCCTGCCTCCGTCCCG[2][3][4][7][8][9], NC_000018.10:g.657657GGCCTGCCTCCGTCCCGCCGCGCCACTT[4], rs45445694, NC_000018.10:g.657657GGCCTGCCTCCGTCCCGCCGCGCCACTT[9], NC_000018.9:g.657657_657712del, NG_028255.1:g.5054GGCCTGCCTCCGTCCCGCCGCGCCACTT[8], NM_001012716.2:c.*34+169_*34+196CGGGACGGAGGCAGGCCAAGTGGCGCGG[2][3][4][7][8][9]"
5839,PA166157516,rs75527207,PA109,CFTR,NC_000007.14:117587806,32,2,1,1,1,"XM_011515754.1:c.1409G>A, XP_011514056.1:p.Gly470Asp, NG_056131.3:g.761G>A, NG_016465.4:g.127023G>A, NM_000492.3:c.1652G>A, XM_011515753.1:c.1409G>A, NP_000483.3:p.Gly551Asp, NC_000007.13:g.117227860=, NC_000007.14:g.117587806G>A, XM_011515752.1:c.1742G>A, XP_011514053.1:p.Gly581Asp, XP_011514055.1:p.Gly470Asp, 75527207, XM_011515751.1:c.1742G>A, XP_011514054.1:p.Gly581Asp, NC_000007.14:g.117587806=, NG_056131.1:g.130G>A, NG_016465.4:g.127023=, NC_000007.13:g.117227860G>A, rs75527207, NG_056131.1:g.130=, NP_000483.3:p.Gly551=, NG_056131.3:g.761="


In [1]:
from eva_cttv_pipeline.clinvar_xml_io.clinvar_xml_io.hgvs_variant import HgvsVariant, any_sequence_type_regex

In [8]:
# Parse a genomic HGVS expression for a single-base change (taken from the rs75527207 synonyms)
h = HgvsVariant('NC_000007.14:g.117587806G>A')

In [9]:
# Show the coordinates, sequence type, reference sequence and ref/alt parsed for h
h.start, h.stop, h.sequence_type, h.reference_sequence, h.ref, h.alt

(117587806, 117587806, <SequenceType.GENOMIC: 2>, 'NC_000007.14', 'G', 'A')

In [16]:
# Parse a genomic HGVS repeat expression (taken from the rs45445694 synonyms)
h2 = HgvsVariant('NC_000018.10:g.657657GGCCTGCCTCCGTCCCGCCGCGCCACTT[7]')

In [17]:
# Show the coordinates, reference sequence and repeat unit parsed for h2
h2.start, h2.stop, h2.reference_sequence, h2.repeat_sequence

(657657, 657657, 'NC_000018.10', 'GGCCTGCCTCCGTCCCGCCGCGCCACTT')

In [18]:
# Variant type reported for the single-base example
h.variant_type

<VariantType.SUBSTITUTION: 1>

In [19]:
# Variant type reported for the repeat example
h2.variant_type