### How to associate consequences derived from HGVS expressions with ClinVar records?

In [1]:
from collections import defaultdict, Counter
from itertools import zip_longest
import json
import os
import re
import sys
import urllib

import numpy as np
import requests

from eva_cttv_pipeline.clinvar_xml_io.clinvar_xml_io import *

%matplotlib inline
import matplotlib.pyplot as plt

In [2]:
from eva_cttv_pipeline.clinvar_xml_io.clinvar_xml_io.hgvs_variant import HgvsVariant

In [3]:
# Directory holding the ClinVar XML files used in this notebook. Set the
# OPENTARGETS_PROJECT_ROOT environment variable to override it without editing the notebook;
# the previous hard-coded location is kept as the default.
PROJECT_ROOT = os.environ.get('OPENTARGETS_PROJECT_ROOT', '/home/april/projects/opentargets/')
clinvar_path = os.path.join(PROJECT_ROOT, 'ClinVarFullRelease_00-latest.xml.gz')
# Alternative input (the sample file) - uncomment to use it instead of the full release:
# clinvar_path = os.path.join(PROJECT_ROOT, 'clinvar-sample.xml.gz')
dataset = ClinVarDataset(clinvar_path)

In [4]:
def vcv(record):
    """Return the `Acc` attribute of the record's `./MeasureSet` element (the VCV accession).

    Returns None when the record has no (truthy) measure.
    """
    if not record.measure:
        return None
    measure_set = find_mandatory_unique_element(record.rcv, './MeasureSet')
    return measure_set.attrib['Acc']

In [30]:
# `pipeline` was previously used in this cell without being imported anywhere above it,
# so the cell failed on a fresh kernel.
from consequence_prediction.structural_variants import pipeline

limit = None  # maximum number of records to scan in each pass; None scans the whole dataset


def _limited(records, max_records):
    """Yield at most `max_records` items from `records` (all of them when `max_records` is None)."""
    for index, record in enumerate(records):
        if max_records is not None and index >= max_records:
            break
        yield record


# for each HGVS that structural variants pipeline would process, how many rcvs/vcvs are associated?
#  and would any of them potentially get annotated by the simple VEP pipeline? (assuming repeats override complex)
complex_hgvs_to_complex_rcv = defaultdict(list)
complex_hgvs_to_complex_vcv = defaultdict(list)

complex_hgvs_to_other_rcv = defaultdict(list)
complex_hgvs_to_other_vcv = defaultdict(list)

# Pass 1: collect every HGVS attached to a record the structural pipeline can process.
for r in _limited(dataset, limit):
    if pipeline.can_process(r):
        for h in r.measure.current_hgvs:
            if h is not None:
                complex_hgvs_to_complex_rcv[h].append(r.accession)
                complex_hgvs_to_complex_vcv[h].append(vcv(r))

# Pass 2: find the remaining records that share one of those HGVS expressions.
# This needs the complete set from pass 1: doing both in a single pass silently misses any
# "other" record that appears in the file before the complex record it shares an HGVS with.
# A second scan costs time but avoids keeping every HGVS of every record in memory.
for r in _limited(dataset, limit):
    if pipeline.can_process(r) or not (r.measure and r.measure.current_hgvs):
        continue
    for h in r.measure.current_hgvs:
        if h is not None and h in complex_hgvs_to_complex_rcv:
            complex_hgvs_to_other_rcv[h].append(r.accession)
            complex_hgvs_to_other_vcv[h].append(vcv(r))

### Part 1: "Problem" RCVs

RCVs with and without coordinates that are associated with the same genomic HGVS.

In [48]:
from eva_cttv_pipeline.clinvar_xml_io.clinvar_xml_io.hgvs_variant import SequenceType

# HGVS expressions shared between complex and other records, restricted to genomic ones
genomic_shared_hgvs = [
    hgvs for hgvs in complex_hgvs_to_other_rcv
    if HgvsVariant(hgvs).sequence_type == SequenceType.GENOMIC
]

# Gather the accessions from both sides (other and complex) into one de-duplicated set
problem_rcvs = set()
for hgvs in genomic_shared_hgvs:
    problem_rcvs.update(complex_hgvs_to_other_rcv[hgvs])
    problem_rcvs.update(complex_hgvs_to_complex_rcv[hgvs])

In [49]:
# Display the collected accessions: records the structural pipeline can process ("complex")
# together with the other records that share a genomic HGVS with them.
problem_rcvs  # includes both complex and "other" rcvs

{'RCV000020022',
 'RCV000235026',
 'RCV000516854',
 'RCV000556621',
 'RCV001169937',
 'RCV001192398',
 'RCV001780504',
 'RCV001783376'}

In [54]:
# Print accession, VCV, HGVS list and VCF-style coordinates for every problem record
for record in dataset:
    if record.accession not in problem_rcvs:
        continue
    measure = record.measure
    print(record.accession)
    print(vcv(record))
    print(measure.current_hgvs)
    print(measure.vcf_full_coords)
    print('\n=========\n')

RCV001780504
VCV001322084
['LRG_1254t1:c.1327del', 'NM_000080.4:c.1327del', 'LRG_1254:g.9185del', 'NG_028005.1:g.70553del', 'NG_008029.2:g.9185del', 'NC_000017.11:g.4898892del', None, 'LRG_1254p1:p.Glu443fs', 'NP_000071.1:p.Glu443fs']
None


RCV001783376
VCV001323012
['LRG_672t1:c.803_806del', 'NM_001199974.2:c.177+6741_177+6744del', 'NM_001199973.2:c.300+3106_300+3109del', 'NM_000169.3:c.803_806del', 'LRG_672:g.14398_14401del', 'NG_007119.1:g.14398_14401del', 'NC_000023.11:g.101398566_101398569del', 'NR_164783.1:n.882_885del', None, None, 'NP_000160.1:p.Met267_Leu268insTer']
None


RCV001192398
VCV000222420
['NM_001199974.2:c.177+6744_177+6747del', 'NM_001199973.2:c.300+3109_300+3112del', 'LRG_672:g.14398_14401del', 'NG_007119.1:g.14398_14401del', 'NC_000023.11:g.101398566_101398569del', 'NC_000023.10:g.100653551_100653554del', None, 'p.L268X']
X_101398562_CACTA_C


RCV000020022
VCV000243031
['NM_000080.4:c.1327delG', 'LRG_1254t1:c.1327del', 'LRG_1254:g.9185del', 'NG_028005.1:g.70553d

In [None]:
# Print each genomic HGVS (from complex records) that is attached to more than one distinct VCV
for hgvs, vcv_accessions in complex_hgvs_to_complex_vcv.items():
    distinct_vcvs = set(vcv_accessions)
    if len(distinct_vcvs) <= 1:
        continue
    # Only parse the HGVS once we know it is attached to several VCVs
    if HgvsVariant(hgvs).sequence_type == SequenceType.GENOMIC:
        print(hgvs)
        print(distinct_vcvs)
        print('\n========\n')

In [61]:
# for two sets of HGVS identifiers associated with two different VCVs, what's the intersection & set difference?
# `no_coordinates` is the current_hgvs list printed above for RCV001780504 (whose vcf_full_coords is None);
# `with_coordinates` appears to be the list for RCV000020022, whose printed output is truncated above.
# Both are set literals, so repeated entries (e.g. the two `None`s) collapse into one.
with_coordinates = {'NM_000080.4:c.1327delG', 'LRG_1254t1:c.1327del', 'LRG_1254:g.9185del', 'NG_028005.1:g.70553del', 'NG_008029.2:g.9185del', 'NC_000017.11:g.4898892del', 'NC_000017.10:g.4802186del', None, None, 'p.Glu443Lysfs*64', 'NP_000071.1:p.Glu443LysfsTer64'}
no_coordinates = {'LRG_1254t1:c.1327del', 'NM_000080.4:c.1327del', 'LRG_1254:g.9185del', 'NG_028005.1:g.70553del', 'NG_008029.2:g.9185del', 'NC_000017.11:g.4898892del', None, 'LRG_1254p1:p.Glu443fs', 'NP_000071.1:p.Glu443fs'}

In [64]:
# HGVS expressions present in both sets
with_coordinates & no_coordinates

{'LRG_1254:g.9185del',
 'LRG_1254t1:c.1327del',
 'NC_000017.11:g.4898892del',
 'NG_008029.2:g.9185del',
 'NG_028005.1:g.70553del',
 None}

In [65]:
# HGVS expressions found only in the `with_coordinates` set
with_coordinates - no_coordinates

{'NC_000017.10:g.4802186del',
 'NM_000080.4:c.1327delG',
 'NP_000071.1:p.Glu443LysfsTer64',
 'p.Glu443Lysfs*64'}

In [66]:
# HGVS expressions found only in the `no_coordinates` set
no_coordinates - with_coordinates

{'LRG_1254p1:p.Glu443fs', 'NM_000080.4:c.1327del', 'NP_000071.1:p.Glu443fs'}

### Notes
* Is the same measure (VCV or HGVS) ever associated with different RCVs?
    * Yes - unlike for repeat expansion variants, these structural ones (no coordinates but with HGVS) can have the same measure/VCV associated with multiple RCVs
    * hence _cannot_ use RCV to connect records to consequences
* Is the same HGVS ever associated with RCVs with and without complete coordinates?
    * Yes, surprisingly...
    * when this happens it's via different VCVs
* If complex HGVS is associated with another RCV that does have complete coordinates, are these aligned?
    * Not always exactly...
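    * e.g. `NC_000017.11:g.4898892del` is associated with one RCV that has no coordinates and with another whose coordinates are `17_4898890_TC_T` (the VCF-style record removes the base at 4898891 rather than 4898892 — possibly the same deletion, left-aligned in VCF but 3'-shifted in HGVS; not verified here)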
* Is the same HGVS ever associated with different VCVs?
    * Yes, very often actually
* Is the same _set_ of HGVS ever associated with different VCVs?
    * Not in the case examined above, but this cannot be ruled out in general (so it is probably not worth relying on)

Bonus question (unresolved): how does ClinVar decide which HGVS expressions are associated with which VCV?

### Part 2: Consequences of "problem" RCVs

What happens if we actually run the two relevant consequence mapping pipelines on these RCVs?

In [21]:
import pandas as pd
# None removes the column-width limit, so long cell values (e.g. VariantID strings) are shown in full
pd.set_option('display.max_colwidth', None)

from consequence_prediction.structural_variants import pipeline as structural_pipeline
from consequence_prediction.vep_mapping_pipeline.consequence_mapping import colon_based_id_to_vep_id, process_variants

In [6]:
# Separate, smaller ClinVar XML file; judging by its name it holds the RCVs that share an HGVS
# (the "problem" RCVs from Part 1) - TODO confirm how this file was produced
problem_path = os.path.join(PROJECT_ROOT, 'complex-events/rcvs_sharing_hgvs.xml.gz')
problem_dataset = ClinVarDataset(problem_path)

In [16]:
# Notebook-friendly variant of the VEP pipeline entry point: it returns a DataFrame.
# The pattern below matches any character other than A, C, G or T.
IUPAC_AMBIGUOUS_SEQUENCE = re.compile(r'[^ACGT]')

def vep_pipeline_main(clinvar_xml):
    """Run VEP consequence mapping for the records in `clinvar_xml` that have complete coordinates.

    Records are skipped when they have no measure, when the measure lacks complete
    coordinates, or when the ref/alt alleles contain anything other than A, C, G, T.

    :param clinvar_xml: ClinVar XML input, handed to `ClinVarDataset`
    :return: DataFrame with columns VariantID, PlaceholderOnes, EnsemblGeneID,
        EnsemblGeneName, ConsequenceTerm and Distance (PlaceholderOnes is always the string '1')
    """
    colon_ids = []
    for record in ClinVarDataset(clinvar_xml):
        measure = record.measure
        if measure is None or not measure.has_complete_coordinates:
            continue
        if IUPAC_AMBIGUOUS_SEQUENCE.search(measure.vcf_ref + measure.vcf_alt) is None:
            colon_ids.append(f'{measure.chr}:{measure.vcf_pos}:{measure.vcf_ref}:{measure.vcf_alt}')

    vep_ids = [colon_based_id_to_vep_id(colon_id) for colon_id in colon_ids]
    rows = [
        (variant_id, '1', gene_id, gene_symbol, consequence_term, distance)
        for variant_id, gene_id, gene_symbol, consequence_term, distance in process_variants(vep_ids)
    ]
    column_names = ('VariantID', 'PlaceholderOnes', 'EnsemblGeneID',
                    'EnsemblGeneName', 'ConsequenceTerm', 'Distance')
    return pd.DataFrame(rows, columns=column_names)

In [17]:
# Run the coordinate-based VEP pipeline on the problem RCVs file
vep_consequences = vep_pipeline_main(problem_path)

In [22]:
# Display the VEP pipeline results
vep_consequences

Unnamed: 0,VariantID,PlaceholderOnes,EnsemblGeneID,EnsemblGeneName,ConsequenceTerm,Distance
0,X 101398562 . CACTA C,1,ENSG00000102393,GLA,frameshift_variant,0
1,17 4898890 . TC T,1,ENSG00000108556,CHRNE,frameshift_variant,0


In [33]:
# Run the structural variants pipeline on the same problem RCVs file
struct_consequences = structural_pipeline.main(problem_path)

INFO:consequence_prediction.structural_variants.pipeline:2 records processed with 17 HGVS expressions
INFO:consequence_prediction.structural_variants.pipeline:3 parsed into chrom/start/end/type
INFO:consequence_prediction.structural_variants.pipeline:Done with batch 1


In [34]:
# haven't implemented the single base deletion case as it's not a range, but I think we'd get the following
# https://rest.ensembl.org/vep/human/region/NC_000017.11:4898892-4898892:1/DEL?content-type=application/json
single_base_deletion = pd.DataFrame(
    [['NC_000017.11 4898892 4898892 DEL +', 1, 'ENSG00000108556', 'CHRNE', 'frameshift_variant', 0]],
    columns=('VariantID', 'PlaceholderOnes', 'EnsemblGeneID', 'EnsemblGeneName', 'ConsequenceTerm', 'Distance')
)
# `DataFrame.append` was removed in pandas 2.0; `pd.concat` gives the same result (original
# row indices are kept). The guard makes this cell safe to re-run: without it every
# execution would add another copy of the manual row.
if not struct_consequences['VariantID'].isin(single_base_deletion['VariantID']).any():
    struct_consequences = pd.concat([struct_consequences, single_base_deletion])

In [35]:
# Display the structural pipeline results, including the manually added single-base deletion row
struct_consequences

Unnamed: 0,VariantID,PlaceholderOnes,EnsemblGeneID,EnsemblGeneName,ConsequenceTerm,Distance
0,NC_000023.11 101398566 101398569 DEL + NC_000023.11:g.101398566_101398569del,1,ENSG00000102393,GLA,coding_sequence_variant,0
0,NC_000017.11 4898892 4898892 DEL +,1,ENSG00000108556,CHRNE,frameshift_variant,0
