# ClinVar variants
This notebook matches ClinVar variants that introduce premature termination codons (PTCs) to nonsense-mediated decay (NMD) annotations.

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

# Switch seaborn to its "talk" plotting context.
sns.set_context(context="talk")

In [5]:
%%bash
# Abort on the first failing command (or unset variable / failed pipe stage) so
# that a failed download can never be followed by decompressing a stale archive
# left over from an earlier run.
set -euo pipefail

# Fetch the gzipped ClinVar variant summary into ../data/ using the `dx` CLI.
dx download -f -o ../data/ data/variant_summary.txt.gz

# Decompress in place; -f overwrites any existing ../data/variant_summary.txt.
gunzip -f ../data/variant_summary.txt.gz

In [26]:
# Columns loaded from the ClinVar variant summary, mapped to the short names
# used in the rest of the notebook.
column_map = {
    "Type": "type",
    "GeneSymbol": "hgnc",
    "ClinicalSignificance": "acmg",
    "Origin": "origin",
    "Assembly": "assembly",
    "Chromosome": "chr",
    "PositionVCF": "pos",
    "ReferenceAlleleVCF": "ref",
    "AlternateAlleleVCF": "alt",
    "ReviewStatus": "review",
}
usecols = list(column_map)
names = list(column_map.values())

# Chromosome labels to keep: "1" through "22" plus "X" and "Y", all as strings.
chrom = [*map(str, range(1, 23)), "X", "Y"]

cv = (
    pd.read_csv(
        "../data/variant_summary.txt",
        sep="\t",
        usecols=usecols,
        # Read chromosomes as text so they compare against the labels in `chrom`.
        dtype={"Chromosome": str},
        # Only the first 100 data rows are read here.
        nrows=100,
        low_memory=False,
    )
    # Renaming is done by column name, so the order in which pandas returns the
    # selected columns does not matter.
    .rename(columns=column_map)
    .query(f"chr.isin({chrom})")
)
cv.head()

Unnamed: 0,type,hgnc,acmg,origin,assembly,chr,review,pos,ref,alt
0,Indel,AP5Z1,Pathogenic,germline;unknown,GRCh37,7,"criteria provided, single submitter",4820844,GGAT,TGCTGTAAACTGTAACTGTAAA
1,Indel,AP5Z1,Pathogenic,germline;unknown,GRCh38,7,"criteria provided, single submitter",4781213,GGAT,TGCTGTAAACTGTAACTGTAAA
2,Deletion,AP5Z1,Pathogenic,germline,GRCh37,7,no assertion criteria provided,4827360,GCTGCTGGACCTGCC,G
3,Deletion,AP5Z1,Pathogenic,germline,GRCh38,7,no assertion criteria provided,4787729,GCTGCTGGACCTGCC,G
4,single nucleotide variant,ZNF592,Uncertain significance,germline,GRCh37,15,no assertion criteria provided,85342440,G,A


In [None]:
# Load the header-less variant table, labelling its columns with `names`, and
# keep only rows whose chromosome is one of the labels in `chrom`.
# NOTE(review): this assumes the column order of the TSV matches `names` —
# confirm against whatever writes ../outputs/clinvar_variants.tsv.
df = pd.read_csv(
    "../outputs/clinvar_variants.tsv",
    sep="\t",
    header=None,
    names=names,
    # Parse chromosomes as text (as the ClinVar summary load does). Without
    # this, a column that happens to contain only numeric chromosomes is
    # inferred as integers and the string-based `isin` below matches nothing.
    dtype={"chr": str},
    low_memory=False,
).query(f"chr.isin({chrom})")

In [8]:
# Substrings of the review status that disqualify a record.
null_review = [
    "no_assertion",
    "no_interpretation",
]
# Substrings of the clinical significance that disqualify a record.
null_acmg = [
    "benign",
    "uncertain",
    "not_provided",
    "drug_response",
    "other",
    "risk",
    "low_penetrance",
    "conflicting",
]

# m1 / m2 are True for rows to KEEP: the value contains none of the substrings
# above (matched case-insensitively).
# `na=True` makes a missing value count as a match, so rows without a review
# status or significance are dropped instead of making `~` fail on NaN.
# NOTE(review): confirm that excluding rows with missing values is intended.
m1 = ~df["review"].str.contains("|".join(null_review), case=False, na=True)
m2 = ~df["acmg"].str.contains("|".join(null_acmg), case=False, na=True)

# Take an explicit copy so that later column assignments on `df` act on an
# independent frame rather than on a slice of the unfiltered one
# (avoids SettingWithCopyWarning).
df = df[m1 & m2].copy()

In [9]:
# Keep only the text before the first ":" in each gene entry
# (presumably "SYMBOL:id" pairs — verify against the input file).
df["hgnc"] = df["hgnc"].str.split(":").str.get(0)

In [14]:
# Turn each comma-separated consequence string into a list of strings.
# The vectorised `.str.split` leaves missing values as NaN instead of raising,
# and bracket assignment always targets the column, never a plain attribute.
# NOTE(review): `csq` is not among the column names assigned when `df` is
# loaded in this notebook — confirm where this column comes from.
df["csq"] = df["csq"].str.split(",")