# ClinVar variants
This script matches ClinVar variants that introduce premature termination codons (PTCs) to nonsense-mediated decay (NMD) annotations.

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

# Switch seaborn to its "talk" plotting context; this is a global setting that
# applies to figures drawn after this call.
sns.set_context("talk")

In [2]:
%%bash
# Fetch the compressed ClinVar variant summary into ../data/.
# NOTE(review): `dx` is presumably the DNAnexus CLI, with -f overwriting an
# existing local copy and -o naming the output location -- confirm.
dx download -f -o ../data/ data/variant_summary.txt.gz
# Decompress in place; -f replaces any existing ../data/variant_summary.txt.
gunzip -f ../data/variant_summary.txt.gz

In [10]:
# Source column in the variant summary table -> short name used in this notebook.
column_map = {
    "Type": "type",
    "GeneSymbol": "hgnc",
    "ClinicalSignificance": "acmg",
    "Assembly": "assembly",
    "Chromosome": "chr",
    "PositionVCF": "pos",
    "ReferenceAlleleVCF": "ref",
    "AlternateAlleleVCF": "alt",
    "ReviewStatus": "review",
}
usecols = list(column_map)
names = list(column_map.values())

# Chromosome labels that are kept: "1".."22", "X" and "Y".
chrom = [*map(str, range(1, 23)), "X", "Y"]

# Read only the columns of interest; chromosomes are forced to text so that
# numeric labels and "X"/"Y" share a single string dtype.
cv = pd.read_csv(
    "../data/variant_summary.txt",
    sep="\t",
    usecols=usecols,
    low_memory=False,
    dtype={"Chromosome": str},
)
cv = cv.rename(columns=column_map)

# Keep GRCh38 records on one of the listed chromosomes; the assembly column
# is constant after this filter, so it is dropped.
on_listed_chrom = cv["chr"].isin(chrom)
is_grch38 = cv["assembly"] == "GRCh38"
cv = cv.loc[on_listed_chrom & is_grch38].drop(columns="assembly")

# Prefix chromosome labels with "chr" (e.g. "1" -> "chr1").
cv["chr"] = "chr" + cv["chr"]

In [11]:
# Substrings (matched against the lower-cased text) that disqualify a record
# based on its review status.
null_review = ["no assertion", "no interpretation"]

# Substrings (matched against the lower-cased text) that disqualify a record
# based on its clinical significance. Matching is by substring, so "benign"
# also excludes values such as "likely benign".
null_acmg = [
    "benign",
    "uncertain",
    "not provided",
    "drug response",
    "other",
    "risk",
    "low penetrance",
    "conflicting",
    "affects",
    "association",
    "protective",
    "confers sensitivity",
]

# Each list becomes one regex alternation: "term1|term2|...".
review_pattern = "|".join(null_review)
acmg_pattern = "|".join(null_acmg)

# m1 / m2 are True for rows whose text contains none of the excluded terms.
m1 = ~cv["review"].str.lower().str.contains(review_pattern)
m2 = ~cv["acmg"].str.lower().str.contains(acmg_pattern)

# Keep rows passing both filters. reset_index() renumbers the rows from 0 and
# retains the previous row labels in a new "index" column.
cv = cv.loc[m1 & m2].reset_index()

In [12]:
# Fix the column order of the selected-variant table; any column not listed
# here is discarded.
selected_columns = ["chr", "pos", "ref", "alt", "hgnc", "acmg", "review"]
cv = cv.loc[:, selected_columns]

# Save as a tab-separated file with a header row and without the row index.
cv.to_csv("../outputs/clinvar_variants_selected.tsv", sep="\t", index=False)

In [17]:
# Build a headerless, VCF-style table with eight columns in the order
# chr, pos, _id, ref, alt, qual, _filter, info.
# NOTE(review): the input table may contain placeholder positions/alleles for
# variants without a VCF representation -- confirm whether such rows should be
# removed before this export.
vcf = cv[["chr", "pos", "ref", "alt"]].copy()

# The record identifier is the row label of `cv`, placed as the third column.
vcf.insert(2, "_id", cv.index)

# The remaining fields carry no information and are filled with ".".
for placeholder in ("qual", "_filter", "info"):
    vcf[placeholder] = "."

# Tab-separated, no header line and no index column.
vcf.to_csv(
    "../outputs/clinvar_variants_selected.vcf",
    sep="\t",
    index=False,
    header=False,
)