# ClinVar variants in constrained transcripts
This script describes loss-of-function (LoF) variants in ClinVar that fall in transcripts with regional nonsense constraint.

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

# Use seaborn's "talk" plotting context for figures drawn after this point.
sns.set_context("talk")

In [2]:
# Read the VEP output as a headerless, tab-separated table. Anything after a
# "#" is treated as a comment and skipped. The eight column names follow the
# fixed VCF fields; only the locus, the alleles and the INFO string are kept.
vep = pd.read_csv(
    "../outputs/clinvar_variants_vep.tsv",
    sep="\t",
    comment="#",
    header=None,
    names=["chr", "pos", "id", "ref", "alt", "qual", "filter", "info"],
    usecols=["chr", "pos", "ref", "alt", "info"],
)

# A bare "." is treated as a missing value; rows with any missing value in the
# retained columns are discarded.
vep = vep.replace(".", np.nan)
vep = vep.dropna()

In [3]:
# Split the pipe-delimited annotation string into columns and keep four of its
# fields, turning empty fields into NaN.
# NOTE(review): positions 1, 3, 4 and 6 presumably match the field order VEP
# was run with (consequence, symbol, gene ID, transcript ID) - confirm against
# the header of the annotated file.
i = (
    vep["info"]
    .str.split("|", expand=True)
    .replace("", np.nan)
    .iloc[:, [1, 3, 4, 6]]
    .rename(columns={1: "csq", 3: "hgnc", 4: "ensg", 6: "enst"})
)

# na=False: a row whose consequence field is missing counts as "no match", so
# both masks are strictly boolean and remain valid indexers for .loc below.
m1 = i.csq.str.contains("stop_gained", na=False)
m2 = i.csq.str.contains("frameshift", na=False)

# Keep only rows matching either term. The explicit copy makes `i` an
# independent frame, so the relabelling below modifies it directly instead of
# assigning into a slice of the unfiltered table.
i = i[m1 | m2].copy()

# These categories are simplistic; they capture many complex variant consequences...
# m1 and m2 are still indexed like the unfiltered table; .loc aligns them to
# the remaining rows by index label.
i.loc[m1, "csq"] = "nonsense"  # Not strictly nonsense SNVs
# Applied last, so a row matching both terms ends up labelled "frameshift".
i.loc[m2, "csq"] = "frameshift"

In [4]:
# Replace the raw annotation string with the parsed consequence columns. The
# two tables are matched on their row index, and the inner join keeps only the
# rows that are present in `i`.
vep = pd.merge(
    vep.drop(columns="info"),
    i,
    how="inner",
    left_index=True,
    right_index=True,
)

# Put gene/transcript identifiers first, then the locus, alleles and consequence.
column_order = ["hgnc", "ensg", "enst", "chr", "pos", "ref", "alt", "csq"]
vep = vep[column_order]

In [5]:
# Load the selected ClinVar variants (tab-separated, header on the first row).
cv = pd.read_csv("../outputs/clinvar_variants_selected.tsv", sep="\t")

# The "hgnc" column may hold several ";"-separated entries (presumably gene
# symbols - verify against the file). Split them and give every entry its own
# row, repeating the values of the other columns.
cv = cv.assign(hgnc=cv["hgnc"].str.split(";")).explode("hgnc")

In [6]:
# Combine the VEP consequences with the ClinVar records. Only variants found in
# both tables are kept, matched on locus, alleles and the "hgnc" column.
df = pd.merge(
    vep,
    cv,
    on=["chr", "pos", "ref", "alt", "hgnc"],
    how="inner",
)