# ClinVar variants in constrained transcripts
This notebook describes loss-of-function (LoF) variants in ClinVar that fall in transcripts with regional nonsense constraint.

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

# Apply seaborn's "talk" plotting context to every figure drawn after this point.
sns.set_context("talk")

In [2]:
%%bash
# Fetch the NMD annotation table into ../outputs/; it is read back later in this
# notebook as ../outputs/nmd_annotations.tsv.
dx download -f -o ../outputs/ outputs/nmd_annotations.tsv

In [3]:
# Read the body of the VEP-annotated ClinVar VCF. Lines starting with "#" are
# skipped as comments, so the eight column names are supplied by hand; only the
# locus, alleles and the annotation string ("info") are kept.
vep = pd.read_csv(
    "../outputs/clinvar_variants_vep.vcf",
    sep="\t",
    comment="#",
    header=None,
    names=["chr", "pos", "id", "ref", "alt", "qual", "filter", "info"],
    usecols=["chr", "pos", "ref", "alt", "info"],
)

# A bare "." is treated as a missing value; any row with a missing value in one
# of the kept columns is discarded.
vep = vep.replace(".", np.nan)
vep = vep.dropna(how="any")

print(f"Variants with a VEP annotation: {len(vep)}")

Variants with a VEP annotation: 1958798


In [4]:
# Extract the four annotation fields used downstream from the pipe-delimited
# "info" string. Positions 1, 3, 4 and 6 become "csq", "hgnc", "ensg" and "enst"
# (presumably consequence, gene symbol, gene ID and transcript ID in the VEP
# field order -- confirm against the VCF header).
#
# Only positions 0-6 are ever read, so the split is capped at 7 separators:
# everything after field 6 stays unsplit in one trailing column that the
# positional selection discards. This avoids materialising every annotation
# field for every variant. Empty strings are converted to NaN after the
# selection so the replacement only touches the four kept columns.
i = (
    vep["info"]
    .str.split("|", n=7, expand=True)
    .iloc[:, [1, 3, 4, 6]]
    .replace("", np.nan)
    .rename(columns={1: "csq", 3: "hgnc", 4: "ensg", 6: "enst"})
)

In [5]:
# Keep a row only when its consequence is exactly "stop_gained" or
# "frameshift_variant"; every other consequence value is discarded.
i = i[i.csq.isin(["stop_gained", "frameshift_variant"])]

# Report how many variants of each class survived the filter.
print(f"Nonsense variants retained: {(i.csq=='stop_gained').sum()}")
print(f"Frameshift variants retained: {(i.csq=='frameshift_variant').sum()}")
print(f"Total retained: {len(i)}")

Nonsense variants retained: 43220
Frameshift variants retained: 73724
Total retained: 116944


In [6]:
# Swap the raw annotation string for the parsed fields. The two frames share a
# row index, so an inner index join keeps exactly the variants that passed the
# consequence filter.
vep = vep.drop(columns="info").join(i, how="inner")

# Put the gene / transcript identifiers first, then locus, alleles, consequence.
vep = vep.loc[:, ["hgnc", "ensg", "enst", "chr", "pos", "ref", "alt", "csq"]]

In [7]:
# Load the selected ClinVar variants. The "hgnc" field can hold several
# ";"-separated entries, so it is split into a list and exploded to one row per
# entry (the original row index is repeated for the exploded rows).
cv = (
    pd.read_csv("../outputs/clinvar_variants_selected.tsv", sep="\t")
    .assign(hgnc=lambda frame: frame["hgnc"].str.split(";"))
    .explode("hgnc")
)

In [8]:
# Pair each VEP-annotated variant with its ClinVar record. Matching requires the
# same locus, alleles and gene symbol, so a variant whose VEP gene differs from
# the ClinVar gene is dropped by the inner join.
df = pd.merge(vep, cv, how="inner", on=["chr", "pos", "ref", "alt", "hgnc"])

print(f"Variants where the VEP annotation matches the ClinVar gene: {len(df)}")

Variants where the VEP annotation matches the ClinVar gene: 116786


In [9]:
# Checkpoint: keep an independent copy of the merged frame so that `df` can be
# restored from it before the downstream cells overwrite `df`.
df_c = df.copy(deep=True)

In [91]:
# Restore `df` from the checkpoint copy so the cells below start from the
# un-merged frame even when they are re-run.
df = df_c.copy(deep=True)

In [92]:
# Load the per-position NMD annotation, keeping only the locus, the transcript
# and the NMD call.
nmd = pd.read_csv(
    "../outputs/nmd_annotations.tsv",
    sep="\t",
    usecols=["chr", "pos", "transcript_id", "nmd_definitive"],
)

# Align the column names with the variant table.
nmd = nmd.rename(columns={"transcript_id": "enst", "nmd_definitive": "variant_region"})

# Shorten two of the region labels. Note that a dict passed to
# DataFrame.replace is applied to every column, not only "variant_region".
nmd = nmd.replace({"nmd_target": "nmd", "distal_nmd": "distal"})

In [93]:
# Attach the NMD region to each variant. No `on=` is given, so pandas joins on
# every column name the two frames have in common; variants without a matching
# annotation row are dropped by the inner join.
df = pd.merge(df, nmd, how="inner")

# The annotation table is not needed again. Because it is deleted here, this
# cell cannot be re-run without first re-running the cell that loads `nmd`.
del nmd

print(f"Variants in canonical transcripts with an NMD annotation: {len(df)}")

Variants in canonical transcripts with an NMD annotation: 116758


In [94]:
# Load the per-transcript constraint statistics (tab-separated).
reg = pd.read_table("../outputs/constrained_transcripts_all_regions.tsv", sep="\t")

In [95]:
# Whole-transcript label stored in "gnomad": a transcript counts as constrained
# when pli >= 0.9 or loeuf <= 0.35.
m1 = reg["pli"].ge(0.9)
m2 = reg["loeuf"].le(0.35)

# Start by marking every transcript that has a pli value as "tolerant" ...
reg.loc[reg["pli"].notna(), "gnomad"] = "tolerant"
# ... then overwrite the ones that meet either threshold. Transcripts that
# match neither assignment are left without a label.
reg.loc[m1 | m2, "gnomad"] = "constrained"

In [96]:
# Regions that each get their own constraint label column in `reg`.
regions = ["nmd", "distal", "long_exon"]

for region in regions:
    # Masks built from the region's "non_*" p-value and observed count, plus a
    # filter on the "syn_*" z-score that both labels require.
    m0 = reg[f"non_{region}_p"].ge(0.001)
    m1 = reg[f"non_{region}_p"].lt(0.001)
    m2 = reg[f"non_{region}_p"].lt(0.01)
    m3 = reg[f"non_{region}_n_obs"].eq(0)
    m4 = reg[f"syn_{region}_z"].gt(-1)

    # Tolerant: passes the z-score filter and p >= 0.001.
    reg.loc[m4 & m0, region] = "tolerant"

    # Constrained: passes the z-score filter and either p < 0.001, or p < 0.01
    # with zero observed variants. This runs second, so it overwrites
    # "tolerant" for rows with 0.001 <= p < 0.01 and no observations.
    reg.loc[(m4 & m1) | (m4 & m2 & m3), region] = "constrained"

In [97]:
# Keep only the transcript ID and the four constraint label columns.
reg = reg.loc[:, ["enst", "gnomad", "nmd", "distal", "long_exon"]]

In [98]:
# Attach the constraint labels to every variant. No `on=` is given, so the join
# uses whatever column names `df` and `reg` share; variants in transcripts
# missing from `reg` are dropped by the inner join.
a = pd.merge(df, reg, how="inner")

# Collapse the clinical-significance terms into three short classes. The
# mapping is applied to every column of the frame.
a = a.replace(
    {
        "Pathogenic": "P/LP",
        "Likely pathogenic": "P/LP",
        "Uncertain significance": "VUS",
        "Benign": "B/LB",
        "Likely benign": "B/LB",
    }
)

In [100]:
# Reshape to long form: one row per variant per constraint definition. The four
# label columns become a ("constraint_region", "constraint") pair, while the
# variant identifiers, its NMD region and its "acmg" value are repeated.
b = pd.melt(
    a,
    id_vars=["chr", "pos", "ref", "alt", "enst", "variant_region", "acmg"],
    value_vars=["gnomad", "nmd", "distal", "long_exon"],
    var_name="constraint_region",
    value_name="constraint",
)