# ClinVar variants
This script processes the ClinVar summary text file.
Variants are filtered and reformatted.
A TSV file and VCF file are written to output, for downstream annotation with VEP.

## Preliminaries
Import modules and download data

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

# Select seaborn's "talk" plotting context for the rest of the session
sns.set_context("talk")

In [2]:
%%bash
# Stop at the first failing command (and on unset variables / pipeline errors)
# so that a failed download never falls through to the decompression step.
set -euo pipefail

# Fetch the compressed ClinVar summary into ../data/
dx download -f -o ../data/ data/variant_summary.txt.gz
# Decompress in place; -f forces overwriting an existing variant_summary.txt
gunzip -f ../data/variant_summary.txt.gz

## Read and reformat the ClinVar text file

In [3]:
# ClinVar column name -> short column name used in the rest of this notebook.
# Keeping both in a single mapping guarantees the two lists below stay aligned.
clinvar_to_short = {
    "Type": "type",
    "GeneSymbol": "hgnc",
    "ClinicalSignificance": "acmg",
    "Assembly": "assembly",
    "Chromosome": "chr",
    "PositionVCF": "pos",
    "ReferenceAlleleVCF": "ref",
    "AlternateAlleleVCF": "alt",
    "ReviewStatus": "review",
}
# Columns to load from the text file
usecols = list(clinvar_to_short)
# Their simplified names, in the same order
names = list(clinvar_to_short.values())

# Chromosomes to keep: "1" .. "22", "X" and "Y"
chrom = [*map(str, range(1, 23)), "X", "Y"]

# Load the selected columns, rename them, keep GRCh38 records on the chromosomes
# listed above, and prefix the chromosome label with "chr".
cv = (
    pd.read_csv(
        "../data/variant_summary.txt",
        sep="\t",
        usecols=usecols,
        dtype={"Chromosome": str},  # read chromosome labels as strings
        low_memory=False,
    )
    .rename(columns=clinvar_to_short)
    .query(f"chr.isin({chrom})")
    .query("assembly == 'GRCh38'")
    .drop(columns="assembly")  # constant after the filter above
    .assign(chr=lambda df: "chr" + df["chr"])
)

## Filter ClinVar variants
Based on review status and ACMG category

In [4]:
# Review-status substrings that exclude a variant (matched against the
# lower-cased "review" column)
null_review = [
    "no assertion",
    "no interpretation",
]

# Clinical-significance substrings that exclude a variant (matched against the
# lower-cased "acmg" column)
null_acmg = [
    "not provided",
    "drug response",
    "other",
    "risk",
    "low penetrance",
    "conflicting",
    "affects",
    "association",
    "protective",
    "confers sensitivity",
]

# Each list is joined with "|" into one alternation pattern; a mask is True
# where the lower-cased field contains none of the excluded substrings.
m1 = ~cv.review.str.lower().str.contains("|".join(null_review))
m2 = ~cv.acmg.str.lower().str.contains("|".join(null_acmg))

# Keep rows that pass both masks, then tidy up:
# - drop=True discards the old row labels instead of storing them in an extra
#   "index" column, so re-running this cell does not pile up leftover columns.
# - the nested mapping restricts the replacement to the "acmg" column, folding
#   the combined categories into their "Likely ..." counterpart.
cv = (
    cv[m1 & m2]
    .reset_index(drop=True)
    .replace(
        {
            "acmg": {
                "Benign/Likely benign": "Likely benign",
                "Pathogenic/Likely pathogenic": "Likely pathogenic",
            }
        }
    )
)

## Save to output

In [5]:
# Tab-separated output: keep only these columns, in this order
cv = cv.loc[:, ["chr", "pos", "ref", "alt", "hgnc", "acmg", "review"]]
# Write with a header row and without the row index
cv.to_csv(
    "../outputs/clinvar_variants_selected.tsv",
    index=False,
    sep="\t",
)

In [6]:
# VCF-style table for downstream annotation with VEP.
# Start from the four variant-defining columns ...
vcf = cv.loc[:, ["chr", "pos", "ref", "alt"]].copy()
# ... use the current row label of `cv` as the identifier, placed third ...
vcf.insert(loc=2, column="_id", value=vcf.index)
# ... and fill the remaining fields with the "." placeholder.
vcf = vcf.assign(qual=".", _filter=".", info=".")

# Data rows only: neither a header line nor the row index is written.
vcf.to_csv(
    "../outputs/clinvar_variants_selected.vcf",
    header=False,
    index=False,
    sep="\t",
)