# ClinVar ascertainment statistics
A simple notebook, run as a script, to generate statistics about variant ascertainment in ClinVar.

In [1]:
# Enable IPython's autoreload extension in mode 2, which re-imports modules
# before each cell executes, so edits to the local `src` package take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# Imports
import pandas as pd

from src import constants as C
from src.statistics_for_plots import utils

## Genomic footprint of NMD regions
Find the relative genomic footprint of each NMD region, i.e. the fraction of entries in the NMD annotation table assigned to that region.

In [3]:
# Load the NMD annotation table, keeping only the region label column
# (`nmd_definitive`) and storing it as a categorical.
nmd = pd.read_csv(
    C.NMD_ANNOTATIONS, sep="\t", dtype="category", usecols=["nmd_definitive"]
)

In [4]:
# Relative footprint: the share of annotation rows carrying each region label
# (the shares sum to 1), ordered by utils.sort_region_index.
footprint = utils.sort_region_index(
    nmd["nmd_definitive"].value_counts(normalize=True).rename("footprint")
)

# Persist the per-region footprint as a tab-separated file.
footprint.to_csv(C.STATS_NMD_FOOTPRINT, sep="\t")

print("Relative genomic footprint of NMD regions:")
print(footprint)

Relative genomic footprint of NMD regions:
region
NMD target        0.613294
Start proximal    0.087547
Long exon         0.079182
Distal            0.219977
Name: footprint, dtype: float64


## Variant ascertainment in ClinVar
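Find the proportion of ClinVar truncating variants that fall in each NMD region, then normalise it by the relative genomic footprint of that region. A value above 1 means the region holds more ClinVar variants than its footprint alone would predict; a value below 1 means fewer.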

In [5]:
# Load the annotated ClinVar truncating-variant table.
cv = pd.read_csv(C.CLINVAR_LOF_ANNOTATED, sep="\t")

# Ascertainment: the share of ClinVar variants that falls in each region.
cv_asc = utils.sort_region_index(
    cv["region"].value_counts(normalize=True).rename("proportion_variants")
)

# Divide each region's variant share by its footprint share. The division
# aligns the two Series on their region index labels.
cv_asc_norm = (cv_asc / footprint).rename("prop_norm")

# Persist the normalised ascertainment as a tab-separated file.
cv_asc_norm.to_csv(C.STATS_CLINVAR_ASCERTAINMENT, sep="\t")

print("Normalised ascertainment of ClinVar variants:")
print(cv_asc_norm)

Normalised ascertainment of ClinVar variants:
region
NMD target        1.141767
Start proximal    0.672733
Long exon         1.308184
Distal            0.624069
Name: prop_norm, dtype: float64


## Proportion of VUS in ClinVar by NMD region
For truncating variants in ClinVar, find the proportion of VUS in each NMD region.

In [6]:
# Within each region, compute the share of variants in each ACMG class
# (the shares sum to 1 per region), as a long-format frame with columns
# region / acmg / proportion, ordered by utils.sort_region_column.
cv_acmg = utils.sort_region_column(
    cv.groupby("region")["acmg"]
    .value_counts(normalize=True)
    .rename("proportion")
    .reset_index()
)

# Keep only the VUS rows; .copy() detaches the subset from cv_acmg.
cv_vus = cv_acmg.loc[cv_acmg["acmg"] == "VUS"].copy()

# Persist both tables as tab-separated files without the row index.
cv_acmg.to_csv(C.STATS_CLINVAR_ACMG_REGION, sep="\t", index=False)
cv_vus.to_csv(C.STATS_CLINVAR_VUS_REGION, sep="\t", index=False)

print("Proportion of truncating variants by ACMG annotation in each region:")
print(cv_acmg)

Proportion of truncating variants by ACMG annotation in each region:
            region  acmg  proportion
6       NMD target  P/LP    0.904278
7       NMD target   VUS    0.091098
8       NMD target  B/LB    0.004624
9   Start proximal  P/LP    0.856116
10  Start proximal   VUS    0.132030
11  Start proximal  B/LB    0.011854
3        Long exon  P/LP    0.940847
4        Long exon   VUS    0.056141
5        Long exon  B/LB    0.003011
0           Distal  P/LP    0.659543
1           Distal   VUS    0.317897
2           Distal  B/LB    0.022560
