#### ClinVar Missense Pathogenicity Analysis

In [14]:
import pandas as pd
import numpy as np
import requests
import gzip
import shutil
from pathlib import Path
import re
import os
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

#### Fetch and Filter Data

In [15]:
# Data source configuration
CLINVAR_URL = "https://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/variant_summary.txt.gz"
CLINVAR_GZ = "variant_summary.txt.gz"
CLINVAR_TXT = "variant_summary.txt"
OUTPUT_CSV = "clinvar_missense_variants.csv"

# Filtering criteria
VALID_SIGNIFICANCE = {"Pathogenic", "Likely pathogenic", "Benign", "Likely benign"}
GENOME_BUILD = "GRCh38"  # Compatible with AlphaFold, Ensembl, UniProt
CHUNK_SIZE = 100_000

# Precompiled regex patterns for efficiency
HAS_P_PROTEIN_CHANGE = re.compile(r"\(p\.", re.ASCII)
EXTRACT_PROTEIN_CHANGE = re.compile(r"\(p\.(.*?)\)", re.ASCII)
IS_MISSENSE = re.compile(r"^[A-Z][a-z]{2}\d+[A-Z][a-z]{2}$", re.ASCII)

# Columns to read (memory optimization)
READ_COLS = ["Assembly", "Type", "ClinicalSignificance", "Name", "GeneSymbol", "HGNC_ID"]

In [16]:
def download_and_extract_clinvar() -> None:
    if not Path(CLINVAR_GZ).exists():
        with requests.get(CLINVAR_URL, stream=True) as r:
            r.raise_for_status()
            with open(CLINVAR_GZ, "wb") as f_out:
                shutil.copyfileobj(r.raw, f_out)

    if not Path(CLINVAR_TXT).exists():
        with gzip.open(CLINVAR_GZ, "rb") as f_in, open(CLINVAR_TXT, "wb") as f_out:
            shutil.copyfileobj(f_in, f_out)

In [17]:
def filter_chunk(chunk: pd.DataFrame) -> pd.DataFrame:
    """
    Filter ClinVar chunk for protein-coding missense SNVs with clear clinical significance, using GRCh38 coordinates.
    """
    # Initial filters for relevant vatiant type

    chunk = chunk[
        (chunk["Assembly"] == GENOME_BUILD)
        & (chunk["Type"] == "single nucleotide variant")
        & (chunk["ClinicalSignificance"].isin(VALID_SIGNIFICANCE))
        & (chunk["Name"].str.contains(HAS_P_PROTEIN_CHANGE, na=False))
    ].copy()

    # Extract protein change string
    
    chunk["protein_change"] = chunk["Name"].str.extract(EXTRACT_PROTEIN_CHANGE)

    # Keep only missense variants
    # Match: 3-letter AA, position, and new AA

    missense_mask = chunk["protein_change"].str.match(IS_MISSENSE, na=False)
    chunk = chunk[missense_mask].copy()

    return chunk[
        ["GeneSymbol", "HGNC_ID", "ClinicalSignificance", "Name", "protein_change"]
    ].rename(
        columns={"GeneSymbol": "gene"}
    )


In [18]:
def filter_missense_variants_streaming() -> None:
    reader = pd.read_csv(
        CLINVAR_TXT,
        sep="\t",
        chunksize=CHUNK_SIZE,
        low_memory=False,
        usecols=READ_COLS,
        dtype={"Assembly": "category", "Type": "category", "ClinicalSignificance": "category"},
    )

    is_first = True
    total = 0

    for chunk in reader:
        filtered = filter_chunk(chunk)
        if not filtered.empty:
            filtered.to_csv(OUTPUT_CSV, mode="w" if is_first else "a", header=is_first, index=False)
            is_first = False
            total += len(filtered)
    
    print(f"Saved {total} missense variants to {OUTPUT_CSV}.")

In [19]:
# Main execution

if __name__ == "__main__":
    download_and_extract_clinvar()
    filter_missense_variants_streaming()

Saved 239112 missense variants to clinvar_missense_variants.csv.
