In [1]:
from pathlib import Path
import pandas as pd
import numpy as np

# Primary location of the ClinVar variant summary (relative to the working directory).
DATA_PATH = Path('data/clinvar/variant_summary.txt.gz')
# Alternate location, consulted only when the primary file is absent.
ALT_PATH = Path('../Project/data/clinvar/variant_summary.txt.gz')

# Repoint DATA_PATH at the alternate copy when the primary is missing
# and the alternate is present; otherwise leave DATA_PATH untouched.
if not DATA_PATH.exists():
    if ALT_PATH.exists():
        print('Using course workspace ClinVar file:', ALT_PATH)
        DATA_PATH = ALT_PATH

# Show the path that later cells will read from.
DATA_PATH

PosixPath('data/clinvar/variant_summary.txt.gz')

In [3]:
# Read a small sample first (fast sanity check).
# Candidate files are tried in priority order and the first one that exists is
# read, so the read_csv arguments are written once instead of once per branch.
sample_path = next((p for p in (DATA_PATH, ALT_PATH) if p.exists()), None)

if sample_path is None:
    # Fail loudly here so later cells do not trip over an undefined df_sample.
    raise FileNotFoundError(f"Neither {DATA_PATH} nor {ALT_PATH} exists.")

if sample_path != DATA_PATH:
    # Only announce the fallback when the primary path was not the one used.
    print('Using alternate path:', ALT_PATH)

# Tab-separated, gzip-compressed text; nrows limits the read to the first 50,000 records.
df_sample = pd.read_csv(sample_path, sep='\t', compression='gzip', nrows=50_000, low_memory=False)

df_sample.shape  # (rows, columns); rows is at most 50,000 because of nrows


FileNotFoundError: Neither data/clinvar/variant_summary.txt.gz nor ../Project/data/clinvar/variant_summary.txt.gz exists.

In [None]:
# Preview the first 25 column names of the sample as a plain list.
df_sample.columns[:25].tolist()

In [None]:
# Key fields we care about
cols = [
    # significance and review fields
    'ClinicalSignificance',
    'ReviewStatus',
    'ClinSigSimple',
    # assembly and the VCF-suffixed coordinate/allele fields
    'Assembly',
    'Chromosome',
    'PositionVCF',
    'ReferenceAlleleVCF',
    'AlternateAlleleVCF',
    # start/stop and non-VCF allele fields
    'Start',
    'Stop',
    'ReferenceAllele',
    'AlternateAllele',
]
# Keep only the requested fields that the sample actually contains, in the order listed above.
[field for field in cols if field in df_sample.columns]

In [None]:
# Assembly distribution (sample)
# Missing values are counted as their own category (dropna=False); only the
# first ten rows of the count table are shown.
if 'Assembly' in df_sample.columns:
    assembly_counts = df_sample['Assembly'].value_counts(dropna=False)
    display(assembly_counts.head(10))

In [None]:
# Clinical significance (sample)
# Missing values are counted as their own category (dropna=False); only the
# first twenty rows of the count table are shown.
if 'ClinicalSignificance' in df_sample.columns:
    clinsig_counts = df_sample['ClinicalSignificance'].value_counts(dropna=False)
    display(clinsig_counts.head(20))

In [None]:
# Chromosome distribution (sample)
# Values are cast to str and every literal 'chr' is removed (plain substring
# replacement, not a regex) before counting; the first thirty rows are shown.
if 'Chromosome' in df_sample.columns:
    chrom_labels = df_sample['Chromosome'].astype(str).str.replace('chr', '', regex=False)
    display(chrom_labels.value_counts().head(30))

## Next steps (after this notebook)

1. Define *high-confidence* label mapping rules for your curated dataset.
2. Decide the assembly for v1 (default: GRCh38).
3. Create a curated Parquet with a binary `label` column and a single assembly.
4. Run VEP (Ensembl Variant Effect Predictor) on a pilot set to define the non-coding subset.
5. Use `scripts/make_splits.py` to generate chromosome holdout splits.