In [2]:
import re
import pandas as pd

# Column headers exactly as they are spelled in the input table.
HGVS_COL, DBSNP_COL, FREQ_COL = "HGVS", "dbSNP", "gnomAD frequency"

# Example variants CSV hosted on GitHub. To work offline, point VARIANTS_URL
# at a local copy of example_variants.csv instead.
VARIANTS_URL = "https://raw.githubusercontent.com/aarnavp009/AarnavProjects/main/example_variants.csv"

df = pd.read_csv(VARIANTS_URL)

def split_hgvs(h: str) -> tuple:
    """Split an HGVS string into transcript, gene, change and a coarse variant type.

    The expected input shape is ``<transcript>(<gene>):<change>``, for example
    ``"NM_000492.4(CFTR):c.1408G>A (p.Val470Met)"``. Only ``NM_`` transcripts
    with a version suffix are recognised.

    Parameters
    ----------
    h : str
        Raw HGVS value. Non-string input (e.g. NaN from pandas) is tolerated.

    Returns
    -------
    tuple
        ``(transcript, gene, change, kind)``. ``change`` is everything after the
        colon, stripped, including any trailing protein annotation. ``kind`` is
        one of ``"intronic_or_splice"``, ``"coding_substitution"``, ``"delins"``,
        ``"deletion"``, ``"insertion"``, ``"duplication"`` or ``"cdna_other"``.
        If ``h`` is not a string or does not match the expected shape, all four
        elements are ``None``.
    """
    if not isinstance(h, str):
        return (None, None, None, None)

    h = h.strip()

    m = re.match(r"^(NM_\d+\.\d+)\(([^)]+)\):(.+)$", h)
    if not m:
        return (None, None, None, None)

    transcript, gene, change = m.groups()
    change = change.strip()

    # simple classification
    kind = "cdna_other"
    if change.startswith("c."):
        # Classify on the cDNA part only: a trailing "(p.…)" protein annotation
        # can contain "del"/"ins"/"dup" itself and would skew the keyword checks.
        cdna = re.split(r"\s*\(p\.", change, maxsplit=1)[0]

        # intronic/splice-like patterns have a +/- offset after the position
        if re.search(r"c\.\d+[+-]\d+", cdna):
            kind = "intronic_or_splice"
        elif re.search(r"c\.\d+[ACGT]>[ACGT]", cdna):
            kind = "coding_substitution"
        # "delins" must be tested before "del" and "ins": it contains both as
        # substrings, so the broader checks would otherwise shadow it.
        elif "delins" in cdna:
            kind = "delins"
        elif "del" in cdna:
            kind = "deletion"
        elif "ins" in cdna:
            kind = "insertion"
        elif "dup" in cdna:
            kind = "duplication"

    return (transcript, gene, change, kind)

# Run the HGVS parser on every row; each element of `parsed` is a 4-tuple.
parsed = df[HGVS_COL].apply(split_hgvs)

# Expand those tuples into four new columns on df, aligned on df's own index.
hgvs_columns = ["transcript", "gene", "cdna_change", "variant_type"]
df[hgvs_columns] = pd.DataFrame(parsed.tolist(), index=df.index)

# Keep only the reporting columns and give the two source headers snake_case names.
report_columns = hgvs_columns[1::-1] + hgvs_columns[2:] + [DBSNP_COL, FREQ_COL]
renamed_headers = {DBSNP_COL: "dbsnp", FREQ_COL: "gnomad_frequency"}
trimmed = df[report_columns].rename(columns=renamed_headers)

# Write the trimmed table (relative path, so it lands in the working directory)
# and preview the first ten rows as plain text.
trimmed.to_csv("cftr_trimmed_from_table.csv", index=False)
print("Saved: cftr_trimmed_from_table.csv")
preview = trimmed.head(10)
print(preview.to_string(index=False))

# Print the working directory so the saved file can be located.
import os
print(os.getcwd())

Saved: cftr_trimmed_from_table.csv
gene  transcript             cdna_change        variant_type      dbsnp  gnomad_frequency
CFTR NM_000492.4              c.1210-12=  intronic_or_splice  rs1805177           0.97144
CFTR NM_000492.4 c.1408G>A (p.Val470Met) coding_substitution   rs213950           0.56732
CFTR NM_000492.4           c.1680-870T>A  intronic_or_splice   rs213965           0.55332
CFTR NM_000492.4          c.1585-9218G>A  intronic_or_splice   rs213953           0.54845
CFTR NM_000492.4   c.2562T>G (p.Thr854=) coding_substitution  rs1042077           0.44737
CFTR NM_000492.4           c.1766+152T>A  intronic_or_splice  rs4148711           0.43526
CFTR NM_000492.4     c.1408= (p.Val470=)          cdna_other   rs213950           0.43268
CFTR NM_000492.4            c.1393-61A>G  intronic_or_splice rs34855237           0.25914
CFTR NM_000492.4           c.4137-139G>A  intronic_or_splice  rs4727855           0.25573
CFTR NM_000492.4            c.274-179G>A  intronic_or_splice  rs1