In [1]:
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt
from Bio import SeqIO
from io import StringIO
import requests

# ---- paths ----
# Remote inputs are addressed with plain URL strings (pathlib.Path models
# filesystem paths, not URLs); generated files go to a local, writable folder.
DATA = "https://raw.githubusercontent.com/aarnavp009/AarnavProjects/main"
protein_fasta = DATA + "/CFTR_protein.fasta"
variants_csv = DATA + "/cftr_trimmed_from_table.csv"

# Create the output folder (and any missing parents); no error if it exists.
OUT = Path("outputs")
OUT.mkdir(parents=True, exist_ok=True)

# ---- load protein ----
# SeqIO.parse needs a local file or a text handle rather than a URL, so the
# FASTA is downloaded with requests and parsed from an in-memory StringIO.
r = requests.get(protein_fasta, timeout=30)
r.raise_for_status()  # abort on any HTTP 4xx/5xx response

# Only the first record in the file is used. Passing a default to next()
# avoids a bare StopIteration when the response holds no FASTA record
# (e.g. an empty file or a non-FASTA page returned with status 200).
first_record = next(SeqIO.parse(StringIO(r.text), "fasta"), None)
if first_record is None:
    raise ValueError(f"No FASTA record found in {protein_fasta}")

protein_seq = str(first_record.seq)
protein_length = len(protein_seq)  # number of residues in the sequence

# ---- load variants ----
# pandas reads the CSV straight from the URL. The gnomad_frequency column is
# then forced to a numeric dtype: entries that cannot be parsed as numbers
# become NaN (errors="coerce") instead of raising.
df = pd.read_csv(variants_csv).assign(
    gnomad_frequency=lambda frame: pd.to_numeric(
        frame["gnomad_frequency"], errors="coerce"
    )
)

# ---- basic stats ----
# Headline numbers for this run. "variant_types" holds the whole
# {variant_type: count} mapping as a single value, so it ends up as one dict
# in one cell of the table below. Series.mean() skips NaN by default, so
# frequencies that were coerced to NaN do not affect the mean.
summary = dict(
    protein_length_aa=protein_length,
    total_variants=df.shape[0],
    variant_types=df["variant_type"].value_counts().to_dict(),
    mean_gnomad_frequency=df["gnomad_frequency"].mean(),
)

# One (metric, value) row per summary entry, written without the index column.
summary_df = pd.DataFrame(list(summary.items()), columns=["metric", "value"])
summary_df.to_csv(OUT / "cftr_summary.csv", index=False)

# ---- plot ----
# Histogram of the non-missing gnomAD frequencies, drawn on an explicit
# Figure/Axes pair. The figure is saved to disk and then closed, so it is
# not rendered inline below the cell.
fig, ax = plt.subplots()
df["gnomad_frequency"].dropna().hist(bins=30, ax=ax)
ax.set_title("gnomAD frequency distribution")
ax.set_xlabel("gnomad_frequency")
ax.set_ylabel("count")
fig.tight_layout()
fig.savefig(OUT / "gnomad_frequency_hist.png", dpi=200)
plt.close(fig)

# Report where each artifact was written, then show the summary table.
for saved_name in ("cftr_summary.csv", "gnomad_frequency_hist.png"):
    print("Saved:", OUT / saved_name)
print(summary_df)

Saved: outputs\cftr_summary.csv
Saved: outputs\gnomad_frequency_hist.png
                  metric                                              value
0      protein_length_aa                                               1480
1         total_variants                                               4586
2          variant_types  {'coding_substitution': 2903, 'intronic_or_spl...
3  mean_gnomad_frequency                                           0.007198
