In [None]:
from Bio import SeqIO
import pandas as pd

# -------------------------------------------------------------
# CONFIG
# -------------------------------------------------------------
protein_fasta = "../data/ensembl/Homo_sapiens.GRCh38.pep.all.fa"

# -------------------------------------------------------------
# PARSE FASTA
# -------------------------------------------------------------
records = []
missing_symbol, malformed = 0, 0

for rec in SeqIO.parse(protein_fasta, "fasta"):
    try:
        # Extract attributes in the FASTA description (key:value pairs)
        parts = {k: v for k, v in (tok.split(":", 1) for tok in rec.description.split() if ":" in tok)}
        biotype = parts.get("gene_biotype", "").strip()
        gene_symbol = parts.get("gene_symbol", "").strip()

        # Only keep valid protein_coding genes
        if biotype == "protein_coding" and gene_symbol:
            records.append({
                "gene_symbol": gene_symbol,
                "length": len(rec.seq),
                "seq": str(rec.seq).strip().upper().replace("*", "")
            })
        elif not gene_symbol:
            missing_symbol += 1
    except Exception:
        malformed += 1

# -------------------------------------------------------------
# RESULTS
# -------------------------------------------------------------
df = pd.DataFrame(records).drop_duplicates("gene_symbol").reset_index(drop=True)

print(f"‚úÖ Parsed {len(df):,} unique protein-coding genes")
print(f"‚ö†Ô∏è Missing gene_symbol: {missing_symbol}")
print(f"‚ö†Ô∏è Malformed entries:   {malformed}")

# Optional: quick summary
print("\nSequence length stats:")
print(df["length"].describe([0.5, 0.9, 0.95, 0.99]).to_string())

# Save for later use 
out_path = "../data/ensembl/protein_coding_genes.csv"
df.to_csv(out_path, index=False)
print(f"\nüíæ Saved: {out_path}")


‚úÖ Parsed 19,477 unique protein-coding genes
‚ö†Ô∏è Missing gene_symbol: 1533
‚ö†Ô∏è Malformed entries:   0

Sequence length stats:
count    19477.000000
mean       484.967551
std        554.951638
min          1.000000
50%        346.000000
90%        976.000000
95%       1332.000000
99%       2525.320000
max      27118.000000

üíæ Saved: ../data/ensembl/protein_coding_genes.csv


In [15]:
# 17944 genes with symbols
df.shape

(19477, 3)