# FASTA Generation and bHLH Domain Extraction

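This notebook filters the InterPro annotations for the bHLH domain (IPR011598), retrieves the corresponding CDS sequences from the Ensembl REST API, translates them to protein, selects the longest isoform per gene, and extracts the bHLH domain regions.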

**Inputs**
- `data/intermediate/Metadata_CSVs/InterPro_Domains_cleaned.csv`
- `data/intermediate/Metadata_CSVs/Pfam_Domains_cleaned.csv`

**Outputs**
- `data/intermediate/interpro/InterPro_Domains_bHLH_filtered.csv`
- `data/intermediate/bHLH_transcripts_CDS.fasta`
- `data/intermediate/bHLH_transcripts_protein.fasta`
- `data/intermediate/longest_isoform.fasta`
- `data/intermediate/bHLH_domains.fasta`

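**Note**: All paths are resolved relative to the project root, which defaults to the current working directory. Set the `BHLH_PROJECT_ROOT` environment variable to the project root if running from a different working directory.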


In [None]:
import os
from pathlib import Path

import pandas as pd
import requests
from Bio import SeqIO
from Bio.Seq import Seq

# Root directory against which every data path in this notebook is resolved.
# Read from the BHLH_PROJECT_ROOT environment variable when it is set,
# otherwise the current working directory; resolved to an absolute path.
project_root = Path(os.getenv("BHLH_PROJECT_ROOT", ".")).resolve()


def p(*parts: str) -> str:
    """Return the path built by joining *parts* onto ``project_root``.

    Args:
        *parts: Path components appended to ``project_root`` in order.

    Returns:
        The joined path as a plain string. With no components, the project
        root itself.
    """
    return str(project_root.joinpath(*parts))


## 1) Filter InterPro bHLH domains (IPR011598)

In [None]:
# Load the cleaned InterPro annotation table.
interpro = pd.read_csv(
    p("data", "intermediate", "Metadata_CSVs", "InterPro_Domains_cleaned.csv")
)

# Keep only the rows annotated with accession IPR011598; .copy() detaches
# the subset from the full table.
interpro_bhlh = interpro.loc[interpro["interpro"].eq("IPR011598")].copy()

# Write the filtered table, creating the output directory on first run.
out_csv = p("data", "intermediate", "interpro", "InterPro_Domains_bHLH_filtered.csv")
Path(out_csv).parent.mkdir(parents=True, exist_ok=True)
interpro_bhlh.to_csv(out_csv, index=False)

# Quick sanity summary: (rows, columns) and number of distinct transcripts.
print("Filtered InterPro rows:", interpro_bhlh.shape)
print("Unique transcripts:", interpro_bhlh["ensembl_transcript_id"].nunique())


## 2) Retrieve CDS sequences from Ensembl REST

In [None]:
def get_cds_sequence(transcript_id: str) -> str | None:
    """Fetch the CDS nucleotide sequence of one transcript from Ensembl REST.

    Args:
        transcript_id: Identifier interpolated into the ``/sequence/id/``
            URL; the request asks for ``type=cds`` as plain text.

    Returns:
        The response body with surrounding whitespace stripped when the
        server answers with status 200. ``None`` (after printing a
        diagnostic) when the request itself fails or any other status is
        returned.
    """
    url = f"https://rest.ensembl.org/sequence/id/{transcript_id}?type=cds"
    headers = {"Content-Type": "text/plain"}
    try:
        response = requests.get(url, headers=headers, timeout=30)
    except requests.RequestException as exc:
        # A timeout or dropped connection is reported the same way as an
        # HTTP error, so one bad transcript cannot abort a long download.
        print(f"Ensembl request failed for {transcript_id}: {exc}")
        return None
    if response.status_code == 200:
        return response.text.strip()
    print(f"Ensembl error for {transcript_id}: {response.status_code}")
    return None

input_csv = p("data", "intermediate", "interpro", "InterPro_Domains_bHLH_filtered.csv")

# Map transcript_id -> HGNC symbol
bhlh_df = pd.read_csv(input_csv)
transcript_to_hgnc = bhlh_df.set_index("ensembl_transcript_id")["HGNC symbol"].to_dict()
# Each transcript is downloaded once, even if it has several domain rows.
transcript_ids = bhlh_df["ensembl_transcript_id"].dropna().unique()

# Write one FASTA record per transcript, with header ">transcript_id|HGNC symbol".
output_fasta = p("data", "intermediate", "bHLH_transcripts_CDS.fasta")
with open(output_fasta, "w") as fasta_file:
    for transcript_id in transcript_ids:
        sequence = get_cds_sequence(transcript_id)
        if not sequence:
            # Failed or empty download: already reported by get_cds_sequence.
            continue
        hgnc_symbol = transcript_to_hgnc.get(transcript_id, "Unknown")
        if pd.isna(hgnc_symbol):
            # An empty CSV cell is read as NaN; label it "Unknown" rather
            # than writing the literal text "nan" into the header.
            hgnc_symbol = "Unknown"
        fasta_file.write(f">{transcript_id}|{hgnc_symbol}\n{sequence}\n")

print(f"Saved: {output_fasta}")


## 3) Translate CDS to protein

In [None]:
input_fasta = p("data", "intermediate", "bHLH_transcripts_CDS.fasta")
output_fasta = p("data", "intermediate", "bHLH_transcripts_protein.fasta")

# Translate every CDS record and write the protein under the same record id.
with open(output_fasta, "w") as out_fasta:
    for record in SeqIO.parse(input_fasta, "fasta"):
        coding_dna = Seq(str(record.seq).strip())
        seq_length = len(coding_dna)

        remainder = seq_length % 3
        if remainder:
            # The reported "last codon" is the final five nucleotides, which
            # show the last full codon together with the dangling bases.
            last_codon = coding_dna[-5:] if seq_length >= 3 else "Too short"
            print(f"Issue with {record.id}: length {seq_length} (not multiple of 3), last codon: {last_codon}")
            # Drop the dangling bases explicitly: translating a partial
            # codon triggers a Biopython warning, and the trimmed sequence
            # yields the same protein.
            coding_dna = coding_dna[: seq_length - remainder]

        # to_stop=True ends the protein at the first in-frame stop codon.
        protein_seq = coding_dna.translate(to_stop=True)
        out_fasta.write(f">{record.id}\n{protein_seq}\n")

print(f"Saved: {output_fasta}")


## 4) Keep longest isoform and extract bHLH domain

In [None]:
def filter_longest_isoform(nucleotide_fasta: str, output_fasta: str) -> None:
    """Write the translated longest CDS per gene symbol to a protein FASTA.

    Headers in ``nucleotide_fasta`` are split on ``|``: the first field is
    taken as the transcript id and the second as the HGNC symbol
    (``"Unknown"`` when there is no second field). Records whose length is
    not a multiple of three are skipped. For each symbol the longest
    remaining CDS is kept; on a tie the record seen first wins.

    Args:
        nucleotide_fasta: Path of the CDS FASTA to read.
        output_fasta: Path of the protein FASTA to write. Each header is
            ``>transcript_id hgnc_symbol`` and each sequence is the CDS
            translated up to the first stop codon.
    """
    longest_transcripts = {}

    for record in SeqIO.parse(nucleotide_fasta, "fasta"):
        header_parts = record.description.split("|")
        transcript_id = header_parts[0]
        hgnc_symbol = header_parts[1] if len(header_parts) > 1 else "Unknown"

        seq = str(record.seq)
        if len(seq) % 3 != 0:
            continue  # skip non-multiple-of-3 CDS

        # NOTE(review): all records sharing the "Unknown" label compete as
        # if they were one gene -- confirm this is intended.
        current = longest_transcripts.get(hgnc_symbol)
        if current is None or len(seq) > len(current[1]):
            longest_transcripts[hgnc_symbol] = (transcript_id, seq)

    with open(output_fasta, "w") as output_handle:
        for hgnc_symbol, (transcript_id, seq) in longest_transcripts.items():
            protein_seq = str(Seq(seq).translate(to_stop=True))
            output_handle.write(f">{transcript_id} {hgnc_symbol}\n{protein_seq}\n")


def extract_bHLH_domain(protein_fasta: str, interpro_csv: str, output_fasta: str) -> None:
    """Cut the annotated domain span out of each protein and write it as FASTA.

    Rows of ``interpro_csv`` are grouped by ``ensembl_transcript_id``. When
    a transcript has several rows, the span used runs from the smallest
    ``interpro_start`` to the largest ``interpro_end``, so any residues
    between separate hits are included. Transcripts with no record in
    ``protein_fasta`` are silently ignored.

    Args:
        protein_fasta: Path of the protein FASTA; record ids must equal the
            transcript ids used in the CSV.
        interpro_csv: Path of a CSV with columns ``ensembl_transcript_id``,
            ``HGNC symbol``, ``interpro_start`` and ``interpro_end``
            (1-based, inclusive coordinates).
        output_fasta: Path of the FASTA to write; each header is the HGNC
            symbol recorded on the transcript's first row.

    Raises:
        ValueError: If a start or end value cannot be converted to ``int``
            (for example a missing value).
    """
    df = pd.read_csv(interpro_csv)
    domain_ranges = {}

    rows = zip(
        df["ensembl_transcript_id"],
        df["HGNC symbol"],
        df["interpro_start"],
        df["interpro_end"],
    )
    for transcript_id, hgnc_symbol, raw_start, raw_end in rows:
        start, end = int(raw_start), int(raw_end)

        entry = domain_ranges.get(transcript_id)
        if entry is None:
            domain_ranges[transcript_id] = {"HGNC_symbol": hgnc_symbol, "start": start, "end": end}
        else:
            # Widen the span so that it covers every hit of this transcript.
            entry["start"] = min(entry["start"], start)
            entry["end"] = max(entry["end"], end)

    sequences = SeqIO.to_dict(SeqIO.parse(protein_fasta, "fasta"))

    with open(output_fasta, "w") as output_handle:
        for transcript_id, info in domain_ranges.items():
            record = sequences.get(transcript_id)
            if record is None:
                continue
            domain_seq = record.seq[info["start"] - 1 : info["end"]]  # 1-based -> 0-based
            if len(domain_seq) == 0:
                # The annotated span starts beyond the end of the protein;
                # an empty FASTA record would be useless downstream.
                print(
                    f"Skipping {transcript_id}: domain {info['start']}-{info['end']} "
                    f"lies outside the {len(record.seq)}-residue protein"
                )
                continue
            output_handle.write(f">{info['HGNC_symbol']}\n{domain_seq}\n")

# Step 1: from the downloaded CDS records, keep one translated protein per
# gene symbol.
filter_longest_isoform(
    nucleotide_fasta=p("data", "intermediate", "bHLH_transcripts_CDS.fasta"),
    output_fasta=p("data", "intermediate", "longest_isoform.fasta"),
)

# Step 2: cut the annotated bHLH span out of each protein kept in step 1.
extract_bHLH_domain(
    protein_fasta=p("data", "intermediate", "longest_isoform.fasta"),
    interpro_csv=p("data", "intermediate", "interpro", "InterPro_Domains_bHLH_filtered.csv"),
    output_fasta=p("data", "intermediate", "bHLH_domains.fasta"),
)


## Exploratory notes (optional)

- Some CDS sequences do not have a length that is a multiple of three. In the original run, 14 out of 356 fell into this category.
- Example issues that were observed:
  - ENST00000591024|MLX: length 745 (not a multiple of 3), last five nucleotides TAAGC. Since 745 = 3 × 248 + 1 leaves one trailing nucleotide and the last full codon is not a stop, this was treated as non-coding.
  - ENST00000427413|NPAS2: length 623 (not multiple of 3), last codon fragment TATAA. This contains a stop motif but the final codon boundary is ambiguous; this sequence was excluded.
- Current choice: exclude all CDS with length not divisible by 3 from the longest-isoform set.
- The resulting bHLH domain FASTA is intended for downstream multiple sequence alignment and comparison with Lambert et al. (2018).
