# Ortholog Retrieval and InterProScan Integration

This notebook retrieves orthologs via Ensembl REST, prepares the FASTA for InterProScan, parses InterProScan output, and merges ortholog metadata with bHLH domain coordinates.

**Inputs**
- `data/intermediate/table_input.csv`
- `data/intermediate/Metadata_CSVs/InterPro_Domains_cleaned.csv`
- `data/intermediate/Metadata_CSVs/Transcript_Attributes_cleaned.csv`
- `data/intermediate/interpro/interpro_output.tsv` (from InterProScan)

**Outputs**
- `data/intermediate/orthologs/homology_results.csv`
- `data/intermediate/interpro/interpro_input.fasta`
- `data/intermediate/interpro/OutputInterProScan.tsv`
- `data/intermediate/interpro/IPR011598_IPRScan_final.tsv`
- `data/intermediate/orthologs/in&out_bHLH_data.csv`
- `data/intermediate/orthologs/annotated_bHLH_merged_data.csv`

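**Note**: Set the `BHLH_PROJECT_ROOT` environment variable to the project root when running the notebook from a different working directory; if it is unset, all paths are resolved relative to the current working directory.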


In [None]:
import os
import time
from pathlib import Path

import pandas as pd
import requests

# Root directory that every data path in this notebook is resolved against.
# Read from the BHLH_PROJECT_ROOT environment variable, falling back to the
# current working directory, and converted to an absolute path once here.
project_root = Path(os.getenv("BHLH_PROJECT_ROOT", ".")).resolve()


def p(*parts: str) -> str:
    """Return the path formed by joining ``parts`` under ``project_root``.

    Args:
        *parts: Path components relative to the project root. With no
            components the project root itself is returned.

    Returns:
        The joined path as a plain string.
    """
    return str(project_root.joinpath(*parts))


## 1) Load inputs and get ENSG list

In [None]:
# Cleaned metadata tables; both live in the same Metadata_CSVs folder and are
# read in the order transcripts -> interpro.
transcripts, interpro = (
    pd.read_csv(p("data", "intermediate", "Metadata_CSVs", file_name))
    for file_name in (
        "Transcript_Attributes_cleaned.csv",
        "InterPro_Domains_cleaned.csv",
    )
)

# Table whose `ensembl_gene_id` column supplies the human query genes.
table_input = pd.read_csv(p("data", "intermediate", "table_input.csv"))

# Distinct, non-missing gene IDs, kept in order of first appearance.
ENSG_IDs = table_input["ensembl_gene_id"].dropna().drop_duplicates().tolist()
print(f"ENSG IDs: {len(ENSG_IDs)}")


## 2) Define target species and compara group

In [None]:
# Species queried for every human gene. The homology loop walks this list in
# order, so the order here determines the row order of the results table.
target_species = [
    "drosophila_melanogaster",
    "nematostella_vectensis",
    "helobdella_robusta",
    "daphnia_magna",
    "tribolium_castaneum",
    "branchiostoma_floridae",
    "bos_taurus",
    "canis_lupus_familiaris",
    "danio_rerio",
    "gallus_gallus",
    "gorilla_gorilla",
    "lepisosteus_oculatus",
    "macaca_mulatta",
    "monodelphis_domestica",
    "mus_musculus",
    "oryzias_latipes",
    "pan_troglodytes",
    "rattus_norvegicus",
    "xenopus_tropicalis",
    "anolis_carolinensis",
    "anopheles_gambiae",
    "caenorhabditis_elegans",
    "ciona_intestinalis",
    "ixodes_scapularis",
    "monosiga_brevicollis",
    "saccharomyces_cerevisiae",
    "schizosaccharomyces_pombe",
    "neurospora_crassa",
    "candida_albicans",
]

# Species whose homology requests use the "vertebrates" compara value; every
# other species falls back to "pan_homology".
vertebrate_species = {
    "anolis_carolinensis",
    "bos_taurus",
    "canis_lupus_familiaris",
    "danio_rerio",
    "gallus_gallus",
    "gorilla_gorilla",
    "homo_sapiens",
    "lepisosteus_oculatus",
    "macaca_mulatta",
    "monodelphis_domestica",
    "mus_musculus",
    "oryzias_latipes",
    "pan_troglodytes",
    "rattus_norvegicus",
    "xenopus_tropicalis",
}


def get_compara_group(species: str) -> str:
    """Return the compara value to request for ``species``.

    Args:
        species: Species name, matched exactly (case-sensitive) against
            ``vertebrate_species``.

    Returns:
        ``"vertebrates"`` when the species is in ``vertebrate_species``,
        otherwise ``"pan_homology"``.
    """
    if species in vertebrate_species:
        return "vertebrates"
    return "pan_homology"


## 3) Fetch orthologs via Ensembl REST

In [None]:
base_url = "https://rest.ensembl.org/homology/id"
headers = {"Content-Type": "application/json"}

# Seconds to wait for a single response before giving up on that request, so
# one stalled connection cannot hang the whole run.
REQUEST_TIMEOUT_S = 30
# How many times a rate-limited (HTTP 429) request is retried before it is
# reported as an error.
MAX_RATE_LIMIT_RETRIES = 3

# Column order of the results table; the two sequence columns come last for
# readability. Declaring the columns up front also keeps the CSV header
# intact when no homology was returned at all.
RESULT_COLUMNS = [
    "query_gene", "target_species", "compara", "homology_type",
    "taxonomy_level", "dn_ds",
    "source_id", "source_protein", "source_species",
    "source_perc_id", "source_perc_pos",
    "target_id", "target_protein", "target_species_name",
    "target_perc_id", "target_perc_pos", "cigar_line",
    "source_seq", "target_seq",
]


def _retry_after_seconds(response):
    """Return the delay requested by a 429 response, defaulting to 1 second."""
    try:
        return max(float(response.headers.get("Retry-After", 1.0)), 0.0)
    except ValueError:
        # Header present but not a plain number of seconds.
        return 1.0


def _fetch_homologies(gene_id, species, compara):
    """Return the homology records for one gene/species pair.

    Failures (network errors, non-2xx status, undecodable JSON) are printed
    and yield an empty list so that the remaining requests still run.
    """
    url = f"{base_url}/homo_sapiens/{gene_id}"
    params = {
        "target_species": species,
        "compara": compara,
        "type": "all",
        "sequence": "protein",
        "aligned": 0,
        "cigar_line": 1,
        "format": "full",
    }
    label = f"{gene_id} -> {species} ({compara})"

    response = None
    for attempt in range(MAX_RATE_LIMIT_RETRIES + 1):
        try:
            response = requests.get(
                url, params=params, headers=headers, timeout=REQUEST_TIMEOUT_S
            )
        except requests.RequestException as exc:
            print(f"Request failed for {label}: {exc}")
            return []
        if response.status_code != 429:
            break
        if attempt < MAX_RATE_LIMIT_RETRIES:
            time.sleep(_retry_after_seconds(response))

    if not response.ok:
        print(f"Error for {label}: {response.status_code}")
        return []

    try:
        payload = response.json()
    except ValueError as exc:
        print(f"Invalid JSON for {label}: {exc}")
        return []

    # Only the first entry of "data" is used (one query ID per request); an
    # empty or missing list simply means there is nothing to record.
    entries = payload.get("data") or []
    if not entries:
        return []
    return entries[0].get("homologies", [])


def _homology_to_row(homology, gene_id, species, compara):
    """Flatten one homology record into a row of the results table."""
    source = homology["source"]
    target = homology["target"]
    return {
        "query_gene": gene_id,
        "target_species": species,
        "compara": compara,
        "homology_type": homology.get("type"),
        "taxonomy_level": homology.get("taxonomy_level"),
        "dn_ds": homology.get("dn_ds"),
        "source_id": source.get("id"),
        "source_protein": source.get("protein_id"),
        "source_seq": source.get("seq"),
        "source_species": source.get("species"),
        "source_perc_id": source.get("perc_id"),
        "source_perc_pos": source.get("perc_pos"),
        "target_id": target.get("id"),
        "target_protein": target.get("protein_id"),
        "target_seq": target.get("seq"),
        "target_species_name": target.get("species"),
        "target_perc_id": target.get("perc_id"),
        "target_perc_pos": target.get("perc_pos"),
        "cigar_line": target.get("cigar_line"),
    }


results = []

for gene_id in ENSG_IDs:
    for species in target_species:
        compara = get_compara_group(species)
        results.extend(
            _homology_to_row(homology, gene_id, species, compara)
            for homology in _fetch_homologies(gene_id, species, compara)
        )
        # Short pause between requests to stay polite towards the server.
        time.sleep(0.1)

results_df = pd.DataFrame(results, columns=RESULT_COLUMNS)

out_csv = p("data", "intermediate", "orthologs", "homology_results.csv")
Path(out_csv).parent.mkdir(parents=True, exist_ok=True)
results_df.to_csv(out_csv, index=False)
print("Saved:", out_csv)


## 4) Build InterProScan input FASTA

In [None]:
# Headers already written to the FASTA file (query and target records alike).
# Each header is emitted once: a repeated header would produce repeated
# InterProScan hits, which the later left merge on the header parts would
# turn into duplicated ortholog rows.
written_queries = set()

out_fasta = p("data", "intermediate", "interpro", "interpro_input.fasta")
Path(out_fasta).parent.mkdir(parents=True, exist_ok=True)

with open(out_fasta, "w", encoding="utf-8") as f:
    for row in results_df.itertuples(index=False):
        # Headers are "<query gene>_<protein-bearing gene ID>"; the merge
        # step later splits them on the first underscore.
        records = (
            (f"{row.query_gene}_{row.source_id}", row.source_seq),
            (f"{row.query_gene}_{row.target_id}", row.target_seq),
        )
        for header, sequence in records:
            if header in written_queries:
                continue
            # Skip records without a sequence instead of writing the text
            # "None"/"nan" as if it were a protein.
            if pd.isna(sequence) or not sequence:
                continue
            f.write(f">{header}\n{sequence}\n")
            written_queries.add(header)

print("Saved:", out_fasta)


## 5) Parse InterProScan output

In [None]:
# The 13 columns used by the rest of this notebook, in file order.
interpro_columns = [
    "FASTA Header", "MD5 Code", "Length_target", "Analysis", "Signature Accession",
    "Signature Description", "Start_T", "Stop_T", "Score", "Status", "Date",
    "InterPro ID", "InterPro Description",
]
# InterProScan can append GO and pathway columns, and some runs omit the
# trailing InterPro columns. Naming all 15 possible fields at read time lets
# pandas pad short rows with NaN instead of failing on a column-count
# mismatch; only the 13 columns above are kept afterwards.
interpro_optional_columns = ["GO Annotations", "Pathways"]

interpro_out = pd.read_csv(
    p("data", "intermediate", "interpro", "interpro_output.tsv"),
    sep="\t",
    header=None,
    names=interpro_columns + interpro_optional_columns,
)[interpro_columns]

# Re-export the same table with a header row for downstream steps.
out_tsv = p("data", "intermediate", "interpro", "OutputInterProScan.tsv")
interpro_out.to_csv(out_tsv, sep="\t", index=False)
print("Saved:", out_tsv)


## 6) Filter bHLH domains (IPR011598): keep PF00010 hits and recover sequences without one

In [None]:
# All hits annotated with the InterPro entry IPR011598.
is_ipr011598 = interpro_out["InterPro ID"] == "IPR011598"
ipr_df = interpro_out.loc[is_ipr011598].copy()

# Hits whose signature accession is PF00010 are taken as-is.
# NOTE(review): Stop_T - Start_T is one less than the residue count if the
# coordinates are 1-based inclusive — confirm this is the intended length.
pf00010_df = (
    ipr_df.loc[ipr_df["Signature Accession"] == "PF00010"]
    .copy()
    .assign(domain_length=lambda hits: hits["Stop_T"] - hits["Start_T"])
)

# Sequences with an IPR011598 hit but no PF00010 hit need a fallback.
all_headers = set(ipr_df["FASTA Header"])
pf00010_headers = set(pf00010_df["FASTA Header"])
missing_pf00010_headers = all_headers.difference(pf00010_headers)

lacks_pf00010 = ipr_df["FASTA Header"].isin(missing_pf00010_headers)
has_coordinates = ipr_df["Start_T"].notnull() & ipr_df["Stop_T"].notnull()
recovery_rows = ipr_df.loc[lacks_pf00010 & has_coordinates].copy()
recovery_rows["domain_length"] = recovery_rows["Stop_T"] - recovery_rows["Start_T"]

# For each such sequence keep a single hit: the one with the largest
# domain_length (first row per header after sorting longest-first).
longest_first = recovery_rows.sort_values("domain_length", ascending=False)
best_recovery = longest_first.drop_duplicates("FASTA Header")

# PF00010 hits first, recovered hits after them.
final_df = pd.concat([pf00010_df, best_recovery])

out_final = p("data", "intermediate", "interpro", "IPR011598_IPRScan_final.tsv")
final_df.to_csv(out_final, sep="\t", index=False)
print("Saved:", out_final)


## 7) Merge InterProScan with orthologs

In [None]:
# Inputs written by the previous cells, and the merged table produced here.
interpro_file_path = p("data", "intermediate", "interpro", "IPR011598_IPRScan_final.tsv")
ortholog_file_path = p("data", "intermediate", "orthologs", "homology_results.csv")
output_file_path = p("data", "intermediate", "orthologs", "in&out_bHLH_data.csv")

df_interpro = pd.read_csv(interpro_file_path, sep="\t")
df_ortholog = pd.read_csv(ortholog_file_path)

# Discard a leading index column left behind by an earlier CSV export.
leading_name = df_ortholog.columns[0]
if not leading_name.strip() or "Unnamed" in leading_name:
    df_ortholog = df_ortholog.iloc[:, 1:]

# FASTA headers are "<query gene>_<scanned protein ID>"; split on the first
# underscore only, so underscores inside the protein ID are preserved.
split_header = df_interpro["FASTA Header"].str.split("_", n=1, expand=True)
df_interpro = df_interpro.assign(
    InterPro_QueryGene=split_header[0],
    InterPro_ScannedProteinID=split_header[1],
)

# Left join: every ortholog row is kept, with domain columns filled in only
# where the target sequence has a retained InterProScan hit.
merged_df = df_ortholog.merge(
    df_interpro,
    how="left",
    left_on=["query_gene", "target_id"],
    right_on=["InterPro_QueryGene", "InterPro_ScannedProteinID"],
)

# The helper join keys duplicate query_gene / target_id, so drop them.
merged_df = merged_df.drop(
    columns=["InterPro_QueryGene", "InterPro_ScannedProteinID"],
    errors="ignore",
)

# Move sequences to the end
sequence_columns = ["source_seq", "target_seq"]
cols = [name for name in merged_df.columns if name not in sequence_columns]
cols += [name for name in sequence_columns if name in merged_df.columns]
merged_df = merged_df[cols]

merged_df.to_csv(output_file_path, index=False)
print("Saved:", output_file_path)


## 8) Enrich with human bHLH coordinates and export final table

In [None]:
all_data_df = pd.read_csv(p("data", "intermediate", "orthologs", "in&out_bHLH_data.csv"))
info_df = pd.read_csv(p("data", "intermediate", "table_input.csv"))

# Human-side annotation attached to every ortholog row.
columns_to_add = ["ensembl_gene_id", "HGNC symbol", "interpro_start", "interpro_end", "length"]
# Rows that are identical across these columns carry no extra information,
# but each one would repeat every ortholog row of that gene in the left
# merge below, so exact duplicates are removed first.
info_subset = info_df[columns_to_add].drop_duplicates()

df_merged = pd.merge(
    all_data_df,
    info_subset,
    left_on="query_gene",
    right_on="ensembl_gene_id",
    how="left",
).drop(columns=["ensembl_gene_id"])  # same values as query_gene

# Rename the human columns to the *_Q (query) convention that mirrors the
# *_T (target) columns coming from InterProScan.
df_merged = df_merged.rename(columns={
    "interpro_start": "Start_Q",
    "interpro_end": "Stop_Q",
    "length": "Length_query",
})

# Domain boundaries as a fraction of the full protein length.
df_merged["Rel_start_Q"] = df_merged["Start_Q"] / df_merged["Length_query"]
df_merged["Rel_end_Q"] = df_merged["Stop_Q"] / df_merged["Length_query"]
df_merged["Rel_start_T"] = df_merged["Start_T"] / df_merged["Length_target"]
df_merged["Rel_end_T"] = df_merged["Stop_T"] / df_merged["Length_target"]

# Domain lengths, absolute and relative to the protein length.
# NOTE(review): Stop - Start is one less than the residue count if the
# coordinates are 1-based inclusive — confirm this is the intended length.
df_merged["bHLH_length_Q"] = df_merged["Stop_Q"] - df_merged["Start_Q"]
df_merged["bHLH_length_T"] = df_merged["Stop_T"] - df_merged["Start_T"]
df_merged["bHLH_rel_length_Q"] = df_merged["bHLH_length_Q"] / df_merged["Length_query"]
df_merged["bHLH_rel_length_T"] = df_merged["bHLH_length_T"] / df_merged["Length_target"]

# Preferred column order of the exported table; any column not listed here
# is appended afterwards in its existing order.
col_order = [
    "HGNC symbol", "query_gene", "target_id",
    "Start_Q", "Stop_Q", "Rel_start_Q", "Rel_end_Q",
    "Start_T", "Stop_T", "Rel_start_T", "Rel_end_T",
    "bHLH_length_Q", "bHLH_rel_length_Q",
    "bHLH_length_T", "bHLH_rel_length_T",
    "Length_query", "Length_target", "target_perc_id", "target_perc_pos", "domain_length",
    "homology_type", "taxonomy_level", "compara",
    "source_id", "source_protein", "source_perc_id", "source_perc_pos",
    "target_protein", "target_species", "target_species_name", "Analysis", "Signature Accession",
    "Signature Description", "Score", "cigar_line", "FASTA Header", "MD5 Code",
    "source_seq", "target_seq",
]

remaining_cols = [c for c in df_merged.columns if c not in col_order]
final_cols = col_order + remaining_cols

output_path = p("data", "intermediate", "orthologs", "annotated_bHLH_merged_data.csv")
df_merged[final_cols].to_csv(output_path, index=False)
print("Saved:", output_path)


## Exploratory notes (optional)

- A missing gene was temporarily added (`ENSG00000120669`) to fix a discrepancy, but later inspection suggested a duplicated SOHLH entry was the real reason for the mismatch.
- The set of target species listed above represents the current analysis scope.
- Use `print(results_df["target_species_name"].unique())` to verify the final species included.
