# Ortholog Heatmaps and TOGA Integration

This notebook builds homology heatmaps from Ensembl Compara and integrates TOGA-inferred orthologs for the mammalian species.

**Inputs**
- `data/intermediate/orthologs/annotated_bHLH_merged_data.csv`
- `data/raw/TOGA_orthologs/*.tsv`

**Outputs**
- `data/intermediate/orthologs/annotated_bHLH_merged_data_with_gene_names.csv`
- `data/intermediate/orthologs/TOGA_orthologs_allSpecies.csv`
- `outputs/figures/Homology_type.svg`
- `outputs/figures/TOGAheatmap.svg`

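**Note**: Paths are resolved relative to the current working directory by default; set the `BHLH_PROJECT_ROOT` environment variable to the project root if you run the notebook from a different directory.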


In [None]:
import pandas as pd
import requests
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
from matplotlib.patches import Patch, Rectangle
from pathlib import Path

import os

# Root directory that every data/output path in this notebook is resolved
# against: the BHLH_PROJECT_ROOT environment variable when it is set,
# otherwise the current working directory.
project_root = Path(os.getenv("BHLH_PROJECT_ROOT", ".")).resolve()


def p(*parts):
    """Return the path formed by joining *parts* onto ``project_root``.

    Args:
        *parts: Path components relative to the project root.

    Returns:
        The joined absolute path as a ``str``.
    """
    return str(project_root.joinpath(*parts))


## 1) Add common gene names (one-time enrichment)

In [None]:
# Load the annotated, merged table that the gene-name lookup in this cell enriches.
annotated_csv = p("data", "intermediate", "orthologs", "annotated_bHLH_merged_data.csv")
data = pd.read_csv(annotated_csv)

# One-time lookup for display names (rate-limited)

def get_gene_name(ensembl_id, timeout=30, max_retries=3):
    """Return the Ensembl ``display_name`` for a stable ID, or ``None``.

    Queries ``https://rest.ensembl.org/lookup/id/<ensembl_id>``.

    Args:
        ensembl_id: Ensembl stable ID to look up. Missing values (``None``,
            NaN) and blank strings are not sent to the server.
        timeout: Seconds to wait for the server before giving up on one
            attempt.
        max_retries: Number of additional attempts after a rate-limit
            response (HTTP 429) or a network error.

    Returns:
        The ``display_name`` field of the JSON response, or ``None`` when the
        ID is missing, the lookup fails, or the response has no such field.
    """
    import time  # local import: only needed for the retry back-off

    # A missing ID can never resolve; skip the request entirely.
    if pd.isna(ensembl_id) or not str(ensembl_id).strip():
        return None

    url = f"https://rest.ensembl.org/lookup/id/{ensembl_id}"
    headers = {"Content-Type": "application/json"}

    for attempt in range(max_retries + 1):
        retries_left = attempt < max_retries
        try:
            response = requests.get(url, headers=headers, timeout=timeout)
        except requests.RequestException:
            # Transient network failure: retry, and give up with None rather
            # than aborting a long batch of lookups.
            if retries_left:
                time.sleep(1)
                continue
            return None

        if response.status_code == 429 and retries_left:
            # Rate-limited: wait as long as the server asks before retrying.
            try:
                wait_seconds = float(response.headers.get("Retry-After", 1))
            except ValueError:
                wait_seconds = 1.0
            time.sleep(wait_seconds)
            continue

        if not response.ok:
            return None
        return response.json().get("display_name")

    return None

from tqdm import tqdm

tqdm.pandas()

# Look up every distinct target ID once instead of once per row: repeated and
# missing IDs would otherwise each cost a separate HTTP request.
unique_target_ids = pd.Series(data["target_id"].dropna().unique())
display_names = unique_target_ids.progress_apply(get_gene_name)
name_by_target_id = dict(zip(unique_target_ids, display_names))

# Rows whose target_id is missing, or whose lookup failed, get an empty name.
data["target_gene_name"] = data["target_id"].map(name_by_target_id)

out_gene_names = p("data", "intermediate", "orthologs", "annotated_bHLH_merged_data_with_gene_names.csv")
data.to_csv(out_gene_names, index=False)
print("Saved:", out_gene_names)


## 2) Build homology heatmap (Ensembl Compara)

In [None]:
# Read the gene-name-enriched table (the file saved at the end of section 1).
input_data = p(
    "data",
    "intermediate",
    "orthologs",
    "annotated_bHLH_merged_data_with_gene_names.csv",
)
data_df = pd.read_csv(filepath_or_buffer=input_data)

# Species keys in the left-to-right column order used for the heatmap.
# NOTE(review): presumably ordered by phylogenetic distance from human - confirm.
phylo_order = """
    pan_troglodytes gorilla_gorilla macaca_mulatta bos_taurus
    canis_lupus_familiaris rattus_norvegicus mus_musculus monodelphis_domestica
    gallus_gallus anolis_carolinensis lepisosteus_oculatus oryzias_latipes
    danio_rerio xenopus_tropicalis drosophila_melanogaster tribolium_castaneum
    anopheles_gambiae helobdella_robusta caenorhabditis_elegans neurospora_crassa
    schizosaccharomyces_pombe saccharomyces_cerevisiae
""".split()

# Human gene symbols in the top-to-bottom row order used for the heatmaps.
ordered_list = """
    TFAP4 MLX MLXIPL MLXIP TCFL5 SOHLH1 SOHLH2 MYC MYCN MYCL MAX MNT
    MXD3 MXD4 MXI1 MXD1 SREBF2 SREBF1 MITF TFE3 TFEC TFEB USF3 USF2 USF1
    NCOA1 NCOA2 NCOA3 NPAS2 CLOCK ARNTL2 ARNTL ARNT2 ARNT NPAS4 AHRR AHR
    SIM2 SIM1 NPAS3 NPAS1 HIF1A HIF3A EPAS1 HELT BHLHE41 BHLHE40 HEYL HEY2
    HEY1 HES7 HES6 HES5 HES3 HES2 HES4 HES1 ATOH8 TCF4 TCF3 TCF12 MYOG
    MYF6 MYOD1 MYF5 FIGLA ID1 ID4 ID3 ID2 ASCL2 ASCL1 ASCL4 ASCL5 ASCL3
    TAL1 LYL1 TAL2 NHLH2 NHLH1 MESP2 MSGN1 MESP1 PTF1A FERD3L ATOH7 ATOH1
    BHLHA9 BHLHA15 BHLHE23 BHLHE22 OLIG1 OLIG3 OLIG2 NEUROG2 NEUROG3 NEUROG1
    NEUROD2 NEUROD6 NEUROD4 NEUROD1 TCF21 MSC TCF24 TCF23 TWIST2 TWIST1 HAND2
    HAND1 TCF15 SCX
""".split()

# Display labels for the x-axis, keyed by species key. Each key is the label
# lower-cased with spaces replaced by underscores.
_linnaean_labels = (
    'Pan troglodytes',
    'Gorilla gorilla',
    'Macaca mulatta',
    'Bos taurus',
    'Canis lupus familiaris',
    'Rattus norvegicus',
    'Mus musculus',
    'Monodelphis domestica',
    'Gallus gallus',
    'Anolis carolinensis',
    'Lepisosteus oculatus',
    'Oryzias latipes',
    'Danio rerio',
    'Xenopus tropicalis',
    'Drosophila melanogaster',
    'Tribolium castaneum',
    'Anopheles gambiae',
    'Helobdella robusta',
    'Caenorhabditis elegans',
    'Neurospora crassa',
    'Schizosaccharomyces pombe',
    'Saccharomyces cerevisiae',
)
linnaean_names = {
    label.lower().replace(' ', '_'): label
    for label in _linnaean_labels
}

# Integer code per observed homology type: 'ortholog_one2one' sorts first
# (code 0) when present, the remaining types follow in alphabetical order.
unique_homology_types = sorted(
    data_df['homology_type'].dropna().unique(),
    key=lambda ht: (ht != 'ortholog_one2one', ht),
)
homology_map = dict(zip(unique_homology_types, range(len(unique_homology_types))))

# One cell per (human gene, target species), holding the first homology type
# pandas finds for that pair.
color_pivot = pd.pivot_table(
    data_df,
    values='homology_type',
    index='HGNC symbol',
    columns='target_species_name',
    aggfunc='first',
)

# Replace type names by their codes (NaN where there is no call), then force
# the column and row order expected by the plot.
color_matrix = (
    color_pivot
    .map(lambda ht: homology_map.get(ht, np.nan))
    .reindex(columns=phylo_order)
    .reindex(index=ordered_list)
)

# Pin the colour scale to the full range of homology codes (0 .. n_types - 1).
# Without explicit limits the heatmap rescales to whichever codes survive the
# reindexing above, and the legend colours below no longer match the cells.
n_types = len(unique_homology_types)
max_code = max(n_types - 1, 1)  # avoid a zero-width scale with a single type

plt.figure(figsize=(25, 20))
ax = sns.heatmap(
    color_matrix.astype(float),
    cmap='plasma',
    vmin=0,
    vmax=max_code,
    cbar=False,
    linewidths=0.5,
    linecolor='gray',
    annot=False
)

plt.xticks(
    ticks=np.arange(len(phylo_order)) + 0.5,
    labels=[linnaean_names[sp] for sp in phylo_order],
    rotation=90,
    fontsize=9
)
plt.title('Homology Types', fontsize=20, pad=20)
plt.xlabel('Species (Linnaean)', fontsize=18)
plt.ylabel('Human Gene Name', fontsize=18)
plt.yticks(
    ticks=np.arange(len(color_matrix.index)) + 0.5,
    labels=color_matrix.index.tolist(),
    fontsize=8
)

# Legend swatches use the same code -> colour mapping as the heatmap cells.
cmap_obj = plt.colormaps.get_cmap('plasma')
colors = cmap_obj(np.arange(n_types) / max_code)
handles = [
    Patch(color=color, label=label)
    for color, label in zip(colors, unique_homology_types)
]

plt.legend(
    handles=handles,
    title="Homology Types",
    bbox_to_anchor=(1.04, 1),
    loc='upper left',
    fontsize=12,
    title_fontsize=14
)

# Leave the right-hand 10% of the figure free for the legend.
plt.tight_layout(rect=[0, 0, 0.9, 1])

out_svg = p('outputs', 'figures', 'Homology_type.svg')
Path(out_svg).parent.mkdir(parents=True, exist_ok=True)
plt.savefig(out_svg, dpi=600, bbox_inches='tight', format='svg')
plt.show()


## 3) TOGA inferred orthologs (mammals only)

In [None]:
# Species keys of the eight mammals covered by the TOGA section.
zoonomia_species = """
    pan_troglodytes gorilla_gorilla macaca_mulatta bos_taurus
    canis_lupus_familiaris rattus_norvegicus mus_musculus monodelphis_domestica
""".split()


### Load TOGA ortholog tables

In [None]:
def _read_toga_table(filename):
    """Read one tab-separated table from ``data/raw/TOGA_orthologs``."""
    return pd.read_csv(p("data", "raw", "TOGA_orthologs", filename), sep="\t")


# One frame per species. File names are not uniform: the opossum and rat
# files have no "_orthologs" suffix.
bos_df = _read_toga_table("Bos_Taurus_orthologs.tsv")
gorilla_df = _read_toga_table("Gorilla_gorilla_orthologs.tsv")
chimp_df = _read_toga_table("Pan_troglodytes_orthologs.tsv")
macaca_df = _read_toga_table("Macaca_mulatta_orthologs.tsv")
opossum_df = _read_toga_table("Monodelphis_domestica.tsv")
dog_df = _read_toga_table("Canis_lupus_familiaris_orthologs.tsv")
rat_df = _read_toga_table("Rattus_Norvegicus.tsv")
mus_df = _read_toga_table("Mus_musculus_orthologs.tsv")


### Combine TOGA tables

In [None]:
# Species keys, listed in the same order as the frames in species_df_list.
species_names = [
    "bos_taurus",
    "gorilla_gorilla",
    "pan_troglodytes",
    "macaca_mulatta",
    "monodelphis_domestica",
    "canis_lupus_familiaris",
    "rattus_norvegicus",
    "mus_musculus",
]
species_df_list = [
    bos_df,
    gorilla_df,
    chimp_df,
    macaca_df,
    opossum_df,
    dog_df,
    rat_df,
    mus_df,
]

# Tag each TOGA table (in place) with the species it was loaded for.
for species, df in zip(species_names, species_df_list):
    df['target_species'] = species

# Restrict the Ensembl table to the TOGA species and collect its query gene IDs.
in_toga_species = data_df['target_species'].isin(species_names)
data_df_filtered = data_df.loc[in_toga_species].copy()
ENSG_IDs = data_df_filtered["query_gene"].unique().tolist()

# Stack all TOGA tables and keep only rows whose t_gene is one of those IDs.
df_TOGA = pd.concat(species_df_list, ignore_index=True)
df_TOGA = df_TOGA.loc[df_TOGA['t_gene'].isin(ENSG_IDs)].copy()

# Attach the HGNC symbol recorded for each gene ID in the Ensembl table.
gene_to_symbol = dict(
    zip(data_df_filtered['query_gene'], data_df_filtered['HGNC symbol'])
)
df_TOGA['HGNC symbol'] = df_TOGA['t_gene'].map(gene_to_symbol)

# Use the same column name for the homology call as the Ensembl table.
df_TOGA = df_TOGA.rename(columns={"orthology_class": "homology_type"})

out_toga = p('data', 'intermediate', 'orthologs', 'TOGA_orthologs_allSpecies.csv')
Path(out_toga).parent.mkdir(parents=True, exist_ok=True)
df_TOGA.to_csv(out_toga, index=False)
print("Saved:", out_toga)


## 4) Compare Ensembl vs TOGA homology calls

In [None]:
# Normalize HGNC symbols so both sources use the same letter case.
data_df_filtered["HGNC symbol"] = data_df_filtered["HGNC symbol"].str.upper()
df_TOGA["HGNC symbol"] = df_TOGA["HGNC symbol"].str.upper()

# A comparison row is identified by (human gene symbol, target species).
key_cols = ["HGNC symbol", "target_species"]


def _first_call_per_key(frame, call_column):
    """Return one row per key holding the first homology_type seen for it."""
    return (
        frame[key_cols + ["homology_type"]]
        .drop_duplicates(subset=key_cols, keep="first")
        .rename(columns={"homology_type": call_column})
    )


# Outer merge = union of the keys of both sources. The indicator column
# records which side(s) each key came from. This replaces a per-key loop that
# re-filtered both frames for every key and failed on missing symbols.
df_merged = _first_call_per_key(data_df_filtered, "homology_type_old").merge(
    _first_call_per_key(df_TOGA, "homology_type_toga"),
    on=key_cols,
    how="outer",
    indicator=True,
)

# Presence flags (1/0) for the Ensembl ("old") and TOGA tables.
df_merged["presenza_old"] = (df_merged["_merge"] != "right_only").astype(int)
df_merged["presenza_toga"] = (df_merged["_merge"] != "left_only").astype(int)

# Combined table, sorted by key, with a fixed column order.
df_merged = (
    df_merged
    .sort_values(key_cols)
    .reset_index(drop=True)
    [key_cols + ["presenza_old", "presenza_toga", "homology_type_old", "homology_type_toga"]]
)

# Drop the "ortholog_" prefix so Ensembl calls are spelled like the TOGA
# classes, and mark keys without an Ensembl call explicitly.
df_merged["homology_type_old"] = df_merged["homology_type_old"].str.replace("ortholog_", "", regex=False)
df_merged["homology_type_old"] = df_merged["homology_type_old"].fillna("missing")


## 5) TOGA heatmap with differences overlay

In [None]:
# Display labels for the mammals, in plotting order. Each key is the label
# lower-cased with spaces replaced by underscores.
_mammal_labels = (
    'Pan troglodytes',
    'Gorilla gorilla',
    'Macaca mulatta',
    'Bos taurus',
    'Canis lupus familiaris',
    'Rattus norvegicus',
    'Mus musculus',
    'Monodelphis domestica',
)
linnean_mammals_names = {
    label.lower().replace(' ', '_'): label
    for label in _mammal_labels
}
# Column order of the TOGA heatmap follows the insertion order above.
phylo_mammal_order = [*linnean_mammals_names]

# Grayscale code per Ensembl homology call: 0..3 for the four call types,
# -1 for anything else.
homology_gray_map = {
    **{
        call: code
        for code, call in enumerate(('one2one', 'one2many', 'many2one', 'many2many'))
    },
    np.nan: -1,
}

# Calls that are not keys of the map (e.g. "missing") come back as NaN from
# .map() and are then coded -1.
gray_codes = df_merged['homology_type_old'].map(homology_gray_map)
df_merged['homology_gray'] = gray_codes.fillna(-1)

# Matplotlib marker used for each TOGA class.
symbol_map = dict(
    one2one='o',
    one2many='^',
    many2one='s',
    many2many='D',
    one2zero='X',
)

# True where the Ensembl and TOGA calls disagree; an absent call on either
# side is compared as the literal "missing".
ensembl_call = df_merged["homology_type_old"].fillna("missing")
toga_call = df_merged["homology_type_toga"].fillna("missing")
mask_diff = ensembl_call.ne(toga_call)

# Marker to overlay on disagreeing cells (None elsewhere). TOGA classes with
# no entry in symbol_map, including absent calls, fall back to 'X'.
toga_marker = df_merged["homology_type_toga"].map(symbol_map).fillna('X')
df_merged['symbol'] = np.where(mask_diff, toga_marker, None)

# Rows to plot: genes from ordered_list that occur in df_merged, in list order.
symbols_in_table = set(df_merged['HGNC symbol'].unique())
genes = [gene for gene in ordered_list if gene in symbols_in_table]


def _gene_by_species(value_column):
    """Pivot one df_merged column into a genes x mammal-species matrix."""
    wide = df_merged.pivot_table(
        values=value_column,
        index='HGNC symbol',
        columns='target_species',
        aggfunc='first',
    )
    return wide.reindex(index=genes, columns=phylo_mammal_order)


# Grayscale codes for the cell colours, markers for the difference overlay.
color_matrix = _gene_by_species('homology_gray')
symbol_matrix = _gene_by_species('symbol')

plt.figure(figsize=(25, 50))

# Five unit-wide bins: -1 plus the four homology codes 0..3.
cmap = plt.cm.Greys
bounds = list(range(-1, 5))
norm = mcolors.BoundaryNorm(bounds, cmap.N)

ax = sns.heatmap(
    color_matrix,
    cmap=cmap,
    norm=norm,
    cbar=False,
    linewidths=0.5,
    linecolor='gray',
)

# Black frame around the whole grid.
n_rows, n_cols = color_matrix.shape
frame = Rectangle(
    (0, 0),
    n_cols,
    n_rows,
    fill=False,
    edgecolor='black',
    linewidth=2,
)
ax.add_patch(frame)

# Overlay a green marker on every cell that has a symbol, centred in the cell.
for i, gene in enumerate(color_matrix.index):
    row_symbols = symbol_matrix.loc[gene]
    for j, species in enumerate(color_matrix.columns):
        symb = row_symbols[species]
        if pd.isna(symb):
            continue
        ax.scatter(
            j + 0.5,
            i + 0.5,
            marker=symb,
            color='green',
            s=50,
            linewidths=0.5,
            edgecolors='black',
        )

species_labels = [linnean_mammals_names[sp] for sp in phylo_mammal_order]
plt.xticks(
    ticks=np.arange(len(phylo_mammal_order)) + 0.5,
    labels=species_labels,
    rotation=90,
    fontsize=10,
)
plt.yticks(
    ticks=np.arange(n_rows) + 0.5,
    labels=color_matrix.index,
    fontsize=9,
)
plt.title('Homology Type ENSEMBL Compara + TOGA Differences', fontsize=18, pad=20)

# First legend: what each overlay marker means.
toga_marker_labels = (
    ('o', 'TOGA: one2one'),
    ('^', 'TOGA: one2many'),
    ('s', 'TOGA: many2one'),
    ('D', 'TOGA: many2many'),
    ('X', 'TOGA: one2zero'),
)
legend_elements = [
    Patch(facecolor='lightgray', edgecolor='black', label='Unvaried in TOGA'),
]
legend_elements += [
    plt.Line2D(
        [0], [0],
        marker=marker,
        color='w',
        label=label,
        markerfacecolor='green',
        markeredgecolor='black',
        markersize=10,
    )
    for marker, label in toga_marker_labels
]

leg1 = plt.legend(
    handles=legend_elements,
    title='TOGA Homology Types (if different)',
    bbox_to_anchor=(1.02, 1),
    loc='upper left',
    fontsize=10,
    title_fontsize=12,
)

# A later plt.legend() call replaces the current legend, so re-attach the
# first one as a plain artist to keep both on the figure.
plt.gca().add_artist(leg1)

# Second legend: gray shade per Ensembl homology code.
inv_homology_gray_map = dict(
    enumerate(('one2one', 'one2many', 'many2one', 'many2many'))
)
inv_homology_gray_map[-1] = 'Missing'

bin_values = bounds[:-1]
gray_colors = [cmap(norm(value)) for value in bin_values]

color_legend_elements = [
    Patch(facecolor=shade, edgecolor='black', label=inv_homology_gray_map[value])
    for value, shade in zip(bin_values, gray_colors)
]

plt.legend(
    handles=color_legend_elements,
    title='Old Homology Types',
    bbox_to_anchor=(1.02, 0.96),
    loc='upper left',
    fontsize=10,
    title_fontsize=12,
)

# Leave the right-hand 15% of the figure free for the legends.
plt.tight_layout(rect=[0, 0, 0.85, 1])

out_svg = p('outputs', 'figures', 'TOGAheatmap.svg')
Path(out_svg).parent.mkdir(parents=True, exist_ok=True)
plt.savefig(out_svg, format='svg', dpi=500, bbox_inches='tight')
plt.show()


## Exploratory notes (optional)

- The gene-name enrichment step (`get_gene_name`) queries the Ensembl REST API, which is rate-limited; run it once and reuse the cached results in `annotated_bHLH_merged_data_with_gene_names.csv`.
- The Ensembl Compara heatmap colors each cell by its `homology_type` category; `ortholog_one2one` is forced to the first color bin for readability, and the remaining types follow in alphabetical order.
- The TOGA comparison focuses on the 8 mammalian species common to both datasets.
- Use the `df_merged` table to inspect disagreements between Ensembl and TOGA homology calls.
- Suggested checks:
  - number of unique genes per dataset
  - list of genes with missing TOGA entries
  - distribution of homology types in both datasets


In [None]:
# Optional checks
sources = (("Ensembl", data_df_filtered), ("TOGA", df_TOGA))

# Number of distinct gene symbols in each source.
for source, frame in sources:
    print(f"Unique HGNC symbols ({source}):", frame['HGNC symbol'].nunique())

# Keys that have no TOGA homology call.
missing_toga = df_merged.loc[df_merged['homology_type_toga'].isna()]
print("Missing TOGA entries:", missing_toga.shape)

# Distribution of homology types per source, counting missing values too.
for source, frame in sources:
    print(f"{source} homology types:")
    print(frame['homology_type'].value_counts(dropna=False))
