# Orthology Plots (Exploratory)

This notebook generates exploratory plots for orthology-derived features, including domain positions and homology group summaries.

**Input**
- `data/intermediate/orthologs/annotated_bHLH_merged_data.csv`

**Outputs**
- `outputs/figures/a3_heatmap.svg` (final A3 heatmap)

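**Note**: Set the `BHLH_PROJECT_ROOT` environment variable to the project root if you run the notebook from a different working directory; otherwise paths are resolved relative to the current working directory.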


In [None]:
import os
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Project root: taken from the BHLH_PROJECT_ROOT environment variable when it
# is set, otherwise the current working directory; resolved to an absolute path.
project_root = Path(os.getenv("BHLH_PROJECT_ROOT", ".")).resolve()


def p(*parts):
    """Return ``project_root`` joined with ``parts`` as a string.

    Parameters
    ----------
    *parts : str
        Path components appended, in order, to ``project_root``.

    Returns
    -------
    str
        String form of the joined path; with no components this is the
        project root itself.
    """
    return str(project_root.joinpath(*parts))


## 1) Load data

In [None]:
# Merged ortholog annotation table, located relative to the project root.
input_csv = p("data", "intermediate", "orthologs", "annotated_bHLH_merged_data.csv")
data = pd.read_csv(input_csv)

# Quick look at the table: numeric summary first, then the column index.
for overview in (data.describe(), data.columns):
    print(overview)


## 2) Taxonomic classification

In [None]:
# Species lists per group; entries are the species names compared against
# the values passed to assign_tax_group.
chordates = [
    'bos_taurus',
    'canis_lupus_familiaris',
    'danio_rerio',
    'gallus_gallus',
    'gorilla_gorilla',
    'lepisosteus_oculatus',
    'macaca_mulatta',
    'monodelphis_domestica',
    'mus_musculus',
    'oryzias_latipes',
    'pan_troglodytes',
    'rattus_norvegicus',
    'xenopus_tropicalis',
    'anolis_carolinensis',
]

arthropods = [
    'drosophila_melanogaster',
    'anopheles_gambiae',
    'tribolium_castaneum',
]
nematodes = ['caenorhabditis_elegans']
annelids = ['helobdella_robusta']
fungi = [
    'saccharomyces_cerevisiae',
    'schizosaccharomyces_pombe',
    'neurospora_crassa',
]


def assign_tax_group(species):
    """Return the taxonomic group label for a species name.

    The species lists are consulted in a fixed order (chordates, arthropods,
    nematodes, annelids, fungi) and the label of the first list containing
    ``species`` is returned. A value found in none of them yields "Other".
    """
    labelled_lists = (
        ("Chordata", chordates),
        ("Arthropoda", arthropods),
        ("Nematoda", nematodes),
        ("Annelida", annelids),
        ("Fungi", fungi),
    )
    for label, members in labelled_lists:
        if species in members:
            return label
    return "Other"

# Label every row with the taxonomic group of its target species.
data["taxonomic_group"] = data["target_species"].map(assign_tax_group)

# Protostomes: the arthropods, the nematode and the annelid in the species set.
# (Previously the nematode and two of the arthropods were listed under
# deuterostomes.)
protostomes = [
    'drosophila_melanogaster', 'anopheles_gambiae', 'tribolium_castaneum',
    'caenorhabditis_elegans', 'helobdella_robusta'
]

# Deuterostomes: only the chordates in the species set. Fungi are not
# bilaterian animals, so they are deliberately absent from both lists and
# fall through to 'Other'.
deuterostomes = [
    'bos_taurus', 'canis_lupus_familiaris', 'danio_rerio', 'gallus_gallus',
    'gorilla_gorilla', 'lepisosteus_oculatus', 'macaca_mulatta',
    'monodelphis_domestica', 'mus_musculus', 'oryzias_latipes',
    'pan_troglodytes', 'rattus_norvegicus', 'xenopus_tropicalis',
    'anolis_carolinensis'
]


def classify_supergroup(species):
    """Classify a species name as 'Protostome', 'Deuterostome' or 'Other'.

    Parameters
    ----------
    species : str
        Species name in the same form as the entries of ``protostomes`` and
        ``deuterostomes``.

    Returns
    -------
    str
        'Protostome' or 'Deuterostome' when the species is in the matching
        list; 'Other' for anything else (including the fungal species).
    """
    if species in protostomes:
        return 'Protostome'
    if species in deuterostomes:
        return 'Deuterostome'
    return 'Other'

# Label every row with the supergroup of its target species.
data['taxonomy_supergroup'] = data['target_species'].map(classify_supergroup)

# Species treated as vertebrates.
vertebrates = [
    'bos_taurus',
    'canis_lupus_familiaris',
    'danio_rerio',
    'gallus_gallus',
    'gorilla_gorilla',
    'lepisosteus_oculatus',
    'macaca_mulatta',
    'monodelphis_domestica',
    'mus_musculus',
    'oryzias_latipes',
    'pan_troglodytes',
    'rattus_norvegicus',
    'xenopus_tropicalis',
    'anolis_carolinensis',
]


def classify_vertebrates(species):
    """Return 'Vertebrate' for species in ``vertebrates``, else 'Invertebrate'.

    Every value not in the list is labelled 'Invertebrate'; there is no
    separate category for unknown or non-animal species.
    """
    if species in vertebrates:
        return 'Vertebrate'
    return 'Invertebrate'

# Label every row as vertebrate / invertebrate from its target species.
data['Vertebrate'] = data['target_species'].map(classify_vertebrates)


## 3) Domain position features

In [None]:
# Midpoint of the relative domain span, computed for the target (T) and the
# query (Q) columns as the mean of the relative start and end.
for suffix in ('T', 'Q'):
    rel_start = data[f'Rel_start_{suffix}']
    rel_end = data[f'Rel_end_{suffix}']
    data[f'Rel_middle_{suffix}'] = (rel_start + rel_end) / 2


## 4) Plots by taxonomy

In [None]:
def _plot_position_by_category(plot_fn, category, value, title, y_label, x_label):
    """Draw one 12x6 categorical plot of ``value`` against ``category`` and show it.

    ``plot_fn`` is the seaborn plotting function to use; it is called with
    ``x``, ``y`` and ``data`` keyword arguments on the notebook-level frame.
    """
    plt.figure(figsize=(12, 6))
    plot_fn(x=category, y=value, data=data)
    plt.xticks(rotation=45)
    plt.title(title)
    plt.ylabel(y_label)
    plt.xlabel(x_label)
    plt.tight_layout()
    plt.show()


# One entry per figure: (plot function, x column, y column, title, y label, x label)
_taxonomy_plots = [
    (sns.boxplot, 'taxonomy_level', 'Rel_start_T',
     'Relative starting position of bHLH domain vs taxonomic level',
     'Relative starting position', 'Taxonomic level'),
    (sns.violinplot, 'taxonomy_level', 'Rel_middle_T',
     'Relative middle position of bHLH domain vs taxonomic level',
     'Relative middle position', 'Taxonomic level'),
    (sns.boxplot, 'taxonomic_group', 'Rel_start_T',
     'Relative starting position of bHLH domain vs taxonomic group',
     'Relative starting position', 'Taxonomic group'),
    (sns.violinplot, 'taxonomic_group', 'Rel_middle_T',
     'Relative middle position of bHLH domain vs taxonomic group',
     'Relative middle position', 'Taxonomic group'),
]

for _spec in _taxonomy_plots:
    _plot_position_by_category(*_spec)


## 5) Distributions and scatter plots

In [None]:
# Stacked histogram of the domain midpoint, coloured by taxonomic group.
plt.figure(figsize=(12, 6))
hist_ax = sns.histplot(data=data, x='Rel_middle_T', hue='taxonomic_group', multiple='stack', palette='viridis', bins=50,
                       edgecolor='black', linewidth=0.3)
plt.title('Relative middle position of bHLH domain by taxonomic group')
plt.xlabel('Relative middle position')
plt.ylabel('Frequency')
# seaborn draws the hue legend itself and the bars carry no labels, so a plain
# plt.legend(...) call would replace that legend with an empty one.
# sns.move_legend re-creates seaborn's legend at the requested location.
sns.move_legend(hist_ax, 'upper left', bbox_to_anchor=(1.0, 1.0), title="Taxonomic group")
plt.tight_layout()
plt.show()

# Overall distribution of the domain midpoint, with a KDE overlay.
plt.figure(figsize=(12, 6))
middle_ax = sns.histplot(
    data=data,
    x='Rel_middle_T',
    bins=30,
    kde=True,
    color='mediumslateblue',
    edgecolor='black',
    linewidth=0.5,
)
middle_ax.set(
    title='Distribution of the relative middle position of bHLH domain',
    xlabel='Relative middle position',
    ylabel='Frequency',
)
plt.tight_layout()
plt.show()

# Distribution of the relative start position (plain histogram, no KDE).
plt.figure(figsize=(10, 6))
start_ax = sns.histplot(data['Rel_start_T'], bins=30, kde=False)
start_ax.set(
    title='Distribution of the relative start position of bHLH domains',
    xlabel='Rel_start_T',
    ylabel='Frequency',
)
plt.tight_layout()
plt.show()

# Relative start position against target domain length, coloured by taxonomy level.
plt.figure(figsize=(10, 6))
start_vs_len_ax = sns.scatterplot(
    data=data,
    x='bHLH_length_T',
    y='Rel_start_T',
    hue='taxonomy_level',
)
start_vs_len_ax.set(
    title='Relative start position vs bHLH length (target)',
    xlabel='bHLH length (target)',
    ylabel='Relative start position (Rel_start_T)',
)
plt.tight_layout()
plt.show()

# Query vs target bHLH domain length, coloured by taxonomy level.
plt.figure(figsize=(10, 6))
sns.scatterplot(data=data, x='bHLH_length_Q', y='bHLH_length_T', hue='taxonomy_level')

# Identity line (y = x). Its extent covers the combined range of both length
# columns so it is not cut short when target lengths fall outside the query
# range (previously only the query min/max were used).
length_columns = data[['bHLH_length_Q', 'bHLH_length_T']]
diag_lo = length_columns.min().min()
diag_hi = length_columns.max().max()
plt.plot([diag_lo, diag_hi], [diag_lo, diag_hi], color='gray', linestyle='--')

plt.title('bHLH domain length: query vs target')
plt.xlabel('bHLH length (query)')
plt.ylabel('bHLH length (target)')
plt.tight_layout()
plt.show()


## 6) Supergroup / vertebrate comparisons

In [None]:
# One entry per figure: (plot function, column keyword arguments, title).
_comparison_plots = (
    (sns.boxplot,
     {'x': 'taxonomy_supergroup', 'y': 'Rel_start_T'},
     'Relative start position by supergroup'),
    (sns.scatterplot,
     {'x': 'bHLH_length_Q', 'y': 'bHLH_length_T', 'hue': 'taxonomy_supergroup'},
     'bHLH length: query vs target by supergroup'),
    (sns.boxplot,
     {'x': 'Vertebrate', 'y': 'Rel_start_T'},
     'Relative start position by vertebrate / invertebrate'),
    (sns.scatterplot,
     {'x': 'bHLH_length_Q', 'y': 'bHLH_length_T', 'hue': 'Vertebrate'},
     'bHLH length: query vs target by vertebrate / invertebrate'),
)

# Draw, title and show each comparison in turn.
for _draw, _columns, _title in _comparison_plots:
    _draw(data=data, **_columns)
    plt.title(_title)
    plt.tight_layout()
    plt.show()


## 7) Quartile heatmap (final A3 version)

In [None]:
# Genes ordered by the median of their domain midpoint (ascending).
# NOTE: this ordering is not used by the heatmap below, which sorts rows by
# their Q1 proportion; it is kept for interactive inspection.
ordered_genes = data.groupby('HGNC symbol')['Rel_middle_T'].median().sort_values().index

# Bin relative start positions into quartiles
data['quartile'] = pd.qcut(data['Rel_start_T'], q=4, labels=['Q1', 'Q2', 'Q3', 'Q4'])

# Row-normalised contingency table: for each gene, the proportion of its rows
# that fall into each quartile.
heatmap_data = pd.crosstab(
    index=data['HGNC symbol'],
    columns=data['quartile'],
    normalize='index'
)

# Genes with the largest share of Q1 rows come first.
heatmap_data_sorted = heatmap_data.sort_values(by='Q1', ascending=False)

# Final A3-sized figure
sns.set_theme(style="white", rc={'font.family': 'sans-serif', 'font.sans-serif': ['Arial', 'Helvetica']})
a3_size_inches = (11.7, 16.5)  # A3 portrait, 297 x 420 mm

fig, ax = plt.subplots(figsize=a3_size_inches)

sns.heatmap(
    heatmap_data_sorted,
    cmap="YlGnBu",
    cbar_kws={'label': 'Proportion', 'shrink': 0.5},
    # Label every row. seaborn places these labels at the cell centres
    # (i + 0.5); setting ticks at range(n) by hand put them on the cell
    # boundaries, half a row away from the data they describe.
    yticklabels=True,
    ax=ax
)

ax.set_title(
    "Domain Position Quartile Distribution per bHLH Gene",
    fontsize=20,
    fontweight='bold',
    pad=20
)
ax.set_xlabel(
    "Domain quartile: Q1 = N-terminal, Q4 = C-terminal",
    fontsize=14,
    labelpad=15
)
ax.set_ylabel("HGNC Symbol", fontsize=14, labelpad=15)

ax.tick_params(axis='y', rotation=0, labelsize=10, length=0)
ax.tick_params(axis='x', rotation=0, labelsize=12)

fig.tight_layout()

# Create the output directory on demand, then write the SVG.
out_svg = p("outputs", "figures", "a3_heatmap.svg")
Path(out_svg).parent.mkdir(parents=True, exist_ok=True)
fig.savefig(out_svg, format="svg")
print("Saved:", out_svg)


## Exploratory notes (optional)

- This notebook is intentionally plot-heavy for visual inspection.
- For the quartile heatmap, only the final A3 version is kept; it is saved to `outputs/figures/a3_heatmap.svg`.
- The dendrogram and faceted KDE plots were removed to reduce clutter; re-add if needed.
