# 3'UTR and RBP Analysis

This notebook processes 3'UTR data for bHLH genes, computes UTR statistics, and builds a binary RBP binding matrix for downstream clustering.

**Inputs**
- `data/raw/RBP_3UTR_data/3UTR_merged_clean.csv`
- `data/raw/LS_classes.csv`

**Outputs**
- `data/intermediate/rbp/Violin_plot_3UTR.csv`
- `data/intermediate/rbp/RBP_binary_matrix.csv`

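**Note**: All paths are resolved relative to the project root. Set the `BHLH_PROJECT_ROOT` environment variable to that directory when running the notebook from a different working directory; if it is unset, the current working directory is used.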


In [None]:
import os
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd

# Root directory that every data path in this notebook is resolved against.
# Read from the BHLH_PROJECT_ROOT environment variable; falls back to the
# current working directory when the variable is unset.
project_root = Path(os.getenv("BHLH_PROJECT_ROOT", ".")).resolve()


def p(*parts):
    """Join ``parts`` onto ``project_root`` and return the result as a string.

    Parameters
    ----------
    *parts : str
        Path components below the project root, e.g. ``"data", "raw", "x.csv"``.

    Returns
    -------
    str
        The combined path. It is absolute because ``project_root`` is resolved.
    """
    return str(project_root.joinpath(*parts))


## 1) Load input data

In [None]:
# Resolve both raw input files against the project root before reading them.
utr_csv_path = p("data", "raw", "RBP_3UTR_data", "3UTR_merged_clean.csv")
ls_classes_csv_path = p("data", "raw", "LS_classes.csv")

UTR_data = pd.read_csv(utr_csv_path)
LS_classes = pd.read_csv(ls_classes_csv_path)

# Quick sanity check: first rows of the UTR table and the number of distinct genes.
n_unique_genes = UTR_data["Gene name"].nunique()
print(UTR_data.head())
print("Unique genes:", n_unique_genes)


## 2) Ordered gene list

In [None]:
# Reference list of gene symbols in a fixed display order. Later cells use it to
# order the rows of the RBP binary matrix before clustering and to report genes
# that are missing from, or extra in, the loaded dataset.
# NOTE(review): the order presumably follows a family/phylogenetic grouping of
# the bHLH genes — confirm the source of this ordering.
ordered_list = [
    "TFAP4", "MLX", "MLXIPL", "MLXIP", "TCFL5", "SOHLH1", "SOHLH2", "MYC", "MYCN", "MYCL", "MAX", "MNT",
    "MXD3", "MXD4", "MXI1", "MXD1", "SREBF2", "SREBF1", "MITF", "TFE3", "TFEC", "TFEB", "USF3", "USF2", "USF1",
    "NCOA1", "NCOA2", "NCOA3", "NPAS2", "CLOCK", "ARNTL2", "ARNTL", "ARNT2", "ARNT", "NPAS4", "AHRR", "AHR",
    "SIM2", "SIM1", "NPAS3", "NPAS1", "HIF1A", "HIF3A", "EPAS1", "HELT", "BHLHE41", "BHLHE40", "HEYL", "HEY2",
    "HEY1", "HES7", "HES6", "HES5", "HES3", "HES2", "HES4", "HES1", "ATOH8", "TCF4", "TCF3", "TCF12", "MYOG",
    "MYF6", "MYOD1", "MYF5", "FIGLA", "ID1", "ID4", "ID3", "ID2", "ASCL2", "ASCL1", "ASCL4", "ASCL5", "ASCL3",
    "TAL1", "LYL1", "TAL2", "NHLH2", "NHLH1", "MESP2", "MSGN1", "MESP1", "PTF1A", "FERD3L", "ATOH7", "ATOH1",
    "BHLHA9", "BHLHA15", "BHLHE23", "BHLHE22", "OLIG1", "OLIG3", "OLIG2", "NEUROG2", "NEUROG3", "NEUROG1",
    "NEUROD2", "NEUROD6", "NEUROD4", "NEUROD1", "TCF21", "MSC", "TCF24", "TCF23", "TWIST2", "TWIST1", "HAND2",
    "HAND1", "TCF15", "SCX"
]


## 3) UTR lengths per transcript and gene

In [None]:
# Keep one row per distinct UTR segment. The raw table also carries an
# `rbp_name` column, so the same segment presumably appears once per RBP;
# dropping duplicates stops a segment from being summed more than once.
utr_segment_columns = [
    "Gene name",
    "3' UTR end",
    "3' UTR start",
    "Transcript stable ID",
    "Gene stable ID",
]
UTR_only = UTR_data[utr_segment_columns].drop_duplicates().copy()

# NOTE(review): if these are 1-based inclusive coordinates (as in Ensembl
# BioMart exports), every segment is one base longer than `end - start`.
# Confirm before changing: it would shift every statistic computed below.
UTR_only["UTR_length"] = UTR_only["3' UTR end"] - UTR_only["3' UTR start"]

# Total 3'UTR length per transcript (sum over that transcript's segments).
UTR_length_tr = (
    UTR_only.groupby(["Gene name", "Transcript stable ID"])["UTR_length"]
    .sum()
    .reset_index()
)

# Per-gene summary across transcripts, computed in a single pass.
# reset_index() turns "Gene name" into a real column. Previously it existed
# only as the index, which the CSV written with index=False does not contain,
# so genes without a match in LS_classes ended up as rows with no identifier.
UTR_length_gene = (
    UTR_length_tr.groupby("Gene name")
    .agg(
        n_transcripts=("Transcript stable ID", "nunique"),
        UTR_length_max=("UTR_length", "max"),
        UTR_length_median=("UTR_length", "median"),
        UTR_length_mean=("UTR_length", "mean"),
        UTR_length_std=("UTR_length", "std"),
    )
    .reset_index()
)

# Attach the class annotation. Columns 0 and 2 of LS_classes are selected by
# position; one of them must be "HGNC symbol" for the join key to exist.
# Left join: every gene with UTR data is kept even if it has no class entry.
UTR_length_gene = UTR_length_gene.merge(
    right=LS_classes.iloc[:, [0, 2]],
    left_on="Gene name",
    right_on="HGNC symbol",
    how="left",
)

# Create the output directory on first run so the write cannot fail on a
# fresh checkout.
violin_csv_path = Path(p("data", "intermediate", "rbp", "Violin_plot_3UTR.csv"))
violin_csv_path.parent.mkdir(parents=True, exist_ok=True)
UTR_length_gene.to_csv(violin_csv_path, index=False)
print("Saved: data/intermediate/rbp/Violin_plot_3UTR.csv")


## 4) RBP binary matrix

In [None]:
# Gene x RBP indicator matrix: one row per gene, one column per RBP name.
# A cell is 1 when the (gene, RBP) pair occurs in UTR_data and 0 otherwise.
# Duplicate pairs are dropped first because pivot() requires unique
# (index, column) combinations.
gene_rbp_pairs = UTR_data[["Gene name", "rbp_name"]].drop_duplicates()

binary_matrix = (
    gene_rbp_pairs.assign(value=1)
    .pivot(index="Gene name", columns="rbp_name", values="value")
    .fillna(0)
    .astype(int)
)

# Create the output directory on first run so the write cannot fail on a
# fresh checkout. The gene names are the index and are written as the first
# CSV column.
rbp_matrix_csv_path = Path(p("data", "intermediate", "rbp", "RBP_binary_matrix.csv"))
rbp_matrix_csv_path.parent.mkdir(parents=True, exist_ok=True)
binary_matrix.to_csv(rbp_matrix_csv_path)
print("Saved: data/intermediate/rbp/RBP_binary_matrix.csv")


## Exploratory analysis (optional)

The cells below are exploratory checks and plots. They write no output files and can be skipped for production runs.


In [None]:
# Compare the genes found in the dataset with the reference ordered list.

gene_name_column = UTR_data["Gene name"].unique()

# Build each set once and reuse it for the counts and both differences.
reference_genes = set(ordered_list)
dataset_genes = set(gene_name_column)

print("Unique genes in ordered list:", len(reference_genes))
print("Genes present in dataframe:", len(dataset_genes))

# In the dataset but not in the reference list.
extra_genes = dataset_genes.difference(reference_genes)
print("Extra genes in dataframe:", extra_genes)

# In the reference list but missing from the dataset.
absent_genes = reference_genes.difference(dataset_genes)
print("Absent genes:", absent_genes)


In [None]:
# Histogram of transcripts per gene
counts = UTR_data.groupby("Gene name")["Transcript stable ID"].nunique()

fig, ax = plt.subplots(figsize=(10, 6))

# One unit-wide bin per integer transcript count, from 1 up to the maximum.
counts.hist(ax=ax, bins=range(1, counts.max() + 2), color="#4C86C6", edgecolor="black")

# Label every non-empty bar with the number of genes it contains.
# The loop variable is deliberately not called `p`: that name is the notebook's
# path helper, and overwriting it breaks any later (re-)run of a cell that
# builds file paths.
for bar in ax.patches:
    bar_height = bar.get_height()
    if bar_height > 0:
        ax.annotate(
            str(int(bar_height)),
            (bar.get_x() + bar.get_width() / 2, bar_height),
            ha="center",
            va="bottom",
            fontsize=10,
        )

ax.set_title("Transcripts per gene")
ax.set_xlabel("Number of transcripts")
ax.set_ylabel("Number of genes")
fig.tight_layout()
plt.show()


In [None]:
# Dendrogram (optional)
# SciPy is imported here rather than at the top so that the production cells
# above do not require it.
from scipy.cluster.hierarchy import linkage, dendrogram
from scipy.spatial.distance import pdist

# Rows in reference order, restricted to genes that are present in the matrix.
genes_in_matrix = [g for g in ordered_list if g in binary_matrix.index]
binary_matrix_ordered = binary_matrix.loc[genes_in_matrix]

# Jaccard distance is defined on presence/absence vectors, so pass booleans
# explicitly instead of relying on how 0/1 integers are interpreted.
dist_matrix = pdist(binary_matrix_ordered.values.astype(bool), metric='jaccard')
linked = linkage(dist_matrix, method='average')  # ward is not ideal for Jaccard

plt.figure(figsize=(12, 10), dpi=300)

dendrogram(
    linked,
    # A plain list rather than a pandas Index: older SciPy releases truth-test
    # `labels`, which raises for array-like objects.
    labels=binary_matrix_ordered.index.tolist(),
    orientation='left',
    leaf_font_size=6,
)

plt.title('Hierarchical Clustering of Genes Based on RBP Binding')
plt.xlabel('Distance')
plt.ylabel('Genes')
plt.tight_layout()
plt.show()
