# TF–Target WGCNA Clade Relationship Analysis (Python)

This notebook replicates the analysis originally scripted in R, now fully in **Python**.

**Steps covered:**
1. Load edge and WGCNA clade data  
2. Normalise cluster labels  
3. Map each TF and target gene to its WGCNA clade within the same cluster  
4. Flag edges where TF and target fall in the *same* vs *different* clade  
5. Summarise and visualise the counts

The bar plot is exported as **`TF_Target_Clade_Distribution.pdf`**.


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import re
from pathlib import Path

def norm_cluster(series):
    """Return cluster labels in a canonical form.

    A leading ``c`` (optionally preceded by ``-``) is removed from each label,
    then every remaining ``-`` is replaced with ``_``.

    Parameters
    ----------
    series : pandas.Series
        Cluster labels; the ``.str`` accessor is used, so values are expected
        to be strings.

    Returns
    -------
    pandas.Series
        The transformed labels, carrying the same index as ``series``.
    """
    # Step 1: strip the prefix (regex anchored at the start of the label).
    without_prefix = series.str.replace('^-?c', '', regex=True)
    # Step 2: literal hyphen -> underscore everywhere else in the label.
    return without_prefix.str.replace('-', '_', regex=False)


In [None]:
# Input locations are relative to the working directory; edit these two paths
# when running the notebook from somewhere else.
clade_path = Path("20250417.WGCNA.GenesClustersCladesBreakdown_long_filtered.csv")
edges_path = Path("unshuffled_top95_all_clusters.tsv")

# The clade table is read with the default comma separator, the edge table as
# tab-separated.
clade_df = pd.read_csv(clade_path)
edges_df = pd.read_csv(edges_path, sep="\t")

# Quick sanity check of what was loaded: (rows, columns) for each table.
for label, frame in (("clade_df shape:", clade_df), ("edges_df shape:", edges_df)):
    print(label, frame.shape)


In [None]:
# 1) Bring the cluster labels of both tables into the same normalised form so
#    they can be used as a join key (edges first, then clades).
for frame in (edges_df, clade_df):
    frame['cluster'] = norm_cluster(frame['cluster'])

# 2) Make sure the gene identifier column of clade_df is called 'gid'.
#    When it is not, the single column that is neither 'cluster' nor 'clade'
#    is taken to be the gene ID and renamed; anything else is ambiguous.
has_gid = 'gid' in clade_df.columns
if not has_gid:
    gene_cols = [
        col for col in clade_df.columns
        if col != 'cluster' and col != 'clade'
    ]
    if len(gene_cols) != 1:
        raise ValueError("Cannot uniquely identify gene ID column in clade_df")
    (gene_col,) = gene_cols
    clade_df = clade_df.rename(columns={gene_col: 'gid'})


In [None]:
# Build look-up tables: one row per (gene, cluster) giving that gene's clade.
# Exact duplicate rows are removed first, because a left join against a lookup
# with repeated keys would emit each edge once per repeat and inflate the
# same/different clade counts computed from merged_df.
clade_lookup = clade_df[['gid', 'cluster', 'clade']].drop_duplicates()
tf_lookup = clade_lookup.rename(columns={'gid': 'TF', 'clade': 'TF_clade'})
target_lookup = clade_lookup.rename(columns={'gid': 'target', 'clade': 'target_clade'})

# Join: annotate every edge with the clade of its TF and of its target,
# matching on gene ID *within the same cluster*. Left joins keep edges whose
# TF/target is absent from the clade table (their clade stays NaN).
merged_df = (
    edges_df
    .merge(tf_lookup, on=['TF', 'cluster'], how='left')
    .merge(target_lookup, on=['target', 'cluster'], how='left')
)

# Report edge inflation: more rows than edges means some gene is still listed
# under several clades in one cluster, so its edges appear more than once.
n_extra_rows = len(merged_df) - len(edges_df)
if n_extra_rows:
    print(
        f"WARNING: join produced {n_extra_rows} extra rows; "
        "some (gene, cluster) pairs map to more than one clade"
    )

# Report unmatched
na_tf = merged_df['TF_clade'].isna().sum()
na_target = merged_df['target_clade'].isna().sum()
print(f"Unmatched TF rows: {na_tf}")
print(f"Unmatched target rows: {na_target}")


In [None]:
# Classify relationship.
# An edge can only be classified when BOTH its TF and its target were found in
# the clade table. NaN == NaN evaluates to False, so comparing the clade
# columns directly would silently count every unmatched edge as
# 'Different clade'; such edges are left as NaN instead.
both_known = merged_df['TF_clade'].notna() & merged_df['target_clade'].notna()
same_clade = merged_df['TF_clade'] == merged_df['target_clade']
merged_df['Clade_Relationship'] = (
    same_clade
    .map({True: 'Same clade', False: 'Different clade'})
    .where(both_known)  # unmatched edges -> NaN (unclassified)
)
print(f"Edges left unclassified (TF or target has no clade): {(~both_known).sum()}")

# Summarise counts. value_counts() skips NaN by default, so unclassified edges
# are excluded; reindex fixes the bar order and fill_value=0 keeps 'Count' an
# integer column even when one category never occurs.
clade_counts = (
    merged_df['Clade_Relationship']
    .value_counts()
    .reindex(['Different clade', 'Same clade'], fill_value=0)
    .reset_index()
)
clade_counts.columns = ['Clade_Relationship', 'Count']
clade_counts


In [None]:
# Bar chart of edge counts per clade relationship, drawn through the explicit
# figure/axes interface and exported to PDF before being shown inline.
fig, ax = plt.subplots(figsize=(5, 6))
ax.bar(
    clade_counts['Clade_Relationship'],
    clade_counts['Count'],
    color='#2F4F4F',
)
ax.set_ylabel('WGCNA Clade Distribution Across Clusters', fontsize=12)
fig.tight_layout()
# Written to the current working directory.
fig.savefig('TF_Target_Clade_Distribution.pdf')
plt.show()
