# Phase Distribution Plots for Circadian Data
This notebook generates proportional stacked bar plots of LAG (phase) distributions, by cluster, for all cycling genes and for cycling cluster biomarkers.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages

# Load input files. Both are tab-separated; the separator is written as the
# escape '\t' rather than a literal tab character so it stays visible and
# survives editors/formatters that expand tabs to spaces.
jtk = pd.read_csv('JTKCycling_input.tsv', sep='\t')

# The markers file is read without header inference, the column names are
# assigned explicitly, and the first row is then dropped.
# NOTE(review): the dropped first row is presumably the file's own header
# line -- confirm against the input file.
markers = pd.read_csv('Markers_input.tsv', sep='\t', header=None)
markers.columns = ['p_val', 'avg_log2FC', 'pct.1', 'pct.2', 'p_val_adj', 'cluster', 'gid']
markers = markers.iloc[1:].copy()

# Identifier columns are compared/merged as strings.
for id_col in ('cluster', 'gid'):
    markers[id_col] = markers[id_col].astype(str)

# Columns used in numeric filters are converted explicitly; values that fail
# to parse become NaN (errors='coerce') and therefore never pass a threshold.
for num_col in ('p_val_adj', 'pct.1'):
    markers[num_col] = pd.to_numeric(markers[num_col], errors='coerce')

# Remove a leading 'c' from JTK cluster labels for comparison with the
# marker cluster labels.
jtk['cluster_stripped'] = jtk['cluster'].str.replace(r'^c', '', regex=True)

def _format_lag_label(lag):
    """Return a tick/legend label for a lag value, without a trailing '.0'."""
    as_float = float(lag)
    return str(int(as_float)) if as_float.is_integer() else str(as_float)


def plot_phase_proportion_ordered_colormap_inverted(df, cluster_col, lag_col, title, xlabel, ylabel, pdfpath, cluster_order=None):
    """Save a proportional stacked bar plot of lag (phase) values per cluster.

    For every cluster, the fraction of rows carrying each lag value is drawn
    as one segment of a stacked bar (each bar sums to 1). Segments are
    coloured along the 'magma' colormap in increasing lag order and a
    colorbar labelled 'LAG (Phase)' is added.

    NOTE(review): the function name mentions an "inverted" colormap, but
    'magma' is used un-reversed here -- confirm which is intended.

    Args:
        df: DataFrame with one row per observation. It is not modified.
        cluster_col: Name of the column holding the cluster label.
        lag_col: Name of the column holding the lag value. Values must be
            numeric or numeric strings; rows with a missing lag are ignored.
        title: Plot title.
        xlabel: X-axis label.
        ylabel: Y-axis label.
        pdfpath: Path the figure is written to via ``savefig``.
        cluster_order: Optional iterable of cluster labels giving the
            left-to-right bar order. Labels must match the values found in
            ``cluster_col``; labels absent from the data are skipped and
            clusters not listed are appended after the listed ones. When
            ``None`` the sorted order produced by ``pd.crosstab`` is used.

    Raises:
        ValueError: If a lag value is not numeric, or if there are no rows
            left to plot.
    """
    df = df.copy()
    # Numeric lags sort numerically and avoid the str -> int round trip,
    # which fails on 'nan' and on float-formatted values such as '2.0'.
    df[lag_col] = pd.to_numeric(df[lag_col])
    df = df.dropna(subset=[lag_col])

    count_table = pd.crosstab(df[cluster_col], df[lag_col])
    if count_table.empty:
        raise ValueError('no rows with a cluster and a lag value to plot')
    prop_table = count_table.div(count_table.sum(axis=1), axis=0)

    lags = sorted(prop_table.columns)
    prop_table = prop_table[lags]

    if cluster_order is not None:
        present = set(prop_table.index)
        # dict.fromkeys drops duplicate labels while keeping first-seen order.
        ordered = list(dict.fromkeys(c for c in cluster_order if c in present))
        listed = set(ordered)
        ordered.extend(c for c in prop_table.index if c not in listed)
        prop_table = prop_table.loc[ordered]

    clusters = prop_table.index.tolist()
    positions = np.arange(len(clusters))
    lag_labels = [_format_lag_label(lag) for lag in lags]

    fig, ax = plt.subplots(figsize=(18, 6))
    try:
        cmap = plt.get_cmap('magma', len(lags))
        # With a single lag there is only one colour; avoid dividing by zero.
        color_steps = max(len(lags) - 1, 1)
        bottom = np.zeros(len(clusters))
        for i, (lag, lag_label) in enumerate(zip(lags, lag_labels)):
            heights = prop_table[lag].fillna(0).to_numpy()
            ax.bar(positions, heights, bottom=bottom,
                   color=cmap(i / color_steps), label=lag_label,
                   edgecolor='black', width=0.8)
            bottom = bottom + heights

        vmin, vmax = min(lags), max(lags)
        if vmin == vmax:
            # Give the colorbar a non-degenerate range for a single lag.
            vmin, vmax = vmin - 0.5, vmax + 0.5
        sm = plt.cm.ScalarMappable(cmap=cmap, norm=plt.Normalize(vmin=vmin, vmax=vmax))
        sm.set_array([])
        cbar = fig.colorbar(sm, ax=ax, orientation='vertical', label='LAG (Phase)')
        cbar.set_ticks(lags)
        cbar.set_ticklabels(lag_labels)

        ax.set_xlabel(xlabel)
        ax.set_ylabel(ylabel)
        ax.set_title(title)
        # Fix the tick positions before labelling them so labels and bars
        # are guaranteed to line up.
        ax.set_xticks(positions)
        ax.set_xticklabels([str(c) for c in clusters], rotation=45)
        fig.tight_layout()
        fig.savefig(pdfpath)
    finally:
        # Release the figure even if drawing or saving fails.
        plt.close(fig)


# Short alias so that callers using the shorter name keep working.
plot_phase_proportion = plot_phase_proportion_ordered_colormap_inverted

def _cluster_sort_key(cluster):
    """Sort numeric cluster labels numerically, any other labels after them."""
    label = str(cluster)
    if label.isdecimal():
        return (0, int(label), '')
    return (1, 0, label)


# Plot for all cycling genes.
# NOTE(review): cluster '16' is excluded here but not from the biomarker plot
# below -- confirm that the difference is intended.
all_cycling = jtk[jtk['cluster_stripped'] != '16']
# Cluster labels are strings, so build a numerically sorted order
# ('2' before '10') to pass as cluster_order.
all_cycling_order = sorted(
    all_cycling['cluster_stripped'].dropna().unique(), key=_cluster_sort_key
)
plot_phase_proportion_ordered_colormap_inverted(
    df=all_cycling,
    cluster_col='cluster_stripped',
    lag_col='LAG',
    title='Proportional Phase (LAG) Distribution of All Cycling Genes by Cluster',
    xlabel='Cluster',
    ylabel='Proportion',
    pdfpath='AllCycling_PhaseByCluster.pdf',
    cluster_order=all_cycling_order,
)

# Filter for cycling biomarkers: significant markers (adjusted p < 0.05)
# with pct.1 above 0.1, restricted to genes present in the JTK table.
cycling_marker_mask = (markers['p_val_adj'] < 0.05) & (markers['pct.1'] > 0.1)
cycling_markers = markers[cycling_marker_mask].copy()
# NOTE(review): the merge matches on gene ID only, so a marker row is paired
# with the LAG of every JTK row for that gene regardless of cluster;
# 'cluster_stripped' is carried along but never compared with the marker
# 'cluster'. Confirm whether the join should also match on cluster.
cycling_biomarkers = cycling_markers.merge(
    jtk[['CycID', 'LAG', 'cluster_stripped']],
    left_on='gid',
    right_on='CycID',
    how='inner'
)

if not cycling_biomarkers.empty:
    biomarker_order = sorted(
        cycling_biomarkers['cluster'].dropna().unique(), key=_cluster_sort_key
    )
    plot_phase_proportion_ordered_colormap_inverted(
        df=cycling_biomarkers,
        cluster_col='cluster',
        lag_col='LAG',
        title='Proportional Phase (LAG) Distribution of Cycling Biomarkers by Cluster',
        xlabel='Cluster',
        ylabel='Proportion',
        pdfpath='CyclingBiomarkers_PhaseByCluster.pdf',
        cluster_order=biomarker_order,
    )