In [2]:
#!/usr/bin/env python3
# Two-way ANOVA analysis for genomic oxidation data
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.stats.multitest import multipletests
from statsmodels.formula.api import ols
import statsmodels.api as sm
import os
import re
from tqdm import tqdm
import multiprocessing as mp
from functools import partial
import warnings
from statsmodels.tools.sm_exceptions import ValueWarning
import traceback

warnings.filterwarnings('ignore', category=RuntimeWarning)
warnings.filterwarnings('ignore', category=UserWarning)
warnings.filterwarnings('ignore', category=ValueWarning)

In [3]:
def preprocess_data(file_path):
    """Load a long-format damage CSV and reshape it for per-bin ANOVA.

    The CSV must provide the columns ``Sample``, ``Bin``, ``Strand``,
    ``Chromosome`` and ``Median_Normalized_Damage``.

    Parameters
    ----------
    file_path : str
        Path of the CSV file. If the path contains ``Normalized_<digits>`` the
        digits are reported as the bin size; otherwise the bin size is reported
        as ``unknown`` (it is informational only and never blocks loading).

    Returns
    -------
    pivot_df : pandas.DataFrame
        Samples as rows, ``FeatureID`` as columns, mean damage as values.
    bin_ids : list
        Column labels of ``pivot_df`` in column order.
    metadata_df : pandas.DataFrame
        One row per sample with columns ``Sample``, ``treatment`` and
        ``timepoint``; a factor is ``"Unknown"`` when the sample name contains
        none of the expected labels.
    """
    print(f"Reading data from {file_path}...")

    # The bin size is only reported here, so a path without the expected
    # pattern must not abort the whole load.
    size_match = re.search(r'Normalized_(\d+)', str(file_path))
    bin_size = size_match.group(1) if size_match else "unknown"
    print(f"Detected bin size: {bin_size}")

    # Read the CSV file
    df = pd.read_csv(file_path)

    # Create FeatureID by combining Bin, Strand, and Chromosome
    df['FeatureID'] = df['Bin'].astype(str) + '_' + df['Strand'] + df['Chromosome'].astype(str)

    # Running counter of repeated (Sample, FeatureID) rows; shown in the preview
    # below so duplicates are visible before they are averaged by the pivot.
    df['Group'] = df.groupby(['Sample', 'FeatureID']).cumcount().astype(str)

    # Show sample of the processed data
    print("Sample of the processed dataframe:")
    print(df[['Sample', 'Bin', 'Strand', 'FeatureID', 'Group', 'Median_Normalized_Damage']].head(5))

    # Create pivot table: samples as rows, features as columns
    pivot_df = df.pivot_table(
        index='Sample',
        columns='FeatureID',
        values='Median_Normalized_Damage',
        aggfunc='mean'
    )

    # Extract bin IDs (these are the column names in the pivot table)
    bin_ids = pivot_df.columns.tolist()

    def _label_from_name(sample_name, pattern):
        # Run the regex once per sample and fall back to "Unknown".
        found = re.search(pattern, str(sample_name))
        return found.group(1) if found else "Unknown"

    # Extract treatment and time point from the sample names
    samples = pivot_df.index
    metadata_df = pd.DataFrame({
        'Sample': samples,
        'treatment': [_label_from_name(s, r'(CRS|Ctrl)') for s in samples],
        'timepoint': [_label_from_name(s, r'(evening|morning)') for s in samples],
    })

    print(f"Created pivot table with {pivot_df.shape[0]} samples and {pivot_df.shape[1]} genomic features")
    print(f"Extracted metadata with factors: treatment ({metadata_df['treatment'].unique()}) and timepoint ({metadata_df['timepoint'].unique()})")
    print(metadata_df.head(5))

    return pivot_df, bin_ids, metadata_df

In [4]:
def perform_two_way_anova(bin_data, factor1_name, factor2_name):
    """Fit a two-way ANOVA with interaction for one bin and return its tests.

    Parameters
    ----------
    bin_data : pandas.DataFrame
        Must contain a ``measurement`` column plus one column per factor.
    factor1_name, factor2_name : str
        Column names of the two categorical factors.

    Returns
    -------
    dict
        Keys ``factor1_pvalue``, ``factor2_pvalue``, ``interaction_pvalue``,
        ``factor1_Fvalue``, ``factor2_Fvalue`` and ``interaction_Fvalue``.
        Every value is NaN when the model cannot be fitted for this bin.
    """
    # Type III sums of squares only test the marginal main effects when the
    # factors use sum-to-zero (effects) coding. With patsy's default treatment
    # coding and an interaction in the model, the "main effect" rows would test
    # the effect at the reference level of the other factor instead.
    term1 = f"C({factor1_name}, Sum)"
    term2 = f"C({factor2_name}, Sum)"
    interaction = f"{term1}:{term2}"

    try:
        formula = f"measurement ~ {term1} + {term2} + {interaction}"

        # Fit the model and build the Type III ANOVA table
        model = ols(formula, data=bin_data).fit()
        anova_table = sm.stats.anova_lm(model, typ=3)

        return {
            'factor1_pvalue': anova_table.loc[term1, 'PR(>F)'],
            'factor2_pvalue': anova_table.loc[term2, 'PR(>F)'],
            'interaction_pvalue': anova_table.loc[interaction, 'PR(>F)'],
            'factor1_Fvalue': anova_table.loc[term1, 'F'],
            'factor2_Fvalue': anova_table.loc[term2, 'F'],
            'interaction_Fvalue': anova_table.loc[interaction, 'F']
        }

    except Exception:
        # Deliberate best-effort: one bin that cannot be fitted must not abort a
        # genome-wide run. The NaN row is excluded from the multiple-testing
        # correction applied downstream.
        return {
            key: np.nan
            for key in (
                'factor1_pvalue', 'factor2_pvalue', 'interaction_pvalue',
                'factor1_Fvalue', 'factor2_Fvalue', 'interaction_Fvalue',
            )
        }

In [5]:
def process_bin(args):
    """Run the two-way ANOVA for a single bin.

    ``args`` is a 6-tuple ``(bin_idx, bin_id, pivot_df, metadata_df,
    factor1_name, factor2_name)``; ``bin_idx`` is the positional column of the
    bin inside ``pivot_df``. Returns the ANOVA result dict with an extra
    ``bin_id`` entry.
    """
    bin_idx, bin_id, pivot_df, metadata_df, factor1_name, factor2_name = args

    # One row per sample: the bin's measurement next to the sample name.
    measurements = pd.DataFrame({
        'measurement': pivot_df.iloc[:, bin_idx].values,
        'Sample': pivot_df.index.tolist()
    })

    # Attach the factor columns by joining on the sample name.
    with_factors = measurements.merge(metadata_df, on='Sample')

    anova_result = perform_two_way_anova(with_factors, factor1_name, factor2_name)
    anova_result['bin_id'] = bin_id
    return anova_result

In [6]:
def create_batches(items, batch_size):
    """Yield consecutive slices of ``items``, each at most ``batch_size`` long."""
    batch_starts = range(0, len(items), batch_size)
    yield from (items[start:start + batch_size] for start in batch_starts)
        

In [7]:
def process_batch(batch_args):
    """Run ``process_bin`` for every ``(bin_idx, bin_id)`` pair of one batch.

    ``batch_args`` is ``(pairs, pivot_data, meta_data, f1_name, f2_name)``.
    Returns the per-bin result dicts in the order of ``pairs``.
    """
    indexed_bins, pivot_data, meta_data, f1_name, f2_name = batch_args
    return [
        process_bin((bin_idx, bin_id, pivot_data, meta_data, f1_name, f2_name))
        for bin_idx, bin_id in indexed_bins
    ]


In [24]:
def analyze_genomic_bins(pivot_df, bin_ids, metadata_df, factor1_name='treatment', factor2_name='timepoint', 
                       figures_dir=None, data_dir=None,
                       n_cores=64, batch_size=1000, file_path=None):
    """Run the per-bin two-way ANOVA in parallel and write the results CSV.

    Parameters
    ----------
    pivot_df : pandas.DataFrame
        Samples as rows, bins as columns; column ``i`` must belong to
        ``bin_ids[i]``.
    bin_ids : list
        Bin identifiers, in the column order of ``pivot_df``.
    metadata_df : pandas.DataFrame
        Must contain ``Sample`` plus the two factor columns.
    factor1_name, factor2_name : str
        Names of the factor columns in ``metadata_df``.
    figures_dir, data_dir : str or None
        Directories created if given; the results CSV is written to
        ``data_dir`` (or the working directory when it is None).
    n_cores : int or None
        Worker processes; None means "all cores but one".
    batch_size : int
        Number of bins handed to a worker per task.
    file_path : str or None
        Input path used only to derive the ``anova_<binsize>`` output prefix.
        When omitted, a notebook-level ``file_path`` variable is used if one
        exists (legacy behaviour); otherwise the prefix is plain ``anova``.

    Returns
    -------
    pandas.DataFrame
        One row per bin with raw p-values, F-values, Bonferroni-adjusted
        p-values (``*_adj``) and boolean ``*_significant`` flags (adj < 0.05).
    """
    # Prefer the explicit argument; fall back to the notebook global so calls
    # written before the parameter existed keep producing the same file name.
    if file_path is None:
        file_path = globals().get('file_path')
    size_match = re.search(r'Normalized_(\d+)', str(file_path)) if file_path else None
    output_prefix = ("anova_" + size_match.group(1)) if size_match else "anova"

    # Create output directories if specified
    if figures_dir is not None:
        os.makedirs(figures_dir, exist_ok=True)

    if data_dir is not None:
        os.makedirs(data_dir, exist_ok=True)

    # Determine number of cores for parallel processing
    if n_cores is None:
        n_cores = max(1, mp.cpu_count() - 1)  # Leave one core free

    total_bins = len(bin_ids)
    print(f"Processing {total_bins} bins using {n_cores} cores with batch size {batch_size}...")

    # Every task argument is pickled and sent to a worker, so each batch carries
    # only its own slice of columns instead of the whole pivot table. The bin
    # indices are rebased to positions inside that slice.
    batch_args_list = []
    for batch in create_batches(list(enumerate(bin_ids)), batch_size):
        first_idx = batch[0][0]
        batch_pivot = pivot_df.iloc[:, first_idx:first_idx + len(batch)]
        local_pairs = [(bin_idx - first_idx, bin_id) for bin_idx, bin_id in batch]
        batch_args_list.append((local_pairs, batch_pivot, metadata_df, factor1_name, factor2_name))

    # Results container
    all_results = []

    # imap (not imap_unordered) keeps the batches, and therefore the bins, in
    # their original order, so no re-sorting is needed afterwards.
    with mp.Pool(n_cores) as pool:
        batch_iterator = pool.imap(process_batch, batch_args_list)

        with tqdm(total=total_bins) as pbar:
            for batch_results in batch_iterator:
                all_results.extend(batch_results)
                pbar.update(len(batch_results))

    # Convert results to DataFrame
    results_df = pd.DataFrame(all_results)
    if results_df.empty:
        # No bins at all: keep the expected columns so the steps below still work.
        results_df = pd.DataFrame(columns=[
            'factor1_pvalue', 'factor2_pvalue', 'interaction_pvalue',
            'factor1_Fvalue', 'factor2_Fvalue', 'interaction_Fvalue', 'bin_id',
        ])

    # Apply multiple testing correction
    for col in ['factor1_pvalue', 'factor2_pvalue', 'interaction_pvalue']:
        # Bonferroni correction over the bins with a valid p-value; bins whose
        # model failed (NaN) stay NaN and do not count as tests.
        # (Switch method to 'fdr_bh' for Benjamini-Hochberg FDR instead.)
        mask = results_df[col].notna().to_numpy()
        corrected = np.full(len(results_df), np.nan)

        if mask.sum() > 0:
            corrected[mask] = multipletests(results_df.loc[mask, col], method='bonferroni')[1]

        results_df[f'{col}_adj'] = corrected

        # Add significance flag (True/False); NaN compares as not significant
        results_df[f'{col.replace("pvalue", "significant")}'] = results_df[f'{col}_adj'] < 0.05

    # Prepare output path
    if data_dir is not None:
        results_path = os.path.join(data_dir, f"{output_prefix}_results.csv")
    else:
        results_path = f"{output_prefix}_results.csv"

    print("results_df")
    print (results_df.head(5))

    # Write results to file
    results_df.to_csv(results_path, index=False)

    print(f"Analysis complete. Found:")
    print(f"  - {results_df['factor1_significant'].sum()} bins significant for {factor1_name}")
    print(f"  - {results_df['factor2_significant'].sum()} bins significant for {factor2_name}")
    print(f"  - {results_df['interaction_significant'].sum()} bins significant for interaction")
    print(f"  - Results CSV saved to: {results_path}")

    return results_df

In [23]:
def prepare_significant_bins_data(results_df, factor1_name, factor2_name, max_bins=10, file_path=None, table_path=None, data_dir=None):
    """Collect the significant bins per effect and over all effects.

    Parameters
    ----------
    results_df : pandas.DataFrame
        Needs ``bin_id`` and, for each of ``factor1``, ``factor2`` and
        ``interaction``, the columns ``<effect>_pvalue``, ``<effect>_Fvalue``
        and ``<effect>_significant``.
    factor1_name, factor2_name : str
        Labels used for the two main effects in keys, file names and columns.
    max_bins : int
        Maximum number of bins kept in each per-effect table.
    file_path : str or None
        Input path; if it contains ``Normalized_<digits>`` the output prefix is
        ``anova_<digits>``, otherwise ``anova``.
    table_path : str or None
        Tab-separated chromosome naming table with the columns
        ``RefSeq seq accession`` and ``UCSC style name``. When None, no
        translation is possible and ``ucsc_style_id`` is left empty.
    data_dir : str or None
        Directory (created if missing) that receives the CSV files; nothing is
        written when it is None or empty.

    Returns
    -------
    dict
        ``"<effect>_top_bins"`` for every effect with at least one significant
        bin, plus ``"all_significant_bins"`` when any bin is significant.
    """
    # Derive the output prefix; a path without a bin size falls back to "anova".
    output_prefix = "anova"
    if file_path:
        size_match = re.search(r'Normalized_(\d+)', str(file_path))
        if size_match:
            output_prefix = "anova_" + size_match.group(1)

    print(f"Preparing top significant bins data...")

    # RefSeq accession -> UCSC chromosome name (chr1..chr19, chrX, chrY only)
    chrom_dict = {}
    if table_path is not None:
        chromosomes_name_table = pd.read_csv(table_path, sep='\t')
        chromosomes = ['chr' + str(i) for i in range(1, 20)] + ["chrX", "chrY"]
        wanted = chromosomes_name_table["UCSC style name"].isin(chromosomes)
        chromosomes_name_table = chromosomes_name_table.loc[wanted, ["RefSeq seq accession", "UCSC style name"]]
        chrom_dict = dict(zip(chromosomes_name_table["RefSeq seq accession"], chromosomes_name_table["UCSC style name"]))
    else:
        print("No chromosome naming table given; ucsc_style_id will be empty.")

    # Bin ids look like "<bin>_<strand><accession>"; the bin may be written as
    # a float ("3049000.0") or as a plain integer ("3049000").
    bin_id_pattern = re.compile(r'^(\d+(?:\.\d+)?)_([+-])(.+)$')

    def create_ucsc_style_id(bin_id):
        # Returns "<chrom><strand><bin>" or None when the id cannot be translated.
        match = bin_id_pattern.match(str(bin_id))
        if match:
            index = float(match.group(1))
            strand = match.group(2)
            ref_seq = match.group(3)

            if ref_seq in chrom_dict:
                return f"{chrom_dict[ref_seq]}{strand}{int(index)}"
        return None

    if data_dir:
        os.makedirs(data_dir, exist_ok=True)

    # (label, p-value column, significance column) per effect
    factor_cols = [
        (factor1_name, 'factor1_pvalue', 'factor1_significant'),
        (factor2_name, 'factor2_pvalue', 'factor2_significant'),
        ('Interaction', 'interaction_pvalue', 'interaction_significant')
    ]

    # Dictionary to store DataFrames with top bins
    top_bins_dict = {}
    for name, pval_col, sig_col in factor_cols:
        sig_bins = results_df[results_df[sig_col] == True].copy()

        if len(sig_bins) == 0:
            print(f"No significant bins found for {name}")
            continue

        # Smallest raw p-values first, then keep at most max_bins rows
        top_bins = sig_bins.sort_values(pval_col).head(max_bins).copy()
        top_bins['ucsc_style_id'] = top_bins['bin_id'].apply(create_ucsc_style_id)

        top_bins_dict[f"{name.lower()}_top_bins"] = top_bins

        if data_dir:
            top_bins.to_csv(f"{data_dir}/{output_prefix}_{name.lower()}_top_bins.csv", index=False)

    # Combined table: every bin that is significant for at least one effect
    print("Creating combined significant bins file with most significant factor...")

    all_sig_bins = results_df[
        (results_df['factor1_significant'] == True) | 
        (results_df['factor2_significant'] == True) | 
        (results_df['interaction_significant'] == True)
    ].copy()

    if len(all_sig_bins) == 0:
        print("No significant bins found for any factor")
        return top_bins_dict

    def get_most_significant_factor(row):
        # Among the effects flagged significant, pick the one with the largest F.
        f_values = {
            factor1_name: row['factor1_Fvalue'] if row['factor1_significant'] else float('-inf'),
            factor2_name: row['factor2_Fvalue'] if row['factor2_significant'] else float('-inf'),
            'Interaction': row['interaction_Fvalue'] if row['interaction_significant'] else float('-inf')
        }
        most_sig_factor = max(f_values, key=f_values.get)

        # If no effect is significant, return None
        if f_values[most_sig_factor] == float('-inf'):
            return None

        return most_sig_factor

    def get_significant_factors(row):
        # ";"-joined labels of all effects flagged significant for this bin.
        sig_factors = []
        if row['factor1_significant']:
            sig_factors.append(factor1_name)
        if row['factor2_significant']:
            sig_factors.append(factor2_name)
        if row['interaction_significant']:
            sig_factors.append('Interaction')
        return ';'.join(sig_factors)

    all_sig_bins['most_significant_factor'] = all_sig_bins.apply(get_most_significant_factor, axis=1)

    # Largest F over all three effects, whether or not each one is significant
    all_sig_bins['max_f_value'] = all_sig_bins[['factor1_Fvalue', 'factor2_Fvalue', 'interaction_Fvalue']].max(axis=1)

    all_sig_bins['significant_factors'] = all_sig_bins.apply(get_significant_factors, axis=1)
    all_sig_bins['ucsc_style_id'] = all_sig_bins['bin_id'].apply(create_ucsc_style_id)

    # Sort by the maximum F-value (descending)
    all_sig_bins = all_sig_bins.sort_values('max_f_value', ascending=False)

    top_bins_dict['all_significant_bins'] = all_sig_bins

    if data_dir:
        all_sig_bins.to_csv(f"{data_dir}/{output_prefix}_all_significant_bins.csv", index=False)

    return top_bins_dict

In [32]:
def create_heatmap_unified_zscore(pivot_df, bin_ids, metadata_df, results_df, factor1_name, factor2_name, top_bins_dict=None, max_bins=50, heatmap_dir=None):
    """
    Creates a combined heatmap of significant bins with z-score normalization.
    Z-scores standardize the data to show how many standard deviations each value 
    is from the mean for each bin, making patterns more visible across bins with different value ranges.

    Parameters
    ----------
    pivot_df : pandas.DataFrame
        Samples as rows, bin ids as columns.
    bin_ids, results_df, max_bins :
        Accepted for interface compatibility; not used by this function. The
        plotted bins come from ``top_bins_dict['all_significant_bins']``.
    metadata_df : pandas.DataFrame
        Must contain ``Sample`` and the two factor columns.
    factor1_name, factor2_name : str
        Factor columns used to sort the rows and to colour the row bands.
    top_bins_dict : dict or None
        Must hold an ``'all_significant_bins'`` frame with ``bin_id`` and the
        three ``*_significant`` columns (optionally ``ucsc_style_id``).
        Side effect: a ``group`` column is added to that frame in place.
    heatmap_dir : str or None
        Output directory (created if missing). When None the figures are built
        but not written to disk.

    Returns
    -------
    None
    """
    # scipy is not imported at notebook level, so it is imported here.
    from scipy import stats

    print(f"Creating a combined heatmap of significant bins with z-score normalization...")

    # Get all significant bins from the dictionary (None or missing key -> skip)
    if not top_bins_dict or 'all_significant_bins' not in top_bins_dict:
        print("No significant bins found, skipping heatmap.")
        return

    all_sig_bins_df = top_bins_dict['all_significant_bins']

    # Create groups based on significance pattern
    # Group 1: Only timepoint (factor2) is significant
    # Group 2: Only treatment (factor1) is significant
    # Group 3: All other bins (interaction or multiple factors)
    all_sig_bins_df['group'] = 'Other'

    # Find bins where ONLY factor2 (timepoint) is significant
    only_factor2_mask = (all_sig_bins_df['factor2_significant'] == True) & \
                        (all_sig_bins_df['factor1_significant'] == False) & \
                        (all_sig_bins_df['interaction_significant'] == False)
    all_sig_bins_df.loc[only_factor2_mask, 'group'] = f'Only {factor2_name}'

    # Find bins where ONLY factor1 (treatment) is significant
    only_factor1_mask = (all_sig_bins_df['factor1_significant'] == True) & \
                        (all_sig_bins_df['factor2_significant'] == False) & \
                        (all_sig_bins_df['interaction_significant'] == False)
    all_sig_bins_df.loc[only_factor1_mask, 'group'] = f'Only {factor1_name}'

    # Bin ids of the significant bins
    sig_bin_ids = all_sig_bins_df['bin_id'].tolist()

    # Column labels: prefer the UCSC style id, but fall back to the raw bin id
    # when no id could be derived (None/NaN), so no column ends up unlabeled.
    if 'ucsc_style_id' in all_sig_bins_df.columns:
        id_mapping = {
            bin_id: (ucsc_id if isinstance(ucsc_id, str) and ucsc_id else bin_id)
            for bin_id, ucsc_id in zip(all_sig_bins_df['bin_id'], all_sig_bins_df['ucsc_style_id'])
        }
    else:
        id_mapping = {bin_id: bin_id for bin_id in sig_bin_ids}

    # Create group mapping for coloring
    group_mapping = dict(zip(all_sig_bins_df['bin_id'], all_sig_bins_df['group']))

    # Keep only the bins that actually exist in the pivot table
    existing_bins = [b for b in sig_bin_ids if b in pivot_df.columns]

    if len(existing_bins) < 2:
        print(f"Not enough valid bins to create a heatmap, need at least 2.")
        return

    bins_pivot = pivot_df[existing_bins]

    # Temporarily attach the factors so the samples can be sorted by them
    sample_factors = metadata_df.set_index('Sample')
    heatmap_df = bins_pivot.copy()
    heatmap_df[factor1_name] = sample_factors[factor1_name]
    heatmap_df[factor2_name] = sample_factors[factor2_name]

    # Sort by factors
    heatmap_df = heatmap_df.sort_values([factor1_name, factor2_name])

    # Extract the factors for the row colors
    row_colors = pd.DataFrame({
        factor1_name: heatmap_df[factor1_name],
        factor2_name: heatmap_df[factor2_name]
    })

    # Remove the factors from the dataframe before plotting
    heatmap_data = heatmap_df.drop([factor1_name, factor2_name], axis=1)

    # Reorder columns so bins from the same group are adjacent
    column_group_df = pd.DataFrame({
        'bin_id': existing_bins,
        'group': [group_mapping.get(bin_id, 'Other') for bin_id in existing_bins]
    })

    # Define a custom group order for sorting
    group_order = [f'Only {factor1_name}', f'Only {factor2_name}', 'Other']
    group_order_dict = {group: i for i, group in enumerate(group_order)}

    # Sort by group first, then by bin_id
    column_group_df['group_order'] = column_group_df['group'].map(group_order_dict)
    column_group_df = column_group_df.sort_values(['group_order', 'bin_id'])

    ordered_bins = column_group_df['bin_id'].tolist()
    heatmap_data = heatmap_data[ordered_bins]

    # ===== Z-SCORE CALCULATION =====
    # Calculate z-scores for each bin across all samples
    zscore_df = pd.DataFrame(index=heatmap_data.index, columns=heatmap_data.columns, dtype=float)

    for bin_id in heatmap_data.columns:
        bin_values = heatmap_data[bin_id].values
        # Check if there's any variation in this bin's values
        if np.std(bin_values) > 0:
            zscore_df[bin_id] = stats.zscore(bin_values)
        else:
            # If all values are identical, set z-score to 0
            zscore_df[bin_id] = 0

    # Log the z-score range
    zmin = zscore_df.min().min()
    zmax = zscore_df.max().max()
    print(f"Z-score range: {zmin:.2f} to {zmax:.2f}")

    # Check for any extreme z-scores that might need capping
    extreme_threshold = 5.0  # Consider z-scores beyond +/- 5 as extreme
    extreme_count = ((zscore_df.abs() > extreme_threshold).sum().sum())
    if extreme_count > 0:
        print(f"Note: {extreme_count} extreme z-scores (> |{extreme_threshold}|) detected.")
        # Optionally cap extreme values to make visualization more balanced
        # zscore_df = zscore_df.clip(-extreme_threshold, extreme_threshold)
    # ===== END Z-SCORE CALCULATION =====

    # Define color map for the groups - use distinct colors
    group_colors = {
        f'Only {factor1_name}': "#1f77b4",  # Blue
        f'Only {factor2_name}': "#ff7f0e",  # Orange
        'Other': "#7f7f7f"                  # Gray
    }

    # Column label -> group color
    col_colors = {
        id_mapping.get(col, col): group_colors.get(group_mapping.get(col, 'Other'), "#7f7f7f")
        for col in ordered_bins
    }

    # Rename columns from bin_id to the display label for better readability
    heatmap_data.columns = [id_mapping.get(col, col) for col in heatmap_data.columns]
    zscore_df.columns = [id_mapping.get(col, col) for col in zscore_df.columns]

    # Check if we have enough data to create a heatmap
    if heatmap_data.shape[1] < 2 or heatmap_data.shape[0] < 2:
        print("Not enough data to create a heatmap.")
        return

    # Fixed palettes: sorted labels so the colors are consistent across plots
    factor1_levels = sorted(row_colors[factor1_name].unique())
    factor2_levels = sorted(row_colors[factor2_name].unique())
    treatment_palette = dict(zip(factor1_levels, sns.color_palette("Set1", len(factor1_levels))))
    timepoint_palette = dict(zip(factor2_levels, sns.color_palette("Set2", len(factor2_levels))))

    # Apply color maps using the consistent palettes
    row_colors_mapped = pd.DataFrame({
        factor1_name: row_colors[factor1_name].map(treatment_palette),
        factor2_name: row_colors[factor2_name].map(timepoint_palette)
    })

    # Calculate figure size, with extra room for the colorbar
    fig_width = max(12, len(existing_bins) * 0.4) + 2  # Add 2 inches for legend space
    fig_height = max(8, len(heatmap_data) * 0.4)

    combined_fig = plt.figure(figsize=(fig_width, fig_height))

    # Calculate the width proportions
    row_colors_width = 0.05  # 5% for each row color band
    heatmap_width = 1 - (row_colors_width * 2) - 0.15  # 15% for legend

    gs = plt.GridSpec(1, 4, width_ratios=[row_colors_width, row_colors_width, heatmap_width, 0.15])

    # Create axes for each component
    factor1_ax = combined_fig.add_subplot(gs[0, 0])
    factor2_ax = combined_fig.add_subplot(gs[0, 1])
    heatmap_ax = combined_fig.add_subplot(gs[0, 2])
    colorbar_ax = combined_fig.add_subplot(gs[0, 3])

    # Draw one colored cell per sample in each factor band
    for i, (_, row) in enumerate(row_colors_mapped.iterrows()):
        factor1_ax.add_patch(plt.Rectangle((0, i), 1, 1, color=row[factor1_name]))
        factor2_ax.add_patch(plt.Rectangle((0, i), 1, 1, color=row[factor2_name]))

    # Set axes properties for color bands
    for ax, title in [(factor1_ax, factor1_name), (factor2_ax, factor2_name)]:
        ax.set_xlim(0, 1)
        # sns.heatmap draws row 0 at the top (inverted y axis); the bands must
        # use the same orientation or the annotation is vertically flipped.
        ax.set_ylim(len(heatmap_data), 0)
        ax.set_xticks([0.5])
        ax.set_xticklabels([title])
        ax.set_yticks([])
        for side in ('top', 'right', 'bottom', 'left'):
            ax.spines[side].set_visible(False)

    # Draw the heatmap in the main section using z-scores
    sns.heatmap(
        zscore_df,  # Using z-scores instead of raw values
        cmap="viridis",
        vmin=-3,    # Limit color scale to +/- 3 standard deviations
        vmax=3,
        ax=heatmap_ax,
        cbar_ax=colorbar_ax,
        cbar_kws={"label": "Z-Score (Standard deviations from mean)"}
    )

    # Thin strip directly above the heatmap for the column (group) colors
    heatmap_box = heatmap_ax.get_position()
    col_colors_ax = combined_fig.add_axes([
        heatmap_box.x0,
        heatmap_box.y1,
        heatmap_box.width,
        0.02
    ])

    # Draw the column color patches
    for i, col in enumerate(zscore_df.columns):
        col_colors_ax.add_patch(plt.Rectangle(
            (i, 0),
            1.0,
            1.0,
            color=col_colors.get(col, "#7f7f7f")
        ))

    # Set column color axes properties
    col_colors_ax.set_xlim(0, len(zscore_df.columns))
    col_colors_ax.set_ylim(0, 1)
    col_colors_ax.set_xticks([])
    col_colors_ax.set_yticks([])
    for side in ('top', 'right', 'bottom', 'left'):
        col_colors_ax.spines[side].set_visible(False)

    # Add title to the combined figure
    combined_fig.suptitle(f"Combined Heatmap of {len(existing_bins)} Significant Bins (Z-Score Normalized)", 
                          fontsize=16, y=0.98)

    # Create legend for all color elements
    legend_handles = []
    legend_labels = []

    # Add factor1 (treatment) items
    for label in factor1_levels:
        legend_handles.append(plt.Rectangle((0, 0), 1, 1, color=treatment_palette[label]))
        legend_labels.append(f"{factor1_name}: {label}")

    # Add factor2 (timepoint) items
    for label in factor2_levels:
        legend_handles.append(plt.Rectangle((0, 0), 1, 1, color=timepoint_palette[label]))
        legend_labels.append(f"{factor2_name}: {label}")

    # Add bin group items (only groups that actually occur)
    for group, color in group_colors.items():
        if group in group_mapping.values():
            legend_handles.append(plt.Rectangle((0, 0), 1, 1, color=color))
            legend_labels.append(group)

    # Add z-score explanation to legend
    legend_handles.extend([
        plt.Rectangle((0, 0), 1, 1, color='#440154'),  # Dark purple (lowest in viridis)
        plt.Rectangle((0, 0), 1, 1, color='#21918c'),  # Teal (middle in viridis)
        plt.Rectangle((0, 0), 1, 1, color='#fde725')   # Yellow (highest in viridis)
    ])

    legend_labels.extend([
        "Z-score < 0 (Value below mean)",
        "Z-score = 0 (Mean value)",
        "Z-score > 0 (Value above mean)"
    ])

    # Create a separate figure for the legend
    legend_fig = plt.figure(figsize=(3, 6))
    legend_ax = legend_fig.add_subplot(111)
    legend_ax.axis('off')

    legend_ax.legend(
        legend_handles,
        legend_labels,
        loc='center',
        frameon=True
    )

    # Without an output directory there is nowhere to save; keep the figures
    # open so the notebook can still display them.
    if heatmap_dir is None:
        print("Z-score normalized heatmap complete. No heatmap_dir given, figures were not saved.")
        return

    # Ensure the output directory exists
    os.makedirs(heatmap_dir, exist_ok=True)

    # Save the figures
    combined_fig.savefig(f"{heatmap_dir}/combined_significance_heatmap_zscore.png", 
                         dpi=300, bbox_inches='tight')
    legend_fig.savefig(f"{heatmap_dir}/heatmap_legend_zscore.png", 
                       dpi=300, bbox_inches='tight')

    print(f"Z-score normalized heatmap complete. Files saved to:")
    print(f"  - {heatmap_dir}/combined_significance_heatmap_zscore.png")
    print(f"  - {heatmap_dir}/heatmap_legend_zscore.png")


In [21]:
! pwd

/cluster/home/taekim/stressed_mice/jupyter_notebooks


In [22]:
# Load and preprocess the normalized damage table.
# file_path stays a notebook-level variable because later cells read it.
file_path = '../data_normalized/cleaned_Normalized_1000.csv' #CHANGE PATH
pivot_df, bin_ids, metadata_df = preprocess_data(file_path)

# Preview the first rows of the pivot table and of the sample metadata
display(pivot_df.head(), metadata_df.head())


Reading data from ../data_normalized/cleaned_Normalized_1000.csv...
Detected bin size: 1000
Sample of the processed dataframe:
                       Sample        Bin Strand               FeatureID Group  \
0  Sample_14_CRS_evening_S14_  3049000.0      -  3049000.0_-NC_000067.7     0   
1  Sample_14_CRS_evening_S14_  3050000.0      +  3050000.0_+NC_000067.7     0   
2  Sample_14_CRS_evening_S14_  3050000.0      -  3050000.0_-NC_000067.7     0   
3  Sample_14_CRS_evening_S14_  3051000.0      +  3051000.0_+NC_000067.7     0   
4  Sample_14_CRS_evening_S14_  3051000.0      -  3051000.0_-NC_000067.7     0   

   Median_Normalized_Damage  
0                  0.000000  
1                  1.745443  
2                  1.944181  
3                  3.758130  
4                  6.334267  
Created pivot table with 20 samples and 5300115 genomic features
Extracted metadata with factors: treatment (['Ctrl' 'CRS']) and timepoint (['morning' 'evening'])
                       Sample treatment tim

FeatureID,1000000.0_+NC_000087.8,1000000.0_-NC_000087.8,10000000.0_+NC_000067.7,10000000.0_+NC_000068.8,10000000.0_+NC_000069.7,10000000.0_+NC_000070.7,10000000.0_+NC_000071.7,10000000.0_+NC_000072.7,10000000.0_+NC_000073.7,10000000.0_+NC_000074.7,...,99999000.0_-NC_000073.7,99999000.0_-NC_000074.7,99999000.0_-NC_000075.7,99999000.0_-NC_000076.7,99999000.0_-NC_000077.7,99999000.0_-NC_000078.7,99999000.0_-NC_000079.7,99999000.0_-NC_000080.7,99999000.0_-NC_000081.7,99999000.0_-NC_000086.8
Sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Sample_01_Ctrl_morning_S1_,4.572128,4.748145,2.104067,0.0,0.491426,0.0,3.100413,0.0,0.0,0.0,...,0.756436,0.865145,0.0,0.891847,0.545205,0.0,1.632534,0.0,0.628171,0.0
Sample_02_CRS_morning_S2_,6.19492,7.877645,2.715111,0.0,0.475606,0.0,2.400484,0.0,0.0,0.0,...,0.0,0.0,0.582617,1.726274,0.527654,1.294705,0.78999,0.0,0.607949,0.0
Sample_03_Ctrl_morning_S3_,6.641426,3.941203,0.509391,2.08825,0.0,0.502079,2.702177,0.655841,0.606558,1.065325,...,0.0,0.0,0.437227,1.943232,0.395979,0.485808,3.557103,0.0,1.368711,1.380718
Sample_04_CRS_morning_S4_,6.097946,9.800611,0.0,3.994508,1.638564,0.0,0.0,1.00362,0.0,4.075615,...,0.0,0.0,0.0,4.956149,0.0,0.0,0.907227,0.806931,0.698171,0.0
Sample_05_Ctrl_morning_S5_,8.187601,7.591789,0.0,0.0,0.0,1.547417,4.16408,0.0,2.804135,0.0,...,0.0,1.936588,4.042628,0.0,1.830624,0.0,0.913588,2.437766,2.109197,0.0


Unnamed: 0,Sample,treatment,timepoint
0,Sample_01_Ctrl_morning_S1_,Ctrl,morning
1,Sample_02_CRS_morning_S2_,CRS,morning
2,Sample_03_Ctrl_morning_S3_,Ctrl,morning
3,Sample_04_CRS_morning_S4_,CRS,morning
4,Sample_05_Ctrl_morning_S5_,Ctrl,morning


In the sorted, unified z-score heatmap, we see the division of the groups more clearly. In the first group, where the bins are significant only for treatment, we see that 