In [35]:
#!/usr/bin/env python3
# Two-way ANOVA analysis for genomic oxidation data
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.stats.multitest import multipletests
from statsmodels.formula.api import ols
import statsmodels.api as sm
import os
import re
from tqdm import tqdm
import multiprocessing as mp
from functools import partial
import warnings
from statsmodels.tools.sm_exceptions import ValueWarning

# Suppress three warning categories for the whole kernel session (presumably to
# keep the per-bin model fits from flooding the notebook output).
# NOTE(review): these filters are global - RuntimeWarning and UserWarning raised
# by any other library or cell are hidden as well, not only those from statsmodels.
warnings.filterwarnings('ignore', category=RuntimeWarning)
warnings.filterwarnings('ignore', category=UserWarning)
# ValueWarning is the statsmodels-specific category imported above.
warnings.filterwarnings('ignore', category=ValueWarning)


In [36]:

def preprocess_data(file_path):
    """
    Read the raw oxidation CSV and reshape it into a sample-by-feature matrix.

    Parameters:
    -----------
    file_path : str or path-like
        Path to the CSV file with oxidation data. The file must provide the
        columns 'Sample', 'Bin', 'Strand', 'Chromosome' and
        'Median_Normalized_Damage'. If the path contains 'Normalized_<digits>'
        the digits are reported as the bin size; the value is informational
        only, so a path without that token is accepted as well.

    Returns:
    --------
    tuple
        (pivot_df, bin_ids, metadata_df)
        pivot_df: DataFrame with samples as rows and features as columns
                  (mean of 'Median_Normalized_Damage' per sample/feature)
        bin_ids: List of feature IDs (the pivot table's column labels)
        metadata_df: DataFrame with columns 'Sample', 'treatment', 'timepoint'
                     parsed from the sample names ("Unknown" when a name
                     contains neither expected label)
    """
    print(f"Reading data from {file_path}...")

    # The bin size is only reported, never used downstream, so a file name
    # without the 'Normalized_<digits>' token must not abort the analysis.
    bin_size_match = re.search(r'Normalized_(\d+)', str(file_path))
    if bin_size_match is not None:
        print(f"Detected bin size: {bin_size_match.group(1)}")
    else:
        print("Detected bin size: unknown (no 'Normalized_<digits>' token in file path)")

    # Read the CSV file
    df = pd.read_csv(file_path)

    # Create FeatureID as '<Bin>_<Strand><Chromosome>'.
    # Note: there is no separator between Strand and Chromosome; downstream
    # results are keyed by this exact format, so it is kept as is.
    df['FeatureID'] = df['Bin'].astype(str) + '_' + df['Strand'] + df['Chromosome'].astype(str)

    # Running occurrence number of each (Sample, FeatureID) pair. It is only
    # shown in the preview below; duplicates are averaged by the pivot table.
    df['Group'] = df.groupby(['Sample', 'FeatureID']).cumcount().astype(str)

    # Show sample of the processed data
    print("Sample of the processed dataframe:")
    print(df[['Sample', 'Bin', 'Strand', 'FeatureID', 'Group', 'Median_Normalized_Damage']].head(5))

    # Create pivot table: samples as rows, features as columns
    pivot_df = df.pivot_table(
        index='Sample',
        columns='FeatureID',
        values='Median_Normalized_Damage',
        aggfunc='mean'
    )

    # Feature IDs are the column labels of the pivot table
    bin_ids = pivot_df.columns.tolist()

    def _label_from_name(pattern, sample_name):
        """Return the first captured group of `pattern` in the name, else "Unknown"."""
        match = re.search(pattern, sample_name)
        return match.group(1) if match else "Unknown"

    # Extract treatment and time point from the sample names
    metadata_df = pd.DataFrame(index=pivot_df.index)
    metadata_df['treatment'] = metadata_df.index.map(
        lambda sample_name: _label_from_name(r'(CRS|Ctrl)', sample_name))
    metadata_df['timepoint'] = metadata_df.index.map(
        lambda sample_name: _label_from_name(r'(evening|morning)', sample_name))

    # Reset index to make Sample a standard column
    metadata_df = metadata_df.reset_index().rename(columns={'index': 'Sample'})

    print(f"Created pivot table with {pivot_df.shape[0]} samples and {pivot_df.shape[1]} genomic features")
    print(f"Extracted metadata with factors: treatment ({metadata_df['treatment'].unique()}) and timepoint ({metadata_df['timepoint'].unique()})")
    print(metadata_df.head(5))

    return pivot_df, bin_ids, metadata_df

In [37]:
def perform_two_way_anova(bin_data, factor1_name, factor2_name):
    """
    Perform a two-way ANOVA (two factors plus their interaction) on one bin.

    Parameters:
    -----------
    bin_data : pandas.DataFrame
        One row per sample with a 'measurement' column and one column per factor.
    factor1_name : str
        Column name of the first factor.
    factor2_name : str
        Column name of the second factor.

    Returns:
    --------
    dict
        Keys 'factor1_pvalue', 'factor2_pvalue', 'interaction_pvalue',
        'factor1_Fvalue', 'factor2_Fvalue', 'interaction_Fvalue'.
        Every value is NaN when the bin cannot be tested (fewer than two levels
        in a factor, an empty factor combination, or any error while fitting).
        The function never raises; failures are printed and reported as NaN.
    """
    result_keys = (
        'factor1_pvalue', 'factor2_pvalue', 'interaction_pvalue',
        'factor1_Fvalue', 'factor2_Fvalue', 'interaction_Fvalue',
    )

    def _nan_result():
        """Fresh all-NaN result, used for every 'cannot test this bin' outcome."""
        return dict.fromkeys(result_keys, np.nan)

    try:
        # The formula interface drops rows with a missing response before
        # fitting, so remove them first: otherwise the design checks below
        # would count samples that the model never sees and let a bin with an
        # empty cell through. dropna returns a new frame; the caller's data is
        # left untouched.
        bin_data = bin_data.dropna(subset=['measurement'])

        # Check if we have at least two levels for each factor
        factor1_levels = bin_data[factor1_name].nunique()
        factor2_levels = bin_data[factor2_name].nunique()

        if factor1_levels < 2 or factor2_levels < 2:
            print(f"Skipping bin: Insufficient factor levels ({factor1_levels}, {factor2_levels})")
            return _nan_result()

        # Check for complete cells - every factor combination needs data
        combinations = bin_data.groupby([factor1_name, factor2_name]).size()
        if 0 in combinations.values or combinations.shape[0] < factor1_levels * factor2_levels:
            print(f"Skipping bin: Incomplete factorial design")
            return _nan_result()

        # Main effects plus interaction
        formula = f"measurement ~ C({factor1_name}) + C({factor2_name}) + C({factor1_name}):C({factor2_name})"

        # Fit the model, silencing fit-time warnings for this bin only
        with warnings.catch_warnings():
            warnings.filterwarnings('ignore', category=RuntimeWarning)
            warnings.filterwarnings('ignore', category=ValueWarning)
            model = ols(formula, data=bin_data).fit()

        # Type III ANOVA table.
        # NOTE(review): the factors use the formula's default contrast coding.
        # Type III main-effect tests are usually paired with sum-to-zero
        # contrasts (e.g. C(x, Sum)); confirm the default coding is intended.
        anova_table = sm.stats.anova_lm(model, typ=3)

        # Pull p-value and F statistic for each term; NaN if the term is absent
        results = {}
        for factor, prefix in zip(
            [f'C({factor1_name})', f'C({factor2_name})', f'C({factor1_name}):C({factor2_name})'],
            ['factor1_', 'factor2_', 'interaction_']
        ):
            if factor in anova_table.index:
                results[f'{prefix}pvalue'] = anova_table.loc[factor, 'PR(>F)']
                results[f'{prefix}Fvalue'] = anova_table.loc[factor, 'F']
            else:
                results[f'{prefix}pvalue'] = np.nan
                results[f'{prefix}Fvalue'] = np.nan

        # Normalise missing or infinite statistics to NaN
        for key in results:
            if pd.isna(results[key]) or np.isinf(results[key]):
                results[key] = np.nan

        return results

    except Exception as e:
        # Best effort by design: one failing bin must not stop the whole scan.
        print(f"Error in ANOVA: {str(e)}")
        return _nan_result()

In [38]:



def process_bin(args):
    """
    Run the two-way ANOVA for one bin.

    Takes a single tuple so it can be mapped directly by a worker pool:
    (bin_idx, bin_id, pivot_df, metadata_df, factor1_name, factor2_name).
    Returns the ANOVA result dict with an added 'bin_id' entry.
    """
    bin_idx, bin_id, pivot_df, metadata_df, factor1_name, factor2_name = args

    # Column `bin_idx` of the pivot table, paired with the sample names
    measurements = pd.DataFrame({
        'measurement': pivot_df.iloc[:, bin_idx].values,
        'Sample': pivot_df.index.tolist(),
    })

    # Attach the factor columns by joining on the sample name
    annotated = measurements.merge(metadata_df, on='Sample')

    anova_result = perform_two_way_anova(annotated, factor1_name, factor2_name)
    anova_result['bin_id'] = bin_id
    return anova_result

In [39]:


def analyze_genomic_bins(file_path, factor1_name='treatment', factor2_name='timepoint', 
                       output_prefix=None, n_cores=None):
    """
    Analyze thousands of genomic bins with two-way ANOVA

    Parameters:
    -----------
    file_path : str
        Path to CSV file with oxidation data
    factor1_name : str
        Name of the first factor (default: 'treatment')
    factor2_name : str
        Name of the second factor (default: 'timepoint')
    output_prefix : str
        Prefix for output files (default: input file name without extension
        plus '_anova'); results go to '<output_prefix>_results.csv'
    n_cores : int, optional
        Number of worker processes (default: CPU count minus one, at least 1).
        With a value of 1 or less the bins are processed in the current
        process without creating a pool.

    Returns:
    --------
    pandas.DataFrame
        One row per bin with the raw p-values and F statistics, the
        Benjamini-Hochberg adjusted p-values ('<col>_adj') and boolean
        '<factor>_significant' flags (adjusted p < 0.05).
    """
    # Set default output prefix if not provided
    if output_prefix is None:
        output_prefix = os.path.splitext(os.path.basename(file_path))[0] + "_anova"

    # Process the data
    pivot_df, bin_ids, metadata_df = preprocess_data(file_path)

    # Determine number of cores for parallel processing
    if n_cores is None:
        n_cores = max(1, mp.cpu_count() - 1)  # Leave one core free

    print(f"Processing {len(bin_ids)} bins using {n_cores} cores...")

    # Each task carries only its own column (as a one-column frame, hence the
    # positional index 0). Shipping the full pivot table with every task would
    # serialise the entire matrix once per bin when sending work to the pool.
    tasks = (
        (0, bin_id, pivot_df.iloc[:, [position]], metadata_df, factor1_name, factor2_name)
        for position, bin_id in enumerate(bin_ids)
    )

    if n_cores <= 1:
        # Serial path: no worker processes, same per-bin function
        results_list = list(tqdm(map(process_bin, tasks), total=len(bin_ids)))
    else:
        # Hand bins to workers in batches to cut inter-process round trips
        chunksize = max(1, min(256, len(bin_ids) // (n_cores * 4)))
        with mp.Pool(n_cores) as pool:
            # Results must be collected before the pool is torn down
            results_list = list(tqdm(
                pool.imap(process_bin, tasks, chunksize=chunksize),
                total=len(bin_ids)
            ))

    # Convert results to DataFrame
    results_df = pd.DataFrame(results_list)

    # Apply multiple testing correction, separately for each effect
    for col in ['factor1_pvalue', 'factor2_pvalue', 'interaction_pvalue']:
        # Benjamini-Hochberg FDR correction over the bins that produced a p-value
        tested = results_df[col].notna().to_numpy()
        corrected = np.full(len(results_df), np.nan)

        if tested.sum() > 0:
            corrected[tested] = multipletests(results_df.loc[tested, col], method='fdr_bh')[1]

        results_df[f'{col}_adj'] = corrected

        # Significance flag; untested bins (NaN) compare as False
        results_df[f'{col.replace("pvalue", "significant")}'] = results_df[f'{col}_adj'] < 0.05

    # Create output directory
    os.makedirs(os.path.dirname(output_prefix) or '.', exist_ok=True)

    # Write results to file
    results_df.to_csv(f"{output_prefix}_results.csv", index=False)

    print(f"Analysis complete. Found:")
    print(f"  - {results_df['factor1_significant'].sum()} bins significant for {factor1_name}")
    print(f"  - {results_df['factor2_significant'].sum()} bins significant for {factor2_name}")
    print(f"  - {results_df['interaction_significant'].sum()} bins significant for interaction")

    return results_df

In [40]:
def visualize_anova_results(results_df, factor1_name, factor2_name, output_prefix):
    """
    Create visualizations of ANOVA results

    Saves four kinds of PNG into 'images/anova_results' (Manhattan-like plot,
    one volcano plot per effect, p-value histogram, significance pie charts)
    and writes '<output_prefix>_significance_summary.csv' with the counts of
    significant bins and their overlaps.

    Parameters:
    -----------
    results_df : pandas.DataFrame
        DataFrame with ANOVA results from analyze_genomic_bins()
    factor1_name : str
        Name of the first factor
    factor2_name : str
        Name of the second factor
    output_prefix : str
        Prefix for the summary CSV (plots always go to 'images/anova_results')

    Returns:
    --------
    dict
        Dictionary with matplotlib figure objects. The figures are left open
        so the caller can display or modify them.
    """
    # Create output directory for plots
    plots_dir = "images/anova_results"
    os.makedirs(plots_dir, exist_ok=True)

    print("Creating visualizations...")
    figures = {}

    # Display label and results_df column prefix of each tested effect
    effect_labels = [factor1_name, factor2_name, 'Interaction']
    effect_prefixes = ['factor1', 'factor2', 'interaction']

    # 1. Manhattan-like plot of p-values
    plt.figure(figsize=(12, 6))
    bin_positions = range(len(results_df))
    for label, prefix in zip(effect_labels, effect_prefixes):
        plt.scatter(bin_positions, -np.log10(results_df[f'{prefix}_pvalue']),
                    alpha=0.5, s=10, label=label)

    # Add significance thresholds
    plt.axhline(-np.log10(0.05), linestyle='--', color='red', label='p=0.05')
    plt.axhline(-np.log10(0.05/len(results_df)), linestyle='--', color='blue', label='Bonferroni')

    plt.xlabel('Genomic Bin Index')
    plt.ylabel('-log10(p-value)')
    plt.title('Manhattan Plot of Two-way ANOVA p-values')
    plt.legend()
    plt.tight_layout()

    plt.savefig(f"{plots_dir}/manhattan_plot.png", dpi=300)
    figures['manhattan'] = plt.gcf()

    # 2. Volcano plots for each effect
    for label, prefix in zip(effect_labels, effect_prefixes):
        pval_col = f'{prefix}_pvalue'
        fval_col = f'{prefix}_Fvalue'
        sig_col = f'{prefix}_significant'

        plt.figure(figsize=(8, 6))

        # Only bins with both a p-value and an F statistic can be drawn
        mask = ~np.isnan(results_df[pval_col]) & ~np.isnan(results_df[fval_col])

        # Scatter plot with color indicating significance
        plt.scatter(
            results_df.loc[mask, fval_col],
            -np.log10(results_df.loc[mask, pval_col]),
            c=results_df.loc[mask, sig_col].map({True: 'red', False: 'black'}),
            alpha=0.5,
            s=15
        )

        plt.axhline(-np.log10(0.05), linestyle='--', color='red', label='p=0.05')
        plt.xlabel('F value (effect size)')
        plt.ylabel('-log10(p-value)')
        plt.title(f'Volcano Plot: {label}')
        # The threshold line carries a label, so show it
        plt.legend()
        plt.tight_layout()

        volcano_key = f"volcano_{label.lower().replace(' ', '_')}"
        plt.savefig(f"{plots_dir}/{volcano_key}.png", dpi=300)
        figures[volcano_key] = plt.gcf()

    # 3. Distribution of p-values
    plt.figure(figsize=(10, 6))

    hist_edges = np.linspace(0, 1, 21)  # 20 bins from 0 to 1

    for label, prefix in zip(effect_labels, effect_prefixes):
        # Skip NaN values
        p_values = results_df[f'{prefix}_pvalue'].dropna()
        if len(p_values) > 0:
            plt.hist(p_values, bins=hist_edges, alpha=0.5, label=label)

    plt.xlabel('p-value')
    plt.ylabel('Count')
    plt.title('Distribution of p-values')
    plt.legend()
    plt.tight_layout()

    plt.savefig(f"{plots_dir}/pvalue_distribution.png", dpi=300)

    figures['pvalue_distribution'] = plt.gcf()

    # 4. Pie charts for significant results, one panel per effect
    plt.figure(figsize=(15, 5))
    total_bins = len(results_df)

    for panel, (label, prefix, color) in enumerate(
        zip(effect_labels, effect_prefixes, ['red', 'blue', 'purple']), start=1
    ):
        n_significant = results_df[f'{prefix}_significant'].sum()
        plt.subplot(1, 3, panel)
        plt.pie([n_significant, total_bins - n_significant],
               labels=[f'Significant\n{n_significant} ({n_significant/total_bins:.1%})',
                      f'Not significant\n{total_bins - n_significant} ({1-n_significant/total_bins:.1%})'],
               colors=[color, 'lightgray'], autopct='%1.1f%%')
        plt.title(f'Significant bins: {label}')

    plt.tight_layout()
    plt.savefig(f"{plots_dir}/significance_summary.png", dpi=300)

    figures['significance_summary'] = plt.gcf()

    # 5. Overlap between significant sets (Venn-like table)
    sig_df = pd.DataFrame({
        factor1_name: results_df['factor1_significant'],
        factor2_name: results_df['factor2_significant'],
        'Interaction': results_df['interaction_significant']
    })

    # Collect the rows first and build the frame once, instead of growing a
    # DataFrame with repeated pd.concat calls.
    summary_rows = [('Total bins', total_bins)]

    # Individual significances
    for label in effect_labels:
        summary_rows.append((f'Significant for {label}', sig_df[label].sum()))

    # Pairwise overlaps
    for position, first in enumerate(effect_labels):
        for second in effect_labels[position + 1:]:
            both = (sig_df[first] & sig_df[second]).sum()
            summary_rows.append((f'Significant for both {first} and {second}', both))

    # Triple overlap
    all_three = (sig_df[factor1_name] & sig_df[factor2_name] & sig_df['Interaction']).sum()
    summary_rows.append(('Significant for all three factors', all_three))

    summary = pd.DataFrame(summary_rows, columns=['Category', 'Count'])

    # Save summary in current directory
    summary.to_csv(f"{output_prefix}_significance_summary.csv", index=False)

    print("Visualizations complete. All plots saved to", plots_dir)

    return figures

def plot_significant_bins(file_path, results_df, factor1_name, factor2_name, output_prefix, max_bins=10):
    """
    Plot significant bins with their oxidation patterns across factors

    For each effect (factor 1, factor 2, interaction) the significant bins are
    ranked by raw p-value, the top `max_bins` are written to
    '<output_prefix>_<effect>_top_bins.csv', and one point plot per bin is
    saved into 'images/anova_results'.

    Parameters:
    -----------
    file_path : str
        Path to the original data file (it is read again to obtain the
        per-sample measurements)
    results_df : pandas.DataFrame
        ANOVA results from analyze_genomic_bins()
    factor1_name : str
        Name of the first factor
    factor2_name : str
        Name of the second factor
    output_prefix : str
        Prefix for output files
    max_bins : int
        Maximum number of top bins to plot

    Returns:
    --------
    dict
        Maps '<effect>_top_bins' to the DataFrame of top bins for that effect;
        effects without significant bins are omitted.
    """
    # Process the data again to get the pivot table and metadata
    pivot_df, _, metadata_df = preprocess_data(file_path)

    # Create directory for bin plots
    bin_plots_dir = "images/anova_results"
    os.makedirs(bin_plots_dir, exist_ok=True)

    print(f"Plotting top significant bins...")

    # Display name, p-value column and significance flag of each effect
    factor_cols = [
        (factor1_name, 'factor1_pvalue', 'factor1_significant'),
        (factor2_name, 'factor2_pvalue', 'factor2_significant'),
        ('Interaction', 'interaction_pvalue', 'interaction_significant')
    ]

    # Dictionary to store DataFrames with top bins
    top_bins_dict = {}

    for name, pval_col, sig_col in factor_cols:
        # Get significant bins
        sig_bins = results_df[results_df[sig_col] == True].copy()

        if len(sig_bins) == 0:
            print(f"No significant bins found for {name}")
            continue

        # Smallest p-values first, then keep the top ones
        top_bins = sig_bins.sort_values(pval_col).head(max_bins)
        top_bins_dict[f"{name.lower()}_top_bins"] = top_bins

        # Save to file in current directory
        top_bins.to_csv(f"{output_prefix}_{name.lower()}_top_bins.csv", index=False)

        # Plot each top bin
        for rank, row in enumerate(top_bins.itertuples(), start=1):
            # Bound before the try block so the error message can always name the bin
            bin_id = row.bin_id

            # Check if bin exists in pivot table
            if bin_id not in pivot_df.columns:
                print(f"Warning: Bin {bin_id} not found in data")
                continue

            fig = None
            try:
                # Measurements of this bin, joined with the sample factors
                bin_data = pd.DataFrame({
                    'Sample': pivot_df.index,
                    'measurement': pivot_df[bin_id].values
                })
                bin_data = pd.merge(bin_data, metadata_df, on='Sample')

                # Create the plot
                fig = plt.figure(figsize=(10, 6))

                # Group means with standard-error bars
                sns.pointplot(data=bin_data, x=factor1_name, y='measurement',
                              hue=factor2_name, dodge=True, errorbar=('se', 1),
                              capsize=0.2)

                # Add title and p-values
                plt.title(f"Bin: {bin_id}")
                plt.suptitle(
                    f"{factor1_name} p = {row.factor1_pvalue:.3e}, "
                    f"{factor2_name} p = {row.factor2_pvalue:.3e}, "
                    f"Interaction p = {row.interaction_pvalue:.3e}",
                    y=0.92, fontsize=9
                )

                plt.ylabel('Oxidation Level')
                plt.tight_layout()

                # Save the figure in images/anova_results
                output_file = f"{bin_plots_dir}/{name.lower()}_top{rank}_{bin_id.replace(':', '_')}"
                plt.savefig(f"{output_file}.png", dpi=300)

            except Exception as e:
                # Best effort: a bin that cannot be drawn must not stop the others
                print(f"Error plotting bin {bin_id}: {str(e)}")
            finally:
                # Close the figure on failure too, so figures do not pile up
                if fig is not None:
                    plt.close(fig)

    print(f"Bin plots complete. All plots saved to {bin_plots_dir}")
    return top_bins_dict

def create_heatmap(file_path, results_df, factor1_name, factor2_name, output_prefix, max_bins=50):
    """
    Create heatmaps of the top significant bins

    For each effect (factor 1, factor 2, interaction) the significant bins are
    ranked by raw p-value and the top `max_bins` are drawn as a column-clustered
    heatmap with the samples ordered by the two factors. Images are written to
    'images/anova_results/<effect>_heatmap.png'; if the clustered heatmap
    fails, a plain heatmap is written to '<effect>_heatmap_simple.png'.

    Parameters:
    -----------
    file_path : str
        Path to the original data file (read again for the measurements)
    results_df : pandas.DataFrame
        ANOVA results from analyze_genomic_bins()
    factor1_name : str
        Name of the first factor
    factor2_name : str
        Name of the second factor
    output_prefix : str
        Accepted for signature parity with the other plotting functions;
        not used here
    max_bins : int
        Maximum number of top bins per heatmap

    Returns:
    --------
    None
    """
    # Process the data again to get the pivot table and metadata
    pivot_df, _, metadata_df = preprocess_data(file_path)
    sample_factors = metadata_df.set_index('Sample')

    # Create directory for heatmaps
    heatmap_dir = "images/anova_results"
    os.makedirs(heatmap_dir, exist_ok=True)

    print(f"Creating heatmaps of top significant bins...")

    # Display name, p-value column and significance flag of each effect
    factor_cols = [
        (factor1_name, 'factor1_pvalue', 'factor1_significant'),
        (factor2_name, 'factor2_pvalue', 'factor2_significant'),
        ('Interaction', 'interaction_pvalue', 'interaction_significant')
    ]

    for name, pval_col, sig_col in factor_cols:
        # Get significant bins
        sig_bins = results_df[results_df[sig_col] == True].copy()

        if len(sig_bins) == 0:
            print(f"No significant bins found for {name}, skipping heatmap.")
            continue

        # Smallest p-values first, then keep the top ones
        top_bins = sig_bins.sort_values(pval_col).head(max_bins)
        top_bin_ids = top_bins['bin_id'].tolist()

        # Keep only the bins that are present in the pivot table
        existing_bins = [b for b in top_bin_ids if b in pivot_df.columns]

        if len(existing_bins) < 2:
            print(f"Not enough valid bins for {name} to create a heatmap, need at least 2.")
            continue

        bins_pivot = pivot_df[existing_bins]

        # Check for NaN values and handle them
        if bins_pivot.isnull().values.any():
            print(f"Warning: NaN values found in the data for {name}. Filling with column means.")
            bins_pivot = bins_pivot.fillna(bins_pivot.mean())

        # Attach the factors (aligned on the sample index) to order the rows
        heatmap_df = bins_pivot.copy()
        heatmap_df[factor1_name] = sample_factors[factor1_name]
        heatmap_df[factor2_name] = sample_factors[factor2_name]
        heatmap_df = heatmap_df.sort_values([factor1_name, factor2_name])

        # Extract the factors for the row colors
        row_colors = pd.DataFrame({
            factor1_name: heatmap_df[factor1_name],
            factor2_name: heatmap_df[factor2_name]
        })

        # Remove the factors from the dataframe before plotting
        heatmap_data = heatmap_df.drop([factor1_name, factor2_name], axis=1)

        # Check if we have enough data to cluster
        if heatmap_data.shape[1] < 2 or heatmap_data.shape[0] < 2:
            print(f"Not enough data to create a heatmap for {name}.")
            continue

        # One color per factor level
        factor1_values = row_colors[factor1_name].unique()
        factor2_values = row_colors[factor2_name].unique()

        factor1_cmap = dict(zip(factor1_values, sns.color_palette("Set1", len(factor1_values))))
        factor2_cmap = dict(zip(factor2_values, sns.color_palette("Set2", len(factor2_values))))

        row_colors_mapped = pd.DataFrame({
            factor1_name: row_colors[factor1_name].map(factor1_cmap),
            factor2_name: row_colors[factor2_name].map(factor2_cmap)
        })

        fig_size = (max(10, len(existing_bins) * 0.4), max(8, len(heatmap_data) * 0.4))

        try:
            # clustermap creates its own figure, so no plt.figure() beforehand
            # (a figure opened here would stay blank and never be closed).
            # NOTE(review): z_score=0 standardises each row, i.e. each sample
            # across bins; confirm that per-bin scaling (z_score=1) is not
            # what was intended.
            g = sns.clustermap(
                heatmap_data,
                cmap="viridis",
                z_score=0,  # Z-score normalize the rows
                row_colors=row_colors_mapped,
                col_cluster=True,
                row_cluster=False,
                xticklabels=True,
                yticklabels=False,
                figsize=fig_size
            )
            cluster_fig = g.ax_heatmap.figure

            try:
                # Add title
                cluster_fig.suptitle(f"Heatmap of top {len(existing_bins)} bins significant for {name}", y=1.02)

                # Invisible bars act as legend handles for the factor levels
                for factor, cmap in zip(
                    [factor1_name, factor2_name],
                    [factor1_cmap, factor2_cmap]
                ):
                    for label, color in cmap.items():
                        g.ax_row_dendrogram.bar(0, 0, color=color, label=f"{factor}: {label}", linewidth=0)

                # A single legend lists the levels of both factors; an axes
                # keeps only its most recent legend, so it is built once.
                g.ax_row_dendrogram.legend(
                    loc="center left", ncol=1,
                    bbox_to_anchor=(1, 1.01 + 0.05 * len(factor1_values))
                )

                # Save the figure in images/anova_results
                cluster_fig.savefig(f"{heatmap_dir}/{name.lower()}_heatmap.png", dpi=300, bbox_inches='tight')
            finally:
                # Release the clustermap figure whether or not saving worked
                plt.close(cluster_fig)

        except Exception as e:
            print(f"Error creating heatmap for {name}: {str(e)}")
            # Try a simpler heatmap without clustering
            fallback_fig = None
            try:
                fallback_fig = plt.figure(figsize=fig_size)
                sns.heatmap(heatmap_data, cmap="viridis", xticklabels=True, yticklabels=False)
                plt.title(f"Heatmap of top {len(existing_bins)} bins significant for {name}")
                plt.tight_layout()
                plt.savefig(f"{heatmap_dir}/{name.lower()}_heatmap_simple.png", dpi=300)
            except Exception as e2:
                print(f"Could not create even a simple heatmap for {name}: {str(e2)}")
            finally:
                if fallback_fig is not None:
                    plt.close(fallback_fig)

    print(f"Heatmaps complete. All heatmaps saved to {heatmap_dir}")

In [None]:
# 1. Load and preprocess data
file_path = 'data_normalized/cleaned_Normalized_100000.csv'
pivot_df, bin_ids, metadata_df = preprocess_data(file_path)

# 2. View the first few rows of processed data
display(pivot_df.head())
display(metadata_df.head())

# 3. Configure and run ANOVA analysis
factor1_name = 'treatment'  # CRS vs Ctrl
factor2_name = 'timepoint'  # morning vs evening
output_prefix = 'oxidation_analysis'
n_cores = 4  # Adjust based on your system

# 4. Run the analysis
# Pass the configured prefix and core count explicitly; otherwise the function
# falls back to its defaults and the results CSV ends up under a different
# prefix than the summary and top-bin files written in steps 6 and 7.
results_df = analyze_genomic_bins(
    file_path, factor1_name, factor2_name,
    output_prefix=output_prefix, n_cores=n_cores)

# 5. Examine results
display(results_df.head())
print(f"Significant for {factor1_name}: {results_df['factor1_significant'].sum()}")
print(f"Significant for {factor2_name}: {results_df['factor2_significant'].sum()}")
print(f"Significant for interaction: {results_df['interaction_significant'].sum()}")

# 6. Create visualizations
visualize_anova_results(results_df, factor1_name, factor2_name, output_prefix)

# 7. Visualize top significant bins
plot_significant_bins(file_path, results_df, factor1_name, factor2_name, output_prefix)

# 8. Create heatmaps
create_heatmap(file_path, results_df, factor1_name, factor2_name, output_prefix)

Reading data from data_normalized/cleaned_Normalized_100000.csv...
Detected bin size: 100000
Sample of the processed dataframe:
                       Sample       Bin Strand              FeatureID Group  \
0  Sample_14_CRS_evening_S14_       0.0      +       0.0_+NC_000067.7     0   
1  Sample_14_CRS_evening_S14_       0.0      -       0.0_-NC_000067.7     0   
2  Sample_14_CRS_evening_S14_  100000.0      +  100000.0_+NC_000067.7     0   
3  Sample_14_CRS_evening_S14_  100000.0      -  100000.0_-NC_000067.7     0   
4  Sample_14_CRS_evening_S14_  200000.0      +  200000.0_+NC_000067.7     0   

   Median_Normalized_Damage  
0                       0.0  
1                       0.0  
2                       0.0  
3                       0.0  
4                       0.0  
Created pivot table with 20 samples and 54486 genomic features
Extracted metadata with factors: treatment (['Ctrl' 'CRS']) and timepoint (['morning' 'evening'])
                       Sample treatment timepoint
0  Sam

FeatureID,0.0_+NC_000067.7,0.0_+NC_000068.8,0.0_+NC_000069.7,0.0_+NC_000070.7,0.0_+NC_000071.7,0.0_+NC_000072.7,0.0_+NC_000073.7,0.0_+NC_000074.7,0.0_+NC_000075.7,0.0_+NC_000076.7,...,99900000.0_-NC_000073.7,99900000.0_-NC_000074.7,99900000.0_-NC_000075.7,99900000.0_-NC_000076.7,99900000.0_-NC_000077.7,99900000.0_-NC_000078.7,99900000.0_-NC_000079.7,99900000.0_-NC_000080.7,99900000.0_-NC_000081.7,99900000.0_-NC_000086.8
Sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Sample_01_Ctrl_morning_S1_,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.849249,0.929127,1.282609,0.775262,0.943953,0.824291,1.054163,0.810096,1.004788,0.720863
Sample_02_CRS_morning_S2_,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.944873,1.036385,1.051391,0.800325,1.147813,1.307092,1.216669,0.945022,1.055067,0.727344
Sample_03_Ctrl_morning_S3_,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.015057,0.834947,1.252251,1.013523,0.874564,0.999333,1.050962,1.013886,0.977799,0.824327
Sample_04_CRS_morning_S4_,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.921588,0.743868,1.25416,0.697528,0.941538,0.838626,0.931485,0.739589,0.861289,0.613679
Sample_05_Ctrl_morning_S5_,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.905597,0.863649,1.333553,0.942068,0.954911,0.922569,1.187176,0.955254,0.815876,0.549317


Unnamed: 0,Sample,treatment,timepoint
0,Sample_01_Ctrl_morning_S1_,Ctrl,morning
1,Sample_02_CRS_morning_S2_,CRS,morning
2,Sample_03_Ctrl_morning_S3_,Ctrl,morning
3,Sample_04_CRS_morning_S4_,CRS,morning
4,Sample_05_Ctrl_morning_S5_,Ctrl,morning


Reading data from data_normalized/cleaned_Normalized_100000.csv...
Detected bin size: 100000
Sample of the processed dataframe:
                       Sample       Bin Strand              FeatureID Group  \
0  Sample_14_CRS_evening_S14_       0.0      +       0.0_+NC_000067.7     0   
1  Sample_14_CRS_evening_S14_       0.0      -       0.0_-NC_000067.7     0   
2  Sample_14_CRS_evening_S14_  100000.0      +  100000.0_+NC_000067.7     0   
3  Sample_14_CRS_evening_S14_  100000.0      -  100000.0_-NC_000067.7     0   
4  Sample_14_CRS_evening_S14_  200000.0      +  200000.0_+NC_000067.7     0   

   Median_Normalized_Damage  
0                       0.0  
1                       0.0  
2                       0.0  
3                       0.0  
4                       0.0  
Created pivot table with 20 samples and 54486 genomic features
Extracted metadata with factors: treatment (['Ctrl' 'CRS']) and timepoint (['morning' 'evening'])
                       Sample treatment timepoint
0  Sam

 69%|██████▉   | 37548/54486 [13:45<06:13, 45.40it/s]