In [2]:
import pandas as pd
import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
import os
import numpy as np
import seaborn as sns
from matplotlib.colors import ListedColormap

In [3]:
# Load statistical summeries per database
ddinter_pathway_df = pd.read_csv('results/pathway_syntox_analysis/ddinter_synergy_toxicity_statistical_summary.csv')
drugbank_pathway_df = pd.read_csv('results/pathway_syntox_analysis/drugbank_synergy_toxicity_statistical_summary.csv')

# Combine both dataframes
ddinter_pathway_df["Database"] = "DDInter"
drugbank_pathway_df["Database"] = "DrugBank"
combined_pathway_df = pd.concat([ddinter_pathway_df, drugbank_pathway_df], ignore_index=True)

In [5]:
# Define expected metrics and databases for looping
metrics = [
    'Drug Target Jaccard Similarity', 
    'Lowest Reactome Pathways Jaccard Similarity', 
    'All Reactome Pathways Jaccard Similarity'
    ]

databases = ['DrugBank', 'DDInter'] 

# Retrieve all necessary data for one test/database/metric combination 
def get_stats_data(df, metric, db, test_type, comparison=''):
    """
    Filters the DataFrame for a specific test result and extracts the Statistic, 
    P-value, Effect Size, and Effect Symbol.
    """
    
    # Determine which P-value column to use and the Effect Size Symbol
    if test_type == 'Kruskal-Wallis':
        p_col = 'P_value'
        e_symbol = '\\eta^2'
    elif test_type == 'Dunn Post-Hoc':
        p_col = 'P_value_Corrected'
        e_symbol = '\\delta'
    elif test_type == 'Jonckheere-Terpstra':
        p_col = 'P_value'
        e_symbol = 'r'
    else:
        return np.nan, np.nan, np.nan, ''

    # Filter the DataFrame
    print(df[(df['Synergy_Score'] == metric) &
        (df['Database'] == db)].head(3))
    result_row = df[
        (df['Synergy_Score'] == metric) &
        (df['Database'] == db) &
        (df['Test_Type'] == test_type) &
        (df['Comparison'] == comparison)
    ]
    print(result_row )
    if result_row.empty:
        # Return standard NaN placeholders if no data found
        return np.nan, np.nan, np.nan, e_symbol
    
    # Extract values
    row = result_row.iloc[0]
    
    # Test Statistic (assumed to be H for KW, Z for JT, and NaN/blank for Dunn)
    stat = row.get('Test_Statistic', np.nan)
    
    p = row.get(p_col, np.nan)
    e = row.get('Effect_Size_Value', np.nan)

    return stat, p, e, e_symbol


# Function to format the P-value and Effect Size for LaTeX
def format_p_effect(p_value, effect_size, effect_symbol):
    """Formats P-value and Effect Size into the string P-value (Effect Size Symbol = Value)."""
    
    # Check for missing data
    if pd.isna(p_value) or pd.isna(effect_size):
         return '-'
         
    # P-value Formatting
    if p_value >= 0.05:
        # For non-significant P-values: e.g., 0.10, >0.99. All must be in math mode.
        p_str = "${:.3f}$".format(p_value)
    else:
        # Handle P-value of zero or near-zero
        if p_value <= 1e-300: # Use a safe threshold
            p_str = "${:.3f}$".format(p_value)
        else:
            # For significant P-values: use scientific notation. All must be in math mode.
            exponent = int(np.floor(np.log10(p_value)))
            mantissa = p_value / 10**exponent
            p_str = "${:.3f} \\times 10^{{{:d}}}$".format(mantissa, exponent)
    
    # Effect Size Formatting
    # Ensure correct sign formatting for the effect size
    sign = "+" if effect_size >= 0 else ""
    effect_str = "{}{:.3f}".format(sign, effect_size)
    
    # Combine P-value and effect size with the correct symbol. All symbols and values must be in math mode.
    return f"{p_str} ($\\mathbf{{{effect_symbol}}}={effect_str}$)"

# --- Function to format the Test Statistic ---
def format_statistic(stat):
    """Formats the Test Statistic (H or Z) into a clean LaTeX string."""
    if not isinstance(stat, str):
        return '-'
    
    stat_type = str(stat.split('=')[0])
    stat_val = float(stat.split('=')[1])

    abs_stat = abs(stat_val)
    if abs_stat >= 10:
        exponent = int(np.floor(np.log10(abs_stat)))
        mantissa = abs_stat / 10**exponent
        sign = "-" if stat_val < 0 else ""
        return f"${sign}{mantissa:.3f} \\times 10^{{{exponent:d}}}$"
    else:
        # Fixed decimal for smaller stats, enclosed in math mode
        return "${:.3f}$".format(stat_val)

# Generate LaTex Table
def generate_latex_table_from_df(df, metrics_list, databases_list):
    """Generates the final LaTeX code string for the summary table based on the DataFrame."""
    
    # Define the 6 Test rows for each Synergy Score, matching the DataFrame columns
    test_rows_map = [
        ('Kruskal-Wallis', 'Kruskal-Wallis', 'All Groups'),
        ('Dunn: Major/Minor', 'Dunn Post-Hoc', 'Major vs Minor'),
        ('Dunn: Major/Moderate', 'Dunn Post-Hoc', 'Major vs Moderate'),
        ('Dunn: Moderate/Minor', 'Dunn Post-Hoc', 'Moderate vs Minor'),
        ('Jonckheere-Terpstra: Increasing Toxicity', 'Jonckheere-Terpstra', 'Increasing Trend (Minor->Major)'),
        ('Jonckheere-Terpstra: Decreasing Toxicity', 'Jonckheere-Terpstra', 'Decreasing Trend (Major->Minor)'),
    ]
    
    # Start of LaTeX Table Code (Using your requested structure)
    latex_code = [
        "\\begin{table}[htbp]",
        "\\centering",
        "\\small", # Requested formatting
        "\\caption{\\textbf{Drug target and pathway overlap toxicity analysis}. Results for statistical analysis of drug targets and pathway overlap correlated with toxicity categories. Each section refers to a different Jaccard Similarity metric. The first section computed the overlap between the sets of drug targets in a given drug combination, the second the sets of pathways, and the third the sets of pathways when restricted to the lowest level of Reactome. The results for the Kruskal-Wallis, Dunn with Bonferroni correction, and Jonckheere-Terpstra tests are included for both the DrugBank and DDInter datasets. The Jonckheere-Terpestra Increasing Toxicity test assesses whether the distribution of the Jaccard Similarity increases when toxicity categories become more severe, while the Jonckheere-Terpstra Decreasing Toxicity test evaluates if the Jaccard Similarity distributions increase when the toxicity categories decrease in severity.}",
        "\\label{tab:s_targ_tox}",
        "\\vspace{0.5em}", # Requested formatting
        "\\resizebox{\\textwidth}{!}{", 
        "\\scriptsize", # Use smallest common font size
        "\\begin{tabular}{ll|rr|rr}", # l=Score, l=Test, r r = DrugBank, r r = DDInter
        "\\toprule",
    ]
    
    # Header Rows
    header_row_1 = ["\\textbf{Overlap Metric}", "\\textbf{Test}", 
                    "\\multicolumn{2}{c|}{\\textbf{DrugBank}}", 
                    "\\multicolumn{2}{c}{\\textbf{DDInter}}"]
    latex_code.append(" & ".join(header_row_1) + " \\\\")
    
    # P-value column now includes Effect Size
    header_row_2 = ["\\textbf{Score}", "", "\\textbf{Test Statistic}", 
                    "\\textbf{P-value (Effect Size)}", 
                    "\\textbf{Test Statistic}", 
                    "\\textbf{P-value (Effect Size)}"]
    latex_code.append(" & ".join(header_row_2) + " \\\\")

    latex_code.append("\\midrule")
    
    # Data Rows 
    for i, metric in enumerate(metrics_list):
        # Escape underscores in synergy scores (e.g., S\_max) for text mode
        escaped_metric = metric.replace('_', '\\_') 
        score_cell = f"\\multirow{{{len(test_rows_map)}}}{{*}}{{\\textbf{{{escaped_metric}}}}}"
        
        for j, (display_name, test_type, comparison_col) in enumerate(test_rows_map):
            row_data = {}
            is_statistic_required = test_type == 'Kruskal-Wallis' or test_type == 'Jonckheere-Terpstra'

            for db in databases_list: # Loop over ['DrugBank', 'DDInter']
                # 1. Get Data
                stat, p, e, e_symbol = get_stats_data(df, metric, db, test_type, comparison_col)
                print(f"Retrived for {db}, {metric}, {test_type}, {comparison_col}: stat={stat}, p={p}, e={e}, e_symbol={e_symbol}")
                # 2. Format Strings
                stat_str = format_statistic(stat) if is_statistic_required else '-'
                p_e_str = format_p_effect(p, e, e_symbol)

                row_data[db] = {
                    'stat': stat_str,
                    'p_e': p_e_str
                }
            # 4. Construct Row
            row_cells = [
                score_cell if j == 0 else "", 
                display_name,
                row_data['DrugBank']['stat'],
                row_data['DrugBank']['p_e'],
                row_data['DDInter']['stat'],
                row_data['DDInter']['p_e']
            ]
            latex_code.append(" & ".join(row_cells) + " \\\\")

        # Add horizontal rule after each metric group, except the last one
        if i < len(metrics_list) - 1:
            latex_code.append("\\midrule")
            
    # --- Footer ---
    latex_code.append("\\bottomrule")
    latex_code.append("\\end{tabular}")
    latex_code.append("}") # Closing bracket for \resizebox
    latex_code.append("\\end{table}")
    
    return "\n".join(latex_code)

# Now generate the table
latex_output = generate_latex_table_from_df(combined_pathway_df, metrics, databases)

# Print the output to the console (standard behavior)
print(latex_output)

filename = "results/pathway_syntox_analysis/pathway_statistical_summary_table.tex"
with open(filename, 'w', encoding='utf-8') as f:
            f.write(latex_output)
print(f"Success! LaTeX table saved to {filename}")
print(latex_output)


                     Synergy_Score       Test_Type         Comparison  \
18  Drug Target Jaccard Similarity  Kruskal-Wallis         All Groups   
19  Drug Target Jaccard Similarity   Dunn Post-Hoc     Major vs Minor   
20  Drug Target Jaccard Similarity   Dunn Post-Hoc  Major vs Moderate   

   Test_Statistic       P_value  P_value_Corrected Effect_Size_Type  \
18   H=1.5326e+02  5.253554e-34                NaN    Eta-Squared H   
19         Dunn Z           NaN       9.491539e-31    Cliff's Delta   
20         Dunn Z           NaN       2.636158e-11    Cliff's Delta   

    Effect_Size_Value  Database  
18           0.002443  DrugBank  
19           0.035961  DrugBank  
20           0.014351  DrugBank  
                     Synergy_Score       Test_Type  Comparison Test_Statistic  \
18  Drug Target Jaccard Similarity  Kruskal-Wallis  All Groups   H=1.5326e+02   

         P_value  P_value_Corrected Effect_Size_Type  Effect_Size_Value  \
18  5.253554e-34                NaN    Eta-Squar