# Comparison of Corresponding Values in Toxin-Actin Datasets

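This notebook compares the Manders' coefficients `m1` and `m2` measured for the toxin against several organelles and cellular compartments. Boxplots and violin plots visualize the distribution of `m1` and `m2` for each condition, and the conditions are compared with a Kruskal-Wallis H-test followed by Tukey's HSD post-hoc test.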


## Importing Libraries

In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import re

## Define the working directory

In [None]:
# Folder that holds the "<df_name>_all_manders_results.csv" input files and
# receives the exported PDF figures. Set the MANDERS_DATA_DIR environment
# variable to point the notebook at another location; the original path is
# kept as the default so existing setups keep working unchanged.
wd = os.environ.get('MANDERS_DATA_DIR', '/home/jovyan/LNMA/bravoa/data/')

## Data configurations

In [None]:

# Registry of every dataset. Each entry only carries 'df_name', the prefix of
# the results file "<df_name>_all_manders_results.csv" read by the loaders.
data_configurations = {
    dataset: {"df_name": dataset}
    for dataset in (
        "Toxin_vs_actin",
        "Toxin_vs_Nucleus_a",
        "Toxin_vs_actin_ord",
        "Toxin_vs_actin_desord",
        "Toxin_vs_actin_mutord",
        "Toxin_vs_Nucleus_ER",
        "Toxin_vs_ER",
        "Toxin_vs_Nucleus_L",
        "Toxin_vs_Lizo",
        "Toxin_vs_Nucleus_M",
        "Toxin_vs_Mito",
        "Toxin_vs_Nucleus_E",
        "Toxin_vs_Endo",
        "Toxin_vs_EndoEarly",
        "Toxin_vs_mutEndoEarly",
        "Vesicles_Toxin_vs_actin",
        "Vesicles_Toxin_vs_ER",
        "Vesicles_Toxin_vs_Lizo",
        "Vesicles_Toxin_vs_Mito",
        "Vesicles_Toxin_vs_Endo",
    )
}

# Category views: each maps the label shown on the plot to one registry entry.
# The insertion order is the top-to-bottom... order in which conditions are drawn
# (position 1 first), so it must not be changed casually.
vesicles_configurations = {
    label: data_configurations[dataset]
    for label, dataset in [
        ("actin", "Vesicles_Toxin_vs_actin"),
        ("ER", "Vesicles_Toxin_vs_ER"),
        ("lysosomes", "Vesicles_Toxin_vs_Lizo"),
        ("mitochondria", "Vesicles_Toxin_vs_Mito"),
        ("endosomes", "Vesicles_Toxin_vs_Endo"),
    ]
}

nucleus_configurations = {
    label: data_configurations[dataset]
    for label, dataset in [
        ("actin", "Toxin_vs_Nucleus_a"),
        ("ER", "Toxin_vs_Nucleus_ER"),
        ("lysosomes", "Toxin_vs_Nucleus_L"),
        ("mitochondria", "Toxin_vs_Nucleus_M"),
        ("endosomes", "Toxin_vs_Nucleus_E"),
    ]
}

cytoplasm_configurations = {
    label: data_configurations[dataset]
    for label, dataset in [
        ("actin", "Toxin_vs_actin"),
        ("ER", "Toxin_vs_ER"),
        ("lysosomes", "Toxin_vs_Lizo"),
        ("mitochondria", "Toxin_vs_Mito"),
        ("endosomes", "Toxin_vs_Endo"),
    ]
}

actin_configurations = {
    label: data_configurations[dataset]
    for label, dataset in [
        ("Cry11Aa-polymerized Actin", "Toxin_vs_actin_ord"),
        ("Cry11Aa-depolymerized", "Toxin_vs_actin_desord"),
        ("Cry11AaE97A-polymerized Actin", "Toxin_vs_actin_mutord"),
    ]
}

endo_configurations = {
    label: data_configurations[dataset]
    for label, dataset in [
        ("Cry11Aa", "Toxin_vs_EndoEarly"),
        ("Cry11AaE97A", "Toxin_vs_mutEndoEarly"),
    ]
}


## Define Functions

In [None]:
def plot_single_category(ax, configurations, category_name, max_conditions, custom_labels=None):
    """
    Load the Manders results of one category and draw horizontal M1/M2 boxplots.

    For every entry of `configurations` the file
    "<wd>/<df_name>_all_manders_results.csv" is read; its `m1` column becomes one
    box in the left panel (`ax[0]`) and its `m2` column one box in the right
    panel (`ax[1]`). Each non-empty box is annotated with its sample size.

    Parameters:
    - ax: Pair of Axes objects; ax[0] receives M1, ax[1] receives M2.
    - configurations: Dict mapping a condition name to a dict with a 'df_name' key.
    - category_name: Text used as prefix of both panel titles.
    - max_conditions: Number of rows every panel is padded to, so that all
      categories share the same vertical layout.
    - custom_labels: Optional list of tick labels that replace the condition
      names, given in the same order as `configurations`. The list is copied,
      never modified.
    """
    data_M1 = []
    data_M2 = []
    conditions = []

    # Load data for each configuration in the category
    for config_name, config in configurations.items():
        file_path = os.path.join(wd, f"{config['df_name']}_all_manders_results.csv")
        m1_values, m2_values, loaded = [], [], False
        if os.path.exists(file_path):
            df = pd.read_csv(file_path)
            if 'm1' in df.columns and 'm2' in df.columns:
                m1_values = df['m1'].tolist()
                m2_values = df['m2'].tolist()
                loaded = True
            else:
                print(f"Skipping {file_path}: missing 'm1' and/or 'm2' column")
        else:
            print(f"File not found: {file_path}")

        # custom_labels are matched to the configurations by position, so a
        # dataset that could not be loaded must still occupy its (empty) slot;
        # otherwise every later dataset would be drawn under the wrong label.
        if loaded or custom_labels:
            data_M1.append(m1_values)
            data_M2.append(m2_values)
            conditions.append(config_name)

    # If custom_labels are provided, pad data to match lengths
    if custom_labels:
        while len(data_M1) < len(custom_labels):
            data_M1.append([])
            data_M2.append([])
        # Work on a copy: the padding below must not leak into the caller's list
        conditions = list(custom_labels)

    # Add empty labels and data to standardize heights
    while len(conditions) < max_conditions:
        conditions.append("")
        data_M1.append([])
        data_M2.append([])

    # Define positions to standardize widths
    positions = range(1, len(conditions) + 1)

    # Left panel shows the condition names, right panel keeps its ticks unlabeled
    panels = (
        (ax[0], data_M1, conditions, f"{category_name} M1"),
        (ax[1], data_M2, [""] * len(conditions), f"{category_name} M2"),
    )
    for panel, groups, tick_labels, title in panels:
        panel.boxplot(groups, labels=tick_labels, vert=False, positions=positions)
        panel.set_title(title)
        panel.set_xlim(0.2, 1)
        panel.set_ylim(0.5, max_conditions + 0.5)
        # Annotate the sample size next to every box that actually holds data
        for i, data in enumerate(groups):
            if data:
                panel.text(0.9, i + 1, f'n={len(data)}', va='center', fontsize=8)

## Determine the maximum number of conditions

In [None]:
# Number of rows shared by every panel: the size of the largest category, so
# that smaller categories can be padded to the same height.
max_conditions = max(
    len(category)
    for category in (
        vesicles_configurations,
        nucleus_configurations,
        cytoplasm_configurations,
        actin_configurations,
        endo_configurations,
    )
)


## Custom labels for specific categories

In [None]:
# Display labels passed as `custom_labels`; they are matched by position to
# the entries of actin_configurations / endo_configurations.
actin_labels = [
    "Cry11Aa-polymerized Actin",
    "Cry11Aa-depolymerized actin",
    "Cry11AaE97A-polymerized Actin",
]
endo_labels = [
    "Cry11Aa",
    "Cry11AaE97A",
]

## Create the plots

In [None]:
# One row of panels per category: (configurations, panel title, optional tick labels)
category_rows = (
    (vesicles_configurations, "Extracellular Vesicles", None),
    (nucleus_configurations, "Nucleus", None),
    (cytoplasm_configurations, "Cytoplasm", None),
    (actin_configurations, "Microvilli", actin_labels),
    (endo_configurations, "Endosomes", endo_labels),
)

fig, axes = plt.subplots(5, 2, figsize=(10, 15))  # Adjusted size for 5 categories
plt.subplots_adjust(hspace=0.4)

# Each row of `axes` is an (M1, M2) pair of panels for one category
for row_axes, (row_configs, row_title, row_labels) in zip(axes, category_rows):
    plot_single_category(row_axes, row_configs, row_title, max_conditions, custom_labels=row_labels)

# Title the figure, then save the combined figure as a PDF next to the input data
fig.suptitle("Manders' Coefficients for All Categories", fontsize=16, y=0.92)
pdf_path = os.path.join(wd, "Manders_Coefficients_All_Categories.pdf")
plt.savefig(pdf_path, format="pdf")
print(f"Saved combined plot as PDF: {pdf_path}")

plt.show()

In [None]:
import os
import pandas as pd

def compute_data_for_category(configurations, category_name, max_conditions, custom_labels=None):
    """
    Computes data_M1, data_M2, and conditions for the given category.

    For every entry of `configurations` the file
    "<wd>/<df_name>_all_manders_results.csv" is read and its `m1` / `m2` columns
    are collected. All three returned lists are padded with empty entries up to
    `max_conditions` so that every category has the same number of rows.

    Parameters:
    - configurations: Dictionary with configuration details (each value holds a 'df_name').
    - category_name: The category name for which data is being computed
      (currently unused; kept so existing calls stay valid).
    - max_conditions: Number of entries the returned lists are padded to.
    - custom_labels: Optional custom labels for the conditions, given in the
      same order as `configurations`. The list is copied, never modified.

    Returns:
    - data_M1: List of M1 value lists, one per condition (empty list = no data).
    - data_M2: List of M2 value lists, one per condition (empty list = no data).
    - conditions: List of condition labels ("" for padding rows).
    """
    data_M1 = []
    data_M2 = []
    conditions = []

    # Load data for each configuration in the category
    for config_name, config in configurations.items():
        file_path = os.path.join(wd, f"{config['df_name']}_all_manders_results.csv")
        m1_values, m2_values, loaded = [], [], False
        if os.path.exists(file_path):
            df = pd.read_csv(file_path)
            if 'm1' in df.columns and 'm2' in df.columns:
                m1_values = df['m1'].tolist()
                m2_values = df['m2'].tolist()
                loaded = True
            else:
                print(f"Skipping {file_path}: missing 'm1' and/or 'm2' column")
        else:
            print(f"File not found: {file_path}")

        # custom_labels are matched to the configurations by position, so a
        # dataset that could not be loaded must still occupy its (empty) slot;
        # otherwise every later dataset would end up under the wrong label.
        if loaded or custom_labels:
            data_M1.append(m1_values)
            data_M2.append(m2_values)
            conditions.append(config_name)

    # If custom_labels are provided, pad data to match lengths
    if custom_labels:
        while len(data_M1) < len(custom_labels):
            data_M1.append([])
            data_M2.append([])
        # Work on a copy: the padding below must not leak into the caller's list
        conditions = list(custom_labels)

    # Add empty labels and data to standardize heights
    while len(conditions) < max_conditions:
        conditions.append("")
        data_M1.append([])
        data_M2.append([])

    return data_M1, data_M2, conditions


def plot_single_category(ax, data_M1, data_M2, conditions, category_name, max_conditions):
    """
    Draw horizontal boxplots of pre-computed M1 and M2 values for one category.

    Note: this definition reuses the name of the earlier loader-based
    `plot_single_category(ax, configurations, ...)` and replaces it once this
    cell has run.

    Parameters:
    - ax: Pair of Axes objects; ax[0] receives M1, ax[1] receives M2.
    - data_M1: List of M1 value lists, one per condition.
    - data_M2: List of M2 value lists, one per condition.
    - conditions: Tick labels for the M1 panel, one per condition.
    - category_name: Prefix of both panel titles.
    - max_conditions: Number of rows the y-axis is sized for.
    """
    row_positions = range(1, len(conditions) + 1)

    # (target axes, values per row, tick labels, title); only the M1 panel is labeled
    panel_specs = (
        (ax[0], data_M1, conditions, f"{category_name} M1"),
        (ax[1], data_M2, [""] * len(conditions), f"{category_name} M2"),
    )

    for panel, groups, tick_labels, title in panel_specs:
        panel.boxplot(groups, labels=tick_labels, vert=False, positions=row_positions)
        panel.set_title(title)
        panel.set_xlim(0.2, 1)
        panel.set_ylim(0.5, max_conditions + 0.5)

        # Write the sample size beside each row that holds data
        for row, values in enumerate(groups, start=1):
            if not values:
                continue
            panel.text(0.9, row, f'n={len(values)}', va='center', fontsize=8)


In [None]:
import matplotlib.pyplot as plt
import os

# Number of rows every panel is padded to: the size of the largest category.
# It is derived from the configurations (instead of a hard-coded literal) so it
# stays correct when conditions are added to or removed from a category.
max_conditions = max(
    len(vesicles_configurations),
    len(nucleus_configurations),
    len(cytoplasm_configurations),
    len(actin_configurations),
    len(endo_configurations),
)

# Compute data for each category; the resulting variables are reused by the
# statistics and violin-plot cells further down
data_vesicles_M1, data_vesicles_M2, conditions_vesicles = compute_data_for_category(
    vesicles_configurations, "Extracellular Vesicles", max_conditions
)
data_nucleus_M1, data_nucleus_M2, conditions_nucleus = compute_data_for_category(
    nucleus_configurations, "Nucleus", max_conditions
)
data_cytoplasm_M1, data_cytoplasm_M2, conditions_cytoplasm = compute_data_for_category(
    cytoplasm_configurations, "Cytoplasm", max_conditions
)
data_actin_M1, data_actin_M2, conditions_actin = compute_data_for_category(
    actin_configurations, "Microvilli", max_conditions, custom_labels=actin_labels
)
data_endo_M1, data_endo_M2, conditions_endo = compute_data_for_category(
    endo_configurations, "Endosomes", max_conditions, custom_labels=endo_labels
)

# Create subplots: one row per category, M1 in the left column and M2 in the right
fig, axes = plt.subplots(5, 2, figsize=(10, 15))  # Adjusted size for 5 categories
plt.subplots_adjust(hspace=0.4)

# Plot data for each category
plot_single_category(axes[0], data_vesicles_M1, data_vesicles_M2, conditions_vesicles, "Extracellular Vesicles", max_conditions)
plot_single_category(axes[1], data_nucleus_M1, data_nucleus_M2, conditions_nucleus, "Nucleus", max_conditions)
plot_single_category(axes[2], data_cytoplasm_M1, data_cytoplasm_M2, conditions_cytoplasm, "Cytoplasm", max_conditions)
plot_single_category(axes[3], data_actin_M1, data_actin_M2, conditions_actin, "Microvilli", max_conditions)
plot_single_category(axes[4], data_endo_M1, data_endo_M2, conditions_endo, "Endosomes", max_conditions)

# Save the combined figure as a PDF
pdf_path = os.path.join(wd, "Manders_Coefficients_All_Categories.pdf")
fig.suptitle("Manders' Coefficients for All Categories", fontsize=16, y=0.92)
plt.savefig(pdf_path, format="pdf")
print(f"Saved combined plot as PDF: {pdf_path}")

plt.show()


In [None]:
import numpy as np
import scipy.stats as stats
from statsmodels.stats.multicomp import pairwise_tukeyhsd
import pandas as pd

# Example data structure (replace with actual data from `data_vesicles_M1`)
# Assuming data_vesicles_M1 = [[values for condition 1], [values for condition 2], ...]

# Flatten the data and create a condition label
data_flat = []
condition_labels = []
for i, data in enumerate(data_vesicles_M1):
    data_flat.extend(data)
    condition_labels.extend([f"Condition {i+1}"] * len(data))

# Padding rows and datasets that could not be loaded are empty lists; they
# carry no observations and must not be handed to the test as extra groups.
non_empty_groups = [group for group in data_vesicles_M1 if len(group) > 0]

if len(non_empty_groups) < 2:
    # Both tests compare groups with each other, so at least two are required
    print("Need at least two non-empty conditions for a statistical comparison; skipping tests.")
else:
    # Perform Kruskal-Wallis H-test (non-parametric)
    kruskal_stat, kruskal_p = stats.kruskal(*non_empty_groups)
    print(f"Kruskal-Wallis H-test: Statistic={kruskal_stat}, p-value={kruskal_p}")

    # Post-hoc analysis using Tukey's HSD test
    # NOTE(review): Tukey's HSD is a parametric test, while Kruskal-Wallis is
    # rank-based; confirm that this pairing is intended for the analysis.
    # Convert data to a DataFrame for Tukey's test
    data_df = pd.DataFrame({"Value": data_flat, "Condition": condition_labels})
    tukey_result = pairwise_tukeyhsd(data_df["Value"], data_df["Condition"])

    print("\nTukey HSD post-hoc results:")
    print(tukey_result.summary())


In [None]:
import numpy as np
import scipy.stats as stats
from statsmodels.stats.multicomp import pairwise_tukeyhsd
import pandas as pd

def perform_statistical_analysis(data_df, value_col, condition_col):
    """
    Performs statistical analysis (Kruskal-Wallis H-test and Tukey's HSD) on the provided dataset.

    Parameters:
    - data_df (pd.DataFrame): A DataFrame in long format, one observation per row.
    - value_col (str): The column name for the numeric values.
    - condition_col (str): The column name for the conditions.

    Returns:
    - dict: A dictionary with two keys:
        - "kruskal_wallis": {"statistic": ..., "p_value": ...}
        - "tukey_hsd": the summary table of the Tukey HSD test.
      When fewer than two conditions contain data no comparison is possible;
      the Kruskal-Wallis values are then NaN and "tukey_hsd" is None, so a
      caller looping over many datasets is not interrupted by an exception.

    NOTE(review): Tukey's HSD is a parametric test, while Kruskal-Wallis is
    rank-based; confirm that this pairing is intended for the analysis.
    """
    # Group data by condition, ignoring conditions without observations
    grouped_data = [
        group[value_col].tolist()
        for _, group in data_df.groupby(condition_col)
        if len(group) > 0
    ]

    # Both tests compare groups with each other and need at least two of them
    if len(grouped_data) < 2:
        return {
            "kruskal_wallis": {
                "statistic": float("nan"),
                "p_value": float("nan"),
            },
            "tukey_hsd": None,
        }

    # Perform Kruskal-Wallis H-test
    kruskal_stat, kruskal_p = stats.kruskal(*grouped_data)

    # Perform Tukey's HSD test
    tukey_result = pairwise_tukeyhsd(data_df[value_col], data_df[condition_col])

    # Prepare results
    results = {
        "kruskal_wallis": {
            "statistic": kruskal_stat,
            "p_value": kruskal_p,
        },
        "tukey_hsd": tukey_result.summary()
    }

    return results


In [None]:
import pandas as pd

def analyze_all_datasets():
    """
    Runs the Kruskal-Wallis H-test and Tukey's HSD post hoc analysis on every
    category, separately for M1 and M2, and prints the outcome of each.

    The inputs are not passed as arguments; they are read from the notebook
    namespace and must already exist when this function is called:
    - data_vesicles_M1, data_vesicles_M2, conditions_vesicles
    - data_nucleus_M1, data_nucleus_M2, conditions_nucleus
    - data_cytoplasm_M1, data_cytoplasm_M2, conditions_cytoplasm
    - data_actin_M1, data_actin_M2, conditions_actin
    - data_endo_M1, data_endo_M2, conditions_endo

    Returns:
    - dict mapping each dataset name (e.g. "Nucleus M1") to the result dict
      returned by perform_statistical_analysis.
    """

    def _to_long_frame(groups, group_labels):
        # One row per observation: the value and the label of the group it came from
        flat_values = []
        flat_labels = []
        for idx, group_values in enumerate(groups):
            label = group_labels[idx]
            flat_values += group_values
            flat_labels += [label] * len(group_values)
        return pd.DataFrame({"Value": flat_values, "Condition": flat_labels})

    # Dataset name -> (values per condition, condition labels)
    datasets = {
        "Extracellular Vesicles M1": (data_vesicles_M1, conditions_vesicles),
        "Extracellular Vesicles M2": (data_vesicles_M2, conditions_vesicles),
        "Nucleus M1": (data_nucleus_M1, conditions_nucleus),
        "Nucleus M2": (data_nucleus_M2, conditions_nucleus),
        "Cytoplasm M1": (data_cytoplasm_M1, conditions_cytoplasm),
        "Cytoplasm M2": (data_cytoplasm_M2, conditions_cytoplasm),
        "Microvilli M1": (data_actin_M1, conditions_actin),
        "Microvilli M2": (data_actin_M2, conditions_actin),
        "Endosomes M1": (data_endo_M1, conditions_endo),
        "Endosomes M2": (data_endo_M2, conditions_endo),
    }

    analysis_results = {}
    for dataset_name in datasets:
        groups, group_labels = datasets[dataset_name]
        print(f"Analyzing {dataset_name}...")
        outcome = perform_statistical_analysis(
            _to_long_frame(groups, group_labels), "Value", "Condition"
        )
        analysis_results[dataset_name] = outcome
        print(f"Kruskal-Wallis Results for {dataset_name}: {outcome['kruskal_wallis']}")
        print(f"Tukey HSD Results for {dataset_name}:\n{outcome['tukey_hsd']}")

    return analysis_results

# Run the analysis for all datasets. The returned dict maps each dataset name
# (e.g. "Nucleus M1") to its Kruskal-Wallis and Tukey HSD results; the call
# also prints those results as a side effect.
all_results = analyze_all_datasets()



In [None]:
def plot_single_category_violin(ax, data_M1, data_M2, conditions, category_name, max_conditions):
    """
    Plots M1 and M2 violin plots for the given category data.

    Conditions without any values (padding rows, datasets that could not be
    loaded) are left out, and the remaining conditions are stacked from row 1
    upwards. Each violin is annotated with its sample size. When a panel has
    no data at all, no violin is drawn but the panel is still titled and scaled.

    Parameters:
    - ax: Array of Axes objects for plotting; ax[0] receives M1, ax[1] receives M2.
    - data_M1: List of M1 data arrays.
    - data_M2: List of M2 data arrays.
    - conditions: List of condition labels (shown on the M1 panel only).
    - category_name: The category name for the plots.
    - max_conditions: Maximum number of conditions to display.
    """
    # (target axes, values per condition, title, whether tick labels are shown)
    panels = (
        (ax[0], data_M1, f"{category_name} M1", True),
        (ax[1], data_M2, f"{category_name} M2", False),
    )

    for panel, groups, title, show_labels in panels:
        # Filter out empty datasets, keeping each label paired with its values
        kept = [(conditions[i], values) for i, values in enumerate(groups) if len(values) > 0]
        kept_labels = [label for label, _ in kept]
        kept_values = [values for _, values in kept]
        rows = range(1, len(kept_values) + 1)

        # violinplot cannot handle an empty dataset, so skip the call in that case
        if kept_values:
            panel.violinplot(kept_values, positions=rows,
                             vert=False, widths=0.8, showmeans=True, showextrema=True, showmedians=True)
        panel.set_title(title)
        panel.set_xlim(0.2, 1)
        panel.set_ylim(0.5, max_conditions + 0.5)
        panel.set_yticks(rows)
        # An empty list removes the tick labels on the M2 panel
        panel.set_yticklabels(kept_labels if show_labels else [])
        for i, data in enumerate(kept_values):
            panel.text(0.9, i + 1, f'n={len(data)}', va='center', fontsize=8)


In [None]:
# One row of panels per category: (M1 values, M2 values, condition labels, panel title)
violin_rows = (
    (data_vesicles_M1, data_vesicles_M2, conditions_vesicles, "Extracellular Vesicles"),
    (data_nucleus_M1, data_nucleus_M2, conditions_nucleus, "Nucleus"),
    (data_cytoplasm_M1, data_cytoplasm_M2, conditions_cytoplasm, "Cytoplasm"),
    (data_actin_M1, data_actin_M2, conditions_actin, "Microvilli"),
    (data_endo_M1, data_endo_M2, conditions_endo, "Endosomes"),
)

fig, axes = plt.subplots(5, 2, figsize=(10, 15))  # Adjusted size for 5 categories
plt.subplots_adjust(hspace=0.4)

# Each row of `axes` is an (M1, M2) pair of panels for one category
for row_axes, (row_M1, row_M2, row_conditions, row_title) in zip(axes, violin_rows):
    plot_single_category_violin(row_axes, row_M1, row_M2, row_conditions, row_title, max_conditions)

# Title the figure, then save it as a PDF next to the input data
fig.suptitle("Manders' Coefficients for All Categories (Violin Plots)", fontsize=16, y=0.92)
pdf_path = os.path.join(wd, "Manders_Coefficients_All_Categories_Violin.pdf")
plt.savefig(pdf_path, format="pdf")
print(f"Saved combined plot as PDF: {pdf_path}")

plt.show()
