# Comparison of Corresponding Values in Toxin-Actin Datasets

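This notebook compares the `m1` and `m2` values (Manders' coefficients) of toxin interactions across several groups of conditions: Vesicles, Nucleus, Cytoplasm, Actin, and Endo. The Actin group covers three datasets: **Ordered**, **Mutant Ordered**, and **Disordered** observations of toxin-actin interactions. Horizontal boxplots are used to visualize the distribution of the `m1` and `m2` values across these conditions.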



Explanation

- **Unmodified `data_configurations`:** The code reads each entry directly from `data_configurations` without modifying it, so no information is lost.
- **Dynamic loading and verification:** Each results file is loaded if it exists; otherwise, a message naming the missing file is printed.
- **Data aggregation:** `data_M1`, `data_M2`, and `conditions` collect the `m1` values, the `m2` values, and the label of each condition.
- **Plotting:** Horizontal boxplots are drawn for `m1` and `m2`, each annotated with the number of data points for that condition.
- **Saving and displaying:** The final figure, covering all conditions, is saved as a single PDF and then displayed.

This code produces a set of boxplots covering every configuration selected from `data_configurations` whose results file is available.

## 1. Importing Libraries and Loading Data

We begin by importing the necessary libraries for data analysis and visualization, followed by loading each dataset and inspecting the first few rows.



In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import re

# Directory that holds the per-configuration result CSVs (and receives the PDF)
wd = '/home/jovyan/LNMA/bravoa/data/'

# Group the entries of data_configurations by category.
# Each dict keeps the listed key order, which later determines box order.
vesicles_configurations = {
    key: data_configurations[key]
    for key in (
        "Vesicles_Toxin_vs_actin",
        "Vesicles_Toxin_vs_ER",
        "Vesicles_Toxin_vs_Lizo",
        "Vesicles_Toxin_vs_Mito",
        "Vesicles_Toxin_vs_Endo",
    )
}

nucleus_configurations = {
    key: data_configurations[key]
    for key in (
        "Toxin_vs_Nucleus_a",
        "Toxin_vs_Nucleus_ER",
        "Toxin_vs_Nucleus_L",
        "Toxin_vs_Nucleus_M",
        "Toxin_vs_Nucleus_E",
    )
}

cytoplasm_configurations = {
    key: data_configurations[key]
    for key in (
        "Toxin_vs_actin",
        "Toxin_vs_ER",
        "Toxin_vs_Lizo",
        "Toxin_vs_Mito",
        "Toxin_vs_Endo",
    )
}

actin_configurations = {
    key: data_configurations[key]
    for key in (
        "Toxin_vs_actin_ord",
        "Toxin_vs_actin_desord",
        "Toxin_vs_actin_mutord",
    )
}

endo_configurations = {
    key: data_configurations[key]
    for key in (
        "Toxin_vs_EndoEarly",
        "Toxin_vs_mutEndoEarly",
    )
}

# Helper function to simplify condition names by removing "Toxin" and "Vesicles" prefixes while retaining organelle info
def simplify_name(name):
    """Return a short, human-readable label for a configuration name.

    Every leading "Vesicles_", "Toxin_vs_" or "Toxin_" prefix is removed,
    whatever order they appear in, and the remaining underscores are turned
    into spaces.

    Args:
        name: Configuration key, e.g. "Vesicles_Toxin_vs_actin".

    Returns:
        The simplified label, e.g. "actin" or "Nucleus a".
    """
    # Strip the whole run of leading prefixes in a single pass so the result
    # does not depend on prefix order: "Vesicles_Toxin_vs_actin" must not keep
    # its "Toxin_vs_" part just because "Vesicles_" came first.
    name = re.sub(r"^(?:Vesicles_|Toxin_vs_|Toxin_)+", "", name)
    # Replace underscores with spaces for readability
    return name.replace("_", " ")

# Draw one horizontal boxplot panel with a title, fixed axis limits and per-box sample counts
def _draw_boxplot_panel(axis, samples, tick_labels, title, max_conditions):
    """Render `samples` as horizontal boxplots on `axis`.

    Args:
        axis: Matplotlib axes to draw on.
        samples: List of value lists, one per box (empty lists are padding).
        tick_labels: One label per entry of `samples`.
        title: Panel title.
        max_conditions: Number of box slots; fixes the y-range of the panel.
    """
    axis.boxplot(samples, vert=False)
    # Labels are applied after drawing instead of through the boxplot
    # `labels=` keyword, which was renamed to `tick_labels` in Matplotlib 3.9;
    # setting them explicitly works on either side of that rename.
    axis.set_yticklabels(tick_labels)
    axis.set_title(title)
    axis.set_xlim(0.2, 1)
    axis.set_ylim(0.5, max_conditions + 0.5)
    # Box positions are 1-based, hence start=1
    for position, values in enumerate(samples, start=1):
        if values:  # Only add text if there is data in the slot
            axis.text(0.7, position, f'n={len(values)}', va='center', fontsize=8)


# Function to plot each category in a subplot with consistent boxplot size
def plot_single_category(ax, configurations, category_name, max_conditions):
    """Plot the m1 and m2 distributions of one category side by side.

    For every configuration, the file "<df_name>_all_manders_results.csv" is
    read from the module-level directory `wd`. Configurations whose file is
    missing, or whose file has no `m1`/`m2` column, are reported with a
    message and skipped.

    Args:
        ax: Pair of Matplotlib axes; ax[0] receives M1 and ax[1] receives M2.
        configurations: Mapping of configuration name to a dict that has a
            'df_name' entry.
        category_name: Prefix used in both panel titles.
        max_conditions: Number of box slots per panel. Categories with fewer
            loaded conditions are padded with empty slots so that every
            panel uses the same box height.
    """
    data_M1 = []
    data_M2 = []
    conditions = []

    # Load data for each configuration in the category
    for config_name, config in configurations.items():
        file_path = os.path.join(wd, f"{config['df_name']}_all_manders_results.csv")
        if not os.path.exists(file_path):
            print(f"File not found: {file_path}")
            continue

        df = pd.read_csv(file_path)
        if 'm1' not in df.columns or 'm2' not in df.columns:
            # Report the skip instead of silently dropping the condition
            print(f"Missing 'm1' or 'm2' column in: {file_path}")
            continue

        # Missing values are dropped: NaN entries would otherwise propagate
        # into the box statistics and inflate the n= annotation.
        data_M1.append(df['m1'].dropna().tolist())
        data_M2.append(df['m2'].dropna().tolist())
        conditions.append(simplify_name(config_name))

    # Pad with empty slots if this category has fewer conditions than the max
    padding = max(0, max_conditions - len(conditions))
    conditions.extend("" for _ in range(padding))  # Empty label for spacing
    data_M1.extend([] for _ in range(padding))
    data_M2.extend([] for _ in range(padding))

    # Boxplots for M1 and M2 side by side
    _draw_boxplot_panel(ax[0], data_M1, conditions, f"{category_name} M1", max_conditions)
    _draw_boxplot_panel(ax[1], data_M2, conditions, f"{category_name} M2", max_conditions)

# Row title paired with its configurations, in top-to-bottom plotting order
category_rows = [
    ("Vesicles", vesicles_configurations),
    ("Nucleus", nucleus_configurations),
    ("Cytoplasm", cytoplasm_configurations),
    ("Actin", actin_configurations),
    ("Endo", endo_configurations),
]

# The largest category decides how many box slots every panel reserves
max_conditions = max(len(row_configs) for _, row_configs in category_rows)

# One figure row per category, two columns (M1 on the left, M2 on the right)
fig, axes = plt.subplots(len(category_rows), 2, figsize=(10, 15))
plt.subplots_adjust(hspace=0.4)  # Add vertical space between rows

# Fill each row of axes with its category
for row_axes, (row_title, row_configs) in zip(axes, category_rows):
    plot_single_category(row_axes, row_configs, row_title, max_conditions)

# Title the figure, then write it out as a single PDF
fig.suptitle("Manders' Coefficients for All Categories", fontsize=16, y=0.92)
pdf_path = os.path.join(wd, "Manders_Coefficients_All_Categories.pdf")
plt.savefig(pdf_path, format="pdf")
print(f"Saved combined plot as PDF: {pdf_path}")

plt.show()