# Kymograph Statistical Analysis, NORMALIZED TO CONDITION 1 MEAN

After running KymoButlerDataProccessing.ipynb, each folder of kymographs should contain a folder called kymoresults, which holds the compiled and processed data for each directional metric output by KymoButler (an AI program that analyzes kymographs, made by Max Jakobs: https://github.com/MaxJakobs/KymoButler). The following scripts perform statistical analysis across different conditions of data. They output graphs displaying the results for each metric, together with the statistical analysis (t-tests, ANOVA, mean).

# Statistical Analysis for 1 Replicate, NORMALIZED TO CONDITION 1 MEAN

Intended to be used on one experiment that has several conditions of data (t-tests cannot be performed on a single condition). Choose the directory that contains a folder for each condition of data. The script loops through each condition folder, opens its kymoresults folder, then takes the data and creates a graph for each metric, and performs t-tests and ANOVA on the MEAN results from each cell. Graphs and analysis results are saved in a new folder in the selected directory, called 'results_normalized' if you choose to normalize and 'results' otherwise. When asked for condition names, enter the name of each condition folder; the first one entered is the one the others are normalized to.

In [None]:
import os
import sys
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import ttest_ind, f_oneway
from tkinter import filedialog, Tk
import numpy as np

def _metric_labels(name, normalized):
    """
    Build the plot title and y-axis label for one KymoButler result file.

    Parameters:
        name (str): Result file name, e.g. 'ANTdurationresults.csv'.
        normalized (bool): True when the plotted values were divided by the
            reference mean and therefore no longer carry physical units.

    Returns:
        tuple: (title, y-axis label).
    """
    direction = "Anterograde" if "ANT" in name else "Retrograde" if "RET" in name else "Total"
    if "duration" in name:
        metric = "Duration"
    elif "displacement" in name:
        metric = "Displacement"
    elif "frame2frame" in name:
        metric = "Frame2Frame Velocity"
    else:
        metric = "Start2End Velocity"

    # Both velocity files ('frame2frame', 'start2end') contain a '2'.
    if "2" in name:
        quantity, unit = "Track Mean Velocity", "µm/sec"
    elif "duration" in name:
        quantity, unit = "Track Duration", "sec"
    else:
        quantity, unit = "Track Displacement", "µm"
    if normalized:
        # Values divided by the reference mean are dimensionless.
        unit = "normalized"

    return f"{direction} {metric}", f"{quantity} [{unit}] "


def _run_tests(per_cell_means, label):
    """
    Run pairwise t-tests and a one-way ANOVA across the condition columns.

    Parameters:
        per_cell_means (DataFrame): One column per condition, one value per cell.
            Conditions may have different numbers of cells (NaN padding).
        label (str): Text placed in parentheses in the printed/saved result lines.

    Returns:
        tuple: (list of (header, statistics) t-test lines, ANOVA result line,
                True if any p-value was below 0.05).
    """
    names = list(per_cell_means.columns)
    # Drop NaN per condition so a short condition does not discard valid cells
    # of the others (a row-wise dropna would do exactly that).
    samples = [per_cell_means.iloc[:, k].dropna().to_numpy(dtype=float) for k in range(len(names))]

    significant = False
    t_test_results = []
    for i in range(len(names)):
        for j in range(i + 1, len(names)):
            t_stat, p_value = ttest_ind(samples[i], samples[j])
            header = f"T-test between {names[i]} and {names[j]} ({label}):"
            stats_line = f"t-statistic: {t_stat}, p-value: {p_value}"
            t_test_results.append((header, stats_line))
            print(f"\n{header}")
            print(stats_line)
            if p_value < 0.05:
                significant = True

    anova_f_stat, anova_p_value = f_oneway(*samples)
    anova_result = f"ANOVA test ({label}): F-statistic: {anova_f_stat}, p-value: {anova_p_value}"
    print(f"\nANOVA test ({label}):")
    print(f"F-statistic: {anova_f_stat}, p-value: {anova_p_value}")
    if anova_p_value < 0.05:
        significant = True

    return t_test_results, anova_result, significant


def analyze_data(directory, condition_folders, normalize):
    """
    Analyze the data in the specified directory for the given condition folders.

    For every KymoButler metric file, the per-cell means (one mean per CSV
    column) of each condition are compared with pairwise t-tests and a one-way
    ANOVA, the results are written to a text file, and a violin plot of the
    per-cell means is saved and shown.

    Parameters:
        directory (str): Directory containing one folder per condition; each
            condition folder must contain a 'kymoresults' folder with the CSVs.
        condition_folders (list): Folder names of the conditions. The first
            entry is the reference condition used for normalization.
        normalize (bool): If True, every per-cell mean is divided by the mean of
            the reference condition's per-cell means, so the reference condition
            averages 1. Output then goes to 'results_normalized' instead of
            'results'.

    Returns:
        None. Results are printed and written to the results folder.
    """
    if len(condition_folders) < 2:
        print("T-tests and ANOVA tests require at least two conditions.")
        return

    # Check that all required folders exist before creating any output
    condition_paths = [os.path.join(directory, folder, "kymoresults") for folder in condition_folders]
    missing_folders = [path for path in condition_paths if not os.path.exists(path)]
    if missing_folders:
        print("One or more required folders are missing.")
        for path in missing_folders:
            print(f"  missing: {path}")
        return

    results_folder = os.path.join(directory, "results_normalized" if normalize else "results")
    os.makedirs(results_folder, exist_ok=True)

    significant_tests = False

    # Define the CSV files to analyze
    csv_files = ['ANTdisplacementresults.csv', 'ANTdurationresults.csv', 'ANTframe2frameresults.csv',
                 'ANTstart2endresults.csv', 'RETdisplacementresults.csv', 'RETdurationresults.csv',
                 'RETframe2frameresults.csv', 'RETstart2endresults.csv', 'TOTALdisplacementresults.csv',
                 'TOTALdurationresults.csv', 'TOTALframe2frameresults.csv', 'TOTALstart2endresults.csv']

    palette1 = sns.color_palette("flare", n_colors=len(condition_folders))

    for file in csv_files:
        csv_paths = [os.path.join(folder, file) for folder in condition_paths]
        absent = [path for path in csv_paths if not os.path.isfile(path)]
        if absent:
            print(f"\nSkipping {file}: not found in {', '.join(absent)}")
            continue

        # One mean per CSV column (cell), one DataFrame column per condition
        per_cell_means = pd.concat(
            [pd.read_csv(path).mean(numeric_only=True) for path in csv_paths],
            axis=1, keys=condition_folders)

        if normalize:
            reference_mean = per_cell_means.iloc[:, 0].mean()
            if not np.isfinite(reference_mean) or reference_mean == 0:
                print(f"\nSkipping {file}: cannot normalize to a reference mean of {reference_mean}")
                continue
            # Dividing everything by the same constant puts the reference
            # condition at 1 and leaves the test statistics unchanged.
            per_cell_means = per_cell_means / reference_mean

        t_test_results, anova_result, significant = _run_tests(per_cell_means, file)
        significant_tests = significant_tests or significant

        # Write the results to a text file
        with open(os.path.join(results_folder, f"{file}_test_results.txt"), "w", encoding="utf-8") as f:
            for header, stats_line in t_test_results:
                f.write(header + "\n")
                f.write(stats_line + "\n")
            f.write("\n")
            f.write(anova_result + "\n")
            f.write("\n")

        # Reshape the data for the violin plot (long form: Condition, Mean)
        long_form = per_cell_means.melt(var_name='Condition', value_name='Mean')

        fig = plt.figure(figsize=(6, 6))
        sns.violinplot(x='Condition', y='Mean', data=long_form, bw_method=0.2,
                       palette=palette1, linewidth=0, hue='Condition', legend=False,
                       inner=None)  # No inner annotations; the mean marker is drawn below

        # Mark the mean of each condition, in the order the conditions were given
        means = per_cell_means.mean()
        print("Means:", means)
        for position, mean_val in enumerate(means):
            plt.plot(position, mean_val, marker='D', color='black')

        title, y_label = _metric_labels(file, normalize)
        plt.title(title, fontsize=16, fontweight='bold')
        plt.ylabel(y_label, fontsize=16, fontweight='bold')
        plt.xlabel('')
        plt.xticks(np.arange(len(condition_folders)), condition_folders, fontsize=12, fontweight='bold')
        plt.yticks(fontsize=12, fontweight='bold')
        axes = plt.gca()
        axes.spines['top'].set_visible(False)
        axes.spines['right'].set_visible(False)
        axes.spines['left'].set_linewidth(2)
        axes.spines['bottom'].set_linewidth(2)

        # Lay out first so the saved image matches what is shown
        plt.tight_layout()
        plt.savefig(os.path.join(results_folder, f'{title}_summary.png'))
        plt.show()
        plt.close(fig)

    # Print if any test was significant
    if significant_tests:
        print("\nAt least one test was significant.")
    else:
        print("\nNo test was significant.")

# Prompt the user for the number of conditions and their names
try:
    num_conditions = int(input("Enter the number of conditions: "))
except ValueError:
    print("The number of conditions must be a whole number. Exiting program.")
    sys.exit()

# Comparisons between conditions need at least two of them
if num_conditions < 2:
    print("T-tests and ANOVA tests require at least two conditions. Exiting program.")
    sys.exit()

# Folder names are used as typed, minus accidental surrounding whitespace
condition_folders = [
    input(f"Enter the folder name for condition {i + 1}: ").strip()
    for i in range(num_conditions)
]

# Ask the user if they want to normalize the data
normalize_input = input("Do you want to normalize the data? (yes/no): ").strip().lower()
normalize = normalize_input in ("yes", "y")

# Create a hidden tkinter root window so only the directory dialog is shown
root = Tk()
root.withdraw()

# Ask the user to select a directory, then release the hidden window
directory = filedialog.askdirectory(title="Select Directory Containing Folders")
root.destroy()

# Perform analysis if a directory is selected
if directory:
    analyze_data(directory, condition_folders, normalize)
else:
    print("No directory selected.")


# Statistical Analysis for All Replicates, NORMALIZED TO CONDITION 1 MEAN

Intended to be used on all replicates of an experiment that each have several conditions of data (t-tests cannot be performed on a single condition). Choose the directory that contains a folder for each replicate of the experiment. The script loops through each experimental replicate folder, opens each condition folder and its kymoresults folder, then takes the data and creates a graph of the compiled results from all replicates for each metric, and performs t-tests and ANOVA on the MEAN results from each cell. Graphs and analysis results are saved in a new folder called 'results_normalized' in the selected directory. When asked for condition names, enter the names of the condition folders (they must be exactly the same across replicates); the first one entered is the one the other conditions are normalized to.

In [None]:
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import ttest_ind, f_oneway
from tkinter import filedialog, Tk
import numpy as np

def analyze_data(directory, num_conditions, condition_names):
    significant_tests = False

    # Create a folder named "results_normalized" in the selected directory
    results_folder = os.path.join(directory, "results_normalized")
    os.makedirs(results_folder, exist_ok=True)

    subfolders = [subfolder for subfolder in os.listdir(directory) if os.path.isdir(os.path.join(directory, subfolder))]

    condition_dfs = {condition_name: {} for condition_name in condition_names}

    csv_files = ['ANTdisplacementresults.csv', 'ANTdurationresults.csv', 'ANTframe2frameresults.csv',
                 'ANTstart2endresults.csv', 'RETdisplacementresults.csv', 'RETdurationresults.csv',
                 'RETframe2frameresults.csv', 'RETstart2endresults.csv', 'TOTALdisplacementresults.csv',
                 'TOTALdurationresults.csv', 'TOTALframe2frameresults.csv', 'TOTALstart2endresults.csv']

    for subfolder in subfolders:
        for condition_name in condition_names:
            condition_folder = os.path.join(directory, subfolder, condition_name)
            if os.path.exists(condition_folder):
                for file in csv_files:
                    file_path = os.path.join(condition_folder, "kymoresults", file)
                    if os.path.exists(file_path):
                        df = pd.read_csv(file_path)
                        key = os.path.splitext(file)[0]
                        if key not in condition_dfs[condition_name]:
                            condition_dfs[condition_name][key] = df
                        else:
                            condition_dfs[condition_name][key] = pd.concat([condition_dfs[condition_name][key], df], axis=1, ignore_index=True)

    for key in condition_dfs[condition_names[0]]:
        condition_dataframes = [condition_dfs[condition_name][key] for condition_name in condition_names]

        # Normalize means
        for i in range(len(condition_dataframes)):
            normalization_factor = condition_dataframes[i].mean().mean() / condition_dataframes[0].mean().mean()
            condition_dataframes[i] = condition_dataframes[i] * normalization_factor

        combined_summary = pd.concat([df.describe().loc['mean'] for df in condition_dataframes], axis=1)
        combined_summary.columns = condition_names


        t_test_results = []
        for i in range(len(condition_names)):
            for j in range(i+1, len(condition_names)):
                t_stat, p_value = ttest_ind(combined_summary[condition_names[i]], combined_summary[condition_names[j]], nan_policy='omit')
                t_test_results.append((f"T-test between {condition_names[i]} and {condition_names[j]} ({key}):", f"t-statistic: {t_stat}, p-value: {p_value}"))
                print(f"\nT-test between {condition_names[i]} and {condition_names[j]} ({key}):")
                print(f"t-statistic: {t_stat}, p-value: {p_value}")

                if p_value < 0.05:
                    significant_tests = True

        # Drop rows with missing values
        clean_combined = combined_summary.dropna()

        anova_f_stat, anova_p_value = f_oneway(*[clean_combined[column] for column in clean_combined.columns])
        anova_result = f"ANOVA test ({key}): F-statistic: {anova_f_stat}, p-value: {anova_p_value}"
        
        print(f"\nANOVA test ({key}):")
        print(f"F-statistic: {anova_f_stat}, p-value: {anova_p_value}")

        if anova_p_value < 0.05:
            significant_tests = True

        # Write the results to a text file
        with open(os.path.join(results_folder, f"{key}_test_results.txt"), "w") as f:
            for result in t_test_results:
                f.write(result[0] + "\n")
                f.write(result[1] + "\n")
            f.write("\n")
            f.write(anova_result + "\n")
            f.write("\n")


        # Reshape the data for the violin plot
        combined_summary = combined_summary.reset_index().melt(id_vars='index', var_name='Condition', value_name='Mean')

        palette1 = sns.color_palette("flare", n_colors=len(condition_names))

        # Create violin plot
        plt.figure(figsize=(6, 6))

        # Draw violin plot with mean line
        sns.violinplot(x='Condition', y='Mean', data=combined_summary, bw_method=0.2,
                    palette=palette1, linewidth=0, hue='Condition', legend=False,
                    inner=None)  # Remove inner annotations to add mean line separately

        # Calculate mean for each group
        means = combined_summary.groupby('Condition')['Mean'].mean()
        # Sort means Series based on the order of condition_names
        means = means.reindex(condition_names)
        print("Means:", means)  # Print mean values
        print("Condition names:", condition_names)  # Print condition folder names
        for i, mean_val in enumerate(means):
            print(f"Plotting mean {mean_val} for condition {condition_names[i]} at position {i}")
            plt.plot(i, mean_val, marker='D', color='black')  # Draw mean point


        plt.title(f'{"Anterograde" if "ANT" in key else "Retrograde" if "RET" in key else "Total"} '
                f'{"Duration" if "duration" in key else "Displacement" if "displacement" in key else "Frame2Frame Velocity" if "frame2frame" in key else "Start2End Velocity"}', 
                fontsize=16, fontweight='bold')

        plt.ylabel(f'{"Track Mean Velocity [µm/sec]" if "2" in key else "Track Duration [sec]" if "duration" in key else "Track Displacement [µm]"} ', fontsize=16, fontweight='bold')
        plt.xlabel('')
        plt.xticks(np.arange(len(means)), means.index, fontsize=12, fontweight='bold')  # Set ticks at each group
        plt.yticks(fontsize=12, fontweight='bold')
        plt.gca().spines['top'].set_visible(False)
        plt.gca().spines['right'].set_visible(False)
        plt.gca().spines['left'].set_linewidth(2)
        plt.gca().spines['bottom'].set_linewidth(2)

        plt.tight_layout()
        # Save the plot to the results folder
        plt.savefig(os.path.join(results_folder, f'{key}_summary.png'))
        #plt.show()
        plt.close()

    if significant_tests:
        print("\nAt least one test was significant.")
    else:
        print("\nNo test was significant.")

# Prompt the user for the number of conditions and their names
try:
    num_conditions = int(input("Enter the number of conditions: "))
except ValueError:
    raise SystemExit("The number of conditions must be a whole number. Exiting program.") from None

# Comparisons between conditions need at least two of them
if num_conditions < 2:
    raise SystemExit("T-tests and ANOVA tests require at least two conditions. Exiting program.")

# Names must match the condition folder names; strip accidental whitespace
condition_names = [
    input(f"Enter the name for condition {i+1}: ").strip()
    for i in range(num_conditions)
]

# Create a hidden tkinter root window so only the directory dialog is shown
root = Tk()
root.withdraw()

# Ask the user to select a directory, then release the hidden window
directory = filedialog.askdirectory(title="Select Directory Containing Subfolders with Condition Folders")
root.destroy()

# Perform analysis if a directory is selected
if directory:
    analyze_data(directory, num_conditions, condition_names)
else:
    print("No directory selected.")
