In [1]:
'''
 Compiles and filters different data metrics from TrackMate's tracks output file

 Assumes each output file is named export.csv and sits in its own subfolder of one main directory

 Processes one condition
'''

import os
import pandas as pd
import numpy as np
from tkinter import filedialog
import tkinter as tk
from scipy.stats import ttest_ind, f_oneway
import seaborn as sns
import matplotlib.pyplot as plt

# Create a Tkinter root window and hide it so that only the folder dialog is shown
root = tk.Tk()
root.withdraw()

# Ask user to select the input directory using a file dialog
input_directory = filedialog.askdirectory(title="Select Input Directory")

# A falsy result means no directory was chosen (e.g. the dialog was cancelled)
if not input_directory:
    print("No directory selected. Exiting...")
    # `exit()` is an interactive helper added by the `site` module and is not
    # meant for programs; raising SystemExit ends the script the same way
    # (exit status 0) without depending on it.
    raise SystemExit

# Ask the user for the minimum track duration to keep.
# Re-prompt on anything that is not a whole number instead of crashing
# with a ValueError traceback.
while True:
    duration_entry = input("Enter the minimum track duration (sec) you would like (Please enter whole number): ")
    try:
        track_duration_input = int(duration_entry)
    except ValueError:
        print("Invalid entry, please enter a whole number.")
    else:
        break

# Names of the track-level columns that are extracted from every export.csv.
# The order matters: entry i here corresponds to entry i of df_list below.
column_titles = [
    'NUMBER_SPOTS',
    'NUMBER_GAPS',
    'NUMBER_SPLITS',
    'NUMBER_MERGES',
    'NUMBER_COMPLEX',
    'LONGEST_GAP',
    'TRACK_START',
    'TRACK_STOP',
    'TRACK_DISPLACEMENT',
    'TRACK_DURATION',
    'TRACK_X_LOCATION',
    'TRACK_Y_LOCATION',
    'TRACK_Z_LOCATION',
    'TRACK_MEAN_SPEED',
    'TRACK_MAX_SPEED',
    'TRACK_MIN_SPEED',
    'TRACK_MEDIAN_SPEED',
    'TRACK_STD_SPEED',
    'TRACK_MEAN_QUALITY',
    'TOTAL_DISTANCE_TRAVELED',
    'MAX_DISTANCE_TRAVELED',
    'CONFINEMENT_RATIO',
    'MEAN_STRAIGHT_LINE_SPEED',
    'LINEARITY_OF_FORWARD_PROGRESSION',
    'MEAN_DIRECTIONAL_CHANGE_RATE',
]

# One independent, initially empty DataFrame per column title, created in the
# same order as column_titles.
(
    df_SPOTS, df_GAPS, df_SPLITS, df_MERGES, df_COMPLEX,
    df_LONG_GAP, df_START, df_STOP, df_DISPLACEMENT, df_DURATION,
    df_X_LOC, df_Y_LOC, df_Z_LOC, df_MEAN_SPEED, df_MAX_SPEED,
    df_MIN_SPEED, df_MEDIAN_SPEED, df_STD_SPEED, df_MEAN_QUAL, df_TOT_DIST,
    df_MAX_DIST, df_CONFINEMENT, df_MEAN_LINE_SPEED, df_LINEARITY, df_MEAN_DIRECTION,
) = (pd.DataFrame() for _ in column_titles)

# Accumulators collected in column_titles order so they can be looked up by
# the position of a column title.
df_list = [
    df_SPOTS, df_GAPS, df_SPLITS, df_MERGES, df_COMPLEX,
    df_LONG_GAP, df_START, df_STOP, df_DISPLACEMENT, df_DURATION,
    df_X_LOC, df_Y_LOC, df_Z_LOC, df_MEAN_SPEED, df_MAX_SPEED,
    df_MIN_SPEED, df_MEDIAN_SPEED, df_STD_SPEED, df_MEAN_QUAL, df_TOT_DIST,
    df_MAX_DIST, df_CONFINEMENT, df_MEAN_LINE_SPEED, df_LINEARITY, df_MEAN_DIRECTION,
]

# while True:
#     # Ask the user for which data they would like to extract
#     column_input = input("Enter the exact column name of the data you would like to extract (EX. TRACK_MEAN_SPEED): ")
#     if column_input in column_titles:
#         break
#     else:
#         print("Invalid entry, please enter a column title exactly as written in column_titles.")

# Create a folder named "trackmateresults" in the selected directory
# trackmateresults_folder = os.path.join(input_directory, "trackmateresults")
# os.makedirs(trackmateresults_folder, exist_ok=True)

# Create a list to store NumPy arrays
result_arrays = []

# Iterate over the entries of the input directory. os.listdir returns names in
# arbitrary order, so sort them to make the accumulation order reproducible.
for folder_name in sorted(os.listdir(input_directory)):
    folder_path = os.path.join(input_directory, folder_name)

    # Only sub-folders are processed; anything else in the directory is ignored
    if not os.path.isdir(folder_path):
        continue
    print(f'Processing folder: {folder_name}')

    # Check if export.csv exists in the folder
    export_csv_path = os.path.join(folder_path, 'export.csv')
    if not os.path.exists(export_csv_path):
        print(f"export.csv not found in folder '{folder_name}'. Skipping...")
        continue

    # Read the CSV file into a pandas DataFrame
    df = pd.read_csv(export_csv_path)

    # The duration filter below cannot work without this column, so skip the
    # folder with a message rather than failing with a KeyError.
    if 'TRACK_DURATION' not in df.columns:
        print(f"TRACK_DURATION column not found in export.csv of folder '{folder_name}'. Skipping...")
        continue

    # Work only with the expected columns that are actually present in this file
    present_titles = [title for title in column_titles if title in df.columns]
    if len(present_titles) < len(column_titles):
        missing_titles = [title for title in column_titles if title not in df.columns]
        print(f"Columns missing from export.csv in folder '{folder_name}': {missing_titles}")

    # Convert every extracted column to numeric; non-numeric cells become NaN
    for title in present_titles:
        df[title] = pd.to_numeric(df[title], errors='coerce')

    print('original', df.shape)

    # Keep rows from the 5th row onwards whose duration meets the threshold.
    # Rows with a NaN duration are dropped too, because NaN >= x is False.
    # NOTE(review): the first four rows are skipped unconditionally - presumably
    # extra header rows in the export; confirm the count against a real file.
    data_rows = df.iloc[4:]
    filtered_rows = data_rows[data_rows['TRACK_DURATION'] >= track_duration_input]

    print('filtered', filtered_rows.shape)

    # Append each column's values to its accumulator in df_list (same position
    # as in column_titles). NaNs are dropped per column: previously the NaN
    # check used the loop variable left over from the conversion loop, i.e. only
    # the last entry of column_titles, so NaNs in other columns were kept.
    for position, title in enumerate(column_titles):
        if title not in df.columns:
            continue
        column_values = filtered_rows[title].dropna()
        df_list[position] = pd.concat([df_list[position], column_values], ignore_index=True)

    # Progress preview: first two accumulated values of every metric
    for accumulated in df_list:
        print(accumulated.head(2))


#         # Extract values from column_input and convert them to a NumPy array
#         result_array = np.array(filtered_rows[column_input])

#         # Add the NumPy array to the list
#         result_arrays.append(result_array)

# if result_arrays:
#     # Create a DataFrame from the list of arrays
#     result_df = pd.DataFrame(result_arrays).T
#     result_df.columns = [f'Column_{i+1}' for i in range(result_df.shape[1])]

#     # Save the resulting DataFrame to a CSV file
#     output_file_path = os.path.join(trackmateresults_folder, f"{column_input}_{track_duration_input}sec.csv")
#     result_df.to_csv(output_file_path, index=False)
#     print("Results saved successfully.")
    
#     # Perform statistical analysis
#     condition_folders = [folder_name for folder_name in os.listdir(input_directory) if os.path.isdir(os.path.join(input_directory, folder_name))]
    
#     # Calculate mean summary statistics for each condition
#     mean_summaries = [result_df[f'Column_{i+1}'].describe().loc['mean'] for i in range(result_df.shape[1])]

#     # Combine mean summary statistics across conditions
#     combined_summary = pd.concat(mean_summaries, axis=1)

#     # Label combined_summary with condition folder names
#     combined_summary.columns = condition_folders

#     # Perform t-tests between conditions
#     t_test_results = []
#     for i in range(len(condition_folders)):
#         for j in range(i + 1, len(condition_folders)):
#             t_stat, p_value = ttest_ind(combined_summary[condition_folders[i]], combined_summary[condition_folders[j]], nan_policy='omit')
#             significance = 'ns'
#             if p_value < 0.0001:
#                 significance = '****'
#             elif p_value < 0.001:
#                 significance = '***'
#             elif p_value < 0.01:
#                 significance = '**'
#             elif p_value < 0.05:
#                 significance = '*'
#             t_test_results.append((f"T-test between {condition_folders[i]} and {condition_folders[j]} ({column_input}):",
#                                     t_stat, p_value, significance))

#     # Drop rows with missing values
#     clean_combined = combined_summary.dropna()

#     # ANOVA test
#     anova_f_stat, anova_p_value = f_oneway(*[clean_combined[column] for column in clean_combined.columns])
#     anova_significance = 'ns'
#     if anova_p_value < 0.0001:
#         anova_significance = '****'
#     elif anova_p_value < 0.001:
#         anova_significance = '***'
#     elif anova_p_value < 0.01:
#         anova_significance = '**'
#     elif anova_p_value < 0.05:
#         anova_significance = '*'

#     # Create DataFrame to store results
#     statsdf = pd.DataFrame(columns=['Comparison', 'T-Statistic', 'P-Value', 'Significance'])

#     # Populate DataFrame with t-test results
#     for result in t_test_results:
#         statsdf = pd.concat([statsdf, pd.DataFrame([{'Comparison': result[0], 'T-Statistic': result[1], 'P-Value': result[2], 'Significance': result[3]}])], ignore_index=True)

#     # Add empty rows as separation
#     statsdf = pd.concat([statsdf, pd.DataFrame([{'Comparison': None, 'T-Statistic': None, 'P-Value': None, 'Significance': None} for _ in range(2)])], ignore_index=True)

#     # Add ANOVA results
#     statsdf = pd.concat([statsdf, pd.DataFrame([{'Comparison': 'ANOVA', 'T-Statistic': anova_f_stat, 'P-Value': anova_p_value, 'Significance': anova_significance}])],

Processing folder: 1
original (2022, 28)
filtered (191, 28)
      0
0  81.0
1  80.0
     0
0  0.0
1  1.0
     0
0  0.0
1  0.0
     0
0  0.0
1  0.0
     0
0  0.0
1  0.0
     0
0  0.0
1  1.0
     0
0  0.0
1  0.0
           0
0  97.645245
1  97.645245
          0
0  0.255824
1  0.859784
           0
0  97.645245
1  97.645245
           0
0  48.261458
1  55.503353
           0
0  33.405276
1  94.337797
     0
0  0.0
1  0.0
          0
0  0.063990
1  0.115077
          0
0  0.179895
1  0.485974
          0
0  0.010003
1  0.010754
          0
0  0.055124
1  0.088271
          0
0  0.036843
1  0.093974
            0
0  118.691396
1  103.240033
           0
0   6.248345
1  11.488374
          0
0  0.684904
1  1.217413
          0
0  0.040943
1  0.074839
          0
0  0.002620
1  0.008805
          0
0  0.040943
1  0.076516
          0
0  1.541695
1  1.479382
Processing folder: 2
original (1790, 28)
filtered (122, 28)
      0
0  81.0
1  80.0
     0
0  0.0
1  1.0
     0
0  0.0
1  0.0
     0
0  