In [None]:
#PERCENTAGE OF ANIMALS IN CLUSTERS

import glob
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt




folder_path = "/Volumes/lab-windingm/home/shared/rena/AUTISM_SCREEN/mCherry/feathers"
N_ANIMALS = 40  # denominator for the % clustered (animals per video - TODO confirm)
REQUIRED_COLUMNS = ('frame', 'track_id', 'cluster')
bin_size = 1000  # number of frames pooled into one time bin


def percent_clustered_per_frame(df, n_animals=N_ANIMALS):
    """Return the percentage of animals that are in a cluster, frame by frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Tracking table of one video with 'frame', 'track_id' and 'cluster'
        columns. Rows with ``cluster >= 0`` are counted as clustered
        (presumably negative labels mean "not in a cluster" - TODO confirm).
    n_animals : int
        Number of animals the percentage is expressed relative to.

    Returns
    -------
    list of float
        One value per distinct frame number found in ``df``, in ascending
        frame order.

    Raises
    ------
    ValueError / TypeError
        If 'frame' or 'track_id' cannot be cast to int (e.g. NaN values).
    """
    tracks = df.astype({'frame': int, 'track_id': int})

    # Blank out the ids of unclustered rows; nunique() ignores NaN, so frames
    # without any clustered animal come out as 0 instead of disappearing.
    clustered_ids = tracks['track_id'].where(tracks['cluster'] >= 0)
    n_clustered = clustered_ids.groupby(tracks['frame']).nunique()

    return (n_clustered / n_animals * 100).tolist()


def summarise_bins(video_data, bins, bin_size):
    """Summarise per-video clustering percentages within each frame bin.

    Parameters
    ----------
    video_data : pandas.DataFrame
        Rows are frames, columns are videos, values are % clustered
        (NaN where a video has no data for that row).
    bins : pandas.Series
        Bin number of every row of ``video_data`` (same index).
    bin_size : int
        Frames per bin; used to convert a bin number to its first frame.

    Returns
    -------
    (mean_per_bin, std_per_bin, summary)
        ``mean_per_bin`` / ``std_per_bin``: per-bin mean / std of each video.
        ``summary``: per bin, the mean and std *across videos* of the
        per-video bin means, plus the first frame of the bin. With a single
        video the across-video std is NaN.
        All three are empty when ``video_data`` has no video columns.
    """
    if video_data.shape[1] == 0:
        empty_summary = pd.DataFrame(
            columns=['mean_clustered', 'std_clustered', 'frame_start']
        )
        return pd.DataFrame(), pd.DataFrame(), empty_summary

    grouped = video_data.groupby(bins)
    mean_per_bin = grouped.mean()
    std_per_bin = grouped.std()

    summary = pd.DataFrame({
        'mean_clustered': mean_per_bin.mean(axis=1),
        'std_clustered': mean_per_bin.std(axis=1),
        'frame_start': mean_per_bin.index * bin_size,
    })
    return mean_per_bin, std_per_bin, summary


# Sorted so that the video_1, video_2, ... numbering is reproducible between runs
# (glob returns files in arbitrary order).
csv_files = sorted(glob.glob(folder_path + "/*.csv"))

framewise_percentages = []  # one list of per-frame % clustered for each valid video

print(f"Found {len(csv_files)} CSV files")

for file in csv_files:
    print(f"Processing: {file}")
    try:
        df = pd.read_csv(file)
    except pd.errors.EmptyDataError:
        # a zero-byte file has no header for pandas to parse
        print("CSV is empty or missing required columns. Skipping.")
        continue

    # checking csv validity - skip if empty or missing expected columns
    if df.empty or not set(REQUIRED_COLUMNS).issubset(df.columns):
        print("CSV is empty or missing required columns. Skipping.")
        continue

    # NOTE(review): values are stored by position (k-th distinct frame present
    # in the file), so they line up with real frame numbers only if frames
    # start at 0 and none are missing - confirm for this dataset.
    framewise_percentages.append(percent_clustered_per_frame(df, N_ANIMALS))

if not framewise_percentages:
    print("No valid CSV data found - percent_df and summary will be empty.")

# Align videos by padding with NaNs (in place) up to the longest video
max_length = max((len(p) for p in framewise_percentages), default=0)
for percentages in framewise_percentages:
    percentages.extend([np.nan] * (max_length - len(percentages)))

# Frame-aligned DataFrame: rows = frame positions, columns = videos
percent_df = pd.DataFrame(framewise_percentages).T
percent_df.index = pd.RangeIndex(max_length)  # no-op with data; fixes the dtype when empty
percent_df.columns = [f"video_{i}" for i in range(1, len(framewise_percentages) + 1)]
percent_df['frame'] = percent_df.index

# Bin frames into bin_size-frame chunks
percent_df['bin'] = percent_df['frame'] // bin_size

# Mean and std per bin for each video, then the across-video summary
video_data = percent_df.drop(columns=['frame', 'bin'])
mean_per_bin, std_per_bin, summary = summarise_bins(
    video_data, percent_df['bin'], bin_size
)


