In [None]:
import os
import sys
import pandas as pd
import matplotlib.pyplot as plt 
import numpy as np
from sorted_alpha import sorted_alpha
from moviepy.editor import VideoFileClip
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from tslearn.barycenters import dtw_barycenter_averaging
import time
from tqdm import tqdm

# the first 2 minutes of each video 

In [None]:
import os
import pandas as pd

# Folder holding the tracking CSVs of a single play session
folder_name = '/Users/andrei-macpro/Documents/Data/openpose/play/tracking/tracking/1206_play_1'

# File names in the order returned by the project helper `sorted_alpha`
f_names = sorted_alpha(folder_name)

tracks = []
for filename in f_names:
    print(filename)
    filepath = os.path.join(folder_name, filename)
    # NOTE(review): no index_col is passed, so the index is pandas' default
    # row numbering and the duplicate filter below cannot drop anything.
    # Confirm whether index_col=0 (as used in the all-folders cell) was intended.
    df = pd.read_csv(filepath)
    is_first_occurrence = ~df.index.duplicated(keep='first')
    df = df[is_first_occurrence]
    tracks.append(df)

# Average x, y values over fixed-size row chunks within the leading frames of a track
def average_xy(df, chunk_size=50, max_frames=3000):
    """Average the 'x' and 'y' columns over consecutive chunks of rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Track with 'x' and 'y' columns. Rows whose index label is
        >= ``max_frames`` are ignored.
    chunk_size : int, default 50
        Number of consecutive rows (by position) averaged together.
    max_frames : int, default 3000
        Exclusive upper bound on the index labels that are kept.

    Returns
    -------
    pandas.DataFrame
        One row per chunk with columns 'frame' (index label of the chunk's
        first row), 'avg_x' and 'avg_y'. The columns are present even when
        no rows survive the filter, so downstream merges on 'frame' still work.

    Raises
    ------
    ValueError
        If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    df = df[df.index < max_frames]

    # Chunks are positional: a final partial chunk is averaged over whatever rows remain
    chunks = (df.iloc[start:start + chunk_size] for start in range(0, len(df), chunk_size))
    averaged_data = [
        {'frame': chunk.index[0], 'avg_x': chunk['x'].mean(), 'avg_y': chunk['y'].mean()}
        for chunk in chunks
    ]

    return pd.DataFrame(averaged_data, columns=['frame', 'avg_x', 'avg_y'])

# Chunk-average every loaded track
averaged_tracks = list(map(average_xy, tracks))

# Show the averaged tracks, numbered from 1
for i, avg_df in enumerate(averaged_tracks):
    print(f"Averaged DataFrame {i+1}:\n", avg_df)

# Show the de-duplicated input tracks, numbered from 1
for i, df in enumerate(tracks):
    print(f"Cleaned DataFrame {i+1}:\n", df)

In [None]:
# Display the averaged track built from the first file in `f_names`
averaged_tracks[0]

In [None]:
# Inner-join the first two averaged tracks on the chunk-start frame.
# The first track gets the '_caregiver' suffix and the second '_child'
# (which file is which depends on the order returned by sorted_alpha).
combined = averaged_tracks[0].merge(
    averaged_tracks[1],
    on='frame',
    suffixes=('_caregiver', '_child'),
)
print("Aligned Positions Data:")
print(combined)

In [None]:
# Calculate the differences between consecutive rows for 'avg_x_child' and 'avg_y_child'
# (the first row has no predecessor, so its NaN difference is dropped)
diff_x = combined['avg_x_child'].diff().dropna()
diff_y = combined['avg_y_child'].diff().dropna()

# Euclidean length of each chunk-to-chunk displacement of the child
child_movement = np.sqrt(diff_x**2 + diff_y**2)
print("Euclidean differences between consecutive rows for 'avg_x_child' and 'avg_y_child':")
# Print the series computed above (the previous name `euclidean_distances` was never defined)
print(child_movement)

In [None]:
# Calculate the Euclidean distances between avg x and y positions of caregiver and child
diff_x = combined['avg_x_caregiver'] - combined['avg_x_child']
diff_y = combined['avg_y_caregiver'] - combined['avg_y_child']

# Per-chunk caregiver-child distance
proximity = np.sqrt(diff_x**2 + diff_y**2)
print("Euclidean distances between avg x and y positions of caregiver and child:")
# Print the series computed above (the previous name `euclidean_distances` was never defined)
print(proximity)

In [None]:
## now for all the files in the folder

video_folder = '/Users/andrei-macpro/Documents/Data/videos/play_videos'

# Parallel lists: durations[k] is the duration in seconds of file_names[k]
durations = []
file_names = []

for folder_name in sorted(os.listdir(video_folder)):
    if folder_name == ".DS_Store":
        continue
    file_path = os.path.join(video_folder, folder_name)
    if not os.path.isfile(file_path):
        continue
    clip = VideoFileClip(file_path)
    try:
        print(clip.fps)
        durations.append(clip.duration)
        # Record which video each duration belongs to (the list was previously left empty)
        file_names.append(folder_name)
    finally:
        # Release the reader opened for this video instead of leaking one per file
        clip.close()



In [None]:
# find a way to align the recordings in terms of time
# NOTE(review): `durations` is filled from the play_videos folder in the visible
# cell above, so this "meal" series holds the same values as `durations_play`
# unless that cell was re-run against a meal folder - confirm.
durations_meal = pd.Series(durations)
# Displays the durations in ascending order; the sorted copy is not stored
durations_meal.sort_values()

In [None]:
# Histogram of the values in durations_meal
durations_meal.hist()


In [None]:
# find a way to align the recordings in terms of time
# Wrap the collected `durations` list in a Series for inspection
durations_play = pd.Series(durations)
# Displays the durations in ascending order; the sorted copy is not stored
durations_play.sort_values()

In [None]:
# Histogram of the values in durations_play
durations_play.hist()

In [None]:
# the features are child movement, caregiver movement, and proximity 
# it doesn't make sense to align the recordings of the same participant in terms of time
# so find a way to combine them 
# also resample for same fps 


# Dynamic Time Warping

In [None]:
# resample a frame-indexed track to a different frame rate
def resample_df(df, original_fps, target_fps):
    """Resample a frame-indexed DataFrame from one frame rate to another.

    The index of ``df`` is interpreted as frame numbers recorded at
    ``original_fps``. Rows are grouped into time bins of ``1 / target_fps``
    seconds, averaged per bin, and bins without data are dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric columns indexed by frame number. It is not modified.
    original_fps : float
        Frame rate the index of ``df`` refers to.
    target_fps : float
        Frame rate of the returned frame index.

    Returns
    -------
    pandas.DataFrame
        Per-bin means, indexed by frame number at ``target_fps`` (index name 'time').

    Raises
    ------
    ValueError
        If either frame rate is not positive.
    """
    if original_fps <= 0 or target_fps <= 0:
        raise ValueError("original_fps and target_fps must be positive")

    # Work on a copy so the caller's DataFrame keeps its frame index and columns
    timed = df.copy()
    timed.index = pd.to_timedelta(df.index / original_fps, unit='s').rename('time')

    # A Timedelta bin width avoids the offset-alias string ('N' is deprecated in recent pandas)
    resample_interval = pd.Timedelta(int(1e9 / target_fps), unit='ns')
    df_resampled = timed.resample(resample_interval).mean().dropna()

    # Round rather than truncate: seconds * fps can land just below an integer
    # (e.g. 1.16 * 25 -> 28.999999999999996), which truncation would turn into
    # a duplicate of the previous frame number.
    seconds = df_resampled.index.total_seconds().to_numpy()
    frame_numbers = np.rint(seconds * target_fps).astype(int)
    df_resampled.index = pd.Index(frame_numbers, name='time')
    return df_resampled

In [None]:
tracking_folder = '/Users/andrei-macpro/Documents/Data/openpose/play/tracking/tracking/'
video_folder = '/Users/andrei-macpro/Documents/Data/videos/play_videos'   # Assuming video files

# Maps each tracking sub-folder name to one DataFrame holding the x/y columns
# of every track file in that folder, restricted to frames present in all of them.
time_series = {}
for folder_name in sorted(os.listdir(tracking_folder)):
    if folder_name == ".DS_Store":
        continue
    print(folder_name)
    file_path = os.path.join(tracking_folder, folder_name)

    # Only the frame rate is needed from the video, so close the reader right away
    clip = VideoFileClip(os.path.join(video_folder, folder_name + '.mp4'))
    try:
        original_fps = clip.fps
    finally:
        clip.close()

    session_tracks = []
    for file in sorted(os.listdir(file_path)):
        if file == ".DS_Store":
            continue
        filepath = os.path.join(file_path, file)
        df = pd.read_csv(filepath, index_col=0)
        df = df[~df.index.duplicated(keep='first')]
        # Columns are named after the file stem, e.g. 'x_<stem>' / 'y_<stem>'
        track_name = file.split('.')[0]
        df.columns = ['x_' + track_name, 'y_' + track_name]
        # Recordings above 25 fps are brought down to 25 fps; others are used unchanged
        if original_fps > 25:
            df = resample_df(df, original_fps=original_fps, target_fps=25)
        session_tracks.append(df)

    if not session_tracks:
        # pd.concat raises on an empty list; skip folders that contain no track files
        print(f"No track files found in {folder_name}, skipping")
        continue

    # Side-by-side join on the frame index, keeping only frames present in every track
    combined = pd.concat(session_tracks, axis=1).dropna()
    time_series[folder_name] = combined

         





In [None]:
import numpy as np
from fastdtw import fastdtw
from scipy.spatial.distance import euclidean


def _warp_onto_reference(sequence, reference):
    """Warp `sequence` onto the row axis of `reference` using a FastDTW path.

    Every reference row receives the MEAN of the source rows that the DTW path
    maps onto it. (Summing them, as was done before, inflates the coordinates
    wherever several source frames collapse onto one reference frame.)

    Returns a DataFrame with one row per reference row and the reference's columns.
    """
    _, path = fastdtw(sequence.values, reference.values, dist=euclidean)
    source_rows, reference_rows = np.asarray(path).T

    # Match columns by label, as the previous row-wise Series addition did
    source_values = sequence.reindex(columns=reference.columns).to_numpy(dtype=float)

    sums = np.zeros((len(reference), len(reference.columns)), dtype=float)
    counts = np.zeros(len(reference), dtype=float)
    # np.add.at accumulates correctly when a reference row appears several times in the path
    np.add.at(sums, reference_rows, source_values[source_rows])
    np.add.at(counts, reference_rows, 1.0)
    with np.errstate(divide='ignore', invalid='ignore'):
        warped = pd.DataFrame(sums / counts[:, None], columns=reference.columns)

    # Exact zeros are treated as missing and interpolated, as before.
    # NOTE(review): this also blanks genuine 0.0 coordinates - confirm that is intended.
    return warped.replace(0, np.nan).interpolate(method='linear', limit_direction='both')


# Identify the longest sequence
longest_sequence_key = max(time_series, key=lambda k: len(time_series[k]))
longest_sequence = time_series[longest_sequence_key]

# Align sequences to the longest sequence using DTW
aligned_to_longest = {}

for key, sequence in time_series.items():
    print(f"Aligning sequence for {key} to the longest sequence...")
    if key == longest_sequence_key:
        # Positional index, so it matches the warped sequences built below
        aligned_to_longest[key] = sequence.reset_index(drop=True)
        continue
    aligned_to_longest[key] = _warp_onto_reference(sequence, longest_sequence)

aligned_sequences_list = list(aligned_to_longest.values())
# Compute the average reference sequence, ignoring NaN values
average_reference = pd.DataFrame(np.nanmean(np.array([df.values for df in aligned_sequences_list]), axis=0),
                                 columns=longest_sequence.columns)

# Fill any rows that are NaN in every aligned sequence
average_reference = average_reference.interpolate(method='linear', limit_direction='both')

# Align sequences to the average reference using DTW
aligned_dict = {}

for key, sequence in time_series.items():
    print(f"Aligning sequence for {key} to the average reference...")
    aligned_dict[key] = _warp_onto_reference(sequence, average_reference)

# Print the aligned features
for key, aligned_sequence in aligned_dict.items():
    print(f"Aligned sequence for {key}:")
    print(aligned_sequence)

In [None]:
# Function to perform dtw_barycenter_averaging with progress tracking
def dtw_barycenter_averaging_with_progress(time_series, num_iterations=10):
    """Run DTW Barycenter Averaging one refinement step at a time, reporting progress.

    Each loop pass performs a single DBA update (``max_iter=1``) that starts from
    the barycenter of the previous pass. Previously every pass restarted the whole
    averaging from scratch, so ``num_iterations`` only multiplied the runtime and
    every pass produced the same result.

    Parameters
    ----------
    time_series : dict
        Maps a key to an object exposing ``.values`` (e.g. a DataFrame); the
        values are passed to tslearn as the series to average.
    num_iterations : int, default 10
        Number of DBA update steps to perform.

    Returns
    -------
    The barycenter returned by tslearn after the last step.

    Raises
    ------
    ValueError
        If ``time_series`` is empty or ``num_iterations`` is smaller than 1.
    """
    if num_iterations < 1:
        raise ValueError("num_iterations must be at least 1")

    start_time = time.time()  # Start the timer

    # Convert the time series to a list of numpy arrays
    time_series_array = [sequence.values for sequence in time_series.values()]
    if not time_series_array:
        raise ValueError("time_series must contain at least one sequence")

    print("Starting DTW Barycenter Averaging with progress tracking...")

    # None lets tslearn pick its default initial barycenter on the first pass
    barycenter = None
    with tqdm(total=num_iterations, desc="DTW Barycenter Averaging Iterations") as pbar:
        for iteration in range(num_iterations):
            # One DBA update, warm-started from the previous pass
            barycenter = dtw_barycenter_averaging(
                time_series_array,
                init_barycenter=barycenter,
                max_iter=1,
            )

            # Update the progress bar
            pbar.update(1)

            # Print how long the process has taken so far
            elapsed_time = time.time() - start_time
            print(f"Iteration {iteration + 1}/{num_iterations} completed. Time elapsed: {elapsed_time:.2f} seconds.")

    total_time = time.time() - start_time  # Calculate total elapsed time
    print(f"Total time for DTW Barycenter Averaging: {total_time:.2f} seconds.")

    return barycenter  # Return the final barycenter result


In [None]:
def batch_dtw_barycenter_averaging(time_series_dict, batch_size=10, num_iterations=10):
    """Compute a DTW barycenter in batches and combine the per-batch barycenters.

    Parameters
    ----------
    time_series_dict : dict
        Maps a key to a sequence accepted by ``dtw_barycenter_averaging_with_progress``.
    batch_size : int, default 10
        Number of sequences averaged together in one batch.
    num_iterations : int, default 10
        Forwarded to ``dtw_barycenter_averaging_with_progress`` for every batch,
        and used as ``max_iter`` when batch barycenters have to be merged with DBA.

    Returns
    -------
    numpy.ndarray
        The combined barycenter.

    Raises
    ------
    ValueError
        If ``time_series_dict`` is empty or ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    keys = list(time_series_dict.keys())
    if not keys:
        raise ValueError("time_series_dict must contain at least one sequence")

    batched_barycenters = []
    batch_weights = []

    # Divide time series into batches
    for i in range(0, len(keys), batch_size):
        batch_keys = keys[i:i+batch_size]
        batch = {key: time_series_dict[key] for key in batch_keys}

        # Perform DTW barycenter averaging on the batch
        batch_barycenter = dtw_barycenter_averaging_with_progress(batch, num_iterations)
        batched_barycenters.append(batch_barycenter)
        batch_weights.append(len(batch_keys))

    # When every batch barycenter has the same shape, keep the plain element-wise mean
    if len({np.shape(b) for b in batched_barycenters}) == 1:
        return np.mean(batched_barycenters, axis=0)

    # Barycenters of different lengths cannot be averaged element-wise (np.mean
    # fails on a ragged list), so merge them with one more DBA pass, weighting
    # each barycenter by the number of sequences it summarises.
    final_barycenter = dtw_barycenter_averaging(
        batched_barycenters,
        weights=np.asarray(batch_weights, dtype=float),
        max_iter=num_iterations,
    )
    return final_barycenter

# Run batch processing over every recording in `time_series`
# (batches of 10 sequences, num_iterations=10 per batch)
average_sequence = batch_dtw_barycenter_averaging(time_series, batch_size=10, num_iterations=10)

In [None]:
# Identify the longest sequence; every other recording is warped onto its row axis
longest_sequence_key = max(time_series, key=lambda k: len(time_series[k]))
longest_sequence = time_series[longest_sequence_key]
reference_columns = longest_sequence.columns
reference_length = len(longest_sequence)

# Align sequences to the longest sequence using DTW
aligned_to_longest = {}

for key, sequence in time_series.items():
    print(f"Aligning sequence for {key} to the longest sequence...")
    if key == longest_sequence_key:
        # Positional index, so label-based windows behave the same as for warped sequences
        aligned_to_longest[key] = sequence.reset_index(drop=True)
        continue

    # Perform DTW alignment
    distance, path = fastdtw(sequence.values, longest_sequence.values, dist=euclidean)
    source_rows, reference_rows = np.asarray(path).T

    # Each reference row gets the MEAN of the source rows mapped onto it.
    # Summing them (the previous behaviour) inflates coordinates wherever
    # several source frames collapse onto one reference frame.
    source_values = sequence.reindex(columns=reference_columns).to_numpy(dtype=float)
    sums = np.zeros((reference_length, len(reference_columns)), dtype=float)
    counts = np.zeros(reference_length, dtype=float)
    np.add.at(sums, reference_rows, source_values[source_rows])
    np.add.at(counts, reference_rows, 1.0)
    with np.errstate(divide='ignore', invalid='ignore'):
        aligned_sequence = pd.DataFrame(sums / counts[:, None], columns=reference_columns)

    # Treat exact zeros as missing, interpolate, then fill what is left with the column mean.
    # NOTE(review): this also blanks genuine 0.0 coordinates - confirm that is intended.
    aligned_sequence = aligned_sequence.replace(0, np.nan).interpolate(method='linear', limit_direction='both').fillna(aligned_sequence.mean())
    aligned_to_longest[key] = aligned_sequence

# Ensure all aligned sequences have the same length as the longest sequence
for key, aligned_sequence in aligned_to_longest.items():
    if len(aligned_sequence) < len(longest_sequence):
        # Pad the sequence with NaNs to match the length of the longest sequence
        padding = pd.DataFrame(np.nan, index=range(len(longest_sequence) - len(aligned_sequence)), columns=aligned_sequence.columns)
        aligned_to_longest[key] = pd.concat([aligned_sequence, padding], ignore_index=True)

# Print the aligned features for verification
for key, aligned_sequence in aligned_to_longest.items():
    print(f"Aligned sequence for {key}:")
    print(aligned_sequence)

In [None]:
# try without the averaging: inspect the sequences aligned to the longest recording
aligned_to_longest


In [None]:
def _step_length(x_positions, y_positions):
    """Euclidean length of each row-to-row displacement (the first row has none)."""
    step_x = x_positions.diff().dropna()
    step_y = y_positions.diff().dropna()
    return np.sqrt(step_x**2 + step_y**2)


# Per recording: child movement, caregiver movement and caregiver-child distance
features_dict = {}
for key, df in aligned_to_longest.items():
    gap_x = df['x_cg'] - df['x_child']
    gap_y = df['y_cg'] - df['y_child']

    feature_columns = [
        _step_length(df['x_child'], df['y_child']),
        _step_length(df['x_cg'], df['y_cg']),
        np.sqrt(gap_x**2 + gap_y**2),
    ]

    # Rows lacking any of the three features (e.g. the first row, which has no movement) are dropped
    features_df = pd.concat(feature_columns, axis=1).dropna()
    features_df.columns = ['child_movement', 'caregiver_movement', 'proximity']
    features_dict[key] = features_df

# Print the features for verification
for key, features_df in features_dict.items():
    print(f"Features for {key}:")
    print(features_df)

In [None]:
# Display the feature table stored under the key '1043_play'
features_dict['1043_play']

In [None]:
# Take the first 2 minutes of each df: rows whose index label is <= 3000
# (3000 frames is presumably 2 minutes at the 25 fps resampling target - confirm)
features_first_2min = {}
for key, df in features_dict.items():
    features_first_2min[key] = df[df.index <= 3000]

In [None]:
frames_in_2_minutes = 3000


def _centre_window(df, width):
    """Return the rows of `df` in a window of `width` positions centred on its midpoint."""
    total_frames = len(df)
    middle_index = total_frames // 2
    half_width = width // 2
    first = max(0, middle_index - half_width)
    last = min(total_frames, middle_index + half_width)
    return df.iloc[first:last]


# Middle window of every recording, clipped to the recording's own length
features_middle_2min = {
    key: _centre_window(df, frames_in_2_minutes)
    for key, df in features_dict.items()
}

In [None]:
# Take the last 3000 index labels of each df
features_last_2min = {}
for key, df in features_dict.items():
    max_index = df.index.max()
    # Strict '>' keeps labels max_index-2999 .. max_index, i.e. 3000 labels;
    # '>=' kept 3001 and made this window one frame longer than the others.
    features_last_2min[key] = df[df.index > max_index - 3000]
features_last_2min

In [None]:
# Average every 50 consecutive rows, counted from the start of the window.
# Binning by position (instead of index label // 50) keeps the bins aligned with
# the window start, so the edge bins are not cut short by the label offset.
# NOTE: this overwrites features_first_2min; re-running the cell averages again.
features_first_2min = {key: df.groupby(np.arange(len(df)) // 50).mean() for key, df in features_first_2min.items()}

In [None]:
# Average every 50 consecutive rows, counted from the start of the window.
# Binning by position (instead of index label // 50) keeps the bins aligned with
# the window start, so the edge bins are not cut short by the label offset.
# NOTE: this overwrites features_last_2min; re-running the cell averages again.
features_last_2min = {key: df.groupby(np.arange(len(df)) // 50).mean() for key, df in features_last_2min.items()}

In [None]:
# Average every 50 consecutive rows, counted from the start of the window.
# Binning by position (instead of index label // 50) keeps the bins aligned with
# the window start, so the edge bins are not cut short by the label offset.
# NOTE: this overwrites features_middle_2min; re-running the cell averages again.
features_middle_2min = {key: df.groupby(np.arange(len(df)) // 50).mean() for key, df in features_middle_2min.items()}

In [None]:
# Number of consecutive rows averaged into one bin
group_size = 50
# NOTE(review): num_groups is not read again in the visible cells, and 23679
# looks like a hard-coded sequence length - confirm before relying on it.
num_groups = 23679 // group_size
# Average every `group_size` consecutive rows by position.
# This overwrites features_dict, so re-running the cell averages the averages.
features_dict = {key: df.groupby(np.arange(len(df)) // group_size).mean() for key, df in features_dict.items()}

In [None]:
# Report whether every value in the mapping has the same length
def check_same_length(features_dict):
    """Return True when all values of `features_dict` share one length.

    An empty mapping also yields True, since there is nothing to disagree.
    """
    distinct_lengths = {len(table) for table in features_dict.values()}
    return len(distinct_lengths) <= 1

# Report whether the per-recording feature tables can be stacked row-wise
length_report = (
    "All DataFrames in features_dict are of the same length."
    if check_same_length(features_dict)
    else "DataFrames in features_dict have different lengths."
)
print(length_report)

In [None]:
# Turn every feature table into one flat vector (row after row)
flattened_dict = {}
for key, df in features_dict.items():
    flattened_dict[key] = df.values.flatten()

# One row per recording, one column per flattened feature value
flattened_df = pd.DataFrame(flattened_dict).transpose()
print(flattened_df)


In [None]:
# Replace each index label with the integer parsed from the text before its first underscore.
# Not idempotent: once the index holds integers, `.str` no longer applies and a re-run fails.
flattened_df.index = flattened_df.index.str.split('_').str[0].astype(int)

In [None]:
# Display the flattened feature table after the index conversion
flattened_df

In [None]:
# Display the integer index labels of the flattened feature table
flattened_df.index

In [None]:
from sklearn.decomposition import PCA
# Print the original shape of flattened_df for reference
print("Original shape of flattened_df:", flattened_df.shape)
# Standardize the data (zero mean, unit variance per column)
# NOTE(review): the scaler and PCA are fitted on all rows, i.e. before any
# train/test split used later - confirm this is acceptable for the evaluation.
scaler = StandardScaler()
scaled_data = scaler.fit_transform(flattened_df.values)

# Keep 1% of the original number of columns as principal components
n_samples, n_features = scaled_data.shape
n_components = round(n_features - 99/100*n_features)
# PCA needs 1 <= n_components <= min(n_samples, n_features); without the clamp
# a narrow table rounds to 0 and a wide one can exceed the number of rows.
n_components = max(1, min(n_components, n_samples, n_features))
#n_components = 18
pca = PCA(n_components=n_components)
pca_transformed = pca.fit_transform(scaled_data)

# Convert the PCA-transformed data back to a DataFrame
pca_transformed_df = pd.DataFrame(pca_transformed, columns=[f'PC{i+1}' for i in range(pca_transformed.shape[1])])

# Print the shape of the PCA-transformed DataFrame to verify the reduction
print("Shape of PCA-transformed DataFrame:", pca_transformed_df.shape)

# Print the PCA-transformed DataFrame
print("PCA-transformed DataFrame:")
print(pca_transformed_df)

In [None]:
# Give the PCA rows the same index labels as the rows of flattened_df they were computed from
pca_transformed_df.index = flattened_df.index

In [None]:
# Load the combined play features and discard the columns that are not used below
unused_columns = ['Age', 'DAI', 'Rinab', 'IQ_T2', 'duration_meal', 'duration_play', 'Gender']
df_play = pd.read_csv(
    '/Users/andrei-macpro/Documents/Data/tracking/features/play/combined_features.csv',
    index_col=0,
).drop(columns=unused_columns)

In [None]:
# Replace each index label with the integer parsed from the text before its first underscore.
# Not idempotent: once the index holds integers, `.str` no longer applies and a re-run fails.
df_play.index = df_play.index.str.split('_').str[0].astype(int)
# Copy the 'label' column across by matching index labels; rows of pca_transformed_df
# without a matching label in df_play get NaN.
# NOTE(review): if df_play's converted index contains repeated labels this
# assignment may fail or be ambiguous - confirm the index is unique.
pca_transformed_df['label'] = df_play['label']

In [None]:
# Display the principal components together with the attached label column
pca_transformed_df

In [None]:
# Number of distinct index labels in the PCA table
len(pca_transformed_df.index.unique())

In [None]:
# Write the PCA table (including its index and the label column) to disk
pca_transformed_df.to_csv('/Users/andrei-macpro/Documents/Data/tracking/features/play/pca_proximity.csv')

In [None]:
## Open question: there are three time series (child movement, caregiver movement, and proximity).
## Should they be used in combination, or each one separately?

# Classification

In [None]:
import os
import sys
import pandas as pd
import matplotlib.pyplot as plt 
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold 
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.utils import shuffle
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV, GroupKFold
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.utils import shuffle
from sklearn.model_selection import cross_val_score


In [None]:
# Build the modelling table as a new DataFrame instead of aliasing pca_transformed_df:
# the string labels in pca_transformed_df stay untouched, so this cell can be re-run
# safely (mapping in place turned the labels into NaN on a second run).
# Map 'no_rad' to 0 and 'rad' to 1; any other value becomes NaN.
df = pca_transformed_df.assign(
    label=pca_transformed_df['label'].map({'no_rad': 0, 'rad': 1})
)


In [None]:
# Perform a grid search for each classifier
from sklearn.model_selection import GroupKFold, cross_validate

X = df.drop(['label'], axis=1)
y = df['label']
# Rows that share an index label are kept in the same fold
groups = df.index

# Create a GroupKFold object. Plain KFold ignores the `groups` passed to fit(),
# so rows with the same index label could end up in both the train and test folds.
gkf = GroupKFold(n_splits=5)

# Define the classifiers and their parameters
classifiers = [
    ('lr', LogisticRegression(), {'lr__C': [0.01, 0.1, 1, 10, 100], 'lr__penalty': ['l1', 'l2'], 'lr__solver': ['liblinear', 'saga']}),
    ('svc_linear', SVC(kernel='linear'), {'svc_linear__C': [0.01, 0.1, 1, 10, 100]}),
    ('svc_rbf', SVC(kernel='rbf'), {'svc_rbf__C': [0.01, 0.1, 1, 10, 100], 'svc_rbf__gamma': [0.01, 0.1, 1, 10, 100]}),
    ('rf', RandomForestClassifier(), {'rf__n_estimators': [10, 50, 100, 200], 'rf__max_depth': [None, 5, 10, 15], 'rf__min_samples_split': [2, 5, 10]})
]

# Metrics computed together on the same fitted models in every fold
scoring = {'f1_score': 'f1_macro', 'precision': 'precision_macro', 'recall': 'recall_macro'}

# Initialize a list to store the results
results = []

# Perform the grid search 10 times with different random states
for i in range(10):
    # Shuffle the data with a different random state each time
    X_shuffled, y_shuffled, groups_shuffled = shuffle(X, y, groups, random_state=i)

    # Perform a grid search for each classifier
    for name, classifier, params in classifiers:
        pipeline = Pipeline([('scaler', StandardScaler()), (name, classifier)])
        grid_search = GridSearchCV(pipeline, params, cv=gkf)
        grid_search.fit(X_shuffled, y_shuffled, groups=groups_shuffled)

        # One cross-validation run yields F1, precision and recall from the same
        # fitted models; three separate runs refit (and, for the random forest,
        # re-randomise) the model for every metric.
        # NOTE(review): the tuned estimator is scored on the data used to tune it,
        # so these scores are optimistic; nested CV would be needed for an unbiased estimate.
        cv_scores = cross_validate(
            grid_search.best_estimator_,
            X_shuffled,
            y_shuffled,
            groups=groups_shuffled,
            cv=gkf,
            scoring=scoring,
        )

        # Store the results in a dictionary and add it to the list
        results.append({
            'random_state': i,
            'classifier': name,
            'best_params': grid_search.best_params_,
            'best_score': grid_search.best_score_,
            'f1_score': cv_scores['test_f1_score'].mean(),
            'precision': cv_scores['test_precision'].mean(),
            'recall': cv_scores['test_recall'].mean()
        })

# Convert the results to a DataFrame
results_df = pd.DataFrame(results)

In [None]:
# Mean of the numeric result columns per classifier. numeric_only=True skips the
# dict-valued 'best_params' column, which cannot be averaged and raises on recent pandas.
results_df.groupby('classifier').mean(numeric_only=True)