In [None]:
import os
import sys
import pandas as pd
import matplotlib.pyplot as plt 
import numpy as np
from sorted_alpha import sorted_alpha
from moviepy.editor import VideoFileClip
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from tslearn.barycenters import dtw_barycenter_averaging
import time
from tqdm import tqdm

In [None]:
# Resample a frame-indexed track to a common frame rate (the notebook uses 25 fps).
def resample_df(df, original_fps, target_fps):
    """Resample a frame-indexed DataFrame from ``original_fps`` to ``target_fps``.

    The numeric index of ``df`` is interpreted as frame numbers recorded at
    ``original_fps``. Rows are averaged inside consecutive time bins of
    ``int(1e9 / target_fps)`` nanoseconds, bins containing a NaN are dropped,
    and the result is re-indexed by frame number at ``target_fps``.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric columns indexed by frame number. It is not modified.
    original_fps : float
        Frame rate at which ``df`` was recorded.
    target_fps : float
        Frame rate of the returned frame.

    Returns
    -------
    pandas.DataFrame
        A new frame with an integer index (named ``'time'``) holding frame
        numbers at ``target_fps``.
    """
    # Work on a copy so the caller's frame keeps its own index and columns.
    timed = df.copy()
    timed.index = pd.to_timedelta(df.index / original_fps, unit='s').rename('time')

    # One bin per target frame. Passing a Timedelta instead of an 'N'-suffixed
    # string avoids the nanosecond offset alias deprecated by recent pandas.
    step = pd.Timedelta(int(1e9 / target_fps), unit='ns')
    df_resampled = timed.resample(step).mean().dropna()

    # Round rather than truncate: seconds * fps is a float, and e.g.
    # 1.16 s * 25 evaluates to 28.999999999999996, which truncation would
    # turn into frame 28 (a duplicate) instead of frame 29.
    seconds = df_resampled.index.total_seconds().to_numpy()
    frame_numbers = np.rint(seconds * target_fps).astype(int)
    df_resampled.index = pd.Index(frame_numbers, name='time')
    return df_resampled

In [None]:
tracking_folder = '/Users/andrei-macpro/Documents/Data/openpose/play/tracking/tracking/'
video_folder = '/Users/andrei-macpro/Documents/Data/videos/play_videos'   # one '<folder_name>.mp4' per tracking folder
TARGET_FPS = 25  # recordings with a higher frame rate are resampled down to this


def load_track(csv_path, source_fps, target_fps=TARGET_FPS):
    """Read one tracking CSV and return it as an ``x_<role>`` / ``y_<role>`` frame.

    ``<role>`` is the file name up to its first dot (the features below expect
    the roles ``child`` and ``cg``). The CSV's first column is used as the
    index and the file must contain exactly two further columns. Repeated
    index values keep their first row only. When ``source_fps`` exceeds
    ``target_fps`` the track is resampled with ``resample_df``.
    """
    role = os.path.basename(csv_path).split('.')[0]
    track = pd.read_csv(csv_path, index_col=0)
    track = track[~track.index.duplicated(keep='first')]
    track.columns = ['x_' + role, 'y_' + role]
    if source_fps > target_fps:
        track = resample_df(track, original_fps=source_fps, target_fps=target_fps)
    return track


def movement_and_proximity(combined):
    """Derive per-row movement and proximity features from aligned tracks.

    ``combined`` must hold the columns ``x_child``, ``y_child``, ``x_cg`` and
    ``y_cg``. Returns a frame with the columns ``child_movement`` and
    ``caregiver_movement`` (Euclidean distance between consecutive rows) and
    ``proximity`` (Euclidean distance between the two tracks in the same row).
    The first row has no predecessor and is therefore dropped.
    """
    def displacement(role):
        step_x = combined['x_' + role].diff()
        step_y = combined['y_' + role].diff()
        return np.sqrt(step_x**2 + step_y**2)

    gap_x = combined['x_cg'] - combined['x_child']
    gap_y = combined['y_cg'] - combined['y_child']
    proximity = np.sqrt(gap_x**2 + gap_y**2)

    # NOTE(review): `combined` may have gaps in its index after dropna(), in
    # which case a "consecutive row" step spans more than one frame - confirm
    # this is acceptable for the movement features.
    features = pd.concat(
        [displacement('child'), displacement('cg'), proximity], axis=1
    ).dropna()
    features.columns = ['child_movement', 'caregiver_movement', 'proximity']
    return features


time_series = {}
for folder_name in sorted(os.listdir(tracking_folder)):
    folder_path = os.path.join(tracking_folder, folder_name)
    # Skip '.DS_Store' and any other stray file before touching the video.
    if not os.path.isdir(folder_path):
        continue
    print(folder_name)

    # The clip is opened only to read its frame rate; close it straight away
    # so the underlying reader is not left open for every recording.
    clip = VideoFileClip(os.path.join(video_folder, folder_name + '.mp4'))
    try:
        source_fps = clip.fps
    finally:
        clip.close()

    tracks = [
        load_track(os.path.join(folder_path, file), source_fps)
        for file in sorted(os.listdir(folder_path))
        if file != ".DS_Store"
    ]
    # Keep only the rows for which every track has a value.
    combined = pd.concat(tracks, axis=1).dropna()
    time_series[folder_name] = movement_and_proximity(combined)

         





In [None]:
# Collapse each recording's feature frame into one Series of column means,
# keyed by the same name as in `time_series`.
averaged_time_series = {
    recording: features.mean()
    for recording, features in time_series.items()
}
averaged_time_series

In [None]:
# Build the table with one column per recording, then flip it so that each
# recording becomes a row and each averaged feature a column.
averaged_df = pd.DataFrame(averaged_time_series).transpose()
averaged_df

In [None]:
# NOTE(review): `df_play` is only created by the cell that follows this one,
# so on a fresh kernel this cell raises NameError unless that cell runs first.
# The assignment aligns `df_play['label']` on the index of `averaged_df`, so it
# has to happen before the index is rewritten on the next line (presumably
# `df_play` is indexed by the same recording names - confirm).
averaged_df['label'] = df_play['label']
# Replace each index entry by the integer before its first '_'; this value is
# later used as the cross-validation group. The cell cannot be re-run as is:
# after this line the index no longer holds strings.
averaged_df.index = averaged_df.index.str.split('_').str[0].astype(int)

In [None]:
# Load the combined feature table (first CSV column as index) and drop the
# columns listed below; the visible cells only read its 'label' column.
df_play = pd.read_csv(
    '/Users/andrei-macpro/Documents/Data/tracking/features/play/combined_features.csv',
    index_col=0,
).drop(
    columns=['Age', 'DAI', 'Rinab', 'IQ_T2', 'duration_meal', 'duration_play', 'Gender']
)

In [None]:
import os
import sys
import pandas as pd
import matplotlib.pyplot as plt 
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold 
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.utils import shuffle
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV, GroupKFold
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.utils import shuffle
from sklearn.model_selection import cross_val_score


In [None]:
# `df` is an alias of `averaged_df`, not a copy: the encoding below is visible
# through both names.
df = averaged_df

# Map 'no_rad' to 0 and 'rad' to 1.
# The guard keeps the cell re-runnable: mapping an already-encoded column a
# second time would turn every 0/1 into NaN.
if not pd.api.types.is_numeric_dtype(df['label']):
    df['label'] = df['label'].map({'no_rad': 0, 'rad': 1})


In [None]:
# Compare several classifiers with grouped cross-validation, repeated over
# ten random fold assignments.
# Only 'child_movement' is used as a feature; use df.drop(['label'], axis=1)
# instead to train on every averaged feature.
X = df[['child_movement']]
y = df['label']
# Rows sharing an index value are kept in the same fold.
groups = df.index

# Create a GroupKFold object
gkf = GroupKFold(n_splits=5)

# Define the classifiers and their parameters
classifiers = [
    ('lr', LogisticRegression(), {'lr__C': [0.01, 0.1, 1, 10, 100], 'lr__penalty': ['l1', 'l2'], 'lr__solver': ['liblinear', 'saga']}),
    ('svc_linear', SVC(kernel='linear'), {'svc_linear__C': [0.01, 0.1, 1, 10, 100]}),
    ('svc_rbf', SVC(kernel='rbf'), {'svc_rbf__C': [0.01, 0.1, 1, 10, 100], 'svc_rbf__gamma': [0.01, 0.1, 1, 10, 100]}),
    ('rf', RandomForestClassifier(), {'rf__n_estimators': [10, 50, 100, 200], 'rf__max_depth': [None, 5, 10, 15], 'rf__min_samples_split': [2, 5, 10]})
]

# Every metric is computed during the grid search itself, on the same fitted
# models, instead of re-fitting the best estimator once per metric afterwards.
# 'accuracy' matches the classifiers' default score and selects the best
# parameters, as before.
scoring = {
    'accuracy': 'accuracy',
    'f1_macro': 'f1_macro',
    'precision_macro': 'precision_macro',
    'recall_macro': 'recall_macro',
}

# Initialize a list to store the results
results = []

# Perform the grid search 10 times with different random states
for i in range(10):
    # Shuffle the data with a different random state each time
    X_shuffled, y_shuffled, groups_shuffled = shuffle(X, y, groups, random_state=i)

    # GroupKFold derives its folds from the sorted group labels and the group
    # sizes, not from the row order, so shuffling rows alone reproduces the
    # same folds in every repetition. Giving the groups random (but
    # membership-preserving) labels makes the fold assignment depend on `i`.
    unique_groups, group_codes = np.unique(np.asarray(groups_shuffled), return_inverse=True)
    fold_groups = np.random.RandomState(i).permutation(len(unique_groups))[group_codes]

    # Perform a grid search for each classifier
    for name, classifier, params in classifiers:
        # Seed stochastic estimators so each repetition is reproducible.
        if 'random_state' in classifier.get_params():
            classifier.set_params(random_state=i)

        pipeline = Pipeline([('scaler', StandardScaler()), (name, classifier)])
        grid_search = GridSearchCV(pipeline, params, cv=gkf, scoring=scoring, refit='accuracy', n_jobs=-1)
        grid_search.fit(X_shuffled, y_shuffled, groups=fold_groups)

        # Cross-validated means of the selected parameter combination.
        # NOTE(review): these are measured on the folds that also selected
        # the parameters, so they are optimistic; use nested CV for an
        # unbiased estimate.
        best = grid_search.best_index_
        cv_results = grid_search.cv_results_

        # Store the results in a dictionary and add it to the list
        results.append({
            'random_state': i,
            'classifier': name,
            'best_params': grid_search.best_params_,
            'best_score': grid_search.best_score_,
            'f1_score': cv_results['mean_test_f1_macro'][best],
            'precision': cv_results['mean_test_precision_macro'][best],
            'recall': cv_results['mean_test_recall_macro'][best]
        })

# Convert the results to a DataFrame
results_df = pd.DataFrame(results)

In [None]:
# Average the numeric columns per classifier. `best_params` holds dicts, which
# cannot be averaged and make a plain .mean() raise on pandas >= 2.0.
results_df.groupby('classifier').mean(numeric_only=True)