In [None]:
import os
import sys
import pandas as pd
import matplotlib.pyplot as plt 
import numpy as np
from sklearn.preprocessing import StandardScaler

# Extract transition counts, dwell time, autocorrelation, entropy, slope, and Fourier features

In [None]:
from scipy.stats import zscore
def clean_outliers(df, threshold=3):
    """Drop rows whose first or second column is a z-score outlier.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose first two columns (selected by position) are checked.
    threshold : float, optional
        A row is dropped when ``|z| > threshold`` in either column.
        Defaults to 3.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` that are not outliers, in their original order and
        with their original index labels.
    """
    # nan_policy='omit' stops a single missing value from turning every
    # z-score of the column into NaN, which would silently disable the filter.
    z_scores_x = np.asarray(zscore(df.iloc[:, 0], nan_policy='omit'))
    z_scores_y = np.asarray(zscore(df.iloc[:, 1], nan_policy='omit'))

    # A NaN z-score compares False, so rows with a missing coordinate are kept.
    is_outlier = (np.abs(z_scores_x) > threshold) | (np.abs(z_scores_y) > threshold)

    # Filter by position rather than by index label, so a non-outlier row that
    # happens to share its label with an outlier is not removed with it.
    return df[~is_outlier]

In [None]:
from moviepy.editor import VideoFileClip
video_folder = '/Users/andrei-macpro/Documents/Data/videos/meal_videos' 

# Read the duration of every regular file in `video_folder` and collect the
# results in `durations`, one row per video. The id stored in 'file_name' is
# the part of the file name before its first '.'.
file_names = []
clip_durations = []

for folder_name in sorted(os.listdir(video_folder)):
    if folder_name == ".DS_Store":
        continue
    file_path = os.path.join(video_folder, folder_name)
    if not os.path.isfile(file_path):
        continue
    clip = VideoFileClip(file_path)
    try:
        clip_durations.append(clip.duration)
    finally:
        # Release whatever the clip holds open; without this every video
        # stays open until the kernel is restarted.
        clip.close()
    file_names.append(folder_name.split('.')[0])

durations = pd.DataFrame({'file_name': file_names, 'duration': clip_durations})

In [None]:
# Root of the tracking output. The loops below list its sub-folders and read
# every file inside each sub-folder as a CSV with two coordinate columns.
tracking_folder = '/Users/andrei-macpro/Documents/Data/openpose/meal/tracking/tracking' 

# Extract movement states

In [None]:
# For every folder under `tracking_folder`: load its tracks, standardise the
# coordinates, compute the frame-to-frame distance of each track and summarise
# that distance over sliding windows (mean, variance, max).
stats = {}
window_size = 200   # rows per sliding window
step_size = 5       # rows between the starts of consecutive windows
max_frame_gap = 5   # a step is only measured when the index gap is below this
segments_stats_child = []
scaler = StandardScaler()
for folder_name in sorted(os.listdir(tracking_folder)):
    if folder_name == ".DS_Store":
        continue
    file_path = os.path.join(tracking_folder, folder_name)
    if not os.path.isdir(file_path):
        # A stray file next to the per-video folders would make listdir fail.
        continue

    # One DataFrame per track file, keyed by the file name up to its first '.'.
    tracks = {}
    for file in sorted(os.listdir(file_path)):
        if file == ".DS_Store":
            continue
        track_name = file.split('.')[0]
        filepath = os.path.join(file_path, file)
        df = pd.read_csv(filepath, index_col=0)
        df = df[~df.index.duplicated(keep='first')]
        df.columns = ['x_' + track_name, 'y_' + track_name]
        df = clean_outliers(df)
        tracks[track_name] = df

    print(folder_name)
    # Standardise all coordinates of this folder together; the scaler is
    # re-fitted for every folder.
    combined_data = pd.concat(list(tracks.values()), axis=1)
    combined_scaled = scaler.fit_transform(combined_data)
    combined_scaled_df = pd.DataFrame(combined_scaled, index=combined_data.index, columns=combined_data.columns)

    # Split the scaled frame back into one frame per track. The columns are
    # selected by exact name (a prefix match would also pick up tracks whose
    # name merely starts with this one) and copied so that adding 'distance'
    # below does not write into a slice of `combined_scaled_df`.
    for track_name in tracks:
        tracks[track_name] = combined_scaled_df[['x_' + track_name, 'y_' + track_name]].copy()

    for track_name, track_df in tracks.items():
        # Euclidean distance between consecutive rows, NaN where the index
        # gap is not below `max_frame_gap`. The last row has no successor.
        xs = track_df['x_' + track_name].to_numpy()
        ys = track_df['y_' + track_name].to_numpy()
        frame_gaps = np.diff(track_df.index.to_numpy())
        step_lengths = np.sqrt(np.diff(xs) ** 2 + np.diff(ys) ** 2)
        step_lengths[~(frame_gaps < max_frame_gap)] = np.nan
        track_df['distance'] = np.append(step_lengths, np.nan)

        # Sliding-window statistics; the 'cg' track contributes no windows.
        k_means_segments = []
        if track_name != 'cg':
            distance = track_df['distance']
            for start in range(0, len(track_df) - window_size + 1, step_size):
                segment = distance.iloc[start:start + window_size]
                k_means_segments.append({'id': folder_name, 'stats': [segment.mean(), segment.var(), segment.max()]})

    # NOTE(review): this runs once per folder, after the track loop, so only
    # the windows of the last track (in sorted order) are kept. Presumably
    # that is the child track - confirm against the track file names.
    segments_stats_child.append(k_means_segments)      

        

# normalize the distances (mean, variance etc) by the duration of the video
    

In [None]:
# Inspect the window statistics collected for the first processed folder.
segments_stats_child[0]

In [None]:
# Flatten the per-folder lists into one record per window, spreading the
# [mean, variance, max] list stored under 'stats' into named fields.
segments_stats_child_flattened = [
    {
        'id': window['id'],
        'mean': window['stats'][0],
        'variance': window['stats'][1],
        'max': window['stats'][2],
    }
    for folder_windows in segments_stats_child
    for window in folder_windows
]

In [None]:
# One row per window; windows with any missing statistic are discarded.
segments_stats_df = pd.DataFrame(segments_stats_child_flattened).dropna()
# Feature-only view for clustering: everything except the 'id' column.
segments_stats_df_for_clustering = segments_stats_df.drop(columns='id')


In [None]:
from sklearn.cluster import KMeans

# Cluster the window statistics into three groups.
# n_init is set explicitly because its default differs between scikit-learn
# releases; pinning it keeps the clustering independent of the installed
# version and avoids the related FutureWarning.
kmeans = KMeans(n_clusters=3, random_state=0, n_init=10)  # Adjust the number of clusters as needed
kmeans.fit(segments_stats_df_for_clustering)


In [None]:
# KMeans numbers its clusters arbitrarily, but later cells treat the ids as
# ordered: a direct 0 <-> 2 switch is counted as a "jump" over cluster 1.
# Relabel the clusters so that their id increases with the centroid's value
# of the 'mean' feature, which makes cluster 1 the middle one by construction.
mean_feature_position = list(segments_stats_df_for_clustering.columns).index('mean')
centroid_order = np.argsort(kmeans.cluster_centers_[:, mean_feature_position])
ordered_label = np.empty_like(centroid_order)
ordered_label[centroid_order] = np.arange(len(centroid_order))
segments_stats_df['cluster'] = ordered_label[kmeans.labels_]


In [None]:
# Display the window table, now including the 'cluster' column.
segments_stats_df

In [None]:
from collections import defaultdict
# transition_counts[id][(from_cluster, to_cluster)] -> number of occurrences;
# both levels default to 0 / an empty counter on first access.
transition_counts = defaultdict(lambda: defaultdict(int))
# One group of windows per distinct 'id' value.
grouped = segments_stats_df.groupby('id')

In [None]:
# Count every change of cluster between consecutive rows of the same id.
# The loop variable is named `video_id` so the builtin `id` stays usable.
for video_id, group in grouped:
    labels = list(group['cluster'])
    for previous_cluster, cluster in zip(labels, labels[1:]):
        if previous_cluster != cluster:
            transition_counts[video_id][(previous_cluster, cluster)] += 1


In [None]:
# Collapse the per-pair counters into a single number of cluster switches
# for each id.
total_transitions = {
    video_id: sum(
        n_switches
        for (source, target), n_switches in pair_counts.items()
        if source != target
    )
    for video_id, pair_counts in transition_counts.items()
}


In [None]:
# Ids whose windows never change cluster have no entry yet; give them 0.
all_ids = segments_stats_df['id'].unique()
total_transitions = {
    video_id: total_transitions.get(video_id, 0)
    for video_id in all_ids
}

In [None]:
# Single-column frame of total transitions, indexed by 'id'.
total_transitions_df = pd.DataFrame(
    list(total_transitions.items()),
    columns=['id', 'total_transitions'],
).set_index('id')

In [None]:
# Count only the direct switches between cluster 0 and cluster 2.
# Note that this replaces the `transition_counts` built by the earlier cell.
jump_pairs = {(0, 2), (2, 0)}
transition_counts = defaultdict(lambda: defaultdict(int))

# Group the DataFrame by 'id'
grouped = segments_stats_df.groupby('id')

# Walk consecutive rows of each id; `video_id` avoids shadowing builtin `id`.
for video_id, group in grouped:
    labels = list(group['cluster'])
    for pair in zip(labels, labels[1:]):
        if pair in jump_pairs:
            transition_counts[video_id][pair] += 1

# Only 0 <-> 2 pairs were recorded above, so the total is the plain sum.
total_transitions_0_2 = {
    video_id: sum(pair_counts.values())
    for video_id, pair_counts in transition_counts.items()
}

# Ensure all ids are included, even if no transitions are found
all_ids = segments_stats_df['id'].unique()
total_transitions_0_2 = {video_id: total_transitions_0_2.get(video_id, 0) for video_id in all_ids}

# Convert the totals to a single-column frame indexed by 'id'
total_transitions_0_2_df = pd.DataFrame(
    list(total_transitions_0_2.items()),
    columns=['id', 'total_transitions_0_2'],
).set_index('id')

## calculate dwell time

In [None]:
# Number of windows per (id, cluster) pair, in long form ...
cluster_counts = (
    segments_stats_df
    .groupby(['id', 'cluster'])
    .size()
    .reset_index(name='count')
)
# ... and in wide form: one row per id, one column per cluster, 0 where an
# id has no window in that cluster.
cluster_counts_pivot = (
    cluster_counts
    .pivot(index='id', columns='cluster', values='count')
    .fillna(0)
    .astype(int)
)

In [None]:
# Sanity check: number of rows in each of the three per-id tables.
len(cluster_counts_pivot), len(total_transitions_df), len(total_transitions_0_2_df)

In [None]:
# Bring the three per-id tables onto the same index (the union of their ids),
# filling rows that a table did not have with 0.
all_ids = (
    cluster_counts_pivot.index
    .union(total_transitions_0_2_df.index)
    .union(total_transitions_df.index)
)

cluster_counts_pivot, total_transitions_0_2_df, total_transitions_df = (
    frame.reindex(all_ids).fillna(0)
    for frame in (cluster_counts_pivot, total_transitions_0_2_df, total_transitions_df)
)

In [None]:
# Put the cluster counts and both transition totals side by side, one row per id.
temporal_features = pd.concat([cluster_counts_pivot, total_transitions_0_2_df, total_transitions_df], axis=1)
# The rename is positional: it only fits when the pivot contributes exactly
# three cluster columns, followed by the two single-column transition tables.
temporal_features.columns = ['cluster_0', 'cluster_1', 'cluster_2', 'jump_transitions', 'total_transitions']
temporal_features


In [None]:
# Index the durations by 'file_name' and call that index 'id', matching the
# index name of `temporal_features`.
# The guard makes the cell safe to re-run: once 'file_name' has become the
# index there is no such column left to move.
if 'file_name' in durations.columns:
    durations = durations.set_index('file_name').rename_axis('id')
durations

In [None]:
# Reorder/restrict the durations to the ids present in `temporal_features`.
if not temporal_features.index.equals(durations.index):
    print("Indices are not aligned. Aligning indices...")
    # Reindexing leaves NaN for ids that have no duration, and every feature
    # divided by that duration becomes NaN as well, so name them explicitly.
    missing_ids = temporal_features.index.difference(durations.index)
    if len(missing_ids) > 0:
        print(f"No video duration found for: {list(missing_ids)}")
    durations = durations.reindex(temporal_features.index)

In [None]:
# Express every column relative to the video's duration, scaled by 100
# (rows are matched on the index).
# NOTE: the result is stored under the same name, so running this cell a
# second time would divide by the duration again.
temporal_features = temporal_features.div(durations['duration'], axis=0).mul(100)

In [None]:
# Display the duration-normalised features.
temporal_features

# Calculate autocorrelation, entropy, slope, and spectral features

In [None]:
from sklearn.linear_model import LinearRegression
from scipy.stats import entropy
def calculate_entropy(series, bins=None):
    """Shannon entropy (natural logarithm) of the values in ``series``.

    Parameters
    ----------
    series : pandas.Series
        Values to analyse; missing values are ignored.
    bins : int or sequence, optional
        Forwarded to ``Series.value_counts``. With the default ``None`` every
        distinct value is its own outcome, exactly as before. For continuous
        data almost every value is distinct, so pass a bin count to measure
        the entropy of a histogram instead.

    Returns
    -------
    float
        Entropy of the relative frequencies.
    """
    # Relative frequency of each distinct value (or of each bin).
    value_counts = series.value_counts(normalize=True, bins=bins)
    return entropy(value_counts)

# NOTE(review): this uses `track_df` as left behind by the loop of an earlier
# cell (the last track it processed), and `distance_entropy` is assigned again
# inside the loop of the next cell - this looks like a leftover smoke test.
distance_entropy = calculate_entropy(track_df['distance'])

# Calculate slope using linear regression
def calculate_slope(series):
    """Slope of a straight line fitted to the values of ``series``.

    The values are regressed on their position (0, 1, 2, ...) after missing
    values have been removed, so the index of ``series`` plays no role.

    Parameters
    ----------
    series : pandas.Series
        Values to fit.

    Returns
    -------
    float
        Fitted change per position, or NaN when there is nothing to fit.
    """
    # The regression rejects NaN, so drop missing values up front.
    y = series.dropna().values
    if len(y) == 0:
        # An empty (or all-NaN) series would otherwise raise inside fit().
        return np.nan
    X = np.arange(len(y)).reshape(-1, 1)
    model = LinearRegression()
    model.fit(X, y)
    return model.coef_[0]

In [None]:
# For every folder under `tracking_folder`: load its tracks, standardise the
# coordinates, compute the frame-to-frame distance of each track and describe
# that distance signal with autocorrelation, entropy, slope and spectral
# features. Results are stored as stats[folder_name][track_name].
stats = {}
lag = 50
max_frame_gap = 5   # a step is only measured when the index gap is below this
# Upper edge of the "low" band in cycles per sample; the spectrum spans
# 0 .. 0.5, so 0.25 splits it in half.
# TODO(review): the original never defined `low_freq_power`; confirm the cutoff.
low_freq_cutoff = 0.25
scaler = StandardScaler()
for folder_name in sorted(os.listdir(tracking_folder)):
    if folder_name == ".DS_Store":
        continue
    file_path = os.path.join(tracking_folder, folder_name)
    if not os.path.isdir(file_path):
        # A stray file next to the per-video folders would make listdir fail.
        continue

    # One DataFrame per track file, keyed by the file name up to its first '.'.
    tracks = {}
    for file in sorted(os.listdir(file_path)):
        if file == ".DS_Store":
            continue
        track_name = file.split('.')[0]
        filepath = os.path.join(file_path, file)
        df = pd.read_csv(filepath, index_col=0)
        df = df[~df.index.duplicated(keep='first')]
        df.columns = ['x_' + track_name, 'y_' + track_name]
        df = clean_outliers(df)
        tracks[track_name] = df

    print(folder_name)
    # Standardise all coordinates of this folder together; the scaler is
    # re-fitted for every folder.
    combined_data = pd.concat(list(tracks.values()), axis=1)
    combined_scaled = scaler.fit_transform(combined_data)
    combined_scaled_df = pd.DataFrame(combined_scaled, index=combined_data.index, columns=combined_data.columns)

    # Split the scaled frame back into one frame per track. The columns are
    # selected by exact name (a prefix match would also pick up tracks whose
    # name merely starts with this one) and copied so that adding 'distance'
    # below does not write into a slice of `combined_scaled_df`.
    for track_name in tracks:
        tracks[track_name] = combined_scaled_df[['x_' + track_name, 'y_' + track_name]].copy()

    for track_name, track_df in tracks.items():
        # Euclidean distance between consecutive rows, NaN where the index
        # gap is not below `max_frame_gap`. The last row has no successor.
        xs = track_df['x_' + track_name].to_numpy()
        ys = track_df['y_' + track_name].to_numpy()
        frame_gaps = np.diff(track_df.index.to_numpy())
        step_lengths = np.sqrt(np.diff(xs) ** 2 + np.diff(ys) ** 2)
        step_lengths[~(frame_gaps < max_frame_gap)] = np.nan
        track_df['distance'] = np.append(step_lengths, np.nan)

        distance = track_df['distance']
        autocorrelation_50 = distance.autocorr(lag=50)
        autocorrelation_100 = distance.autocorr(lag=100)
        distance_entropy = calculate_entropy(distance)

        valid_distance = distance.dropna()
        if valid_distance.empty:
            # Nothing to fit or transform for a track without a single
            # measurable step; record the features as missing.
            distance_slope = spectral_centroid = signal_power = low_freq_power = np.nan
        else:
            distance_slope = calculate_slope(valid_distance)
            fft_values = np.fft.fft(valid_distance.to_numpy())
            fft_magnitude = np.abs(fft_values)
            fft_frequencies = np.fft.fftfreq(len(valid_distance))
            # The spectrum of a real signal is mirrored around 0, so weighting
            # by the signed frequency cancels out; use its absolute value.
            abs_frequencies = np.abs(fft_frequencies)
            spectral_centroid = np.sum(abs_frequencies * fft_magnitude) / np.sum(fft_magnitude)
            signal_power = np.sum(fft_magnitude**2)
            # fftfreq never exceeds 0.5, so the former `> 0.5` mask selected
            # nothing; measure the band below the cutoff instead.
            low_freq_power = np.sum(fft_magnitude[abs_frequencies < low_freq_cutoff]**2)

        stats.setdefault(folder_name, {})[track_name] = {
            'autocorrelation_50': autocorrelation_50,
            'autocorrelation_100': autocorrelation_100,
            'entropy': distance_entropy,
            'slope': distance_slope,
            'spectral_centroid': spectral_centroid,
            'signal_power': signal_power,
            'low_freq_power': low_freq_power,
        }
        

# normalize the distances (mean, variance etc) by the duration of the video
    

In [None]:
# NOTE(review): shows the value left over from the last iteration of the
# loop in the previous cell - looks like a debugging leftover.
fft_magnitude

In [None]:
# Convert the stats dictionary to a DataFrame
df = pd.DataFrame.from_dict({(i,j): stats[i][j] 
                           for i in stats.keys() 
                           for j in stats[i].keys()},
                           orient='index')

# Set the index names
df.index.names = ['id', 'track_name']

In [None]:
# Reset only the second level of the index (track_name)
df.reset_index(level=1, inplace=True)

# Create new DataFrame to hold results
new_df = pd.DataFrame(index=df.index.unique())

# Loop over each unique track_name
for track_name in df['track_name'].unique():
    # Select rows for this track_name
    temp_df = df[df['track_name'] == track_name].copy()
    # Drop the 'track_name' column as it's no longer needed
    temp_df.drop(columns=['track_name'], inplace=True)
    # Add the track_name as a prefix to each column name
    temp_df.columns = [f'{track_name}_{col}' for col in temp_df.columns]
    # Add the results to new_df
    new_df = new_df.join(temp_df)

new_df

In [None]:
# Combine the window-based features with the per-track signal features;
# axis=1 places them side by side, matching rows on the index.
all_features = pd.concat([temporal_features, new_df], axis=1)
all_features

In [None]:
# Write the combined feature table to disk.
features_csv = '/Users/andrei-macpro/Documents/Data/tracking/features/meal/temporal_features.csv'
# to_csv does not create missing folders, so make sure the target exists.
os.makedirs(os.path.dirname(features_csv), exist_ok=True)
all_features.to_csv(features_csv)