In [17]:
import pandas as pd
import os
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA

# === Column Definition ===
expected_columns = [
    'time', 'palm_position_x', 'palm_position_y', 'palm_position_z',
    'palm_normal_x', 'palm_normal_y', 'palm_normal_z',
    'palm_direction_x', 'palm_direction_y', 'palm_direction_z',
    'hand_grab_angle', 'hand_grab_strength', 'hand_pinch_angle', 'hand_pinch_strength',
    'thumb_extension', 'index_extension', 'middle_extension', 'ring_extension', 'pinky_extension'
]

# === Utilities ===
def is_row_empty(row):
    return all(pd.isna(cell) or (isinstance(cell, str) and cell.strip() == '') for cell in row)

def trim_leading_empty_rows(df):
    for i in range(len(df)):
        if not is_row_empty(df.iloc[i]):
            return df.iloc[i:].reset_index(drop=True)
    return pd.DataFrame(columns=df.columns)

def trim_trailing_empty_rows(df):
    for i in reversed(range(len(df))):
        if not is_row_empty(df.iloc[i]):
            return df.iloc[:i+1].reset_index(drop=True)
    return pd.DataFrame(columns=df.columns)

def find_trial_split_index(df):
    for i in range(len(df)):
        if is_row_empty(df.iloc[i]):
            if i+1 < len(df) and is_row_empty(df.iloc[i+1]):
                return i
            return i
    return None

def split_trials(df):
    df = trim_leading_empty_rows(df)
    split_idx = find_trial_split_index(df)
    if split_idx is None:
        return df.reset_index(drop=True), pd.DataFrame(columns=df.columns)
    trial1 = df.iloc[:split_idx]
    trial2 = df.iloc[split_idx+1:]
    trial1 = trim_trailing_empty_rows(trial1)
    trial2 = trim_leading_empty_rows(trial2)
    trial2 = trim_trailing_empty_rows(trial2)
    return trial1.reset_index(drop=True), trial2.reset_index(drop=True)

def resample_by_time(df, time_col, target_rows):
    df = df.copy()
    df[time_col] = pd.to_numeric(df[time_col], errors='coerce')
    df = df.dropna(subset=[time_col])
    df = df.drop_duplicates(subset=time_col)
    for col in df.columns:
        if col != time_col:
            df[col] = pd.to_numeric(df[col], errors='coerce')
    df = df.set_index(time_col)
    new_time_index = np.linspace(df.index.min(), df.index.max(), target_rows)
    df_resampled = df.reindex(new_time_index)
    df_resampled = df_resampled.interpolate(method='linear', axis=0).reset_index()
    df_resampled.rename(columns={'index': time_col}, inplace=True)
    return df_resampled

def safe_resample(df_list, time_col, target_rows):
    return [
        resample_by_time(df, time_col, target_rows)
        for df in df_list
        if time_col in df.columns and not df.empty
    ]

def cascade_gesture(trial1, trial2):
    trial1 = trial1[[col for col in trial1.columns if not col.endswith("time")]]
    trial2 = trial2[[col for col in trial2.columns if not col.endswith("time")]]
    vector = np.concatenate([trial1.to_numpy().T.flatten(), trial2.to_numpy().T.flatten()])
    return pd.DataFrame([vector])

# === MAIN PROCESSING ===
input_folder = r"C:\Users\Abhay\Downloads\ExportedSheets"
csv_files = [f for f in os.listdir(input_folder) if f.endswith('.csv')]

all_abhay_trial1, all_abhay_trial2 = [], []
all_arjun_trial1, all_arjun_trial2 = [], []

for file in csv_files:
    path = os.path.join(input_folder, file)
    try:
        df = pd.read_csv(path, header=None)
        df = df.iloc[1:].reset_index(drop=True)
        df = df.iloc[:, :39]  # 19 Abhay + 1 blank + 19 Arjun

        abhay_df = df.iloc[:, :19].copy()
        arjun_df = df.iloc[:, 20:].copy()

        abhay_df.columns = [f"Abhay_{col}" for col in expected_columns]
        arjun_df.columns = [f"Arjun_{col}" for col in expected_columns]

        abhay_df.dropna(axis=1, how='all', inplace=True)
        arjun_df.dropna(axis=1, how='all', inplace=True)

        abhay_t1, abhay_t2 = split_trials(abhay_df)
        arjun_t1, arjun_t2 = split_trials(arjun_df)

        all_abhay_trial1.append(abhay_t1)
        all_abhay_trial2.append(abhay_t2)
        all_arjun_trial1.append(arjun_t1)
        all_arjun_trial2.append(arjun_t2)

    except Exception as e:
        print(f"Error processing {file}: {e}")

# === Compute median trial length ===
all_lengths = [len(df) for df in all_abhay_trial1 + all_abhay_trial2 + all_arjun_trial1 + all_arjun_trial2]
median_len = int(np.median(all_lengths))
print(f"Using median trial length: {median_len}")

# === Resample all trials ===
all_abhay_trial1 = safe_resample(all_abhay_trial1, 'Abhay_time', median_len)
all_abhay_trial2 = safe_resample(all_abhay_trial2, 'Abhay_time', median_len)
all_arjun_trial1 = safe_resample(all_arjun_trial1, 'Arjun_time', median_len)
all_arjun_trial2 = safe_resample(all_arjun_trial2, 'Arjun_time', median_len)

# === Cascade trial1 + trial2 into gesture-level rows ===
cascaded_abhay = [cascade_gesture(t1, t2) for t1, t2 in zip(all_abhay_trial1, all_abhay_trial2)]
cascaded_arjun = [cascade_gesture(t1, t2) for t1, t2 in zip(all_arjun_trial1, all_arjun_trial2)]

full_abhay_df = pd.concat(cascaded_abhay, ignore_index=True)
full_arjun_df = pd.concat(cascaded_arjun, ignore_index=True)

print(f"✅ Abhay Cascaded Shape: {full_abhay_df.shape}")
print(f"✅ Arjun Cascaded Shape: {full_arjun_df.shape}")

# === Normalize using Min-Max Scaler ===
minmax_abhay = MinMaxScaler()
normalized_abhay = pd.DataFrame(minmax_abhay.fit_transform(full_abhay_df))

minmax_arjun = MinMaxScaler()
normalized_arjun = pd.DataFrame(minmax_arjun.fit_transform(full_arjun_df))

# === Split into Train/Test ===
train_abhay_df = normalized_abhay.iloc[:70].reset_index(drop=True)
test_abhay_df = normalized_abhay.iloc[70:].reset_index(drop=True)

train_arjun_df = normalized_arjun.iloc[:70].reset_index(drop=True)
test_arjun_df = normalized_arjun.iloc[70:].reset_index(drop=True)

# === PCA ===
pca_abhay = PCA(n_components=0.95)
train_abhay_pca = pd.DataFrame(pca_abhay.fit_transform(train_abhay_df))
test_abhay_pca = pd.DataFrame(pca_abhay.transform(test_abhay_df))

pca_arjun = PCA(n_components=0.95)
train_arjun_pca = pd.DataFrame(pca_arjun.fit_transform(train_arjun_df))
test_arjun_pca = pd.DataFrame(pca_arjun.transform(test_arjun_df))

# === Final Output Summary ===
print("🎯 PCA complete")
print(f"Abhay PCA shape: Train {train_abhay_pca.shape}, Test {test_abhay_pca.shape}")
print(f"Arjun PCA shape: Train {train_arjun_pca.shape}, Test {test_arjun_pca.shape}")


Using median trial length: 118
✅ Abhay Cascaded Shape: (100, 4248)
✅ Arjun Cascaded Shape: (100, 4248)
🎯 PCA complete
Abhay PCA shape: Train (70, 20), Test (30, 20)
Arjun PCA shape: Train (70, 21), Test (30, 21)
