In [3]:
import pandas as pd
import os
import numpy as np
from sklearn.preprocessing import StandardScaler

# === Constants ===
expected_columns = [
    'time', 'palm_position_x', 'palm_position_y', 'palm_position_z',
    'palm_normal_x', 'palm_normal_y', 'palm_normal_z',
    'palm_direction_x', 'palm_direction_y', 'palm_direction_z',
    'hand_grab_angle', 'hand_grab_strength', 'hand_pinch_angle', 'hand_pinch_strength',
    'thumb_extension', 'index_extension', 'middle_extension', 'ring_extension', 'pinky_extension'
]
TRAIN_ROWS = 70
TEST_ROWS = 30
input_folder = r"C:\Users\Abhay\Downloads\ExportedSheets"

# === Helpers ===
def is_row_empty(row):
    return all(pd.isna(cell) or (isinstance(cell, str) and cell.strip() == '') for cell in row)

def trim_leading_empty_rows(df):
    for i in range(len(df)):
        if not is_row_empty(df.iloc[i]):
            return df.iloc[i:].reset_index(drop=True)
    return pd.DataFrame(columns=df.columns)

def trim_trailing_empty_rows(df):
    for i in reversed(range(len(df))):
        if not is_row_empty(df.iloc[i]):
            return df.iloc[:i+1].reset_index(drop=True)
    return pd.DataFrame(columns=df.columns)

def find_trial_split_index(df):
    for i in range(len(df)):
        if is_row_empty(df.iloc[i]):
            if i+1 < len(df) and is_row_empty(df.iloc[i+1]):
                return i
            return i
    return None

def split_trials(df):
    df = trim_leading_empty_rows(df)
    split_idx = find_trial_split_index(df)
    if split_idx is None:
        return df.reset_index(drop=True), pd.DataFrame(columns=df.columns)

    trial1 = df.iloc[:split_idx]
    trial2 = df.iloc[split_idx+1:]

    trial1 = trim_trailing_empty_rows(trial1)
    trial2 = trim_leading_empty_rows(trial2)
    trial2 = trim_trailing_empty_rows(trial2)

    return trial1.reset_index(drop=True), trial2.reset_index(drop=True)

def resample_by_time(df, time_col, target_rows):
    df = df.copy()
    df[time_col] = pd.to_numeric(df[time_col], errors='coerce')
    df = df.dropna(subset=[time_col])
    df = df.drop_duplicates(subset=time_col)

    for col in df.columns:
        if col != time_col:
            df[col] = pd.to_numeric(df[col], errors='coerce')

    df = df.set_index(time_col)
    new_time_index = np.linspace(df.index.min(), df.index.max(), target_rows)
    df_resampled = df.reindex(new_time_index)
    df_resampled = df_resampled.interpolate(method='linear', axis=0).reset_index()
    df_resampled.rename(columns={'index': time_col}, inplace=True)
    return df_resampled

def safe_resample(df_list, time_col, target_rows):
    return [resample_by_time(df, time_col, target_rows)
            for df in df_list if time_col in df.columns and not df.empty]

def standardize_df(df, exclude_cols):
    scaler = StandardScaler()
    cols_to_scale = [col for col in df.columns if col not in exclude_cols]
    df[cols_to_scale] = scaler.fit_transform(df[cols_to_scale])
    return df

def cascade(df, person_name):
    columns_to_stack = [col for col in df.columns if col != f'{person_name}_time']
    return pd.Series(np.concatenate([df[col].values for col in columns_to_stack]))

def process_trials(trials, person_name):
    train = [cascade(df.iloc[:TRAIN_ROWS], person_name) for df in trials]
    test  = [cascade(df.iloc[TRAIN_ROWS:TRAIN_ROWS+TEST_ROWS], person_name) for df in trials]
    return pd.DataFrame(train), pd.DataFrame(test)

# === Read and process all CSVs ===
csv_files = [f for f in os.listdir(input_folder) if f.endswith('.csv') and 'Hand Data Summer 25 (5)' in f]
all_abhay_trial1, all_abhay_trial2 = [], []
all_arjun_trial1, all_arjun_trial2 = [], []

for file in csv_files:
    path = os.path.join(input_folder, file)
    try:
        df = pd.read_csv(path, header=None)
        df = df.iloc[1:].reset_index(drop=True)
        df = df.iloc[:, :39]  # 19 + 1 blank + 19

        abhay_df = df.iloc[:, :19].copy()
        arjun_df = df.iloc[:, 20:].copy()

        abhay_df.columns = [f"Abhay_{col}" for col in expected_columns]
        arjun_df.columns = [f"Arjun_{col}" for col in expected_columns]

        abhay_df.dropna(axis=1, how='all', inplace=True)
        arjun_df.dropna(axis=1, how='all', inplace=True)

        abhay_t1, abhay_t2 = split_trials(abhay_df)
        arjun_t1, arjun_t2 = split_trials(arjun_df)

        all_abhay_trial1.append(abhay_t1)
        all_abhay_trial2.append(abhay_t2)
        all_arjun_trial1.append(arjun_t1)
        all_arjun_trial2.append(arjun_t2)
    except Exception as e:
        print(f"Error processing {file}: {e}")

# === Get median length and resample ===
all_lengths = [len(df) for df in all_abhay_trial1 + all_abhay_trial2 + all_arjun_trial1 + all_arjun_trial2]
median_len = int(np.median(all_lengths))

all_abhay_trial1 = safe_resample(all_abhay_trial1, 'Abhay_time', median_len)
all_abhay_trial2 = safe_resample(all_abhay_trial2, 'Abhay_time', median_len)
all_arjun_trial1 = safe_resample(all_arjun_trial1, 'Arjun_time', median_len)
all_arjun_trial2 = safe_resample(all_arjun_trial2, 'Arjun_time', median_len)

all_abhay_trial1 = [standardize_df(df, ['Abhay_time']) for df in all_abhay_trial1]
all_abhay_trial2 = [standardize_df(df, ['Abhay_time']) for df in all_abhay_trial2]
all_arjun_trial1 = [standardize_df(df, ['Arjun_time']) for df in all_arjun_trial1]
all_arjun_trial2 = [standardize_df(df, ['Arjun_time']) for df in all_arjun_trial2]

# === Cascade into training/testing sets ===
abhay_train_df1, abhay_test_df1 = process_trials(all_abhay_trial1, 'Abhay')
abhay_train_df2, abhay_test_df2 = process_trials(all_abhay_trial2, 'Abhay')
arjun_train_df1, arjun_test_df1 = process_trials(all_arjun_trial1, 'Arjun')
arjun_train_df2, arjun_test_df2 = process_trials(all_arjun_trial2, 'Arjun')

# === Final merge (optional, or you can keep separate) ===
abhay_train_df = pd.concat([abhay_train_df1, abhay_train_df2], ignore_index=True)
abhay_test_df = pd.concat([abhay_test_df1, abhay_test_df2], ignore_index=True)
arjun_train_df = pd.concat([arjun_train_df1, arjun_train_df2], ignore_index=True)
arjun_test_df = pd.concat([arjun_test_df1, arjun_test_df2], ignore_index=True)

# === Optional Save ===
# abhay_train_df.to_csv("abhay_train_cascaded.csv", index=False)
# abhay_test_df.to_csv("abhay_test_cascaded.csv", index=False)
# arjun_train_df.to_csv("arjun_train_cascaded.csv", index=False)
# arjun_test_df.to_csv("arjun_test_cascaded.csv", index=False)


ValueError: cannot convert float NaN to integer

In [8]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import os
os.path.exists(r"C:\Users\Abhay\Downloads\Hand Data Summer 25 (5).xlsx")


# === Setup ===
input_file = r"C:\Users\Abhay\Downloads\Hand Data Summer 25 (5).xlsx"
sheet_name = 0  # Change to sheet name or index as needed (e.g., 'ASL - A' or 0)

expected_columns = [
    'time', 'palm_position_x', 'palm_position_y', 'palm_position_z',
    'palm_normal_x', 'palm_normal_y', 'palm_normal_z',
    'palm_direction_x', 'palm_direction_y', 'palm_direction_z',
    'hand_grab_angle', 'hand_grab_strength', 'hand_pinch_angle', 'hand_pinch_strength',
    'thumb_extension', 'index_extension', 'middle_extension', 'ring_extension', 'pinky_extension'
]

def is_row_empty(row):
    return all(pd.isna(cell) or (isinstance(cell, str) and cell.strip() == '') for cell in row)

def trim_leading_empty_rows(df):
    for i in range(len(df)):
        if not is_row_empty(df.iloc[i]):
            return df.iloc[i:].reset_index(drop=True)
    return pd.DataFrame(columns=df.columns)

def trim_trailing_empty_rows(df):
    for i in reversed(range(len(df))):
        if not is_row_empty(df.iloc[i]):
            return df.iloc[:i+1].reset_index(drop=True)
    return pd.DataFrame(columns=df.columns)

def find_trial_split_index(df):
    for i in range(len(df)):
        if is_row_empty(df.iloc[i]):
            return i
    return None

def split_trials(df):
    df = trim_leading_empty_rows(df)
    split_idx = find_trial_split_index(df)
    if split_idx is None:
        return df.reset_index(drop=True), pd.DataFrame(columns=df.columns)
    trial1 = df.iloc[:split_idx]
    trial2 = df.iloc[split_idx+1:]
    trial1 = trim_trailing_empty_rows(trial1)
    trial2 = trim_leading_empty_rows(trial2)
    trial2 = trim_trailing_empty_rows(trial2)
    return trial1.reset_index(drop=True), trial2.reset_index(drop=True)

def resample_by_time(df, time_col, target_rows):
    df = df.copy()
    df[time_col] = pd.to_numeric(df[time_col], errors='coerce')
    df = df.dropna(subset=[time_col])
    df = df.drop_duplicates(subset=time_col)
    for col in df.columns:
        if col != time_col:
            df[col] = pd.to_numeric(df[col], errors='coerce')
    df = df.set_index(time_col)
    new_time_index = np.linspace(df.index.min(), df.index.max(), target_rows)
    df_resampled = df.reindex(new_time_index)
    df_resampled = df_resampled.interpolate(method='linear', axis=0).reset_index()
    df_resampled.rename(columns={'index': time_col}, inplace=True)
    return df_resampled

def standardize_df(df, exclude_cols):
    scaler = StandardScaler()
    cols_to_scale = [col for col in df.columns if col not in exclude_cols]
    df[cols_to_scale] = scaler.fit_transform(df[cols_to_scale])
    return df

def cascade(df, person_name, start_row, end_row):
    cols = [col for col in df.columns if col != f'{person_name}_time']
    return pd.Series(np.concatenate([df.loc[start_row:end_row-1, col].values for col in cols]))

# === Load and prepare ===
df = pd.read_excel(input_file, sheet_name=sheet_name, header=None)
df = df.iloc[1:].reset_index(drop=True)
df = df.iloc[:, :39]  # 19 + 1 blank + 19

# Split into Abhay and Arjun
abhay_df = df.iloc[:, :19].copy()
arjun_df = df.iloc[:, 20:].copy()
abhay_df.columns = [f"Abhay_{col}" for col in expected_columns]
arjun_df.columns = [f"Arjun_{col}" for col in expected_columns]

abhay_df.dropna(axis=1, how='all', inplace=True)
arjun_df.dropna(axis=1, how='all', inplace=True)

# Split trials
abhay_t1, abhay_t2 = split_trials(abhay_df)
arjun_t1, arjun_t2 = split_trials(arjun_df)

print(f"\n=== Trial lengths from sheet: {sheet_name} ===")
print(f"Abhay Trial 1: {len(abhay_t1)} rows")
print(f"Abhay Trial 2: {len(abhay_t2)} rows")
print(f"Arjun Trial 1: {len(arjun_t1)} rows")
print(f"Arjun Trial 2: {len(arjun_t2)} rows")

# Median length
lengths = [len(d) for d in [abhay_t1, abhay_t2, arjun_t1, arjun_t2] if not d.empty]
if not lengths:
    raise ValueError("No usable trials found")
median_len = int(np.median(lengths))
print(f"Using median trial length: {median_len}")

# Resample
abhay_t1 = resample_by_time(abhay_t1, 'Abhay_time', median_len)
abhay_t2 = resample_by_time(abhay_t2, 'Abhay_time', median_len)
arjun_t1 = resample_by_time(arjun_t1, 'Arjun_time', median_len)
arjun_t2 = resample_by_time(arjun_t2, 'Arjun_time', median_len)

# Standardize
abhay_t1 = standardize_df(abhay_t1, ['Abhay_time'])
abhay_t2 = standardize_df(abhay_t2, ['Abhay_time'])
arjun_t1 = standardize_df(arjun_t1, ['Arjun_time'])
arjun_t2 = standardize_df(arjun_t2, ['Arjun_time'])

# Cascade
TRAIN_ROWS = 70
TEST_ROWS = 30

abhay_train_df = pd.DataFrame([
    cascade(abhay_t1, 'Abhay', 0, TRAIN_ROWS),
    cascade(abhay_t2, 'Abhay', 0, TRAIN_ROWS)
])
abhay_test_df = pd.DataFrame([
    cascade(abhay_t1, 'Abhay', TRAIN_ROWS, TRAIN_ROWS + TEST_ROWS),
    cascade(abhay_t2, 'Abhay', TRAIN_ROWS, TRAIN_ROWS + TEST_ROWS)
])
arjun_train_df = pd.DataFrame([
    cascade(arjun_t1, 'Arjun', 0, TRAIN_ROWS),
    cascade(arjun_t2, 'Arjun', 0, TRAIN_ROWS)
])
arjun_test_df = pd.DataFrame([
    cascade(arjun_t1, 'Arjun', TRAIN_ROWS, TRAIN_ROWS + TEST_ROWS),
    cascade(arjun_t2, 'Arjun', TRAIN_ROWS, TRAIN_ROWS + TEST_ROWS)
])

# === OPTIONAL: Save to file
# abhay_train_df.to_csv("abhay_train.csv", index=False)
# abhay_test_df.to_csv("abhay_test.csv", index=False)
# arjun_train_df.to_csv("arjun_train.csv", index=False)
# arjun_test_df.to_csv("arjun_test.csv", index=False)

print("✅ Cascading complete.")
print("\n=== Final Cascaded Dimensions ===")
print(f"Abhay Training Set: {abhay_train_df.shape}")
print(f"Abhay Testing Set : {abhay_test_df.shape}")
print(f"Arjun Training Set: {arjun_train_df.shape}")
print(f"Arjun Testing Set : {arjun_test_df.shape}")




=== Trial lengths from sheet: 0 ===
Abhay Trial 1: 139 rows
Abhay Trial 2: 130 rows
Arjun Trial 1: 129 rows
Arjun Trial 2: 143 rows
Using median trial length: 134
✅ Cascading complete.

=== Final Cascaded Dimensions ===
Abhay Training Set: (2, 1260)
Abhay Testing Set : (2, 540)
Arjun Training Set: (2, 1260)
Arjun Testing Set : (2, 540)


In [None]:
import pandas as pd
import os
import numpy as np
from sklearn.preprocessing import StandardScaler

# === Column Definition ===
expected_columns = [
    'time', 'palm_position_x', 'palm_position_y', 'palm_position_z',
    'palm_normal_x', 'palm_normal_y', 'palm_normal_z',
    'palm_direction_x', 'palm_direction_y', 'palm_direction_z',
    'hand_grab_angle', 'hand_grab_strength', 'hand_pinch_angle', 'hand_pinch_strength',
    'thumb_extension', 'index_extension', 'middle_extension', 'ring_extension', 'pinky_extension'
]

# === Utilities ===
def is_row_empty(row):
    return all(pd.isna(cell) or (isinstance(cell, str) and cell.strip() == '') for cell in row)

def trim_leading_empty_rows(df):
    for i in range(len(df)):
        if not is_row_empty(df.iloc[i]):
            return df.iloc[i:].reset_index(drop=True)
    return pd.DataFrame(columns=df.columns)

def trim_trailing_empty_rows(df):
    for i in reversed(range(len(df))):
        if not is_row_empty(df.iloc[i]):
            return df.iloc[:i+1].reset_index(drop=True)
    return pd.DataFrame(columns=df.columns)

def find_trial_split_index(df):
    for i in range(len(df)):
        if is_row_empty(df.iloc[i]):
            if i+1 < len(df) and is_row_empty(df.iloc[i+1]):
                return i
            return i
    return None

def split_trials(df):
    df = trim_leading_empty_rows(df)
    split_idx = find_trial_split_index(df)
    if split_idx is None:
        return df.reset_index(drop=True), pd.DataFrame(columns=df.columns)

    trial1 = df.iloc[:split_idx]
    trial2 = df.iloc[split_idx+1:]

    trial1 = trim_trailing_empty_rows(trial1)
    trial2 = trim_leading_empty_rows(trial2)
    trial2 = trim_trailing_empty_rows(trial2)

    return trial1.reset_index(drop=True), trial2.reset_index(drop=True)
def safe_resample(df_list, time_col, target_rows):
    return [
        resample_by_time(df, time_col, target_rows)
        for df in df_list
        if time_col in df.columns and not df.empty
    ]

def resample_by_time(df, time_col, target_rows):
    df = df.copy()

    # 1. Convert time column to numeric
    df[time_col] = pd.to_numeric(df[time_col], errors='coerce')
    df = df.dropna(subset=[time_col])
    df = df.drop_duplicates(subset=time_col)

    # 2. Convert all other columns to numeric
    for col in df.columns:
        if col != time_col:
            df[col] = pd.to_numeric(df[col], errors='coerce')

    # 3. Resample
    df = df.set_index(time_col)
    new_time_index = np.linspace(df.index.min(), df.index.max(), target_rows)
    df_resampled = df.reindex(new_time_index)
    df_resampled = df_resampled.interpolate(method='linear', axis=0).reset_index()
    df_resampled.rename(columns={'index': time_col}, inplace=True)

    return df_resampled



def standardize_df(df, exclude_cols):
    scaler = StandardScaler()
    cols_to_scale = [col for col in df.columns if col not in exclude_cols]
    df[cols_to_scale] = scaler.fit_transform(df[cols_to_scale])
    return df

# === MAIN ===
input_folder = r"C:\Users\Abhay\Downloads\ExportedSheets"
csv_files = [f for f in os.listdir(input_folder) if f.endswith('.csv')]

all_abhay_trial1, all_abhay_trial2 = [], []
all_arjun_trial1, all_arjun_trial2 = [], []

for file in csv_files:
    

    path = os.path.join(input_folder, file)
    try:
        df = pd.read_csv(path, header=None)
        df = df.iloc[1:].reset_index(drop=True)
        df = df.iloc[:, :39]  # 19 Abhay + 1 blank + 19 Arjun

        abhay_df = df.iloc[:, :19].copy()
        arjun_df = df.iloc[:, 20:].copy()

        abhay_df.columns = [f"Abhay_{col}" for col in expected_columns]
        arjun_df.columns = [f"Arjun_{col}" for col in expected_columns]

        abhay_df.dropna(axis=1, how='all', inplace=True)
        arjun_df.dropna(axis=1, how='all', inplace=True)

        abhay_t1, abhay_t2 = split_trials(abhay_df)
        arjun_t1, arjun_t2 = split_trials(arjun_df)


        all_abhay_trial1.append(abhay_t1)
        all_abhay_trial2.append(abhay_t2)
        all_arjun_trial1.append(arjun_t1)
        all_arjun_trial2.append(arjun_t2)

    except Exception as e:
        print(f"Error processing {file}: {e}")

# === Compute median length ===
all_lengths = [len(df) for df in all_abhay_trial1 + all_abhay_trial2 + all_arjun_trial1 + all_arjun_trial2]
median_len = int(np.median(all_lengths))
print(f"Using median trial length: {median_len}")
for filename, a1, a2, r1, r2 in zip(csv_files, all_abhay_trial1, all_abhay_trial2, all_arjun_trial1, all_arjun_trial2):
    print(f"{filename}:")
    print(f"  Abhay Trial 1: {len(a1)} rows")
    print(f"  Abhay Trial 2: {len(a2)} rows")
    print(f"  Arjun Trial 1: {len(r1)} rows")
    print(f"  Arjun Trial 2: {len(r2)} rows")

# === Resample ===
all_abhay_trial1 = safe_resample(all_abhay_trial1, 'Abhay_time', median_len)
all_abhay_trial2 = safe_resample(all_abhay_trial2, 'Abhay_time', median_len)
all_arjun_trial1 = safe_resample(all_arjun_trial1, 'Arjun_time', median_len)
all_arjun_trial2 = safe_resample(all_arjun_trial2, 'Arjun_time', median_len)
print("\n=== Trial Lengths Per File ===")


# === Standardize ===
all_abhay_trial1 = [standardize_df(df, ['Abhay_time']) for df in all_abhay_trial1]
all_abhay_trial2 = [standardize_df(df, ['Abhay_time']) for df in all_abhay_trial2]
all_arjun_trial1 = [standardize_df(df, ['Arjun_time']) for df in all_arjun_trial1]
all_arjun_trial2 = [standardize_df(df, ['Arjun_time']) for df in all_arjun_trial2]


def cascade_stack(df, rows):
    """Stacks columns of the first `rows` rows of df vertically into one long row."""
    return pd.DataFrame([df.iloc[:rows].to_numpy().T.flatten()])

# === Cascading ===
train_abhay1_df, test_abhay1_df = [], []
train_abhay2_df, test_abhay2_df = [], []
train_arjun1_df, test_arjun1_df = [], []
train_arjun2_df, test_arjun2_df = [], []

for a1, a2, r1, r2 in zip(all_abhay_trial1, all_abhay_trial2, all_arjun_trial1, all_arjun_trial2):
    # Abhay Trials
    train_abhay1_df.append(cascade_stack(a1, 70))
    test_abhay1_df.append(cascade_stack(a1, 30))
    train_abhay2_df.append(cascade_stack(a2, 70))
    test_abhay2_df.append(cascade_stack(a2, 30))
    
    # Arjun Trials
    train_arjun1_df.append(cascade_stack(r1, 70))
    test_arjun1_df.append(cascade_stack(r1, 30))
    train_arjun2_df.append(cascade_stack(r2, 70))
    test_arjun2_df.append(cascade_stack(r2, 30))

# === Combine to single DataFrames ===
train_abhay1_df = pd.concat(train_abhay1_df, ignore_index=True)
train_abhay2_df = pd.concat(train_abhay2_df, ignore_index=True)
train_arjun1_df = pd.concat(train_arjun1_df, ignore_index=True)
train_arjun2_df = pd.concat(train_arjun2_df, ignore_index=True)

test_abhay1_df = pd.concat(test_abhay1_df, ignore_index=True)
test_abhay2_df = pd.concat(test_abhay2_df, ignore_index=True)
test_arjun1_df = pd.concat(test_arjun1_df, ignore_index=True)
test_arjun2_df = pd.concat(test_arjun2_df, ignore_index=True)

print("✅ Cascading complete")
print(f"Abhay Train 1: {train_abhay1_df.shape}, Test 1: {test_abhay1_df.shape}")
print(f"Arjun Train 1: {train_arjun1_df.shape}, Test 1: {test_arjun1_df.shape}")


Using median trial length: 118
Air Quotes.csv:
  Abhay Trial 1: 63 rows
  Abhay Trial 2: 96 rows
  Arjun Trial 1: 112 rows
  Arjun Trial 2: 112 rows
ASL - 1.csv:
  Abhay Trial 1: 132 rows
  Abhay Trial 2: 120 rows
  Arjun Trial 1: 138 rows
  Arjun Trial 2: 142 rows
ASL - 10.csv:
  Abhay Trial 1: 139 rows
  Abhay Trial 2: 126 rows
  Arjun Trial 1: 135 rows
  Arjun Trial 2: 106 rows
ASL - 2.csv:
  Abhay Trial 1: 134 rows
  Abhay Trial 2: 141 rows
  Arjun Trial 1: 145 rows
  Arjun Trial 2: 136 rows
ASL - 3.csv:
  Abhay Trial 1: 102 rows
  Abhay Trial 2: 128 rows
  Arjun Trial 1: 74 rows
  Arjun Trial 2: 106 rows
ASL - 4.csv:
  Abhay Trial 1: 124 rows
  Abhay Trial 2: 105 rows
  Arjun Trial 1: 118 rows
  Arjun Trial 2: 105 rows
ASL - 5.csv:
  Abhay Trial 1: 107 rows
  Abhay Trial 2: 106 rows
  Arjun Trial 1: 102 rows
  Arjun Trial 2: 98 rows
ASL - 6.csv:
  Abhay Trial 1: 105 rows
  Abhay Trial 2: 105 rows
  Arjun Trial 1: 88 rows
  Arjun Trial 2: 107 rows
ASL - 7.csv:
  Abhay Trial 1: 118 