In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedGroupKFold, cross_val_score
from sklearn.metrics import accuracy_score, balanced_accuracy_score, classification_report, f1_score
import lightgbm as lgb
import xgboost as xgb
import catboost as cat
import time, os, sys, warnings
import wandb
from datetime import datetime
import pickle

from src.config import PROJECT_PATH, DATA_PATH, USE_WANDB, WANDB_PROJECT, WANDB_ENTITY
from src.tracking import ExperimentTracker
from src.metrics import average_f1_score
from src.training import train_and_evaluate_model

In [3]:
def display_all(df):
    """Render `df` with the pandas row/column display limits temporarily raised.

    The limits (1000 rows, 10000 columns) apply only for the duration of the
    call; `pd.option_context` restores the previous settings on exit.
    `display` is the rich-output helper available in the notebook namespace.
    """
    widened_limits = ("display.max_rows", 1000, "display.max_columns", 10000)
    with pd.option_context(*widened_limits):
        display(df)

In [4]:
# Initialize the experiment tracker.
# All four arguments come from src.config, so tracking behaviour is controlled
# there rather than in this notebook.
# NOTE(review): USE_WANDB presumably toggles Weights & Biases logging and the
# project/entity select the W&B destination — confirm in src.tracking.
tracker = ExperimentTracker(
    project_path=PROJECT_PATH,
    use_wandb=USE_WANDB,
    wandb_project_name=WANDB_PROJECT,
    wandb_entity=WANDB_ENTITY
)

In [5]:
# Load training data, gesture mappings and inverse gesture mappings.
# train.csv is resolved against DATA_PATH (from src.config).
df_train = pd.read_csv(os.path.join(DATA_PATH, 'train.csv'))

# The mappings pickle is opened with a path relative to the current working
# directory (not DATA_PATH), so this cell only works when the notebook is run
# from the directory that contains `processed_data/`.
# The pickle holds three objects, unpacked in this order:
#   gesture_to_seq_type_map, gesture_map (name -> int), inv_gesture_map (int -> name).
# NOTE: pickle.load can execute arbitrary code; only load files this project produced.
with open('processed_data/gesture_mappings.pkl', 'rb') as f:
    gesture_to_seq_type_map, gesture_map, inv_gesture_map = pickle.load(f)

In [6]:
# Peek at the first two rows to confirm the load and inspect the column layout.
df_train.head(2)

Unnamed: 0,row_id,sequence_type,sequence_id,sequence_counter,subject,orientation,behavior,phase,gesture,acc_x,...,tof_5_v54,tof_5_v55,tof_5_v56,tof_5_v57,tof_5_v58,tof_5_v59,tof_5_v60,tof_5_v61,tof_5_v62,tof_5_v63
0,SEQ_000007_000000,Target,SEQ_000007,0,SUBJ_059520,Seated Lean Non Dom - FACE DOWN,Relaxes and moves hand to target location,Transition,Cheek - pinch skin,6.683594,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
1,SEQ_000007_000001,Target,SEQ_000007,1,SUBJ_059520,Seated Lean Non Dom - FACE DOWN,Relaxes and moves hand to target location,Transition,Cheek - pinch skin,6.949219,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0


In [7]:
# Print the three label lookups unpacked from the mappings pickle so the
# class <-> integer encoding used downstream is visible in the notebook.
print(f"Gesture Map: {gesture_map}")
print(f"\nInverted Gesture Map: {inv_gesture_map}")
print(f"\nGesture To Sequence Type: {gesture_to_seq_type_map}")

Gesture Map: {'Cheek - pinch skin': 0, 'Forehead - pull hairline': 1, 'Write name on leg': 2, 'Feel around in tray and pull out an object': 3, 'Neck - scratch': 4, 'Neck - pinch skin': 5, 'Eyelash - pull hair': 6, 'Eyebrow - pull hair': 7, 'Forehead - scratch': 8, 'Above ear - pull hair': 9, 'Wave hello': 10, 'Write name in air': 11, 'Text on phone': 12, 'Pull air toward your face': 13, 'Pinch knee/leg skin': 14, 'Scratch knee/leg skin': 15, 'Drink from bottle/cup': 16, 'Glasses on/off': 17}

Inverted Gesture Map: {0: 'Cheek - pinch skin', 1: 'Forehead - pull hairline', 2: 'Write name on leg', 3: 'Feel around in tray and pull out an object', 4: 'Neck - scratch', 5: 'Neck - pinch skin', 6: 'Eyelash - pull hair', 7: 'Eyebrow - pull hair', 8: 'Forehead - scratch', 9: 'Above ear - pull hair', 10: 'Wave hello', 11: 'Write name in air', 12: 'Text on phone', 13: 'Pull air toward your face', 14: 'Pinch knee/leg skin', 15: 'Scratch knee/leg skin', 16: 'Drink from bottle/cup', 17: 'Glasses on/of

In [8]:
# NOTE(review): N_SPLITS is presumably the fold count for the cross-validation
# splitter and SEED the shared random seed — confirm at the training call.
N_SPLITS = 5
SEED = 42

# Per-model hyperparameters, keyed by the model name used elsewhere in the project.
MODEL_PARAMS = {
    'catboost': {
        # CatBoost treats 'objective' as an alias of 'loss_function' and rejects
        # a configuration that supplies both, so only 'loss_function' is set here.
        'loss_function': 'MultiClass',
        'eval_metric': 'MultiClass',
        'iterations': 1000,
        'learning_rate': 0.05,
        'depth': 6,
        'l2_leaf_reg': 3.0,
        'random_seed': SEED,
        'verbose': False,
        'allow_writing_files': False,
        'task_type': 'GPU',  # requires a CUDA-capable device at training time
    },
    'light_gbm': {
        'objective': 'multiclass',
        # Hard-coded class count; must stay equal to the number of entries in
        # the gesture map (18 in the mapping printed above).
        'num_class': 18,
        'metric': 'multi_logloss',
        'n_estimators': 1000,
        'learning_rate': 0.05,
        'num_leaves': 31,
        'lambda_l1': 0.1,
        'lambda_l2': 0.1,
        'bagging_fraction': 0.8,
        'feature_fraction': 0.8,
        'bagging_freq': 1,
        'random_state': SEED,
        'n_jobs': -1,
        'verbose': -1,
    }
}

## Wave 2 Features and Training

In [None]:
def create_wave2_features(df: pd.DataFrame, gesture_mapping=None) -> pd.DataFrame:
    """
    Build the Wave 2 feature table: one row per sequence, per-phase statistics as columns.

    Steps:
      1. Add IMU features: acceleration magnitude, quaternion magnitude and
         jerk (first difference of acceleration magnitude within a sequence,
         ordered by 'sequence_counter'; the first sample of a sequence gets 0).
      2. Add pairwise thermopile differences between thm_1..thm_4.
      3. Replace the -1.0 sentinel in every 'tof_*' column with NaN so it is
         ignored by the aggregations.
      4. Aggregate every numeric feature column per (sequence_id, phase) with
         mean/std/min/max/median/skew and pivot the phase into the column name,
         giving columns named '<feature>_<stat>_<phase>'.
      5. Attach 'subject' and 'gesture' by sequence_id and add 'gesture_encoded'.

    Args:
        df: Row-per-sample sensor data. Must contain sequence_id, phase,
            subject, gesture, sequence_counter, acc_x/y/z, rot_w/x/y/z and
            thm_1..thm_4. Any columns whose name starts with 'tof_' are
            treated as time-of-flight readings. The input is not modified.
        gesture_mapping: Optional mapping from gesture name to integer label.
            When omitted, the notebook-level `gesture_map` is used.

    Returns:
        DataFrame with 'sequence_id', the aggregated feature columns,
        'subject', 'gesture' and 'gesture_encoded'. Sequences that have no
        row with a non-null phase produce no features and are omitted.

    Raises:
        ValueError: if any required column is missing from `df`.
    """
    required_cols = ['sequence_id', 'phase', 'subject', 'gesture',
                     'acc_x', 'acc_y', 'acc_z',
                     'rot_w', 'rot_x', 'rot_y', 'rot_z',
                     'thm_1', 'thm_2', 'thm_3', 'thm_4',  # thm_5 often has issues
                     'sequence_counter']

    # ToF columns are optional and are detected by prefix only: the real columns
    # are named like 'tof_5_v54', so no single fixed name can be required.
    tof_columns = [col for col in df.columns if col.startswith('tof_')]
    if not tof_columns:
        print("  Warning: No ToF columns (starting with 'tof_') found in DataFrame. Continuing without them.")

    missing_cols = [col for col in required_cols if col not in df.columns]
    if missing_cols:
        raise ValueError(f"Missing required columns for Wave 2: {missing_cols}")

    # Work on a fresh frame with a unique positional index so the index-aligned
    # jerk assignment below is well defined even if the caller's index has duplicates.
    df_feat = df.reset_index(drop=True)

    # 1. Add advanced IMU features
    print("Calculating advanced IMU features...")

    # Vector magnitudes
    df_feat['acc_mag'] = np.sqrt(df_feat['acc_x']**2 + df_feat['acc_y']**2 + df_feat['acc_z']**2)
    # Quaternion magnitude: sqrt(w^2 + x^2 + y^2 + z^2)
    df_feat['rot_mag'] = np.sqrt(df_feat['rot_w']**2 + df_feat['rot_x']**2
                                 + df_feat['rot_y']**2 + df_feat['rot_z']**2)

    # Jerk: diff of acc_mag in sequence_counter order within each sequence.
    # diff() keeps the original row labels, so assigning the result back aligns
    # each value with its source row without a merge (a merge on
    # (sequence_id, sequence_counter) would multiply rows if a key repeated).
    ordered = df_feat.sort_values(by=['sequence_id', 'sequence_counter'])
    df_feat['jerk'] = ordered.groupby('sequence_id')['acc_mag'].diff().fillna(0)

    # 2. Add Thermopile Gradient Features
    print(" Calculating Thermopile gradient features...")
    # Adjacent pairs first, then the wider combinations. thm_5 is skipped
    # due to its high null rate - revisit if necessary.
    for upper, lower in [(2, 1), (3, 2), (4, 3), (3, 1), (4, 2), (4, 1)]:
        df_feat[f'thm_grad_{upper}_{lower}'] = df_feat[f'thm_{upper}'] - df_feat[f'thm_{lower}']

    # --- 3. Handle -1.0 in ToF columns (if they exist) ---
    if tof_columns:
        print(f"  Found {len(tof_columns)} ToF columns. Handling -1.0 values...")
        df_feat[tof_columns] = df_feat[tof_columns].replace(-1.0, np.nan)
    else:
        print("  No ToF columns found to process.")

    # --- 4. Perform Phase-Specific Aggregation (Wave 1 logic) ---
    print("  Performing phase-specific aggregation (Wave 1 style)...")

    # Only numeric columns can be aggregated; string metadata such as row_id,
    # orientation or behavior must not reach mean/std/skew.
    non_feature_cols = {'sequence_id', 'phase', 'subject', 'gesture', 'sequence_counter'}
    feature_cols_to_agg = [col for col in df_feat.select_dtypes(include='number').columns
                           if col not in non_feature_cols]
    stat_names = ['mean', 'std', 'min', 'max', 'median', 'skew']

    phase_agg_df = df_feat.groupby(['sequence_id', 'phase'])[feature_cols_to_agg].agg(stat_names)

    # --- Flatten Column Names: (feature, stat) -> 'feature_stat' ---
    phase_agg_df.columns = ['_'.join(col).strip() for col in phase_agg_df.columns]

    # --- Unstack Phase Dimension ---
    print("  Unstacking phase dimension...")
    phase_agg_df_unstacked = phase_agg_df.unstack(level='phase')

    # --- Flatten Multiindex Columns: ('feature_stat', phase) -> 'feature_stat_phase' ---
    phase_agg_df_unstacked.columns = [f"{sensor_stat}_{phase}"
                                      for sensor_stat, phase in phase_agg_df_unstacked.columns]
    phase_agg_df_unstacked = phase_agg_df_unstacked.reset_index()

    # --- 5. Merge with Metadata ---
    print(" Mering with metadata...")
    meta_df = df_feat.groupby('sequence_id', as_index=False)[['subject', 'gesture']].first()
    # Join on the key rather than by position so labels can never be shifted
    # onto the wrong sequence when the two frames differ in length.
    final_df = phase_agg_df_unstacked.merge(meta_df, on='sequence_id', how='left',
                                            validate='one_to_one')

    # --- 6. Encode Target ---
    print("  Encoding target variable...")
    if gesture_mapping is None:
        # Fall back to the mapping loaded at notebook level.
        gesture_mapping = gesture_map
    final_df['gesture_encoded'] = final_df['gesture'].map(gesture_mapping)

    print(f"Feature engineering complete. Shape of features: {final_df.shape}")
    return final_df
