# 03 - Iteration and Switch to Offical Metric

In [1]:
#### TEMPORARY
import sys
sys.path.append('/home/bac/code/kaggle/kaggle-cmi-detect-behavior/')

In [18]:
#! conda install -c conda-forge umap-learn -y

## Setup and Load Data

In [2]:
import pandas as pd
import numpy as np
import os
import catboost as cat
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.metrics import f1_score
from sklearn.decomposition import PCA
from umap.umap_ import UMAP
import warnings

warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)

# --- Pathing and Experiment Tracking Setup ---
from src.tracking import ExperimentTracker
from src.config import DATA_PATH, PROJECT_PATH, USE_WANDB, WANDB_PROJECT, WANDB_ENTITY

tracker = ExperimentTracker(
    project_path=os.path.expanduser(PROJECT_PATH),
    use_wandb=USE_WANDB,
    wandb_project_name=WANDB_PROJECT,
    wandb_entity=WANDB_ENTITY
)

In [3]:
# Load the raw data
train_sensor = pd.read_csv(os.path.join(os.path.expanduser(DATA_PATH), 'train.csv'))
train_demos = pd.read_csv(os.path.join(os.path.expanduser(DATA_PATH), 'train_demographics.csv'))

# Merge with demographics
train_df = pd.merge(train_sensor, train_demos, on='subject', how='left')
print("Data loaded and merged.")

# --- Create Helper Mappings for Evaluation Metric ---
# Important for the custom F1 score function
metadata = train_df[['gesture', 'sequence_type']].drop_duplicates()

# Map gesture string to sequence type (Target vs. Non-Target)
gesture_to_seq_type_map = metadata.set_index('gesture')['sequence_type'].to_dict()

# Map gesture string to integer code and back
gesture_map = {label: i for i, label in enumerate(metadata['gesture'].unique())}
inv_gesture_map = {i: label for label, i in gesture_map.items()}

Data loaded and merged.


Create evaluation function...

In [4]:
def average_f1_score(y_true_encoded, y_pred_proba):
    """
    Calculates the official competition F1 score.
    
    Args:
        y_true_encoded: True labels, integer encoded.
        y_pred_proba: Predicted probabilities from the model.
    """
    # Get predicted labels by finding the class with the highest probability
    y_pred_encoded = np.argmax(y_pred_proba, axis=1)
    
    # Map integer-encoded labels back to the string representations
    y_true_str = pd.Series(y_true_encoded).map(inv_gesture_map)
    y_pred_str = pd.Series(y_pred_encoded).map(inv_gesture_map)
    
    # Binary F1
    y_true_binary = y_true_str.map(gesture_to_seq_type_map)
    y_pred_binary = y_true_str.map(gesture_to_seq_type_map)
    binary_f1 = f1_score(y_true_binary, y_pred_binary, pos_label='Target', average='binary')
    
    # Macro F1 (collaped non-target class)
    def collapse_non_target(gesture):
        return 'non_target' if gesture_to_seq_type_map[gesture] == 'Non-Target' else gesture
    
    y_true_collapsed = y_true_str.apply(collapse_non_target)
    y_pred_collapsed = y_pred_str.apply(collapse_non_target)
    macro_f1 = f1_score(y_true_collapsed, y_pred_collapsed, average='macro')
    
    # Final score = average of the two components
    return (binary_f1 + macro_f1) / 2

## Recreate Wave 2 and Train Using Avg. F1-Score

In [4]:
def create_wave2_features(df):
    """
    Creates Wave 2 features: Adds advanced IMU (magnitude, jerk) and
    Thermopile (gradients) features before performing phase-specific aggregation.
    """
    print("Starting Wave 2 Feature Engineering...")
    df_feat = df.copy()
    df_feat['acc_mag'] = np.sqrt(df_feat['acc_x']**2 + df_feat['acc_y']**2 + df_feat['acc_z']**2)
    df_feat['rot_mag'] = np.sqrt(df_feat['rot_w']**2 + df_feat['rot_x']**2 + df_feat['rot_y']**2 + df_feat['rot_z']**2)
    df_feat['jerk'] = df_feat.groupby('sequence_id')['acc_mag'].diff().fillna(0)
    for i in range(1, 5):
        df_feat[f'thm_grad_{i}_{i+1}'] = df_feat[f'thm_{i}'] - df_feat[f'thm_{i+1}']
    
    imu_derived_cols = ['acc_mag', 'rot_mag', 'jerk']
    thm_grad_cols = [f'thm_grad_{i}_{i+1}' for i in range(1, 5)]
    original_sensor_cols = [c for c in df.columns if 'acc_' in c or 'rot_' in c or 'thm_' in c]
    tof_cols = [f'tof_{s}_v{p}' for s in range(1, 6) for p in range(64)]

    df_feat[tof_cols] = df_feat[tof_cols].replace(-1, np.nan)
    df_feat['tof_mean_all_pixels'] = df_feat[tof_cols].mean(axis=1)

    aggs = {}
    for col in original_sensor_cols + imu_derived_cols + thm_grad_cols:
        aggs[col] = ['mean', 'std', 'min', 'max', 'skew']
    aggs['tof_mean_all_pixels'] = ['mean', 'std', 'min', 'max']

    phase_agg_df = df_feat.groupby(['sequence_id', 'phase']).agg(aggs)
    phase_agg_df.columns = ['_'.join(col).strip() for col in phase_agg_df.columns.values]
    phase_agg_df_unstacked = phase_agg_df.unstack(level='phase')
    phase_agg_df_unstacked.columns = ['_'.join(col).strip() for col in phase_agg_df_unstacked.columns.values]
    
    meta_df = df.groupby('sequence_id').first()
    final_df = pd.concat([meta_df[['subject', 'gesture']], phase_agg_df_unstacked], axis=1).reset_index()
    
    # Use the globally defined gesture map for consistent encoding
    final_df['gesture_encoded'] = final_df['gesture'].map(gesture_map)
    
    print(f"Feature engineering complete. Shape of features: {final_df.shape}")
    return final_df



features_df = create_wave2_features(train_df)

Starting Wave 2 Feature Engineering...
Feature engineering complete. Shape of features: (8151, 202)


In [8]:
FEATURE_WAVE = "Wave 2"
MODEL_NAME = "CatBoost"
EXPERIMENT_NAME = f"{FEATURE_WAVE}-{MODEL_NAME}-Official-Metric-Baseline"
N_SPLITS = 5
SEED = 42

# --- Model Parameters ---
params = {
    'iterations': 1000, 'learning_rate': 0.05, 'depth': 6,
    'loss_function': 'MultiClass', 'eval_metric': 'MultiClass',
    'random_seed': SEED, 'verbose': 0
}

# --- Prepare data for CV ---
X = features_df.drop(columns=['sequence_id', 'subject', 'gesture', 'gesture_encoded'])
y = features_df['gesture_encoded']
groups = features_df['subject']

In [10]:
fold_scores = []
cv = StratifiedGroupKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)
for fold, (train_idx, val_idx) in enumerate(cv.split(X, y, groups)):
    print(f"--- Fold {fold+1}/{N_SPLITS} ---")
    
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]
    
    model = cat.CatBoostClassifier(**params)
    model.fit(X_train, y_train, eval_set=[(X_val, y_val)], early_stopping_rounds=100)
    
    # Predict on validation set
    val_preds_proba = model.predict_proba(X_val)
    
    # Evaluate fold performance using the official metric
    fold_score = average_f1_score(y_val, val_preds_proba)
    fold_scores.append(fold_score)
    print(f"Fold {fold+1} Competition F1 Score: {fold_score:.5f}")

# --- Final Score and Logging ---
mean_cv_score = np.mean(fold_scores)
print(f"\n--- CV Summary for {MODEL_NAME} ---")
print(f"Mean Competition F1 Score: {mean_cv_score:.5f}")
print(f"Std Dev: {np.std(fold_scores):.5f}\n")

tracker.log_experiment(
    experiment_name=EXPERIMENT_NAME, model_name=MODEL_NAME, feature_wave=FEATURE_WAVE,
    cv_score=mean_cv_score, params=params,
    notes="Re-evaluating Wave 2 baseline using the official competition F1 metric."
)

--- Fold 1/5 ---
Fold 1 Competition F1 Score: 0.83614
--- Fold 2/5 ---
Fold 2 Competition F1 Score: 0.78580
--- Fold 3/5 ---
Fold 3 Competition F1 Score: 0.79748
--- Fold 4/5 ---
Fold 4 Competition F1 Score: 0.78122
--- Fold 5/5 ---
Fold 5 Competition F1 Score: 0.78969

--- CV Summary for CatBoost ---
Mean Competition F1 Score: 0.79807
Std Dev: 0.01977

Experiment 'Wave 2-CatBoost-Official-Metric-Baseline' logged to /home/bac/code/kaggle/kaggle-cmi-detect-behavior/experiment_log.csv


[34m[1mwandb[0m: Currently logged in as: [33mb-a-chaudhry[0m ([33mb-a-chaudhry-[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


0,1
cv_score,▁

0,1
cv_score,0.79807
feature_wave,Wave 2
model_name,CatBoost


Experiment 'Wave 2-CatBoost-Official-Metric-Baseline' logged to W&B


## Feature Engineering - Wave 3 - Advanced ToF + Dimensionality Reduction

`NOTE` In order to determine the imapact of dimensionality reduction techniques on model performance, we will treaat PCA and UMAP training runs as two separate experiments to evaluate their impact.

### **PCA + Wave 3**

In [9]:
def create_wave3_pca_features(df):
    """
    Creates Wave-3 features : Adds PCA components from ToF sensors to the Wave 2
    feature set.
    """
    df_feat = df.copy()
    
    # Base feature creation from wave 2
    df_feat['acc_mag'] = np.sqrt(df_feat['acc_x']**2 + df_feat['acc_y']**2 + df_feat['acc_z']**2)
    df_feat['rot_mag'] = np.sqrt(df_feat['rot_w']**2 + df_feat['rot_x']**2 + df_feat['rot_y']**2 + df_feat['rot_z']**2)
    df_feat['jerk'] = df_feat.groupby('sequence_id')['acc_mag'].diff().fillna(0)
    for i in range(1, 5):
        df_feat[f'thm_grad_{i}_{i+1}'] = df_feat[f'thm_{i}'] - df_feat[f'thm_{i+1}']
    
    # Advanced ToF features
    tof_cols = [f'tof_{s}_v{p}' for s in range(1, 6) for p in range(64)]
    
    # Replace -1 with NaN for 'invalid' stats
    tof_data = df_feat[tof_cols].replace(-1, np.nan)
    
    # Percentage of invalid readings per row (feature)
    df_feat['tof_invalid_pct'] = tof_data.isna().mean(axis=1)
    
    # PCA components (10)
    pca = PCA(n_components=10, random_state=SEED)
    # Fill remaining NaNs with 0 before PCA
    tof_pca_features = pca.fit_transform(tof_data.fillna(0))
    
    # Add PCA features to the DF
    for i in range(tof_pca_features.shape[1]):
        df_feat[f'tof_pca_{i}'] = tof_pca_features[:, i]
        
    print(f"Created {tof_pca_features.shape[1]} ToF PCA components.")
    
    # Phase specific aggregations
    base_cols_to_agg = [col for col in df.columns if 'acc_' in col or 'rot_' in col or 'thm_' in col]
    derived_cols_to_add = ['acc_mag', 'rot_mag', 'jerk'] + [f'thm_grad_{i}_{i+1}' for i in range(1, 5)]
    tof_derived_cols_to_agg = ['tof_invalid_pct'] + [f'tof_pca_{i}' for i in range(10)]
    
    aggs = {}
    for col in base_cols_to_agg + derived_cols_to_add + tof_derived_cols_to_agg:
        aggs[col] = ['mean', 'std', 'min', 'max']
    
    phase_agg_df = df_feat.groupby(['sequence_id', 'phase']).agg(aggs)
    phase_agg_df.columns = ['_'.join(col).strip() for col in phase_agg_df.columns.values]
    phase_agg_df_unstacked = phase_agg_df.unstack(level='phase')
    phase_agg_df_unstacked.columns = ['_'.join(col).strip() for col in phase_agg_df_unstacked.columns.values]
    
    meta_df = df.groupby('sequence_id').first()
    final_df = pd.concat([meta_df[['subject', 'gesture']], phase_agg_df_unstacked], axis=1).reset_index()
    final_df['gesture_encoded'] = final_df['gesture'].map(gesture_map)
    
    print(f"Feature engineering complete. Shape of features: {final_df.shape}")
    return final_df
    

In [11]:
SEED = 42
features_df  = create_wave3_pca_features(train_df)

Created 10 ToF PCA components.
Feature engineering complete. Shape of features: (8151, 244)


In [12]:
# Setup training run and prep data
FEATURE_WAVE = "Wave 3a (PCA)"
MODEL_NAME = "Catboost"
EXPERIMENT_NAME = f"{FEATURE_WAVE}-{MODEL_NAME}-CPU"
N_SPLITS = 5

# --- Model Parameters ---
params = {
    'iterations': 1000, 'learning_rate': 0.05, 'depth': 6,
    'loss_function': 'MultiClass', 'eval_metric': 'MultiClass',
    'random_seed': SEED, 'verbose': 0
}

# --- Prepare data for CV ---
X = features_df.drop(columns=['sequence_id', 'subject', 'gesture', 'gesture_encoded'])
y = features_df['gesture_encoded']
groups = features_df['subject']

In [14]:
# Train and evaluate.
fold_scores = []
cv = StratifiedGroupKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)
for fold, (train_idx, val_idx) in enumerate(cv.split(X, y, groups)):
    print(f"--- Fold {fold+1}/{N_SPLITS} ---")
    
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]
    
    model = cat.CatBoostClassifier(**params)
    model.fit(X_train, y_train, eval_set=[(X_val, y_val)], early_stopping_rounds=100)
    
    # Predict on validation set
    val_preds_proba = model.predict_proba(X_val)
    
    # Evaluate fold performance using the official metric
    fold_score = average_f1_score(y_val, val_preds_proba)
    fold_scores.append(fold_score)
    print(f"Fold {fold+1} Competition F1 Score: {fold_score:.5f}")

# --- Final Score and Logging ---
mean_cv_score = np.mean(fold_scores)
print(f"\n--- CV Summary for {MODEL_NAME} ---")
print(f"Mean Competition F1 Score: {mean_cv_score:.5f}")
print(f"Std Dev: {np.std(fold_scores):.5f}\n")

tracker.log_experiment(
    experiment_name=EXPERIMENT_NAME, model_name=MODEL_NAME, feature_wave=FEATURE_WAVE,
    cv_score=mean_cv_score, params=params,
    notes="Evaluating Wave-3 baseline (Wave2 + ToF dimensionality) + competition metric + PCA (10 components) "
)

--- Fold 1/5 ---
Fold 1 Competition F1 Score: 0.83466
--- Fold 2/5 ---
Fold 2 Competition F1 Score: 0.78716
--- Fold 3/5 ---
Fold 3 Competition F1 Score: 0.80084
--- Fold 4/5 ---
Fold 4 Competition F1 Score: 0.79071
--- Fold 5/5 ---
Fold 5 Competition F1 Score: 0.79366

--- CV Summary for Catboost ---
Mean Competition F1 Score: 0.80141
Std Dev: 0.01723

Experiment 'Wave 3a (PCA)-Catboost-CPU' logged to /home/bac/code/kaggle/kaggle-cmi-detect-behavior/experiment_log.csv


[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin


0,1
cv_score,▁

0,1
cv_score,0.80141
feature_wave,Wave 3a (PCA)
model_name,Catboost


Experiment 'Wave 3a (PCA)-Catboost-CPU' logged to W&B


### **UMAP + Wave 3**

In [None]:
def create_wave3_umap_features(df):
    """
    Creates Wave-3 features: Adds UMAP components from ToF sensors to the Wave-2
    feature set.
    """
    df_feat = df.copy()
    
    # Base features from wave 2
    df_feat['acc_mag'] = np.sqrt(df_feat['acc_x']**2 + df_feat['acc_y']**2 + df_feat['acc_z']**2)
    df_feat['rot_mag'] = np.sqrt(df_feat['rot_w']**2 + df_feat['rot_x']**2 + df_feat['rot_y']**2 + df_feat['rot_z']**2)
    df_feat['jerk'] = df_feat.groupby('sequence_id')['acc_mag'].diff().fillna(0)
    for i in range(1, 5):
        df_feat[f'thm_grad_{i}_{i+1}'] = df_feat[f'thm_{i}'] - df_feat[f'thm_{i+1}']
        
    # ToF feature creation using UMAP
    tof_cols = [f'tof_{s}_v{p}' for s in range(1, 6) for p in range(64)]
    tof_data = df_feat[tof_cols].replace(-1, np.nan)
    
    # Feature - percentage of invalid readings
    df_feat['tof_invalid_pct'] = tof_data.isna().mean(axis=1)
    
    # UMAP components - using standard UMAP parameters with n=10 components to
    # match PCA for a fair comparision
    
    reducer = UMAP(
        n_components=10,
        n_neighbors=15,
        min_dist=0.1,
        metric='euclidean',
        random_state=SEED,
    )
    
    # Fill remaining NaNs with 0 before applying UMAP
    tof_umap_features = reducer.fit_transform(tof_data.fillna(0))
    
    for i in range(tof_umap_features.shape[1]):
        df_feat[f'tof_umap_{i}'] = tof_umap_features[:, i]
        
    print(f"Created {tof_umap_features.shape[1]} ToF UMAP components.")

    # Phase specific aggregation
    base_cols_to_agg = [col for col in df.columns if 'acc_' in col or 'rot_' in col or 'thm_' in col]
    derived_cols_to_agg = ['acc_mag', 'rot_mag', 'jerk'] + [f'thm_grad_{i}_{i+1}' for i in range(1, 5)]
    tof_derived_cols_to_agg = ['tof_invalid_pct'] + [f'tof_umap_{i}' for i in range(10)]
    
    aggs = {}
    for col in base_cols_to_agg + derived_cols_to_agg + tof_derived_cols_to_agg:
        aggs[col] = ['mean', 'std', 'min', 'max']

    phase_agg_df = df_feat.groupby(['sequence_id', 'phase']).agg(aggs)
    phase_agg_df.columns = ['_'.join(col).strip() for col in phase_agg_df.columns.values]
    phase_agg_df_unstacked = phase_agg_df.unstack(level='phase')
    phase_agg_df_unstacked.columns = ['_'.join(col).strip() for col in phase_agg_df_unstacked.columns.values]
    
    meta_df = df.groupby('sequence_id').first()
    final_df = pd.concat([meta_df[['subject', 'gesture']], phase_agg_df_unstacked], axis=1).reset_index()
    final_df['gesture_encoded'] = final_df['gesture'].map(gesture_map)
    
    print(f"Feature engineering complete. Shape of features: {final_df.shape}")
    return final_df


In [22]:
# Create features
features_df = create_wave3_umap_features(train_df)

Created 10 ToF UMAP components.
Feature engineering complete. Shape of features: (8151, 244)


Yikes! 20 mins to train using the CPU. The algorithm's brute-force methodology doesn't scale well at all. Let's see what impact it had on the model's CV performance.

In [23]:
# Setup training run and prep data
FEATURE_WAVE = "Wave 3b (UMAP)"
MODEL_NAME = "Catboost"
EXPERIMENT_NAME = f"{FEATURE_WAVE}-{MODEL_NAME}-CPU"
N_SPLITS = 5

# --- Model Parameters ---
params = {
    'iterations': 1000, 'learning_rate': 0.05, 'depth': 6,
    'loss_function': 'MultiClass', 'eval_metric': 'MultiClass',
    'random_seed': SEED, 'verbose': 0
}

# --- Prepare data for CV ---
X = features_df.drop(columns=['sequence_id', 'subject', 'gesture', 'gesture_encoded'])
y = features_df['gesture_encoded']
groups = features_df['subject']

In [24]:
# Train and evaluate.
fold_scores = []
cv = StratifiedGroupKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)
for fold, (train_idx, val_idx) in enumerate(cv.split(X, y, groups)):
    print(f"--- Fold {fold+1}/{N_SPLITS} ---")
    
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]
    
    model = cat.CatBoostClassifier(**params)
    model.fit(X_train, y_train, eval_set=[(X_val, y_val)], early_stopping_rounds=100)
    
    # Predict on validation set
    val_preds_proba = model.predict_proba(X_val)
    
    # Evaluate fold performance using the official metric
    fold_score = average_f1_score(y_val, val_preds_proba)
    fold_scores.append(fold_score)
    print(f"Fold {fold+1} Competition F1 Score: {fold_score:.5f}")

# --- Final Score and Logging ---
mean_cv_score = np.mean(fold_scores)
print(f"\n--- CV Summary for {MODEL_NAME} ---")
print(f"Mean Competition F1 Score: {mean_cv_score:.5f}")
print(f"Std Dev: {np.std(fold_scores):.5f}\n")

tracker.log_experiment(
    experiment_name=EXPERIMENT_NAME, model_name=MODEL_NAME, feature_wave=FEATURE_WAVE,
    cv_score=mean_cv_score, params=params,
    notes="Evaluating Wave-3b baseline (Wave2 + ToF dimensionality) + competition metric + UMAP (10 components) "
)

--- Fold 1/5 ---
Fold 1 Competition F1 Score: 0.84191
--- Fold 2/5 ---
Fold 2 Competition F1 Score: 0.77774
--- Fold 3/5 ---
Fold 3 Competition F1 Score: 0.80002
--- Fold 4/5 ---
Fold 4 Competition F1 Score: 0.78125
--- Fold 5/5 ---


[34m[1mwandb[0m: Currently logged in as: [33mb-a-chaudhry[0m ([33mb-a-chaudhry-[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Fold 5 Competition F1 Score: 0.79240

--- CV Summary for Catboost ---
Mean Competition F1 Score: 0.79866
Std Dev: 0.02303

Experiment 'Wave 3b (UMAP)-Catboost-CPU' logged to /home/bac/code/kaggle/kaggle-cmi-detect-behavior/experiment_log.csv
Failed to log to W&B: [Errno 32] Broken pipe


**PCA Outperforms UMAP**

- Wave 3a (PCA) Score: 0.80141 (Std Dev: 0.01723)

- Wave 3b (UMAP) Score: 0.79866 (Std Dev: 0.02303)

- The model using PCA features achieved a slightly higher mean score and, importantly, had a lower standard deviation, indicating greater stability across the folds. While the difference in the mean score is small, the better stability gives us more confidence in the PCA-based features.

- For this dataset, the linear transformations of PCA are more effective at creating useful features from the ToF sensor data than the non-linear manifold learning of UMAP. 

## Feature Engineering - Wave 4 - Demographic Interaction Features

In [None]:
def create_wave4_features(df):
    """
    Creates Wave-4 features: Adds demographic features to Wave 3a-PCA
    feature set.
    """
    df_feat = df.copy()
    

SyntaxError: expected '(' (3222645179.py, line 1)