In [4]:
import os, json, joblib, numpy as np, pandas as pd
from pathlib import Path
import warnings 
import random
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import StratifiedGroupKFold

import tensorflow as tf
from tensorflow.keras.utils import Sequence, to_categorical, pad_sequences
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import (
    Input, Conv1D, BatchNormalization, Activation, add, MaxPooling1D, Dropout,
    Bidirectional, LSTM, GlobalAveragePooling1D, Dense, Multiply, Reshape,
    Lambda, Concatenate, GRU, GaussianNoise, Add, GlobalMaxPooling1D,
    MultiHeadAttention, LayerNormalization
)
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, LearningRateScheduler
from tensorflow.keras import backend as K

import polars as pl
from scipy.spatial.transform import Rotation as R

from cmi_2025_metric_copy_for_import import CompetitionMetric

In [5]:
# Ask TensorFlow which GPU devices it can see; the bare `gpus` expression
# displays the list as the cell output (an empty list means no GPU is visible).
gpus = tf.config.list_physical_devices('GPU')
gpus

[]

In [5]:
def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global RNG, TensorFlow's global
    seed and ``tf.experimental.numpy``, and sets the environment flags that
    request deterministic TensorFlow / cuDNN kernels.

    Args:
        seed: Integer seed applied to all generators.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    # Same seeding order as before: Python, NumPy, TensorFlow, TF-NumPy.
    seeders = (
        random.seed,
        np.random.seed,
        tf.random.set_seed,
        tf.experimental.numpy.random.seed,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    for flag in ('TF_CUDNN_DETERMINISTIC', 'TF_DETERMINISTIC_OPS'):
        os.environ[flag] = '1'


seed_everything(seed=42)

In [None]:
# ----- Run configuration (read by the cells below) -----
train = True                    # gates the training cell (`if train:`)
raw_dir = Path("input")  # directory holding train.csv and train_demographics.csv
pretrained_dir = Path("input/cmi-d-111")  # NOTE(review): not referenced in the visible cells
output_dir = Path("./")                                    # where scaler, label classes and models are written
batch_size = 64  # batch size of the mixup training generator
pad_percentile = 95  # sequences are padded/truncated to this percentile of sequence length
lr_init = 5e-4  # peak learning rate (Adam and the LR schedule)
wd = 3e-3  # L2 regularisation factor passed to the model builder
mixup_alpha = 0.4  # Beta(alpha, alpha) parameter for mixup
epochs = 220  # maximum epochs per fold
patience = 40  # early-stopping patience on val_loss

print("Imports ready")

Imports ready


In [7]:
#Tensor Manipulations
def time_sum(x):
    """Sum ``x`` over axis 1 (the time axis where attention_layer uses it)."""
    return K.sum(x, axis=1)

def squeeze_last_axis(x):
    """Remove the trailing axis of ``x`` via tf.squeeze(axis=-1)."""
    return tf.squeeze(x, axis=-1)

def expand_last_axis(x):
    """Append a new trailing axis of size 1 to ``x``."""
    return tf.expand_dims(x, axis=-1)

def se_block(x, reduction=8):
    """Squeeze-and-excitation gate over the channel axis of a 3-D tensor.

    The input is average-pooled over time, passed through a bottleneck
    Dense(relu) -> Dense(sigmoid) pair, reshaped to (1, channels) and
    multiplied back onto the input so every channel is rescaled.

    Args:
        x: Keras tensor whose last axis is the channel axis (static size).
        reduction: Divisor applied to the channel count to size the bottleneck.

    Returns:
        Tensor with the same shape as ``x``.
    """
    ch = x.shape[-1]
    # Never let the bottleneck collapse to zero units when the input has
    # fewer channels than `reduction`.
    bottleneck_units = max(1, ch // reduction)
    se = GlobalAveragePooling1D()(x)
    se = Dense(bottleneck_units, activation='relu')(se)
    se = Dense(ch, activation='sigmoid')(se)
    se = Reshape((1, ch))(se)
    return Multiply()([x, se])

# Residual CNN block with squeeze-and-excitation
def residual_se_cnn_block(x, filters, kernel_size, pool_size=2, drop=0.3, wd=1e-4):
    """Two Conv-BN-ReLU stages, an SE gate, a residual add, then pool and dropout.

    Args:
        x: Input Keras tensor.
        filters: Number of Conv1D filters in both stages (and the output width).
        kernel_size: Conv1D kernel size for both stages.
        pool_size: MaxPooling1D pool size applied after the residual add.
        drop: Dropout rate applied last.
        wd: L2 regularisation factor for every convolution kernel.

    Returns:
        Output tensor of the block.
    """
    skip = x
    out = x
    for _stage in range(2):
        out = Conv1D(
            filters,
            kernel_size,
            padding='same',
            use_bias=False,
            kernel_regularizer=l2(wd),
        )(out)
        out = BatchNormalization()(out)
        out = Activation('relu')(out)
    out = se_block(out)

    # Project the skip path with a 1x1 convolution only when widths differ.
    if skip.shape[-1] != filters:
        skip = Conv1D(
            filters,
            1,
            padding='same',
            use_bias=False,
            kernel_regularizer=l2(wd),
        )(skip)
        skip = BatchNormalization()(skip)

    out = add([out, skip])
    out = Activation('relu')(out)
    out = MaxPooling1D(pool_size)(out)
    return Dropout(drop)(out)

def attention_layer(inputs):
    """Attention pooling: a softmax-weighted sum of ``inputs`` over axis 1.

    A Dense(1, tanh) layer scores every step, the scores are softmax-normalised
    across steps, and the inputs are summed with those weights.

    Args:
        inputs: 3-D Keras tensor.

    Returns:
        Tensor with axis 1 summed out.
    """
    logits = Dense(1, activation='tanh')(inputs)
    logits = Lambda(squeeze_last_axis)(logits)
    step_weights = Activation('softmax')(logits)
    step_weights = Lambda(expand_last_axis)(step_weights)
    weighted = Multiply()([inputs, step_weights])
    return Lambda(time_sum)(weighted)

In [8]:
class EnhancedMixupGenerator(Sequence):
    """Keras Sequence yielding augmented, mixup-blended training batches.

    Per batch (when ``augment`` is true) each of the following is applied with
    probability 0.5, one random draw per batch:
      * a time shift of up to 5 steps with zero fill,
      * magnitude scaling of the leading IMU columns,
      * Gaussian noise on the leading IMU columns,
      * zeroing of 1-5 randomly chosen non-IMU columns.
    Mixup is then always applied, blending the batch with a permutation of
    itself.

    Args:
        X: Array indexed as X[idx] giving (batch, time, features).
        y: Array of targets aligned with ``X`` (soft/one-hot labels).
        batch_size: Number of samples per batch.
        alpha: Beta(alpha, alpha) parameter for mixup; ``alpha <= 0`` disables
            blending (lambda is fixed to 1).
        augment: Whether to apply the augmentations listed above.
        n_imu_features: Number of leading feature columns treated as IMU
            features. Defaults to 24, the value that was previously hard-coded.
    """

    def __init__(self, X, y, batch_size, alpha=0.4, augment=True, n_imu_features=24):
        # Let the Keras base class initialise its own state.
        super().__init__()
        self.X, self.y = X, y
        self.batch = batch_size
        self.alpha = alpha
        self.augment = augment
        self.indices = np.arange(len(X))
        self.n_imu_features = n_imu_features

    def __len__(self):
        """Number of batches per epoch (the last batch may be smaller)."""
        return int(np.ceil(len(self.X) / self.batch))

    def __getitem__(self, i):
        """Return batch ``i`` as ``(X_mix, y_mix)``, both float32."""
        idx = self.indices[i*self.batch:(i+1)*self.batch]
        Xb, yb = self.X[idx].copy(), self.y[idx].copy()

        # Never address more IMU columns than the batch actually has.
        n_imu = min(self.n_imu_features, Xb.shape[2])

        if self.augment:
            # Time shift instead of warping (simpler)
            if np.random.rand() > 0.5:
                shift = np.random.randint(-5, 6)  # Shift by up to 5 timesteps
                if shift > 0:
                    Xb[:, shift:, :] = Xb[:, :-shift, :]
                    Xb[:, :shift, :] = 0
                elif shift < 0:
                    Xb[:, :shift, :] = Xb[:, -shift:, :]
                    Xb[:, shift:, :] = 0

            # Magnitude scaling - only for IMU features
            if np.random.rand() > 0.5:
                scale = np.random.uniform(0.8, 1.2)
                Xb[:, :, :n_imu] *= scale

            # Add noise - only to IMU features
            if np.random.rand() > 0.5:
                noise = np.random.normal(0, 0.05, (len(Xb), Xb.shape[1], n_imu))
                Xb[:, :, :n_imu] += noise

            # Random feature dropout for ToF/thermal features
            if np.random.rand() > 0.5:
                n_drop = np.random.randint(1, 6)
                aux_columns = np.arange(n_imu, Xb.shape[2])
                # Skip when there are no non-IMU columns, and never ask for
                # more columns than exist (choice(replace=False) would raise).
                if aux_columns.size:
                    drop_indices = np.random.choice(
                        aux_columns,
                        size=min(n_drop, aux_columns.size),
                        replace=False
                    )
                    Xb[:, :, drop_indices] = 0

        # Mixup: np.random.beta rejects non-positive parameters, so a
        # non-positive alpha means "no blending".
        lam = np.random.beta(self.alpha, self.alpha) if self.alpha > 0 else 1.0
        perm = np.random.permutation(len(Xb))
        X_mix = lam * Xb + (1-lam) * Xb[perm]
        y_mix = lam * yb + (1-lam) * yb[perm]

        return X_mix.astype('float32'), y_mix.astype('float32')

    def on_epoch_end(self):
        """Reshuffle the sample order for the next epoch."""
        np.random.shuffle(self.indices)

In [9]:
# Linear warmup followed by a single cosine decay (no restarts)
def cosine_annealing_with_warmup(epoch, lr, warmup_epochs=10, total_epochs=160, min_lr=1e-6):
    """Learning rate for ``epoch`` under linear warmup plus cosine decay.

    Args:
        epoch: Zero-based epoch index.
        lr: Peak learning rate reached at the end of warmup. This is the base
            rate of the schedule, not the optimizer's current rate.
        warmup_epochs: Number of epochs over which the rate ramps up linearly.
        total_epochs: Epoch at which the cosine decay reaches ``min_lr``.
        min_lr: Floor of the schedule.

    Returns:
        The learning rate as a Python float.
    """
    if epoch < warmup_epochs:
        return lr * (epoch + 1) / warmup_epochs
    # Guard against a zero-length decay phase.
    decay_epochs = max(1, total_epochs - warmup_epochs)
    # Clamp so that training beyond total_epochs stays at min_lr; without the
    # clamp the cosine turns around and the rate rises again.
    progress = min(1.0, (epoch - warmup_epochs) / decay_epochs)
    return float(min_lr + (lr - min_lr) * 0.5 * (1 + np.cos(np.pi * progress)))

# Create learning rate scheduler
# Pass the real epoch budget: the schedule's default total_epochs (160) is
# shorter than `epochs`, which would leave the decay finished long before
# training ends.
lr_scheduler = LearningRateScheduler(
    lambda epoch: cosine_annealing_with_warmup(epoch, lr_init, total_epochs=epochs)
)

# Training callbacks
# NOTE(review): LearningRateScheduler re-applies the schedule at the start of
# every epoch, so a reduction made by ReduceLROnPlateau does not survive into
# the next epoch. It is kept so the list contents stay unchanged; consider
# dropping one of the two.
training_callbacks = [
    EarlyStopping(monitor='val_loss', patience=patience, restore_best_weights=True),
    lr_scheduler,
    tf.keras.callbacks.ReduceLROnPlateau(factor=0.5, patience=10, min_lr=1e-7, verbose=1),
    tf.keras.callbacks.ModelCheckpoint(
        str(output_dir / 'best_model_{epoch:02d}_{val_loss:.4f}.h5'),
        save_best_only=True,
        monitor='val_loss'
    )
]

In [10]:
def remove_gravity_from_acc(acc_data, rot_data):
    """Subtract the gravity vector, expressed in the sensor frame, from acceleration.

    For every row the world gravity vector (0, 0, 9.81) is rotated into the
    sensor frame with the inverse of that row's orientation quaternion and
    subtracted from the measured acceleration.

    Args:
        acc_data: DataFrame with columns acc_x/acc_y/acc_z, or an (n, 3) array.
        rot_data: DataFrame with columns rot_x/rot_y/rot_z/rot_w, or an (n, 4)
            array in scalar-last order (x, y, z, w).

    Returns:
        (n, 3) float array of linear acceleration. Rows whose quaternion is
        unusable (any non-finite component, or all components ~0) keep the
        raw acceleration unchanged.
    """
    if isinstance(acc_data, pd.DataFrame):
        acc_values = acc_data[['acc_x', 'acc_y', 'acc_z']].values
    else:
        acc_values = acc_data

    if isinstance(rot_data, pd.DataFrame):
        quat_values = rot_data[['rot_x', 'rot_y', 'rot_z', 'rot_w']].values
    else:
        quat_values = rot_data

    acc_values = np.asarray(acc_values)
    if not np.issubdtype(acc_values.dtype, np.floating):
        # Integer input would otherwise truncate the gravity-corrected result.
        acc_values = acc_values.astype(float)
    quat_values = np.asarray(quat_values, dtype=float)

    # Default: pass the raw acceleration through.
    linear_accel = acc_values.copy()

    gravity_world = np.array([0, 0, 9.81])

    # A quaternion is usable only if every component is finite and it is not
    # the all-zero placeholder. A partially-NaN quaternion would otherwise
    # slip through and poison the row.
    usable = np.isfinite(quat_values).all(axis=1) & ~np.isclose(quat_values, 0).all(axis=1)

    if usable.any():
        # One batched rotation instead of one Rotation object per sample.
        rotations = R.from_quat(quat_values[usable])
        gravity_sensor_frame = rotations.apply(gravity_world, inverse=True)
        linear_accel[usable] = acc_values[usable] - gravity_sensor_frame

    return linear_accel

def calculate_angular_velocity_from_quat(rot_data, time_delta=1/200): # Assuming 200Hz sampling rate
    """Estimate angular velocity from consecutive orientation quaternions.

    For each pair of consecutive samples the relative rotation
    ``inv(q[i]) * q[i+1]`` is converted to a rotation vector and divided by
    ``time_delta``; this approximates angular velocity for small rotations.

    Args:
        rot_data: DataFrame with columns rot_x/rot_y/rot_z/rot_w, or an (n, 4)
            array in scalar-last order (x, y, z, w).
        time_delta: Time between consecutive samples, in seconds.

    Returns:
        (n, 3) float array. The last row, and any row where either quaternion
        of the pair is unusable (non-finite component or all ~0), is zero.
    """
    if isinstance(rot_data, pd.DataFrame):
        quat_values = rot_data[['rot_x', 'rot_y', 'rot_z', 'rot_w']].values
    else:
        quat_values = rot_data
    quat_values = np.asarray(quat_values, dtype=float)

    num_samples = quat_values.shape[0]
    angular_vel = np.zeros((num_samples, 3))
    if num_samples < 2:
        return angular_vel

    # Usable = all components finite and not the all-zero placeholder.
    # Checking *any* NaN (not *all*) keeps partially-missing quaternions out.
    usable = np.isfinite(quat_values).all(axis=1) & ~np.isclose(quat_values, 0).all(axis=1)
    pair_start = np.flatnonzero(usable[:-1] & usable[1:])

    if pair_start.size:
        rot_t = R.from_quat(quat_values[pair_start])
        rot_t_plus_dt = R.from_quat(quat_values[pair_start + 1])
        # Relative rotation between consecutive samples, batched over pairs.
        delta_rot = rot_t.inv() * rot_t_plus_dt
        angular_vel[pair_start] = delta_rot.as_rotvec() / time_delta

    return angular_vel

In [11]:
def calculate_angular_distance(rot_data):
    """Rotation angle (radians) between each pair of consecutive quaternions.

    Args:
        rot_data: DataFrame with columns rot_x/rot_y/rot_z/rot_w, or an (n, 4)
            array in scalar-last order (x, y, z, w).

    Returns:
        Length-n float array; entry i is the angle of ``inv(q[i]) * q[i+1]``.
        The last entry, and any entry where either quaternion of the pair is
        unusable (non-finite component or all ~0), is zero.
    """
    if isinstance(rot_data, pd.DataFrame):
        quat_values = rot_data[['rot_x', 'rot_y', 'rot_z', 'rot_w']].values
    else:
        quat_values = rot_data
    quat_values = np.asarray(quat_values, dtype=float)

    num_samples = quat_values.shape[0]
    angular_dist = np.zeros(num_samples)
    if num_samples < 2:
        return angular_dist

    # Usable = all components finite and not the all-zero placeholder.
    # Checking *any* NaN (not *all*) keeps partially-missing quaternions out.
    usable = np.isfinite(quat_values).all(axis=1) & ~np.isclose(quat_values, 0).all(axis=1)
    pair_start = np.flatnonzero(usable[:-1] & usable[1:])

    if pair_start.size:
        r1 = R.from_quat(quat_values[pair_start])
        r2 = R.from_quat(quat_values[pair_start + 1])
        relative_rotation = r1.inv() * r2
        # The norm of the rotation vector is the rotation angle.
        angular_dist[pair_start] = np.linalg.norm(relative_rotation.as_rotvec(), axis=1)

    return angular_dist

In [12]:
def SEBlock(ratio=16):
    """Return a callable that applies a squeeze-and-excitation gate.

    This is the factory-style counterpart of ``se_block``: the returned
    function builds the same layer stack (GlobalAveragePooling1D ->
    Dense(relu) -> Dense(sigmoid) -> Reshape -> Multiply), so it delegates to
    ``se_block`` instead of keeping a second copy of the logic.

    Args:
        ratio: Channel reduction ratio for the bottleneck Dense layer.

    Returns:
        A function mapping a 3-D Keras tensor to a tensor of the same shape.
    """
    def block(inputs):
        return se_block(inputs, reduction=ratio)
    return block

In [13]:
def create_enhanced_imu_features(acc_data, rot_data, sampling_rate=200):
    """Build a 20-column IMU feature matrix from acceleration and orientation.

    Column layout: raw acceleration (3), quaternion (4), linear acceleration
    (3), angular velocity (3), jerk (3), acceleration magnitude (1), linear
    acceleration magnitude (1), angular velocity magnitude (1), angular
    distance (1).

    Args:
        acc_data: DataFrame with columns acc_x/acc_y/acc_z, or an (n, 3) array
            (arrays are accepted for consistency with the helper functions).
        rot_data: DataFrame with columns rot_x/rot_y/rot_z/rot_w, or an (n, 4)
            array.
        sampling_rate: Samples per second, used to scale the time derivatives.

    Returns:
        (n, 20) array of features.
    """
    if isinstance(acc_data, pd.DataFrame):
        acc_values = acc_data[['acc_x', 'acc_y', 'acc_z']].values
    else:
        acc_values = np.asarray(acc_data)

    if isinstance(rot_data, pd.DataFrame):
        quat_values = rot_data[['rot_x', 'rot_y', 'rot_z', 'rot_w']].values
    else:
        quat_values = np.asarray(rot_data)

    # Get gravity-compensated acceleration
    linear_acc = remove_gravity_from_acc(acc_values, quat_values)

    # Angular velocity
    angular_vel = calculate_angular_velocity_from_quat(quat_values, 1/sampling_rate)

    # Angular distance/displacement
    angular_dist = calculate_angular_distance(quat_values)

    # Magnitude features
    acc_magnitude = np.linalg.norm(acc_values, axis=1)
    linear_acc_magnitude = np.linalg.norm(linear_acc, axis=1)
    angular_vel_magnitude = np.linalg.norm(angular_vel, axis=1)

    # Jerk (derivative of acceleration). np.gradient needs at least two
    # samples, so shorter sequences get zero jerk instead of raising.
    if linear_acc.shape[0] >= 2:
        jerk = np.gradient(linear_acc, axis=0) * sampling_rate
    else:
        jerk = np.zeros_like(linear_acc)

    # Combine all features
    enhanced_features = np.column_stack([
        acc_values,  # 3 features
        quat_values,  # 4 features
        linear_acc,  # 3 features
        angular_vel,  # 3 features
        jerk,  # 3 features
        acc_magnitude.reshape(-1, 1),  # 1 feature
        linear_acc_magnitude.reshape(-1, 1),  # 1 feature
        angular_vel_magnitude.reshape(-1, 1),  # 1 feature
        angular_dist.reshape(-1, 1)  # 1 feature
    ])  # Total: 20 features instead of 7

    return enhanced_features

In [14]:
def build_two_branch_model_v2(pad_len, imu_dim, tof_dim, n_classes, wd=1e-4):
    """Build the two-branch (IMU + ToF/thermal) sequence classifier.

    The single input of shape (pad_len, imu_dim + tof_dim) is split along the
    feature axis: the first ``imu_dim`` columns feed the IMU branch, the rest
    feed the ToF branch. Each branch is reduced by two pooling stages of
    size 2, the branches exchange information through two multi-head
    attention layers, and the merged sequence is processed by a BiLSTM, a
    BiGRU and a noisy Dense path before pooling and a dense classifier head.

    Args:
        pad_len: Number of time steps of the padded input sequences.
        imu_dim: Number of leading feature columns routed to the IMU branch.
        tof_dim: Number of remaining feature columns routed to the ToF branch.
        n_classes: Size of the softmax output.
        wd: L2 regularisation factor used where ``kernel_regularizer`` is set.

    Returns:
        An uncompiled Keras ``Model``.
    """
    inp = Input(shape=(pad_len, imu_dim+tof_dim))
    # NOTE(review): these Lambda layers wrap inline lambdas closing over
    # imu_dim, unlike the named helpers used in attention_layer; confirm the
    # saved .h5 models reload correctly with load_model.
    imu = Lambda(lambda t: t[:, :, :imu_dim])(inp)
    tof = Lambda(lambda t: t[:, :, imu_dim:])(inp)
    
    # Enhanced IMU branch with multi-scale
    # Three parallel residual blocks (kernel sizes 3/5/7) on the same input.
    x1_3 = residual_se_cnn_block(imu, 64, 3, drop=0.1, wd=wd)
    x1_5 = residual_se_cnn_block(imu, 64, 5, drop=0.1, wd=wd)
    x1_7 = residual_se_cnn_block(imu, 64, 7, drop=0.1, wd=wd)
    x1 = Concatenate()([x1_3, x1_5, x1_7])
    x1 = Conv1D(128, 1, activation='relu')(x1)  # Channel fusion
    x1 = residual_se_cnn_block(x1, 128, 5, drop=0.1, wd=wd)
    
    # Enhanced TOF branch with 1x1 conv first
    x2 = Conv1D(128, 1, padding='same', use_bias=False, kernel_regularizer=l2(wd))(tof)
    x2 = BatchNormalization()(x2)
    x2 = Activation('relu')(x2)
    
    # Multi-scale for ToF
    x2_3 = Conv1D(96, 3, padding='same', activation='relu')(x2)
    x2_5 = Conv1D(96, 5, padding='same', activation='relu')(x2)
    x2_7 = Conv1D(96, 7, padding='same', activation='relu')(x2)
    x2 = Concatenate()([x2_3, x2_5, x2_7])
    x2 = BatchNormalization()(x2)
    x2 = SEBlock(ratio=16)(x2)
    x2 = MaxPooling1D(2)(x2)
    x2 = Dropout(0.2)(x2)
    
    # Second pooling stage for the ToF branch (inside the residual block), so
    # both branches have been pooled twice before they are combined.
    x2 = residual_se_cnn_block(x2, 256, 3, drop=0.2, wd=wd)
    
    # Cross-modal attention before merging
    # First call: x1 is the query and x2 the value; second call: the reverse.
    x1_att = MultiHeadAttention(num_heads=4, key_dim=32)(x1, x2)
    x2_att = MultiHeadAttention(num_heads=4, key_dim=32)(x2, x1)
    
    merged = Concatenate()([x1, x2, x1_att, x2_att])
    
    # Enhanced temporal modeling
    # Three parallel paths over the merged sequence: BiLSTM, BiGRU, noisy Dense.
    xa = Bidirectional(LSTM(128, return_sequences=True, kernel_regularizer=l2(wd),
                           dropout=0.2, recurrent_dropout=0.2))(merged)
    xb = Bidirectional(GRU(128, return_sequences=True, kernel_regularizer=l2(wd),
                          dropout=0.2, recurrent_dropout=0.2))(merged)
    xc = GaussianNoise(0.09)(merged)
    xc = Dense(128, activation='elu')(xc)
    
    x = Concatenate()([xa, xb, xc])
    x = Dropout(0.4)(x)
    
    # Multiple attention mechanisms
    # Collapse the time axis three ways (attention, average, max) and concatenate.
    x_att1 = attention_layer(x)
    x_att2 = GlobalAveragePooling1D()(x)
    x_att3 = GlobalMaxPooling1D()(x)
    x = Concatenate()([x_att1, x_att2, x_att3])
    
    # Enhanced classifier with skip connection
    x_skip = x
    x = Dense(512, use_bias=False, kernel_regularizer=l2(wd))(x)
    x = BatchNormalization()(x)
    x = Activation('relu')(x)
    x = Dropout(0.5)(x)
    
    x = Dense(256, use_bias=False, kernel_regularizer=l2(wd))(x)
    x = BatchNormalization()(x)
    x = Activation('relu')(x)
    # Project the pooled features to 256 units so they can be added as a skip.
    x_skip = Dense(256, kernel_regularizer=l2(wd))(x_skip)
    x = Add()([x, x_skip])
    x = Dropout(0.4)(x)
    
    x = Dense(128, use_bias=False, kernel_regularizer=l2(wd))(x)
    x = BatchNormalization()(x)
    x = Activation('relu')(x)
    x = Dropout(0.3)(x)
    
    out = Dense(n_classes, activation='softmax', kernel_regularizer=l2(wd))(x)
    return Model(inp, out)
    
# Build the model once with fixed example dimensions (presumably a smoke test
# of the architecture). NOTE(review): tmp_model is not used again in the
# visible cells.
tmp_model = build_two_branch_model_v2(127,7,325,18)

In [None]:
# Training configuration
n_splits = 5  # Number of folds for cross-validation
# NOTE(review): gate_loss_weight and masking_prob are not referenced in the
# visible cells.
gate_loss_weight = 0.1  # If using gated model
masking_prob = 0.2  # For data augmentation

# End-to-end training pipeline: load data, engineer per-sequence features,
# scale and pad, then train one model per StratifiedGroupKFold fold and report
# out-of-fold scores. Artifacts are written to output_dir along the way.
if train:
    print("Loading dataset...")
    
    # Load data
    df = pd.read_csv(raw_dir / "train.csv")
    train_dem_df = pd.read_csv(raw_dir / "train_demographics.csv")
    df = pd.merge(df, train_dem_df, on='subject', how='left')
    
    # Encode labels
    le = LabelEncoder()
    df['gesture_int'] = le.fit_transform(df['gesture'])
    gesture_classes = le.classes_
    np.save(output_dir / "gesture_classes.npy", gesture_classes)
    
    # ===== PHYSICAL FEATURE ENGINEERING =====
    print("Removing gravity and calculating linear acceleration features...")
    
    # Process by sequence for efficiency
    # (diff-based features must not cross sequence boundaries, hence the groupby)
    sequence_features = []
    for seq_id, group in df.groupby('sequence_id'):
        seq_data = group.copy()
        
        # Linear acceleration (gravity removed)
        linear_accel = remove_gravity_from_acc(
            seq_data[['acc_x', 'acc_y', 'acc_z']], 
            seq_data[['rot_x', 'rot_y', 'rot_z', 'rot_w']]
        )
        seq_data['linear_acc_x'] = linear_accel[:, 0]
        seq_data['linear_acc_y'] = linear_accel[:, 1]
        seq_data['linear_acc_z'] = linear_accel[:, 2]
        
        # Acceleration magnitudes
        seq_data['acc_mag'] = np.sqrt(
            seq_data['acc_x']**2 + seq_data['acc_y']**2 + seq_data['acc_z']**2
        )
        seq_data['linear_acc_mag'] = np.sqrt(
            seq_data['linear_acc_x']**2 + seq_data['linear_acc_y']**2 + seq_data['linear_acc_z']**2
        )
        # Plain first difference (not multiplied by the 200 Hz rate, unlike the jerk columns below).
        seq_data['linear_acc_mag_jerk'] = seq_data['linear_acc_mag'].diff().fillna(0)
        
        # Jerk features
        seq_data['jerk_x'] = seq_data['linear_acc_x'].diff() * 200  # 200Hz
        seq_data['jerk_y'] = seq_data['linear_acc_y'].diff() * 200
        seq_data['jerk_z'] = seq_data['linear_acc_z'].diff() * 200
        seq_data[['jerk_x', 'jerk_y', 'jerk_z']] = seq_data[['jerk_x', 'jerk_y', 'jerk_z']].fillna(0)
        
        # Angular velocity
        # (uses the helper's default time_delta of 1/200 s)
        angular_vel = calculate_angular_velocity_from_quat(
            seq_data[['rot_x', 'rot_y', 'rot_z', 'rot_w']]
        )
        seq_data['angular_vel_x'] = angular_vel[:, 0]
        seq_data['angular_vel_y'] = angular_vel[:, 1]
        seq_data['angular_vel_z'] = angular_vel[:, 2]
        seq_data['angular_vel_mag'] = np.sqrt(
            angular_vel[:, 0]**2 + angular_vel[:, 1]**2 + angular_vel[:, 2]**2
        )
        
        # Angular distance
        seq_data['angular_distance'] = calculate_angular_distance(
            seq_data[['rot_x', 'rot_y', 'rot_z', 'rot_w']]
        )
        
        # Angular acceleration
        seq_data['angular_acc_x'] = seq_data['angular_vel_x'].diff() * 200
        seq_data['angular_acc_y'] = seq_data['angular_vel_y'].diff() * 200
        seq_data['angular_acc_z'] = seq_data['angular_vel_z'].diff() * 200
        seq_data[['angular_acc_x', 'angular_acc_y', 'angular_acc_z']] = \
            seq_data[['angular_acc_x', 'angular_acc_y', 'angular_acc_z']].fillna(0)
        
        sequence_features.append(seq_data)
    
    # Combine all sequences
    df = pd.concat(sequence_features, ignore_index=True)
    
    # ===== FEATURE COLUMN ORGANIZATION =====
    # IMU features (enhanced)
    imu_cols_base = ['acc_x', 'acc_y', 'acc_z', 
                     'linear_acc_x', 'linear_acc_y', 'linear_acc_z',
                     'rot_x', 'rot_y', 'rot_z', 'rot_w']
    
    imu_engineered = ['acc_mag', 'linear_acc_mag', 'linear_acc_mag_jerk',
                      'jerk_x', 'jerk_y', 'jerk_z',
                      'angular_vel_x', 'angular_vel_y', 'angular_vel_z', 'angular_vel_mag',
                      'angular_distance',
                      'angular_acc_x', 'angular_acc_y', 'angular_acc_z']
    
    # dict.fromkeys de-duplicates while preserving order.
    imu_cols = list(dict.fromkeys(imu_cols_base + imu_engineered))
    
    # Thermal columns
    thm_cols = [c for c in df.columns if c.startswith('thm_')]
    
    # ToF aggregated columns (will be computed per sequence)
    tof_aggregated_cols = []
    for i in range(1, 6):
        tof_aggregated_cols.extend([
            f'tof_{i}_mean', f'tof_{i}_std', f'tof_{i}_min', f'tof_{i}_max',
            f'tof_{i}_median', f'tof_{i}_q25', f'tof_{i}_q75', 
            f'tof_{i}_range', f'tof_{i}_iqr', f'tof_{i}_valid_count'
        ])
    
    # All features
    # IMU columns come first: the model and the mixup generator both split the
    # feature axis by position.
    final_feature_cols = imu_cols + thm_cols + tof_aggregated_cols
    imu_dim = len(imu_cols)
    tof_thm_dim = len(thm_cols) + len(tof_aggregated_cols)
    
    print(f"IMU features: {imu_dim} | THM + ToF features: {tof_thm_dim} | Total: {len(final_feature_cols)}")
    np.save(output_dir / "feature_cols.npy", np.array(final_feature_cols))
    
    # ===== BUILD SEQUENCES =====
    print("Building sequences...")
    seq_gp = df.groupby('sequence_id')
    X_list_unscaled, y_list_int, groups_list, lens = [], [], [], []
    
    for seq_id, seq_df in seq_gp:
        seq_df_copy = seq_df.copy()
        
        # Compute ToF aggregations
        for i in range(1, 6):
            pixel_cols = [f"tof_{i}_v{p}" for p in range(64)]
            # -1 is treated as a missing reading and excluded from the statistics.
            tof_data = seq_df_copy[pixel_cols].replace(-1, np.nan)
            
            # Basic stats
            seq_df_copy[f'tof_{i}_mean'] = tof_data.mean(axis=1)
            seq_df_copy[f'tof_{i}_std'] = tof_data.std(axis=1)
            seq_df_copy[f'tof_{i}_min'] = tof_data.min(axis=1)
            seq_df_copy[f'tof_{i}_max'] = tof_data.max(axis=1)
            
            # Enhanced stats
            seq_df_copy[f'tof_{i}_median'] = tof_data.median(axis=1)
            seq_df_copy[f'tof_{i}_q25'] = tof_data.quantile(0.25, axis=1)
            seq_df_copy[f'tof_{i}_q75'] = tof_data.quantile(0.75, axis=1)
            seq_df_copy[f'tof_{i}_range'] = seq_df_copy[f'tof_{i}_max'] - seq_df_copy[f'tof_{i}_min']
            seq_df_copy[f'tof_{i}_iqr'] = seq_df_copy[f'tof_{i}_q75'] - seq_df_copy[f'tof_{i}_q25']
            seq_df_copy[f'tof_{i}_valid_count'] = tof_data.notna().sum(axis=1)
        
        # Extract features and labels
        # Missing values: forward-fill, then back-fill, then 0 for columns that are entirely missing.
        X_list_unscaled.append(
            seq_df_copy[final_feature_cols].ffill().bfill().fillna(0).values.astype('float32')
        )
        y_list_int.append(seq_df_copy['gesture_int'].iloc[0])
        groups_list.append(seq_df_copy['subject'].iloc[0])
        lens.append(len(seq_df_copy))
    
    # ===== SCALING AND PADDING =====
    # NOTE(review): the scaler is fitted on every sequence before the CV split,
    # so validation-fold statistics influence the scaling - confirm this is
    # acceptable for the reported OOF scores.
    print("Fitting StandardScaler...")
    all_steps_concatenated = np.concatenate(X_list_unscaled, axis=0)
    scaler = StandardScaler().fit(all_steps_concatenated)
    joblib.dump(scaler, output_dir / "scaler.pkl")
    
    print("Scaling and padding sequences...")
    X_scaled_list = [scaler.transform(x_seq) for x_seq in X_list_unscaled]
    
    # Determine padding length
    pad_len = int(np.percentile(lens, pad_percentile))
    np.save(output_dir / "sequence_maxlen.npy", pad_len)
    print(f"Padding sequences to length {pad_len}")
    
    # Pad sequences
    # Longer sequences are truncated at the end; shorter ones are zero-padded at the end.
    X = pad_sequences(X_scaled_list, maxlen=pad_len, padding='post', 
                      truncating='post', dtype='float32')
    y_stratify = np.array(y_list_int)
    groups = np.array(groups_list)
    y = to_categorical(y_list_int, num_classes=len(le.classes_))
    
    print(f"Final data shape: X={X.shape}, y={y.shape}")
    
    # ===== STRATIFIED GROUP K-FOLD CROSS-VALIDATION =====
    # Groups are subjects, so a subject never appears in both train and validation.
    print(f"\n  Starting training with Stratified Group K-Fold CV (n_splits={n_splits})...")
    sgkf = StratifiedGroupKFold(n_splits=n_splits, shuffle=True, random_state=42)
    oof_preds = np.zeros_like(y, dtype='float32')
    fold_scores = []
    
    for fold, (train_idx, val_idx) in enumerate(sgkf.split(X, y_stratify, groups)):
        print(f"\n{'='*20} FOLD {fold+1}/{n_splits} {'='*20}")
        
        # Split data
        X_tr, X_val = X[train_idx], X[val_idx]
        y_tr, y_val = y[train_idx], y[val_idx]
        
        print(f"  Train: {len(X_tr)} samples | Val: {len(X_val)} samples")
        
        # Build model
        model = build_two_branch_model_v2(
            pad_len=pad_len, 
            imu_dim=imu_dim, 
            tof_dim=tof_thm_dim, 
            n_classes=len(le.classes_), 
            wd=wd
        )
        
        # Compile with label smoothing
        model.compile(
            optimizer=Adam(learning_rate=lr_init),
            loss=tf.keras.losses.CategoricalCrossentropy(label_smoothing=0.1),
            metrics=['accuracy']
        )
        
        # Class weights for imbalanced data
        class_weights = compute_class_weight(
            'balanced', 
            classes=np.arange(len(le.classes_)), 
            y=y_tr.argmax(1)
        )
        class_weight_dict = dict(enumerate(class_weights))
        
        # Data generators
        # NOTE(review): EnhancedMixupGenerator treats the first 24 feature
        # columns as IMU features (its n_imu_features); keep that in sync with
        # imu_dim if the IMU column list changes.
        train_gen = EnhancedMixupGenerator(
            X_tr, y_tr, 
            batch_size=batch_size, 
            alpha=mixup_alpha, 
            augment=True
        )
        
        # Callbacks with fold-specific naming
        # NOTE(review): lr_scheduler re-applies its schedule at the start of
        # every epoch, so ReduceLROnPlateau reductions likely have no lasting
        # effect - confirm and drop one of the two.
        fold_callbacks = [
            EarlyStopping(
                monitor='val_loss', 
                patience=patience, 
                restore_best_weights=True, 
                verbose=1
            ),
            lr_scheduler,
            tf.keras.callbacks.ReduceLROnPlateau(
                factor=0.5, 
                patience=10, 
                min_lr=1e-7, 
                verbose=1
            ),
            tf.keras.callbacks.ModelCheckpoint(
                str(output_dir / f'model_fold_{fold}_best.h5'),
                save_best_only=True,
                monitor='val_loss',
                verbose=1
            )
        ]
        
        # Train
        history = model.fit(
            train_gen,
            epochs=epochs,
            validation_data=(X_val, y_val),
            callbacks=fold_callbacks,
            class_weight=class_weight_dict,
            verbose=1
        )
        
        # Save final model for this fold
        # (EarlyStopping uses restore_best_weights=True, so if it stopped
        # training these are the best-val_loss weights)
        model.save(output_dir / f"model_fold_{fold}_final.h5")
        
        # Get predictions for OOF
        preds_val = model.predict(X_val, verbose=0)
        oof_preds[val_idx] = preds_val
        
        # Calculate fold score
        fold_acc = np.mean(preds_val.argmax(1) == y_val.argmax(1))
        fold_scores.append(fold_acc)
        print(f"Fold {fold+1} Validation Accuracy: {fold_acc:.4f}")
        
        # Clear session to free memory
        tf.keras.backend.clear_session()
    
    # ===== CALCULATE OOF SCORE =====
    print(f"\n{'='*50}")
    print("Training Complete!")
    print(f"Average Fold Score: {np.mean(fold_scores):.4f} (+/- {np.std(fold_scores):.4f})")
    
    # Calculate overall OOF accuracy
    oof_acc = np.mean(oof_preds.argmax(1) == y.argmax(1))
    print(f"Overall OOF Accuracy: {oof_acc:.4f}")
    
    # Save OOF predictions
    np.save(output_dir / "oof_predictions.npy", oof_preds)
    
    # If competition metric is available
    # NOTE(review): CompetitionMetric is imported in the first cell, so a
    # missing module would fail there; this ImportError handler likely never
    # fires.
    try:
        true_oof_int = y.argmax(1)
        pred_oof_int = oof_preds.argmax(1)
        
        h_f1_oof = CompetitionMetric().calculate_hierarchical_f1(
            pd.DataFrame({'gesture': le.classes_[true_oof_int]}),
            pd.DataFrame({'gesture': le.classes_[pred_oof_int]})
        )
        print(f"Overall OOF Hierarchical F1 Score: {h_f1_oof:.4f}")
    except ImportError:
        print("Competition metric not available - skipping H-F1 calculation")
    
    print("Training pipeline completed successfully!")

Loading dataset...
Removing gravity and calculating linear acceleration features...
IMU features: 24 | THM + ToF features: 55 | Total: 79
Building sequences...
