In [None]:
import os, json, joblib, numpy as np, pandas as pd
from pathlib import Path
import warnings 
import random
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import StratifiedGroupKFold

import tensorflow as tf
from tensorflow.keras.utils import Sequence, to_categorical, pad_sequences
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import (
    Input, Conv1D, BatchNormalization, Activation, add, MaxPooling1D, Dropout,
    Bidirectional, LSTM, GlobalAveragePooling1D, Dense, Multiply, Reshape,
    Lambda, Concatenate, GRU, GaussianNoise, Add, GlobalMaxPooling1D,
    MultiHeadAttention, LayerNormalization
)
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, LearningRateScheduler
from tensorflow.keras import backend as K

import polars as pl
from scipy.spatial.transform import Rotation as R
from joblib import Parallel, delayed
import multiprocessing

# Enable mixed precision training for faster computation.
# This installs 'mixed_float16' as the global Keras dtype policy; the model
# builder further down overrides it with dtype='float32' on its output layers.
from tensorflow.keras import mixed_precision
policy = mixed_precision.Policy('mixed_float16')
mixed_precision.set_global_policy(policy)

In [None]:
# Query the GPU devices TensorFlow reports; the bare expression on the last
# line makes the notebook display the resulting list.
gpus = tf.config.list_physical_devices('GPU')
gpus

In [None]:
def seed_everything(seed):
    """Seed every random number source this notebook touches.

    Sets the Python hash seed, seeds the `random`, NumPy and TensorFlow
    generators (including `tf.experimental.numpy`), and switches on the
    TensorFlow determinism environment flags.

    Args:
        seed: Integer seed applied to every generator.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    # Same seed for each generator, applied in a fixed order.
    seeders = (
        random.seed,
        np.random.seed,
        tf.random.set_seed,
        tf.experimental.numpy.random.seed,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    # Environment switches requesting deterministic TensorFlow/cuDNN kernels.
    for flag in ('TF_CUDNN_DETERMINISTIC', 'TF_DETERMINISTIC_OPS'):
        os.environ[flag] = '1'


seed_everything(seed=42)

In [None]:
# --- Run configuration -------------------------------------------------------
train = True                            # gates the training cell (`if train:`)
raw_dir = Path("input")                 # directory holding train.csv / train_demographics.csv
pretrained_dir = Path("input/cmi-d-111")  # not referenced by the training cells shown here
output_dir = Path("./")                 # where scaler, metadata and model files are written

# --- Data / batching ---------------------------------------------------------
batch_size = 64
pad_percentile = 95                     # sequence length percentile used as the padded length

# --- Optimisation ------------------------------------------------------------
lr_init = 5e-4                          # initial Adam learning rate
wd = 3e-3                               # L2 regularisation factor passed to the model builder
mixup_alpha = 0.4                       # Beta(alpha, alpha) parameter for MixUp
epochs = 160
patience = 40                           # EarlyStopping patience (was assigned twice; kept once)

# --- Cross-validation --------------------------------------------------------
n_splits = 5

print("Imports ready")

In [None]:
#Tensor Manipulations
def time_sum(x):
    """Collapse axis 1 of `x` by summation (used on the time axis by `attention_layer`)."""
    summed = K.sum(x, axis=1)
    return summed

def squeeze_last_axis(x):
    """Drop the trailing axis of `x` (that axis must have size 1)."""
    squeezed = tf.squeeze(x, axis=-1)
    return squeezed

def expand_last_axis(x):
    """Append a new trailing axis of size 1 to `x`."""
    expanded = tf.expand_dims(x, axis=-1)
    return expanded

def se_block(x, reduction=8):
    """Squeeze-and-excitation gate over the channel axis of a 3-D tensor.

    The input is average-pooled over its middle axis, passed through a
    two-layer bottleneck (ReLU then sigmoid), and the resulting per-channel
    weights are multiplied back onto `x`.

    Args:
        x: Keras tensor whose last dimension is the (statically known)
            channel count.
        reduction: Divisor applied to the channel count to size the
            bottleneck layer.

    Returns:
        Tensor with the same shape as `x`, rescaled channel-wise.
    """
    ch = x.shape[-1]
    # Guard against a zero-width bottleneck when there are fewer channels
    # than `reduction`; for ch >= reduction this is the original ch // reduction.
    bottleneck_units = max(ch // reduction, 1)

    se = GlobalAveragePooling1D()(x)
    se = Dense(bottleneck_units, activation='relu')(se)
    se = Dense(ch, activation='sigmoid')(se)
    # Reshape to (1, ch) so the gate broadcasts across the pooled axis.
    se = Reshape((1, ch))(se)
    return Multiply()([x, se])

# Residual CNN Block with SE
def residual_se_cnn_block(x, filters, kernel_size, pool_size=2, drop=0.3, wd=1e-4):
    """Residual Conv1D block with a squeeze-and-excitation gate.

    Two Conv1D -> BatchNorm -> ReLU stages feed `se_block`; the result is
    added to the (optionally 1x1-projected) input, passed through ReLU,
    max-pooled and regularised with dropout.

    Args:
        x: Input Keras tensor.
        filters: Number of convolution filters (output channels).
        kernel_size: Kernel width of the two main convolutions.
        pool_size: Window of the trailing MaxPooling1D.
        drop: Dropout rate applied after pooling.
        wd: L2 regularisation factor for every convolution kernel.

    Returns:
        The transformed Keras tensor.
    """
    residual = x
    features = x

    for _ in range(2):
        conv = Conv1D(
            filters,
            kernel_size,
            padding='same',
            use_bias=False,
            kernel_regularizer=l2(wd),
        )
        features = conv(features)
        features = BatchNormalization()(features)
        features = Activation('relu')(features)

    features = se_block(features)

    # Project the skip path only when its channel count differs from `filters`.
    channels_match = residual.shape[-1] == filters
    if not channels_match:
        projection = Conv1D(
            filters,
            1,
            padding='same',
            use_bias=False,
            kernel_regularizer=l2(wd),
        )
        residual = projection(residual)
        residual = BatchNormalization()(residual)

    combined = add([features, residual])
    combined = Activation('relu')(combined)
    pooled = MaxPooling1D(pool_size)(combined)
    return Dropout(drop)(pooled)

def attention_layer(inputs):
    """Attention pooling: softmax-weighted sum of `inputs` over axis 1.

    A single-unit tanh Dense layer scores every step, the scores are
    softmax-normalised, and the weighted steps are summed.

    Args:
        inputs: 3-D Keras tensor.

    Returns:
        Keras tensor with axis 1 summed out.
    """
    step_scores = Dense(1, activation='tanh')(inputs)
    # Remove the size-1 score axis so softmax normalises across the steps.
    flat_scores = Lambda(squeeze_last_axis)(step_scores)
    attn_weights = Activation('softmax')(flat_scores)
    # Restore the trailing axis so the weights broadcast over the features.
    attn_weights = Lambda(expand_last_axis)(attn_weights)
    weighted_steps = Multiply()([inputs, attn_weights])
    return Lambda(time_sum)(weighted_steps)

In [None]:
# Optimized physics calculations using vectorization
def remove_gravity_from_acc_vectorized(acc_values, quat_values):
    """Subtract the gravity vector from accelerometer samples in one batch.

    For every row with a usable orientation quaternion, the world-frame
    gravity vector (0, 0, 9.81) is rotated into the sensor frame and
    subtracted from the acceleration. Rows whose quaternion contains NaN or
    is (numerically) all zeros are returned unchanged.

    Args:
        acc_values: Array of shape (n, 3) with acc_x, acc_y, acc_z.
        quat_values: Array of shape (n, 4) with quaternions in SciPy's
            scalar-last order (x, y, z, w).

    Returns:
        New array shaped like `acc_values`; the input is not modified.
    """
    linear_accel = acc_values.copy()

    # A quaternion is unusable if any component is NaN or all are ~0.
    has_nan = np.isnan(quat_values).any(axis=1)
    is_zero = np.isclose(quat_values, 0).all(axis=1)
    valid_mask = ~(has_nan | is_zero)

    if not valid_mask.any():
        return linear_accel

    try:
        rotations = R.from_quat(quat_values[valid_mask])
        gravity_world = np.array([0, 0, 9.81])

        # inverse=True maps the world-frame vector into each sensor frame.
        gravity_sensor_frames = rotations.apply(gravity_world, inverse=True)
        linear_accel[valid_mask] = acc_values[valid_mask] - gravity_sensor_frames
    except ValueError:
        # Best effort: SciPy rejects malformed quaternion input with
        # ValueError; in that case the raw acceleration is returned. Other
        # exceptions (e.g. KeyboardInterrupt) are no longer swallowed.
        pass

    return linear_accel

In [None]:
def calculate_angular_velocity_vectorized(quat_values, time_delta=1/200):
    """Estimate angular velocity from consecutive orientation quaternions.

    For each pair (q[i], q[i+1]) the relative rotation q[i]^-1 * q[i+1] is
    converted to a rotation vector and divided by `time_delta`. The result
    is stored at index i; the last row, and any row whose pair contains an
    unusable quaternion (NaN component or all ~0), stays zero.

    Args:
        quat_values: Array of shape (n, 4) with quaternions in SciPy's
            scalar-last order (x, y, z, w).
        time_delta: Time between consecutive samples.

    Returns:
        Array of shape (n, 3) with the angular velocity per sample.
    """
    num_samples = quat_values.shape[0]
    angular_vel = np.zeros((num_samples, 3))

    if num_samples < 2:
        return angular_vel

    def _unusable(quats):
        """Row mask: quaternion has a NaN component or is all (near) zero."""
        return np.isnan(quats).any(axis=1) | np.isclose(quats, 0).all(axis=1)

    # Process pairs in chunks to bound the size of the temporary arrays.
    chunk_size = 1000
    for start in range(0, num_samples - 1, chunk_size):
        end = min(start + chunk_size, num_samples - 1)

        q_t = quat_values[start:end]
        q_t_plus_dt = quat_values[start + 1:end + 1]

        # A pair is usable only if both of its quaternions are.
        valid_mask = ~(_unusable(q_t) | _unusable(q_t_plus_dt))
        if not valid_mask.any():
            continue

        try:
            rot_t = R.from_quat(q_t[valid_mask])
            rot_t_plus_dt = R.from_quat(q_t_plus_dt[valid_mask])
            delta_rot = rot_t.inv() * rot_t_plus_dt
            rotvecs = delta_rot.as_rotvec()
        except ValueError:
            # Best effort: SciPy rejects malformed quaternion input with
            # ValueError; leave this chunk at zero. Other exceptions
            # (e.g. KeyboardInterrupt) are no longer swallowed.
            continue

        valid_indices = np.flatnonzero(valid_mask)
        angular_vel[start + valid_indices] = rotvecs / time_delta

    return angular_vel

In [None]:
# Simplified MixUp generator
class FastMixupGenerator(Sequence):
    """Keras `Sequence` that yields MixUp-augmented training batches.

    Each batch is blended with a random permutation of itself using one
    mixing coefficient drawn from Beta(alpha, alpha). Sample order is
    reshuffled at the end of every epoch.
    """

    def __init__(self, X, y, batch_size, alpha=0.4, **kwargs):
        """Store the data and batching parameters.

        Args:
            X: Indexable array of input samples.
            y: Indexable array of targets aligned with `X`.
            batch_size: Number of samples per batch.
            alpha: Parameter of the Beta distribution for the mixing weight.
            **kwargs: Forwarded to the Keras base-class initializer.
        """
        # Initialise the Keras base class; newer Keras versions warn when a
        # Sequence subclass skips this call.
        super().__init__(**kwargs)
        self.X, self.y = X, y
        self.batch = batch_size
        self.alpha = alpha
        self.indices = np.arange(len(X))

    def __len__(self):
        """Number of batches per epoch (the last batch may be smaller)."""
        return int(np.ceil(len(self.X) / self.batch))

    def __getitem__(self, i):
        """Return the i-th batch as a (inputs, targets) pair of float32 arrays."""
        idx = self.indices[i*self.batch:(i+1)*self.batch]
        Xb, yb = self.X[idx], self.y[idx]

        # MixUp: one lambda per batch, partners chosen by permuting the batch.
        lam = np.random.beta(self.alpha, self.alpha)
        perm = np.random.permutation(len(Xb))
        X_mix = lam * Xb + (1-lam) * Xb[perm]
        y_mix = lam * yb + (1-lam) * yb[perm]

        return X_mix.astype('float32'), y_mix.astype('float32')

    def on_epoch_end(self):
        """Shuffle the sample order in place for the next epoch."""
        np.random.shuffle(self.indices)

In [None]:
# Simplified model architecture (slightly reduced complexity)
def build_two_branch_model_optimized(pad_len, imu_dim, tof_dim, n_classes, wd=1e-4):
    """Build the two-branch (IMU / ToF) sequence classifier.

    The input of shape (pad_len, imu_dim + tof_dim) is split along the
    feature axis. Each part goes through its own stack of residual SE CNN
    blocks, the branches are concatenated, fed to a bidirectional GRU, and
    summarised by attention pooling plus global average pooling before a
    two-layer dense classifier with a softmax output.

    Args:
        pad_len: Length of the padded input sequences.
        imu_dim: Number of leading feature columns routed to the IMU branch.
        tof_dim: Number of remaining feature columns routed to the ToF branch.
        n_classes: Number of output classes.
        wd: L2 regularisation factor for conv, GRU and dense kernels.

    Returns:
        An uncompiled Keras `Model`.
    """
    def dense_bn_relu(tensor, units, drop_rate):
        """Dense (no bias) -> BatchNorm -> ReLU -> Dropout."""
        tensor = Dense(units, use_bias=False, kernel_regularizer=l2(wd))(tensor)
        tensor = BatchNormalization()(tensor)
        tensor = Activation('relu')(tensor)
        return Dropout(drop_rate)(tensor)

    inputs = Input(shape=(pad_len, imu_dim+tof_dim))
    imu_part = Lambda(lambda t: t[:, :, :imu_dim])(inputs)
    tof_part = Lambda(lambda t: t[:, :, imu_dim:])(inputs)

    # IMU branch: two residual SE blocks.
    imu_feat = imu_part
    for n_filters in (64, 128):
        imu_feat = residual_se_cnn_block(imu_feat, n_filters, 5, drop=0.1, wd=wd)

    # ToF branch: pointwise projection followed by two residual SE blocks.
    tof_feat = Conv1D(128, 1, padding='same', use_bias=False,
                      kernel_regularizer=l2(wd))(tof_part)
    tof_feat = BatchNormalization()(tof_feat)
    tof_feat = Activation('relu')(tof_feat)
    for n_filters in (192, 256):
        tof_feat = residual_se_cnn_block(tof_feat, n_filters, 3, drop=0.2, wd=wd)

    # Fuse the branches along the channel axis.
    fused = Concatenate()([imu_feat, tof_feat])

    # Single bidirectional recurrent layer over the fused sequence.
    recurrent = GRU(128, return_sequences=True, kernel_regularizer=l2(wd),
                    dropout=0.2, recurrent_dropout=0.2)
    seq = Bidirectional(recurrent)(fused)
    seq = Dropout(0.3)(seq)

    # Two sequence summaries: attention pooling and plain average pooling.
    attended = attention_layer(seq)
    averaged = GlobalAveragePooling1D()(seq)
    head = Concatenate()([attended, averaged])

    # Classifier head.
    head = dense_bn_relu(head, 256, 0.4)
    head = dense_bn_relu(head, 128, 0.3)

    # Output layers pinned to float32 for stability with mixed precision.
    logits = Dense(n_classes, kernel_regularizer=l2(wd), dtype='float32')(head)
    probs = Activation('softmax', dtype='float32')(logits)

    return Model(inputs, probs)

In [None]:
# Parallel processing for sequence features
def process_sequence(seq_data):
    """Add the engineered IMU and ToF features to one sequence.

    Works on a copy of the input frame and appends:
      * linear_acc_x/y/z  - acceleration with gravity removed,
      * acc_mag, linear_acc_mag - Euclidean norms of raw / linear acceleration,
      * linear_acc_mag_jerk - gradient of linear_acc_mag scaled by 200,
      * angular_vel_x/y/z - angular velocity derived from the quaternions,
      * tof_{i}_mean/std/min/max for i in 1..5 - per-row statistics over the
        64 pixels of each ToF sensor, treating -1 as missing.

    Args:
        seq_data: DataFrame holding the rows of one sequence, with the
            acc_*, rot_* and tof_{i}_v{p} columns.

    Returns:
        A new DataFrame with the extra feature columns.
    """
    seq_data = seq_data.copy()

    # Pull plain arrays once; the helpers operate on NumPy data.
    acc_values = seq_data[['acc_x', 'acc_y', 'acc_z']].values
    quat_values = seq_data[['rot_x', 'rot_y', 'rot_z', 'rot_w']].values

    # Linear acceleration
    linear_accel = remove_gravity_from_acc_vectorized(acc_values, quat_values)
    seq_data['linear_acc_x'] = linear_accel[:, 0]
    seq_data['linear_acc_y'] = linear_accel[:, 1]
    seq_data['linear_acc_z'] = linear_accel[:, 2]

    # Magnitudes
    linear_acc_mag = np.linalg.norm(linear_accel, axis=1)
    seq_data['acc_mag'] = np.linalg.norm(acc_values, axis=1)
    seq_data['linear_acc_mag'] = linear_acc_mag

    # Jerk (simplified). np.gradient needs at least two samples, so a
    # one-row sequence gets zero jerk instead of raising.
    if len(linear_acc_mag) > 1:
        jerk = np.gradient(linear_acc_mag) * 200
    else:
        jerk = np.zeros_like(linear_acc_mag)
    seq_data['linear_acc_mag_jerk'] = jerk

    # Angular velocity
    angular_vel = calculate_angular_velocity_vectorized(quat_values)
    seq_data['angular_vel_x'] = angular_vel[:, 0]
    seq_data['angular_vel_y'] = angular_vel[:, 1]
    seq_data['angular_vel_z'] = angular_vel[:, 2]

    # ToF aggregations (vectorized)
    for i in range(1, 6):
        pixel_cols = [f"tof_{i}_v{p}" for p in range(64)]
        # Force a float array so the -1 sentinel can become NaN even when
        # the pixel columns were parsed as integers.
        tof_data = seq_data[pixel_cols].to_numpy(dtype=float)
        tof_data = np.where(tof_data == -1, np.nan, tof_data)

        # Rows where every pixel is missing trigger RuntimeWarnings in the
        # nan-aware reductions; silence them and keep the NaN result.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", category=RuntimeWarning)
            seq_data[f'tof_{i}_mean'] = np.nanmean(tof_data, axis=1)
            seq_data[f'tof_{i}_std'] = np.nanstd(tof_data, axis=1)
            seq_data[f'tof_{i}_min'] = np.nanmin(tof_data, axis=1)
            seq_data[f'tof_{i}_max'] = np.nanmax(tof_data, axis=1)

    return seq_data

In [None]:
# End-to-end training cell: load the raw tables, engineer per-sequence
# features, scale and pad them into one tensor, then train one model per
# cross-validation fold. Runs only when the `train` flag is set.
if train:
    print("Loading dataset...")
    
    # Load data
    # Demographics are left-joined on 'subject', so every sensor row is kept.
    df = pd.read_csv(raw_dir / "train.csv")
    train_dem_df = pd.read_csv(raw_dir / "train_demographics.csv")
    df = pd.merge(df, train_dem_df, on='subject', how='left')
    
    # Encode labels
    # The fitted class order is saved so integer predictions can be mapped
    # back to gesture names later.
    le = LabelEncoder()
    df['gesture_int'] = le.fit_transform(df['gesture'])
    gesture_classes = le.classes_
    np.save(output_dir / "gesture_classes.npy", gesture_classes)
    
    print("Processing sequences with parallel computation...")
    
    # Group by sequence
    sequences = [group for _, group in df.groupby('sequence_id')]
    
    # Process in parallel
    # One process_sequence call per sequence, spread over every reported CPU core.
    n_cores = multiprocessing.cpu_count()
    print(f"Using {n_cores} cores for parallel processing")
    
    processed_sequences = Parallel(n_jobs=n_cores)(
        delayed(process_sequence)(seq) for seq in sequences
    )
    
    # Combine processed sequences
    df = pd.concat(processed_sequences, ignore_index=True)
    
    # Define feature columns
    # Column order matters: the model slices the first `imu_dim` features
    # into the IMU branch and the remainder into the THM/ToF branch.
    imu_cols = ['acc_x', 'acc_y', 'acc_z', 
                'linear_acc_x', 'linear_acc_y', 'linear_acc_z',
                'rot_x', 'rot_y', 'rot_z', 'rot_w',
                'acc_mag', 'linear_acc_mag', 'linear_acc_mag_jerk',
                'angular_vel_x', 'angular_vel_y', 'angular_vel_z']
    
    thm_cols = [c for c in df.columns if c.startswith('thm_')]
    tof_aggregated_cols = []
    for i in range(1, 6):
        tof_aggregated_cols.extend([
            f'tof_{i}_mean', f'tof_{i}_std', f'tof_{i}_min', f'tof_{i}_max'
        ])
    
    final_feature_cols = imu_cols + thm_cols + tof_aggregated_cols
    imu_dim = len(imu_cols)
    tof_thm_dim = len(thm_cols) + len(tof_aggregated_cols)
    
    print(f"IMU features: {imu_dim} | THM + ToF features: {tof_thm_dim}")
    np.save(output_dir / "feature_cols.npy", np.array(final_feature_cols))
    
    # Build sequences efficiently
    # Per sequence: a float32 feature matrix (NaN replaced by 0), the label
    # and subject taken from the first row, and the sequence length.
    print("Building sequences...")
    seq_gp = df.groupby('sequence_id')
    X_list_unscaled = []
    y_list_int = []
    groups_list = []
    lens = []
    
    for seq_id, seq_df in seq_gp:
        X_list_unscaled.append(
            seq_df[final_feature_cols].fillna(0).values.astype('float32')
        )
        y_list_int.append(seq_df['gesture_int'].iloc[0])
        groups_list.append(seq_df['subject'].iloc[0])
        lens.append(len(seq_df))
    
    # Scaling
    # NOTE(review): the scaler is fit on the time steps of ALL sequences,
    # before the CV split below, so validation folds contribute to the
    # scaling statistics. The fitted scaler is also persisted to scaler.pkl.
    print("Fitting StandardScaler...")
    all_steps_concatenated = np.concatenate(X_list_unscaled, axis=0)
    scaler = StandardScaler().fit(all_steps_concatenated)
    joblib.dump(scaler, output_dir / "scaler.pkl")
    
    # Scale and pad
    # The padded length is the `pad_percentile`-th percentile of sequence
    # lengths; longer sequences are truncated at the end, shorter ones are
    # padded at the end.
    X_scaled_list = [scaler.transform(x_seq) for x_seq in X_list_unscaled]
    pad_len = int(np.percentile(lens, pad_percentile))
    np.save(output_dir / "sequence_maxlen.npy", pad_len)
    
    X = pad_sequences(X_scaled_list, maxlen=pad_len, padding='post', 
                      truncating='post', dtype='float32')
    y_stratify = np.array(y_list_int)
    groups = np.array(groups_list)
    y = to_categorical(y_list_int, num_classes=len(le.classes_))
    
    print(f"Final data shape: X={X.shape}, y={y.shape}")
    
    # Cross-validation with reduced folds
    # Folds are stratified on the integer gesture label and grouped by
    # subject, so a subject's sequences never appear in both train and val.
    print(f"\nStarting training with {n_splits}-fold CV...")
    sgkf = StratifiedGroupKFold(n_splits=n_splits, shuffle=True, random_state=42)
    
    for fold, (train_idx, val_idx) in enumerate(sgkf.split(X, y_stratify, groups)):
        print(f"\n{'='*20} FOLD {fold+1}/{n_splits} {'='*20}")
        
        X_tr, X_val = X[train_idx], X[val_idx]
        y_tr, y_val = y[train_idx], y[val_idx]
        
        # Build model
        # A fresh, untrained model is created for every fold.
        model = build_two_branch_model_optimized(
            pad_len=pad_len, 
            imu_dim=imu_dim, 
            tof_dim=tof_thm_dim, 
            n_classes=len(le.classes_), 
            wd=wd
        )
        
        # Compile with mixed precision optimizer
        # Adam is wrapped in a LossScaleOptimizer to match the global
        # mixed_float16 policy.
        opt = Adam(learning_rate=lr_init)
        opt = mixed_precision.LossScaleOptimizer(opt)
        
        model.compile(
            optimizer=opt,
            loss=tf.keras.losses.CategoricalCrossentropy(label_smoothing=0.1),
            metrics=['accuracy']
        )
        
        # Simplified callbacks
        # EarlyStopping and ModelCheckpoint both track val_loss;
        # ReduceLROnPlateau is given no monitor and so uses its library default.
        callbacks = [
            EarlyStopping(
                monitor='val_loss', 
                patience=patience, 
                restore_best_weights=True, 
                verbose=1
            ),
            tf.keras.callbacks.ReduceLROnPlateau(
                factor=0.5, 
                patience=10, 
                min_lr=1e-6, 
                verbose=1
            ),
            tf.keras.callbacks.ModelCheckpoint(
                str(output_dir / f'model_fold_{fold}_best.h5'),
                save_best_only=True,
                monitor='val_loss'
            )
        ]
        
        # Train with larger batch size
        # MixUp is applied to training batches only; validation uses the
        # unaugmented arrays.
        train_gen = FastMixupGenerator(
            X_tr, y_tr, 
            batch_size=batch_size, 
            alpha=mixup_alpha
        )
        
        history = model.fit(
            train_gen,
            epochs=epochs,
            validation_data=(X_val, y_val),
            callbacks=callbacks,
            verbose=1
        )
        
        # Save model
        # Written in addition to the per-fold "_best" checkpoint above.
        model.save(output_dir / f"model_fold_{fold}_final.h5")
        
        # Clear session
        # Resets Keras global state before the next fold builds its model.
        tf.keras.backend.clear_session()
    
    print("\nTraining completed!")