In [1]:
import numpy as np

In [2]:
# Load the three sensor modalities and the labels from the unscaled archive.
# For .npz archives np.load returns a lazily-read NpzFile that keeps the file
# open; the context manager closes it once every array has been materialized.
with np.load("X_imu_train X_thermal_train X_tof_train Y unscaled.npz") as data:
    X_imu_train = data["X_imu_train"]
    X_thermal_train = data["X_thermal_train"]
    X_tof_train = data["X_tof_train"]
    Y = data["Y"]

# Print the shapes for a quick visual check that the sample counts line up.
print(X_imu_train.shape, X_thermal_train.shape, X_tof_train.shape, Y.shape)

(8151, 400, 7) (8151, 400, 5) (8151, 400, 320) (8151,)


In [3]:
from sklearn.model_selection import train_test_split
import numpy as np

def split_multimodal_data(X_imu, X_thermal, X_tof, y, test_size=0.15, random_state=42):
    """
    Split aligned multi-modal arrays into stratified train/validation subsets.

    One set of sample indices is split once and then applied to every modality
    and to the labels, so row i of each returned array refers to the same
    sample.

    Args:
        X_imu: IMU array, indexed by sample along axis 0.
        X_thermal: Thermal array, indexed by sample along axis 0.
        X_tof: ToF array, indexed by sample along axis 0.
        y: Label array with one entry per sample; also used for stratification.
        test_size: Forwarded to ``train_test_split`` (default 0.15).
        random_state: Forwarded to ``train_test_split`` (default 42).

    Returns:
        Two tuples: ``(X_train_imu, X_train_thermal, X_train_tof, y_train)``
        and ``(X_val_imu, X_val_thermal, X_val_tof, y_val)``.

    Raises:
        ValueError: If the inputs do not all contain the same number of samples.
    """
    # Validate alignment BEFORE splitting. Checking the outputs afterwards is
    # useless: they are all indexed by the same index arrays, so their lengths
    # always agree even when an input was longer and got silently truncated.
    sample_counts = {
        "X_imu": X_imu.shape[0],
        "X_thermal": X_thermal.shape[0],
        "X_tof": X_tof.shape[0],
        "y": y.shape[0],
    }
    if len(set(sample_counts.values())) != 1:
        raise ValueError(
            f"All inputs must have the same number of samples, got {sample_counts}"
        )

    n_samples = sample_counts["X_imu"]

    # Split sample indices rather than the arrays themselves so that every
    # modality is partitioned identically.
    indices = np.arange(n_samples)
    train_idx, val_idx = train_test_split(
        indices,
        test_size=test_size,
        random_state=random_state,
        stratify=y,  # Maintain class distribution
    )

    # Apply the shared indices to each modality and to the labels.
    X_train_imu, X_val_imu = X_imu[train_idx], X_imu[val_idx]
    X_train_thermal, X_val_thermal = X_thermal[train_idx], X_thermal[val_idx]
    X_train_tof, X_val_tof = X_tof[train_idx], X_tof[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]

    print(f"Training samples: {X_train_imu.shape[0]}")
    print(f"Validation samples: {X_val_imu.shape[0]}")

    return (X_train_imu, X_train_thermal, X_train_tof, y_train), (X_val_imu, X_val_thermal, X_val_tof, y_val)

In [4]:
# Run the stratified split once, then unpack each partition by modality.
train_data, val_data = split_multimodal_data(
    X_imu_train, X_thermal_train, X_tof_train, Y
)
(X_train_imu, X_train_thermal, X_train_tof, y_train) = train_data
(X_val_imu, X_val_thermal, X_val_tof, y_val) = val_data

# Model inputs are passed as [IMU, thermal, ToF] lists (labels excluded).
X_train_list = list(train_data[:3])
X_val_list = list(val_data[:3])

Training samples: 6928
Validation samples: 1223


In [5]:
# Per-modality shapes: training split first, validation split second.
print(*(arr.shape for arr in (X_train_imu, X_train_thermal, X_train_tof)))
print(*(arr.shape for arr in (X_val_imu, X_val_thermal, X_val_tof)))

(6928, 400, 7) (6928, 400, 5) (6928, 400, 320)
(1223, 400, 7) (1223, 400, 5) (1223, 400, 320)


In [6]:
import gc

# Drop the notebook-level names of the full, pre-split arrays and ask the
# garbage collector to run.
del X_imu_train, X_thermal_train, X_tof_train
gc.collect()

0

In [7]:
import gc

# Remove the individual per-modality names. The arrays themselves stay alive
# because X_train_list / X_val_list still hold references to them.
del X_train_imu, X_train_thermal, X_train_tof
del X_val_imu, X_val_thermal, X_val_tof
gc.collect()

0

In [10]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import (
    Input, Conv1D, LSTM, Dense, Dropout, BatchNormalization,
    GlobalAveragePooling1D, LayerNormalization, Masking, 
    Concatenate, MultiHeadAttention, Reshape, Flatten, Add
)
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy
import numpy as np

def create_imu_stream(input_shape, stream_name="imu"):
    """
    Build the IMU branch: Masking -> 3 x (Conv1D, BatchNorm, Dropout) -> 2 x LSTM.

    Args:
        input_shape: Per-sample input shape without the batch axis
            (the model builder passes (400, 7) by default).
        stream_name: Prefix used for every layer name in this branch.

    Returns:
        (input_layer, features): the branch's Input tensor and the output of
        the last LSTM (64 units, return_sequences=False).

    NOTE(review): the Masking layer feeds Conv1D layers; confirm that the mask
    still reaches the LSTMs in the Keras version in use.
    """
    inputs = Input(shape=input_shape, name=f'{stream_name}_input')
    features = Masking(mask_value=0.0, name=f'{stream_name}_masking')(inputs)

    # Three convolutional blocks (64 -> 128 -> 64 filters) for local temporal patterns.
    for block_no, n_filters in enumerate((64, 128, 64), start=1):
        features = Conv1D(
            n_filters, 3,
            activation='relu',
            padding='same',
            name=f'{stream_name}_conv{block_no}',
        )(features)
        features = BatchNormalization(name=f'{stream_name}_bn{block_no}')(features)
        features = Dropout(0.3, name=f'{stream_name}_dropout{block_no}')(features)

    # Stacked LSTMs: the first keeps the time axis, the second collapses it.
    features = LSTM(
        128, return_sequences=True, dropout=0.3, recurrent_dropout=0.3,
        name=f'{stream_name}_lstm1',
    )(features)
    features = LSTM(
        64, return_sequences=False, dropout=0.3, recurrent_dropout=0.3,
        name=f'{stream_name}_lstm2',
    )(features)

    return inputs, features

def create_thermal_stream(input_shape, stream_name="thermal"):
    """
    Build the thermal branch: Masking -> 3 x (Conv1D, BatchNorm, Dropout) -> 2 x LSTM.

    Args:
        input_shape: Per-sample input shape without the batch axis
            (the model builder passes (400, 5) by default).
        stream_name: Prefix used for every layer name in this branch.

    Returns:
        (input_layer, features): the branch's Input tensor and the output of
        the last LSTM (32 units, return_sequences=False).

    NOTE(review): the Masking layer feeds Conv1D layers; confirm that the mask
    still reaches the LSTMs in the Keras version in use.
    """
    inputs = Input(shape=input_shape, name=f'{stream_name}_input')
    features = Masking(mask_value=0.0, name=f'{stream_name}_masking')(inputs)

    # Three convolutional blocks (32 -> 64 -> 32 filters) for thermal patterns.
    for block_no, n_filters in enumerate((32, 64, 32), start=1):
        features = Conv1D(
            n_filters, 3,
            activation='relu',
            padding='same',
            name=f'{stream_name}_conv{block_no}',
        )(features)
        features = BatchNormalization(name=f'{stream_name}_bn{block_no}')(features)
        features = Dropout(0.3, name=f'{stream_name}_dropout{block_no}')(features)

    # Stacked LSTMs: the first keeps the time axis, the second collapses it.
    features = LSTM(
        64, return_sequences=True, dropout=0.3, recurrent_dropout=0.3,
        name=f'{stream_name}_lstm1',
    )(features)
    features = LSTM(
        32, return_sequences=False, dropout=0.3, recurrent_dropout=0.3,
        name=f'{stream_name}_lstm2',
    )(features)

    return inputs, features

def create_tof_stream(input_shape, stream_name="tof"):
    """
    Build the ToF branch: missing-value cleanup -> Masking ->
    3 x (Conv1D, BatchNorm, Dropout) -> 2 x LSTM.

    The trailing feature axis is treated as Conv1D channels (Conv1D is used
    here instead of Conv2D).

    Args:
        input_shape: Per-sample input shape without the batch axis
            (the model builder passes (400, 320) by default).
        stream_name: Prefix used for every layer name in this branch.

    Returns:
        (input_layer, features): the branch's Input tensor and the output of
        the last LSTM (32 units, return_sequences=False).

    NOTE(review): the Masking layer feeds Conv1D layers; confirm that the mask
    still reaches the LSTMs in the Keras version in use.
    """
    inputs = Input(shape=input_shape, name=f'{stream_name}_input')

    # Readings equal to -1 are rewritten to 0.0 so the Masking layer (mask_value=0.0)
    # sees them as zeros.
    cleaned = tf.keras.layers.Lambda(
        lambda t: tf.where(t == -1, 0.0, t),
        name=f'{stream_name}_handle_missing',
    )(inputs)
    features = Masking(mask_value=0.0, name=f'{stream_name}_masking')(cleaned)

    # Three convolutional blocks (64 -> 128 -> 64 filters) over the time axis.
    for block_no, n_filters in enumerate((64, 128, 64), start=1):
        features = Conv1D(
            n_filters, 3,
            activation='relu',
            padding='same',
            name=f'{stream_name}_conv{block_no}',
        )(features)
        features = BatchNormalization(name=f'{stream_name}_bn{block_no}')(features)
        features = Dropout(0.3, name=f'{stream_name}_dropout{block_no}')(features)

    # Stacked LSTMs: the first keeps the time axis, the second collapses it.
    features = LSTM(
        64, return_sequences=True, dropout=0.3, recurrent_dropout=0.3,
        name=f'{stream_name}_lstm1',
    )(features)
    features = LSTM(
        32, return_sequences=False, dropout=0.3, recurrent_dropout=0.3,
        name=f'{stream_name}_lstm2',
    )(features)

    return inputs, features



def create_attention_fusion(feature_streams, fusion_dim=128):
    """
    Fuse several per-stream feature vectors with multi-head self-attention.

    Each stream is projected to ``fusion_dim`` (Dense + LayerNormalization),
    the projections are stacked along a new axis 1 so that each stream acts as
    one attention token, self-attention with a residual connection is applied,
    and the tokens are averaged into a single vector.

    Args:
        feature_streams: List of feature tensors, one per stream.
        fusion_dim: Common projection width; the attention key_dim is
            ``fusion_dim // 4`` for the 4 heads.

    Returns:
        The fused feature tensor produced by GlobalAveragePooling1D.
    """
    n_heads = 4

    # Bring every stream to the same width so the streams can be stacked.
    aligned = []
    for idx, stream in enumerate(feature_streams):
        dense_out = Dense(fusion_dim, activation='relu', name=f'projection_{idx}')(stream)
        aligned.append(LayerNormalization(name=f'projection_ln_{idx}')(dense_out))

    # One token per stream: stack along a new axis 1.
    tokens = tf.keras.layers.Lambda(
        lambda parts: tf.stack(parts, axis=1),
        name='stack_features',
    )(aligned)

    # Self-attention across the stream tokens (query and value are the same tensor).
    self_attention = MultiHeadAttention(
        num_heads=n_heads,
        key_dim=fusion_dim // n_heads,
        dropout=0.1,
        name='multihead_attention',
    )
    attended = self_attention(tokens, tokens)

    # Residual connection followed by layer normalization.
    residual = Add(name='attention_residual')([tokens, attended])
    normalized = LayerNormalization(name='attention_ln')(residual)

    # Average over the token axis to obtain the fused representation.
    return GlobalAveragePooling1D(name='attention_pool')(normalized)


In [11]:
def create_multimodal_bfrb_model(
    imu_shape=(400, 7),
    thermal_shape=(400, 5),
    tof_shape=(400, 320),
    n_classes=18,
    fusion_dim=128
):
    """
    Assemble the three-stream (IMU, thermal, ToF) classifier with attention fusion.

    Args:
        imu_shape: Input shape for the IMU stream (no batch axis).
        thermal_shape: Input shape for the thermal stream (no batch axis).
        tof_shape: Input shape for the ToF stream (no batch axis).
        n_classes: Number of units in the softmax output layer.
        fusion_dim: Projection width passed to create_attention_fusion.

    Returns:
        An uncompiled Keras Model named 'Attention_MultiModal_BFRB' taking
        [imu, thermal, tof] inputs and producing class probabilities.
    """
    # One Conv1D + LSTM branch per sensor modality.
    imu_input, imu_features = create_imu_stream(imu_shape, "imu")
    thermal_input, thermal_features = create_thermal_stream(thermal_shape, "thermal")
    tof_input, tof_features = create_tof_stream(tof_shape, "tof")

    # Merge the branch outputs through attention-based fusion.
    fused = create_attention_fusion(
        [imu_features, thermal_features, tof_features],
        fusion_dim,
    )

    # Classification head: two Dense/Dropout stages (128 then 64 units).
    hidden = fused
    for stage, units in enumerate((128, 64), start=1):
        hidden = Dense(units, activation='relu', name=f'fusion_dense{stage}')(hidden)
        hidden = Dropout(0.4, name=f'fusion_dropout{stage}')(hidden)

    class_probs = Dense(n_classes, activation='softmax', name='gesture_output')(hidden)

    return Model(
        inputs=[imu_input, thermal_input, tof_input],
        outputs=class_probs,
        name='Attention_MultiModal_BFRB',
    )

In [12]:
# Build the three-stream model with its default shapes and configure training.
all_sensor_fusion_model = create_multimodal_bfrb_model()
all_sensor_fusion_model.compile(
    optimizer=Adam(learning_rate=0.001),
    # Labels are integer class ids and the model outputs probabilities, not logits.
    loss=SparseCategoricalCrossentropy(from_logits=False),
    metrics=['accuracy', 'sparse_top_k_categorical_accuracy'],
)
# all_sensor_fusion_model.summary()  # uncomment to inspect the architecture

2025-09-01 15:27:27.785699: E external/local_xla/xla/stream_executor/cuda/cuda_platform.cc:51] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: CUDA_ERROR_UNKNOWN: unknown error
2025-09-01 15:27:27.820202: I external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:171] verbose logging is disabled. Rerun with verbose logging (usually --v=1 or --vmodule=cuda_diagnostics=1) to get more diagnostic output from this module
2025-09-01 15:27:27.820218: I external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:176] retrieving CUDA diagnostic information for host: ankur-Legion-5-15IRX9
2025-09-01 15:27:27.820221: I external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:183] hostname: ankur-Legion-5-15IRX9
2025-09-01 15:27:27.820336: I external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:190] libcuda reported version is: 575.64.3
2025-09-01 15:27:27.820352: I external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:194] kernel reported 

In [14]:
    
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
    
# Stop early on stalled validation loss, halve the learning rate on plateaus,
# and keep only the best checkpoint on disk.
callbacks = [
    EarlyStopping(
        monitor='val_loss',
        patience=8,
        restore_best_weights=True,
        verbose=1,
    ),
    ReduceLROnPlateau(
        factor=0.5,
        patience=3,
        min_lr=1e-7,
        verbose=1,
    ),
    ModelCheckpoint(
        'best_simple_fusion_multimodal_model.h5',
        save_best_only=True,
        verbose=1,
    ),
]

# Fit on the [IMU, thermal, ToF] input lists and validate on the held-out split.
history = all_sensor_fusion_model.fit(
    x=X_train_list,
    y=y_train,
    validation_data=(X_val_list, y_val),
    batch_size=32,
    epochs=100,
    callbacks=callbacks,
    verbose=1,
)
history

Epoch 1/100
[1m217/217[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 334ms/step - accuracy: 0.0851 - loss: 2.8314 - sparse_top_k_categorical_accuracy: 0.3794
Epoch 1: val_loss improved from None to 2.79409, saving model to best_simple_fusion_multimodal_model.h5




[1m217/217[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m77s[0m 356ms/step - accuracy: 0.0798 - loss: 2.8201 - sparse_top_k_categorical_accuracy: 0.3786 - val_accuracy: 0.0801 - val_loss: 2.7941 - val_sparse_top_k_categorical_accuracy: 0.3925 - learning_rate: 0.0010
Epoch 2/100
[1m217/217[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 357ms/step - accuracy: 0.0737 - loss: 2.7966 - sparse_top_k_categorical_accuracy: 0.3797
Epoch 2: val_loss improved from 2.79409 to 2.76609, saving model to best_simple_fusion_multimodal_model.h5




[1m217/217[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 372ms/step - accuracy: 0.0722 - loss: 2.7940 - sparse_top_k_categorical_accuracy: 0.3736 - val_accuracy: 0.0801 - val_loss: 2.7661 - val_sparse_top_k_categorical_accuracy: 0.3941 - learning_rate: 0.0010
Epoch 3/100
[1m217/217[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 362ms/step - accuracy: 0.0784 - loss: 2.7822 - sparse_top_k_categorical_accuracy: 0.3799
Epoch 3: val_loss improved from 2.76609 to 2.76385, saving model to best_simple_fusion_multimodal_model.h5




[1m217/217[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 375ms/step - accuracy: 0.0801 - loss: 2.7817 - sparse_top_k_categorical_accuracy: 0.3776 - val_accuracy: 0.0801 - val_loss: 2.7639 - val_sparse_top_k_categorical_accuracy: 0.3925 - learning_rate: 0.0010
Epoch 4/100
[1m217/217[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 340ms/step - accuracy: 0.0805 - loss: 2.7781 - sparse_top_k_categorical_accuracy: 0.3817
Epoch 4: val_loss improved from 2.76385 to 2.75861, saving model to best_simple_fusion_multimodal_model.h5




[1m217/217[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m77s[0m 353ms/step - accuracy: 0.0837 - loss: 2.7727 - sparse_top_k_categorical_accuracy: 0.3899 - val_accuracy: 0.0801 - val_loss: 2.7586 - val_sparse_top_k_categorical_accuracy: 0.3925 - learning_rate: 0.0010
Epoch 5/100
[1m217/217[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 354ms/step - accuracy: 0.0819 - loss: 2.7806 - sparse_top_k_categorical_accuracy: 0.3830
Epoch 5: val_loss improved from 2.75861 to 2.75713, saving model to best_simple_fusion_multimodal_model.h5




[1m217/217[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 370ms/step - accuracy: 0.0844 - loss: 2.7736 - sparse_top_k_categorical_accuracy: 0.3835 - val_accuracy: 0.0801 - val_loss: 2.7571 - val_sparse_top_k_categorical_accuracy: 0.3941 - learning_rate: 0.0010
Epoch 6/100
[1m217/217[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 364ms/step - accuracy: 0.0810 - loss: 2.7595 - sparse_top_k_categorical_accuracy: 0.3994
Epoch 6: val_loss did not improve from 2.75713
[1m217/217[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 380ms/step - accuracy: 0.0746 - loss: 2.7654 - sparse_top_k_categorical_accuracy: 0.3887 - val_accuracy: 0.0801 - val_loss: 2.7591 - val_sparse_top_k_categorical_accuracy: 0.3941 - learning_rate: 0.0010
Epoch 7/100
[1m217/217[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 350ms/step - accuracy: 0.0849 - loss: 2.7654 - sparse_top_k_categorical_accuracy: 0.3995
Epoch 7: val_loss improved from 2.75713 to 2.75683, saving model to best_simple



[1m217/217[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m79s[0m 364ms/step - accuracy: 0.0817 - loss: 2.7649 - sparse_top_k_categorical_accuracy: 0.3933 - val_accuracy: 0.0801 - val_loss: 2.7568 - val_sparse_top_k_categorical_accuracy: 0.3941 - learning_rate: 0.0010
Epoch 8/100
[1m217/217[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 336ms/step - accuracy: 0.0805 - loss: 2.7590 - sparse_top_k_categorical_accuracy: 0.3913
Epoch 8: val_loss did not improve from 2.75683
[1m217/217[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m76s[0m 350ms/step - accuracy: 0.0800 - loss: 2.7622 - sparse_top_k_categorical_accuracy: 0.3912 - val_accuracy: 0.0801 - val_loss: 2.7569 - val_sparse_top_k_categorical_accuracy: 0.3941 - learning_rate: 0.0010
Epoch 9/100
[1m217/217[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 335ms/step - accuracy: 0.0896 - loss: 2.7560 - sparse_top_k_categorical_accuracy: 0.3993
Epoch 9: val_loss improved from 2.75683 to 2.75567, saving model to best_simple



[1m217/217[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m76s[0m 349ms/step - accuracy: 0.0839 - loss: 2.7625 - sparse_top_k_categorical_accuracy: 0.3930 - val_accuracy: 0.0801 - val_loss: 2.7557 - val_sparse_top_k_categorical_accuracy: 0.3941 - learning_rate: 0.0010
Epoch 10/100
[1m217/217[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 336ms/step - accuracy: 0.0748 - loss: 2.7721 - sparse_top_k_categorical_accuracy: 0.3858
Epoch 10: val_loss improved from 2.75567 to 2.75524, saving model to best_simple_fusion_multimodal_model.h5




[1m217/217[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m76s[0m 350ms/step - accuracy: 0.0801 - loss: 2.7619 - sparse_top_k_categorical_accuracy: 0.3881 - val_accuracy: 0.0801 - val_loss: 2.7552 - val_sparse_top_k_categorical_accuracy: 0.3941 - learning_rate: 0.0010
Epoch 11/100
[1m217/217[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 319ms/step - accuracy: 0.0829 - loss: 2.7553 - sparse_top_k_categorical_accuracy: 0.3939
Epoch 11: val_loss did not improve from 2.75524
[1m217/217[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m72s[0m 331ms/step - accuracy: 0.0784 - loss: 2.7622 - sparse_top_k_categorical_accuracy: 0.3834 - val_accuracy: 0.0801 - val_loss: 2.7556 - val_sparse_top_k_categorical_accuracy: 0.3941 - learning_rate: 0.0010
Epoch 12/100
[1m217/217[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 314ms/step - accuracy: 0.0745 - loss: 2.7729 - sparse_top_k_categorical_accuracy: 0.3838
Epoch 12: val_loss did not improve from 2.75524
[1m217/217[0m [32m━━━━━━━



[1m217/217[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m75s[0m 346ms/step - accuracy: 0.0792 - loss: 2.7595 - sparse_top_k_categorical_accuracy: 0.3850 - val_accuracy: 0.0801 - val_loss: 2.7552 - val_sparse_top_k_categorical_accuracy: 0.3941 - learning_rate: 5.0000e-04
Epoch 17/100
[1m 71/217[0m [32m━━━━━━[0m[37m━━━━━━━━━━━━━━[0m [1m53s[0m 365ms/step - accuracy: 0.0730 - loss: 2.7708 - sparse_top_k_categorical_accuracy: 0.3876

KeyboardInterrupt: 