#  CogniVue: Training

** FULLY COMPATIBLE with your preprocessing notebook!**

This notebook:
-  Loads data from single `.pkl` files per split
-  Handles variable channel count (typically 58 channels)
-  Correctly transposes X from `(n_ch, 256)` to `(256, n_ch)`
-  Robust checkpointing and error handling
-  Resume training from interruptions
-  



##  1. Install Dependencies

##  2. Configuration

In [1]:
import os
import sys
import json
import time
import shutil
import numpy as np
import tensorflow as tf
import pickle
from datetime import datetime

print(f"TensorFlow version: {tf.__version__}")
print(f"Python version: {sys.version}")

# =====================================================
# PATHS CONFIGURATION
# =====================================================

# UPDATE THIS to match your dataset name!
DATASET_NAME = "preprocessed-cog-eeg-dataset"  

# Input paths
DATA_INPUT_DIR = f"/kaggle/input/{DATASET_NAME}/processed"

# Output paths
WORKING_DIR = "/kaggle/working"
CHECKPOINT_DIR = os.path.join(WORKING_DIR, "checkpoints")
RESULTS_DIR = os.path.join(WORKING_DIR, "results")
LOGS_DIR = os.path.join(WORKING_DIR, "logs")

for d in [CHECKPOINT_DIR, RESULTS_DIR, LOGS_DIR]:
    os.makedirs(d, exist_ok=True)

print(f"\n Paths configured:")
print(f"  Input: {DATA_INPUT_DIR}")
print(f"  Checkpoints: {CHECKPOINT_DIR}")
print(f"  Results: {RESULTS_DIR}")

# =====================================================
# DATA CONSTANTS (from preprocessing)
# =====================================================

WINDOW_SIZE_SAMPLES = 256
NUM_BANDS = 5  # delta, theta, alpha, beta, gamma
NUM_TASKS = 4  # N-back, MATB-II, PVT, Flanker

# Output classes
NUM_OUTPUT_REGIONS = 7
NUM_OUTPUT_BANDS = 5
NUM_OUTPUT_STATES = 4

# Note: NUM_CHANNELS and NUM_OUTPUT_CHANNELS will be determined from data!

# =====================================================
# MODEL HYPERPARAMETERS
# =====================================================

D_MODEL = 256
NUM_LAYERS = 6
NUM_HEADS = 8
FF_DIM = 1024
DROPOUT = 0.15

BANDPOWER_HIDDEN_DIM = 128
BANDPOWER_OUTPUT_DIM = 128
TASK_EMBEDDING_DIM = 16

# =====================================================
# TRAINING HYPERPARAMETERS
# =====================================================

EPOCHS = 20
INITIAL_LR = 1e-4
WARMUP_EPOCHS = 5
WEIGHT_DECAY = 0.01
GRADIENT_CLIP_NORM = 1.0

SAVE_CHECKPOINT_EVERY = 2
EARLY_STOPPING_PATIENCE = 15

print(f"\nüîß Configuration:")
print(f"  Model: {NUM_LAYERS} layers, {NUM_HEADS} heads, D_MODEL={D_MODEL}")
print(f"  Training: {EPOCHS} epochs, LR={INITIAL_LR}")
print(f"  Checkpointing: every {SAVE_CHECKPOINT_EVERY} epochs")

2025-12-29 15:43:25.971333: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1767023006.203954      55 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1767023006.268197      55 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1767023006.787120      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1767023006.787153      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1767023006.787156      55 computation_placer.cc:177] computation placer alr

TensorFlow version: 2.19.0
Python version: 3.12.12 (main, Oct 10 2025, 08:52:57) [GCC 11.4.0]

 Paths configured:
  Input: /kaggle/input/preprocessed-cog-eeg-dataset/processed
  Checkpoints: /kaggle/working/checkpoints
  Results: /kaggle/working/results

üîß Configuration:
  Model: 6 layers, 8 heads, D_MODEL=256
  Training: 20 epochs, LR=0.0001
  Checkpointing: every 2 epochs


##  3. TPU Initialization

In [2]:
import tensorflow as tf

print("=" * 70)
print("üöÄ GPU T4 x2 INITIALIZATION")
print("=" * 70)

print(f"\nTensorFlow version: {tf.__version__}")

# Create GPU strategy
print("\nüìä Creating GPU strategy...")
strategy = tf.distribute.MirroredStrategy()

num_replicas = strategy.num_replicas_in_sync
print(f"‚úÖ Strategy: {strategy.__class__.__name__}")
print(f"   GPUs detected: {num_replicas}")

# List GPUs
gpus = tf.config.list_physical_devices('GPU')
print(f"\nüéÆ GPU devices:")
for i, gpu in enumerate(gpus):
    print(f"   {i+1}. {gpu.name}")

# Batch size configuration
BATCH_SIZE_PER_REPLICA = 32  # Reduce to 16 if you get OOM errors
BATCH_SIZE = BATCH_SIZE_PER_REPLICA * num_replicas

print(f"\nüì¶ Batch Configuration:")
print(f"   Per-GPU batch: {BATCH_SIZE_PER_REPLICA}")
print(f"   Global batch: {BATCH_SIZE}")

print("\n" + "=" * 70)
print("‚úÖ GPU READY FOR TRAINING")
print("=" * 70)

# Make variables global
globals()['strategy'] = strategy
globals()['BATCH_SIZE'] = BATCH_SIZE

print("\nüí° Next: Create model in strategy scope")


üöÄ GPU T4 x2 INITIALIZATION

TensorFlow version: 2.19.0

üìä Creating GPU strategy...
INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1')
‚úÖ Strategy: MirroredStrategy
   GPUs detected: 2

üéÆ GPU devices:
   1. /physical_device:GPU:0
   2. /physical_device:GPU:1

üì¶ Batch Configuration:
   Per-GPU batch: 32
   Global batch: 64

‚úÖ GPU READY FOR TRAINING

üí° Next: Create model in strategy scope


I0000 00:00:1767023020.441665      55 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 13942 MB memory:  -> device: 0, name: Tesla T4, pci bus id: 0000:00:04.0, compute capability: 7.5
I0000 00:00:1767023020.445574      55 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 13942 MB memory:  -> device: 1, name: Tesla T4, pci bus id: 0000:00:05.0, compute capability: 7.5


In [3]:
# ============================================================
# AUTO-SAVE CALLBACK (Add this BEFORE training section)
# ============================================================

class AutoSaveCallback(tf.keras.callbacks.Callback):
    """Custom callback to create marker files for auto-committing"""
    
    def on_epoch_end(self, epoch, logs=None):
        # Create a marker file every epoch
        # Kaggle auto-commits when new files appear
        marker_path = f"/kaggle/working/progress_epoch_{epoch+1}.txt"
        with open(marker_path, 'w') as f:
            f.write(f"Completed epoch {epoch+1}/{EPOCHS}\n")
            f.write(f"Loss: {logs.get('loss', 0):.4f}\n")
            f.write(f"Val Loss: {logs.get('val_loss', 0):.4f}\n")
            f.write(f"Timestamp: {datetime.now()}\n")
        
        print(f"   Progress saved: progress_epoch_{epoch+1}.txt")

print(" Auto-save callback ready!")

 Auto-save callback ready!


##  4. Data Loading (Compatible with Preprocessing)

In [4]:
def load_preprocessed_data(split='train'):
    """
    Load preprocessed .pkl file (matches preprocessing output format).
    
    Returns:
        Tuple of (X, y, metadata) where:
        X = (X_eeg, X_bp, X_task)
        y = (y_channel, y_region, y_band, y_state)
        metadata = dict with num_channels, etc.
    """
    pkl_path = os.path.join(DATA_INPUT_DIR, split, f"{split}_data.pkl")
    
    print(f"\n Loading {split} data from: {pkl_path}")
    
    if not os.path.exists(pkl_path):
        print(f"   File not found!")
        print(f"   Check that dataset is attached and DATASET_NAME is correct")
        return None
    
    # Load pickle file
    try:
        with open(pkl_path, 'rb') as f:
            samples = pickle.load(f)
        
        print(f"   Loaded {len(samples):,} samples")
        
        if len(samples) == 0:
            print(f"   No samples in file!")
            return None
        
        # Inspect first sample to get dimensions
        sample = samples[0]
        X_shape = sample['X'].shape  # Should be (n_channels, 256)
        bp_shape = sample['bp'].shape  # Should be (n_channels, 5)
        
        num_channels = X_shape[0]
        
        print(f"\n  Data format:")
        print(f"     X shape: {X_shape} (channels, time)")
        print(f"     bp shape: {bp_shape} (channels, bands)")
        print(f"     Channels: {num_channels}")
        
        # Extract arrays
        print(f"\n   Converting to arrays...")
        
        # X: Transpose from (n_channels, 256) to (256, n_channels)
        X_eeg = np.array([s['X'].T for s in samples], dtype=np.float32)
        
        # bp: Flatten from (n_channels, 5) to (n_channels*5,)
        X_bp = np.array([s['bp'].flatten() for s in samples], dtype=np.float32)
        
        # task_idx
        X_task = np.array([s['task_idx'] for s in samples], dtype=np.int32)
        
        # Labels
        y_channel = np.array([s['y_channel'] for s in samples], dtype=np.int32)
        y_region = np.array([s['y_region'] for s in samples], dtype=np.int32)
        y_band = np.array([s['y_band'] for s in samples], dtype=np.int32)
        y_state = np.array([s['y_state'] for s in samples], dtype=np.int32)
        
        print(f"   Final shapes:")
        print(f"     X_eeg: {X_eeg.shape} (N, time, channels)")
        print(f"     X_bp: {X_bp.shape} (N, features)")
        print(f"     X_task: {X_task.shape}")
        print(f"     Labels: {y_channel.shape} each")
        
        metadata = {
            'num_channels': num_channels,
            'num_samples': len(samples),
            'bandpower_dim': X_bp.shape[1]
        }
        
        return (X_eeg, X_bp, X_task), (y_channel, y_region, y_band, y_state), metadata
        
    except Exception as e:
        print(f"  ‚ùå Error loading data: {e}")
        import traceback
        traceback.print_exc()
        return None

print(" Data loading function defined")

 Data loading function defined


##  5. Model Architecture (Flexible Channels)

In [5]:
def create_model(num_channels, bandpower_input_dim, num_output_channels):
    """
    Create EEG Transformer with flexible channel dimensions.
    
    Args:
        num_channels: Number of EEG channels (e.g., 58)
        bandpower_input_dim: Bandpower feature dimension (num_channels * 5)
        num_output_channels: Number of output classes for channel prediction
    """
    # Inputs
    eeg_input = tf.keras.Input(shape=(WINDOW_SIZE_SAMPLES, num_channels), name='eeg')
    bp_input = tf.keras.Input(shape=(bandpower_input_dim,), name='bp')
    task_input = tf.keras.Input(shape=(1,), dtype='int32', name='task')
    
    # ==================== EEG STREAM ====================
    x = tf.keras.layers.Dense(D_MODEL, name='eeg_projection')(eeg_input)
    
    # Positional encoding
    positions = tf.range(start=0, limit=WINDOW_SIZE_SAMPLES, delta=1)
    pos_emb = tf.keras.layers.Embedding(
        input_dim=WINDOW_SIZE_SAMPLES,
        output_dim=D_MODEL,
        name='positional_embedding'
    )(positions)
    x = x + pos_emb
    
    # Transformer layers
    for i in range(NUM_LAYERS):
        attn = tf.keras.layers.MultiHeadAttention(
            num_heads=NUM_HEADS,
            key_dim=D_MODEL // NUM_HEADS,
            dropout=DROPOUT,
            name=f'mha_{i}'
        )(x, x)
        x = tf.keras.layers.Add(name=f'add_attn_{i}')([x, attn])
        x = tf.keras.layers.LayerNormalization(epsilon=1e-6, name=f'ln_attn_{i}')(x)
        
        ffn = tf.keras.Sequential([
            tf.keras.layers.Dense(FF_DIM, activation='relu'),
            tf.keras.layers.Dense(D_MODEL),
            tf.keras.layers.Dropout(DROPOUT)
        ], name=f'ffn_{i}')
        
        ffn_out = ffn(x)
        x = tf.keras.layers.Add(name=f'add_ffn_{i}')([x, ffn_out])
        x = tf.keras.layers.LayerNormalization(epsilon=1e-6, name=f'ln_ffn_{i}')(x)
    
    eeg_emb = tf.keras.layers.GlobalAveragePooling1D(name='eeg_pool')(x)
    
    # ==================== BANDPOWER STREAM ====================
    bp_x = tf.keras.layers.Dense(BANDPOWER_HIDDEN_DIM, activation='relu', name='bp_hidden')(bp_input)
    bp_emb = tf.keras.layers.Dense(BANDPOWER_OUTPUT_DIM, activation='relu', name='bp_output')(bp_x)
    
    # ==================== TASK STREAM ====================
    task_emb = tf.keras.layers.Embedding(NUM_TASKS, TASK_EMBEDDING_DIM, name='task_emb')(task_input)
    task_emb = tf.keras.layers.Flatten(name='task_flatten')(task_emb)
    
    # ==================== FUSION ====================
    fused = tf.keras.layers.Concatenate(name='fusion')([eeg_emb, bp_emb, task_emb])
    
    # ==================== MULTI-TASK HEADS ====================
    out_channel = tf.keras.layers.Dense(num_output_channels, name='channel')(fused)
    out_region = tf.keras.layers.Dense(NUM_OUTPUT_REGIONS, name='region')(fused)
    out_band = tf.keras.layers.Dense(NUM_OUTPUT_BANDS, name='band')(fused)
    out_state = tf.keras.layers.Dense(NUM_OUTPUT_STATES, name='state')(fused)
    
    model = tf.keras.Model(
        inputs=[eeg_input, bp_input, task_input],
        outputs={
            'channel': out_channel,
            'region': out_region,
            'band': out_band,
            'state': out_state
        },
        name='CogniVue_Transformer'
    )
    
    return model

print(" Model architecture defined")

 Model architecture defined


##  6. Learning Rate Schedule

In [6]:
class WarmupCosineDecay(tf.keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self, initial_learning_rate, warmup_steps, total_steps):
        super().__init__()
        self.initial_learning_rate = initial_learning_rate
        self.warmup_steps = warmup_steps
        self.total_steps = total_steps
    
    def __call__(self, step):
        step = tf.cast(step, tf.float32)
        warmup_steps = tf.cast(self.warmup_steps, tf.float32)
        total_steps = tf.cast(self.total_steps, tf.float32)
        
        warmup_lr = (step / warmup_steps) * self.initial_learning_rate
        
        decay_steps = total_steps - warmup_steps
        decay_step = step - warmup_steps
        cosine_decay = 0.5 * (1 + tf.cos(tf.constant(np.pi) * decay_step / decay_steps))
        decay_lr = self.initial_learning_rate * cosine_decay
        
        return tf.cond(
            step < warmup_steps,
            lambda: warmup_lr,
            lambda: decay_lr
        )
    
    def get_config(self):
        return {
            "initial_learning_rate": self.initial_learning_rate,
            "warmup_steps": self.warmup_steps,
            "total_steps": self.total_steps,
        }

print(" LR schedule defined")

 LR schedule defined


##  7. Data Pipeline & Callbacks

In [7]:
def create_tf_dataset(X, y, is_train=True):
    dataset = tf.data.Dataset.from_tensor_slices((
        {'eeg': X[0], 'bp': X[1], 'task': X[2]},
        {'channel': y[0], 'region': y[1], 'band': y[2], 'state': y[3]}
    ))
    
    if is_train:
        dataset = dataset.shuffle(10000)
    
    dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)
    dataset = dataset.prefetch(tf.data.AUTOTUNE)
    
    return dataset


class PeriodicCheckpoint(tf.keras.callbacks.Callback):
    def __init__(self, save_freq=5):
        super().__init__()
        self.save_freq = save_freq
    
    def on_epoch_end(self, epoch, logs=None):
        if (epoch + 1) % self.save_freq == 0:
            filepath = os.path.join(CHECKPOINT_DIR, f"checkpoint_epoch_{epoch+1:03d}.keras")
            self.model.save(filepath)
            print(f"\n   Saved checkpoint: {os.path.basename(filepath)}")


class LearningRateLogger(tf.keras.callbacks.Callback):
    def on_epoch_end(self, epoch, logs=None):
        lr = self.model.optimizer.learning_rate
        if hasattr(lr, '__call__'):
            lr_value = lr(self.model.optimizer.iterations)
        else:
            lr_value = lr
        lr_float = float(tf.keras.backend.get_value(lr_value))
        if logs is not None:
            logs['learning_rate'] = lr_float
        print(f"\n   LR = {lr_float:.6f}")


class ProgressLogger(tf.keras.callbacks.Callback):
    def __init__(self):
        super().__init__()
        self.epoch_start = None
    
    def on_epoch_begin(self, epoch, logs=None):
        self.epoch_start = time.time()
        print(f"\n{'='*70}")
        print(f" Epoch {epoch+1}/{EPOCHS}")
        print(f"{'='*70}")
    
    def on_epoch_end(self, epoch, logs=None):
        elapsed = time.time() - self.epoch_start
        print(f"\n Epoch {epoch+1} done in {elapsed:.1f}s")
        if logs:
            print(f"   Loss: {logs.get('loss', 0):.4f} | Val Loss: {logs.get('val_loss', 0):.4f}")
            print(f"   Region Acc: {logs.get('region_accuracy', 0):.4f} | Val: {logs.get('val_region_accuracy', 0):.4f}")
        print(f"{'='*70}\n")

print(" Pipeline & callbacks defined")

 Pipeline & callbacks defined


##  8. Main Training Function

In [8]:
def train_cognivue():
    print("\n" + "="*70)
    print(" CogniVue Training Pipeline")
    print("="*70)
    
    # Load ONLY training data
    print("\n Loading data...")
    train_result = load_preprocessed_data('train')
    
    if train_result is None:
        print("\n‚ùå Data loading failed!")
        return None
    
    train_data, train_labels, train_meta = train_result
    
    # =====================================================
    # SPLIT TRAIN DATA INTO TRAIN/VAL (80/20)
    # =====================================================
    print("\n Splitting data into train/val...")
    
    num_samples = len(train_data[0])
    indices = np.arange(num_samples)
    np.random.seed(42)  # For reproducibility
    np.random.shuffle(indices)
    
    # 80% train, 20% val
    split_idx = int(0.8 * num_samples)
    train_idx = indices[:split_idx]
    val_idx = indices[split_idx:]
    
    # Split X data
    train_X = (
        train_data[0][train_idx],  # X_eeg
        train_data[1][train_idx],  # X_bp
        train_data[2][train_idx]   # X_task
    )
    val_X = (
        train_data[0][val_idx],
        train_data[1][val_idx],
        train_data[2][val_idx]
    )
    
    # Split y data
    train_y = (
        train_labels[0][train_idx],  # y_channel
        train_labels[1][train_idx],  # y_region
        train_labels[2][train_idx],  # y_band
        train_labels[3][train_idx]   # y_state
    )
    val_y = (
        train_labels[0][val_idx],
        train_labels[1][val_idx],
        train_labels[2][val_idx],
        train_labels[3][val_idx]
    )
    
    print(f"   Train samples: {len(train_idx):,}")
    print(f"   Val samples: {len(val_idx):,}")
    
    # Get dimensions from data
    NUM_CHANNELS = train_meta['num_channels']
    BANDPOWER_INPUT_DIM = train_meta['bandpower_dim']
    
    # Determine NUM_OUTPUT_CHANNELS from labels
    NUM_OUTPUT_CHANNELS = train_labels[0].max() + 1
    
    print(f"\n Model dimensions:")
    print(f"   Input channels: {NUM_CHANNELS}")
    print(f"   Bandpower dim: {BANDPOWER_INPUT_DIM}")
    print(f"   Output channels: {NUM_OUTPUT_CHANNELS}")
    
    # Create datasets
    print(f"\n Creating TF datasets...")
    train_ds = create_tf_dataset(train_X, train_y, is_train=True)
    val_ds = create_tf_dataset(val_X, val_y, is_train=False)
    
    steps_per_epoch = len(train_idx) // BATCH_SIZE
    total_steps = steps_per_epoch * EPOCHS
    warmup_steps = steps_per_epoch * WARMUP_EPOCHS
    
    print(f"   Steps/epoch: {steps_per_epoch:,}")
    print(f"   Total steps: {total_steps:,}")
    
    # Build model
    print(f"\nÔøΩÔ∏è Building model...")
    
    with strategy.scope():
        model = create_model(NUM_CHANNELS, BANDPOWER_INPUT_DIM, NUM_OUTPUT_CHANNELS)
        
        print(f"   Parameters: {model.count_params():,}")
        
        lr_schedule = WarmupCosineDecay(INITIAL_LR, warmup_steps, total_steps)
        
        optimizer = tf.keras.optimizers.AdamW(
            learning_rate=lr_schedule,
            weight_decay=WEIGHT_DECAY,
            clipnorm=GRADIENT_CLIP_NORM
        )
        
        loss_weights = {'channel': 0.4, 'region': 0.4, 'band': 0.1, 'state': 0.1}
        model.compile(
    optimizer=optimizer,
    loss={
        'channel': tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        'region': tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        'band': tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        'state': tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    },
    loss_weights={
        'channel': 0.4,
        'region': 0.4,
        'band': 0.1,
        'state': 0.1
    },
    metrics={
        'channel': ['accuracy'],
        'region': ['accuracy'],
        'band': ['accuracy'],
        'state': ['accuracy']
    }
)
    print(f"    Compiled")
    
    # Callbacks
    callbacks = [
        ProgressLogger(),
        LearningRateLogger(),
        tf.keras.callbacks.ModelCheckpoint(
            filepath=os.path.join(CHECKPOINT_DIR, 'best_model.keras'),
            monitor='val_loss',
            save_best_only=True,
            verbose=1
        ),
        PeriodicCheckpoint(save_freq=SAVE_CHECKPOINT_EVERY),
        tf.keras.callbacks.EarlyStopping(
            monitor='val_loss',
            patience=EARLY_STOPPING_PATIENCE,
            restore_best_weights=True,
            verbose=1
        ),
        tf.keras.callbacks.TensorBoard(log_dir=LOGS_DIR)
    ]
    
    # Train
    print(f"\n Starting training...")
    print(f" {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
    
    try:
        history = model.fit(
            train_ds,
            epochs=EPOCHS,
            validation_data=val_ds,
            callbacks=callbacks,
            verbose=1
        )
        
        print("\n" + "="*70)
        print(" TRAINING COMPLETE!")
        print("="*70)
        print(f"\n Saved to: {CHECKPOINT_DIR}")
        
        # Save final
        model.save(os.path.join(CHECKPOINT_DIR, 'final_model.keras'))
        print(f"   - final_model.keras")
        print(f"   - best_model.keras")
        
        return history
        
    except KeyboardInterrupt:
        print("\n Training interrupted!")
        model.save(os.path.join(CHECKPOINT_DIR, 'interrupted.keras'))
        print(f"    Saved: interrupted.keras")
        return None
        
    except Exception as e:
        print(f"\n‚ùå Training failed: {e}")
        import traceback
        traceback.print_exc()
        return None

print(" Training function ready")

 Training function ready


##  9. Run Training

In [9]:
history = train_cognivue()

if history:
    print("\n Training Summary:")
    print(f"   Best val_loss: {min(history.history['val_loss']):.4f}")
    print(f"   Final region acc: {history.history['region_accuracy'][-1]:.4f}")
    print(f"\n Next: Click 'Save Version' to commit checkpoints!")
else:
    print("\n‚ö†Ô∏è Training did not complete successfully")


 CogniVue Training Pipeline

 Loading data...

 Loading train data from: /kaggle/input/preprocessed-cog-eeg-dataset/processed/train/train_data.pkl
   Loaded 21,538 samples

  Data format:
     X shape: (58, 256) (channels, time)
     bp shape: (58, 5) (channels, bands)
     Channels: 58

   Converting to arrays...
   Final shapes:
     X_eeg: (21538, 256, 58) (N, time, channels)
     X_bp: (21538, 290) (N, features)
     X_task: (21538,)
     Labels: (21538,) each

 Splitting data into train/val...
   Train samples: 17,230
   Val samples: 4,308

 Model dimensions:
   Input channels: 58
   Bandpower dim: 290
   Output channels: 58

 Creating TF datasets...
   Steps/epoch: 269
   Total steps: 5,380

ÔøΩÔ∏è Building model...
   Parameters: 4,837,162
    Compiled

 Starting training...
 2025-12-29 15:43:56

INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/rep

10: Save Training Results & Metrics

In [10]:
if history:
    print("\n Saving training results...")
    
    # Convert history to JSON-serializable format
    history_dict = {
        key: [float(val) for val in values] 
        for key, values in history.history.items()
    }
    
    # Save training history
    history_path = os.path.join(RESULTS_DIR, 'training_history.json')
    with open(history_path, 'w') as f:
        json.dump(history_dict, f, indent=2)
    print(f"   Saved: training_history.json")
    
    # Get model dimensions from loaded data
    train_result = load_preprocessed_data('train')
    if train_result:
        _, _, train_meta = train_result
        NUM_CHANNELS_USED = train_meta['num_channels']
    else:
        NUM_CHANNELS_USED = "unknown"
    
    # Save training configuration
    config = {
        'model_architecture': {
            'name': 'CogniVue_Transformer',
            'num_input_channels': NUM_CHANNELS_USED,
            'd_model': D_MODEL,
            'num_layers': NUM_LAYERS,
            'num_heads': NUM_HEADS,
            'ff_dim': FF_DIM,
            'dropout': DROPOUT,
            'window_size': WINDOW_SIZE_SAMPLES
        },
        'training_params': {
            'epochs_trained': len(history.history['loss']),
            'total_epochs': EPOCHS,
            'batch_size': BATCH_SIZE,
            'initial_lr': INITIAL_LR,
            'warmup_epochs': WARMUP_EPOCHS,
            'weight_decay': WEIGHT_DECAY,
            'gradient_clip_norm': GRADIENT_CLIP_NORM
        },
        'output_tasks': {
            'num_output_channels': NUM_OUTPUT_REGIONS,
            'num_regions': NUM_OUTPUT_REGIONS,
            'num_bands': NUM_OUTPUT_BANDS,
            'num_states': NUM_OUTPUT_STATES
        },
        'final_metrics': {
            'best_val_loss': float(min(history.history['val_loss'])),
            'final_train_loss': float(history.history['loss'][-1]),
            'final_val_loss': float(history.history['val_loss'][-1]),
            'final_region_accuracy': float(history.history['region_accuracy'][-1]),
            'final_val_region_accuracy': float(history.history['val_region_accuracy'][-1])
        },
        'training_info': {
            'completed_at': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            'tensorflow_version': tf.__version__,
            'accelerator': 'TPU' if 'TPU' in str(strategy.__class__) else 'CPU/GPU'
        }
    }
    
    config_path = os.path.join(RESULTS_DIR, 'training_config.json')
    with open(config_path, 'w') as f:
        json.dump(config, f, indent=2)
    print(f"   Saved: training_config.json")
    
    # Create a summary markdown file
    summary_md = f"""# CogniVue Training Summary

**Training Completed:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}

## Model Architecture
- **Model:** CogniVue Transformer
- **Input Channels:** {NUM_CHANNELS_USED}
- **Model Dimension:** {D_MODEL}
- **Transformer Layers:** {NUM_LAYERS}
- **Attention Heads:** {NUM_HEADS}
- **Feedforward Dim:** {FF_DIM}
- **Dropout:** {DROPOUT}

## Training Configuration
- **Epochs:** {len(history.history['loss'])}/{EPOCHS}
- **Batch Size:** {BATCH_SIZE}
- **Initial LR:** {INITIAL_LR}
- **Warmup Epochs:** {WARMUP_EPOCHS}
- **Weight Decay:** {WEIGHT_DECAY}

## Final Performance
- **Best Val Loss:** {min(history.history['val_loss']):.4f}
- **Final Train Loss:** {history.history['loss'][-1]:.4f}
- **Final Val Loss:** {history.history['val_loss'][-1]:.4f}
- **Final Region Accuracy:** {history.history['region_accuracy'][-1]:.4f}
- **Final Val Region Accuracy:** {history.history['val_region_accuracy'][-1]:.4f}

## Output Files
- `checkpoints/best_model.keras` - Best model weights
- `checkpoints/final_model.keras` - Final model weights
- `checkpoints/checkpoint_epoch_*.keras` - Periodic checkpoints
- `results/training_history.json` - Loss and metrics per epoch
- `results/training_config.json` - Full configuration
- `logs/` - TensorBoard logs
"""
    
    summary_path = os.path.join(RESULTS_DIR, 'TRAINING_SUMMARY.md')
    with open(summary_path, 'w') as f:
        f.write(summary_md)
    print(f"   Saved: TRAINING_SUMMARY.md")
    
    print("\n All results saved!")
    print(f"\n Saved files:")
    print(f"   {RESULTS_DIR}/")
    print(f"   ‚îú‚îÄ‚îÄ training_history.json")
    print(f"   ‚îú‚îÄ‚îÄ training_config.json")
    print(f"   ‚îî‚îÄ‚îÄ TRAINING_SUMMARY.md")
    print(f"\n   {CHECKPOINT_DIR}/")
    print(f"   ‚îú‚îÄ‚îÄ best_model.keras")
    print(f"   ‚îú‚îÄ‚îÄ final_model.keras")
    print(f"   ‚îî‚îÄ‚îÄ checkpoint_epoch_*.keras")
    
else:
    print("\n‚ö†Ô∏è No training history to save")





 Saving training results...
   Saved: training_history.json

 Loading train data from: /kaggle/input/preprocessed-cog-eeg-dataset/processed/train/train_data.pkl
   Loaded 21,538 samples

  Data format:
     X shape: (58, 256) (channels, time)
     bp shape: (58, 5) (channels, bands)
     Channels: 58

   Converting to arrays...
   Final shapes:
     X_eeg: (21538, 256, 58) (N, time, channels)
     X_bp: (21538, 290) (N, features)
     X_task: (21538,)
     Labels: (21538,) each
   Saved: training_config.json
   Saved: TRAINING_SUMMARY.md

 All results saved!

 Saved files:
   /kaggle/working/results/
   ‚îú‚îÄ‚îÄ training_history.json
   ‚îú‚îÄ‚îÄ training_config.json
   ‚îî‚îÄ‚îÄ TRAINING_SUMMARY.md

   /kaggle/working/checkpoints/
   ‚îú‚îÄ‚îÄ best_model.keras
   ‚îú‚îÄ‚îÄ final_model.keras
   ‚îî‚îÄ‚îÄ checkpoint_epoch_*.keras


11: Package & Download All Outputs

In [11]:

# ============================================================
# Section 11: Package & Download All Outputs
# ============================================================

import zipfile
from pathlib import Path

print("\n Creating download package...")
print("=" * 70)

# Create zip filename with timestamp
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
zip_filename = f"cognivue_training_outputs_{timestamp}.zip"
zip_path = os.path.join(WORKING_DIR, zip_filename)

try:
    with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
        
        # Add all files from results directory
        print("\n Adding results...")
        if os.path.exists(RESULTS_DIR):
            for file in os.listdir(RESULTS_DIR):
                file_path = os.path.join(RESULTS_DIR, file)
                if os.path.isfile(file_path):
                    arcname = os.path.join('results', file)
                    zipf.write(file_path, arcname)
                    print(f"   {file}")
        
        # Add all checkpoint files
        print("\nüîñ Adding checkpoints...")
        if os.path.exists(CHECKPOINT_DIR):
            for file in os.listdir(CHECKPOINT_DIR):
                file_path = os.path.join(CHECKPOINT_DIR, file)
                if os.path.isfile(file_path):
                    arcname = os.path.join('checkpoints', file)
                    zipf.write(file_path, arcname)
                    file_size_mb = os.path.getsize(file_path) / (1024 * 1024)
                    print(f"   {file} ({file_size_mb:.1f} MB)")
        
        # Add TensorBoard logs (optional - can be large)
        print("\n Adding TensorBoard logs...")
        if os.path.exists(LOGS_DIR):
            log_count = 0
            for root, dirs, files in os.walk(LOGS_DIR):
                for file in files:
                    file_path = os.path.join(root, file)
                    arcname = os.path.join('logs', os.path.relpath(file_path, LOGS_DIR))
                    zipf.write(file_path, arcname)
                    log_count += 1
            print(f"   Added {log_count} log files")
    
    # Get final zip size
    zip_size_mb = os.path.getsize(zip_path) / (1024 * 1024)
    
    print("\n" + "=" * 70)
    print(" PACKAGE CREATED SUCCESSFULLY!")
    print("=" * 70)
    print(f"\n Zip file: {zip_filename}")
    print(f" Size: {zip_size_mb:.1f} MB")
    print(f" Location: {zip_path}")
    
    print("\n To download:")
    print("   1. Go to the 'Output' tab (top right)")
    print("   2. Click 'Save Version' to commit outputs")
    print(f"   3. Download '{zip_filename}'")
    print("\n Or click the download icon next to the file in the Output tab")
    
    # List contents
    print("\n Package contents:")
    with zipfile.ZipFile(zip_path, 'r') as zipf:
        file_list = zipf.namelist()
        print(f"   Total files: {len(file_list)}")
        print("\n   Structure:")
        print("   ‚îú‚îÄ‚îÄ results/")
        print("   ‚îÇ   ‚îú‚îÄ‚îÄ training_history.json")
        print("   ‚îÇ   ‚îú‚îÄ‚îÄ training_config.json")
        print("   ‚îÇ   ‚îî‚îÄ‚îÄ TRAINING_SUMMARY.md")
        print("   ‚îú‚îÄ‚îÄ checkpoints/")
        print("   ‚îÇ   ‚îú‚îÄ‚îÄ best_model.keras")
        print("   ‚îÇ   ‚îú‚îÄ‚îÄ final_model.keras")
        print("   ‚îÇ   ‚îî‚îÄ‚îÄ checkpoint_epoch_*.keras")
        print("   ‚îî‚îÄ‚îÄ logs/")
        print("       ‚îî‚îÄ‚îÄ TensorBoard logs")
    
    print("\n" + "=" * 70)
    
except Exception as e:
    print(f"\n‚ùå Error creating zip: {e}")
    import traceback
    traceback.print_exc()


 Creating download package...

 Adding results...
   TRAINING_SUMMARY.md
   training_history.json
   training_config.json

üîñ Adding checkpoints...
   checkpoint_epoch_002.keras (57.1 MB)
   best_model.keras (57.1 MB)
   final_model.keras (57.1 MB)
   checkpoint_epoch_016.keras (57.1 MB)
   checkpoint_epoch_018.keras (57.1 MB)
   checkpoint_epoch_012.keras (57.1 MB)
   checkpoint_epoch_014.keras (57.1 MB)
   checkpoint_epoch_006.keras (57.1 MB)
   checkpoint_epoch_020.keras (57.1 MB)
   checkpoint_epoch_010.keras (57.1 MB)
   checkpoint_epoch_008.keras (57.1 MB)
   checkpoint_epoch_004.keras (57.1 MB)

 Adding TensorBoard logs...
   Added 2 log files

 PACKAGE CREATED SUCCESSFULLY!

 Zip file: cognivue_training_outputs_20251229_160732.zip
 Size: 614.8 MB
 Location: /kaggle/working/cognivue_training_outputs_20251229_160732.zip

 To download:
   1. Go to the 'Output' tab (top right)
   2. Click 'Save Version' to commit outputs
   3. Download 'cognivue_training_outputs_20251229_160732.