In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.model_selection import KFold
import matplotlib.pyplot as plt
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts
from torch.optim.lr_scheduler import ReduceLROnPlateau
from tqdm.auto import tqdm
import os
import pickle
import math # Transformer
from datetime import datetime

In [2]:
# Search GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")

Using device: cuda
GPU: NVIDIA H200


In [3]:
def normalize_quaternions(quaternion_features):
    """
    w²+x²+y²+z²=1
    """
    import numpy as np
    
    norms = np.linalg.norm(quaternion_features, axis=1, keepdims=True)
    
    norms[norms == 0] = 1.0
    
    normalized_quaternions = quaternion_features / norms
    
    if __debug__:
        verification_norms = np.linalg.norm(normalized_quaternions, axis=1)
        assert np.allclose(verification_norms, 1.0, atol=1e-6), "四元数归一化失败"
    
    return normalized_quaternions

In [4]:
# Dataset file.
#file_path_Kabsch = '/srv/scratch/z5548879/VIVIAN_Dataset_V1/vive_vicon_comparison_data.csv'
file_path_Kabsch = '/srv/scratch/z5548879/VIVIAN_Dataset_V2_with orientation/position_dataset.csv'

# Read Kabsch data (main dataset)
data_kabsch = pd.read_csv(file_path_Kabsch)

# Reorder data to ensure sorting by participant and timestamp
data_kabsch = data_kabsch.sort_values(['participant_id', 'time_stamp'] if 'participant_id' in data_kabsch.columns and 'time_stamp' in data_kabsch.columns 
                       else [data_kabsch.columns[0], data_kabsch.columns[4]])

data_kabsch_numpy = data_kabsch.to_numpy()

# Extract basic info from Kabsch data
participant_id = data_kabsch_numpy[:, 0]  # 8 participants
speed = data_kabsch_numpy[:, 1]           # 0.5 1.0 1.5 2.0 m/s
height = data_kabsch_numpy[:, 2]          # Tracker: 1, 2, 3
direction = data_kabsch_numpy[:, 3]       # XYZ

# Extract position data - separate by direction (VIVE as input, Vicon as target)
vicon_data = data_kabsch_numpy[:, 6]  # Vicon data (target)
vive_data = data_kabsch_numpy[:, 7]   # VIVE data (input)

# Extract XYZ directional position features (including tracker information)
def extract_xyz_features(position_data, participant_ids, speed_data, height_data, direction_data):
    """Extract XYZ position features for each participant, speed, and tracker"""
    unique_participants = np.unique(participant_ids)
    unique_speeds = np.unique(speed_data)
    unique_heights = np.unique(height_data)
    
    # Calculate total number of data points
    total_points = 0
    for participant in unique_participants:
        for speed_val in unique_speeds:
            for height_val in unique_heights:  # Add tracker loop
                mask = (participant_ids == participant) & (speed_data == speed_val) & (height_data == height_val)
                if np.any(mask):
                    x_mask = mask & (direction_data == 'X')
                    total_points += np.sum(x_mask)
    
    # Initialize feature matrix
    xyz_features = np.zeros((total_points, 3))
    xyz_spatial_magnitude = np.zeros(total_points)
    feature_participant_ids = []
    feature_speeds = []
    feature_heights = []  # Add heights list
    
    current_idx = 0
    
    for participant in unique_participants:
        for speed_val in unique_speeds:
            for height_val in unique_heights:  # Add tracker loop
                # Get data for current participant, speed, and tracker
                base_mask = (participant_ids == participant) & (speed_data == speed_val) & (height_data == height_val)
                
                if np.any(base_mask):
                    x_mask = base_mask & (direction_data == 'X')
                    y_mask = base_mask & (direction_data == 'Y')
                    z_mask = base_mask & (direction_data == 'Z')
                    
                    x_indices = np.where(x_mask)[0]
                    y_indices = np.where(y_mask)[0] 
                    z_indices = np.where(z_mask)[0]
                    
                    # Ensure XYZ data lengths are consistent
                    min_length = min(len(x_indices), len(y_indices), len(z_indices))
                    
                    if min_length > 0:
                        x_values = np.array(position_data[x_indices[:min_length]], dtype=float)
                        y_values = np.array(position_data[y_indices[:min_length]], dtype=float)
                        z_values = np.array(position_data[z_indices[:min_length]], dtype=float)
                        
                        # Store XYZ features
                        end_idx = current_idx + min_length
                        xyz_features[current_idx:end_idx, 0] = x_values
                        xyz_features[current_idx:end_idx, 1] = y_values
                        xyz_features[current_idx:end_idx, 2] = z_values
                        
                        # Calculate spatial magnitude
                        spatial_magnitude = np.sqrt(x_values**2 + y_values**2 + z_values**2)
                        xyz_spatial_magnitude[current_idx:end_idx] = spatial_magnitude
                        
                        # Record corresponding participant, speed, and tracker information
                        for _ in range(min_length):
                            feature_participant_ids.append(participant)
                            feature_speeds.append(speed_val)
                            feature_heights.append(height_val)  # Add tracker
                        
                        current_idx = end_idx
    
    # Truncate to actually used portion
    xyz_features = xyz_features[:current_idx]
    xyz_spatial_magnitude = xyz_spatial_magnitude[:current_idx]
    
    return (xyz_features, xyz_spatial_magnitude, 
            np.array(feature_participant_ids), 
            np.array(feature_speeds),
            np.array(feature_heights))  # Return heights

print("Extracting VIVE XYZ position features...")
vive_xyz, vive_spatial_mag, vive_participants, vive_speeds, vive_heights = extract_xyz_features(
    vive_data, participant_id, speed, height, direction)  # Add height parameter

print("Extracting Vicon XYZ position features...")
vicon_xyz, vicon_spatial_mag, vicon_participants, vicon_speeds, vicon_heights = extract_xyz_features(
    vicon_data, participant_id, speed, height, direction)  # Add height parameter

print("VIVE XYZ feature shape:", vive_xyz.shape)
print("VIVE spatial magnitude shape:", vive_spatial_mag.shape)
print("Vicon XYZ feature shape:", vicon_xyz.shape)
print("Vicon spatial magnitude shape:", vicon_spatial_mag.shape)

# Ensure data length alignment
min_length = min(len(vive_xyz), len(vicon_xyz))
print(f"Data alignment length: {min_length}")

# Truncate to same length
vive_xyz = vive_xyz[:min_length]
vive_spatial_mag = vive_spatial_mag[:min_length]
vicon_xyz = vicon_xyz[:min_length]
vicon_spatial_mag = vicon_spatial_mag[:min_length]
aligned_participants = vive_participants[:min_length]
aligned_speeds = vive_speeds[:min_length]
aligned_height = vive_heights[:min_length]  # Use heights returned from function

print(f"Tracker distribution after alignment: {np.unique(aligned_height, return_counts=True)}")

# Process categorical features
participant_encoder = LabelEncoder()
participant_encoded = participant_encoder.fit_transform(aligned_participants)
participant_normalised = participant_encoded / (len(np.unique(participant_encoded)) - 1)

# Standardize all numerical features
scalers = {}

# VIVE XYZ position features (3D)
scalers['vive_xyz'] = StandardScaler()
vive_xyz_scaled = scalers['vive_xyz'].fit_transform(vive_xyz)

# VIVE spatial magnitude feature (1D)
scalers['vive_spatial'] = StandardScaler()
vive_spatial_scaled = scalers['vive_spatial'].fit_transform(vive_spatial_mag.reshape(-1, 1))

# Vicon target variable (3D)
scalers['vicon_xyz'] = StandardScaler()
vicon_xyz_scaled = scalers['vicon_xyz'].fit_transform(vicon_xyz)

# Vicon spatial magnitude feature (1D)
scalers['vicon_spatial'] = StandardScaler()
vicon_spatial_scaled = scalers['vicon_spatial'].fit_transform(vicon_spatial_mag.reshape(-1, 1))

# Other features
scalers['speed'] = StandardScaler()
scalers['height'] = StandardScaler()

speed_scaled = scalers['speed'].fit_transform(aligned_speeds.reshape(-1, 1))
height_scaled = scalers['height'].fit_transform(aligned_height.reshape(-1, 1))

# Combine all features (excluding quaternions)
additional_features = np.hstack([
    participant_normalised.reshape(-1, 1),    # 1 feature: participant ID
    speed_scaled,                             # 1 feature: speed
    height_scaled,                            # 1 feature: tracker position
    vive_spatial_scaled,                      # 1 feature: VIVE spatial magnitude
    #vive_xyz_scaled,                          # 3 features: VIVE XYZ coordinates
])

print("Final feature matrix shape:", additional_features.shape)

# Generate sequence data
seq_length = 10

def create_sequences_with_xyz_features(xyz_input, additional_feats, xyz_target, participant_ids, heights, seq_length=10):
    """Create sequence data with XYZ features, grouped by participant and tracker"""
    x_time, x_feat, y, seq_participant_ids, seq_heights = [], [], [], [], []
    
    unique_participants = np.unique(participant_ids)
    unique_heights = np.unique(heights)
    
    # Group by participant and tracker combination
    for participant in unique_participants:
        for height in unique_heights:
            # Filter by both participant and tracker
            mask = (participant_ids == participant) & (heights == height)
            indices = np.where(mask)[0]
            
            if len(indices) < seq_length + 1:
                continue
            
            # Create sequences for current participant-tracker combination
            for i in range(len(indices) - seq_length):
                # Critical modification: use indices array instead of direct indexing
                seq_indices = indices[i:i+seq_length]  # Get sequence indices
                target_idx = indices[i + seq_length]    # Target index
                
                x_time.append(xyz_input[seq_indices])        # Use sequence indices
                x_feat.append(additional_feats[target_idx])  # Additional features
                y.append(xyz_target[target_idx])             # XYZ target
                seq_participant_ids.append(participant_ids[target_idx])
                seq_heights.append(heights[target_idx])
    
    print(f"Total sequences generated: {len(x_time)}")
    
    return (np.array(x_time), np.array(x_feat), np.array(y), 
            np.array(seq_participant_ids), np.array(seq_heights))

print("Generating XYZ sequence data...")
X_time, X_feat, y, seq_participant_ids, seq_heights = create_sequences_with_xyz_features(
    vive_xyz_scaled, additional_features, vicon_xyz_scaled, 
    aligned_participants, aligned_height, seq_length
)

print("Final training data shape:")
print("X_time (VIVE XYZ time series):", X_time.shape)
print("X_feat (additional features):", X_feat.shape)
print("y (Vicon XYZ target):", y.shape)
print("Number of sequences:", len(seq_participant_ids))

# Check tracker distribution
unique_heights_in_seq, counts = np.unique(seq_heights, return_counts=True)
print("\nTracker distribution:")
for h, c in zip(unique_heights_in_seq, counts):
    print(f"  Tracker {int(h)}: {c} sequences")

Extracting VIVE XYZ position features...
Extracting Vicon XYZ position features...
VIVE XYZ feature shape: (600000, 3)
VIVE spatial magnitude shape: (600000,)
Vicon XYZ feature shape: (600000, 3)
Vicon spatial magnitude shape: (600000,)
Data alignment length: 600000
Tracker distribution after alignment: (array([1, 2, 3]), array([200000, 200000, 200000]))
Final feature matrix shape: (600000, 4)
Feature details:
- Participant ID: 1 column
- Speed: 1 column
- Tracker position: 1 column
- VIVE spatial magnitude sqrt(x²+y²+z²): 1 column
- VIVE XYZ coordinates: 3 columns
- Total: 7 additional features
Generating XYZ sequence data...
Total sequences generated: 599700
Final training data shape:
X_time (VIVE XYZ time series): (599700, 10, 3)
X_feat (additional features): (599700, 4)
y (Vicon XYZ target): (599700, 3)
Number of sequences: 599700

Tracker distribution:
  Tracker 1: 199900 sequences
  Tracker 2: 199900 sequences
  Tracker 3: 199900 sequences

=== Feature Summary ===
Time series inp

In [5]:
class EarlyStopping:
    def __init__(self, patience=8, min_delta=1e-5, restore_best_weights=True):
        self.patience = patience
        self.min_delta = min_delta
        self.restore_best_weights = restore_best_weights
        self.best_loss = None
        self.counter = 0
        self.best_weights = None
        
    def __call__(self, val_loss, model):
        if self.best_loss is None:
            self.best_loss = val_loss
            self.save_checkpoint(model)
        elif val_loss < self.best_loss - self.min_delta:
            self.best_loss = val_loss
            self.counter = 0
            self.save_checkpoint(model)
        else:
            self.counter += 1
            
        if self.counter >= self.patience:
            if self.restore_best_weights:
                model.load_state_dict(self.best_weights)
            return True
        return False
    
    def save_checkpoint(self, model):
        self.best_weights = model.state_dict().copy()

In [6]:
class PositionalEncoding(nn.Module):
    """ Add Transformer location information"""
    def __init__(self, d_model, max_len=5000, batch_first=True):
        super(PositionalEncoding, self).__init__()
        self.batch_first = batch_first
        
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * 
                           (-math.log(10000.0) / d_model))
        
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        
        if batch_first:
            # (1, max_len, d_model) for batch_first=True
            pe = pe.unsqueeze(0)
        else:
            # (max_len, 1, d_model) for batch_first=False
            pe = pe.unsqueeze(1)
            
        self.register_buffer('pe', pe)
        
    def forward(self, x):
        if self.batch_first:
            # x shape: (batch_size, seq_len, d_model)
            seq_len = x.size(1)
            return x + self.pe[:, :seq_len, :]
        else:
            # x shape: (seq_len, batch_size, d_model)
            seq_len = x.size(0)
            return x + self.pe[:seq_len, :, :]

In [7]:
class TransformerBlock(nn.Module):
    """ Single Transformer block with Pre-LayerNorm architecture"""
    def __init__(self, d_model, nhead, dim_feedforward, dropout=0.1):
        super(TransformerBlock, self).__init__()
        self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout, batch_first=True)
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, d_model)
        
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
        
        # 权重初始化
        self._init_weights()
        
    def _init_weights(self):
        """Initialize weights using Xavier uniform initialization"""
        nn.init.xavier_uniform_(self.linear1.weight)
        nn.init.xavier_uniform_(self.linear2.weight)
        nn.init.constant_(self.linear1.bias, 0)
        nn.init.constant_(self.linear2.bias, 0)
        
    def forward(self, src, src_mask=None, src_key_padding_mask=None, return_attention=False):
        # Pre-LayerNorm: 先归一化，再计算
        # Self-attention
        norm_src = self.norm1(src)
        src2, attention_weights = self.self_attn(norm_src, norm_src, norm_src, 
                                               attn_mask=src_mask,
                                               key_padding_mask=src_key_padding_mask)
        src = src + self.dropout1(src2)
        
        # Feed forward
        norm_src = self.norm2(src)
        src2 = self.linear2(self.dropout(torch.relu(self.linear1(norm_src))))
        src = src + self.dropout2(src2)
        
        if return_attention:
            return src, attention_weights
        return src

In [8]:
class PureTransformerModel(nn.Module):
    def __init__(self, seq_length, num_features=4, hidden_dim=128, dropout_rate=0.1,
                 nhead=8, num_transformer_layers=3, dim_feedforward=512):
        super(PureTransformerModel, self).__init__()
        
        # Ensure hidden_dim is divisible by nhead
        if hidden_dim % nhead != 0:
            hidden_dim = ((hidden_dim // nhead) + 1) * nhead
            print(f"Adjusted hidden_dim to {hidden_dim} to fit multi-head attention")
        
        self.hidden_dim = hidden_dim
        self.seq_length = seq_length
        self.num_time_features = 3  # Assuming 3 time series channels (e.g., open, high, low)
        
        # Multi-scale embedding: capture patterns at different temporal granularities
        # This replaces the multi-scale convolution in the original model 
        # Short-term pattern embedding (local context)
        self.short_term_embed = nn.Sequential(
            nn.Linear(self.num_time_features, 32),
            nn.LayerNorm(32),
            nn.ReLU(),
            nn.Dropout(dropout_rate * 0.1)
        )
        
        # Medium-term pattern embedding (intermediate context)
        self.medium_term_embed = nn.Sequential(
            nn.Linear(self.num_time_features * 3, 32),  # Look at 3-step windows
            nn.LayerNorm(32),
            nn.ReLU(),
            nn.Dropout(dropout_rate * 0.1)
        )
        
        # Long-term pattern embedding (broader context)
        self.long_term_embed = nn.Sequential(
            nn.Linear(self.num_time_features * 5, 32),  # Look at 5-step windows
            nn.LayerNorm(32),
            nn.ReLU(),
            nn.Dropout(dropout_rate * 0.1)
        )
        
        # Gradient/change embedding (captures peaks and troughs)
        # This replaces the gradient convolution in the original model
        self.gradient_embed = nn.Sequential(
            nn.Linear(self.num_time_features, 32),
            nn.LayerNorm(32),
            nn.Tanh(),  # Use tanh to better capture directional changes
            nn.Dropout(dropout_rate * 0.1)
        )
        
        # Combined multi-scale features: 32 + 32 + 32 + 32 = 128
        self.multi_scale_fusion = nn.Sequential(
            nn.Linear(128, hidden_dim),
            nn.LayerNorm(hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout_rate * 0.1)
        )
        
        # Positional Encoding
        self.pos_encoder = PositionalEncoding(hidden_dim, max_len=seq_length, batch_first=True)
        
        # Transformer Encoder Layers
        self.transformer_layers = nn.ModuleList([
            TransformerBlock(hidden_dim, nhead, dim_feedforward, dropout_rate * 0.1)
            for _ in range(num_transformer_layers)
        ])
        
        # Global Attention Pooling (sequence aggregation)
        self.global_attention = nn.MultiheadAttention(
            embed_dim=hidden_dim, 
            num_heads=nhead,
            batch_first=True
        )
        
        # Learnable query vector for global pooling
        self.global_query = nn.Parameter(torch.randn(1, 1, hidden_dim))   
        # Static Feature Processing Network
        self.feat_fc1 = nn.Sequential(
            nn.Linear(num_features, 64),
            nn.LayerNorm(64),
            nn.ReLU(),
        )
        
        self.feat_fc2 = nn.Sequential(
            nn.Linear(64, 128),
            nn.LayerNorm(128),
            nn.ReLU(),
        )
        
        # Residual connection for static features
        self.feat_res = nn.Sequential(
            nn.Linear(num_features, 128),
            nn.LayerNorm(128)
        )
        
        # Feature Fusion Layer (combines temporal and static features)
        self.feature_fusion = nn.Sequential(
            nn.Linear(hidden_dim + 128, hidden_dim * 2),
            nn.LayerNorm(hidden_dim * 2),
            nn.ReLU(),
            nn.Linear(hidden_dim * 2, hidden_dim),
            nn.LayerNorm(hidden_dim),
            nn.ReLU()
        )
        
        # Classification Head
        self.classifier = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim),
            nn.LayerNorm(hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim // 2),
            nn.LayerNorm(hidden_dim // 2),
            nn.ReLU(),
            nn.Linear(hidden_dim // 2, 3)
        )
        
        # Weight initialization
        self._init_weights()
        
    def _init_weights(self):
        """Xavier initialization for all linear layers"""
        for module in self.modules():
            if isinstance(module, nn.Linear):
                nn.init.xavier_uniform_(module.weight)
                if module.bias is not None:
                    nn.init.constant_(module.bias, 0)
    
    def _create_multi_scale_input(self, x_time):
        batch_size, seq_len, _ = x_time.shape
        
        # Short-term: use original timesteps
        short_term = x_time  # [batch, seq_len, 3]
        
        # Medium-term: create 3-step windows with padding
        x_padded_3 = torch.cat([x_time[:, :1, :].repeat(1, 1, 1), 
                                x_time, 
                                x_time[:, -1:, :].repeat(1, 1, 1)], dim=1)
        medium_term = torch.cat([
            x_padded_3[:, i:i+seq_len, :] for i in range(3)
        ], dim=2)  # [batch, seq_len, 9]
        
        # Long-term: create 5-step windows with padding
        x_padded_5 = torch.cat([x_time[:, :1, :].repeat(1, 2, 1), 
                                x_time, 
                                x_time[:, -1:, :].repeat(1, 2, 1)], dim=1)
        long_term = torch.cat([
            x_padded_5[:, i:i+seq_len, :] for i in range(5)
        ], dim=2)  # [batch, seq_len, 15]
        
        # Gradient: compute first-order differences to capture rate of change
        gradient = torch.diff(x_time, dim=1, prepend=x_time[:, :1, :])  # [batch, seq_len, 3]
        
        return {
            'short_term': short_term,
            'medium_term': medium_term,
            'long_term': long_term,
            'gradient': gradient
        }
    
    def forward(self, x_time, x_feat, return_attention=False):
        batch_size = x_time.size(0)
        

        # Multi-scale Time Series Embedding
        # Create multi-scale representations
        multi_scale_inputs = self._create_multi_scale_input(x_time)
        
        # Process each scale
        short_features = self.short_term_embed(multi_scale_inputs['short_term'])
        medium_features = self.medium_term_embed(multi_scale_inputs['medium_term'])
        long_features = self.long_term_embed(multi_scale_inputs['long_term'])
        gradient_features = self.gradient_embed(multi_scale_inputs['gradient'])
        
        # Concatenate all scales
        multi_scale_combined = torch.cat([
            short_features, 
            medium_features, 
            long_features, 
            gradient_features
        ], dim=2)  # [batch, seq_len, 128]
        
        # Fuse multi-scale features and project to hidden_dim
        x_embedded = self.multi_scale_fusion(multi_scale_combined)  # [batch, seq_len, hidden_dim]
        
        # Add positional encoding
        x_embedded = self.pos_encoder(x_embedded)
        
        # Transformer Processing 
        # Pass through Transformer layers
        attention_weights_list = []
        x_transformed = x_embedded
        
        for transformer_layer in self.transformer_layers:
            if return_attention:
                x_transformed, attention_weights = transformer_layer(
                    x_transformed, return_attention=True
                )
                attention_weights_list.append(attention_weights)
            else:
                x_transformed = transformer_layer(x_transformed, return_attention=False)
        
        # Global Attention Pooling
        global_query = self.global_query.expand(batch_size, -1, -1)
        context, global_attention_weights = self.global_attention(
            global_query, x_transformed, x_transformed
        )
        context = context.squeeze(1)  # [batch, hidden_dim]
        
        feat_out = self.feat_fc1(x_feat)
        feat_out = self.feat_fc2(feat_out)
        feat_residual = self.feat_res(x_feat)
        feat_out = feat_out + feat_residual  # Residual connection
        
        combined = torch.cat([context, feat_out], dim=1)
        fused_features = self.feature_fusion(combined)
        
        # Final prediction
        output = self.classifier(fused_features)
        
        if return_attention:
            return output, {
                'transformer_attention': attention_weights_list,
                'global_attention': global_attention_weights
            }
        else:
            return output

model = PureTransformerModel(seq_length, num_features=4).to(device)
print(model)

PureTransformerModel(
  (short_term_embed): Sequential(
    (0): Linear(in_features=3, out_features=32, bias=True)
    (1): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
    (2): ReLU()
    (3): Dropout(p=0.010000000000000002, inplace=False)
  )
  (medium_term_embed): Sequential(
    (0): Linear(in_features=9, out_features=32, bias=True)
    (1): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
    (2): ReLU()
    (3): Dropout(p=0.010000000000000002, inplace=False)
  )
  (long_term_embed): Sequential(
    (0): Linear(in_features=15, out_features=32, bias=True)
    (1): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
    (2): ReLU()
    (3): Dropout(p=0.010000000000000002, inplace=False)
  )
  (gradient_embed): Sequential(
    (0): Linear(in_features=3, out_features=32, bias=True)
    (1): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
    (2): Tanh()
    (3): Dropout(p=0.010000000000000002, inplace=False)
  )
  (multi_scale_fusion): Sequential(
    (0): Linear(in_

In [9]:
def train_model(model, train_loader, val_loader, criterion, optimiser, scheduler, epochs=40, patience=8):
    best_val_loss = float('inf')
    best_model = None
    history = {'train_loss': [], 'val_loss': [],
              'train_loss_eval_mode':[]
              }

    early_stopping = EarlyStopping(patience=patience, restore_best_weights=True)
    
    # Gradient accumulation settings
    accum_steps = 4  # Update weights every 4 batches
    
    for epoch in range(epochs):
        print(f"\nEpoch {epoch+1}/{epochs}")
        print("---" * 30)  # Progress bar separator
        
        # Training phase
        model.train()
        train_predictions = []
        train_actuals = []
        train_losses = []
        
        optimiser.zero_grad()  # Zero gradients at start of epoch
        
        # Create progress bar for training
        train_pbar = tqdm(train_loader, desc="Training", leave=True)
        
        for i, (batch_X_time, batch_X_feat, batch_y) in enumerate(train_pbar):
            # Move data to device
            batch_X_time = batch_X_time.to(device)
            batch_X_feat = batch_X_feat.to(device)
            batch_y = batch_y.to(device)
            
            # Forward pass
            output = model(batch_X_time, batch_X_feat)
            
            # Calculate original loss for logging and statistics
            original_loss = criterion(output, batch_y)
            
            # Record original loss for statistics
            train_losses.append(original_loss.item())
            
            # Scale loss for gradient accumulation
            scaled_loss = original_loss / accum_steps
            scaled_loss.backward()
            
            # Collect predictions and actuals for RMSE calculation (detach from graph)
            train_predictions.append(output.detach().cpu().numpy())
            train_actuals.append(batch_y.detach().cpu().numpy())
            
            # Update weights every accum_steps batches
            if (i + 1) % accum_steps == 0 or (i + 1) == len(train_loader):
                # Gradient clipping to prevent exploding gradients
                nn.utils.clip_grad_norm_(model.parameters(), max_norm=0.5)
                optimiser.step()
                optimiser.zero_grad()
                
                # Clean up GPU memory
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()
            
            # Update progress bar with original loss
            train_pbar.set_postfix({"batch_loss": f"{original_loss.item():.4f}"})
            
            # Free up memory
            del batch_X_time, batch_X_feat, batch_y, output, original_loss, scaled_loss
        
        # Calculate training loss
        train_loss = np.mean(train_losses)
        history['train_loss'].append(train_loss)
        
        # Validation phase
        model.eval()
        val_predictions = []
        val_actuals = []     
        train_eval_losses = []
        
        with torch.no_grad():
            train_eval_pbar = tqdm(train_loader, desc="Train Eval", leave=True)
            for batch_X_time, batch_X_feat, batch_y in train_eval_pbar:
                batch_X_time = batch_X_time.to(device)
                batch_X_feat = batch_X_feat.to(device)
                batch_y = batch_y.to(device)
                
                output = model(batch_X_time, batch_X_feat)
                batch_loss = criterion(output, batch_y).item()
                train_eval_losses.append(batch_loss)
                
                del batch_X_time, batch_X_feat, batch_y, output
        
        train_eval_loss = np.mean(train_eval_losses)
        history['train_loss_eval_mode'].append(train_eval_loss)

        val_losses = []
        # Create progress bar for validation
        val_pbar = tqdm(val_loader, desc="Validation", leave=True)
        
        with torch.no_grad():
            for batch_X_time, batch_X_feat, batch_y in val_pbar:
                # Move data to device
                batch_X_time = batch_X_time.to(device)
                batch_X_feat = batch_X_feat.to(device)
                batch_y = batch_y.to(device)
                
                output = model(batch_X_time, batch_X_feat)
                
                # Collect predictions and actuals for RMSE calculation
                val_predictions.append(output.cpu().numpy())
                val_actuals.append(batch_y.cpu().numpy())
                
                # Calculate loss for progress bar
                batch_loss = criterion(output, batch_y).item()
                val_losses.append(batch_loss)
                val_pbar.set_postfix({"batch_loss": f"{batch_loss:.4f}"})
                
                # Free up memory
                del batch_X_time, batch_X_feat, batch_y, output
        
        # Calculate validation loss
        val_loss = np.mean(val_losses)
        history['val_loss'].append(val_loss)
        
        # Update learning rate
        scheduler.step(val_loss)
        
        print(f"Epoch {epoch+1}/{epochs} completed - Train Loss: {train_loss:.4f}, Train Eval: {train_eval_loss:.4f}, Val Loss: {val_loss:.4f}")

        for param_group in optimiser.param_groups:
            print(f"Current LR: {param_group['lr']}")
        
        # Save best model
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_model = model.state_dict().copy()
            print(f"New best model saved with Val Loss: {val_loss:.4f}")
            
        if early_stopping(val_loss, model):
            print(f"Early stopping triggered at epoch {epoch+1}")
            break
        
        # Clean up GPU memory at end of epoch
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
    
    # Load best model
    model.load_state_dict(best_model)
    return model, history

In [10]:
# Function to evaluate the model
def evaluate_model(model, val_loader, criterion):
    model.eval()
    #val_loss = 0.0
    predictions = []
    actuals = []
    val_losses = []
    
    with torch.no_grad():
        for batch_X_time, batch_X_feat, batch_y in val_loader:
            batch_X_time = batch_X_time.to(device)
            batch_X_feat = batch_X_feat.to(device)
            batch_y = batch_y.to(device)
            
            #output = model(batch_X_time, batch_X_feat)
            #output, attention_info = model(batch_X_time, batch_X_feat, return_attention=True)
            if hasattr(model, 'return_attention'):
                 output, attention_info = model(batch_X_time, batch_X_feat, return_attention=True)
            else:
                 output = model(batch_X_time, batch_X_feat)
                
            loss = criterion(output, batch_y)
            val_losses.append(loss.item())
            
            # Collect predictions and actuals for later analysis
            predictions.append(output.cpu().numpy())
            actuals.append(batch_y.cpu().numpy())
            
            # Free memory
            del batch_X_time, batch_X_feat, batch_y, output, loss
    
    # Convert lists to arrays
    predictions = np.vstack(predictions)
    actuals = np.vstack(actuals)
    
    # Calculate RMSE
    # avg_val_loss = val_losses / len(val_loader)
    avg_val_loss = np.mean(val_losses)
    
    return avg_val_loss, predictions, actuals

In [11]:
# 基于参与者的K-fold交叉验证
print("Starting Participant-based K-fold Cross Validation...")

# 获取唯一的参与者ID
unique_participants = np.unique(seq_participant_ids)
print(f"Available participants: {unique_participants}")

Starting Participant-based K-fold Cross Validation...
Available participants: ['HW3-001' 'HW3-002' 'HW3-003' 'HW3-004' 'HW3-005' 'HW3-006' 'HW3-007'
 'HW3-008' 'HW3-009' 'HW3-010']


In [None]:
# Participant-based K-Fold Cross Validation
num_folds = len(unique_participants)  # 10-fold (each participant as validation set once)
fold_rmse_scores = []
best_models = []
all_histories = []
batch_size = 512

# Loop through each participant as validation set
for fold, val_participant in enumerate(unique_participants):
    print(f"\nFold {fold+1}/{num_folds} - Validation Participant: {val_participant}")
    print("-" * 60)
    
    # Create training and validation indices
    val_indices = np.where(seq_participant_ids == val_participant)[0]
    train_indices = np.where(seq_participant_ids != val_participant)[0]
    
    print(f"Training samples: {len(train_indices)}, Validation samples: {len(val_indices)}")
    
    # Create training and validation sets
    X_time_train, X_time_val = X_time[train_indices], X_time[val_indices]
    X_feat_train, X_feat_val = X_feat[train_indices], X_feat[val_indices]
    y_train, y_val = y[train_indices], y[val_indices]
    
    # Create data loaders
    train_dataset = TensorDataset(
        torch.tensor(X_time_train, dtype=torch.float32),
        torch.tensor(X_feat_train, dtype=torch.float32),
        torch.tensor(y_train, dtype=torch.float32)
    )
    val_dataset = TensorDataset(
        torch.tensor(X_time_val, dtype=torch.float32),
        torch.tensor(X_feat_val, dtype=torch.float32),
        torch.tensor(y_val, dtype=torch.float32)
    )
    
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, pin_memory=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, pin_memory=True)
    
    # Initialize model
    model = PureTransformerModel(seq_length, num_features=X_feat.shape[1]).to(device)
    
    print(f"\nModel Structure for Fold {fold+1}:")
    print(model)
    
    # Combined MSE and L1 loss
    mse_loss = nn.MSELoss()
    l1_loss = nn.L1Loss()
    criterion = lambda output, target: mse_loss(output, target) + 0.1 * l1_loss(output, target)
    
    # Initialize optimizer and learning rate scheduler
    optimiser = optim.AdamW(model.parameters(), lr=0.001, weight_decay=1e-3)
    scheduler = ReduceLROnPlateau(optimiser, mode='min', factor=0.5, patience=3)
    
    # Train model with early stopping
    model, history = train_model(
        model, 
        train_loader, 
        val_loader, 
        criterion, 
        optimiser, 
        scheduler, 
        epochs=40, 
        patience=8
    )
    
    # Evaluate model on validation set
    fold_rmse, predictions, actuals = evaluate_model(model, val_loader, criterion)
    fold_rmse_scores.append(fold_rmse)
    
    print(f"Fold {fold+1} ({val_participant}) Loss: {fold_rmse:.4f}")
    
    # Save model state and training history
    best_models.append(model.state_dict().copy())
    all_histories.append(history)
    
    # Clear GPU memory
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

# Print overall cross-validation results
print("\nParticipant-based K-fold Cross Validation Results:")
print("-" * 60)
for fold, (participant, rmse) in enumerate(zip(unique_participants, fold_rmse_scores)):
    print(f"Fold {fold+1} ({participant}): Loss = {rmse:.4f}")
print(f"Average Loss: {np.mean(fold_rmse_scores):.4f}")
print(f"Standard Deviation: {np.std(fold_rmse_scores):.4f}")

# Select best model based on lowest validation loss
best_fold_idx = np.argmin(fold_rmse_scores)
best_participant = unique_participants[best_fold_idx]
print(f"\nBest model is from Fold {best_fold_idx+1} (validation on {best_participant}) with Loss: {fold_rmse_scores[best_fold_idx]:.4f}")


Fold 1/10 - Validation Participant: HW3-001
------------------------------------------------------------
Training samples: 539730, Validation samples: 59970

Model Structure for Fold 1:
PureTransformerModel(
  (short_term_embed): Sequential(
    (0): Linear(in_features=3, out_features=32, bias=True)
    (1): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
    (2): ReLU()
    (3): Dropout(p=0.010000000000000002, inplace=False)
  )
  (medium_term_embed): Sequential(
    (0): Linear(in_features=9, out_features=32, bias=True)
    (1): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
    (2): ReLU()
    (3): Dropout(p=0.010000000000000002, inplace=False)
  )
  (long_term_embed): Sequential(
    (0): Linear(in_features=15, out_features=32, bias=True)
    (1): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
    (2): ReLU()
    (3): Dropout(p=0.010000000000000002, inplace=False)
  )
  (gradient_embed): Sequential(
    (0): Linear(in_features=3, out_features=32, bias=True)
    (1

Training:   0%|          | 0/1055 [00:00<?, ?it/s]

Train Eval:   0%|          | 0/1055 [00:00<?, ?it/s]

Validation:   0%|          | 0/118 [00:00<?, ?it/s]

Epoch 1/40 completed - Train Loss: 0.1094, Train Eval: 0.0357, Val Loss: 1.0687
Current LR: 0.001
New best model saved with Val Loss: 1.0687

Epoch 2/40
------------------------------------------------------------------------------------------


Training:   0%|          | 0/1055 [00:00<?, ?it/s]

Train Eval:   0%|          | 0/1055 [00:00<?, ?it/s]

Validation:   0%|          | 0/118 [00:00<?, ?it/s]

Epoch 2/40 completed - Train Loss: 0.0265, Train Eval: 0.0211, Val Loss: 0.8794
Current LR: 0.001
New best model saved with Val Loss: 0.8794

Epoch 3/40
------------------------------------------------------------------------------------------


Training:   0%|          | 0/1055 [00:00<?, ?it/s]

Train Eval:   0%|          | 0/1055 [00:00<?, ?it/s]

Validation:   0%|          | 0/118 [00:00<?, ?it/s]

Epoch 3/40 completed - Train Loss: 0.0175, Train Eval: 0.0125, Val Loss: 0.9401
Current LR: 0.001

Epoch 4/40
------------------------------------------------------------------------------------------


Training:   0%|          | 0/1055 [00:00<?, ?it/s]

Train Eval:   0%|          | 0/1055 [00:00<?, ?it/s]

Validation:   0%|          | 0/118 [00:00<?, ?it/s]

Epoch 4/40 completed - Train Loss: 0.0130, Train Eval: 0.0107, Val Loss: 0.9012
Current LR: 0.001

Epoch 5/40
------------------------------------------------------------------------------------------


Training:   0%|          | 0/1055 [00:00<?, ?it/s]

Train Eval:   0%|          | 0/1055 [00:00<?, ?it/s]

Validation:   0%|          | 0/118 [00:00<?, ?it/s]

Epoch 5/40 completed - Train Loss: 0.0099, Train Eval: 0.0087, Val Loss: 0.9079
Current LR: 0.001

Epoch 6/40
------------------------------------------------------------------------------------------


Training:   0%|          | 0/1055 [00:00<?, ?it/s]

Train Eval:   0%|          | 0/1055 [00:00<?, ?it/s]

Validation:   0%|          | 0/118 [00:00<?, ?it/s]

Epoch 6/40 completed - Train Loss: 0.0098, Train Eval: 0.0080, Val Loss: 0.8847
Current LR: 0.0005

Epoch 7/40
------------------------------------------------------------------------------------------


Training:   0%|          | 0/1055 [00:00<?, ?it/s]

Train Eval:   0%|          | 0/1055 [00:00<?, ?it/s]

Validation:   0%|          | 0/118 [00:00<?, ?it/s]

Epoch 7/40 completed - Train Loss: 0.0071, Train Eval: 0.0065, Val Loss: 0.8965
Current LR: 0.0005

Epoch 8/40
------------------------------------------------------------------------------------------


Training:   0%|          | 0/1055 [00:00<?, ?it/s]

Train Eval:   0%|          | 0/1055 [00:00<?, ?it/s]

Validation:   0%|          | 0/118 [00:00<?, ?it/s]

Epoch 8/40 completed - Train Loss: 0.0067, Train Eval: 0.0063, Val Loss: 0.8845
Current LR: 0.0005

Epoch 9/40
------------------------------------------------------------------------------------------


Training:   0%|          | 0/1055 [00:00<?, ?it/s]

Train Eval:   0%|          | 0/1055 [00:00<?, ?it/s]

Validation:   0%|          | 0/118 [00:00<?, ?it/s]

Epoch 9/40 completed - Train Loss: 0.0066, Train Eval: 0.0064, Val Loss: 0.8894
Current LR: 0.0005

Epoch 10/40
------------------------------------------------------------------------------------------


Training:   0%|          | 0/1055 [00:00<?, ?it/s]

Train Eval:   0%|          | 0/1055 [00:00<?, ?it/s]

Validation:   0%|          | 0/118 [00:00<?, ?it/s]

Epoch 10/40 completed - Train Loss: 0.0065, Train Eval: 0.0061, Val Loss: 0.9360
Current LR: 0.00025
Early stopping triggered at epoch 10
Fold 1 (HW3-001) Loss: 0.9360

Fold 2/10 - Validation Participant: HW3-002
------------------------------------------------------------
Training samples: 539730, Validation samples: 59970

Model Structure for Fold 2:
PureTransformerModel(
  (short_term_embed): Sequential(
    (0): Linear(in_features=3, out_features=32, bias=True)
    (1): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
    (2): ReLU()
    (3): Dropout(p=0.010000000000000002, inplace=False)
  )
  (medium_term_embed): Sequential(
    (0): Linear(in_features=9, out_features=32, bias=True)
    (1): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
    (2): ReLU()
    (3): Dropout(p=0.010000000000000002, inplace=False)
  )
  (long_term_embed): Sequential(
    (0): Linear(in_features=15, out_features=32, bias=True)
    (1): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
    (

Training:   0%|          | 0/1055 [00:00<?, ?it/s]

Train Eval:   0%|          | 0/1055 [00:00<?, ?it/s]

Validation:   0%|          | 0/118 [00:00<?, ?it/s]

Epoch 1/40 completed - Train Loss: 0.1125, Train Eval: 0.0360, Val Loss: 0.4438
Current LR: 0.001
New best model saved with Val Loss: 0.4438

Epoch 2/40
------------------------------------------------------------------------------------------


Training:   0%|          | 0/1055 [00:00<?, ?it/s]

Train Eval:   0%|          | 0/1055 [00:00<?, ?it/s]

Validation:   0%|          | 0/118 [00:00<?, ?it/s]

Epoch 2/40 completed - Train Loss: 0.0292, Train Eval: 0.0204, Val Loss: 0.3901
Current LR: 0.001
New best model saved with Val Loss: 0.3901

Epoch 3/40
------------------------------------------------------------------------------------------


Training:   0%|          | 0/1055 [00:00<?, ?it/s]

Train Eval:   0%|          | 0/1055 [00:00<?, ?it/s]

Validation:   0%|          | 0/118 [00:00<?, ?it/s]

Epoch 3/40 completed - Train Loss: 0.0184, Train Eval: 0.0158, Val Loss: 0.3847
Current LR: 0.001
New best model saved with Val Loss: 0.3847

Epoch 4/40
------------------------------------------------------------------------------------------


Training:   0%|          | 0/1055 [00:00<?, ?it/s]

Train Eval:   0%|          | 0/1055 [00:00<?, ?it/s]

Validation:   0%|          | 0/118 [00:00<?, ?it/s]

Epoch 4/40 completed - Train Loss: 0.0141, Train Eval: 0.0101, Val Loss: 0.4401
Current LR: 0.001

Epoch 5/40
------------------------------------------------------------------------------------------


Training:   0%|          | 0/1055 [00:00<?, ?it/s]

Train Eval:   0%|          | 0/1055 [00:00<?, ?it/s]

Validation:   0%|          | 0/118 [00:00<?, ?it/s]

Epoch 5/40 completed - Train Loss: 0.0103, Train Eval: 0.0093, Val Loss: 0.4319
Current LR: 0.001

Epoch 6/40
------------------------------------------------------------------------------------------


Training:   0%|          | 0/1055 [00:00<?, ?it/s]

Train Eval:   0%|          | 0/1055 [00:00<?, ?it/s]

Validation:   0%|          | 0/118 [00:00<?, ?it/s]

Epoch 6/40 completed - Train Loss: 0.0095, Train Eval: 0.0101, Val Loss: 0.4430
Current LR: 0.001

Epoch 7/40
------------------------------------------------------------------------------------------


Training:   0%|          | 0/1055 [00:00<?, ?it/s]

Train Eval:   0%|          | 0/1055 [00:00<?, ?it/s]

Validation:   0%|          | 0/118 [00:00<?, ?it/s]

Epoch 7/40 completed - Train Loss: 0.0089, Train Eval: 0.0078, Val Loss: 0.4268
Current LR: 0.0005

Epoch 8/40
------------------------------------------------------------------------------------------


Training:   0%|          | 0/1055 [00:00<?, ?it/s]

Train Eval:   0%|          | 0/1055 [00:00<?, ?it/s]

Validation:   0%|          | 0/118 [00:00<?, ?it/s]

Epoch 8/40 completed - Train Loss: 0.0068, Train Eval: 0.0062, Val Loss: 0.4386
Current LR: 0.0005

Epoch 9/40
------------------------------------------------------------------------------------------


Training:   0%|          | 0/1055 [00:00<?, ?it/s]

Train Eval:   0%|          | 0/1055 [00:00<?, ?it/s]

Validation:   0%|          | 0/118 [00:00<?, ?it/s]

Epoch 9/40 completed - Train Loss: 0.0066, Train Eval: 0.0063, Val Loss: 0.4356
Current LR: 0.0005

Epoch 10/40
------------------------------------------------------------------------------------------


Training:   0%|          | 0/1055 [00:00<?, ?it/s]

Train Eval:   0%|          | 0/1055 [00:00<?, ?it/s]

Validation:   0%|          | 0/118 [00:00<?, ?it/s]

Epoch 10/40 completed - Train Loss: 0.0065, Train Eval: 0.0059, Val Loss: 0.4409
Current LR: 0.0005

Epoch 11/40
------------------------------------------------------------------------------------------


Training:   0%|          | 0/1055 [00:00<?, ?it/s]

Train Eval:   0%|          | 0/1055 [00:00<?, ?it/s]

Validation:   0%|          | 0/118 [00:00<?, ?it/s]

Epoch 11/40 completed - Train Loss: 0.0063, Train Eval: 0.0058, Val Loss: 0.4420
Current LR: 0.00025
Early stopping triggered at epoch 11
Fold 2 (HW3-002) Loss: 0.4420

Fold 3/10 - Validation Participant: HW3-003
------------------------------------------------------------
Training samples: 539730, Validation samples: 59970

Model Structure for Fold 3:
PureTransformerModel(
  (short_term_embed): Sequential(
    (0): Linear(in_features=3, out_features=32, bias=True)
    (1): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
    (2): ReLU()
    (3): Dropout(p=0.010000000000000002, inplace=False)
  )
  (medium_term_embed): Sequential(
    (0): Linear(in_features=9, out_features=32, bias=True)
    (1): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
    (2): ReLU()
    (3): Dropout(p=0.010000000000000002, inplace=False)
  )
  (long_term_embed): Sequential(
    (0): Linear(in_features=15, out_features=32, bias=True)
    (1): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
    (

Training:   0%|          | 0/1055 [00:00<?, ?it/s]

Train Eval:   0%|          | 0/1055 [00:00<?, ?it/s]

Validation:   0%|          | 0/118 [00:00<?, ?it/s]

Epoch 1/40 completed - Train Loss: 0.1013, Train Eval: 0.0429, Val Loss: 0.2505
Current LR: 0.001
New best model saved with Val Loss: 0.2505

Epoch 2/40
------------------------------------------------------------------------------------------


Training:   0%|          | 0/1055 [00:00<?, ?it/s]

Train Eval:   0%|          | 0/1055 [00:00<?, ?it/s]

Validation:   0%|          | 0/118 [00:00<?, ?it/s]

Epoch 2/40 completed - Train Loss: 0.0274, Train Eval: 0.0177, Val Loss: 0.3195
Current LR: 0.001

Epoch 3/40
------------------------------------------------------------------------------------------


Training:   0%|          | 0/1055 [00:00<?, ?it/s]

Train Eval:   0%|          | 0/1055 [00:00<?, ?it/s]

Validation:   0%|          | 0/118 [00:00<?, ?it/s]

Epoch 3/40 completed - Train Loss: 0.0174, Train Eval: 0.0152, Val Loss: 0.3413
Current LR: 0.001

Epoch 4/40
------------------------------------------------------------------------------------------


Training:   0%|          | 0/1055 [00:00<?, ?it/s]

Train Eval:   0%|          | 0/1055 [00:00<?, ?it/s]

Validation:   0%|          | 0/118 [00:00<?, ?it/s]

Epoch 4/40 completed - Train Loss: 0.0127, Train Eval: 0.0112, Val Loss: 0.2796
Current LR: 0.001

Epoch 5/40
------------------------------------------------------------------------------------------


Training:   0%|          | 0/1055 [00:00<?, ?it/s]

Train Eval:   0%|          | 0/1055 [00:00<?, ?it/s]

Validation:   0%|          | 0/118 [00:00<?, ?it/s]

Epoch 5/40 completed - Train Loss: 0.0103, Train Eval: 0.0084, Val Loss: 0.3034
Current LR: 0.0005

Epoch 6/40
------------------------------------------------------------------------------------------


Training:   0%|          | 0/1055 [00:00<?, ?it/s]

Train Eval:   0%|          | 0/1055 [00:00<?, ?it/s]

Validation:   0%|          | 0/118 [00:00<?, ?it/s]

Epoch 6/40 completed - Train Loss: 0.0077, Train Eval: 0.0074, Val Loss: 0.2982
Current LR: 0.0005

Epoch 7/40
------------------------------------------------------------------------------------------


Training:   0%|          | 0/1055 [00:00<?, ?it/s]

Train Eval:   0%|          | 0/1055 [00:00<?, ?it/s]

Validation:   0%|          | 0/118 [00:00<?, ?it/s]

Epoch 7/40 completed - Train Loss: 0.0074, Train Eval: 0.0071, Val Loss: 0.3068
Current LR: 0.0005

Epoch 8/40
------------------------------------------------------------------------------------------


Training:   0%|          | 0/1055 [00:00<?, ?it/s]

Train Eval:   0%|          | 0/1055 [00:00<?, ?it/s]

Validation:   0%|          | 0/118 [00:00<?, ?it/s]

Epoch 8/40 completed - Train Loss: 0.0072, Train Eval: 0.0066, Val Loss: 0.3097
Current LR: 0.0005

Epoch 9/40
------------------------------------------------------------------------------------------


Training:   0%|          | 0/1055 [00:00<?, ?it/s]

Train Eval:   0%|          | 0/1055 [00:00<?, ?it/s]

Validation:   0%|          | 0/118 [00:00<?, ?it/s]

Epoch 9/40 completed - Train Loss: 0.0070, Train Eval: 0.0063, Val Loss: 0.3014
Current LR: 0.00025
Early stopping triggered at epoch 9
Fold 3 (HW3-003) Loss: 0.3014

Fold 4/10 - Validation Participant: HW3-004
------------------------------------------------------------
Training samples: 539730, Validation samples: 59970

Model Structure for Fold 4:
PureTransformerModel(
  (short_term_embed): Sequential(
    (0): Linear(in_features=3, out_features=32, bias=True)
    (1): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
    (2): ReLU()
    (3): Dropout(p=0.010000000000000002, inplace=False)
  )
  (medium_term_embed): Sequential(
    (0): Linear(in_features=9, out_features=32, bias=True)
    (1): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
    (2): ReLU()
    (3): Dropout(p=0.010000000000000002, inplace=False)
  )
  (long_term_embed): Sequential(
    (0): Linear(in_features=15, out_features=32, bias=True)
    (1): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
    (2)

Training:   0%|          | 0/1055 [00:00<?, ?it/s]

Train Eval:   0%|          | 0/1055 [00:00<?, ?it/s]

Validation:   0%|          | 0/118 [00:00<?, ?it/s]

Epoch 1/40 completed - Train Loss: 0.1091, Train Eval: 0.0385, Val Loss: 0.0719
Current LR: 0.001
New best model saved with Val Loss: 0.0719

Epoch 2/40
------------------------------------------------------------------------------------------


Training:   0%|          | 0/1055 [00:00<?, ?it/s]

Train Eval:   0%|          | 0/1055 [00:00<?, ?it/s]

Validation:   0%|          | 0/118 [00:00<?, ?it/s]

Epoch 2/40 completed - Train Loss: 0.0302, Train Eval: 0.0226, Val Loss: 0.0567
Current LR: 0.001
New best model saved with Val Loss: 0.0567

Epoch 3/40
------------------------------------------------------------------------------------------


Training:   0%|          | 0/1055 [00:00<?, ?it/s]

Train Eval:   0%|          | 0/1055 [00:00<?, ?it/s]

Validation:   0%|          | 0/118 [00:00<?, ?it/s]

Epoch 3/40 completed - Train Loss: 0.0186, Train Eval: 0.0140, Val Loss: 0.0511
Current LR: 0.001
New best model saved with Val Loss: 0.0511

Epoch 4/40
------------------------------------------------------------------------------------------


Training:   0%|          | 0/1055 [00:00<?, ?it/s]

Train Eval:   0%|          | 0/1055 [00:00<?, ?it/s]

Validation:   0%|          | 0/118 [00:00<?, ?it/s]

Epoch 4/40 completed - Train Loss: 0.0135, Train Eval: 0.0123, Val Loss: 0.0536
Current LR: 0.001

Epoch 5/40
------------------------------------------------------------------------------------------


Training:   0%|          | 0/1055 [00:00<?, ?it/s]

Train Eval:   0%|          | 0/1055 [00:00<?, ?it/s]

Validation:   0%|          | 0/118 [00:00<?, ?it/s]

Epoch 5/40 completed - Train Loss: 0.0107, Train Eval: 0.0089, Val Loss: 0.0396
Current LR: 0.001
New best model saved with Val Loss: 0.0396

Epoch 6/40
------------------------------------------------------------------------------------------


Training:   0%|          | 0/1055 [00:00<?, ?it/s]

Train Eval:   0%|          | 0/1055 [00:00<?, ?it/s]

Validation:   0%|          | 0/118 [00:00<?, ?it/s]

Epoch 6/40 completed - Train Loss: 0.0097, Train Eval: 0.0082, Val Loss: 0.0352
Current LR: 0.001
New best model saved with Val Loss: 0.0352

Epoch 7/40
------------------------------------------------------------------------------------------


Training:   0%|          | 0/1055 [00:00<?, ?it/s]

Train Eval:   0%|          | 0/1055 [00:00<?, ?it/s]

Validation:   0%|          | 0/118 [00:00<?, ?it/s]

Epoch 7/40 completed - Train Loss: 0.0098, Train Eval: 0.0083, Val Loss: 0.0357
Current LR: 0.001

Epoch 8/40
------------------------------------------------------------------------------------------


Training:   0%|          | 0/1055 [00:00<?, ?it/s]

Train Eval:   0%|          | 0/1055 [00:00<?, ?it/s]

Validation:   0%|          | 0/118 [00:00<?, ?it/s]

Epoch 8/40 completed - Train Loss: 0.0085, Train Eval: 0.0076, Val Loss: 0.0312
Current LR: 0.001
New best model saved with Val Loss: 0.0312

Epoch 9/40
------------------------------------------------------------------------------------------


Training:   0%|          | 0/1055 [00:00<?, ?it/s]

Train Eval:   0%|          | 0/1055 [00:00<?, ?it/s]

Validation:   0%|          | 0/118 [00:00<?, ?it/s]

Epoch 9/40 completed - Train Loss: 0.0079, Train Eval: 0.0069, Val Loss: 0.0315
Current LR: 0.001

Epoch 10/40
------------------------------------------------------------------------------------------


Training:   0%|          | 0/1055 [00:00<?, ?it/s]

Train Eval:   0%|          | 0/1055 [00:00<?, ?it/s]

Validation:   0%|          | 0/118 [00:00<?, ?it/s]

Epoch 10/40 completed - Train Loss: 0.0076, Train Eval: 0.0064, Val Loss: 0.0345
Current LR: 0.001

Epoch 11/40
------------------------------------------------------------------------------------------


Training:   0%|          | 0/1055 [00:00<?, ?it/s]

Train Eval:   0%|          | 0/1055 [00:00<?, ?it/s]

Validation:   0%|          | 0/118 [00:00<?, ?it/s]

Epoch 11/40 completed - Train Loss: 0.0073, Train Eval: 0.0071, Val Loss: 0.0305
Current LR: 0.001
New best model saved with Val Loss: 0.0305

Epoch 12/40
------------------------------------------------------------------------------------------


Training:   0%|          | 0/1055 [00:00<?, ?it/s]

Train Eval:   0%|          | 0/1055 [00:00<?, ?it/s]

Validation:   0%|          | 0/118 [00:00<?, ?it/s]

Epoch 12/40 completed - Train Loss: 0.0071, Train Eval: 0.0064, Val Loss: 0.0285
Current LR: 0.001
New best model saved with Val Loss: 0.0285

Epoch 13/40
------------------------------------------------------------------------------------------


Training:   0%|          | 0/1055 [00:00<?, ?it/s]

Train Eval:   0%|          | 0/1055 [00:00<?, ?it/s]

Validation:   0%|          | 0/118 [00:00<?, ?it/s]

Epoch 13/40 completed - Train Loss: 0.0068, Train Eval: 0.0067, Val Loss: 0.0263
Current LR: 0.001
New best model saved with Val Loss: 0.0263

Epoch 14/40
------------------------------------------------------------------------------------------


Training:   0%|          | 0/1055 [00:00<?, ?it/s]

Train Eval:   0%|          | 0/1055 [00:00<?, ?it/s]

Validation:   0%|          | 0/118 [00:00<?, ?it/s]

Epoch 14/40 completed - Train Loss: 0.0069, Train Eval: 0.0063, Val Loss: 0.0329
Current LR: 0.001

Epoch 15/40
------------------------------------------------------------------------------------------


Training:   0%|          | 0/1055 [00:00<?, ?it/s]

Train Eval:   0%|          | 0/1055 [00:00<?, ?it/s]

Validation:   0%|          | 0/118 [00:00<?, ?it/s]

Epoch 15/40 completed - Train Loss: 0.0066, Train Eval: 0.0062, Val Loss: 0.0281
Current LR: 0.001

Epoch 16/40
------------------------------------------------------------------------------------------


Training:   0%|          | 0/1055 [00:00<?, ?it/s]

Train Eval:   0%|          | 0/1055 [00:00<?, ?it/s]

Validation:   0%|          | 0/118 [00:00<?, ?it/s]

Epoch 16/40 completed - Train Loss: 0.0063, Train Eval: 0.0057, Val Loss: 0.0247
Current LR: 0.001
New best model saved with Val Loss: 0.0247

Epoch 17/40
------------------------------------------------------------------------------------------


Training:   0%|          | 0/1055 [00:00<?, ?it/s]

Train Eval:   0%|          | 0/1055 [00:00<?, ?it/s]

Validation:   0%|          | 0/118 [00:00<?, ?it/s]

Epoch 17/40 completed - Train Loss: 0.0063, Train Eval: 0.0057, Val Loss: 0.0328
Current LR: 0.001

Epoch 18/40
------------------------------------------------------------------------------------------


Training:   0%|          | 0/1055 [00:00<?, ?it/s]

Train Eval:   0%|          | 0/1055 [00:00<?, ?it/s]

Validation:   0%|          | 0/118 [00:00<?, ?it/s]

Epoch 18/40 completed - Train Loss: 0.0062, Train Eval: 0.0098, Val Loss: 0.0248
Current LR: 0.001

Epoch 19/40
------------------------------------------------------------------------------------------


Training:   0%|          | 0/1055 [00:00<?, ?it/s]

Train Eval:   0%|          | 0/1055 [00:00<?, ?it/s]

Validation:   0%|          | 0/118 [00:00<?, ?it/s]

Epoch 19/40 completed - Train Loss: 0.0068, Train Eval: 0.0055, Val Loss: 0.0258
Current LR: 0.001

Epoch 20/40
------------------------------------------------------------------------------------------


Training:   0%|          | 0/1055 [00:00<?, ?it/s]

Train Eval:   0%|          | 0/1055 [00:00<?, ?it/s]

Validation:   0%|          | 0/118 [00:00<?, ?it/s]

Epoch 20/40 completed - Train Loss: 0.0058, Train Eval: 0.0049, Val Loss: 0.0272
Current LR: 0.0005

Epoch 21/40
------------------------------------------------------------------------------------------


Training:   0%|          | 0/1055 [00:00<?, ?it/s]

Train Eval:   0%|          | 0/1055 [00:00<?, ?it/s]

Validation:   0%|          | 0/118 [00:00<?, ?it/s]

Epoch 21/40 completed - Train Loss: 0.0042, Train Eval: 0.0039, Val Loss: 0.0232
Current LR: 0.0005
New best model saved with Val Loss: 0.0232

Epoch 22/40
------------------------------------------------------------------------------------------


Training:   0%|          | 0/1055 [00:00<?, ?it/s]

Train Eval:   0%|          | 0/1055 [00:00<?, ?it/s]

Validation:   0%|          | 0/118 [00:00<?, ?it/s]

Epoch 22/40 completed - Train Loss: 0.0041, Train Eval: 0.0037, Val Loss: 0.0278
Current LR: 0.0005

Epoch 23/40
------------------------------------------------------------------------------------------


Training:   0%|          | 0/1055 [00:00<?, ?it/s]

Train Eval:   0%|          | 0/1055 [00:00<?, ?it/s]

Validation:   0%|          | 0/118 [00:00<?, ?it/s]

Epoch 23/40 completed - Train Loss: 0.0041, Train Eval: 0.0036, Val Loss: 0.0246
Current LR: 0.0005

Epoch 24/40
------------------------------------------------------------------------------------------


Training:   0%|          | 0/1055 [00:00<?, ?it/s]

Train Eval:   0%|          | 0/1055 [00:00<?, ?it/s]

Validation:   0%|          | 0/118 [00:00<?, ?it/s]

Epoch 24/40 completed - Train Loss: 0.0041, Train Eval: 0.0040, Val Loss: 0.0283
Current LR: 0.0005

Epoch 25/40
------------------------------------------------------------------------------------------


Training:   0%|          | 0/1055 [00:00<?, ?it/s]

Train Eval:   0%|          | 0/1055 [00:00<?, ?it/s]

Validation:   0%|          | 0/118 [00:00<?, ?it/s]

Epoch 25/40 completed - Train Loss: 0.0041, Train Eval: 0.0037, Val Loss: 0.0289
Current LR: 0.00025

Epoch 26/40
------------------------------------------------------------------------------------------


Training:   0%|          | 0/1055 [00:00<?, ?it/s]

Train Eval:   0%|          | 0/1055 [00:00<?, ?it/s]

Validation:   0%|          | 0/118 [00:00<?, ?it/s]

Epoch 26/40 completed - Train Loss: 0.0034, Train Eval: 0.0030, Val Loss: 0.0253
Current LR: 0.00025

Epoch 27/40
------------------------------------------------------------------------------------------


Training:   0%|          | 0/1055 [00:00<?, ?it/s]

Train Eval:   0%|          | 0/1055 [00:00<?, ?it/s]

Validation:   0%|          | 0/118 [00:00<?, ?it/s]

Epoch 27/40 completed - Train Loss: 0.0032, Train Eval: 0.0027, Val Loss: 0.0257
Current LR: 0.00025

Epoch 28/40
------------------------------------------------------------------------------------------


Training:   0%|          | 0/1055 [00:00<?, ?it/s]

Train Eval:   0%|          | 0/1055 [00:00<?, ?it/s]

Validation:   0%|          | 0/118 [00:00<?, ?it/s]

Epoch 28/40 completed - Train Loss: 0.0032, Train Eval: 0.0028, Val Loss: 0.0263
Current LR: 0.00025

Epoch 29/40
------------------------------------------------------------------------------------------


Training:   0%|          | 0/1055 [00:00<?, ?it/s]

Train Eval:   0%|          | 0/1055 [00:00<?, ?it/s]

Validation:   0%|          | 0/118 [00:00<?, ?it/s]

Epoch 29/40 completed - Train Loss: 0.0032, Train Eval: 0.0028, Val Loss: 0.0310
Current LR: 0.000125
Early stopping triggered at epoch 29
Fold 4 (HW3-004) Loss: 0.0310

Fold 5/10 - Validation Participant: HW3-005
------------------------------------------------------------
Training samples: 539730, Validation samples: 59970

Model Structure for Fold 5:
PureTransformerModel(
  (short_term_embed): Sequential(
    (0): Linear(in_features=3, out_features=32, bias=True)
    (1): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
    (2): ReLU()
    (3): Dropout(p=0.010000000000000002, inplace=False)
  )
  (medium_term_embed): Sequential(
    (0): Linear(in_features=9, out_features=32, bias=True)
    (1): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
    (2): ReLU()
    (3): Dropout(p=0.010000000000000002, inplace=False)
  )
  (long_term_embed): Sequential(
    (0): Linear(in_features=15, out_features=32, bias=True)
    (1): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
    

Training:   0%|          | 0/1055 [00:00<?, ?it/s]

Train Eval:   0%|          | 0/1055 [00:00<?, ?it/s]

Validation:   0%|          | 0/118 [00:00<?, ?it/s]

Epoch 1/40 completed - Train Loss: 0.0844, Train Eval: 0.0271, Val Loss: 0.3166
Current LR: 0.001
New best model saved with Val Loss: 0.3166

Epoch 2/40
------------------------------------------------------------------------------------------


Training:   0%|          | 0/1055 [00:00<?, ?it/s]

Train Eval:   0%|          | 0/1055 [00:00<?, ?it/s]

Validation:   0%|          | 0/118 [00:00<?, ?it/s]

Epoch 2/40 completed - Train Loss: 0.0208, Train Eval: 0.0109, Val Loss: 0.2639
Current LR: 0.001
New best model saved with Val Loss: 0.2639

Epoch 3/40
------------------------------------------------------------------------------------------


Training:   0%|          | 0/1055 [00:00<?, ?it/s]

Train Eval:   0%|          | 0/1055 [00:00<?, ?it/s]

Validation:   0%|          | 0/118 [00:00<?, ?it/s]

Epoch 3/40 completed - Train Loss: 0.0116, Train Eval: 0.0105, Val Loss: 0.2579
Current LR: 0.001
New best model saved with Val Loss: 0.2579

Epoch 4/40
------------------------------------------------------------------------------------------


Training:   0%|          | 0/1055 [00:00<?, ?it/s]

Train Eval:   0%|          | 0/1055 [00:00<?, ?it/s]

Validation:   0%|          | 0/118 [00:00<?, ?it/s]

Epoch 4/40 completed - Train Loss: 0.0087, Train Eval: 0.0044, Val Loss: 0.2345
Current LR: 0.001
New best model saved with Val Loss: 0.2345

Epoch 5/40
------------------------------------------------------------------------------------------


Training:   0%|          | 0/1055 [00:00<?, ?it/s]

Train Eval:   0%|          | 0/1055 [00:00<?, ?it/s]

Validation:   0%|          | 0/118 [00:00<?, ?it/s]

Epoch 5/40 completed - Train Loss: 0.0051, Train Eval: 0.0037, Val Loss: 0.2262
Current LR: 0.001
New best model saved with Val Loss: 0.2262

Epoch 6/40
------------------------------------------------------------------------------------------


Training:   0%|          | 0/1055 [00:00<?, ?it/s]

Train Eval:   0%|          | 0/1055 [00:00<?, ?it/s]

Validation:   0%|          | 0/118 [00:00<?, ?it/s]

Epoch 6/40 completed - Train Loss: 0.0052, Train Eval: 0.0033, Val Loss: 0.2240
Current LR: 0.001
New best model saved with Val Loss: 0.2240

Epoch 7/40
------------------------------------------------------------------------------------------


Training:   0%|          | 0/1055 [00:00<?, ?it/s]

Train Eval:   0%|          | 0/1055 [00:00<?, ?it/s]

Validation:   0%|          | 0/118 [00:00<?, ?it/s]

Epoch 7/40 completed - Train Loss: 0.0039, Train Eval: 0.0033, Val Loss: 0.2204
Current LR: 0.001
New best model saved with Val Loss: 0.2204

Epoch 8/40
------------------------------------------------------------------------------------------


Training:   0%|          | 0/1055 [00:00<?, ?it/s]

Train Eval:   0%|          | 0/1055 [00:00<?, ?it/s]

Validation:   0%|          | 0/118 [00:00<?, ?it/s]

Epoch 8/40 completed - Train Loss: 0.0037, Train Eval: 0.0027, Val Loss: 0.2257
Current LR: 0.001

Epoch 9/40
------------------------------------------------------------------------------------------


Training:   0%|          | 0/1055 [00:00<?, ?it/s]

Train Eval:   0%|          | 0/1055 [00:00<?, ?it/s]

Validation:   0%|          | 0/118 [00:00<?, ?it/s]

Epoch 9/40 completed - Train Loss: 0.0035, Train Eval: 0.0028, Val Loss: 0.2256
Current LR: 0.001

Epoch 10/40
------------------------------------------------------------------------------------------


Training:   0%|          | 0/1055 [00:00<?, ?it/s]

Train Eval:   0%|          | 0/1055 [00:00<?, ?it/s]

Validation:   0%|          | 0/118 [00:00<?, ?it/s]

Epoch 10/40 completed - Train Loss: 0.0047, Train Eval: 0.0025, Val Loss: 0.2245
Current LR: 0.001

Epoch 11/40
------------------------------------------------------------------------------------------


Training:   0%|          | 0/1055 [00:00<?, ?it/s]

Train Eval:   0%|          | 0/1055 [00:00<?, ?it/s]

Validation:   0%|          | 0/118 [00:00<?, ?it/s]

Epoch 11/40 completed - Train Loss: 0.0030, Train Eval: 0.0027, Val Loss: 0.2189
Current LR: 0.001
New best model saved with Val Loss: 0.2189

Epoch 12/40
------------------------------------------------------------------------------------------


Training:   0%|          | 0/1055 [00:00<?, ?it/s]

Train Eval:   0%|          | 0/1055 [00:00<?, ?it/s]

Validation:   0%|          | 0/118 [00:00<?, ?it/s]

Epoch 12/40 completed - Train Loss: 0.0031, Train Eval: 0.0025, Val Loss: 0.2227
Current LR: 0.001

Epoch 13/40
------------------------------------------------------------------------------------------


Training:   0%|          | 0/1055 [00:00<?, ?it/s]

Train Eval:   0%|          | 0/1055 [00:00<?, ?it/s]

Validation:   0%|          | 0/118 [00:00<?, ?it/s]

Epoch 13/40 completed - Train Loss: 0.0030, Train Eval: 0.0026, Val Loss: 0.2156
Current LR: 0.001
New best model saved with Val Loss: 0.2156

Epoch 14/40
------------------------------------------------------------------------------------------


Training:   0%|          | 0/1055 [00:00<?, ?it/s]

Train Eval:   0%|          | 0/1055 [00:00<?, ?it/s]

Validation:   0%|          | 0/118 [00:00<?, ?it/s]

Epoch 14/40 completed - Train Loss: 0.0029, Train Eval: 0.0022, Val Loss: 0.2193
Current LR: 0.001

Epoch 15/40
------------------------------------------------------------------------------------------


Training:   0%|          | 0/1055 [00:00<?, ?it/s]

Train Eval:   0%|          | 0/1055 [00:00<?, ?it/s]

Validation:   0%|          | 0/118 [00:00<?, ?it/s]

Epoch 15/40 completed - Train Loss: 0.0029, Train Eval: 0.0023, Val Loss: 0.2163
Current LR: 0.001

Epoch 16/40
------------------------------------------------------------------------------------------


Training:   0%|          | 0/1055 [00:00<?, ?it/s]

Train Eval:   0%|          | 0/1055 [00:00<?, ?it/s]

Validation:   0%|          | 0/118 [00:00<?, ?it/s]

Epoch 16/40 completed - Train Loss: 0.0028, Train Eval: 0.0022, Val Loss: 0.2216
Current LR: 0.001

Epoch 17/40
------------------------------------------------------------------------------------------


Training:   0%|          | 0/1055 [00:00<?, ?it/s]

Train Eval:   0%|          | 0/1055 [00:00<?, ?it/s]

Validation:   0%|          | 0/118 [00:00<?, ?it/s]

Epoch 17/40 completed - Train Loss: 0.0028, Train Eval: 0.0022, Val Loss: 0.2126
Current LR: 0.001
New best model saved with Val Loss: 0.2126

Epoch 18/40
------------------------------------------------------------------------------------------


Training:   0%|          | 0/1055 [00:00<?, ?it/s]

Train Eval:   0%|          | 0/1055 [00:00<?, ?it/s]

Validation:   0%|          | 0/118 [00:00<?, ?it/s]

Epoch 18/40 completed - Train Loss: 0.0028, Train Eval: 0.0025, Val Loss: 0.2148
Current LR: 0.001

Epoch 19/40
------------------------------------------------------------------------------------------


Training:   0%|          | 0/1055 [00:00<?, ?it/s]

Train Eval:   0%|          | 0/1055 [00:00<?, ?it/s]

Validation:   0%|          | 0/118 [00:00<?, ?it/s]

Epoch 19/40 completed - Train Loss: 0.0028, Train Eval: 0.0024, Val Loss: 0.2150
Current LR: 0.001

Epoch 20/40
------------------------------------------------------------------------------------------


Training:   0%|          | 0/1055 [00:00<?, ?it/s]

Train Eval:   0%|          | 0/1055 [00:00<?, ?it/s]

Validation:   0%|          | 0/118 [00:00<?, ?it/s]

Epoch 20/40 completed - Train Loss: 0.0027, Train Eval: 0.0027, Val Loss: 0.2109
Current LR: 0.001
New best model saved with Val Loss: 0.2109

Epoch 21/40
------------------------------------------------------------------------------------------


Training:   0%|          | 0/1055 [00:00<?, ?it/s]

Train Eval:   0%|          | 0/1055 [00:00<?, ?it/s]

Validation:   0%|          | 0/118 [00:00<?, ?it/s]

Epoch 21/40 completed - Train Loss: 0.0026, Train Eval: 0.0023, Val Loss: 0.2109
Current LR: 0.001
New best model saved with Val Loss: 0.2109

Epoch 22/40
------------------------------------------------------------------------------------------


Training:   0%|          | 0/1055 [00:00<?, ?it/s]

Train Eval:   0%|          | 0/1055 [00:00<?, ?it/s]

Validation:   0%|          | 0/118 [00:00<?, ?it/s]

Epoch 22/40 completed - Train Loss: 0.0026, Train Eval: 0.0022, Val Loss: 0.2119
Current LR: 0.001

Epoch 23/40
------------------------------------------------------------------------------------------


Training:   0%|          | 0/1055 [00:00<?, ?it/s]

Train Eval:   0%|          | 0/1055 [00:00<?, ?it/s]

Validation:   0%|          | 0/118 [00:00<?, ?it/s]

Epoch 23/40 completed - Train Loss: 0.0026, Train Eval: 0.0023, Val Loss: 0.2243
Current LR: 0.001

Epoch 24/40
------------------------------------------------------------------------------------------


Training:   0%|          | 0/1055 [00:00<?, ?it/s]

Train Eval:   0%|          | 0/1055 [00:00<?, ?it/s]

Validation:   0%|          | 0/118 [00:00<?, ?it/s]

Epoch 24/40 completed - Train Loss: 0.0025, Train Eval: 0.0021, Val Loss: 0.2120
Current LR: 0.001

Epoch 25/40
------------------------------------------------------------------------------------------


Training:   0%|          | 0/1055 [00:00<?, ?it/s]

Train Eval:   0%|          | 0/1055 [00:00<?, ?it/s]

Validation:   0%|          | 0/118 [00:00<?, ?it/s]

Epoch 25/40 completed - Train Loss: 0.0025, Train Eval: 0.0021, Val Loss: 0.2135
Current LR: 0.0005

Epoch 26/40
------------------------------------------------------------------------------------------


Training:   0%|          | 0/1055 [00:00<?, ?it/s]

Train Eval:   0%|          | 0/1055 [00:00<?, ?it/s]

Validation:   0%|          | 0/118 [00:00<?, ?it/s]

Epoch 26/40 completed - Train Loss: 0.0018, Train Eval: 0.0015, Val Loss: 0.2112
Current LR: 0.0005

Epoch 27/40
------------------------------------------------------------------------------------------


Training:   0%|          | 0/1055 [00:00<?, ?it/s]

Train Eval:   0%|          | 0/1055 [00:00<?, ?it/s]

Validation:   0%|          | 0/118 [00:00<?, ?it/s]

Epoch 27/40 completed - Train Loss: 0.0018, Train Eval: 0.0016, Val Loss: 0.2121
Current LR: 0.0005

Epoch 28/40
------------------------------------------------------------------------------------------


Training:   0%|          | 0/1055 [00:00<?, ?it/s]

Train Eval:   0%|          | 0/1055 [00:00<?, ?it/s]

Validation:   0%|          | 0/118 [00:00<?, ?it/s]

Epoch 28/40 completed - Train Loss: 0.0018, Train Eval: 0.0016, Val Loss: 0.2081
Current LR: 0.0005
New best model saved with Val Loss: 0.2081

Epoch 29/40
------------------------------------------------------------------------------------------


Training:   0%|          | 0/1055 [00:00<?, ?it/s]

Train Eval:   0%|          | 0/1055 [00:00<?, ?it/s]

Validation:   0%|          | 0/118 [00:00<?, ?it/s]

Epoch 29/40 completed - Train Loss: 0.0018, Train Eval: 0.0017, Val Loss: 0.2084
Current LR: 0.0005

Epoch 30/40
------------------------------------------------------------------------------------------


Training:   0%|          | 0/1055 [00:00<?, ?it/s]

Train Eval:   0%|          | 0/1055 [00:00<?, ?it/s]

Validation:   0%|          | 0/118 [00:00<?, ?it/s]

Epoch 30/40 completed - Train Loss: 0.0019, Train Eval: 0.0017, Val Loss: 0.2104
Current LR: 0.0005

Epoch 31/40
------------------------------------------------------------------------------------------


Training:   0%|          | 0/1055 [00:00<?, ?it/s]

Train Eval:   0%|          | 0/1055 [00:00<?, ?it/s]

Validation:   0%|          | 0/118 [00:00<?, ?it/s]

Epoch 31/40 completed - Train Loss: 0.0019, Train Eval: 0.0016, Val Loss: 0.2097
Current LR: 0.0005

Epoch 32/40
------------------------------------------------------------------------------------------


Training:   0%|          | 0/1055 [00:00<?, ?it/s]

Train Eval:   0%|          | 0/1055 [00:00<?, ?it/s]

Validation:   0%|          | 0/118 [00:00<?, ?it/s]

Epoch 32/40 completed - Train Loss: 0.0019, Train Eval: 0.0016, Val Loss: 0.2093
Current LR: 0.00025

Epoch 33/40
------------------------------------------------------------------------------------------


Training:   0%|          | 0/1055 [00:00<?, ?it/s]

Train Eval:   0%|          | 0/1055 [00:00<?, ?it/s]

Validation:   0%|          | 0/118 [00:00<?, ?it/s]

Epoch 33/40 completed - Train Loss: 0.0016, Train Eval: 0.0014, Val Loss: 0.2071
Current LR: 0.00025
New best model saved with Val Loss: 0.2071

Epoch 34/40
------------------------------------------------------------------------------------------


Training:   0%|          | 0/1055 [00:00<?, ?it/s]

Train Eval:   0%|          | 0/1055 [00:00<?, ?it/s]

Validation:   0%|          | 0/118 [00:00<?, ?it/s]

Epoch 34/40 completed - Train Loss: 0.0016, Train Eval: 0.0014, Val Loss: 0.2075
Current LR: 0.00025

Epoch 35/40
------------------------------------------------------------------------------------------


Training:   0%|          | 0/1055 [00:00<?, ?it/s]

Train Eval:   0%|          | 0/1055 [00:00<?, ?it/s]

Validation:   0%|          | 0/118 [00:00<?, ?it/s]

Epoch 35/40 completed - Train Loss: 0.0016, Train Eval: 0.0014, Val Loss: 0.2060
Current LR: 0.00025
New best model saved with Val Loss: 0.2060

Epoch 36/40
------------------------------------------------------------------------------------------


Training:   0%|          | 0/1055 [00:00<?, ?it/s]

Train Eval:   0%|          | 0/1055 [00:00<?, ?it/s]

Validation:   0%|          | 0/118 [00:00<?, ?it/s]

Epoch 36/40 completed - Train Loss: 0.0016, Train Eval: 0.0014, Val Loss: 0.2073
Current LR: 0.00025

Epoch 37/40
------------------------------------------------------------------------------------------


Training:   0%|          | 0/1055 [00:00<?, ?it/s]

Train Eval:   0%|          | 0/1055 [00:00<?, ?it/s]

Validation:   0%|          | 0/118 [00:00<?, ?it/s]

Epoch 37/40 completed - Train Loss: 0.0016, Train Eval: 0.0014, Val Loss: 0.2062
Current LR: 0.00025

Epoch 38/40
------------------------------------------------------------------------------------------


Training:   0%|          | 0/1055 [00:00<?, ?it/s]

Train Eval:   0%|          | 0/1055 [00:00<?, ?it/s]

Validation:   0%|          | 0/118 [00:00<?, ?it/s]

Epoch 38/40 completed - Train Loss: 0.0016, Train Eval: 0.0014, Val Loss: 0.2087
Current LR: 0.00025

Epoch 39/40
------------------------------------------------------------------------------------------


Training:   0%|          | 0/1055 [00:00<?, ?it/s]

Train Eval:   0%|          | 0/1055 [00:00<?, ?it/s]

Validation:   0%|          | 0/118 [00:00<?, ?it/s]

Epoch 39/40 completed - Train Loss: 0.0016, Train Eval: 0.0014, Val Loss: 0.2075
Current LR: 0.000125

Epoch 40/40
------------------------------------------------------------------------------------------


Training:   0%|          | 0/1055 [00:00<?, ?it/s]

Train Eval:   0%|          | 0/1055 [00:00<?, ?it/s]

Validation:   0%|          | 0/118 [00:00<?, ?it/s]

Epoch 40/40 completed - Train Loss: 0.0015, Train Eval: 0.0013, Val Loss: 0.2067
Current LR: 0.000125
Fold 5 (HW3-005) Loss: 0.2067

Fold 6/10 - Validation Participant: HW3-006
------------------------------------------------------------
Training samples: 539730, Validation samples: 59970

Model Structure for Fold 6:
PureTransformerModel(
  (short_term_embed): Sequential(
    (0): Linear(in_features=3, out_features=32, bias=True)
    (1): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
    (2): ReLU()
    (3): Dropout(p=0.010000000000000002, inplace=False)
  )
  (medium_term_embed): Sequential(
    (0): Linear(in_features=9, out_features=32, bias=True)
    (1): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
    (2): ReLU()
    (3): Dropout(p=0.010000000000000002, inplace=False)
  )
  (long_term_embed): Sequential(
    (0): Linear(in_features=15, out_features=32, bias=True)
    (1): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
    (2): ReLU()
    (3): Dropout(p=0.0100

Training:   0%|          | 0/1055 [00:00<?, ?it/s]

Train Eval:   0%|          | 0/1055 [00:00<?, ?it/s]

Validation:   0%|          | 0/118 [00:00<?, ?it/s]

Epoch 1/40 completed - Train Loss: 0.0977, Train Eval: 0.0339, Val Loss: 0.2472
Current LR: 0.001
New best model saved with Val Loss: 0.2472

Epoch 2/40
------------------------------------------------------------------------------------------


Training:   0%|          | 0/1055 [00:00<?, ?it/s]

Train Eval:   0%|          | 0/1055 [00:00<?, ?it/s]

Validation:   0%|          | 0/118 [00:00<?, ?it/s]

Epoch 2/40 completed - Train Loss: 0.0252, Train Eval: 0.0197, Val Loss: 0.2787
Current LR: 0.001

Epoch 3/40
------------------------------------------------------------------------------------------


Training:   0%|          | 0/1055 [00:00<?, ?it/s]

Train Eval:   0%|          | 0/1055 [00:00<?, ?it/s]

Validation:   0%|          | 0/118 [00:00<?, ?it/s]

Epoch 3/40 completed - Train Loss: 0.0175, Train Eval: 0.0140, Val Loss: 0.3307
Current LR: 0.001

Epoch 4/40
------------------------------------------------------------------------------------------


Training:   0%|          | 0/1055 [00:00<?, ?it/s]

Train Eval:   0%|          | 0/1055 [00:00<?, ?it/s]

Validation:   0%|          | 0/118 [00:00<?, ?it/s]

Epoch 4/40 completed - Train Loss: 0.0121, Train Eval: 0.0100, Val Loss: 0.2797
Current LR: 0.001

Epoch 5/40
------------------------------------------------------------------------------------------


Training:   0%|          | 0/1055 [00:00<?, ?it/s]

Train Eval:   0%|          | 0/1055 [00:00<?, ?it/s]

Validation:   0%|          | 0/118 [00:00<?, ?it/s]

Epoch 5/40 completed - Train Loss: 0.0104, Train Eval: 0.0123, Val Loss: 0.2924
Current LR: 0.0005

Epoch 6/40
------------------------------------------------------------------------------------------


Training:   0%|          | 0/1055 [00:00<?, ?it/s]

Train Eval:   0%|          | 0/1055 [00:00<?, ?it/s]

Validation:   0%|          | 0/118 [00:00<?, ?it/s]

Epoch 6/40 completed - Train Loss: 0.0078, Train Eval: 0.0069, Val Loss: 0.2939
Current LR: 0.0005

Epoch 7/40
------------------------------------------------------------------------------------------


Training:   0%|          | 0/1055 [00:00<?, ?it/s]

Train Eval:   0%|          | 0/1055 [00:00<?, ?it/s]

Validation:   0%|          | 0/118 [00:00<?, ?it/s]

Epoch 7/40 completed - Train Loss: 0.0073, Train Eval: 0.0069, Val Loss: 0.3064
Current LR: 0.0005

Epoch 8/40
------------------------------------------------------------------------------------------


Training:   0%|          | 0/1055 [00:00<?, ?it/s]

Train Eval:   0%|          | 0/1055 [00:00<?, ?it/s]

Validation:   0%|          | 0/118 [00:00<?, ?it/s]

Epoch 8/40 completed - Train Loss: 0.0070, Train Eval: 0.0068, Val Loss: 0.3048
Current LR: 0.0005

Epoch 9/40
------------------------------------------------------------------------------------------


Training:   0%|          | 0/1055 [00:00<?, ?it/s]

Train Eval:   0%|          | 0/1055 [00:00<?, ?it/s]

Validation:   0%|          | 0/118 [00:00<?, ?it/s]

Epoch 9/40 completed - Train Loss: 0.0069, Train Eval: 0.0062, Val Loss: 0.2968
Current LR: 0.00025
Early stopping triggered at epoch 9
Fold 6 (HW3-006) Loss: 0.2968

Fold 7/10 - Validation Participant: HW3-007
------------------------------------------------------------
Training samples: 539730, Validation samples: 59970

Model Structure for Fold 7:
PureTransformerModel(
  (short_term_embed): Sequential(
    (0): Linear(in_features=3, out_features=32, bias=True)
    (1): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
    (2): ReLU()
    (3): Dropout(p=0.010000000000000002, inplace=False)
  )
  (medium_term_embed): Sequential(
    (0): Linear(in_features=9, out_features=32, bias=True)
    (1): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
    (2): ReLU()
    (3): Dropout(p=0.010000000000000002, inplace=False)
  )
  (long_term_embed): Sequential(
    (0): Linear(in_features=15, out_features=32, bias=True)
    (1): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
    (2)

Training:   0%|          | 0/1055 [00:00<?, ?it/s]

Train Eval:   0%|          | 0/1055 [00:00<?, ?it/s]

Validation:   0%|          | 0/118 [00:00<?, ?it/s]

Epoch 1/40 completed - Train Loss: 0.0991, Train Eval: 0.0325, Val Loss: 0.2199
Current LR: 0.001
New best model saved with Val Loss: 0.2199

Epoch 2/40
------------------------------------------------------------------------------------------


Training:   0%|          | 0/1055 [00:00<?, ?it/s]

In [None]:
# Initialise the final model with the best weights
final_model = PureTransformerModel(seq_length, num_features=X_feat.shape[1]).to(device)
final_model.load_state_dict(best_models[best_fold_idx])

# Plot training history for the best fold
best_history = all_histories[best_fold_idx]
plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
plt.plot(best_history['train_loss'], label='Train Loss')
plt.plot(best_history['val_loss'], label='Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title(f'Best Fold (#{best_fold_idx+1}) Training History')
plt.legend()
plt.grid(True)

# Plot average training history across all folds
plt.subplot(1, 2, 2)
max_epochs = max(len(h['train_loss']) for h in all_histories)
padded_train_losses = []
padded_val_losses = []

for h in all_histories:
    train_loss = h['train_loss'] + [h['train_loss'][-1]] * (max_epochs - len(h['train_loss']))
    val_loss = h['val_loss'] + [h['val_loss'][-1]] * (max_epochs - len(h['val_loss']))
    
    padded_train_losses.append(train_loss)
    padded_val_losses.append(val_loss)

avg_train_loss = np.mean(padded_train_losses, axis=0)
avg_val_loss = np.mean(padded_val_losses, axis=0)

plt.plot(avg_train_loss, label='Avg Train Loss')
plt.plot(avg_val_loss, label='Avg Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Average Training History Across All Folds')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

# Ensemble prediction function (using all trained models)
def ensemble_predict(models, X_time_test, X_feat_test, batch_size=512):
    predictions = []
    
    # Create test dataset
    test_dataset = TensorDataset(
        torch.tensor(X_time_test, dtype=torch.float32),
        torch.tensor(X_feat_test, dtype=torch.float32)
    )
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
    
    # Predict with each model
    for model_state in models:
        model = PureTransformerModel(seq_length, num_features=X_feat.shape[1]).to(device)
        model.load_state_dict(model_state)
        model.eval()
        
        fold_preds = []
        with torch.no_grad():
            for batch_X_time, batch_X_feat in test_loader:
                batch_X_time = batch_X_time.to(device)
                batch_X_feat = batch_X_feat.to(device)
                
                output = model(batch_X_time, batch_X_feat)
                fold_preds.append(output.cpu().numpy())
                
                # Free memory
                del batch_X_time, batch_X_feat, output
        
        # Concatenate predictions for this fold
        fold_preds = np.vstack(fold_preds)
        predictions.append(fold_preds)
    
    # Average predictions from all models
    ensemble_predictions = np.mean(predictions, axis=0)
    
    return ensemble_predictions

# Function to evaluate on the entire dataset using the ensemble
def evaluate_ensemble(models, X_time, X_feat, y, batch_size=512):
    # Get ensemble predictions
    predictions = ensemble_predict(models, X_time, X_feat, batch_size)
    
    # Inverse transform predictions and targets back to original scale
    # Updated to use the scalers dictionary
    predictions = scalers['vicon_xyz'].inverse_transform(predictions)
    
    # For VIVE original, we need to inverse transform the last timestep of X_time
    # X_time contains scaled VIVE XYZ data, so we use vive_xyz scaler
    vive_original = scalers['vive_xyz'].inverse_transform(X_time[:, -1])  # Shape: (samples, 3)
    
    # Inverse transform target y
    y_original = scalers['vicon_xyz'].inverse_transform(y)
    
    # Calculate metrics for Ensemble vs Vicon
    ensemble_mse = np.mean((predictions - y_original) ** 2)
    ensemble_rmse = np.sqrt(ensemble_mse)
    ensemble_mae = np.mean(np.abs(predictions - y_original))
    
    # Calculate metrics for Original VIVE vs Vicon
    vive_mse = np.mean((vive_original - y_original) ** 2)
    vive_rmse = np.sqrt(vive_mse)
    vive_mae = np.mean(np.abs(vive_original - y_original))
    
    # Calculate improvement
    rmse_improvement = ((vive_rmse - ensemble_rmse) / vive_rmse) * 100
    mae_improvement = ((vive_mae - ensemble_mae) / vive_mae) * 100
    
    print(f"Model Performance Comparison:")
    print(f"{'='*50}")
    print(f"Original VIVE vs Vicon:")
    print(f"  RMSE: {vive_rmse:.4f}")
    print(f"  MAE:  {vive_mae:.4f}")
    print(f"")
    print(f"Ensemble Model vs Vicon:")
    print(f"  RMSE: {ensemble_rmse:.4f}")
    print(f"  MAE:  {ensemble_mae:.4f}")
    print(f"")
    print(f"Improvement:")
    print(f"  RMSE: {rmse_improvement:+.2f}% {'(Better)' if rmse_improvement > 0 else '(Worse)'}")
    print(f"  MAE:  {mae_improvement:+.2f}% {'(Better)' if mae_improvement > 0 else '(Worse)'}")
    
    # Visualize predictions vs actual
    plt.figure(figsize=(15, 10))
    
    # Plot a sample of the data (first 4000 points)
    sample_size = min(4000, len(predictions))
    
    # Time series comparison - show each XYZ dimension
    for i, dim in enumerate(['X', 'Y', 'Z']):
        plt.subplot(2, 3, i+1)
        plt.plot(y_original[:sample_size, i], label=f'Actual Vicon {dim}', linewidth=2)
        plt.plot(predictions[:sample_size, i], label=f'Predicted {dim}', alpha=0.8)
        plt.plot(vive_original[:sample_size, i], label=f'Original VIVE {dim}', alpha=0.7)
        plt.xlabel('Sample Index')
        plt.ylabel(f'{dim} Value')
        plt.title(f'{dim} Dimension Comparison')
        plt.legend()
        plt.grid(True)
    
    # Overall correlation plot for Ensemble
    plt.subplot(2, 3, 4)
    plt.scatter(y_original.flatten(), predictions.flatten(), alpha=0.5, s=1)
    plt.plot([y_original.min(), y_original.max()], [y_original.min(), y_original.max()], 'r--')
    plt.xlabel('Actual (Vicon)')
    plt.ylabel('Predicted (Ensemble)')
    plt.title(f'Ensemble Correlation (RMSE: {ensemble_rmse:.4f})')
    plt.grid(True)
    
    # Overall correlation plot for Original VIVE
    plt.subplot(2, 3, 5)
    plt.scatter(y_original.flatten(), vive_original.flatten(), alpha=0.5, s=1, color='orange')
    plt.plot([y_original.min(), y_original.max()], [y_original.min(), y_original.max()], 'r--')
    plt.xlabel('Actual (Vicon)')
    plt.ylabel('Original (VIVE)')
    plt.title(f'VIVE Correlation (RMSE: {vive_rmse:.4f})')
    plt.grid(True)
    
    # Error distribution comparison
    plt.subplot(2, 3, 6)
    ensemble_errors = predictions.flatten() - y_original.flatten()
    vive_errors = vive_original.flatten() - y_original.flatten()
    
    plt.hist(ensemble_errors, bins=50, alpha=0.7, label=f'Ensemble (std: {np.std(ensemble_errors):.4f})', density=True)
    plt.hist(vive_errors, bins=50, alpha=0.7, label=f'VIVE (std: {np.std(vive_errors):.4f})', density=True)
    plt.xlabel('Error (Predicted - Actual)')
    plt.ylabel('Density')
    plt.title('Error Distribution Comparison')
    plt.legend()
    plt.grid(True)
    
    plt.tight_layout()
    plt.show()
    
    # Print detailed statistics
    print(f"\nDetailed Error Statistics:")
    print(f"{'='*50}")
    print(f"Ensemble Model Errors:")
    print(f"  Mean Error: {np.mean(ensemble_errors):.4f}")
    print(f"  Std Error:  {np.std(ensemble_errors):.4f}")
    print(f"  Min Error:  {np.min(ensemble_errors):.4f}")
    print(f"  Max Error:  {np.max(ensemble_errors):.4f}")
    print(f"")
    print(f"Original VIVE Errors:")
    print(f"  Mean Error: {np.mean(vive_errors):.4f}")
    print(f"  Std Error:  {np.std(vive_errors):.4f}")
    print(f"  Min Error:  {np.min(vive_errors):.4f}")
    print(f"  Max Error:  {np.max(vive_errors):.4f}")
    
    # Print per-dimension statistics
    print(f"\nPer-Dimension RMSE:")
    print(f"{'='*30}")
    for i, dim in enumerate(['X', 'Y', 'Z']):
        ensemble_rmse_dim = np.sqrt(np.mean((predictions[:, i] - y_original[:, i]) ** 2))
        vive_rmse_dim = np.sqrt(np.mean((vive_original[:, i] - y_original[:, i]) ** 2))
        improvement_dim = ((vive_rmse_dim - ensemble_rmse_dim) / vive_rmse_dim) * 100
        
        print(f"{dim} Dimension:")
        print(f"  VIVE RMSE:     {vive_rmse_dim:.4f}")
        print(f"  Ensemble RMSE: {ensemble_rmse_dim:.4f}")
        print(f"  Improvement:   {improvement_dim:+.2f}%")
    
    return predictions, y_original, vive_original

# Evaluate the ensemble on the entire dataset
predictions, actuals, vive_original = evaluate_ensemble(best_models, X_time, X_feat, y)

# Save the best model
torch.save(final_model.state_dict(), 'best_vive_to_vicon_model_Transformer-only.pth')
print("Best model saved to 'best_vive_to_vicon_model_Transformer-only.pth'")

# Feature importance analysis
def analyze_feature_importance(model, X_time, X_feat, y, feature_names):
    # Base performance
    model.eval()
    base_preds = []
    
    # Create dataset
    dataset = TensorDataset(
        torch.tensor(X_time, dtype=torch.float32),
        torch.tensor(X_feat, dtype=torch.float32)
    )
    data_loader = DataLoader(dataset, batch_size=512, shuffle=False)
    
    with torch.no_grad():
        for batch_X_time, batch_X_feat in data_loader:
            batch_X_time = batch_X_time.to(device)
            batch_X_feat = batch_X_feat.to(device)
            
            output = model(batch_X_time, batch_X_feat)
            base_preds.append(output.cpu().numpy())
    
    base_preds = np.vstack(base_preds)
    base_mse = np.mean((base_preds - y) ** 2)  # Updated: y is now 3D (samples, 3)
    
    # Permutation importance
    importance_scores = []
    
    for i in range(X_feat.shape[1]):
        # Create a copy and permute one feature
        X_feat_permuted = X_feat.copy()
        X_feat_permuted[:, i] = np.random.permutation(X_feat_permuted[:, i])
        
        # Predict with permuted feature
        perm_preds = []
        
        dataset_perm = TensorDataset(
            torch.tensor(X_time, dtype=torch.float32),
            torch.tensor(X_feat_permuted, dtype=torch.float32)
        )
        loader_perm = DataLoader(dataset_perm, batch_size=512, shuffle=False)
        
        with torch.no_grad():
            for batch_X_time, batch_X_feat in loader_perm:
                batch_X_time = batch_X_time.to(device)
                batch_X_feat = batch_X_feat.to(device)
                
                output = model(batch_X_time, batch_X_feat)
                perm_preds.append(output.cpu().numpy())
        
        perm_preds = np.vstack(perm_preds)
        perm_mse = np.mean((perm_preds - y) ** 2)  # Updated: y is now 3D (samples, 3)
        
        # Importance is the increase in error
        importance = perm_mse - base_mse
        importance_scores.append(importance)
    
    # Visualise feature importance
    plt.figure(figsize=(12, 8))
    sorted_idx = np.argsort(importance_scores)
    plt.barh(range(len(feature_names)), [importance_scores[i] for i in sorted_idx])
    plt.yticks(range(len(feature_names)), [feature_names[i] for i in sorted_idx])
    plt.xlabel('Increase in MSE when feature is permuted')
    plt.title('Feature Importance Analysis')
    plt.grid(True, axis='x')
    plt.tight_layout()
    plt.show()
    
    return importance_scores

# Define feature names for visualization (updated to match new 7 features)
feature_names = [
    'Participant ID', 
    'Speed', 
    'Tracker Height',
    'VIVE Spatial Magnitude'
]

# Verify feature count matches
assert len(feature_names) == X_feat.shape[1], f"Feature names count ({len(feature_names)}) doesn't match actual features ({X_feat.shape[1]})"

# Analyze feature importance
importance_scores = analyze_feature_importance(final_model, X_time, X_feat, y, feature_names)
print("Feature importance analysis complete!")

# Function to save ensemble models
def save_ensemble_models(models, model_dir='saved_models'):
    """
    Save all models in the ensemble to disk
    
    Parameters:
    -----------
    models : list
        List of model state dictionaries
    model_dir : str
        Directory to save models in
    
    Returns:
    --------
    model_path : str
        Path to the saved ensemble
    """
    # Create directory if it doesn't exist
    os.makedirs(model_dir, exist_ok=True)
    
    # Create timestamp for unique filename
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    ensemble_path = os.path.join(model_dir, f"vive_vicon_CoM_ensemble_{timestamp}")
    os.makedirs(ensemble_path, exist_ok=True)
    
    # Save each model in the ensemble
    for i, model_state in enumerate(models):
        model_path = os.path.join(ensemble_path, f"model_fold_{i+1}.pth")
        torch.save(model_state, model_path)
    

# Updated ensemble models data structure
ensemble_models_data = {
    'model_states': best_models,
    'model_config': {
        'seq_length': seq_length,
        'num_features': X_feat.shape[1]  # Now 7 features
    },
    'preprocessing': {
        'scalers': scalers,  # Dictionary containing all scalers
        'participant_encoder': participant_encoder,
    }
}

# Save ensemble model
with open('feet_ensemble_models_Transformer-only.pkl', 'wb') as f:
    pickle.dump(ensemble_models_data, f)
print("Ensemble model has been saved to 'feet_ensemble_models_Transformer-only.pkl'")

In [None]:
def load_ensemble_models():
    with open('feet_ensemble_models_Transformer-only.pkl', 'rb') as f:
        ensemble_data = pickle.load(f)
    return ensemble_data

# Integrated prediction function
def ensemble_predict_custom(start_index, length, X_time, X_feat, y, tracker_info=None):
    """
    Make predictions using the ensemble model on a subset of data.
    
    Parameters:
    -----------
    start_index : int
        Starting index for the prediction subset
    length : int
        Number of samples to predict
    X_time : ndarray
        Time series features
    X_feat : ndarray
        Additional features
    y : ndarray
        Ground truth targets
    tracker_info : int, optional
        Tracker ID to filter data (1, 2, or 3). If None, uses all data.
    
    Returns:
    --------
    predictions_original : ndarray
        Ensemble predictions in original scale
    y_original : ndarray
        Ground truth in original scale
    vive_original : ndarray
        Original VIVE data in original scale
    """
    # Load ensemble model data
    ensemble_data = load_ensemble_models()
    scalers = ensemble_data['preprocessing']['scalers']
    
    # Filter data by tracker if specified
    if tracker_info is not None:
        # Extract tracker height feature from X_feat (3rd column, index 2)
        # Need to inverse transform to get original tracker values
        height_original = scalers['height'].inverse_transform(X_feat[:, 2].reshape(-1, 1)).flatten()
        
        # Round to integer to handle floating point precision issues
        height_rounded = np.round(height_original).astype(int)
        
        # Filter indices for the specified tracker
        tracker_mask = (height_rounded == tracker_info)
        tracker_indices = np.where(tracker_mask)[0]
        
        print(f"Filtering data for Tracker {tracker_info}...")
        print(f"Unique values (after rounding): {np.unique(height_rounded)}")
        
        if len(tracker_indices) == 0:
            print(f"Error: No data found for Tracker {tracker_info}")
            print(f"Available Tracker values: {np.unique(height_rounded)}")
            return None
        
        # Use filtered data
        X_time = X_time[tracker_mask]
        X_feat = X_feat[tracker_mask]
        y = y[tracker_mask]
        
        print(f"Successfully filtered Tracker {tracker_info} data: {len(X_time)} samples")
    
    # Validate index range
    end_index = min(start_index + length, len(X_time))
    if start_index >= len(X_time) or start_index < 0:
        print(f"Error: Start index out of range [0, {len(X_time)-1}]")
        return None
    
    # Extract data subset
    X_time_subset = X_time[start_index:end_index]
    X_feat_subset = X_feat[start_index:end_index]
    y_subset = y[start_index:end_index]
    
    # Create data loader
    test_dataset = TensorDataset(
        torch.tensor(X_time_subset, dtype=torch.float32),
        torch.tensor(X_feat_subset, dtype=torch.float32)
    )
    test_loader = DataLoader(test_dataset, batch_size=512, shuffle=False)
    
    # Make predictions using all models in the ensemble
    all_predictions = []
    
    for model_state in ensemble_data['model_states']:
        model = PureTransformerModel(
            ensemble_data['model_config']['seq_length'],
            num_features=ensemble_data['model_config']['num_features']
        ).to(device)
        model.load_state_dict(model_state)
        model.eval()
        
        fold_preds = []
        with torch.no_grad():
            for batch_X_time, batch_X_feat in test_loader:
                batch_X_time = batch_X_time.to(device)
                batch_X_feat = batch_X_feat.to(device)
                output = model(batch_X_time, batch_X_feat)
                fold_preds.append(output.cpu().numpy())
        
        fold_preds = np.vstack(fold_preds)
        all_predictions.append(fold_preds)
    
    # Average predictions across all models
    ensemble_predictions = np.mean(all_predictions, axis=0)
    
    # Inverse transform to original scale
    predictions_original = scalers['vicon_xyz'].inverse_transform(ensemble_predictions)
    y_original = scalers['vicon_xyz'].inverse_transform(y_subset)
    vive_original = scalers['vive_xyz'].inverse_transform(X_time_subset[:, -1])
    
    # Calculate evaluation metrics
    ensemble_mse = np.mean((predictions_original - y_original) ** 2)
    ensemble_rmse = np.sqrt(ensemble_mse)
    ensemble_mae = np.mean(np.abs(predictions_original - y_original))
    
    vive_mse = np.mean((vive_original - y_original) ** 2)
    vive_rmse = np.sqrt(vive_mse)
    vive_mae = np.mean(np.abs(vive_original - y_original))
    
    rmse_improvement = ((vive_rmse - ensemble_rmse) / vive_rmse) * 100
    mae_improvement = ((vive_mae - ensemble_mae) / vive_mae) * 100

    # Add tracker information to output
    tracker_label = f" (Tracker {tracker_info})" if tracker_info is not None else ""
    print(f"\nPrediction results for index range {start_index}-{end_index-1}{tracker_label}:")
    print(f"Original VIVE vs Vicon: RMSE={vive_rmse:.4f}, MAE={vive_mae:.4f}")
    print(f"Ensemble model vs Vicon: RMSE={ensemble_rmse:.4f}, MAE={ensemble_mae:.4f}")
    print(f"Improvement: RMSE {rmse_improvement:+.2f}%, MAE {mae_improvement:+.2f}%")
    
    # Visualization - Updated for 3D XYZ data
    sample_size = min(2000, len(predictions_original))
    
    # Add tracker information to plot titles
    title_suffix = f" - Tracker {tracker_info}" if tracker_info is not None else ""
    
    plt.figure(figsize=(15, 12))
    
    # Plot each XYZ dimension separately
    for i, dim in enumerate(['X', 'Y', 'Z']):
        plt.subplot(3, 2, i*2+1)
        plt.plot(y_original[:sample_size, i], label=f'Actual Vicon {dim}', linewidth=2)
        plt.plot(predictions_original[:sample_size, i], label=f'Predicted {dim}', alpha=0.8)
        plt.plot(vive_original[:sample_size, i], label=f'Original VIVE {dim}', alpha=0.7)
        plt.xlabel('Sample Index')
        plt.ylabel(f'{dim} Value')
        plt.title(f'{dim} Dimension Time Series{title_suffix}')
        plt.legend()
        plt.grid(True)
        
        plt.subplot(3, 2, i*2+2)
        plt.scatter(y_original[:sample_size, i], predictions_original[:sample_size, i], 
                   alpha=0.5, s=1, label='Ensemble')
        plt.scatter(y_original[:sample_size, i], vive_original[:sample_size, i], 
                   alpha=0.5, s=1, color='orange', label='VIVE')
        plt.plot([y_original[:sample_size, i].min(), y_original[:sample_size, i].max()], 
                [y_original[:sample_size, i].min(), y_original[:sample_size, i].max()], 'r--')
        plt.xlabel(f'Actual Vicon {dim}')
        plt.ylabel(f'Predicted {dim}')
        plt.title(f'{dim} Dimension Correlation{title_suffix}')
        plt.legend()
        plt.grid(True)
    
    plt.tight_layout()
    plt.show()
    
    # Overall error distribution
    plt.figure(figsize=(12, 4))
    
    for i, dim in enumerate(['X', 'Y', 'Z']):
        plt.subplot(1, 3, i+1)
        ensemble_errors = predictions_original[:, i] - y_original[:, i]
        vive_errors = vive_original[:, i] - y_original[:, i]
        
        plt.hist(ensemble_errors, bins=30, alpha=0.7, 
                label=f'Ensemble (std: {np.std(ensemble_errors):.4f})', density=True)
        plt.hist(vive_errors, bins=30, alpha=0.7, 
                label=f'VIVE (std: {np.std(vive_errors):.4f})', density=True)
        plt.xlabel(f'{dim} Error')
        plt.ylabel('Density')
        plt.title(f'{dim} Error Distribution{title_suffix}')
        plt.legend()
        plt.grid(True)
    
    plt.tight_layout()
    plt.show()
    
    # Print per-dimension statistics
    print(f"\nPer-Dimension Statistics:")
    print(f"{'='*40}")
    for i, dim in enumerate(['X', 'Y', 'Z']):
        ensemble_rmse_dim = np.sqrt(np.mean((predictions_original[:, i] - y_original[:, i]) ** 2))
        vive_rmse_dim = np.sqrt(np.mean((vive_original[:, i] - y_original[:, i]) ** 2))
        improvement_dim = ((vive_rmse_dim - ensemble_rmse_dim) / vive_rmse_dim) * 100
        
        print(f"{dim} Dimension:")
        print(f"  VIVE RMSE:     {vive_rmse_dim:.4f}")
        print(f"  Ensemble RMSE: {ensemble_rmse_dim:.4f}")
        print(f"  Improvement:   {improvement_dim:+.2f}%")
    
    return predictions_original, y_original, vive_original

# Test the prediction function
predictions, actual, vive = ensemble_predict_custom(
    start_index=100000,
    length=1000,
    X_time=X_time,
    X_feat=X_feat,
    y=y,
    tracker_info=3  # Tracker: 1, 2, or 3
)