In [None]:
%pip install optuna
%pip install sparsemax # https://pypi.org/project/sparsemax/

Collecting optuna
  Downloading optuna-4.2.1-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.14.1-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.9-py3-none-any.whl.metadata (2.9 kB)
Downloading optuna-4.2.1-py3-none-any.whl (383 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m383.6/383.6 kB[0m [31m16.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.14.1-py3-none-any.whl (233 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.6/233.6 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Downloading Mako-1.3.9-py3-none-any.whl (78 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.5/78.5 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Ma

In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from sparsemax import Sparsemax
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.datasets import fetch_california_housing
from scipy.stats import spearmanr
import matplotlib.pyplot as plt
from tqdm import tqdm
import optuna
import json

# Run on the GPU when PyTorch reports CUDA as available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Shared sparsemax activation over the last dimension; used by sparseMultiHeadAttention.
sparsemax = Sparsemax(dim=-1)

class TabularDataset(Dataset):
    """Map-style dataset over continuous tabular features.

    Features are stored as a float32 tensor. When targets are given they are
    stored as a float32 column vector of shape (n, 1); otherwise the dataset
    yields features only.
    """
    def __init__(self, X, y=None):
        self.X = torch.tensor(X, dtype=torch.float32)
        # Targets are optional so the same class can serve inference-only data.
        self.y = None if y is None else torch.tensor(y, dtype=torch.float32).reshape(-1, 1)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        # Unlabelled datasets return the bare feature row.
        if self.y is None:
            return self.X[idx]
        return self.X[idx], self.y[idx]

'''
The code below has been adapted from the original codebase.

For the implementation of the FT Transformer, please check out this repository: https://github.com/yandex-research/rtdl-revisiting-models

For the implementation of the Piecewise Linear Embedding, please check out: https://github.com/yandex-research/rtdl-num-embeddings
'''

class LinearEmbedding(nn.Module):
    """Linear embedding for continuous features.

    Every feature owns an independent ``nn.Linear(1, d_token)`` that maps its
    scalar value to a token vector.

    Args:
        num_features: number of per-feature linear layers to create.
        d_token: size of each produced token.
    """
    def __init__(self, num_features, d_token):
        super().__init__()
        self.d_token = d_token
        self.embeddings = nn.ModuleList([
            nn.Linear(1, d_token) for _ in range(num_features)
        ])

    def forward(self, x):
        """Embed ``x`` of shape (batch_size, num_features) into (batch_size, num_features, d_token)."""
        batch_size, num_features = x.shape[0], x.shape[1]

        if num_features == 0:
            # torch.stack rejects an empty list; return an empty token sequence instead.
            return torch.zeros((batch_size, 0, self.d_token), device=x.device)

        # x[:, i:i + 1] keeps the trailing dimension, so each layer sees (batch_size, 1)
        # and returns (batch_size, d_token). No squeeze is applied: squeezing the output
        # would collapse the token dimension when d_token == 1.
        tokens = [self.embeddings[i](x[:, i:i + 1]) for i in range(num_features)]

        return torch.stack(tokens, dim=1)  # (batch_size, num_features, d_token)

class PiecewiseLinearEmbedding(nn.Module):
    """Bin-based embedding for continuous features.

    For every feature the scalar value is compared against that feature's sorted
    bin boundaries, giving a vector of ``num_bins`` indicators which is then
    projected to ``d_token`` by a per-feature linear layer.

    NOTE(review): as implemented the bin encoding is a hard step function
    (each entry is exactly 0 or 1), not a linear interpolation inside a bin, and
    the comparison yields no gradient for ``bin_boundaries`` — they keep their
    random initial values during training. Confirm whether that is intended.

    Args:
        num_features: number of input features (one linear layer each).
        d_token: size of each produced token.
        num_bins: number of indicator entries per feature.
    """
    def __init__(self, num_features, d_token, num_bins=20):
        super().__init__()
        self.num_features = num_features
        self.d_token = d_token
        self.num_bins = num_bins

        # One projection per feature, from bin indicators to the token space
        self.embeddings = nn.ModuleList([
            nn.Linear(num_bins, d_token) for _ in range(num_features)
        ])

        # num_bins - 1 boundaries per feature, randomly initialised
        self.bin_boundaries = nn.Parameter(torch.randn(num_features, num_bins-1))

    def forward(self, x):
        """Embed ``x`` of shape (batch_size, num_features) into (batch_size, num_features, d_token)."""
        batch_size = x.shape[0]

        embedded = torch.zeros((batch_size, self.num_features, self.d_token), device=x.device)

        # The last bin has no boundary of its own and is always active.
        always_on = torch.ones((batch_size, 1), device=x.device)

        for i in range(self.num_features):
            feature_values = x[:, i].unsqueeze(1)  # (batch_size, 1)
            boundaries = torch.sort(self.bin_boundaries[i]).values  # (num_bins-1,)

            # Bin j is active unless the value exceeds boundary j. A single broadcast
            # comparison replaces the former per-boundary loop and produces the same
            # values. ``~(a > b)`` rather than ``a <= b`` keeps NaN inputs mapped to 1.
            not_above = ~(feature_values > boundaries)  # (batch_size, num_bins-1)
            bin_activations = torch.cat([not_above.to(always_on.dtype), always_on], dim=1)

            embedded[:, i] = self.embeddings[i](bin_activations)  # (batch_size, d_token)

        return embedded  # (batch_size, num_features, d_token)

# Custom attention module to capture attention weights
class MultiHeadAttention(nn.Module):
    """Scaled dot-product multi-head attention with softmax normalisation.

    The attention map of the most recent forward pass is kept in
    ``self.attention_weights`` with shape (batch, num_heads, q_len, k_len) so it
    can be inspected afterwards.
    """
    def __init__(self, d_model, num_heads):
        super().__init__()
        self.d_model = d_model
        self.num_heads = num_heads
        self.head_dim = d_model // num_heads

        # Heads must tile the model dimension exactly
        assert self.head_dim * num_heads == d_model, "d_model must be divisible by num_heads"

        # Query / key / value / output projections, created in this order
        for proj_name in ("q_proj", "k_proj", "v_proj", "out_proj"):
            setattr(self, proj_name, nn.Linear(d_model, d_model))

        # Filled in by forward()
        self.attention_weights = None

    def _split_heads(self, projected, batch_size):
        """Reshape (batch, seq, d_model) into (batch, num_heads, seq, head_dim)."""
        return projected.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)

    def forward(self, query, key, value, attn_mask=None):
        n = query.shape[0]

        q_heads = self._split_heads(self.q_proj(query), n)
        k_heads = self._split_heads(self.k_proj(key), n)
        v_heads = self._split_heads(self.v_proj(value), n)

        # Scaled dot-product logits
        logits = torch.matmul(q_heads, k_heads.transpose(-2, -1)) / (self.head_dim ** 0.5)

        # Positions where the mask equals 0 are pushed to a very negative logit
        if attn_mask is not None:
            logits = logits.masked_fill(attn_mask == 0, -1e9)

        # Normalise over keys and remember the map for later inspection
        self.attention_weights = F.softmax(logits, dim=-1)

        # Weighted sum of values, then merge heads back into d_model
        context = torch.matmul(self.attention_weights, v_heads)
        merged = context.transpose(1, 2).contiguous().view(n, -1, self.d_model)

        return self.out_proj(merged)

# Custom transformer layer to capture attention weights
class TransformerEncoderLayer(nn.Module):
    """Encoder block: self-attention followed by a ReLU feed-forward network.

    Each sub-layer is wrapped as ``LayerNorm(x + Dropout(sublayer(x)))``. The
    attention module is the file's own MultiHeadAttention, which keeps its last
    attention map for inspection.
    """
    def __init__(self, d_model, nhead, dim_feedforward, dropout=0.1):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, nhead)

        # Position-wise feed-forward: d_model -> dim_feedforward -> d_model
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, d_model)

        # One LayerNorm per residual branch
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)

        # One dropout per residual branch
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

    def _feed_forward(self, hidden):
        """Apply linear1 -> ReLU -> dropout -> linear2."""
        expanded = F.relu(self.linear1(hidden))
        return self.linear2(self.dropout(expanded))

    def forward(self, src, src_mask=None):
        # Attention branch with residual connection, then normalisation
        attended = self.self_attn(src, src, src, attn_mask=src_mask)
        hidden = self.norm1(src + self.dropout1(attended))

        # Feed-forward branch with residual connection, then normalisation
        return self.norm2(hidden + self.dropout2(self._feed_forward(hidden)))

class FTTransformer(nn.Module):
    """Transformer regressor over per-feature tokens with a learnable CLS token.

    Features are tokenised by either LinearEmbedding ('linear') or
    PiecewiseLinearEmbedding ('piecewise'), offset by a learnable per-feature
    embedding, prefixed with the CLS token and passed through the encoder
    layers. The CLS position of the final sequence feeds a linear head that
    outputs a single value per sample.
    """
    def __init__(self, num_features, d_token=64, num_heads=8, num_layers=2,
                 d_ffn=128, dropout=0.1, embedding_type='linear', n_bins=20):
        super().__init__()
        self.d_token = d_token
        self.num_features = num_features
        self.embedding_type = embedding_type

        # Learnable CLS token, broadcast over the batch in forward()
        self.cls_token = nn.Parameter(torch.randn(1, 1, d_token))

        # Choose the feature tokenizer; anything else is rejected
        if embedding_type == 'piecewise':
            self.feature_tokenizer = PiecewiseLinearEmbedding(num_features, d_token, num_bins=n_bins)
        elif embedding_type == 'linear':
            self.feature_tokenizer = LinearEmbedding(num_features, d_token)
        else:
            raise ValueError(f"Unknown embedding type: {embedding_type}")

        # Learnable offset added to every feature token (not to CLS)
        self.feature_pos_embedding = nn.Parameter(torch.randn(1, num_features, d_token))

        # Stack of attention-capturing encoder layers
        encoder_blocks = [
            TransformerEncoderLayer(d_model=d_token, nhead=num_heads,
                                    dim_feedforward=d_ffn, dropout=dropout)
            for _ in range(num_layers)
        ]
        self.transformer_layers = nn.ModuleList(encoder_blocks)

        # Regression head: one scalar per sample
        self.output_layer = nn.Linear(d_token, 1)

    def forward(self, x):
        # (batch_size, num_features, d_token) feature tokens plus their offsets
        feature_tokens = self.feature_tokenizer(x) + self.feature_pos_embedding

        # Prepend CLS -> (batch_size, num_features + 1, d_token)
        cls = self.cls_token.expand(x.shape[0], -1, -1)
        hidden = torch.cat([cls, feature_tokens], dim=1)

        for block in self.transformer_layers:
            hidden = block(hidden)

        # Predict from the CLS position; no activation for regression
        return self.output_layer(hidden[:, 0])

    def get_cls_attention(self):
        """Return CLS-to-feature attention from the last forward pass, averaged over heads and layers.

        Raises:
            ValueError: if any layer has not stored attention weights yet.
        """
        per_layer = []

        for block in self.transformer_layers:
            # Stored shape: (batch_size, num_heads, seq_len, seq_len)
            weights = block.self_attn.attention_weights
            if weights is None:
                raise ValueError("Attention weights not available. Run forward first.")
            # Row 0 is the CLS query; columns 1: are the feature keys. Average over heads.
            per_layer.append(weights[:, :, 0, 1:].mean(dim=1))

        # Average over layers
        return torch.stack(per_layer).mean(dim=0)

# Sparse attention variants
class sparseMultiHeadAttention(nn.Module):
    """Multi-head attention whose scores are normalised with sparsemax instead of softmax.

    The attention map of the most recent forward pass is kept in
    ``self.attention_weights`` for later inspection.
    """
    def __init__(self, d_model, num_heads):
        super().__init__()
        self.d_model = d_model
        self.num_heads = num_heads
        self.head_dim = d_model // num_heads

        # Heads must tile the model dimension exactly
        assert self.head_dim * num_heads == d_model, "d_model must be divisible by num_heads"

        # Input and output projections
        self.q_proj = nn.Linear(d_model, d_model)
        self.k_proj = nn.Linear(d_model, d_model)
        self.v_proj = nn.Linear(d_model, d_model)
        self.out_proj = nn.Linear(d_model, d_model)

        # Filled in by forward()
        self.attention_weights = None

    def forward(self, query, key, value, attn_mask=None):
        n = query.shape[0]
        per_head = (n, -1, self.num_heads, self.head_dim)

        # Project, then move the head axis in front of the sequence axis
        q_heads = self.q_proj(query).view(*per_head).transpose(1, 2)
        k_heads = self.k_proj(key).view(*per_head).transpose(1, 2)
        v_heads = self.v_proj(value).view(*per_head).transpose(1, 2)

        # Scaled dot-product logits
        logits = torch.matmul(q_heads, k_heads.transpose(-2, -1)) / (self.head_dim ** 0.5)

        # Positions where the mask equals 0 are pushed to a very negative logit
        if attn_mask is not None:
            logits = logits.masked_fill(attn_mask == 0, -1e9)

        # Module-level sparsemax over the key dimension; remember the map
        self.attention_weights = sparsemax(logits)

        # Weighted sum of values, heads merged back into d_model
        context = torch.matmul(self.attention_weights, v_heads)
        merged = context.transpose(1, 2).contiguous().view(n, -1, self.d_model)

        return self.out_proj(merged)

class sparseTransformerEncoderLayer(nn.Module):
    """Encoder block built on sparseMultiHeadAttention.

    Each sub-layer (attention, then ReLU feed-forward) is wrapped as
    ``LayerNorm(x + Dropout(sublayer(x)))``.
    """
    def __init__(self, d_model, nhead, dim_feedforward, dropout=0.1):
        super().__init__()
        self.self_attn = sparseMultiHeadAttention(d_model, nhead)

        # Position-wise feed-forward: d_model -> dim_feedforward -> d_model
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, d_model)

        # One LayerNorm per residual branch
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)

        # One dropout per residual branch
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

    def forward(self, src, src_mask=None):
        # Attention branch
        residual = src
        attended = self.self_attn(src, src, src, attn_mask=src_mask)
        hidden = self.norm1(residual + self.dropout1(attended))

        # Feed-forward branch
        activated = F.relu(self.linear1(hidden))
        projected = self.linear2(self.dropout(activated))
        hidden = self.norm2(hidden + self.dropout2(projected))

        return hidden

class sparseFTTransformer(nn.Module):
    """FT-Transformer-style regressor whose encoder layers use sparsemax attention.

    Features are tokenised ('linear' or 'piecewise'), offset by a learnable
    per-feature embedding, prefixed with a learnable CLS token and passed
    through sparseTransformerEncoderLayer blocks. The CLS position of the final
    sequence feeds a linear head that outputs a single value per sample.
    """
    def __init__(self, num_features, d_token=64, num_heads=8, num_layers=2,
                 d_ffn=128, dropout=0.1, embedding_type='linear', n_bins=20):
        super().__init__()
        self.d_token = d_token
        self.num_features = num_features
        self.embedding_type = embedding_type

        # Learnable CLS token, broadcast over the batch in forward()
        self.cls_token = nn.Parameter(torch.randn(1, 1, d_token))

        # Choose the feature tokenizer; anything else is rejected
        if embedding_type == 'piecewise':
            self.feature_tokenizer = PiecewiseLinearEmbedding(num_features, d_token, num_bins=n_bins)
        elif embedding_type == 'linear':
            self.feature_tokenizer = LinearEmbedding(num_features, d_token)
        else:
            raise ValueError(f"Unknown embedding type: {embedding_type}")

        # Learnable offset added to every feature token (not to CLS)
        self.feature_pos_embedding = nn.Parameter(torch.randn(1, num_features, d_token))

        # Stack of sparse-attention encoder layers
        encoder_blocks = [
            sparseTransformerEncoderLayer(d_model=d_token, nhead=num_heads,
                                          dim_feedforward=d_ffn, dropout=dropout)
            for _ in range(num_layers)
        ]
        self.transformer_layers = nn.ModuleList(encoder_blocks)

        # Regression head: one scalar per sample
        self.output_layer = nn.Linear(d_token, 1)

    def forward(self, x):
        # (batch_size, num_features, d_token) feature tokens plus their offsets
        feature_tokens = self.feature_tokenizer(x) + self.feature_pos_embedding

        # Prepend CLS -> (batch_size, num_features + 1, d_token)
        cls = self.cls_token.expand(x.shape[0], -1, -1)
        hidden = torch.cat([cls, feature_tokens], dim=1)

        for block in self.transformer_layers:
            hidden = block(hidden)

        # Predict from the CLS position; no activation for regression
        return self.output_layer(hidden[:, 0])

    def get_cls_attention(self):
        """Return CLS-to-feature attention from the last forward pass, averaged over heads and layers.

        Raises:
            ValueError: if any layer has not stored attention weights yet.
        """
        per_layer = []

        for block in self.transformer_layers:
            # Stored shape: (batch_size, num_heads, seq_len, seq_len)
            weights = block.self_attn.attention_weights
            if weights is None:
                raise ValueError("Attention weights not available. Run forward first.")
            # Row 0 is the CLS query; columns 1: are the feature keys. Average over heads.
            per_layer.append(weights[:, :, 0, 1:].mean(dim=1))

        # Average over layers
        return torch.stack(per_layer).mean(dim=0)

def calculate_pfi(model, X_val, y_val, num_permutations=5):
    """Calculate Permutation Feature Importance (PFI) for regression.

    For every feature column the values are shuffled across rows
    ``num_permutations`` times; the importance is the mean increase in MSE over
    the unshuffled baseline (higher = more important).

    Args:
        model: module mapping a (n, num_features) float tensor to (n, 1) predictions.
        X_val: array-like of shape (n, num_features).
        y_val: array-like with n targets.
        num_permutations: number of shuffles averaged per feature.

    Returns:
        np.ndarray of shape (num_features,) with the mean MSE increase per feature.
    """
    # Put the data wherever the model already lives, so this works regardless of
    # which device the caller moved the model to; fall back to the module-level
    # default only for parameter-free models.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else device

    X = torch.tensor(X_val, dtype=torch.float32).to(target_device)
    y = torch.tensor(y_val, dtype=torch.float32).reshape(-1, 1).to(target_device)

    model.eval()
    importances = []

    with torch.no_grad():
        # Reference error on the untouched data
        baseline_mse = F.mse_loss(model(X), y).item()

        for feat_idx in range(X.shape[1]):
            mse_increases = []

            for _ in range(num_permutations):
                # Shuffle a single column and leave all others intact
                X_permuted = X.clone()
                perm_idx = torch.randperm(X.shape[0])
                X_permuted[:, feat_idx] = X[perm_idx, feat_idx]

                perm_mse = F.mse_loss(model(X_permuted), y).item()

                # Importance is the increase in MSE caused by the shuffle
                mse_increases.append(perm_mse - baseline_mse)

            # Average over permutations (higher = more important)
            importances.append(np.mean(mse_increases))

    return np.array(importances)

def train_model(model, train_loader, val_loader, criterion, optimizer, device, epochs=100, early_stopping=16):
    """Train the model with early stopping and restore the best validation weights.

    Args:
        model: module to train (moved to ``device`` in place).
        train_loader: iterable of (X_batch, y_batch) used for optimisation.
        val_loader: iterable of (X_batch, y_batch) used for model selection.
        criterion: loss called as ``criterion(outputs, y_batch)``.
        optimizer: optimizer already bound to ``model``'s parameters.
        device: device the model and batches are moved to.
        epochs: maximum number of epochs.
        early_stopping: stop after this many consecutive epochs without a new
            best validation loss.

    Returns:
        The same ``model`` object, loaded with the weights from the epoch that
        achieved the lowest mean validation loss.
    """
    model.to(device)
    best_val_loss = float('inf')
    early_stop_counter = 0
    best_state = None

    for epoch in range(epochs):
        # Training
        model.train()
        train_loss = 0

        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)

            optimizer.zero_grad()
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            loss.backward()
            optimizer.step()

            train_loss += loss.item()

        # Validation
        model.eval()
        val_loss = 0

        with torch.no_grad():
            for X_batch, y_batch in val_loader:
                X_batch, y_batch = X_batch.to(device), y_batch.to(device)

                outputs = model(X_batch)
                loss = criterion(outputs, y_batch)
                val_loss += loss.item()

        # Mean of the per-batch losses
        train_loss /= len(train_loader)
        val_loss /= len(val_loader)

        print(f"Epoch {epoch+1}/{epochs}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")

        # Early stopping
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            early_stop_counter = 0
            # state_dict() hands back references to the live tensors, which later
            # optimizer steps overwrite in place; clone them to keep a real snapshot.
            best_state = {
                name: tensor.detach().clone()
                for name, tensor in model.state_dict().items()
            }
        else:
            early_stop_counter += 1
            if early_stop_counter >= early_stopping:
                print(f"Early stopping at epoch {epoch+1}")
                break

    # Restore the best snapshot (None only when no epoch improved on +inf, e.g. epochs == 0)
    if best_state is not None:
        model.load_state_dict(best_state)
    return model

def evaluate_model(model, X_test, y_test, device):
    """Evaluate model performance for regression.

    Runs one forward pass over all of ``X_test`` and prints and returns MSE,
    RMSE, R² and MAE against ``y_test``.

    Args:
        model: trained module; moved to ``device`` and switched to eval mode.
        X_test: array-like of shape (n, num_features).
        y_test: array-like with n targets (any shape that flattens to n values).
        device: device used for the forward pass.

    Returns:
        dict with keys 'mse', 'rmse', 'r2' and 'mae'.
    """
    model.to(device)
    model.eval()

    X = torch.tensor(X_test, dtype=torch.float32).to(device)

    with torch.no_grad():
        y_pred = model(X).cpu().numpy()

    # Metrics are computed on the CPU with NumPy, so no target tensor is needed;
    # np.asarray also accepts lists and other array-likes without a .reshape method.
    y_true = np.asarray(y_test).reshape(-1, 1)

    # Calculate metrics
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)

    print(f"Test MSE: {mse:.4f}")
    print(f"Test RMSE: {rmse:.4f}")
    print(f"Test R²: {r2:.4f}")
    print(f"Test MAE: {mae:.4f}")

    return {
        'mse': mse,
        'rmse': rmse,
        'r2': r2,
        'mae': mae
    }

def analyze_pfi_attention_correlation(model, X_val, y_val, feature_names, device,
                                      save_path='pfi_attention_correlation_california.png'):
    """Analyze correlation between PFI and attention scores for regression.

    Computes the mean CLS-to-feature attention over ``X_val`` and the
    permutation feature importance, reports their Spearman rank correlation and
    draws a labelled scatter plot with a linear fit.

    Args:
        model: trained model exposing ``get_cls_attention()``.
        X_val: array-like of shape (n, num_features).
        y_val: array-like with n targets.
        feature_names: one label per feature, used to annotate the plot.
        device: device used for the forward pass.
        save_path: where the figure is written; a falsy value skips saving.
            Defaults to the previously hard-coded file name.

    Returns:
        dict with 'correlation', 'p_value', 'pfi_scores', 'attention_scores'
        (both as lists) and 'feature_names'.
    """
    model.to(device)
    model.eval()

    # A forward pass populates the attention maps read by get_cls_attention()
    X = torch.tensor(X_val, dtype=torch.float32).to(device)
    with torch.no_grad():
        _ = model(X)
        attention_scores = model.get_cls_attention().cpu().numpy()

    # Average attention scores across samples
    avg_attention = attention_scores.mean(axis=0)

    # Calculate PFI
    pfi_scores = calculate_pfi(model, X_val, y_val)

    # Calculate Spearman rank correlation
    correlation, p_value = spearmanr(pfi_scores, avg_attention)

    print(f"Spearman Rank Correlation: {correlation:.4f} (p-value: {p_value:.4f})")

    fig, ax = plt.subplots(figsize=(12, 8))

    # One point per feature
    ax.scatter(pfi_scores, avg_attention, alpha=0.7)

    # Add feature labels
    for i, name in enumerate(feature_names):
        ax.annotate(name, (pfi_scores[i], avg_attention[i]),
                    textcoords="offset points", xytext=(0, 10), ha='center', fontsize=8)

    # Add best fit line (degree-1 polynomial), drawn over the sorted PFI values
    fit = np.poly1d(np.polyfit(pfi_scores, avg_attention, 1))
    sorted_pfi = np.sort(pfi_scores)
    ax.plot(sorted_pfi, fit(sorted_pfi), "r--", alpha=0.7)

    # Add correlation information
    ax.text(0.05, 0.95, f"Spearman ρ: {correlation:.4f}\np-value: {p_value:.4f}",
            transform=ax.transAxes, verticalalignment='top',
            bbox=dict(boxstyle='round', facecolor='white', alpha=0.8))

    ax.set_xlabel('Permutation Feature Importance')
    ax.set_ylabel('CLS Token Attention Score')
    ax.set_title('PFI vs CLS Token Attention Correlation')

    plt.tight_layout()
    if save_path:
        plt.savefig(save_path)
    # Close this figure explicitly so repeated calls do not accumulate open figures
    plt.close(fig)

    return {
        'correlation': correlation,
        'p_value': p_value,
        'pfi_scores': pfi_scores.tolist(),
        'attention_scores': avg_attention.tolist(),
        'feature_names': feature_names
    }

def load_california_housing_dataset():
    """Fetch California Housing, split it 70/15/15 and standardise the features.

    The scaler is fitted on the training split only and then applied to the
    validation and test splits.

    Returns:
        (X_train, X_val, X_test, y_train, y_val, y_test, feature_names)
    """
    print("Loading California Housing dataset...")

    housing = fetch_california_housing()
    features, target = housing.data, housing.target
    feature_names = housing.feature_names

    print(f"Dataset shape: {features.shape}")
    print(f"Features: {feature_names}")

    # 70% train, then the remaining 30% halved into validation and test
    X_train, X_holdout, y_train, y_holdout = train_test_split(
        features, target, test_size=0.3, random_state=42
    )
    X_val, X_test, y_val, y_test = train_test_split(
        X_holdout, y_holdout, test_size=0.5, random_state=42
    )

    # Fit on train only to avoid leaking validation/test statistics
    scaler = StandardScaler().fit(X_train)
    X_train, X_val, X_test = (scaler.transform(split) for split in (X_train, X_val, X_test))

    print(f"Dataset splits - Train: {X_train.shape}, Val: {X_val.shape}, Test: {X_test.shape}")

    return X_train, X_val, X_test, y_train, y_val, y_test, feature_names

def tune_hyperparameters(X_train, y_train, X_val, y_val, embedding_type='linear', n_trials=20, sparse=False):
    """Search model hyperparameters with Optuna, minimising validation MSE.

    Each trial trains a fresh model for at most 20 epochs with a short patience
    window and reports the per-epoch validation loss so the median pruner can
    stop unpromising trials.

    Args:
        X_train, y_train: training features and targets.
        X_val, y_val: validation features and targets.
        embedding_type: 'linear' or 'piecewise', forwarded to the model.
        n_trials: maximum number of trials (the study also has a 1800 s timeout).
        sparse: build sparseFTTransformer when true, FTTransformer otherwise.

    Returns:
        dict of the best trial's sampled parameters. ``d_token`` is the raw
        sampled value, i.e. before it is rounded down to a multiple of
        ``num_heads``.
    """
    # Loaders are built once and shared by every trial
    train_loader = DataLoader(TabularDataset(X_train, y_train), batch_size=64, shuffle=True)
    val_loader = DataLoader(TabularDataset(X_val, y_val), batch_size=128)

    num_features = X_train.shape[1]
    model_cls = sparseFTTransformer if sparse else FTTransformer

    def objective(trial):
        # Search space
        d_token = trial.suggest_int('d_token', 32, 128)
        num_heads = trial.suggest_int('num_heads', 2, 8)
        num_layers = trial.suggest_int('num_layers', 1, 3)
        d_ffn = trial.suggest_int('d_ffn', 64, 256)
        lr = trial.suggest_float('lr', 1e-4, 1e-2, log=True)
        dropout = trial.suggest_float('dropout', 0.0, 0.5)
        n_bins = trial.suggest_int('n_bins', 10, 100)

        # Round d_token down to the nearest multiple of num_heads
        d_token -= d_token % num_heads

        model = model_cls(
            num_features=num_features,
            d_token=d_token,
            num_heads=num_heads,
            num_layers=num_layers,
            d_ffn=d_ffn,
            dropout=dropout,
            embedding_type=embedding_type,
            n_bins=n_bins
        )

        criterion = nn.MSELoss()
        optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=1e-5)

        model.to(device)

        best_val_loss = float('inf')
        patience, num_epochs = 5, 20
        stale_epochs = 0

        # Short training loop for hyperparameter search
        for epoch in range(num_epochs):
            model.train()
            for X_batch, y_batch in train_loader:
                X_batch, y_batch = X_batch.to(device), y_batch.to(device)

                optimizer.zero_grad()
                batch_loss = criterion(model(X_batch), y_batch)
                batch_loss.backward()
                optimizer.step()

            # Mean per-batch validation loss
            model.eval()
            val_loss = 0
            with torch.no_grad():
                for X_batch, y_batch in val_loader:
                    X_batch, y_batch = X_batch.to(device), y_batch.to(device)
                    val_loss += criterion(model(X_batch), y_batch).item()
            val_loss /= len(val_loader)

            if val_loss < best_val_loss:
                best_val_loss = val_loss
                stale_epochs = 0
            else:
                stale_epochs += 1
                # Stop once more than `patience` epochs passed without improvement
                if stale_epochs > patience:
                    break

            # Let the pruner see the intermediate value for this epoch
            trial.report(val_loss, epoch)
            if trial.should_prune():
                raise optuna.TrialPruned()

        return best_val_loss

    # https://optuna.readthedocs.io/en/stable/reference/generated/optuna.pruners.MedianPruner.html
    pruner = optuna.pruners.MedianPruner(
        n_startup_trials=5,
        n_warmup_steps=10,
        interval_steps=2
    )
    study = optuna.create_study(direction="minimize", pruner=pruner)
    study.optimize(objective, n_trials=n_trials, timeout=1800)

    # Report the winner
    best = study.best_trial
    print("Best trial:")
    print(f"  Value (validation loss): {best.value:.4f}")
    print("  Params:")
    for key, value in best.params.items():
        print(f"    {key}: {value}")

    return best.params

def train_with_best_params(X_train, y_train, X_val, y_val, X_test, y_test, best_params,
                         embedding_type='linear', sparse=False):
    """Build a model from tuned hyperparameters, train it and score it on the test split.

    Args:
        X_train, y_train: training features and targets.
        X_val, y_val: validation features and targets (used for early stopping).
        X_test, y_test: held-out features and targets for the final metrics.
        best_params: dict with 'd_token', 'num_heads', 'num_layers', 'd_ffn',
            'dropout', 'n_bins' and 'lr'.
        embedding_type: 'linear' or 'piecewise', forwarded to the model.
        sparse: build sparseFTTransformer when true, FTTransformer otherwise.

    Returns:
        (trained model, metrics dict from evaluate_model)
    """
    train_loader = DataLoader(TabularDataset(X_train, y_train), batch_size=256, shuffle=True)
    val_loader = DataLoader(TabularDataset(X_val, y_val), batch_size=1024)

    # Round d_token down to the nearest multiple of num_heads
    raw_d_token = best_params['d_token']
    head_count = best_params['num_heads']
    d_token = (raw_d_token // head_count) * head_count

    # Both variants share one constructor signature, so only the class differs
    model_cls = sparseFTTransformer if sparse else FTTransformer
    model = model_cls(
        num_features=X_train.shape[1],
        d_token=d_token,
        num_heads=head_count,
        num_layers=best_params['num_layers'],
        d_ffn=best_params['d_ffn'],
        dropout=best_params['dropout'],
        embedding_type=embedding_type,
        n_bins=best_params['n_bins']
    )

    # Regression objective and optimizer
    criterion = nn.MSELoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=best_params['lr'], weight_decay=1e-5)

    # Train with early stopping on the validation split
    model = train_model(
        model=model,
        train_loader=train_loader,
        val_loader=val_loader,
        criterion=criterion,
        optimizer=optimizer,
        device=device,
        epochs=100
    )

    # Final metrics on the held-out test split
    return model, evaluate_model(model, X_test, y_test, device)

def _unpack_importance(analysis, fallback_names):
    """Return ``(pfi_scores, attention_scores, feature_names)`` for one model's analysis."""
    # Prefer the names stored with the analysis; fall back to the caller-supplied list.
    names = analysis['feature_names'] if 'feature_names' in analysis else fallback_names
    return np.array(analysis['pfi_scores']), np.array(analysis['attention_scores']), names


def visualize_all_models(results, feature_names):
    """Create and save two comparison figures for all regression models.

    Writes ``model_comparison_california.png`` (RMSE, R² and PFI-attention
    correlation bar charts plus a text panel of each model's top-3 features) and
    ``feature_importance_comparison_california.png`` (one PFI-vs-attention bar
    chart per model, features ordered by descending PFI).

    Args:
        results: Mapping ``model name -> metrics``. Each entry is read for
            ``'rmse'``, ``'r2'`` and ``'correlation_analysis'``; the latter is read
            for ``'correlation'``, ``'pfi_scores'``, ``'attention_scores'`` and,
            when present, ``'feature_names'``.
        feature_names: Feature labels used for any model whose correlation
            analysis does not carry its own ``'feature_names'``.

    Returns:
        None. With an empty ``results`` nothing is drawn or written.
    """
    models = list(results.keys())
    if not models:
        # plt.subplots(0, 1) further down would raise; there is nothing to draw anyway.
        print("No results to visualize.")
        return

    analyses = {model: results[model]['correlation_analysis'] for model in models}

    # ---- Figure 1: 2x2 overview ----
    fig, axs = plt.subplots(2, 2, figsize=(20, 16))

    bar_panels = [
        (axs[0, 0], [results[model]['rmse'] for model in models],
         'RMSE Comparison (California Housing)', 'RMSE'),
        (axs[0, 1], [results[model]['r2'] for model in models],
         'R² Comparison (California Housing)', 'R²'),
        (axs[1, 0], [analyses[model]['correlation'] for model in models],
         'PFI-Attention Correlation Comparison (California Housing)', 'Spearman Correlation'),
    ]
    for ax, values, title, ylabel in bar_panels:
        ax.bar(models, values)
        ax.set_title(title)
        ax.set_ylabel(ylabel)
        ax.tick_params(axis='x', rotation=45)

    # Bottom-right panel holds a text summary instead of a plot.
    axs[1, 1].axis('off')

    summary_text = "Feature Importance Summary:\n\n"
    for model in models:
        pfi_scores, attn_scores, names = _unpack_importance(analyses[model], feature_names)

        # argsort on the negated scores yields descending order; keep the top 3.
        pfi_top_features = [names[i] for i in np.argsort(-pfi_scores)[:3]]
        attn_top_features = [names[i] for i in np.argsort(-attn_scores)[:3]]

        summary_text += f"{model}:\n"
        summary_text += f"  Top PFI features: {', '.join(pfi_top_features)}\n"
        summary_text += f"  Top attention features: {', '.join(attn_top_features)}\n\n"

    axs[1, 1].text(0.05, 0.95, summary_text, transform=axs[1, 1].transAxes,
                   verticalalignment='top', fontsize=12)

    fig.tight_layout()
    fig.savefig('model_comparison_california.png')
    plt.close(fig)

    # ---- Figure 2: per-model PFI vs attention ----
    # squeeze=False always returns a 2-D axes array, so one model needs no special case.
    fig, axs = plt.subplots(len(models), 1, figsize=(14, 5 * len(models)), squeeze=False)

    width = 0.35
    for ax, model in zip(axs[:, 0], models):
        pfi_scores, attn_scores, names = _unpack_importance(analyses[model], feature_names)

        # Order every series by descending PFI so the bars line up per feature.
        order = np.argsort(-pfi_scores)
        sorted_features = [names[j] for j in order]
        x = np.arange(len(sorted_features))

        ax.bar(x - width/2, pfi_scores[order], width, label='PFI')
        ax.bar(x + width/2, attn_scores[order], width, label='Attention')

        ax.set_title(f'Feature Importance (California Housing): {model}')
        ax.set_ylabel('Importance Score')
        ax.set_xticks(x)
        ax.set_xticklabels(sorted_features, rotation=45, ha='right')
        ax.legend()

    fig.tight_layout()
    fig.savefig('feature_importance_comparison_california.png')
    plt.close(fig)

    print("\nVisualizations saved as 'model_comparison_california.png' and 'feature_importance_comparison_california.png'")

def save_model(model, filename):
    """Serialize ``model``'s state dict to ``filename`` with ``torch.save`` and report it."""
    weights = model.state_dict()
    torch.save(weights, filename)
    print(f"Model saved as {filename}")

def _json_default(obj):
    """Fallback encoder for ``json.dumps``: turn NumPy arrays/scalars into plain Python."""
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    if isinstance(obj, np.generic):
        # NumPy scalar types such as np.float32 / np.int64 that json cannot encode.
        return obj.item()
    raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable")


def save_results(results, filename):
    """Write ``results`` to ``filename`` as indented JSON.

    NumPy arrays and NumPy scalars anywhere in the structure are converted to
    lists / Python numbers during encoding. ``results`` itself is not modified.

    Args:
        results: JSON-compatible structure (nested dicts/lists), optionally
            containing NumPy arrays or scalars.
        filename: Path of the file to (over)write.

    Raises:
        TypeError: If ``results`` contains a value that is neither JSON-native
            nor a NumPy array/scalar.
    """
    # Serialize before opening the file so a failure cannot leave a truncated file.
    payload = json.dumps(results, indent=2, default=_json_default)
    with open(filename, 'w') as f:
        f.write(payload)
    print(f"Results saved as {filename}")

def main_with_tuning(n_trials=20):
    """Run the California Housing experiments for all four model variants.

    For each variant (dense/sparse FT Transformer x linear/piecewise embedding)
    the same pipeline runs in order: tune hyperparameters, train with the best
    parameters, save the weights to ``<key>_california.pth``, and analyze the
    PFI-vs-attention correlation on the validation split. Afterwards the metrics
    are printed side by side, the comparison figures are drawn and everything is
    written to ``results_california.json``.

    Args:
        n_trials: Number of tuning trials per variant, forwarded to
            ``tune_hyperparameters``.

    Returns:
        ``(models, results)``: two dicts keyed by variant name
        (``'ft_linear_tuned'``, ``'ft_piecewise_tuned'``,
        ``'sparse_ft_linear_tuned'``, ``'sparse_ft_piecewise_tuned'``) holding the
        trained model and its evaluation results (including the
        ``'correlation_analysis'`` entry) respectively.
    """
    # Set random seed for reproducibility
    torch.manual_seed(42)
    np.random.seed(42)

    # Set device
    print(f"Using device: {device}")

    # Load California Housing dataset
    X_train, X_val, X_test, y_train, y_val, y_test, feature_names = load_california_housing_dataset()

    # (results key, embedding_type, sparse, banner title, analysis title, comparison label)
    experiments = (
        ('ft_linear_tuned', 'linear', False,
         'FT Transformer with Linear Embedding',
         'Linear Embedding',
         'FT Transformer (Linear Tuned)'),
        ('ft_piecewise_tuned', 'piecewise', False,
         'FT Transformer with Piecewise Linear Embedding',
         'Piecewise Embedding',
         'FT Transformer (Piecewise Tuned)'),
        ('sparse_ft_linear_tuned', 'linear', True,
         'sparse FT Transformer with Linear Embedding',
         'Sparse Linear Embedding',
         'Sparse FT Transformer (Linear Tuned)'),
        ('sparse_ft_piecewise_tuned', 'piecewise', True,
         'sparse FT Transformer with Piecewise Linear Embedding',
         'Sparse Piecewise Embedding',
         'Sparse FT Transformer (Piecewise Tuned)'),
    )

    models = {}
    results = {}

    for key, embedding_type, sparse, title, analysis_title, _label in experiments:
        # Forward `sparse` only for the sparse variants, so the dense runs keep
        # relying on the callee defaults exactly as before.
        variant_kwargs = {'sparse': True} if sparse else {}

        print(f"\n=== Tuning Hyperparameters for {title} ===")
        best_params = tune_hyperparameters(
            X_train=X_train,
            y_train=y_train,
            X_val=X_val,
            y_val=y_val,
            embedding_type=embedding_type,
            n_trials=n_trials,
            **variant_kwargs
        )

        print(f"\n=== Training {title} (Tuned) ===")
        model, model_results = train_with_best_params(
            X_train=X_train,
            y_train=y_train,
            X_val=X_val,
            y_val=y_val,
            X_test=X_test,
            y_test=y_test,
            best_params=best_params,
            embedding_type=embedding_type,
            **variant_kwargs
        )

        save_model(model, f'{key}_california.pth')

        models[key] = model
        results[key] = model_results

        print(f"\n=== Analyzing PFI vs Attention Correlation for {analysis_title} (Tuned) ===")
        results[key]['correlation_analysis'] = analyze_pfi_attention_correlation(
            model=model,
            X_val=X_val,
            y_val=y_val,
            feature_names=feature_names,
            device=device
        )

    # Compare the results
    print("\n=== Comparison of Tuned Models ===")
    for key, _emb, _sparse, _title, _analysis_title, label in experiments:
        print(f"{label}: RMSE={results[key]['rmse']:.4f}, R²={results[key]['r2']:.4f}")

    print("\n=== Comparison of PFI-Attention Correlations (Tuned Models) ===")
    for key, _emb, _sparse, _title, _analysis_title, label in experiments:
        correlation = results[key]['correlation_analysis']
        print(f"{label}: ρ={correlation['correlation']:.4f}, p-value={correlation['p_value']:.4f}")

    # Create visualization comparing all models
    visualize_all_models(
        results=results,
        feature_names=feature_names
    )

    save_results(results, 'results_california.json')

    return models, results

# Entry point: in a Jupyter/IPython kernel __name__ is "__main__", so executing
# this cell runs the full experiment; the returned (models, results) are discarded.
if __name__ == "__main__":
    main_with_tuning()


Using device: cuda
Loading California Housing dataset...


[I 2025-03-04 20:44:28,802] A new study created in memory with name: no-name-becd26b5-6896-4d5f-824d-3d91d46ac5f6


Dataset shape: (20640, 8)
Features: ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude']
Dataset splits - Train: (14448, 8), Val: (3096, 8), Test: (3096, 8)

=== Tuning Hyperparameters for FT Transformer with Linear Embedding ===


[I 2025-03-04 20:45:08,874] Trial 0 finished with value: 0.4700487005710602 and parameters: {'d_token': 97, 'num_heads': 4, 'num_layers': 2, 'd_ffn': 78, 'lr': 0.005229665463833917, 'dropout': 0.41954752003865214, 'n_bins': 58}. Best is trial 0 with value: 0.4700487005710602.
[I 2025-03-04 20:46:04,012] Trial 1 finished with value: 0.2960692125558853 and parameters: {'d_token': 74, 'num_heads': 4, 'num_layers': 3, 'd_ffn': 86, 'lr': 0.0005848011795173988, 'dropout': 0.34695049624693125, 'n_bins': 33}. Best is trial 1 with value: 0.2960692125558853.
[I 2025-03-04 20:46:35,960] Trial 2 finished with value: 0.3211324429512024 and parameters: {'d_token': 59, 'num_heads': 4, 'num_layers': 1, 'd_ffn': 205, 'lr': 0.0006136487544673718, 'dropout': 0.11018974049895375, 'n_bins': 40}. Best is trial 1 with value: 0.2960692125558853.
[I 2025-03-04 20:47:03,226] Trial 3 finished with value: 0.8904365491867066 and parameters: {'d_token': 88, 'num_heads': 3, 'num_layers': 3, 'd_ffn': 113, 'lr': 0.007

Best trial:
  Value (validation loss): 0.2906
  Params:
    d_token: 32
    num_heads: 6
    num_layers: 2
    d_ffn: 165
    lr: 0.002668499340879217
    dropout: 0.12095807781149959
    n_bins: 99

=== Training FT Transformer with Linear Embedding (Tuned) ===
Epoch 1/100, Train Loss: 1.1812, Val Loss: 0.8097
Epoch 2/100, Train Loss: 0.5862, Val Loss: 0.6031
Epoch 3/100, Train Loss: 0.4903, Val Loss: 0.5133
Epoch 4/100, Train Loss: 0.4018, Val Loss: 0.4176
Epoch 5/100, Train Loss: 0.3585, Val Loss: 0.4195
Epoch 6/100, Train Loss: 0.3364, Val Loss: 0.4084
Epoch 7/100, Train Loss: 0.3234, Val Loss: 0.3808
Epoch 8/100, Train Loss: 0.3193, Val Loss: 0.3676
Epoch 9/100, Train Loss: 0.3178, Val Loss: 0.3591
Epoch 10/100, Train Loss: 0.3195, Val Loss: 0.3690
Epoch 11/100, Train Loss: 0.3084, Val Loss: 0.3543
Epoch 12/100, Train Loss: 0.2952, Val Loss: 0.3653
Epoch 13/100, Train Loss: 0.2945, Val Loss: 0.3639
Epoch 14/100, Train Loss: 0.2923, Val Loss: 0.3655
Epoch 15/100, Train Loss: 0.2924,

[I 2025-03-04 20:56:59,682] A new study created in memory with name: no-name-f04d3de0-1935-43c3-a7fd-d6d9069d5103



=== Tuning Hyperparameters for FT Transformer with Piecewise Linear Embedding ===


[I 2025-03-04 21:00:11,513] Trial 0 finished with value: 0.37675600171089174 and parameters: {'d_token': 74, 'num_heads': 8, 'num_layers': 2, 'd_ffn': 256, 'lr': 0.00034060031951906966, 'dropout': 0.32462864952140685, 'n_bins': 31}. Best is trial 0 with value: 0.37675600171089174.
[I 2025-03-04 21:06:06,280] Trial 1 finished with value: 0.3135442239046097 and parameters: {'d_token': 118, 'num_heads': 2, 'num_layers': 3, 'd_ffn': 124, 'lr': 0.00027916202597893705, 'dropout': 0.38064392750118, 'n_bins': 63}. Best is trial 1 with value: 0.3135442239046097.
[I 2025-03-04 21:12:27,547] Trial 2 finished with value: 0.7146687269210815 and parameters: {'d_token': 96, 'num_heads': 3, 'num_layers': 1, 'd_ffn': 180, 'lr': 0.0064932993388807075, 'dropout': 0.0024777060118402194, 'n_bins': 93}. Best is trial 1 with value: 0.3135442239046097.
[I 2025-03-04 21:17:40,952] Trial 3 finished with value: 0.37265430867671967 and parameters: {'d_token': 60, 'num_heads': 4, 'num_layers': 1, 'd_ffn': 234, 'lr

Best trial:
  Value (validation loss): 0.3135
  Params:
    d_token: 118
    num_heads: 2
    num_layers: 3
    d_ffn: 124
    lr: 0.00027916202597893705
    dropout: 0.38064392750118
    n_bins: 63

=== Training FT Transformer with Piecewise Linear Embedding (Tuned) ===
Epoch 1/100, Train Loss: 1.1817, Val Loss: 0.8533
Epoch 2/100, Train Loss: 0.6809, Val Loss: 0.7501
Epoch 3/100, Train Loss: 0.5419, Val Loss: 0.6596
Epoch 4/100, Train Loss: 0.4563, Val Loss: 0.5212
Epoch 5/100, Train Loss: 0.4217, Val Loss: 0.5351
Epoch 6/100, Train Loss: 0.3978, Val Loss: 0.4773
Epoch 7/100, Train Loss: 0.3802, Val Loss: 0.4785
Epoch 8/100, Train Loss: 0.3670, Val Loss: 0.4735
Epoch 9/100, Train Loss: 0.3595, Val Loss: 0.4653
Epoch 10/100, Train Loss: 0.3487, Val Loss: 0.4458
Epoch 11/100, Train Loss: 0.3448, Val Loss: 0.4847
Epoch 12/100, Train Loss: 0.3349, Val Loss: 0.4492
Epoch 13/100, Train Loss: 0.3352, Val Loss: 0.4344
Epoch 14/100, Train Loss: 0.3266, Val Loss: 0.4102
Epoch 15/100, Train Los

[I 2025-03-04 21:36:11,519] A new study created in memory with name: no-name-102fbb09-ae6d-4f5f-8bc2-7468364de431



=== Tuning Hyperparameters for sparse FT Transformer with Linear Embedding ===


[I 2025-03-04 21:36:53,485] Trial 0 finished with value: 0.5019448637962342 and parameters: {'d_token': 106, 'num_heads': 3, 'num_layers': 3, 'd_ffn': 72, 'lr': 0.003216982006630175, 'dropout': 0.005153900042998594, 'n_bins': 94}. Best is trial 0 with value: 0.5019448637962342.
[I 2025-03-04 21:37:43,253] Trial 1 finished with value: 0.3441982686519623 and parameters: {'d_token': 102, 'num_heads': 3, 'num_layers': 2, 'd_ffn': 107, 'lr': 0.0005882447428556302, 'dropout': 0.4252170024880027, 'n_bins': 93}. Best is trial 1 with value: 0.3441982686519623.
[I 2025-03-04 21:38:13,394] Trial 2 finished with value: 0.4590043616294861 and parameters: {'d_token': 62, 'num_heads': 5, 'num_layers': 3, 'd_ffn': 206, 'lr': 0.0030199695612711562, 'dropout': 0.41142402553493534, 'n_bins': 72}. Best is trial 1 with value: 0.3441982686519623.
[I 2025-03-04 21:39:03,495] Trial 3 finished with value: 0.4023519229888916 and parameters: {'d_token': 65, 'num_heads': 5, 'num_layers': 2, 'd_ffn': 177, 'lr': 0.

Best trial:
  Value (validation loss): 0.2918
  Params:
    d_token: 76
    num_heads: 4
    num_layers: 3
    d_ffn: 137
    lr: 0.00012693332036879313
    dropout: 0.12522034565257845
    n_bins: 46

=== Training sparse FT Transformer with Linear Embedding (Tuned) ===
Epoch 1/100, Train Loss: 1.3574, Val Loss: 0.8104
Epoch 2/100, Train Loss: 0.6832, Val Loss: 0.7461
Epoch 3/100, Train Loss: 0.5984, Val Loss: 0.7058
Epoch 4/100, Train Loss: 0.5490, Val Loss: 0.6457
Epoch 5/100, Train Loss: 0.5012, Val Loss: 0.5938
Epoch 6/100, Train Loss: 0.4682, Val Loss: 0.5757
Epoch 7/100, Train Loss: 0.4456, Val Loss: 0.5285
Epoch 8/100, Train Loss: 0.4189, Val Loss: 0.5108
Epoch 9/100, Train Loss: 0.3981, Val Loss: 0.4594
Epoch 10/100, Train Loss: 0.3838, Val Loss: 0.4553
Epoch 11/100, Train Loss: 0.3661, Val Loss: 0.4112
Epoch 12/100, Train Loss: 0.3543, Val Loss: 0.4105
Epoch 13/100, Train Loss: 0.3455, Val Loss: 0.3976
Epoch 14/100, Train Loss: 0.3399, Val Loss: 0.3732
Epoch 15/100, Train Loss

[I 2025-03-04 21:50:36,467] A new study created in memory with name: no-name-1917c520-ab21-46ca-9e55-9a245897a72a



=== Tuning Hyperparameters for sparse FT Transformer with Piecewise Linear Embedding ===


[I 2025-03-04 21:52:48,114] Trial 0 finished with value: 0.5436476838588714 and parameters: {'d_token': 61, 'num_heads': 7, 'num_layers': 3, 'd_ffn': 161, 'lr': 0.004405752773034635, 'dropout': 0.3556196652959508, 'n_bins': 65}. Best is trial 0 with value: 0.5436476838588714.
[I 2025-03-04 21:59:28,696] Trial 1 finished with value: 0.31122311353683474 and parameters: {'d_token': 83, 'num_heads': 8, 'num_layers': 2, 'd_ffn': 206, 'lr': 0.0004178731104848749, 'dropout': 0.3575829004136913, 'n_bins': 100}. Best is trial 1 with value: 0.31122311353683474.
[I 2025-03-04 22:01:38,554] Trial 2 finished with value: 0.42817248821258547 and parameters: {'d_token': 108, 'num_heads': 7, 'num_layers': 1, 'd_ffn': 212, 'lr': 0.00177567629409646, 'dropout': 0.4478411234412837, 'n_bins': 47}. Best is trial 1 with value: 0.31122311353683474.
[I 2025-03-04 22:03:25,853] Trial 3 finished with value: 1.449229679107666 and parameters: {'d_token': 61, 'num_heads': 2, 'num_layers': 3, 'd_ffn': 246, 'lr': 0.0

Best trial:
  Value (validation loss): 0.3112
  Params:
    d_token: 83
    num_heads: 8
    num_layers: 2
    d_ffn: 206
    lr: 0.0004178731104848749
    dropout: 0.3575829004136913
    n_bins: 100

=== Training sparse FT Transformer with Piecewise Linear Embedding (Tuned) ===
Epoch 1/100, Train Loss: 1.4883, Val Loss: 0.7834
Epoch 2/100, Train Loss: 0.6942, Val Loss: 0.7022
Epoch 3/100, Train Loss: 0.5004, Val Loss: 0.5640
Epoch 4/100, Train Loss: 0.4272, Val Loss: 0.5819
Epoch 5/100, Train Loss: 0.4069, Val Loss: 0.5234
Epoch 6/100, Train Loss: 0.3842, Val Loss: 0.5580
Epoch 7/100, Train Loss: 0.3673, Val Loss: 0.4899
Epoch 8/100, Train Loss: 0.3664, Val Loss: 0.4755
Epoch 9/100, Train Loss: 0.3506, Val Loss: 0.4563
Epoch 10/100, Train Loss: 0.3459, Val Loss: 0.4372
Epoch 11/100, Train Loss: 0.3463, Val Loss: 0.4406
Epoch 12/100, Train Loss: 0.3278, Val Loss: 0.4039
Epoch 13/100, Train Loss: 0.3235, Val Loss: 0.3972
Epoch 14/100, Train Loss: 0.3299, Val Loss: 0.4338
Epoch 15/100, T

In [None]:
import os
from google.colab import drive

def save_all_files_to_drive(folder_name='CA_Files'):
    """Copy every regular file in the current directory to a Google Drive folder.

    Mounts Google Drive at ``/content/drive``, creates ``My Drive/<folder_name>``
    if it does not exist, and copies each regular file found in the current
    working directory into it. Sub-directories are skipped (the previous
    ``cp`` without ``-r`` did not copy them either). A file that cannot be
    copied is reported and the remaining files are still processed.

    Args:
        folder_name: The name of the folder to create in Google Drive.
            Defaults to 'CA_Files'.
    """
    # Local import keeps this cell self-contained; shutil replaces the shell `cp`,
    # so file names containing quotes, `$` or backticks are copied safely.
    import shutil

    # Mount Google Drive
    drive.mount('/content/drive')

    # Create the folder in Google Drive
    folder_path = os.path.join('/content/drive/My Drive', folder_name)
    os.makedirs(folder_path, exist_ok=True)

    # Copy each regular file to the Google Drive folder
    for file in os.listdir('.'):
        source_path = os.path.join('.', file)
        if not os.path.isfile(source_path):
            continue  # directories (including the Drive mount point itself) are not copied
        destination_path = os.path.join(folder_path, file)
        try:
            shutil.copy(source_path, destination_path)
        except OSError as err:
            # Best effort, like the original shell call: report and carry on.
            print(f"Could not copy {file}: {err}")

    print(f"All files saved to Google Drive: /content/drive/My Drive/{folder_name}")

# Copy the files in the working directory to the default Drive folder ('CA_Files').
save_all_files_to_drive()

In [None]:
# Second invocation of the same copy, in its own cell.
# NOTE(review): looks redundant with the call in the previous cell — confirm it is still needed.
save_all_files_to_drive()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
All files saved to Google Drive: /content/drive/My Drive/CA_Files
