In [1]:
# Standard libraries
import os
import time
import random
import itertools

# Third-party libraries
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split

# PyTorch
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader, random_split
from copy import deepcopy


def set_seed(seed):
    """Seed every random number generator this notebook relies on.

    The same seed is handed to PyTorch (``manual_seed``, ``cuda.manual_seed``
    and ``cuda.manual_seed_all``), to NumPy's global generator and to the
    standard-library ``random`` module. cuDNN is then switched to
    deterministic mode with benchmarking turned off.

    Args:
        seed: Seed value passed unchanged to each library.
    """
    # PyTorch seeding calls, in the order: default, current CUDA device,
    # every CUDA device (multi-GPU setups).
    torch_seeders = (
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seed_torch in torch_seeders:
        seed_torch(seed)

    # NumPy's global generator and Python's built-in generator.
    np.random.seed(seed)
    random.seed(seed)

    # Request deterministic cuDNN algorithms and disable its benchmark mode.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False


def prepare_data(X, y, train_ratio=0.6, val_ratio=0.2, seed=None):
    """Randomly partition ``(X, y)`` into train, validation and test subsets.

    Args:
        X: Feature tensor; its length sets the total number of samples.
        y: Target tensor paired row-by-row with ``X``.
        train_ratio: Fraction of samples for training (truncated to an int).
        val_ratio: Fraction of samples for validation (truncated to an int).
        seed: If given, the split is drawn from a dedicated generator seeded
            with this value; if ``None``, no generator is passed explicitly.

    Returns:
        The ``random_split`` result: train, validation and test subsets, in
        that order.
    """
    n_samples = len(X)
    n_train = int(train_ratio * n_samples)
    n_val = int(val_ratio * n_samples)
    # The test subset takes whatever the two truncated counts leave over.
    split_sizes = [n_train, n_val, n_samples - n_train - n_val]

    paired = TensorDataset(X, y)

    if seed is None:
        rng = None
    else:
        rng = torch.Generator().manual_seed(seed)

    return random_split(paired, split_sizes, generator=rng)


class FlexibleMLP(nn.Module):
    """Fully connected regression network with configurable depth and width.

    The network stacks ``depth`` hidden blocks, each a ``Linear`` layer
    followed by ``ReLU``, and ends with a ``Linear`` layer that produces a
    single output per sample.

    Args:
        input_dim: Number of input features.
        depth: Number of hidden ``Linear`` + ``ReLU`` blocks. With ``0`` the
            model is a single linear map from ``input_dim`` to one output.
        width: Number of units in every hidden layer.
    """

    def __init__(self, input_dim, depth, width):
        super().__init__()
        blocks = []
        in_features = input_dim
        for _ in range(depth):
            blocks.append(nn.Sequential(
                nn.Linear(in_features, width),
                nn.ReLU()
            ))
            in_features = width
        # Size the output layer from the last layer actually built rather than
        # from `width`, so depth == 0 still accepts `input_dim` features.
        blocks.append(nn.Linear(in_features, 1))
        self.layers = nn.Sequential(*blocks)

    def forward(self, x):
        """Run ``x`` through all layers and return the single-output prediction."""
        return self.layers(x)


def grid_search_sdl(X, y, input_dim, seed, dims=(32, 64), depths=(2, 3, 4),
                    lrs=(0.001,), batch_size=16, num_epochs=25000, patience=30):
    """Grid-search MLP hyperparameters and score the best model on the test split.

    The data are split into train/validation/test subsets with ``prepare_data``.
    For every ``(width, depth, learning rate)`` combination a ``FlexibleMLP`` is
    trained with Adam and a ``StepLR`` schedule, minimising MSE, with early
    stopping on the validation loss. The weights with the lowest validation
    loss across all combinations are then evaluated on the test split.

    Args:
        X: Feature tensor.
        y: Target tensor paired row-by-row with ``X``.
        input_dim: Number of input features the model should accept.
        seed: Seed used for both the global RNGs and the data split.
        dims: Hidden-layer widths to try.
        depths: Numbers of hidden layers to try.
        lrs: Adam learning rates to try.
        batch_size: Batch size for all three data loaders.
        num_epochs: Upper bound on training epochs per configuration.
        patience: Number of consecutive epochs without a new best validation
            loss after which training of a configuration stops.

    Returns:
        float: The mean over test batches of the per-batch RMSE. This is an
        average of batch-level RMSE values, which in general differs from the
        RMSE computed over the whole test split at once.

    Raises:
        ValueError: If any of the train/validation/test splits is empty.
        RuntimeError: If no configuration produced a finite validation loss
            (for example an empty grid, ``num_epochs == 0``, or NaN losses).
    """
    set_seed(seed)

    # Prepare data loaders with seed control
    train_set, val_set, test_set = prepare_data(X, y, seed=seed)
    if min(len(train_set), len(val_set), len(test_set)) == 0:
        raise ValueError(
            "grid_search_sdl needs non-empty train/val/test splits, got sizes "
            f"{len(train_set)}/{len(val_set)}/{len(test_set)}"
        )
    train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_set, batch_size=batch_size, shuffle=False)
    test_loader = DataLoader(test_set, batch_size=batch_size, shuffle=False)

    criterion = nn.MSELoss()
    best_val_loss = float('inf')
    best_state = None
    best_params = None

    # Grid search over hyperparameters
    for dim, depth, lr in itertools.product(dims, depths, lrs):
        model = FlexibleMLP(input_dim, depth, dim)
        optimizer = optim.Adam(model.parameters(), lr=lr)
        scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=200, gamma=0.95)

        config_best_loss = float('inf')
        config_best_state = None
        bad_epochs = 0

        for _ in range(num_epochs):
            # One pass over the training data.
            model.train()
            for X_batch, y_batch in train_loader:
                optimizer.zero_grad()
                loss = criterion(model(X_batch), y_batch)
                loss.backward()
                optimizer.step()
            scheduler.step()

            # Validation loss: mean of the per-batch MSE values.
            model.eval()
            val_loss = 0
            with torch.no_grad():
                for X_val, y_val in val_loader:
                    val_loss += criterion(model(X_val), y_val).item()
            avg_val_loss = val_loss / len(val_loader)

            # Early stopping: snapshot the weights on every improvement and
            # give up after `patience` epochs in a row without one. A NaN
            # loss compares False here, so it counts as a bad epoch.
            if avg_val_loss < config_best_loss:
                config_best_loss = avg_val_loss
                config_best_state = deepcopy(model.state_dict())
                bad_epochs = 0
            else:
                bad_epochs += 1
                if bad_epochs >= patience:
                    break

        # Keep the configuration with the lowest validation loss so far.
        if config_best_loss < best_val_loss:
            best_val_loss = config_best_loss
            best_params = (dim, depth, lr)
            best_state = config_best_state

    if best_params is None:
        raise RuntimeError(
            "grid_search_sdl: no hyperparameter configuration produced a "
            "finite validation loss"
        )

    # Rebuild the winning architecture and restore its best weights.
    model = FlexibleMLP(input_dim, best_params[1], best_params[0])
    model.load_state_dict(best_state)
    model.eval()

    # Test metric: RMSE of each batch, averaged over batches.
    test_loss = 0
    with torch.no_grad():
        for X_test, y_test in test_loader:
            test_loss += torch.sqrt(criterion(model(X_test), y_test)).item()

    return test_loss / len(test_loader)

In [2]:

# Read the combined table; the 'domain' column labels each row as belonging
# to the source or the target domain.
df = pd.read_csv("ADNI_real_data.csv")
is_source = df['domain'] == 'source'
is_target = df['domain'] == 'target'

# Response, as float32 column vectors of shape (n, 1), one per domain.
y_s = torch.tensor(df.loc[is_source, 'y'].values, dtype=torch.float32).view(-1, 1)
y_t = torch.tensor(df.loc[is_target, 'y'].values, dtype=torch.float32).view(-1, 1)

# First feature block (columns prefixed "X_1_"): extracted for both domains.
X_1_cols = [col for col in df.columns if col.startswith("X_1_")]
X_s_1 = torch.tensor(df.loc[is_source, X_1_cols].values, dtype=torch.float32)
X_t_1 = torch.tensor(df.loc[is_target, X_1_cols].values, dtype=torch.float32)

# Second feature block (columns prefixed "X_2_"): extracted for the target domain only.
X_2_cols = [col for col in df.columns if col.startswith("X_2_")]
X_t_2 = torch.tensor(df.loc[is_target, X_2_cols].values, dtype=torch.float32)

# Third feature block (columns prefixed "X_3_"): extracted for the source domain only.
X_3_cols = [col for col in df.columns if col.startswith("X_3_")]
X_s_3 = torch.tensor(df.loc[is_source, X_3_cols].values, dtype=torch.float32)

# Block widths, taken from the columns actually present so they can never
# drift from the data. (They were previously hard-coded as 267, 113 and 300.)
p1 = len(X_1_cols)
p2 = len(X_2_cols)
p3 = len(X_3_cols)
assert p1 > 0 and p2 > 0 and p3 > 0, (
    f"expected X_1_/X_2_/X_3_ feature columns, found {p1}/{p2}/{p3}"
)


In [3]:
# The design matrix of each domain is the same for every seed, so build both
# once instead of re-concatenating inside the loop.
X_t_hat = torch.cat([X_t_1, X_t_2], dim=1)
X_s_hat = torch.cat([X_s_1, X_s_3], dim=1)

# Fail fast, before the long training run, if the declared block widths do
# not match the tensors that will be fed to the models.
assert X_t_hat.shape[1] == p1 + p2, (
    f"target features have {X_t_hat.shape[1]} columns, expected p1 + p2 = {p1 + p2}"
)
assert X_s_hat.shape[1] == p1 + p3, (
    f"source features have {X_s_hat.shape[1]} columns, expected p1 + p3 = {p1 + p3}"
)

results = []
num_seeds = 20
for seed in range(num_seeds):
    set_seed(seed)
    start_time = time.time()

    # Target-domain model first, then source-domain model, each with its own
    # grid search under the same seed.
    average_test_loss_t = grid_search_sdl(X_t_hat, y_t, p1 + p2, seed)
    average_test_loss_s = grid_search_sdl(X_s_hat, y_s, p1 + p3, seed)

    elapsed_time = time.time() - start_time
    results.append((seed, average_test_loss_s, average_test_loss_t, elapsed_time))

# NOTE: `df` is rebound here from the raw data table to the per-seed results.
df = pd.DataFrame(results, columns=['seed', 'average_test_loss_s', 'average_test_loss_t', 'time'])
filename = f"SDL_ADNI_seeds={num_seeds}.csv"
filepath = os.path.join(".", filename)
df.to_csv(filepath, index=False)
print(f"SDL Results saved to {filepath}")
        

SDL Results saved to ./SDL_ADNI_seeds=20.csv


In [7]:
# Mean and sample standard deviation of the two test-loss columns across seeds.
loss_columns = ['average_test_loss_s', 'average_test_loss_t']
test_losses = df[loss_columns]
mean_values = test_losses.mean()
std_values = test_losses.std()

for summary in (mean_values, std_values):
    print(summary)

average_test_loss_s    4.610077
average_test_loss_t    2.877350
dtype: float64
average_test_loss_s    1.373310
average_test_loss_t    1.244118
dtype: float64
