Here we use AE-MLP model with all lags

hidden_dim = 128 and lr = 1e-6; these parameters were suggested in the discussion as the best, and we also tested other values close to them

the model is trained and saved as /kaggle/input/ae_mlp_v3/pytorch/ae_mlp_v3/1/ae_mlp_model (1).pth (https://www.kaggle.com/models/peach785/ae_mlp_v3)

AE для выделения более унифицированных признаков из данных. у нас все признаки в разном масштабе и разного распределения, поэтому AE может быть ключевым элементом. Перед подачей данных в AE мы их стандартизируем для приведения к одному масштабу, что может также улучшить качество модели.

После того как данные были закодированы, они проходят через декодер, который служит для предсказания таргета, что часто улучшает качество предсказания.

In [None]:
import pandas as pd
import polars as pl
import numpy as np
import gc

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, WeightedRandomSampler, TensorDataset
from sklearn.preprocessing import RobustScaler

## Model

In [None]:
# AE-MLP with Dropout (L2 regularization is applied through the optimizer's weight_decay)
class AE_MLP(nn.Module):
    """Encoder/decoder MLP that maps a feature vector to ``output_dim`` values.

    The encoder narrows ``input_dim -> hidden_dim -> hidden_dim // 2``; the
    decoder widens ``hidden_dim // 2 -> hidden_dim`` and ends in a linear
    layer of width ``output_dim``. Note that the decoder emits predictions,
    not a reconstruction of the input.

    Args:
        input_dim: Number of input features.
        hidden_dim: Width of the widest hidden layer.
        output_dim: Number of values produced per sample.
        dropout_rate: Drop probability used after every hidden activation.
    """

    def __init__(self, input_dim, hidden_dim=128, output_dim=1, dropout_rate=0.3):
        super().__init__()
        bottleneck_dim = hidden_dim // 2

        # Layers are created in the same order and kept at the same Sequential
        # indices (0 and 3 hold the Linear layers) so saved state_dicts still load.
        encoder_layers = [
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(p=dropout_rate),  # dropout after the activation to limit overfitting
            nn.Linear(hidden_dim, bottleneck_dim),
            nn.ReLU(),
            nn.Dropout(p=dropout_rate),
        ]
        decoder_layers = [
            nn.Linear(bottleneck_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(p=dropout_rate),
            nn.Linear(hidden_dim, output_dim),
        ]
        self.encoder = nn.Sequential(*encoder_layers)
        self.decoder = nn.Sequential(*decoder_layers)

    def forward(self, x):
        """Return the decoder output for the encoded representation of ``x``."""
        return self.decoder(self.encoder(x))

# Several loss functions to choose the better one from
# Weighted Huber loss
def weighted_loss(predictions, targets, weights, delta=1.0):
    """Return the mean of the per-sample Smooth-L1 loss multiplied by ``weights``.

    Args:
        predictions: Model outputs.
        targets: Ground-truth values, broadcastable against ``predictions``.
        weights: Per-sample weights, broadcastable against the element-wise loss.
        delta: Passed as ``beta`` to the Smooth-L1 loss. With the default of
            1.0 this coincides with the Huber loss; for other values Smooth-L1
            equals the Huber loss divided by ``delta``.

    Returns:
        A scalar tensor: ``mean(loss_i * weight_i)`` (a plain mean, not
        normalised by the sum of the weights).
    """
    elementwise = nn.functional.smooth_l1_loss(
        predictions, targets, reduction='none', beta=delta
    )
    return (elementwise * weights).mean()

# # RMSE
# def weighted_loss(predictions, targets, weights):
#     per_sample_loss = (predictions - targets) ** 2
#     weighted_mse = (per_sample_loss * weights).mean()
#     weighted_rmse = torch.sqrt(weighted_mse)
#     return weighted_rmse


In [None]:
def train_model(model, train_loader, val_loader, num_epochs=10, learning_rate=1e-3, l2_lambda=1e-5):
    """Train ``model`` with Adam and report train/validation loss every epoch.

    Args:
        model: Module to optimise; it is moved to CUDA when available, else CPU.
        train_loader: Iterable of ``(features, targets, weights)`` training batches.
        val_loader: Iterable of ``(features, targets, weights)`` validation batches.
        num_epochs: Number of passes over ``train_loader``.
        learning_rate: Adam learning rate.
        l2_lambda: Adam ``weight_decay`` (L2 regularisation strength).

    Returns:
        None. The model is updated in place and one line per epoch is printed.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    # L2 regularisation is provided through the optimizer's weight_decay.
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=l2_lambda)

    def _batch_loss(batch):
        # Move one batch to the device and compute its weighted loss.
        features, targets, weights = (tensor.to(device) for tensor in batch)
        return weighted_loss(model(features).squeeze(), targets.squeeze(), weights)

    for epoch in range(num_epochs):
        # Training pass
        model.train()
        train_total = 0.0
        for batch in train_loader:
            optimizer.zero_grad()
            batch_loss = _batch_loss(batch)
            batch_loss.backward()
            # Clip the global gradient norm to keep updates bounded.
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            train_total += batch_loss.item()

        # Validation pass
        model.eval()
        with torch.no_grad():
            val_total = sum((_batch_loss(batch).item() for batch in val_loader), 0.0)

        mean_train = train_total / len(train_loader)
        mean_val = val_total / len(val_loader)
        print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {mean_train:.6f}, Val Loss: {mean_val:.6f}")


In [None]:
def prepare_dataloader(X, y, weights, batch_size=1024):
    """Build a DataLoader that draws samples in proportion to ``weights``.

    Args:
        X: Feature tensor.
        y: Target tensor with the same first dimension as ``X``.
        weights: Per-sample weights; they are used both as the third element
            of every batch and as the sampling weights.
        batch_size: Number of samples per batch.

    Returns:
        A DataLoader over ``(X, y, weights)`` that draws ``len(weights)``
        samples per epoch, with replacement.
    """
    return DataLoader(
        TensorDataset(X, y, weights),
        batch_size=batch_size,
        sampler=WeightedRandomSampler(weights, len(weights), replacement=True),
    )

# Ensemble mechanism
class EnsembleModel(nn.Module):
    """Average the outputs of several models evaluated on the same input.

    Args:
        models: Iterable of modules; they are registered in an
            ``nn.ModuleList`` under the attribute ``models``.
    """

    def __init__(self, models):
        super().__init__()
        self.models = nn.ModuleList(models)

    def forward(self, x):
        """Return the element-wise mean of every member's output for ``x``."""
        member_outputs = []
        for member in self.models:
            member_outputs.append(member(x))
        stacked = torch.stack(member_outputs, dim=0)
        return stacked.mean(dim=0)


## Load data

In [None]:
class CONFIG:
    """Constants shared by the data-preparation cells."""

    # Column the model is trained to predict.
    target_col = "responder_6"
    # Columns pulled out to build the lag table: join keys plus responder_0..8.
    lag_cols_original = ["date_id", "symbol_id", *(f"responder_{idx}" for idx in range(9))]
    # responder_N -> responder_N_lag_1
    lag_cols_rename = {
        col: f"{col}_lag_1" for col in (f"responder_{idx}" for idx in range(9))
    }
    # Fraction of rows used to locate the train/validation boundary.
    valid_ratio = 0.09
    # Only rows with date_id strictly greater than this value are used.
    start_dt = 1450


# Location of the training parquet data that is scanned lazily below.
train_path = "/kaggle/input/Preprocessing/training.parquet/"
# Location of the validation parquet data (not referenced elsewhere in the visible notebook).
valid_path = "/kaggle/input/Preprocessing/validation.parquet/"

In [None]:
# Lazily scan the training data, prepend a running row index ("id"), add an
# integer "label" column (target * 2 cast to Int32) and keep only the rows
# with date_id > CONFIG.start_dt (presumably the last 2 parquet partitions).
train = (
    pl.scan_parquet(train_path)
    .select(pl.int_range(pl.len(), dtype=pl.UInt32).alias("id"), pl.all())
    .with_columns((pl.col(CONFIG.target_col) * 2).cast(pl.Int32).alias("label"))
    .filter(pl.col("date_id") > CONFIG.start_dt)
)

In [None]:
# Build the lag table: take the key and responder columns, rename the
# responders to *_lag_1, shift date_id forward by one day and keep, per
# (date_id, symbol_id), the last record of the previous date.
lags = (
    train.select(pl.col(CONFIG.lag_cols_original))
    .rename(CONFIG.lag_cols_rename)
    .with_columns((pl.col('date_id') + 1).alias('date_id'))  # lagged by 1 day
    .group_by(["date_id", "symbol_id"], maintain_order=True)
    .last()
)
lags

In [None]:
# Attach the previous day's responders (responder_*_lag_1) to every row; the
# left join keeps rows that have no lag record, leaving those columns null.
train = train.join(lags, on=["date_id", "symbol_id"],  how="left")

In [None]:
# Materialise the date_id column once: collecting the lazy frame re-runs the
# whole query (scan + lag join), so doing it twice doubles the cost.
date_ids = train.select(pl.col("date_id")).collect()

len_train = date_ids.shape[0]
valid_records = int(len_train * CONFIG.valid_ratio)
len_ofl_mdl = len_train - valid_records
# date_id of the first row past the training share; assumes rows are ordered
# by date_id — TODO confirm.
last_tr_dt = date_ids.row(len_ofl_mdl)[0]
del date_ids

print(f"\n len_train = {len_train}")
print(f"\n len_ofl_mdl = {len_ofl_mdl}")
print(f"\n---> Last offline train date = {last_tr_dt}\n")

# Split on whole dates: everything up to and including last_tr_dt is used for
# training, every later date for validation.
training_data = train.filter(pl.col("date_id").le(last_tr_dt))
validation_data   = train.filter(pl.col("date_id").gt(last_tr_dt))

## Training and validating

In [None]:
# RobustScaler because fat tails are common in financial data
scaler = RobustScaler()

# Use the same inputs as the final model trained on all data. The same-day
# responder_* columns must not be features: responder_6 is the target itself,
# so feeding it in leaks the answer and inflates the validation score.
feature_cols = (
    [f"feature_{idx:02d}" for idx in range(79)]
    + [f"responder_{idx}_lag_1" for idx in range(9)]
    + ['symbol_id', 'time_id', 'date_id']
)


def _collect_split(frame):
    """Collect one lazy split once and return (features, target, weights) as float32 arrays."""
    collected = frame.select(feature_cols + ['responder_6', 'weight']).collect()
    features = collected.select(feature_cols).to_numpy().astype('float32')
    target = collected.select('responder_6').to_numpy().astype('float32')
    sample_weights = collected.select('weight').to_numpy().astype('float32').flatten()
    return features, target, sample_weights


def _fill_nan_with_means(features, means):
    """Replace every NaN in ``features`` in place by the mean of its column."""
    _, nan_cols = np.where(np.isnan(features))
    features[np.isnan(features)] = means[nan_cols]


X_train, y_train, weights = _collect_split(training_data)
X_val, y_val, weights_val = _collect_split(validation_data)

# Impute with the TRAINING column means for both splits so that no statistic
# of the validation period is used during preprocessing.
col_means = np.nanmean(X_train, axis=0)
_fill_nan_with_means(X_train, col_means)
_fill_nan_with_means(X_val, col_means)

# Fit the scaler on the training split only, then apply it to validation.
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)

# Convert numpy arrays to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32)
weights_train_tensor = torch.tensor(weights, dtype=torch.float32)

X_val_tensor = torch.tensor(X_val, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val, dtype=torch.float32)
weights_val_tensor = torch.tensor(weights_val, dtype=torch.float32)

# Prepare DataLoaders
train_loader = prepare_dataloader(X_train_tensor, y_train_tensor, weights_train_tensor)
# Validation visits every row exactly once, in order: a weighted random
# sampler would evaluate on a random resample (with replacement) instead.
val_loader = DataLoader(
    TensorDataset(X_val_tensor, y_val_tensor, weights_val_tensor),
    batch_size=1024,
    shuffle=False,
)



In [None]:
# Initialize the model and start training with validation.
# The Adam optimizer itself is created inside train_model.
input_dim = X_train_tensor.shape[1]  # Number of features
model = AE_MLP(input_dim=input_dim, hidden_dim=128)

# learning_rate=1e-6 overrides train_model's default of 1e-3.
train_model(model, train_loader, val_loader, num_epochs=10, learning_rate=1e-6)

Huber Loss Weighted R2: 0.999055
RMSE Weighted R2: 0.999034


Count R2

In [None]:
def weighted_r2_on_batches(val_loader, model):
    """Compute the sample-weighted R² of ``model`` over all batches of a loader.

    The score is ``1 - sum(w * (y - pred)^2) / sum(w * (y - mean_w)^2)`` where
    ``mean_w`` is the weighted mean of the targets over the WHOLE loader. It is
    obtained in a single pass from the running sums of ``w``, ``w*y`` and
    ``w*y^2``.

    Args:
        val_loader: Iterable yielding ``(features, targets, weights)`` batches.
        model: Module producing one prediction per sample; it is moved to the
            evaluation device and switched to eval mode.

    Returns:
        The weighted R² as a Python float.

    Raises:
        ZeroDivisionError: If the loader is empty, the weights sum to zero, or
            the targets have zero weighted variance.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()

    sum_w = 0.0      # sum of weights
    sum_wy = 0.0     # sum of weight * target
    sum_wyy = 0.0    # sum of weight * target^2
    ss_residual = 0.0

    with torch.no_grad():
        for features, targets, weights in val_loader:
            preds = model(features.to(device))

            # Flatten everything to 1-D: targets/predictions usually arrive as
            # (batch, 1) while weights are (batch,), and multiplying those
            # shapes directly would broadcast to a (batch, batch) matrix.
            # float64 limits cancellation in the variance formula below.
            y = targets.to(device).reshape(-1).double()
            w = weights.to(device).reshape(-1).double()
            p = preds.reshape(-1).double()

            sum_w += w.sum().item()
            sum_wy += (w * y).sum().item()
            sum_wyy += (w * y * y).sum().item()
            ss_residual += (w * (y - p) ** 2).sum().item()

    # Global weighted mean of the targets
    weighted_mean = sum_wy / sum_w
    # sum(w * (y - mean)^2) == sum(w * y^2) - sum(w) * mean^2
    ss_total = sum_wyy - sum_w * weighted_mean ** 2

    # Final R²
    r2 = 1 - (ss_residual / ss_total)
    return r2

# Compute R² of the trained model on the validation loader
r2_score = weighted_r2_on_batches(val_loader, model)
print(f"Weighted R2: {r2_score:.6f}")


## train on all data

In [None]:
import joblib

# Full dataset for the final model: 79 features, 9 lagged responders and the
# symbol/time/date identifiers, materialised as a float32 matrix.
X_data = (
    train.select(
        [f"feature_{idx:02d}" for idx in range(79)]
        + [f"responder_{idx}_lag_1" for idx in range(9)]
        + ['symbol_id', 'time_id', 'date_id']
    )
    .collect()
    .to_numpy()
    .astype('float32')
)

# Replace every NaN by the mean of its column (means ignore NaNs).
col_means = np.nanmean(X_data, axis=0)
nan_mask = np.isnan(X_data)
X_data[nan_mask] = col_means[np.nonzero(nan_mask)[1]]
del nan_mask

# Target and per-row weights
y_data = train.select('responder_6').collect().to_numpy().astype('float32')
all_weights = train.select('weight').collect().to_numpy().astype('float32').flatten()

# Fit the scaler on the whole dataset and scale the features
scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_data)

# Persist the fitted scaler so the same transform can be applied at prediction time
joblib.dump(scaler, 'robust_scaler_07_01.pkl')

# Convert to torch tensors
X_data_tensor = torch.tensor(X_train_scaled, dtype=torch.float32)
y_data_tensor = torch.tensor(y_data, dtype=torch.float32)
all_weights_tensor = torch.tensor(all_weights, dtype=torch.float32)

all_train_loader = prepare_dataloader(X_data_tensor, y_data_tensor, all_weights_tensor)

for prediction:

 Load the saved scaler (use the same file name it was saved under)
scaler = joblib.load('robust_scaler_07_01.pkl')

 Transform input data for prediction
X_test_scaled = scaler.transform(X_test)

 Convert to torch tensors
X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32)

 Perform prediction
with torch.no_grad():
    model.eval()
    X_test_tensor = X_test_tensor.to(device)
    predictions = model(X_test_tensor).cpu().numpy()

 Note: no inverse transform of the predictions is needed — the scaler was fitted on the input features only, so the model already outputs the target on its original scale
predictions_original_scale = predictions



In [None]:
# train.select(
#     [f"feature_{idx:02d}" for idx in range(79)] + [f"responder_{idx}_lag_1" for idx in range(9)] + ['symbol_id', 'time_id', 'date_id']
# ).collect()

In [None]:
def train_on_full_data(model, full_loader, num_epochs=10, learning_rate=1e-3, l2_lambda=1e-5):
    """Train ``model`` on every batch of ``full_loader`` without a validation pass.

    Args:
        model: Module to optimise; it is moved to CUDA when available, else CPU.
        full_loader: Iterable of ``(features, targets, weights)`` batches.
        num_epochs: Number of passes over ``full_loader``.
        learning_rate: Adam learning rate.
        l2_lambda: Adam ``weight_decay`` (L2 regularisation strength).

    Returns:
        None. The model is updated in place and the mean training loss is
        printed after each epoch.
    """
    device_name = "cuda" if torch.cuda.is_available() else "cpu"
    device = torch.device(device_name)
    model.to(device)
    # L2 regularisation is provided through the optimizer's weight_decay.
    optimizer = torch.optim.Adam(
        model.parameters(), lr=learning_rate, weight_decay=l2_lambda
    )

    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        for batch_features, batch_targets, batch_weights in full_loader:
            batch_features = batch_features.to(device)
            batch_targets = batch_targets.to(device)
            batch_weights = batch_weights.to(device)

            optimizer.zero_grad()
            predictions = model(batch_features)
            batch_loss = weighted_loss(
                predictions.squeeze(), batch_targets.squeeze(), batch_weights
            )
            batch_loss.backward()
            # Clip the global gradient norm to keep updates bounded.
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()

            running_loss += batch_loss.item()

        epoch_mean = running_loss / len(full_loader)
        print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {epoch_mean:.6f}")


In [None]:
# Initialize model, optimizer, and start training
# Fresh model sized to the full-data feature matrix (replaces the validation model).
input_dim = X_data.shape[1]  # Number of features
model = AE_MLP(input_dim=input_dim, hidden_dim=128)

# No validation pass here; learning_rate=1e-6 overrides the default of 1e-3.
train_on_full_data(model, all_train_loader, num_epochs=20, learning_rate=1e-6)

In [None]:
import os

# Save the model: only the state_dict (parameters and buffers) is written, so
# loading requires constructing the model first and then loading the weights.
model_save_path = "ae_mlp_model_06_01_2025.pth"
torch.save(model.state_dict(), model_save_path)

print(f"Model saved at {model_save_path}")

# import joblib
# joblib.dump(model.state_dict(), 'ae_mlp_04_01_2025.pkl')

In [None]:
# 128 learning_rate=1e-5
# Epoch 1/10, Train Loss: 0.688642, Val Loss: 0.482744
# Epoch 2/10, Train Loss: 0.684686, Val Loss: 0.484899

# 256
# Epoch 1/10, Train Loss: 0.686451, Val Loss: 0.484307
# Epoch 2/10, Train Loss: 0.685009, Val Loss: 0.486779

# 128 learning_rate=1e-6
# Epoch 1/10, Train Loss: 0.842413, Val Loss: 0.487302
# Epoch 2/10, Train Loss: 0.688747, Val Loss: 0.484387
