## Neural Network

In [None]:
from pathlib import Path
import pickle
import shutil

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from torch.utils.tensorboard import SummaryWriter

from utils.eval_helpers import calculate_binary_classification_metrics

In [None]:
# Reproducibility and device selection.
random_state = 42
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {device} device")

# Always seed the default generator: it drives weight initialisation (modules are
# created on the CPU before being moved) and DataLoader shuffling, even when the
# model itself runs on the GPU.
torch.manual_seed(random_state)

if device == "cuda":
    # The seed must be the integer `random_state`; passing the device string
    # ("cuda") raises a ValueError inside torch.cuda.manual_seed*.
    torch.cuda.manual_seed_all(random_state)
    # Trade cuDNN autotuning speed for run-to-run determinism.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

In [None]:
def count_parameters(model):
    """Return the total number of elements over everything yielded by ``model.parameters()``."""
    total = 0
    for param in model.parameters():
        total += param.numel()
    return total

def calculate_loss_and_metrics(model, loss_func, x, y, threshold=0.5):
    """Evaluate ``model`` on ``(x, y)`` and return ``(loss, metrics_df)``.

    The model is switched to eval mode (and left there) and a single forward
    pass over all of ``x`` is run without gradient tracking.

    Args:
        model: Callable module whose output is treated as probabilities.
        loss_func: Callable invoked as ``loss_func(model(x), y)``; its result
            must support ``.item()``.
        x: Input tensor passed to the model in one call.
        y: Target tensor.
        threshold: Outputs ``>= threshold`` are predicted as class 1.

    Returns:
        Tuple of the loss as a Python float and whatever
        ``calculate_binary_classification_metrics`` returns (callers in this
        notebook treat it as a DataFrame).
    """
    model.eval()
    with torch.no_grad():
        probabilities = model(x)
        loss_value = loss_func(probabilities, y).item()
        predictions = (probabilities >= threshold).int()
        # The metrics helper is given plain Python lists in the order
        # (true labels, hard predictions, probabilities).
        y_true_list, y_pred_list, y_prob_list = (
            tensor.cpu().numpy().tolist()
            for tensor in (y, predictions, probabilities)
        )
        metrics_df = calculate_binary_classification_metrics(
            y_true_list, y_pred_list, y_prob_list
        )
    return loss_value, metrics_df

def fit(epochs, model, train_dl, loss_func, opt, log_dir, x_train, y_train, x_val, y_val):
    """Train ``model`` for ``epochs`` epochs, logging to TensorBoard and checkpointing.

    After every epoch the loss and metrics are computed on the full training
    and validation tensors, written as TensorBoard scalars, and the model's
    ``state_dict`` is saved to ``<log_dir>/model_weights_<epoch>.pth``.

    Args:
        epochs: Number of passes over ``train_dl``.
        model: Module to optimise; updated in place.
        train_dl: Iterable yielding ``(xb, yb)`` mini-batches.
        loss_func: Callable invoked as ``loss_func(model(xb), yb)``.
        opt: Optimizer bound to ``model``'s parameters.
        log_dir: Directory for TensorBoard event files and checkpoints.
        x_train, y_train: Full training tensors used for per-epoch evaluation.
        x_val, y_val: Full validation tensors used for per-epoch evaluation.

    Returns:
        None.
    """
    checkpoint_dir = Path(log_dir)
    writer = SummaryWriter(log_dir=log_dir)
    # try/finally guarantees the event file is flushed and closed even if
    # training, evaluation or checkpointing raises part-way through.
    try:
        for epoch in range(epochs):
            model.train()  # evaluation below switches the model to eval mode
            for xb, yb in train_dl:
                lossb = loss_func(model(xb), yb)
                lossb.backward()
                opt.step()
                opt.zero_grad()

            train_loss, train_metrics_df = calculate_loss_and_metrics(model, loss_func, x_train, y_train)
            val_loss, val_metrics_df = calculate_loss_and_metrics(model, loss_func, x_val, y_val)

            writer.add_scalar('loss/train', train_loss, epoch)
            writer.add_scalar('loss/validation', val_loss, epoch)
            # add_scalar needs a single value, so each metrics column is
            # presumably one row long -- TODO confirm against the metrics helper.
            for column in train_metrics_df.columns:
                writer.add_scalar(f'{column}/train', train_metrics_df[column].values, epoch)
            for column in val_metrics_df.columns:
                writer.add_scalar(f'{column}/validation', val_metrics_df[column].values, epoch)

            # One checkpoint per epoch so any epoch can be reloaded later.
            torch.save(model.state_dict(), checkpoint_dir / f'model_weights_{epoch}.pth')
    finally:
        writer.close()

In [None]:
# Load the pickled train/validation features and labels from the working
# directory. NOTE: unpickling can execute arbitrary code, so only load files
# from a trusted source.
X_train_minmax_scaled = pickle.loads(Path('X_train_minmax_scaled.pkl').read_bytes())
# Alternative training features (currently disabled): 'X_train_minmax_scaled_resampled.pkl'
X_validation_minmax_scaled = pickle.loads(Path('X_validation_minmax_scaled.pkl').read_bytes())
y_train = pickle.loads(Path('y_train.pkl').read_bytes())
# Alternative training labels (currently disabled): 'y_train_resampled.pkl'
y_validation = pickle.loads(Path('y_validation.pkl').read_bytes())

In [None]:
# Run configuration. `resampled`, `bs` and `lr` are encoded in the run name so
# each setting gets its own TensorBoard/checkpoint folder under runs/.
resampled = False
bs, lr, epochs = 128, 0.001, 20
input_dim, output_dim = X_train_minmax_scaled.shape[1], 1
loss_func = nn.BCELoss()
log_name = f'resampled{resampled}-bs{bs}-lr{lr}'
log_dir = Path('runs', log_name)

# Start every run from a clean folder so old event files and checkpoints do not
# mix with the new ones.
if not log_dir.exists():
    print(f"Folder '{log_dir}' does not exist.")
else:
    shutil.rmtree(log_dir)
    print(f"Folder '{log_dir}' and all its contents have been deleted.")

In [None]:
# Move features and labels onto the selected device as float32 tensors; labels
# are reshaped into a single column first.
x_t, y_t, x_val, y_val = [
    torch.tensor(array, device=device, dtype=torch.float32)
    for array in (
        X_train_minmax_scaled,
        y_train.reshape(-1, 1),
        X_validation_minmax_scaled,
        y_validation.reshape(-1, 1),
    )
]

train_ds = TensorDataset(x_t, y_t)
valid_ds = TensorDataset(x_val, y_val)
# Only the training batches are shuffled; the final partial batch is kept in both loaders.
train_dl = DataLoader(train_ds, batch_size=bs, shuffle=True, drop_last=False)
valid_dl = DataLoader(valid_ds, batch_size=bs, shuffle=False, drop_last=False)

# A single linear layer followed by a sigmoid.
model = nn.Sequential(nn.Linear(input_dim, output_dim), nn.Sigmoid()).to(device)
optimizer = optim.Adam(model.parameters(), lr=lr)
print(f'#Parameters: {count_parameters(model)}')

fit(
    epochs=epochs,
    model=model,
    train_dl=train_dl,
    loss_func=loss_func,
    opt=optimizer,
    log_dir=log_dir,
    x_train=x_t,
    y_train=y_t,
    x_val=x_val,
    y_val=y_val,
)

In [None]:
# Reload the checkpoint written after epoch index 3 and report its validation
# metrics. The path is built from `log_dir` rather than hard-coded, so it keeps
# pointing at the current run's folder when resampled/bs/lr are changed.
best_epoch = 3  # presumably picked from the TensorBoard curves -- TODO confirm
weights_path = log_dir / f'model_weights_{best_epoch}.pth'
state_dict = torch.load(weights_path, map_location=torch.device(device), weights_only=True)
model.load_state_dict(state_dict, strict=True)
# Bind the loss to a real name: `_` is used by IPython for the last cell output.
val_loss, metrics_df = calculate_loss_and_metrics(model, loss_func, x_val, y_val)
metrics_df

Summary:

The best model is essentially a logistic regression (a single linear layer followed by a sigmoid), trained on the original, non-resampled data. Training and validation losses are still high (~0.47). Introducing more parameters (extra linear layers) degrades performance.