## Neural Network

### Setup

In [None]:
import sys
from pathlib import Path

# Put the parent of the current working directory on the import path
# (presumably so the `utils` package imported in the next cell resolves -
# confirm against the repository layout).
sys.path += [str(Path.cwd().parent)]

In [1]:
import pickle

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from torch.utils.tensorboard import SummaryWriter

from utils.eval_helpers import calculate_binary_classification_metrics

In [2]:
# Single seed shared by every random number generator used in this notebook.
random_state = 42

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {device} device")

# Seed the default generator unconditionally: CPU-side randomness (e.g. the
# DataLoader shuffle order) is used even when the model lives on the GPU.
torch.manual_seed(random_state)
if device == "cuda":
    # The CUDA seeding functions expect the integer seed. Passing the device
    # name ("cuda") fails when it is converted to an int.
    torch.cuda.manual_seed_all(random_state)
    # Trade cuDNN autotuning for repeatable kernel selection.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

Using cpu device


In [3]:
def get_data(x_arr, y_arr, bs=None, is_train=False):
    """Turn a feature/label pair into tensors and a DataLoader over them.

    Both inputs are converted to float32 tensors placed on the module-level
    `device`, paired in a TensorDataset and wrapped in a DataLoader.

    Args:
        x_arr: Feature data accepted by `torch.tensor`.
        y_arr: Label data accepted by `torch.tensor`.
        bs: Batch size handed to the DataLoader unchanged (may be None).
        is_train: When truthy the loader shuffles; otherwise order is kept.

    Returns:
        Tuple `(loader, x_tensor, y_tensor)`.
    """
    features = torch.tensor(x_arr, device=device, dtype=torch.float32)
    targets = torch.tensor(y_arr, device=device, dtype=torch.float32)
    loader = DataLoader(
        dataset=TensorDataset(features, targets),
        batch_size=bs,
        shuffle=bool(is_train),
        drop_last=False,  # keep the final, possibly smaller, batch
    )
    return loader, features, targets

def count_parameters(model):
    """Return the total number of elements over all of `model.parameters()`."""
    total = 0
    for param in model.parameters():
        total += param.numel()
    return total

def get_model(input_dim, hidden_dim=None, output_dim=1, lr=0.001):
    """Build a sigmoid-output network and an Adam optimizer for it.

    With a truthy `hidden_dim` the network is
    Linear -> Sigmoid -> Linear -> Sigmoid; otherwise it is a single
    Linear -> Sigmoid. The model is moved to the module-level `device` and
    its parameter count is printed.

    Args:
        input_dim: Number of input features.
        hidden_dim: Width of the hidden layer; None or 0 means no hidden layer.
        output_dim: Number of outputs.
        lr: Learning rate for Adam.

    Returns:
        Tuple `(model, optimizer)`.
    """
    if hidden_dim:
        layers = [
            nn.Linear(input_dim, hidden_dim),
            nn.Sigmoid(),
            nn.Linear(hidden_dim, output_dim),
        ]
    else:
        layers = [nn.Linear(input_dim, output_dim)]
    # Every variant ends with a sigmoid on the output.
    layers.append(nn.Sigmoid())

    model = nn.Sequential(*layers)
    model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr)
    print(f'#Parameters: {count_parameters(model)}')
    return model, optimizer

def calculate_loss_and_metrics(model, x, y, loss_func=None, threshold=0.5):
    """Run `model` on `x` without gradients and score the result against `y`.

    The model is switched to eval mode. Its outputs are thresholded into
    integer predictions, and the targets, predictions and raw outputs are
    passed (as nested Python lists, in that order) to
    `calculate_binary_classification_metrics`.

    Args:
        model: Module called as `model(x)`.
        x: Input tensor.
        y: Target tensor.
        loss_func: Optional callable `loss_func(outputs, y)`; skipped when falsy.
        threshold: Outputs `>= threshold` become prediction 1, others 0.

    Returns:
        Tuple `(loss, metrics_df)`; `loss` is a Python number, or None when
        no `loss_func` is given.
    """
    model.eval()
    with torch.no_grad():
        y_prob = model(x)
        y_pred = (y_prob >= threshold).int()
        y_true_list = y.cpu().numpy().tolist()
        y_pred_list = y_pred.cpu().numpy().tolist()
        y_prob_list = y_prob.cpu().numpy().tolist()
        metrics_df = calculate_binary_classification_metrics(
            y_true_list, y_pred_list, y_prob_list
        )
        loss = loss_func(y_prob, y).item() if loss_func else None
    return loss, metrics_df

def fit(epochs, model, train_dl, loss_func, opt, log_dir, x_train, y_train, x_val, y_val):
    """Train `model`, logging to TensorBoard and checkpointing every epoch.

    Each epoch makes one pass over `train_dl`, then computes loss and metrics
    on the full training and validation tensors, writes them as scalars under
    `log_dir`, and saves the model's state dict to
    `log_dir/model_weights_{epoch}.pth`.

    Args:
        epochs: Number of passes over `train_dl`.
        model: Module to optimise.
        train_dl: Iterable yielding `(xb, yb)` batches.
        loss_func: Callable `loss_func(outputs, targets)` returning a tensor.
        opt: Optimizer bound to `model`'s parameters.
        log_dir: Directory for TensorBoard event files and checkpoints.
        x_train, y_train: Full training tensors used for per-epoch evaluation.
        x_val, y_val: Full validation tensors used for per-epoch evaluation.
    """
    writer = SummaryWriter(log_dir=log_dir)
    try:
        for epoch in range(epochs):
            model.train()
            for xb, yb in train_dl:
                # Clear gradients first so anything accumulated before this
                # call (or left from the previous batch) cannot leak into
                # the current step.
                opt.zero_grad()
                lossb = loss_func(model(xb), yb)
                lossb.backward()
                opt.step()

            train_loss, train_metrics_df = calculate_loss_and_metrics(model, x_train, y_train, loss_func)
            val_loss, val_metrics_df = calculate_loss_and_metrics(model, x_val, y_val, loss_func)
            writer.add_scalar('loss/train', train_loss, epoch)
            writer.add_scalar('loss/validation', val_loss, epoch)
            # One scalar series per metric column. This assumes each metrics
            # frame holds a single row, since add_scalar needs one value -
            # TODO confirm against calculate_binary_classification_metrics.
            for column in train_metrics_df.columns:
                writer.add_scalar(f'{column}/train', train_metrics_df[column].values, epoch)
            for column in val_metrics_df.columns:
                writer.add_scalar(f'{column}/validation', val_metrics_df[column].values, epoch)
            torch.save(model.state_dict(), Path(log_dir) / Path(f'model_weights_{epoch}.pth'))
    finally:
        # Flush and release the event file even when training is interrupted
        # or raises part-way through.
        writer.close()

def evaluate_model(x_arr, y_arr, hidden_dim, weights_path):
    """Score a saved checkpoint on the given data.

    A fresh model is built with `get_model`, using the second dimension of
    the feature tensor as its input size. The state dict at `weights_path`
    is then loaded into it (strictly, mapped onto the module-level `device`).

    Args:
        x_arr: Feature data passed to `get_data`.
        y_arr: Label data passed to `get_data`.
        hidden_dim: Hidden width the checkpoint was trained with (None for none).
        weights_path: Path to the saved state dict.

    Returns:
        The metrics object produced by `calculate_loss_and_metrics`.
    """
    _loader, x_val, y_val = get_data(x_arr, y_arr)
    n_features = x_val.shape[1]
    model, _optimizer = get_model(n_features, hidden_dim)
    state_dict = torch.load(
        weights_path,
        map_location=torch.device(device),
        weights_only=True,
    )
    model.load_state_dict(state_dict, strict=True)
    # No loss function is supplied, so only the metrics part is meaningful.
    return calculate_loss_and_metrics(model, x_val, y_val)[1]

In [4]:
# Load the training/validation features and labels from pickle files in the
# current working directory.
# NOTE(review): unpickling can execute arbitrary code - only load files that
# were produced by a trusted step of this project.
# To train on the resampled set instead, read
# 'X_train_minmax_scaled_resampled.pkl' and 'y_train_resampled.pkl' for the
# two training variables below.
with open('X_train_minmax_scaled.pkl', 'rb') as fh:
    X_train_minmax_scaled = pickle.load(fh)
with open('X_validation_minmax_scaled.pkl', 'rb') as fh:
    X_validation_minmax_scaled = pickle.load(fh)

# Labels are reshaped into a single column.
with open('y_train.pkl', 'rb') as fh:
    y_train = pickle.load(fh).reshape(-1, 1)
with open('y_validation.pkl', 'rb') as fh:
    y_validation = pickle.load(fh).reshape(-1, 1)

### Training

In [None]:
# Hyperparameters and run label for the training cell below.
#
# `resampled` is only used to name the log directory, so it must mirror which
# training pickles were loaded. The data-loading cell reads the non-resampled
# files, so the flag is False; flip both together when switching data sets,
# otherwise the run is written to a mislabeled directory.
resampled = False
bs = 64
lr = 0.005
epochs = 20
input_dim = X_train_minmax_scaled.shape[1]
hidden_dim = 5  # None (or 0) trains the model without a hidden layer
output_dim = 1
loss_func = nn.BCELoss()

# The run directory encodes the settings; the hidden size is appended only
# when a hidden layer is used.
log_name = f'resampled{resampled}-bs{bs}-lr{lr}'
if hidden_dim:
    log_name += f'-hidden_dim{hidden_dim}'
log_dir = Path('runs') / Path(log_name)

# Report whether a run with these settings already exists; training into an
# existing folder rewrites its per-epoch checkpoint files.
if log_dir.exists():
    print(f"Folder '{log_dir}' already exists.")
else:
    print(f"Folder '{log_dir}' does not exist.")

In [None]:
# Build the shuffled training loader and the full train/validation tensors,
# create the model and optimizer, then train.
# NOTE: `y_train` is rebound here from the loaded label array to a tensor, so
# re-running this cell feeds that tensor back into `get_data`.
train_dl, x_train, y_train = get_data(
    X_train_minmax_scaled, y_train, bs=bs, is_train=True
)
_, x_val, y_val = get_data(X_validation_minmax_scaled, y_validation, bs=bs)
model, optimizer = get_model(
    input_dim, hidden_dim=hidden_dim, output_dim=output_dim, lr=lr
)
fit(
    epochs=epochs,
    model=model,
    train_dl=train_dl,
    loss_func=loss_func,
    opt=optimizer,
    log_dir=log_dir,
    x_train=x_train,
    y_train=y_train,
    x_val=x_val,
    y_val=y_val,
)

### Evaluation

#### `hidden_dim=None`

In [5]:
# Validation metrics for the checkpoint trained without a hidden layer.
metrics_df = evaluate_model(
    x_arr=X_validation_minmax_scaled,
    y_arr=y_validation,
    hidden_dim=None,
    weights_path='runs/resampledFalse-bs128-lr0.002/model_weights_8.pth',
)
metrics_df

#Parameters: 280


Unnamed: 0,accuracy,precision,recall,f1_score,balanced_accuracy,roc_auc
0,0.819347,0.793731,0.964966,0.871012,0.767066,0.793589


#### `hidden_dim=5`

In [6]:
# Validation metrics for the checkpoint trained with a 5-unit hidden layer.
metrics_df = evaluate_model(
    x_arr=X_validation_minmax_scaled,
    y_arr=y_validation,
    hidden_dim=5,
    weights_path='runs/resampledFalse-bs64-lr0.005-hidden_dim5/model_weights_10.pth',
)
metrics_df

#Parameters: 1406


Unnamed: 0,accuracy,precision,recall,f1_score,balanced_accuracy,roc_auc
0,0.819347,0.795224,0.961893,0.870654,0.768169,0.794745


## Summary

The best model achieves a validation ROC-AUC of 0.794745 by adding a hidden layer with 5 units, a marginal improvement over the logistic-regression-like model with no hidden layer (0.793589). Resampling was not used, as it led to score degradation. However, both training and validation losses remain relatively high at approximately 0.46.