In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, random_split
import tqdm
import numpy as np
from sklearn.metrics import roc_auc_score, confusion_matrix

In [2]:
# Run on the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Device: '{device}'")

Device: 'cuda'


In [3]:
# Load the pre-split feature and label tensors from disk.
# weights_only=True restricts unpickling to tensors and plain containers, so a
# tampered .pt file cannot execute arbitrary code on load. Newer PyTorch
# releases also warn when the flag is left unset.
x_train = torch.load("ds/mlp20200300x_train.pt", weights_only=True)
x_test = torch.load("ds/mlp20200300x_test.pt", weights_only=True)
y_train = torch.load("ds/mlp20200300y_train.pt", weights_only=True)
y_test = torch.load("ds/mlp20200300y_test.pt", weights_only=True)

# Sanity check: features are expected to be 2-D and labels 1-D with matching
# first dimensions.
print("x_train shape:", x_train.shape)
print("x_test shape:", x_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

  x_train = torch.load("ds/mlp20200300x_train.pt")
  x_test = torch.load("ds/mlp20200300x_test.pt")


x_train shape: torch.Size([3203952, 76])
x_test shape: torch.Size([1722152, 76])
y_train shape: torch.Size([3203952])
y_test shape: torch.Size([1722152])


  y_train = torch.load("ds/mlp20200300y_train.pt")
  y_test = torch.load("ds/mlp20200300y_test.pt")


In [4]:
# Number of features per sample; this becomes the width of the MLP's first layer.
input_size = x_train.size(1)

In [5]:
# Pair every feature tensor with its label tensor so that indexing a dataset
# yields an (inputs, target) tuple.
train_dataset, test_dataset = (
    TensorDataset(features, labels)
    for features, labels in ((x_train, y_train), (x_test, y_test))
)

In [6]:
# Hold out 20% of the training tensors for validation.
# The split is built directly from x_train / y_train rather than from the
# `train_dataset` name this cell rebinds, so re-running the cell does not try
# to split an already-split subset (which fails the length check).
# A seeded generator keeps the partition identical across kernel restarts.
SPLIT_SEED = 42
train_size = int(0.8 * x_train.shape[0])
val_size = x_train.shape[0] - train_size
train_dataset, val_dataset = random_split(
    TensorDataset(x_train, y_train),
    [train_size, val_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

In [7]:
# Mini-batch iterators for the three splits. Only the training split is
# shuffled; validation and test batches are served in stored order.
BATCH_SIZE = 32
train_loader = torch.utils.data.DataLoader(train_dataset, shuffle=True, batch_size=BATCH_SIZE)
val_loader, test_loader = (
    torch.utils.data.DataLoader(split, batch_size=BATCH_SIZE)
    for split in (val_dataset, test_dataset)
)

In [8]:
class MLP(nn.Module):
    """Fully connected network: three 128-unit ReLU layers and a 1-unit head.

    The final layer has no activation, so ``forward`` returns raw logits of
    shape ``(batch, 1)``.
    """

    def __init__(self, input_size: int):
        """Build the layer stack for inputs with ``input_size`` features."""
        super().__init__()
        width = 128
        # Layers are created in network order so parameter initialisation
        # consumes the RNG in the same sequence as the stack is laid out.
        stack = [nn.Linear(input_size, width), nn.ReLU()]
        for _ in range(2):
            stack.extend([nn.Linear(width, width), nn.ReLU()])
        stack.append(nn.Linear(width, 1))
        self.layers = nn.Sequential(*stack)

    def forward(self, x):
        """Return the logits produced by passing ``x`` through the stack."""
        logits = self.layers(x)
        return logits

In [9]:
def _counts_at_threshold(probs, is_positive, threshold):
    """Return ``(tp, fp, fn, tn)`` for the predictions ``probs > threshold``."""
    predicted = probs > threshold
    tp = int(np.count_nonzero(predicted & is_positive))
    fp = int(np.count_nonzero(predicted)) - tp
    fn = int(np.count_nonzero(is_positive)) - tp
    tn = int(probs.size) - tp - fp - fn
    return tp, fp, fn, tn


def train(model, train_loader, val_loader, optimizer, criterion, device, num_epochs):
    """Train ``model`` and tune a decision threshold on the validation data.

    Each epoch runs one optimisation pass over ``train_loader``, then scores
    ``val_loader``, sweeps thresholds from 0.20 to 0.80 in steps of 0.01 to
    maximise F1, and prints the validation metrics at the winning threshold.

    Args:
        model: Module whose output has shape ``(batch, 1)``; the outputs are
            treated as logits (a sigmoid is applied for the metrics).
        train_loader: Sized iterable of ``(inputs, targets)`` training batches.
        val_loader: Sized iterable of ``(inputs, targets)`` validation batches.
        optimizer: Optimizer updating ``model``'s parameters.
        criterion: Loss called as ``criterion(outputs, targets)``.
        device: Device the batches are moved to.
        num_epochs: Number of passes over ``train_loader``.

    Returns:
        float: The F1-optimal threshold found in the LAST epoch, or ``0.0``
        when no threshold achieved a positive F1 (or ``num_epochs`` is 0).

    Note:
        Positives are identified with ``label == 1`` -- assumes binary 0/1
        targets; confirm against the dataset.
    """
    best_threshold = 0.0

    for epoch in range(num_epochs):
        print(f"Epoch {epoch + 1}/{num_epochs}")
        model.train()  # Set model to training mode
        train_loss = 0.0
        for inputs, targets in tqdm.tqdm(train_loader):
            inputs = inputs.to(device)
            targets = targets.to(device)

            optimizer.zero_grad()  # Zero the gradients
            outputs = model(inputs).squeeze(1)
            loss = criterion(outputs, targets)
            loss.backward()  # Backpropagate the loss
            optimizer.step()  # Update the weights

            train_loss += loss.item()

        train_loss /= len(train_loader)
        print("Train loss:", train_loss)

        # Validation
        model.eval()
        val_loss = 0.0
        label_batches = []
        prob_batches = []  # Probabilities are kept for ROC-AUC and the sweep
        print("Validating...")
        with torch.no_grad():
            for inputs, targets in tqdm.tqdm(val_loader):
                inputs = inputs.to(device)
                targets = targets.to(device)

                outputs = model(inputs).squeeze(1)
                loss = criterion(outputs, targets)
                val_loss += loss.item()

                # Collect whole batches; extending a Python list element by
                # element is far slower and yields a list, not an array.
                prob_batches.append(torch.sigmoid(outputs).cpu().numpy().ravel())
                label_batches.append(targets.cpu().numpy().ravel())

        val_loss /= len(val_loader)

        # Convert once to NumPy arrays so every threshold comparison below is
        # a cheap vectorised operation on the same buffers.
        all_probs = np.concatenate(prob_batches)
        all_labels = np.concatenate(label_batches)
        is_positive = all_labels == 1

        # Find threshold for predictions
        print("Looking for threshold")
        best_threshold = 0.0
        best_f1 = 0.0
        # .tolist() yields plain Python floats, so the sweep, the final
        # metrics and the returned value all compare against the same number.
        for threshold in tqdm.tqdm(np.arange(0.2, 0.81, 0.01).tolist()):
            tp, fp, fn, _ = _counts_at_threshold(all_probs, is_positive, threshold)
            precision = 0 if tp == 0 else tp / (tp + fp)
            recall = 0 if tp == 0 else tp / (tp + fn)
            f1 = 0 if precision * recall == 0 else 2 * precision * recall / (precision + recall)
            if f1 > best_f1:
                best_threshold = threshold
                best_f1 = f1
        print(f"Best threshold: {best_threshold}")

        # Counting directly (instead of indexing a confusion matrix) stays
        # valid when only one class occurs in the labels or the predictions.
        tp, fp, fn, tn = _counts_at_threshold(all_probs, is_positive, best_threshold)

        total = tp + fp + fn + tn
        accuracy = (tp + tn) / total if total > 0 else 0.0
        precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0
        try:
            roc_auc = roc_auc_score(all_labels, all_probs)
        except ValueError as err:
            # ROC-AUC is undefined for single-class labels; report NaN rather
            # than aborting the run or inventing a score.
            print(f"ROC-AUC is undefined: {err}")
            roc_auc = float("nan")

        print(f"Validation Metrics - Epoch {epoch+1}/{num_epochs}:")
        print(f"Loss      :{val_loss:.4f}")
        print(f"Accuracy:  {accuracy:.4f}")
        print(f"Precision: {precision:.4f}")
        print(f"Recall:    {recall:.4f}")
        print(f"F1-score:  {f1:.4f}")
        print(f"ROC-AUC:   {roc_auc:.4f}")
        print(f"Confusion Matrix:\n{tp} {fn}\n{fp} {tn}")

    return best_threshold


In [10]:
def test(model, test_loader, device, criterion, best_threshold):
    model.eval()
    test_loss = 0.0
    all_labels = []
    all_preds = []
    all_probs = []
    with torch.no_grad():
        for inputs, targets in tqdm.tqdm(test_loader):
            inputs = inputs.to(device)
            targets = targets.to(device)

            outputs = model(inputs).squeeze(1)
            loss = criterion(outputs, targets)  # Use criterion here
            test_loss += loss.item()

            probs = torch.sigmoid(outputs).cpu().numpy()  # Apply sigmoid if needed
            preds = (probs > best_threshold).astype(int)  # Convert probabilities to predictions
            labels = targets.cpu().numpy()

            all_labels.extend(labels)
            all_preds.extend(preds.flatten())
            all_probs.extend(probs.flatten())

    test_loss /= len(test_loader)

    cm = confusion_matrix(all_labels, all_preds)
    tp = cm[1, 1]
    fp = cm[0, 1]
    fn = cm[1, 0]
    tn = cm[0, 0]

    accuracy = (tp + tn) / (tp + fp + fn + tn) if (tp + fp + fn + tn) > 0 else 0.0
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0
    try:
      roc_auc = roc_auc_score(all_labels, all_probs)
    except ValueError:
        roc_auc = 0.0

    print(f"Test Metrics:")
    print(f"Accuracy:  {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall:    {recall:.4f}")
    print(f"F1-score:  {f1:.4f}")
    print(f"ROC-AUC:   {roc_auc:.4f}")
    print(f"Confusion Matrix:\n{tp} {fn}\n{fp} {tn}")
    print(f"Test Loss: {test_loss:.4f}") # Print the loss as well


In [11]:
# Seed PyTorch's global RNG before the model is built, so the weight
# initialisation and the shuffled batch order start from the same state on
# every Restart & Run All.
TRAIN_SEED = 0
NUM_EPOCHS = 3
LEARNING_RATE = 1e-3
torch.manual_seed(TRAIN_SEED)

model = MLP(input_size).to(device)
# The model emits raw logits, so the sigmoid is folded into the loss.
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

# train() returns the decision threshold tuned on the validation split during
# the final epoch; it is reused when scoring the test set.
best_threshold = train(
    model,
    train_loader,
    val_loader,
    optimizer,
    criterion,
    device,
    num_epochs=NUM_EPOCHS,
)

Epoch 1/3


100%|██████████| 80099/80099 [01:57<00:00, 682.06it/s]


Train loss: 0.4572873471505017
Validating...


100%|██████████| 20025/20025 [00:11<00:00, 1723.78it/s]


Looking for threshold


100%|██████████| 62/62 [01:25<00:00,  1.38s/it]


Best threshold: 0.4300000000000002
Validation Metrics - Epoch 1/3:
Loss      :0.4171
Accuracy:  0.7771
Precision: 0.7185
Recall:    0.9102
F1-score:  0.8031
ROC-AUC:   0.8827
Confusion Matrix:
291208 28713
114089 206781
Epoch 2/3


100%|██████████| 80099/80099 [01:58<00:00, 678.64it/s]


Train loss: 0.36714490110177606
Validating...


100%|██████████| 20025/20025 [00:11<00:00, 1712.51it/s]


Looking for threshold


100%|██████████| 62/62 [01:25<00:00,  1.38s/it]


Best threshold: 0.38000000000000017
Validation Metrics - Epoch 2/3:
Loss      :0.4351
Accuracy:  0.8351
Precision: 0.8059
Recall:    0.8821
F1-score:  0.8423
ROC-AUC:   0.9242
Confusion Matrix:
282206 37715
67982 252888
Epoch 3/3


100%|██████████| 80099/80099 [01:59<00:00, 667.69it/s]


Train loss: 0.33788825660973004
Validating...


100%|██████████| 20025/20025 [00:11<00:00, 1713.90it/s]


Looking for threshold


100%|██████████| 62/62 [01:24<00:00,  1.37s/it]


Best threshold: 0.4300000000000002
Validation Metrics - Epoch 3/3:
Loss      :0.3245
Accuracy:  0.8565
Precision: 0.8493
Recall:    0.8663
F1-score:  0.8577
ROC-AUC:   0.9345
Confusion Matrix:
277144 42777
49161 271709


0.4300000000000002

In [13]:
# Score the held-out test set with the threshold tuned on validation data.
test(
    model=model,
    test_loader=test_loader,
    device=device,
    criterion=criterion,
    best_threshold=best_threshold,
)

100%|██████████| 53818/53818 [00:32<00:00, 1672.74it/s]


Test Metrics:
Accuracy:  0.7078
Precision: 0.6996
Recall:    0.7282
F1-score:  0.7136
ROC-AUC:   0.7581
Confusion Matrix:
626994 234082
269216 591860
Test Loss: 7.1332
