In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset, random_split
import numpy as np
import pandas as pd 
import random 
from sklearn.model_selection import train_test_split

random.seed(42) #in case we will use random somewhere

data = pd.read_csv("../data/processed/processed_credit_risk_dataset.csv")

In [2]:
from sklearn.preprocessing import StandardScaler

y = data['loan_status'].values
X = data.drop(columns=['loan_status']).values  # Converting data to NumPy array

scaler = StandardScaler()
X = scaler.fit_transform(X)

X = torch.tensor(X, dtype=torch.float32) # Converting to PyTorch tensors
y = torch.tensor(y, dtype=torch.float32)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [3]:
# Convert into PyTorch datasets
train_data = TensorDataset(X_train, y_train)
test_data = TensorDataset(X_test, y_test)

# Create data loaders
train_loader = DataLoader(train_data, batch_size=32, shuffle=True)
test_loader = DataLoader(test_data, batch_size=32, shuffle=False)

In [4]:
class CreditRiskModel(nn.Module):
    def __init__(self):
        super(CreditRiskModel, self).__init__()
        self.fc1 = nn.Linear(X.shape[1], 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 32)
        self.fc4 = nn.Linear(32, 1)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.relu(self.fc3(x))
        x = self.sigmoid(self.fc4(x))
        return x

In [5]:
model = CreditRiskModel()
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [6]:
num_epochs = 100

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        outputs = model(X_batch).squeeze()
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()
        running_loss += loss.item() * X_batch.size(0)
    
    epoch_loss = running_loss / len(train_loader.dataset)
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss:.4f}')

Epoch 1/100, Loss: 0.3273
Epoch 2/100, Loss: 0.2836
Epoch 3/100, Loss: 0.2737
Epoch 4/100, Loss: 0.2669
Epoch 5/100, Loss: 0.2622
Epoch 6/100, Loss: 0.2591
Epoch 7/100, Loss: 0.2565
Epoch 8/100, Loss: 0.2537
Epoch 9/100, Loss: 0.2523
Epoch 10/100, Loss: 0.2512
Epoch 11/100, Loss: 0.2497
Epoch 12/100, Loss: 0.2480
Epoch 13/100, Loss: 0.2467
Epoch 14/100, Loss: 0.2445
Epoch 15/100, Loss: 0.2443
Epoch 16/100, Loss: 0.2429
Epoch 17/100, Loss: 0.2413
Epoch 18/100, Loss: 0.2396
Epoch 19/100, Loss: 0.2396
Epoch 20/100, Loss: 0.2382
Epoch 21/100, Loss: 0.2372
Epoch 22/100, Loss: 0.2348
Epoch 23/100, Loss: 0.2343
Epoch 24/100, Loss: 0.2327
Epoch 25/100, Loss: 0.2322
Epoch 26/100, Loss: 0.2312
Epoch 27/100, Loss: 0.2287
Epoch 28/100, Loss: 0.2277
Epoch 29/100, Loss: 0.2265
Epoch 30/100, Loss: 0.2249
Epoch 31/100, Loss: 0.2228
Epoch 32/100, Loss: 0.2217
Epoch 33/100, Loss: 0.2204
Epoch 34/100, Loss: 0.2192
Epoch 35/100, Loss: 0.2183
Epoch 36/100, Loss: 0.2183
Epoch 37/100, Loss: 0.2148
Epoch 38/1

In [7]:
model.eval()
y_pred = []

with torch.no_grad():
    for X_batch, _ in test_loader:
        outputs = model(X_batch).squeeze()
        y_pred.append(outputs)

y_pred = torch.cat(y_pred).numpy()
y_pred = (y_pred >= 0.5).astype(int)

In [8]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Compute metrics
print("Accuracy: ", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix: \n", confusion_matrix(y_test, y_pred))
print("\nClassification Report: \n", classification_report(y_test, y_pred))

Accuracy:  0.893355838576032

Confusion Matrix: 
 [[4851  221]
 [ 474  971]]

Classification Report: 
               precision    recall  f1-score   support

         0.0       0.91      0.96      0.93      5072
         1.0       0.81      0.67      0.74      1445

    accuracy                           0.89      6517
   macro avg       0.86      0.81      0.83      6517
weighted avg       0.89      0.89      0.89      6517



In [None]:
# The results from PyTorch model show a significant improvement over the Keras model you previously trained
# Accuracy indicates that the model correctly predicts the loan status about 89% of the time, which is a good overall performance.
# F1 Score is good for 0 class 0.93, but worse for 1 class 0.78. But it still better than Keras model.
# Class 0 is predicted with higher accuracy, precision, and recall, indicating the model is more confident in identifying non-default cases.
# Class 1 has lower recall, meaning the model misses a portion of actual defaults, which is a common issue in imbalanced datasets like this.

In [9]:
torch.save(model.state_dict(), '../models/credit_risk_model.pth')