In [107]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
import torch.optim as optim
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from random import randint
import os
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score, confusion_matrix, roc_auc_score, recall_score, precision_score

# own Modules 
from nn_model import NeuralNetwork
from data_set import DataSetLoans
from trainer import Trainer
from logger import Logger

## Parameters

In [143]:
param = {
    "data" : {
        "path" : '../../data/reduced_dataset.csv' ,
    },
    "model" : {
        "input_size" : 125,
        "batch_size" : 8,
        "dropout_rate_fc": 0.2,
        "units_fc1": 60,
        "units_fc2": 30,
        "units_fc3": 5,
    },
    "cycling_lr" : {
        "scheduler_active" : True, 
        # Mode can be one of {triangular, triangular2, exp_range}
        "mode" : "triangular", 
        "gamma" : 0.9995,
        "base_lr" : 0.001, 
        "max_lr" :0.05
    },
    "training": {
        "n_epochs" : 20, # Mit mehr Rechenpower sollte die Anazahl an epochen deutlich höher sein!
        "patience" : 10,
    },
    "filed_location": {
        "trained_model" : "./models/trained_model/",
        "log_file" : "./models/log/",
    },
}

# Preprocessing

In [144]:
data_set = pd.read_csv("../../data/reduced_dataset.csv")
data_set.shape

(421096, 76)

In [145]:
data_set = data_set.dropna()
data_set.shape

(326529, 76)

### Create Target Variable and Delet Samples with Status "Current"
Bei allen Loans mit dem Status Current ist nicht klar, ob diese zu einem guten oder schlechten Loan entwicklen. Deshalb lösche ich diese Samples.

In [146]:
def create_target_variable(attribute):
    if attribute == 'Fully Paid' or attribute == 'In Grace Period':
        return 0
    elif attribute == 'Current':
        return -1
    else:
        return 1
data_set['target'] = data_set['loan_status'].apply(create_target_variable)
data_set = data_set[data_set['target'] != -1]
data_set.drop(labels='loan_status', axis=1, inplace=True)
data_set.shape

(166731, 76)

### Reduce Purpose Feature

In [147]:
def new_purpose_attributes(attribute):
    if attribute=='debt_consolidation' or attribute=='credit_card':
        return 'refinance'
    elif attribute=='house' or attribute=='home_improvement' or attribute=='moving':
        return 'house'
    elif attribute=='car' or attribute=='major_purchase':
        return 'major_purchase'
    else:
        return 'other'
data_set['purpose'] = data_set['purpose'].apply(new_purpose_attributes)

### Drop Grade Feature
Ich schließe die Noten des Lending Clubs  aus. Sie werden wahrscheinlich mit Hilfe eines Clustering-Algorithmus generiert und würde einen künstlichen Einblick in die Kreditnehmer gewähren (Information Leakage) 

In [148]:
data_set.drop(labels='grade', axis=1, inplace=True)
data_set.shape

(166731, 75)

### Create Train Data Set and Target Variable

In [149]:
y = data_set['target']
X = data_set.drop(['target'], axis=1)

### Zero-Hot Encoding

In [150]:
X = pd.get_dummies(X).to_numpy()
y = pd.get_dummies(y)

In [151]:
y

Unnamed: 0,0,1
4,1,0
5,0,1
7,1,0
9,1,0
10,1,0
...,...,...
421088,0,1
421089,1,0
421092,0,1
421093,0,1


Good Loans are represendet by the first column, bad loands by the second column

In [152]:
y = y.to_numpy()

### Create Test and Train Data Set

In [153]:
X_train_total, X_test, y_train_total, y_test = train_test_split(X, y, test_size=0.3, random_state=1, stratify=y)

### Split Train Data Set in Train and Validation Set

In [154]:
X_train, X_val, y_train, y_val = train_test_split(X_train_total, y_train_total, test_size=0.2, random_state=1, stratify=y_train_total)

### Standardize Features 

In [155]:
# Fit StandardScaler
scaler = StandardScaler()
scaler.fit(X_train)
# Transform data
train_scaled = scaler.transform(X_train)
val_scaled = scaler.transform(X_val)
test_scaled = scaler.transform(X_test)

# Prepare Training
### Create Data Set

In [156]:
dataset_train = DataSetLoans(train_scaled, y_train)
dataset_val = DataSetLoans(val_scaled, y_val)

### Initialize DataLoader

In [157]:
data_loader_training = DataLoader(dataset_train, 
                                  batch_size=param["model"]["batch_size"], 
                                  num_workers=0, 
                                  shuffle=True, 
                                  drop_last=True
                                 )
data_loader_val = DataLoader(dataset_val, 
                             batch_size=param["model"]["batch_size"], 
                             num_workers=0, 
                             shuffle=True, 
                             drop_last=True
                             )

In [158]:
for batch_idx, data in enumerate(data_loader_training):
    x,y = data
    print('Data of batch: {}'.format(batch_idx))
    print("Size of input data: {}".format(x.size()))
    print("Size of target data: {}".format(y.size()))
    if batch_idx >=1: break

Data of batch: 0
Size of input data: torch.Size([8, 125])
Size of target data: torch.Size([8, 2])
Data of batch: 1
Size of input data: torch.Size([8, 125])
Size of target data: torch.Size([8, 2])


### Initialize Neural Network

In [159]:
torch.manual_seed(0)
model = NeuralNetwork(batch_size=param['model']['batch_size'], 
                      input_dim=param['model']['input_size'], 
                      units_fc1=param['model']['units_fc1'], 
                      units_fc2=param['model']['units_fc1'], 
                      units_fc3=param['model']['units_fc1'], 
                      dropout_rate_fc= param['model']['dropout_rate_fc'],
                     )

### Define Binary Cross Entropy Loss function

In [160]:
criterion = nn.BCELoss()

### Initialize Optimizer and Cyclic Learning Rate Scheduler

In [161]:
optimizer = torch.optim.SGD(model.parameters(), lr=1.)  
scheduler = torch.optim.lr_scheduler.CyclicLR(optimizer=optimizer, 
                                              base_lr=param['cycling_lr']['base_lr'], 
                                              max_lr=param['cycling_lr']['max_lr'], 
                                              step_size_up=(len(X_train)/param['model']['batch_size'])*2, # Authors of Cyclic LR suggest setting step_size 2-8 x training iterations in epoch.
                                              mode=param['cycling_lr']['mode'],
                                              gamma=param['cycling_lr']['gamma']
                                             )

### Initialize Trainer

In [162]:
trainer = Trainer(model=model,
                  optimizer=optimizer,
                  scheduler=scheduler,
                  scheduler_active = param["cycling_lr"]["scheduler_active"],
                  criterion=criterion, 
                  location_model=param["filed_location"]["trained_model"], 
                  patience=param['training']['patience']
                 )

# Start Training with Validation

In [163]:
# Initialise Logger
session_id = str(randint(10000, 99999))
logger = Logger(param["filed_location"]["log_file"], session_id)

# Log model architecture and training configuration
logger.log_message("Architecture and Training configuration:")
logger.log_message("Batch size: {}".format(param['model']['batch_size']))
logger.log_message("Input size: {}".format(param['model']['input_size']))
logger.log_message("Number of Layers: {}".format(3))
logger.log_message("Units Layer 1: {}".format(param['model']['units_fc1']))
logger.log_message("Units Layer 2: {}".format(param['model']['units_fc2']))
logger.log_message("Units Layer 3: {}".format(param['model']['units_fc3']))

logger.log_message("Dropout rate fc NN: {}".format(param['model']['dropout_rate_fc']))
logger.log_message("Cycling LR mode: {}".format(param['cycling_lr']['mode']))
logger.log_message("Cycling LR base LR: {}".format(param['cycling_lr']['base_lr']))
logger.log_message("Cycling LR max LR: {}".format(param['cycling_lr']['max_lr']))
logger.log_message("- -"*20)

print("Training phase is started")
logger.log_message("Training phase is started")
torch.manual_seed(0)

for epoch in range(param['training']['n_epochs']):
    # Train with batches 
    mean_epoch_training_loss = trainer.train(data_loader_training)
    mean_epoch_test_loss = trainer.evaluate(data_loader_val)

    # Save model if its the best one since the last change in configuration of hyperparameters
    status_ok = trainer.save_model(epoch, mean_epoch_test_loss, session_id)
    
    # Log information of current epoch
    logger.log_current_statistics(epoch, mean_epoch_test_loss)
    
    if not status_ok:
        break

print("Training phase is finished")
logger.log_message("Training phase is finished")

Training phase is started
Epoch 0: best model saved with loss: 0.0646147196813122
Epoch 2: best model saved with loss: 0.06451253936447106
Epoch 3: best model saved with loss: 0.06135131168531977
Epoch 4: best model saved with loss: 0.06075552679137766
Epoch 6: best model saved with loss: 0.060310679740110576
Epoch 7: best model saved with loss: 0.059257919152598044
Epoch 8: best model saved with loss: 0.058897964040577594
Epoch 9: best model saved with loss: 0.05875155231882986
Epoch 10: best model saved with loss: 0.05852654331776725
Epoch 11: best model saved with loss: 0.05784867741711423
Epoch 12: best model saved with loss: 0.057488227890817395
Epoch 13: best model saved with loss: 0.05747834583686379
Epoch 15: best model saved with loss: 0.057189733104195985
Epoch 16: best model saved with loss: 0.05691413197582714
Epoch 17: best model saved with loss: 0.05688596595484936
Epoch 18: best model saved with loss: 0.0568499493152024
Epoch 19: best model saved with loss: 0.05658452352

# Predictions from Test Set
Das Modell mit folgender Konfiguration erzielte den geringsten Loss: <br>
Number of Layers: 3 <br>
Units Layer 1: 60 <br>
Units Layer 2: 30 <br>
Units Layer 3: 5 <br>

Daher wird hier das Modell mit der ID 77463 geladen
### Load trained model

In [164]:
checkpoint = torch.load(param["filed_location"]["trained_model"]+"id"+str(77463))
model.load_state_dict(checkpoint['model_state_dict'])

<All keys matched successfully>

### Create Data Set

In [165]:
dataset_test = DataSetLoans(test_scaled, y_test)

### Initialize DataLoader

In [166]:
data_loader_test = DataLoader(dataset_test, 
                              batch_size=param["model"]["batch_size"], 
                              num_workers=0, 
                              shuffle=False, 
                              drop_last=True
                             )

### Predict

In [167]:
y_hat = []
for batch_number, data in enumerate(data_loader_test):
    with torch.no_grad():
        input_data, target_data = data
        model.eval()
        output_mini_batch = model(input_data)
        for prediction_single_sample in output_mini_batch:
            if prediction_single_sample[0] > prediction_single_sample[1]:
                y_hat.append(0)
            else:
                y_hat.append(1)

An dieser Stelle könnte man noch mit verschiedenen Thresholds arbeiten, ab der die Prediction einer gewissen Klasse zugeordnet wird

In [168]:
# Revert Zero Hot Encoding
y_test = [0 if i[0]==1 else 1 for i in y_test]

# Evaluation Results

In [169]:
# because of the mini-batches, 4 elements
y_test = y_test[:len(y_test)-4]
# Evaluate Test Set
print("The F1 Score is: {}".format(f1_score(y_test, y_hat, average="binary")))
print("The Precision Score is: {}".format(precision_score(y_test, y_hat, average="binary")))
print("The Recall Score is: {}".format(recall_score(y_test, y_hat, average="binary")))
tn, fp, fn, tp = confusion_matrix(y_test, y_hat).ravel()
print("TN: {}".format(tn))
print("FP: {}".format(fp))
print("FN: {}".format(fn))
print("TP: {}".format(tp))

The F1 Score is: 0.952204395308076
The Precision Score is: 0.922720146328717
The Recall Score is: 0.9836350974930362
TN: 34473
FP: 1183
FN: 235
TP: 14125
