In [2]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import torch.optim as optim
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from random import randint
import os

# own Modules 
from data_preperator import DataPreperator
from nn_model import NeuralNetwork
from trainer import Trainer
from logger import Logger

__Aus zeitlichen Gründen habe ich hier kein Cross Validation implementiert, sondern nur ein Training Datensatz und ein Test Datensatz.__

## Parameters

In [3]:
param = {
    "data" : {
        "path" : '../../data/reduced_dataset.csv' ,
    },
    "model" : {
        "input_size" : 12,
        "batch_size" : 8,
        "dropout_rate_fc": 0.2,
        "units_fc1": 40,
        "units_fc2": 20
        "units_fc3": 10,
        "units_fc4": 5,
    },
    "cycling_lr" : {
        "scheduler_active" : True, 
        # Mode can be one of {triangular, triangular2, exp_range}
        "mode" : "triangular", 
        "gamma" : 0.9995,
        "base_lr" : 0.001, 
        "max_lr" :0.05
    },
    "training": {
        "n_epochs" : 50,
        "patience" : 10,
    },
    "filed_location": {
        "trained_model" : "./models/trained_model/",
        "log_file" : "./models/log/",
    },
}

## Preprocessing

In [89]:
data_set = pd.read_csv("../../data/reduced_dataset.csv")
data_set.shape

(421096, 76)

In [90]:
data_set = data_set.dropna()
data_set.shape

(326529, 76)

### Create Target Variable and Delet Samples with Status "Current"
Bei allen Loans mit dem Status Current ist nicht klar, ob diese zu einem guten oder schlechten Loan entwicklen. Deshalb lösche ich diese Samples.

In [91]:
def create_target_variable(attribute):
    if attribute == 'Fully Paid' or attribute == 'In Grace Period':
        return 0
    elif attribute == 'Current':
        return -1
    else:
        return 1
data_set['target'] = data_set['loan_status'].apply(create_target_variable)
data_set = data_set[data_set['target'] != -1]
data_set.drop(labels='loan_status', axis=1, inplace=True)
data_set.shape

(166731, 76)

### Reduce Purpose Feature

In [92]:
def new_purpose_attributes(attribute):
    if attribute=='debt_consolidation' or attribute=='credit_card':
        return 'refinance'
    elif attribute=='house' or attribute=='home_improvement' or attribute=='moving':
        return 'house'
    elif attribute=='car' or attribute=='major_purchase':
        return 'major_purchase'
    else:
        return 'other'
data_set['purpose'] = data_set['purpose'].apply(new_purpose_attributes)

### Drop Grade Feature
Ich schließe die Noten des Lending Clubs  aus. Sie werden wahrscheinlich mit Hilfe eines Clustering-Algorithmus generiert und würde einen künstlichen Einblick in die Kreditnehmer gewähren (Information Leakage) 

In [93]:
data_set.drop(labels='grade', axis=1, inplace=True)
data_set.shape

(166731, 75)

### Zero-Hot Encoding for Categorical Features

In [95]:
data_set = pd.get_dummies(data_set)
data_set.shape

In [4]:
train_loader = DataPreperator(dataset=data_set, 
                              ignored_features=[],
                              stake_training_data=0.8,
                              features_not_to_scale=[])
train_data, validation_data = train_loader.prepare_data()

23611


## Initialize Dataset 

## Initialize DataLoader

In [7]:
data_loader_training = DataLoader(train_data, 
                                  batch_size=param["model"]["batch_size"], 
                                  num_workers=0, 
                                  shuffle=True, 
                                  drop_last=True
                                 )
data_loader_validation = DataLoader(validation_data, 
                                  batch_size=param["model"]["batch_size"], 
                                  num_workers=0, 
                                  shuffle=True, 
                                  drop_last=True
                                 )

In [8]:
for batch_idx, data in enumerate(data_loader_training):
    x,y = data
    print('Data of batch: {}'.format(batch_idx))
    print("Size of input data: {}".format(x.size()))
    print("Size of target data: {}".format(y.size()))
    if batch_idx >=1: break

Data of batch: 0
Size of input data: torch.Size([8, 25, 2])
Size of target data: torch.Size([8, 2])
Data of batch: 1
Size of input data: torch.Size([8, 25, 2])
Size of target data: torch.Size([8, 2])


## Initialize Neural Network

In [21]:
torch.manual_seed(0)
model = NeuralNetwork(batch_size=param['model']['batch_size'], 
                      input_dim=param['model']['input_size'], 
                      units_fc1=param['model']['units_fc1'], 
                      units_fc2=param['model']['units_fc1'], 
                      units_fc3=param['model']['units_fc1'], 
                      units_fc4=param['model']['units_fc1'], 
                      dropout_rate_fc= param['model']['dropout_rate_fc'],
                     )

## Define MSE Loss function as torch module

In [22]:
criterion = nn.CrossEntropyLoss()

## Initialize Optimizer and Cyclic Learning Rate Scheduler

In [23]:
optimizer = torch.optim.SGD(model.parameters(), lr=1.)  
scheduler = torch.optim.lr_scheduler.CyclicLR(optimizer=optimizer, 
                                              base_lr=param['cycling_lr']['base_lr'], 
                                              max_lr=param['cycling_lr']['max_lr'], 
                                              step_size_up=(len(train_data)/param['model']['batch_size'])*2, # Authors of Cyclic LR suggest setting step_size 2-8 x training iterations in epoch.
                                              mode=param['cycling_lr']['mode'],
                                              gamma=param['cycling_lr']['gamma']
                                             )

## Initialize Trainer

In [24]:
trainer = Trainer(model=model,
                  optimizer=optimizer,
                  scheduler=scheduler,
                  scheduler_active = param["cycling_lr"]["scheduler_active"],
                  criterion=criterion, 
                  location_model=param["filed_location"]["trained_model"], 
                  patience=param['training']['patience']
                 )

## Training Loop

In [25]:
# Initialise Logger
session_id = str(randint(10000, 99999))
logger = Logger(param["filed_location"]["log_file"], session_id)

# Log model architecture and training configuration
logger.log_message("Architecture and Training configuration:")
logger.log_message("Batch size: {}".format(param['model']['batch_size']))
logger.log_message("Input size: {}".format(param['model']['input_size']))

logger.log_message("Dropout rate fc NN: {}".format(param['model']['dropout_rate_fc']))
logger.log_message("Cycling LR mode: {}".format(param['cycling_lr']['mode']))
logger.log_message("Cycling LR base LR: {}".format(param['cycling_lr']['base_lr']))
logger.log_message("Cycling LR max LR: {}".format(param['cycling_lr']['max_lr']))
logger.log_message("- -"*20)

print("Training phase is started")
logger.log_message("Training phase is started")
torch.manual_seed(0)

for epoch in range(param['training']['n_epochs']):
    # Train with batches 
    mean_epoch_training_loss = trainer.train(data_loader_training)
    mean_epoch_validation_loss = trainer.evaluate(data_loader_training)

    # Save model if its the best one since the last change in configuration of hyperparameters
    status_ok = trainer.save_model(epoch, mean_epoch_validation_loss, session_id)
    
    # Log information of current epoch
    logger.log_current_statistics(epoch, mean_epoch_training_loss)
    
    if not status_ok:
        break

print("Training phase is finished")
logger.log_message("Training phase is finished")

Training phase is started
Epoch 0: best model saved with loss: 0.16020687000072864


KeyboardInterrupt: 