In [1]:
import pickle

import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, StratifiedKFold

from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier, RandomForestClassifier, VotingClassifier
from sklearn.metrics import classification_report

In [2]:
df = pd.read_excel("loan_data_final.xlsx")
# df = pd.read_excel("loan_data_filtered_important.xlsx")

In [3]:
X = df.drop(['loan_status'], axis = 1)
y = df['loan_status']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)
print (X_train.shape, X_test.shape, y_train.shape, y_test.shape)

cols = X_train.columns

(17789, 42) (7625, 42) (17789,) (7625,)


In [4]:
scaler = StandardScaler()

X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

X_train = pd.DataFrame(X_train, columns=[cols])
X_test = pd.DataFrame(X_test, columns=[cols])

## Read-in pre-trained model

In [5]:
# Assume we have finish training the model

train = False

if train == True:
    # Initialze model here and train the model
    pass

else:
    with open('saved_model/rf.sav', 'rb') as f:
        rf = pickle.load(f)
        f.close()
    with open('saved_model/lg.sav', 'rb') as f:
        lg = pickle.load(f)
        f.close()
    with open('saved_model/ADA.sav', 'rb') as f:
        ADA = pickle.load(f)
        f.close()

# score = ADA.score(X_test, y_test)
# print ("Test score: {0:.2f} %".format(100 * score))
print (classification_report(y_test, ADA.predict(X_test)))

              precision    recall  f1-score   support

           0       0.87      0.57      0.69      1124
           1       0.93      0.99      0.96      6501

    accuracy                           0.92      7625
   macro avg       0.90      0.78      0.82      7625
weighted avg       0.92      0.92      0.92      7625



https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations


## Visualize Data

In [6]:
# import plotly.express as px
# from sklearn.decomposition import PCA

# X = df.drop(['loan_status'], axis = 1)
# y = df['loan_status']

# pca = PCA(n_components=2)
# components = pca.fit_transform(X)

# fig = px.scatter(components, x=0, y=1, color=y, color_continuous_scale=px.colors.sequential.Viridis)
# fig.show()

In [7]:
# pca = PCA(n_components=3)
# components = pca.fit_transform(X)

# total_var = pca.explained_variance_ratio_.sum() * 100

# fig = px.scatter_3d(
#     components, x=0, y=1, z=2, color=df['loan_status'],
#     title=f'Total Explained Variance: {total_var:.2f}%',
#     labels={'0': 'PC 1', '1': 'PC 2', '2': 'PC 3'}
# )
# fig.show()

## Ensemble Learning

In [8]:
# Ensemble Learning

max_model = VotingClassifier(estimators=[('random forest', rf), 
                                         ('logistic regression', lg), 
                                         ('ADA', ADA)], voting='hard')
max_model.fit(X_train, y_train)
max_model.score(X_test, y_test)
metrics = classification_report(y_test, max_model.predict(X_test))
print (metrics)



              precision    recall  f1-score   support

           0       0.86      0.57      0.69      1124
           1       0.93      0.98      0.96      6501

    accuracy                           0.92      7625
   macro avg       0.89      0.78      0.82      7625
weighted avg       0.92      0.92      0.92      7625





## More advanced ensemble learning

### Stacking

In [9]:
def Stacking(model, train, y, test, n_fold):
    folds = StratifiedKFold(n_splits=n_fold)
    test_pred = np.empty((test.shape[0],1), float)
    train_pred = np.empty((0,1), float)
    
    for train_indices, val_indices in folds.split(train, y.values):
        x_train, x_val = train.iloc[train_indices], train.iloc[val_indices]
        y_train, y_val = y.iloc[train_indices], y.iloc[val_indices]
        
        model.fit(X=x_train, y=y_train)
        train_pred = np.append(train_pred, model.predict(x_val))
        test_pred = np.append(test_pred, model.predict(test))
    
    return test_pred.reshape(-1,1), train_pred

In [10]:
# Create three basic models

model_rf = RandomForestClassifier(max_depth = 40, 
                                  n_estimators = 300)

model_lg = LogisticRegression(C = 0.001, 
                              solver = 'sag', 
                              penalty = 'none')

model_ADA = AdaBoostClassifier(random_state = 0, 
                               base_estimator = DecisionTreeClassifier(max_depth=3, random_state=0), 
                               learning_rate = 0.05, 
                               n_estimators = 100)

In [11]:
test_pred_rf, train_pred_rf = Stacking(model=model_rf, n_fold=10, train=X_train, test=X_test, y=y_train)
train_pred_rf = pd.DataFrame(train_pred_rf)
test_pred_rf = pd.DataFrame(test_pred_rf)

test_pred_ADA, train_pred_ADA = Stacking(model=model_ADA, n_fold=10, train=X_train, test=X_test, y=y_train)
train_pred_ADA = pd.DataFrame(train_pred_ADA)
test_pred_ADA = pd.DataFrame(test_pred_ADA)





## DNN 

In [14]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from torch.autograd import Variable
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence, pad_sequence

from sklearn.metrics import accuracy_score

In [15]:
class loanDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __getitem__(self, index):
        return self.X[index], self.y[index]

    def __len__(self):
        return len(self.X)

In [16]:
class Classifier(nn.Module):

    def __init__(self):
        super().__init__()
        self.dropout = nn.Dropout(0.2)
        self.fc1 = nn.Linear(42, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 2)
        self.act = nn.Softmax(dim=1)

    def forward(self, x):
        x = self.fc1(x)
        x =  nn.functional.relu(x)
        x = self.fc2(x)
        x =  nn.functional.relu(x)
        x = self.fc3(x)
        outputs = self.act(x)
        return outputs

In [17]:
from torch.utils.data import DataLoader

train_dataset = loanDataset(X_train.values, y_train.values)
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=32)

test_dataset = loanDataset(X_test.values, y_test.values)
test_loader = DataLoader(test_dataset, shuffle=True, batch_size=32)

In [18]:
def get_model():
    model = Classifier()
    return model

def get_loss():
    return nn.CrossEntropyLoss()

In [19]:
from sklearn.metrics import f1_score

def binary_accuracy(preds, y):
        # round predictions to the closest integer
    preds = torch.max(preds, 1)[1]
    correct = 0
    total = 0
    correct += (preds == y).float().sum()
    total += y.shape[0]
    acc = correct / total
    return acc, preds

def train(epochs):
    model = get_model()
    criterion = get_loss()
    
    params = model.parameters()
    learning_rate = 0.001
    optimizer = torch.optim.Adam(params, lr=learning_rate)
    
    # Train the model.
    total_steps = len(train_loader)
    n_steps = 0

    # initialize every epoch
    epoch_loss = 0
    epoch_acc = 0
    
    val_acc = []
    val_loss = []
    train_acc = []
    train_loss = []
    f1s = []

    for epoch in range(epochs):
        epoch_loss = 0
        epoch_acc = 0
        n_steps = 0
        model.train()
        
        for i, (data, labels) in enumerate(train_loader):
            n_steps += 1
            
            optimizer.zero_grad()
            
            model.double()
            predictions = model(data)
            loss = criterion(predictions.squeeze(), labels)
                
            y_pred = torch.max(predictions, 1)[1]
            acc = accuracy_score(labels, y_pred)
            f1 = f1_score(labels, y_pred)
            loss.backward()

            optimizer.step()
            epoch_loss += loss.item()
            epoch_acc += acc.item()
            
        train_acc.append(epoch_acc/n_steps)
        train_loss.append(epoch_loss/n_steps)
        
        # Evaluate the model over test set
        test_pred, test_real = evaluate(model)
        test_pred = torch.max(test_pred, 1)[1]
        
#         print (f'Epoch {epoch}, Train Loss: {loss.item()}, Train Acc: {acc.item()}, f1: {f1}, Test Acc: {accuracy_score(test_real, test_pred)}')
        print (f'Epoch {epoch}, Train Acc: {acc.item()}, f1: {f1}, Test Acc: {accuracy_score(test_real, test_pred)}')

        val_acc.append(accuracy_score(test_real, test_pred))
        f1s.append(f1_score(test_real, test_pred))
        
        if epoch % 2 == 0:
            print (classification_report(test_real, test_pred))
        torch.save(model.state_dict(), 'model-tf-%d.pkl' % (epoch + 1))
    
    return train_acc, train_loss, val_acc, f1s
        
def evaluate(model):
    predictions = []
    real = []
    model.eval()
    with torch.no_grad():
        for x_batch, y_batch in test_loader:
            
            y_pred = model(x_batch)
            predictions += list(y_pred.detach().numpy())
            real += list(y_batch.detach().numpy())
    return torch.tensor(predictions), torch.tensor(real)

train_acc, train_loss, val_acc, f1s = train(30)

Epoch 0, Train Acc: 0.9310344827586207, f1: 0.9600000000000001, Test Acc: 0.9215737704918032
              precision    recall  f1-score   support

           0       0.84      0.58      0.68      1124
           1       0.93      0.98      0.96      6501

    accuracy                           0.92      7625
   macro avg       0.89      0.78      0.82      7625
weighted avg       0.92      0.92      0.92      7625

Epoch 1, Train Acc: 0.9310344827586207, f1: 0.9615384615384616, Test Acc: 0.920655737704918
Epoch 2, Train Acc: 0.8275862068965517, f1: 0.8936170212765958, Test Acc: 0.921311475409836
              precision    recall  f1-score   support

           0       0.85      0.57      0.68      1124
           1       0.93      0.98      0.96      6501

    accuracy                           0.92      7625
   macro avg       0.89      0.77      0.82      7625
weighted avg       0.92      0.92      0.91      7625

Epoch 3, Train Acc: 0.9655172413793104, f1: 0.9811320754716981, Test 

In [20]:
# from matplotlib import pyplot as plt

# fig = plt.figure()
# ax = plt.axes()

# x = [i for i in range(1, 31)]

# plt.title("f1 (overall) v.s. Epoch")
# plt.xlabel("Epoch")
# plt.ylabel("f1 score")

# ax.plot(x, f1s)

## 1D CNN

In [21]:
from torch.utils.data import DataLoader

train_dataset = loanDataset(X_train.values, y_train.values)
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=32, drop_last=True)

test_dataset = loanDataset(X_test.values, y_test.values)
test_loader = DataLoader(test_dataset, shuffle=True, batch_size=32, drop_last=True)

In [22]:
class CNN(nn.Module):
    def __init__(self):
        super(CNN, self).__init__()
        self.dropout = nn.Dropout(p=0.5)

        self.conv1d = nn.Sequential(
            nn.Conv1d(42, 32, kernel_size=3, stride=3, padding=0), # in_channels, out_channels, kernel_size, stride=1, padding=0
            nn.BatchNorm1d(32), nn.ReLU())
        self.maxpool1d = nn.AdaptiveMaxPool1d(1)  # the target output size H
        self.conv1d2 = nn.Sequential(
            nn.Conv1d(32, 16, kernel_size=3, stride=3, padding=0),
            nn.BatchNorm1d(16), nn.ReLU())
        self.conv1d3 = nn.Sequential(
            nn.Conv1d(16, 8, kernel_size=3, stride=3, padding=0),
            nn.BatchNorm1d(8), nn.ReLU())

        self.fc1 = nn.Sequential(nn.Linear(8, 16), nn.Dropout(p=0.5), nn.ReLU())
        self.fc2 = nn.Sequential(nn.Linear(16, 2), nn.Sigmoid())
        self.act = nn.Softmax(dim=1)
        
    def forward(self, x):
        x = x.repeat(1, 42).reshape((32, 42, 42))
        output = self.conv1d(x)
        output = self.conv1d2(output)
        output = self.conv1d3(output)

        output = self.maxpool1d(output)
        output = output.view(output.size()[0], -1)
        output = self.fc1(output)
        output = self.fc2(output)
        output = self.act(output)
        return output

In [23]:
from sklearn.metrics import f1_score

def binary_accuracy(preds, y):
        # round predictions to the closest integer
    preds = torch.max(preds, 1)[1]
    correct = 0
    total = 0
    correct += (preds == y).float().sum()
    total += y.shape[0]
    acc = correct / total
    return acc, preds

def train(epochs):
    model = CNN()
    criterion = get_loss()
    
    params = model.parameters()
    learning_rate = 0.001
    optimizer = torch.optim.Adam(params, lr=learning_rate)
    
    # Train the model.
    total_steps = len(train_loader)
    n_steps = 0

    # initialize every epoch
    epoch_loss = 0
    epoch_acc = 0
    
    val_acc = []
    val_loss = []
    train_acc = []
    train_loss = []
    f1s = []

    for epoch in range(epochs):
        epoch_loss = 0
        epoch_acc = 0
        n_steps = 0
        model.train()
        
        for i, (data, labels) in enumerate(train_loader):
            n_steps += 1
            
            optimizer.zero_grad()
            
            model.double()
            predictions = model(data)
            
#             print (predictions.squeeze().shape, labels.shape)
            
            loss = criterion(predictions.squeeze(), labels)
                
            y_pred = torch.max(predictions, 1)[1]
            acc = accuracy_score(labels, y_pred)
            f1 = f1_score(labels, y_pred)
            loss.backward()

            optimizer.step()
            epoch_loss += loss.item()
            epoch_acc += acc.item()
            
        train_acc.append(epoch_acc/n_steps)
        train_loss.append(epoch_loss/n_steps)
        
        # Evaluate the model over test set
        test_pred, test_real = evaluate(model)
        test_pred = torch.max(test_pred, 1)[1]
        
#         print (f'Epoch {epoch}, Train Loss: {loss.item()}, Train Acc: {acc.item()}, f1: {f1}, Test Acc: {accuracy_score(test_real, test_pred)}')
        print (f'Epoch {epoch}, Train Acc: {acc.item()}, f1: {f1}, Test Acc: {accuracy_score(test_real, test_pred)}')

        val_acc.append(accuracy_score(test_real, test_pred))
        f1s.append(f1_score(test_real, test_pred))
        
        if epoch % 2 == 0:
            print (classification_report(test_real, test_pred))
        torch.save(model.state_dict(), 'model-tf-%d.pkl' % (epoch + 1))
    
    return train_acc, train_loss, val_acc, f1s
        
def evaluate(model):
    predictions = []
    real = []
    model.eval()
    with torch.no_grad():
        for x_batch, y_batch in test_loader:
            
            y_pred = model(x_batch)
            predictions += list(y_pred.detach().numpy())
            real += list(y_batch.detach().numpy())
    return torch.tensor(predictions), torch.tensor(real)

In [24]:
train_acc, train_loss, val_acc, f1s = train(30)

[W NNPACK.cpp:79] Could not initialize NNPACK! Reason: Unsupported hardware.


Epoch 0, Train Acc: 0.75, f1: 0.8571428571428571, Test Acc: 0.852547268907563
              precision    recall  f1-score   support

           0       0.00      0.00      0.00      1123
           1       0.85      1.00      0.92      6493

    accuracy                           0.85      7616
   macro avg       0.43      0.50      0.46      7616
weighted avg       0.73      0.85      0.78      7616



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 1, Train Acc: 0.875, f1: 0.9333333333333333, Test Acc: 0.852547268907563
Epoch 2, Train Acc: 0.875, f1: 0.9333333333333333, Test Acc: 0.852547268907563
              precision    recall  f1-score   support

           0       0.00      0.00      0.00      1123
           1       0.85      1.00      0.92      6493

    accuracy                           0.85      7616
   macro avg       0.43      0.50      0.46      7616
weighted avg       0.73      0.85      0.78      7616



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 3, Train Acc: 0.8125, f1: 0.896551724137931, Test Acc: 0.8528098739495799
Epoch 4, Train Acc: 0.875, f1: 0.9333333333333333, Test Acc: 0.8526785714285714
              precision    recall  f1-score   support

           0       0.00      0.00      0.00      1122
           1       0.85      1.00      0.92      6494

    accuracy                           0.85      7616
   macro avg       0.43      0.50      0.46      7616
weighted avg       0.73      0.85      0.78      7616



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 5, Train Acc: 0.84375, f1: 0.9152542372881356, Test Acc: 0.8526785714285714
Epoch 6, Train Acc: 0.84375, f1: 0.9152542372881356, Test Acc: 0.852547268907563
              precision    recall  f1-score   support

           0       0.00      0.00      0.00      1123
           1       0.85      1.00      0.92      6493

    accuracy                           0.85      7616
   macro avg       0.43      0.50      0.46      7616
weighted avg       0.73      0.85      0.78      7616



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 7, Train Acc: 0.8125, f1: 0.896551724137931, Test Acc: 0.852547268907563
Epoch 8, Train Acc: 0.9375, f1: 0.967741935483871, Test Acc: 0.8524159663865546
              precision    recall  f1-score   support

           0       0.00      0.00      0.00      1124
           1       0.85      1.00      0.92      6492

    accuracy                           0.85      7616
   macro avg       0.43      0.50      0.46      7616
weighted avg       0.73      0.85      0.78      7616



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 9, Train Acc: 0.8125, f1: 0.896551724137931, Test Acc: 0.8526785714285714
Epoch 10, Train Acc: 0.875, f1: 0.9333333333333333, Test Acc: 0.852547268907563
              precision    recall  f1-score   support

           0       0.00      0.00      0.00      1123
           1       0.85      1.00      0.92      6493

    accuracy                           0.85      7616
   macro avg       0.43      0.50      0.46      7616
weighted avg       0.73      0.85      0.78      7616



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 11, Train Acc: 0.875, f1: 0.9333333333333333, Test Acc: 0.8524159663865546
Epoch 12, Train Acc: 0.96875, f1: 0.9841269841269841, Test Acc: 0.852547268907563
              precision    recall  f1-score   support

           0       0.00      0.00      0.00      1123
           1       0.85      1.00      0.92      6493

    accuracy                           0.85      7616
   macro avg       0.43      0.50      0.46      7616
weighted avg       0.73      0.85      0.78      7616



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 13, Train Acc: 0.84375, f1: 0.9152542372881356, Test Acc: 0.852547268907563
Epoch 14, Train Acc: 0.90625, f1: 0.9508196721311475, Test Acc: 0.8524159663865546
              precision    recall  f1-score   support

           0       0.00      0.00      0.00      1124
           1       0.85      1.00      0.92      6492

    accuracy                           0.85      7616
   macro avg       0.43      0.50      0.46      7616
weighted avg       0.73      0.85      0.78      7616



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 15, Train Acc: 0.8125, f1: 0.896551724137931, Test Acc: 0.852547268907563
Epoch 16, Train Acc: 0.84375, f1: 0.9152542372881356, Test Acc: 0.8528098739495799
              precision    recall  f1-score   support

           0       0.00      0.00      0.00      1121
           1       0.85      1.00      0.92      6495

    accuracy                           0.85      7616
   macro avg       0.43      0.50      0.46      7616
weighted avg       0.73      0.85      0.79      7616



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 17, Train Acc: 0.90625, f1: 0.9508196721311475, Test Acc: 0.8524159663865546
Epoch 18, Train Acc: 0.78125, f1: 0.8771929824561403, Test Acc: 0.852547268907563
              precision    recall  f1-score   support

           0       0.00      0.00      0.00      1123
           1       0.85      1.00      0.92      6493

    accuracy                           0.85      7616
   macro avg       0.43      0.50      0.46      7616
weighted avg       0.73      0.85      0.78      7616



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 19, Train Acc: 0.9375, f1: 0.967741935483871, Test Acc: 0.8528098739495799
Epoch 20, Train Acc: 0.875, f1: 0.9333333333333333, Test Acc: 0.8526785714285714
              precision    recall  f1-score   support

           0       0.00      0.00      0.00      1122
           1       0.85      1.00      0.92      6494

    accuracy                           0.85      7616
   macro avg       0.43      0.50      0.46      7616
weighted avg       0.73      0.85      0.78      7616



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 21, Train Acc: 0.8125, f1: 0.896551724137931, Test Acc: 0.8528098739495799
Epoch 22, Train Acc: 0.875, f1: 0.9333333333333333, Test Acc: 0.852547268907563
              precision    recall  f1-score   support

           0       0.00      0.00      0.00      1123
           1       0.85      1.00      0.92      6493

    accuracy                           0.85      7616
   macro avg       0.43      0.50      0.46      7616
weighted avg       0.73      0.85      0.78      7616



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 23, Train Acc: 0.78125, f1: 0.8771929824561403, Test Acc: 0.8524159663865546
Epoch 24, Train Acc: 0.9375, f1: 0.967741935483871, Test Acc: 0.8526785714285714
              precision    recall  f1-score   support

           0       0.00      0.00      0.00      1122
           1       0.85      1.00      0.92      6494

    accuracy                           0.85      7616
   macro avg       0.43      0.50      0.46      7616
weighted avg       0.73      0.85      0.78      7616



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 25, Train Acc: 0.84375, f1: 0.9152542372881356, Test Acc: 0.8524159663865546
Epoch 26, Train Acc: 0.8125, f1: 0.896551724137931, Test Acc: 0.8526785714285714
              precision    recall  f1-score   support

           0       0.00      0.00      0.00      1122
           1       0.85      1.00      0.92      6494

    accuracy                           0.85      7616
   macro avg       0.43      0.50      0.46      7616
weighted avg       0.73      0.85      0.78      7616



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 27, Train Acc: 0.90625, f1: 0.9508196721311475, Test Acc: 0.852547268907563
Epoch 28, Train Acc: 0.84375, f1: 0.9152542372881356, Test Acc: 0.8524159663865546
              precision    recall  f1-score   support

           0       0.00      0.00      0.00      1124
           1       0.85      1.00      0.92      6492

    accuracy                           0.85      7616
   macro avg       0.43      0.50      0.46      7616
weighted avg       0.73      0.85      0.78      7616



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 29, Train Acc: 0.84375, f1: 0.9152542372881356, Test Acc: 0.8528098739495799


## LSTM

In [25]:
from torch.utils.data import DataLoader

train_dataset = loanDataset(X_train.values, y_train.values)
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=32)

test_dataset = loanDataset(X_test.values, y_test.values)
test_loader = DataLoader(test_dataset, shuffle=True, batch_size=32)

In [31]:
class Classifier(nn.Module):

    def __init__(self):
        super().__init__()
        
        self.lstm = nn.LSTM(1, 50, batch_first=True)
        self.linear = nn.Linear(50, 2)
        self.dropout = nn.Dropout(0.2)
        self.act = nn.Softmax(dim=1)

    def forward(self, x):
        x = x.unsqueeze(-1)
        x = self.dropout(x) # size([64, 42, 1])
        
        ht = torch.randn(2, 3, 20)
        ct = torch.randn(2, 3, 20)
        lstm_out, (ht, ct) = self.lstm(x)
        dense_outputs = self.linear(ht[-1])
        outputs = self.act(dense_outputs)
        return outputs

In [34]:
from sklearn.metrics import f1_score

def binary_accuracy(preds, y):
        # round predictions to the closest integer
    preds = torch.max(preds, 1)[1]
    correct = 0
    total = 0
    correct += (preds == y).float().sum()
    total += y.shape[0]
    acc = correct / total
    return acc, preds

def train(epochs):
    model = get_model()
    criterion = get_loss()
    
    params = model.parameters()
    learning_rate = 0.001
    optimizer = torch.optim.Adam(params, lr=learning_rate)
    
    # Train the model.
    total_steps = len(train_loader)
    n_steps = 0

    # initialize every epoch
    epoch_loss = 0
    epoch_acc = 0
    
    val_acc = []
    val_loss = []
    train_acc = []
    train_loss = []
    f1s = []

    for epoch in range(epochs):
        epoch_loss = 0
        epoch_acc = 0
        n_steps = 0
        model.train()
        
        for i, (data, labels) in enumerate(train_loader):
            n_steps += 1
            
            optimizer.zero_grad()
            
            model.double()
            predictions = model(data)
            loss = criterion(predictions.squeeze(), labels)
                
            y_pred = torch.max(predictions, 1)[1]
            acc = accuracy_score(labels, y_pred)
            f1 = f1_score(labels, y_pred)
            loss.backward()

            optimizer.step()
            epoch_loss += loss.item()
            epoch_acc += acc.item()
            
        train_acc.append(epoch_acc/n_steps)
        train_loss.append(epoch_loss/n_steps)
        
        # Evaluate the model over test set
        test_pred, test_real = evaluate(model)
        test_pred = torch.max(test_pred, 1)[1]
        
#         print (f'Epoch {epoch}, Train Loss: {loss.item()}, Train Acc: {acc.item()}, f1: {f1}, Test Acc: {accuracy_score(test_real, test_pred)}')
        print (f'Epoch {epoch}, Train Acc: {acc.item()}, f1: {f1}, Test Acc: {accuracy_score(test_real, test_pred)}')

        val_acc.append(accuracy_score(test_real, test_pred))
        f1s.append(f1_score(test_real, test_pred))
        
        if epoch % 2 == 0:
            print (classification_report(test_real, test_pred))
        torch.save(model.state_dict(), 'model-tf-%d.pkl' % (epoch + 1))
    
    return train_acc, train_loss, val_acc, f1s
        
def evaluate(model):
    predictions = []
    real = []
    model.eval()
    with torch.no_grad():
        for x_batch, y_batch in test_loader:
            
            y_pred = model(x_batch)
            predictions += list(y_pred.detach().numpy())
            real += list(y_batch.detach().numpy())
    return torch.tensor(predictions), torch.tensor(real)

In [35]:
train_acc, train_loss, val_acc, f1s = train(30)

Epoch 0, Train Acc: 0.7931034482758621, f1: 0.8846153846153846, Test Acc: 0.8525901639344262
              precision    recall  f1-score   support

           0       0.00      0.00      0.00      1124
           1       0.85      1.00      0.92      6501

    accuracy                           0.85      7625
   macro avg       0.43      0.50      0.46      7625
weighted avg       0.73      0.85      0.78      7625



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 1, Train Acc: 0.8620689655172413, f1: 0.9259259259259259, Test Acc: 0.8525901639344262
Epoch 2, Train Acc: 0.896551724137931, f1: 0.9454545454545454, Test Acc: 0.8525901639344262
              precision    recall  f1-score   support

           0       0.00      0.00      0.00      1124
           1       0.85      1.00      0.92      6501

    accuracy                           0.85      7625
   macro avg       0.43      0.50      0.46      7625
weighted avg       0.73      0.85      0.78      7625



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 3, Train Acc: 0.8275862068965517, f1: 0.9056603773584906, Test Acc: 0.8525901639344262
Epoch 4, Train Acc: 0.896551724137931, f1: 0.9454545454545454, Test Acc: 0.8525901639344262
              precision    recall  f1-score   support

           0       0.00      0.00      0.00      1124
           1       0.85      1.00      0.92      6501

    accuracy                           0.85      7625
   macro avg       0.43      0.50      0.46      7625
weighted avg       0.73      0.85      0.78      7625



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 5, Train Acc: 0.896551724137931, f1: 0.9454545454545454, Test Acc: 0.8525901639344262
Epoch 6, Train Acc: 0.8620689655172413, f1: 0.9259259259259259, Test Acc: 0.8525901639344262
              precision    recall  f1-score   support

           0       0.00      0.00      0.00      1124
           1       0.85      1.00      0.92      6501

    accuracy                           0.85      7625
   macro avg       0.43      0.50      0.46      7625
weighted avg       0.73      0.85      0.78      7625



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 7, Train Acc: 0.896551724137931, f1: 0.9454545454545454, Test Acc: 0.8525901639344262
Epoch 8, Train Acc: 0.896551724137931, f1: 0.9454545454545454, Test Acc: 0.8525901639344262
              precision    recall  f1-score   support

           0       0.00      0.00      0.00      1124
           1       0.85      1.00      0.92      6501

    accuracy                           0.85      7625
   macro avg       0.43      0.50      0.46      7625
weighted avg       0.73      0.85      0.78      7625



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 9, Train Acc: 0.9310344827586207, f1: 0.9615384615384616, Test Acc: 0.9217049180327869
Epoch 10, Train Acc: 0.8275862068965517, f1: 0.9019607843137255, Test Acc: 0.9217049180327869
              precision    recall  f1-score   support

           0       0.84      0.58      0.69      1124
           1       0.93      0.98      0.96      6501

    accuracy                           0.92      7625
   macro avg       0.89      0.78      0.82      7625
weighted avg       0.92      0.92      0.92      7625

Epoch 11, Train Acc: 0.8620689655172413, f1: 0.9166666666666666, Test Acc: 0.9217049180327869
Epoch 12, Train Acc: 0.9655172413793104, f1: 0.9803921568627451, Test Acc: 0.9217049180327869
              precision    recall  f1-score   support

           0       0.84      0.58      0.69      1124
           1       0.93      0.98      0.96      6501

    accuracy                           0.92      7625
   macro avg       0.89      0.78      0.82      7625
weighted avg       0.92   

In [36]:
from matplotlib import pyplot as plt

# fig = plt.figure()
# ax = plt.axes()

# x = [i for i in range(1, 31)]

# plt.title("Val Acc v.s. Epoch")
# plt.xlabel("Epoch")
# plt.ylabel("Val Acc")

# ax.plot(x, val_acc)