### Imports

In [1]:
import numpy as np
import seaborn as sns
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt

from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from xgboost import plot_importance

from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.utils import class_weight
from sklearn.linear_model import LogisticRegression

from torch import nn
import neural_network
import losses
import torch

### Read In loan application data

In [3]:
df = pd.read_csv("clean2_data-ext-norm.csv", index_col=0)

In [4]:
seed = 100
test_size = 0.15
X = df.drop(columns=["TARGET", "SK_ID_CURR"], axis=1)
y = df["TARGET"]

# note: stratify=df.buy generates
X_train, X_test, y_train, y_test  = train_test_split(X, y, test_size=test_size, random_state=seed, stratify=y)
classes_weights = class_weight.compute_sample_weight(
    class_weight='balanced',
    y=y_train
)

### Linear Regression --> Might want to switch this to soft SVM classifier bec this is not a regression task

In [7]:
# # linear model 
# x = sm.add_constant(X_train, prepend=False)
# lin_mod = sm.OLS(y_train, x.astype(float))
# lin_mod = lin_mod.fit()
# print(lin_mod.summary())

In [8]:
# y_pred_tr = lin_mod.predict(x.astype(float))
# y_pred_te = lin_mod.predict(sm.add_constant(X_test, prepend=False).astype(float))

# accuracy_tr = accuracy_score(y_train, y_pred_tr)
# print("Train Accuracy: %.2f%%" % (accuracy_tr * 100.0))

# accuracy_te = accuracy_score(y_test, y_pred_te)
# print("Test Accuracy: %.2f%%" % (accuracy_te * 100.0))

# print('f1 score on train set', f1_score(y_train, y_pred_tr))
# print('f1 score on test set', f1_score(y_test, y_pred_te))

### Logistic Regression

In [9]:
logreg = LogisticRegression(class_weight='balanced', max_iter=10000)
logreg.fit(X_train, y_train)

LogisticRegression(class_weight='balanced', max_iter=10000)

In [10]:
f1_tr = f1_score(y_train, logreg.predict(X_train))
print('f1 score on train set', f1_tr)

f1_te = f1_score(y_test, logreg.predict(X_test))
print('f1 score on test set', f1_te)

f1 score on train set 0.17542486583184255
f1 score on test set 0.17456678219141564


In [11]:
print('Classification Report for Train Set: \n', classification_report(y_train, logreg.predict(X_train)))
print('Classification Report for Test Set: \n', classification_report(y_test, logreg.predict(X_test)))

Classification Report for Train Set: 
               precision    recall  f1-score   support

           0       0.94      0.61      0.74    207743
           1       0.10      0.54      0.18     17416

    accuracy                           0.61    225159
   macro avg       0.52      0.58      0.46    225159
weighted avg       0.88      0.61      0.70    225159

Classification Report for Test Set: 
               precision    recall  f1-score   support

           0       0.94      0.62      0.75     36661
           1       0.10      0.53      0.17      3074

    accuracy                           0.61     39735
   macro avg       0.52      0.57      0.46     39735
weighted avg       0.88      0.61      0.70     39735



In [12]:
tn, fp, fn, tp = confusion_matrix(y_train, logreg.predict(X_train)).ravel()
tn, fp, fn, tp

(127245, 80498, 8002, 9414)

### Extreme Gradient Boosted Trees

In [13]:
xgb_model = XGBClassifier(base_score=np.sum(y_train == 0)/len(y_train), max_depth=10)
xgb_model.fit(X_train, y_train, sample_weight=classes_weights)

XGBClassifier(base_score=0.9226502160695331, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=10, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=None, ...)

In [14]:
y_pred_tr = xgb_model.predict(X_train)
y_pred_te = xgb_model.predict(X_test)

accuracy_tr = accuracy_score(y_train, y_pred_tr)
print("Train Accuracy: %.2f%%" % (accuracy_tr * 100.0))

accuracy_te = accuracy_score(y_test, y_pred_te)
print("Test Accuracy: %.2f%%" % (accuracy_te * 100.0))

print('f1 score on train set', f1_score(y_train, y_pred_tr))
print('f1 score on test set', f1_score(y_test, y_pred_te))

Train Accuracy: 89.12%
Test Accuracy: 81.48%
f1 score on train set 0.5766784207978981
f1 score on test set 0.2577912254160363


In [15]:
tn, fp, fn, tp = confusion_matrix(y_train, y_pred_tr).ravel()
tn, fp, fn, tp

(183988, 23755, 735, 16681)

In [16]:
feat_dict= {}
for col, val in sorted(zip(X_train.columns, xgb_model.feature_importances_),key=lambda x:x[1],reverse=True):
  feat_dict[col]=val
feat_df = pd.DataFrame({'Feature':feat_dict.keys(),'Importance':feat_dict.values()})
feat_df

Unnamed: 0,Feature,Importance
0,EXT_SOURCE_3,0.025717
1,EXT_SOURCE_2,0.017720
2,NAME_CONTRACT_TYPE,0.015495
3,Higher education,0.015250
4,CODE_GENDER,0.014997
...,...,...
147,Pensioner,0.000000
148,Student,0.000000
149,Unemployed,0.000000
150,Advertising,0.000000


### Neural Network

In [17]:
n, d = X_train.shape
input_dim = d
hidden_dim = d//2
output_dim = 1
num_epochs = 500
model_nn = neural_network.Model(input_dim, hidden_dim, output_dim)

model_nn = neural_network.train_regression_model(torch.tensor(X_train.values, dtype=torch.float32), 
                                      torch.tensor(y_train.values, dtype=torch.float32), 
                                      model_nn, 
                                      num_epochs, 
                                      loss_fn = losses.DiceBCELoss(weight=torch.tensor(classes_weights)),
                                      # loss_fn = losses.DiceLoss(),
                                      # loss_fn = nn.BCELoss(weight=torch.tensor(classes_weights)), 
                                      lr=1e-3, print_freq=25, display_loss=True)

epoch 25 loss 50.85640335083008
epoch 50 loss 50.85640335083008
epoch 75 loss 50.85640335083008
epoch 100 loss 50.85640335083008
epoch 125 loss 50.85640335083008
epoch 150 loss 50.85640335083008
epoch 175 loss 50.85640335083008
epoch 200 loss 50.85640335083008
epoch 225 loss 50.85640335083008
epoch 250 loss 50.85640335083008
epoch 275 loss 50.85640335083008
epoch 300 loss 50.85640335083008
epoch 325 loss 50.85640335083008
epoch 350 loss 50.85640335083008
epoch 375 loss 50.85640335083008
epoch 400 loss 50.85640335083008
epoch 425 loss 50.85640335083008
epoch 450 loss 50.85640335083008
epoch 475 loss 50.85640335083008
epoch 500 loss 50.85640335083008


In [18]:
model_nn.eval()

y_pred_tr = model_nn(torch.tensor(X_train.values, dtype=torch.float32)).detach().numpy().flatten()
y_pred_tr = np.round(y_pred_tr)
y_pred_te = model_nn(torch.tensor(X_test.values, dtype=torch.float32)).detach().numpy().flatten()
y_pred_te = np.round(y_pred_te)

accuracy_tr = accuracy_score(y_train, y_pred_tr)
print("Train Accuracy: %.2f%%" % (accuracy_tr * 100.0))

accuracy_te = accuracy_score(y_test, y_pred_te)
print("Test Accuracy: %.2f%%" % (accuracy_te * 100.0))

print('f1 score on train set', f1_score(y_train, y_pred_tr))
print('f1 score on test set', f1_score(y_test, y_pred_te))

Train Accuracy: 7.73%
Test Accuracy: 7.74%
f1 score on train set 0.143592703287643
f1 score on test set 0.14361466046859306


In [19]:
tn, fp, fn, tp = confusion_matrix(y_train, y_pred_tr).ravel()
tn, fp, fn, tp

(0, 207743, 0, 17416)