### Imports

In [11]:
import numpy as np
import seaborn as sns
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt

from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from xgboost import plot_importance

from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.utils import class_weight
from sklearn.linear_model import LogisticRegression

from torch import nn
import neural_network
import losses
import torch

### Read In loan application data

In [12]:
df = pd.read_csv("../data/clean2_data_normalized.csv", index_col=0)

In [13]:
seed = 100
test_size = 0.15
X = df.drop(columns=["TARGET", "SK_ID_CURR"], axis=1)
y = df["TARGET"]

# note: stratify=df.buy generates
X_train, X_test, y_train, y_test  = train_test_split(X, y, test_size=test_size, random_state=seed, stratify=y)
classes_weights = class_weight.compute_sample_weight(
    class_weight='balanced',
    y=y_train
)

### Linear Regression --> Might want to switch this to soft SVM classifier bec this is not a regression task

In [14]:
# # linear model 
# x = sm.add_constant(X_train, prepend=False)
# lin_mod = sm.OLS(y_train, x.astype(float))
# lin_mod = lin_mod.fit()
# # print(lin_mod.summary())

In [15]:
# y_pred_tr = lin_mod.predict(x.astype(float))
# y_pred_te = lin_mod.predict(sm.add_constant(X_test, prepend=False).astype(float))

# accuracy_tr = accuracy_score(y_train, y_pred_tr)
# print("Train Accuracy: %.2f%%" % (accuracy_tr * 100.0))

# accuracy_te = accuracy_score(y_test, y_pred_te)
# print("Test Accuracy: %.2f%%" % (accuracy_te * 100.0))

# print('f1 score on train set', f1_score(y_train, y_pred_tr))
# print('f1 score on test set', f1_score(y_test, y_pred_te))

### Logistic Regression

In [16]:
logreg = LogisticRegression(class_weight='balanced', max_iter=10000)
logreg.fit(X_train, y_train)

In [17]:
f1_tr = f1_score(y_train, logreg.predict(X_train))
print('f1 score on train set', f1_tr)

f1_te = f1_score(y_test, logreg.predict(X_test))
print('f1 score on test set', f1_te)

f1 score on train set 0.21053634190248505
f1 score on test set 0.20696833570724224


In [18]:
print('Classification Report for Train Set: \n', classification_report(y_train, logreg.predict(X_train)))
print('Classification Report for Test Set: \n', classification_report(y_test, logreg.predict(X_test)))

Classification Report for Train Set: 
               precision    recall  f1-score   support

         0.0       0.95      0.63      0.76    207743
         1.0       0.13      0.63      0.21     17416

    accuracy                           0.63    225159
   macro avg       0.54      0.63      0.49    225159
weighted avg       0.89      0.63      0.72    225159

Classification Report for Test Set: 
               precision    recall  f1-score   support

         0.0       0.95      0.64      0.76     36661
         1.0       0.12      0.61      0.21      3074

    accuracy                           0.64     39735
   macro avg       0.54      0.63      0.49     39735
weighted avg       0.89      0.64      0.72     39735



In [19]:
tn, fp, fn, tp = confusion_matrix(y_train, logreg.predict(X_train)).ravel()
tn, fp, fn, tp

(131222, 76521, 6364, 11052)

### Extreme Gradient Boosted Trees

In [20]:
xgb_model = XGBClassifier(base_score=np.sum(y_train == 0)/len(y_train), max_depth=10)
xgb_model.fit(X_train, y_train, sample_weight=classes_weights)

In [21]:
y_pred_tr = xgb_model.predict(X_train)
y_pred_te = xgb_model.predict(X_test)

accuracy_tr = accuracy_score(y_train, y_pred_tr)
print("Train Accuracy: %.2f%%" % (accuracy_tr * 100.0))

accuracy_te = accuracy_score(y_test, y_pred_te)
print("Test Accuracy: %.2f%%" % (accuracy_te * 100.0))

print('f1 score on train set', f1_score(y_train, y_pred_tr))
print('f1 score on test set', f1_score(y_test, y_pred_te))

Train Accuracy: 89.24%
Test Accuracy: 80.50%
f1 score on train set 0.5822378635751992
f1 score on test set 0.20647275706677592


In [22]:
tn, fp, fn, tp = confusion_matrix(y_train, y_pred_tr).ravel()
tn, fp, fn, tp

(184068, 23675, 541, 16875)

In [23]:
feat_dict= {}
for col, val in sorted(zip(X_train.columns, xgb_model.feature_importances_),key=lambda x:x[1],reverse=True):
  feat_dict[col]=val
feat_df = pd.DataFrame({'Feature':feat_dict.keys(),'Importance':feat_dict.values()})
feat_df

Unnamed: 0,Feature,Importance
0,Higher education,0.058260
1,REGION_RATING_CLIENT_W_CITY,0.020932
2,car_owned_less_10,0.016948
3,CODE_GENDER,0.016613
4,Secretaries,0.013290
...,...,...
144,FLAG_EMP_PHONE,0.000000
145,Businessman,0.000000
146,Pensioner,0.000000
147,Unemployed,0.000000


### Neural Network

In [24]:
n, d = X_train.shape
input_dim = d
hidden_dim = d//2
output_dim = 1
num_epochs = 500
model_nn = neural_network.Model(input_dim, hidden_dim, output_dim)

model_nn = neural_network.train_regression_model(torch.tensor(X_train.values, dtype=torch.float32), 
                                      torch.tensor(y_train.values, dtype=torch.float32), 
                                      model_nn, 
                                      num_epochs, 
                                      loss_fn = losses.DiceBCELoss(weight=torch.tensor(classes_weights)),
                                      # loss_fn = losses.DiceLoss(),
                                      # loss_fn = nn.BCELoss(weight=torch.tensor(classes_weights)), 
                                      lr=1e-3, print_freq=25, display_loss=True)

epoch 25 loss 1.5252749919891357
epoch 50 loss 1.507124900817871
epoch 75 loss 1.498724102973938
epoch 100 loss 1.493520975112915
epoch 125 loss 1.49061119556427
epoch 150 loss 1.4890379905700684
epoch 175 loss 1.4879469871520996
epoch 200 loss 1.4869275093078613
epoch 225 loss 1.4858903884887695
epoch 250 loss 1.4848406314849854
epoch 275 loss 1.4838814735412598
epoch 300 loss 1.482987642288208
epoch 325 loss 1.482181429862976
epoch 350 loss 1.4814462661743164
epoch 375 loss 1.4807624816894531
epoch 400 loss 1.4801074266433716
epoch 425 loss 1.47951078414917
epoch 450 loss 1.4788575172424316
epoch 475 loss 1.478256344795227
epoch 500 loss 1.4776177406311035


In [25]:
model_nn.eval()

y_pred_tr = model_nn(torch.tensor(X_train.values, dtype=torch.float32)).detach().numpy().flatten()
y_pred_tr = np.round(y_pred_tr)
y_pred_te = model_nn(torch.tensor(X_test.values, dtype=torch.float32)).detach().numpy().flatten()
y_pred_te = np.round(y_pred_te)

accuracy_tr = accuracy_score(y_train, y_pred_tr)
print("Train Accuracy: %.2f%%" % (accuracy_tr * 100.0))

accuracy_te = accuracy_score(y_test, y_pred_te)
print("Test Accuracy: %.2f%%" % (accuracy_te * 100.0))

print('f1 score on train set', f1_score(y_train, y_pred_tr))
print('f1 score on test set', f1_score(y_test, y_pred_te))

Train Accuracy: 64.51%
Test Accuracy: 64.55%
f1 score on train set 0.21332230688955833
f1 score on test set 0.20598680872653474


In [26]:
tn, fp, fn, tp = confusion_matrix(y_train, y_pred_tr).ravel()
tn, fp, fn, tp

(134419, 73324, 6582, 10834)