### Imports

In [1]:
import numpy as np
import seaborn as sns
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt

from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from xgboost import plot_importance

from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.utils import class_weight
from sklearn.linear_model import LogisticRegression

from torch import nn
import neural_network
import torch

  from .autonotebook import tqdm as notebook_tqdm


### Read In loan application data

In [2]:
df = pd.read_csv("../data/clean2_data.csv", index_col=0)

In [3]:
seed = 100
test_size = 0.15
X = df.drop(columns=["TARGET", "SK_ID_CURR"], axis=1)
y = df["TARGET"]

# note: stratify=df.buy generates
X_train, X_test, y_train, y_test  = train_test_split(X, y, test_size=test_size, random_state=seed, stratify=y)
classes_weights = class_weight.compute_sample_weight(
    class_weight='balanced',
    y=y_train
)

### Linear Regression --> Might want to switch this to soft SVM classifier bec this is not a regression task

In [4]:
# # linear model 
# x = sm.add_constant(X_train, prepend=False)
# lin_mod = sm.OLS(y_train, x.astype(float))
# lin_mod = lin_mod.fit()
# # print(lin_mod.summary())

In [5]:
# y_pred_tr = lin_mod.predict(x.astype(float))
# y_pred_te = lin_mod.predict(sm.add_constant(X_test, prepend=False).astype(float))

# accuracy_tr = accuracy_score(y_train, y_pred_tr)
# print("Train Accuracy: %.2f%%" % (accuracy_tr * 100.0))

# accuracy_te = accuracy_score(y_test, y_pred_te)
# print("Test Accuracy: %.2f%%" % (accuracy_te * 100.0))

# print('f1 score on train set', f1_score(y_train, y_pred_tr))
# print('f1 score on test set', f1_score(y_test, y_pred_te))

### Logistic Regression

In [6]:
logreg = LogisticRegression(class_weight='balanced', max_iter=10000)
logreg.fit(X_train, y_train)

In [7]:
f1_tr = f1_score(y_train, logreg.predict(X_train))
print('f1 score on train set', f1_tr)

f1_te = f1_score(y_test, logreg.predict(X_test))
print('f1 score on test set', f1_te)

f1 score on train set 0.17542682472228438
f1 score on test set 0.1743906991627113


In [8]:
print('Classification Report for Train Set: \n', classification_report(y_train, logreg.predict(X_train)))
print('Classification Report for Test Set: \n', classification_report(y_test, logreg.predict(X_test)))

Classification Report for Train Set: 
               precision    recall  f1-score   support

           0       0.94      0.61      0.74    207743
           1       0.10      0.54      0.18     17416

    accuracy                           0.61    225159
   macro avg       0.52      0.58      0.46    225159
weighted avg       0.88      0.61      0.70    225159

Classification Report for Test Set: 
               precision    recall  f1-score   support

           0       0.94      0.62      0.75     36661
           1       0.10      0.53      0.17      3074

    accuracy                           0.61     39735
   macro avg       0.52      0.57      0.46     39735
weighted avg       0.88      0.61      0.70     39735



### Extreme Gradient Boosted Trees

In [9]:
xgb_model = XGBClassifier(base_score=np.sum(y_train == 0)/len(y_train), max_depth=6)
xgb_model.fit(X_train, y_train, sample_weight=classes_weights)

In [10]:
y_pred_tr = xgb_model.predict(X_train)
y_pred_te = xgb_model.predict(X_test)

accuracy_tr = accuracy_score(y_train, y_pred_tr)
print("Train Accuracy: %.2f%%" % (accuracy_tr * 100.0))

accuracy_te = accuracy_score(y_test, y_pred_te)
print("Test Accuracy: %.2f%%" % (accuracy_te * 100.0))

print('f1 score on train set', f1_score(y_train, y_pred_tr))
print('f1 score on test set', f1_score(y_test, y_pred_te))

Train Accuracy: 72.84%
Test Accuracy: 70.10%
f1 score on train set 0.30340585488096594
f1 score on test set 0.222178580780309


In [11]:
feat_dict= {}
for col, val in sorted(zip(X_train.columns, xgb_model.feature_importances_),key=lambda x:x[1],reverse=True):
  feat_dict[col]=val
feat_df = pd.DataFrame({'Feature':feat_dict.keys(),'Importance':feat_dict.values()})
feat_df

Unnamed: 0,Feature,Importance
0,Higher education,0.106579
1,CODE_GENDER,0.038968
2,car_owned_less_10,0.033034
3,REGION_RATING_CLIENT_W_CITY,0.032383
4,NAME_CONTRACT_TYPE,0.020415
...,...,...
144,Industry: type 10,0.000000
145,Industry: type 13,0.000000
146,Industry: type 2,0.000000
147,Insurance,0.000000


### Neural Network

In [1]:
import numpy as np
import seaborn as sns
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt

from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from xgboost import plot_importance

from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.utils import class_weight
from sklearn.linear_model import LogisticRegression

from torch import nn
import neural_network
import torch

df = pd.read_csv("../data/clean2_data.csv", index_col=0)

seed = 100
test_size = 0.15
X = df.drop(columns=["TARGET", "SK_ID_CURR"], axis=1)
y = df["TARGET"]

# note: stratify=df.buy generates
X_train, X_test, y_train, y_test  = train_test_split(X, y, test_size=test_size, random_state=seed, stratify=y)
classes_weights = class_weight.compute_sample_weight(
    class_weight='balanced',
    y=y_train
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
n, d = X_train.shape
input_dim = d
hidden_dim = d//2
output_dim = 1
num_epochs = 1000
model_nn = neural_network.Model(input_dim, hidden_dim, output_dim)

model_nn = neural_network.train_regression_model(torch.tensor(X_train.values, dtype=torch.float32), 
                                      torch.tensor(y_train.values, dtype=torch.float32), 
                                      model_nn, 
                                      num_epochs, 
                                      loss_fn = nn.BCELoss(weight=torch.tensor(classes_weights)), 
                                      lr=1e-3, print_freq=25, display_loss=True)

epoch 25 loss 49.999996185302734
epoch 50 loss 49.999996185302734
epoch 75 loss 49.999996185302734
epoch 100 loss 49.999996185302734
epoch 125 loss 49.999996185302734
epoch 150 loss 49.999996185302734
epoch 175 loss 49.999996185302734
epoch 200 loss 49.999996185302734
epoch 225 loss 49.999996185302734
epoch 250 loss 49.999996185302734


KeyboardInterrupt: 

In [None]:
model_nn.eval()

y_pred_tr = model_nn(torch.tensor(X_train.values, dtype=torch.float32)).detach().numpy().flatten()
y_pred_tr = np.round(y_pred_tr)
y_pred_te = model_nn(torch.tensor(X_test.values, dtype=torch.float32)).detach().numpy().flatten()
y_pred_te = np.round(y_pred_te)

accuracy_tr = accuracy_score(y_train, y_pred_tr)
print("Train Accuracy: %.2f%%" % (accuracy_tr * 100.0))

accuracy_te = accuracy_score(y_test, y_pred_te)
print("Test Accuracy: %.2f%%" % (accuracy_te * 100.0))

print('f1 score on train set', f1_score(y_train, y_pred_tr))
print('f1 score on test set', f1_score(y_test, y_pred_te))

In [6]:
y_pred_tr

array([0., 0., 0., ..., 0., 0., 0.], dtype=float32)

In [9]:
pd.value_counts(np.round(y_pred_tr))

0.0    225123
1.0        36
dtype: int64

In [11]:
classes_weights

array([0.54191718, 0.54191718, 0.54191718, ..., 0.54191718, 0.54191718,
       6.46414217])

In [12]:
y_train

228590    0
158554    0
211236    0
48113     0
32847     0
         ..
135447    0
20871     0
37447     0
90991     0
183616    1
Name: TARGET, Length: 225159, dtype: int64