## Import packages

In [52]:
# Package for ignoring necessary warnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings(action='ignore', category=DeprecationWarning)
# Packages for data handling
import pandas as pd
import numpy as np
import random
# Packages for modeling
from sklearn.preprocessing import Imputer
imputer = Imputer(strategy="median")
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier, AdaBoostRegressor, RandomForestClassifier
# Packages for model evaluations
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import confusion_matrix, accuracy_score, roc_curve, auc, roc_auc_score

In [2]:
data = pd.read_csv('heloc_dataset_v1.csv')
for i in data.columns:
    data = data[data[i] != -9]

In [3]:
data = data.join(pd.get_dummies(data.RiskPerformance, drop_first=True)).drop('RiskPerformance', axis=1)
data = data.rename(columns = {'Good':'RiskPerformance_Good'})

In [4]:
data = data.replace([-8, -7], np.nan)
data.head(10)

Unnamed: 0,ExternalRiskEstimate,MSinceOldestTradeOpen,MSinceMostRecentTradeOpen,AverageMInFile,NumSatisfactoryTrades,NumTrades60Ever2DerogPubRec,NumTrades90Ever2DerogPubRec,PercentTradesNeverDelq,MSinceMostRecentDelq,MaxDelq2PublicRecLast12M,...,MSinceMostRecentInqexcl7days,NumInqLast6M,NumInqLast6Mexcl7days,NetFractionRevolvingBurden,NetFractionInstallBurden,NumRevolvingTradesWBalance,NumInstallTradesWBalance,NumBank2NatlTradesWHighUtilization,PercentTradesWBalance,RiskPerformance_Good
0,55,144.0,4,84,20,3,0,83,2.0,3,...,0.0,0,0,33.0,,8.0,1.0,1.0,69.0,0
1,61,58.0,15,41,2,4,4,100,,0,...,0.0,0,0,0.0,,0.0,,,0.0,0
2,67,66.0,5,24,9,0,0,100,,7,...,0.0,4,4,53.0,66.0,4.0,2.0,1.0,86.0,0
3,66,169.0,1,73,28,1,1,93,76.0,6,...,0.0,5,4,72.0,83.0,6.0,4.0,3.0,91.0,0
4,81,333.0,27,132,12,0,0,100,,7,...,0.0,1,1,51.0,89.0,3.0,1.0,0.0,80.0,0
5,59,137.0,11,78,31,0,0,91,1.0,4,...,0.0,0,0,62.0,93.0,12.0,4.0,3.0,94.0,0
6,54,88.0,7,37,25,0,0,92,9.0,4,...,0.0,4,4,89.0,76.0,7.0,7.0,2.0,100.0,1
7,68,148.0,7,65,17,0,0,83,31.0,6,...,0.0,0,0,28.0,48.0,2.0,2.0,2.0,40.0,1
8,59,324.0,2,138,24,0,0,85,5.0,4,...,0.0,1,1,68.0,,7.0,1.0,3.0,90.0,0
9,61,79.0,4,36,19,0,0,95,5.0,4,...,0.0,6,6,31.0,86.0,5.0,3.0,1.0,62.0,0


In [5]:
for col in data.columns:
    median = data[col].median()
    data[col].fillna(median, inplace=True)

In [6]:
df1 = pd.get_dummies(data.MaxDelq2PublicRecLast12M, prefix='MaxDelq2PublicRecLast12M')
df1['MaxDelq2PublicRecLast12M_5or6'] = df1.iloc[:,5]+df1.iloc[:,6]
df1 = df1.drop(columns = ['MaxDelq2PublicRecLast12M_5', 'MaxDelq2PublicRecLast12M_6'])
df1.head()

Unnamed: 0,MaxDelq2PublicRecLast12M_0,MaxDelq2PublicRecLast12M_1,MaxDelq2PublicRecLast12M_2,MaxDelq2PublicRecLast12M_3,MaxDelq2PublicRecLast12M_4,MaxDelq2PublicRecLast12M_7,MaxDelq2PublicRecLast12M_9,MaxDelq2PublicRecLast12M_5or6
0,0,0,0,1,0,0,0,0
1,1,0,0,0,0,0,0,0
2,0,0,0,0,0,1,0,0
3,0,0,0,0,0,0,0,1
4,0,0,0,0,0,1,0,0


In [7]:
df2 = pd.get_dummies(data.MaxDelqEver, prefix='MaxDelqEver')
df2.head()

Unnamed: 0,MaxDelqEver_2,MaxDelqEver_3,MaxDelqEver_4,MaxDelqEver_5,MaxDelqEver_6,MaxDelqEver_7,MaxDelqEver_8
0,0,0,0,1,0,0,0
1,0,0,0,0,0,0,1
2,0,0,0,0,0,0,1
3,0,0,0,0,1,0,0
4,0,0,0,0,0,0,1


In [8]:
data = data.join(df1).join(df2).drop(columns=['MaxDelq2PublicRecLast12M', 'MaxDelqEver'])
data.head()

Unnamed: 0,ExternalRiskEstimate,MSinceOldestTradeOpen,MSinceMostRecentTradeOpen,AverageMInFile,NumSatisfactoryTrades,NumTrades60Ever2DerogPubRec,NumTrades90Ever2DerogPubRec,PercentTradesNeverDelq,MSinceMostRecentDelq,NumTotalTrades,...,MaxDelq2PublicRecLast12M_7,MaxDelq2PublicRecLast12M_9,MaxDelq2PublicRecLast12M_5or6,MaxDelqEver_2,MaxDelqEver_3,MaxDelqEver_4,MaxDelqEver_5,MaxDelqEver_6,MaxDelqEver_7,MaxDelqEver_8
0,55,144.0,4,84,20,3,0,83,2.0,23,...,0,0,0,0,0,0,1,0,0,0
1,61,58.0,15,41,2,4,4,100,15.0,7,...,0,0,0,0,0,0,0,0,0,1
2,67,66.0,5,24,9,0,0,100,15.0,9,...,1,0,0,0,0,0,0,0,0,1
3,66,169.0,1,73,28,1,1,93,76.0,30,...,0,0,1,0,0,0,0,1,0,0
4,81,333.0,27,132,12,0,0,100,15.0,12,...,1,0,0,0,0,0,0,0,0,1


In [9]:
# Split the data into training dataset and testing dataset
from sklearn.model_selection import train_test_split
X = data.drop('RiskPerformance_Good', axis=1)
y = data['RiskPerformance_Good']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=1)

In [10]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [11]:
dfx = pd.DataFrame(X_train_scaled, columns = X_train.columns)
dfx

Unnamed: 0,ExternalRiskEstimate,MSinceOldestTradeOpen,MSinceMostRecentTradeOpen,AverageMInFile,NumSatisfactoryTrades,NumTrades60Ever2DerogPubRec,NumTrades90Ever2DerogPubRec,PercentTradesNeverDelq,MSinceMostRecentDelq,NumTotalTrades,...,MaxDelq2PublicRecLast12M_7,MaxDelq2PublicRecLast12M_9,MaxDelq2PublicRecLast12M_5or6,MaxDelqEver_2,MaxDelqEver_3,MaxDelqEver_4,MaxDelqEver_5,MaxDelqEver_6,MaxDelqEver_7,MaxDelqEver_8
0,-0.003563,0.407321,-0.199155,0.220121,0.613418,-0.471416,-0.391434,0.055398,0.034805,0.574479,...,-0.866794,-0.01126,1.398361,-0.288219,-0.181291,-0.181663,-0.325578,1.566877,-0.120014,-0.923081
1,1.214016,1.436443,-0.199155,0.101652,0.790970,-0.471416,-0.391434,0.647162,-0.228167,0.883737,...,1.153676,-0.01126,-0.715123,-0.288219,-0.181291,-0.181663,-0.325578,-0.638212,-0.120014,1.083329
2,1.416946,-0.580220,0.934861,0.427441,-1.250876,-0.471416,-0.391434,0.647162,-0.228167,-1.203758,...,1.153676,-0.01126,-0.715123,-0.288219,-0.181291,-0.181663,-0.325578,-0.638212,-0.120014,1.083329
3,-0.206493,-0.050066,-0.766164,0.072034,0.879746,0.338898,0.618586,0.647162,-0.228167,0.961052,...,-0.866794,-0.01126,1.398361,-0.288219,-0.181291,-0.181663,-0.325578,-0.638212,-0.120014,1.083329
4,0.503762,-0.309946,-0.685162,0.101652,1.234849,0.338898,-0.391434,0.224473,0.560750,1.115681,...,-0.866794,-0.01126,1.398361,-0.288219,-0.181291,-0.181663,3.071459,-0.638212,-0.120014,-0.923081
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7883,0.706691,1.270120,-0.685162,0.397824,0.524642,-0.471416,-0.391434,0.647162,-0.228167,0.574479,...,1.153676,-0.01126,-0.715123,-0.288219,-0.181291,-0.181663,-0.325578,-0.638212,-0.120014,1.083329
7884,0.097902,1.301306,2.473884,1.937921,-0.806996,-0.471416,-0.391434,0.647162,-0.228167,-0.817185,...,1.153676,-0.01126,-0.715123,-0.288219,-0.181291,-0.181663,-0.325578,-0.638212,-0.120014,1.083329
7885,0.199367,1.322096,-0.685162,0.042417,1.589953,-0.471416,-0.391434,0.647162,-0.228167,1.502254,...,1.153676,-0.01126,-0.715123,-0.288219,-0.181291,-0.181663,-0.325578,-0.638212,-0.120014,1.083329
7886,-1.424072,-0.393107,0.448854,-0.312990,-1.250876,1.149211,1.628607,-1.466283,0.034805,-1.126443,...,-0.866794,-0.01126,-0.715123,3.469584,-0.181291,-0.181663,-0.325578,-0.638212,-0.120014,-0.923081


## Logistic Regression

In [12]:
# build logistic regression model
lr_model = LogisticRegression()

# testing parameters using grid research
param_grid_lr = {"C":np.logspace(-3,3,7), "penalty":["l1","l2"]} # l1 lasso l2 ridge
lr_gs = GridSearchCV(lr_model, param_grid_lr, cv=5)
lr_gs.fit(X_train_scaled, y_train)
lr_gs.best_score_

0.7365618661257607

In [13]:
lr_gs.best_params_

{'C': 0.01, 'penalty': 'l2'}

In [14]:
# apply to validation dataset
lr_model = LogisticRegression(C=0.1,penalty='l2')
lr_model.fit(X_train_scaled, y_train)

LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [15]:
# test the model in the validation dataset
lr_score = lr_model.score(X_test_scaled, y_test)
lr_score

0.7181956411556006

In [16]:
# calculate auc score
lrscore = lr_model.fit(X_train_scaled, y_train).decision_function(X_test_scaled)
lr_auc_score = roc_auc_score(y_test, lrscore)
lr_auc_score

0.7886719603550565

## KNN

In [17]:
# testing different k-value accuracy
knn_model = KNeighborsClassifier()
param_grid_knn = {'n_neighbors': np.arange(1, 23)}
knn_gs = GridSearchCV(knn_model, param_grid_knn, cv=5)
knn_gs.fit(X_train_scaled, y_train)
knn_gs.best_score_

0.7247718052738337

In [18]:
knn_gs.best_params_

{'n_neighbors': 22}

In [19]:
# apply to validation set
knn_model = KNeighborsClassifier(22)
knn_model.fit(X_test_scaled, y_test)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=22, p=2,
                     weights='uniform')

In [20]:
# validation set accuracy
knn_score = knn_model.score(X_test_scaled, y_test)
knn_score

0.7192093258996453

## SVM

In [40]:
# build SVM models: including linear, poly, rbf
svm_model = SVC()

# after comparing the SVM using above three kernels, the rbf is most likely to fit better
# explore accuracy of the SVM using rbf kernel with different parameters
param_grid_svm = {'C':[0.1,1,10,100],'kernel':['rbf'], 'gamma':[0.001, 0.01, 0.1], 'max_iter':[-1],'random_state':[1]}
svm_gs = GridSearchCV(svm_model, param_grid_svm, cv=5, scoring='accuracy')
svm_gs.fit(X_train_scaled, y_train)
svm_gs.best_params_

{'C': 10, 'gamma': 0.001, 'kernel': 'rbf', 'max_iter': -1, 'random_state': 1}

In [41]:
svm_gs.best_score_

0.7371957403651116

In [42]:
# apply to test dataset
svm_model = SVC(C=10, kernel='rbf', gamma=0.001, max_iter=-1, random_state=1)
svm_model.fit(X_test_scaled, y_test)

SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.001, kernel='rbf',
    max_iter=-1, probability=False, random_state=1, shrinking=True, tol=0.001,
    verbose=False)

In [43]:
svm_score = svm_model.score(X_test_scaled, y_test)
svm_score

0.7344145970603142

In [44]:
# calculate auc score
svmscore = svm_model.fit(X_train_scaled, y_train).decision_function(X_test_scaled)
svm_auc_score = roc_auc_score(y_test, svmscore)
svm_auc_score

0.7890667194386382

## Single decision Tree, bagging, random forest

In [26]:
# scale the dataset for tree-based models
scaler = StandardScaler()
for columns in data.columns:
    if columns != 'RiskPerformance_Good':
        data[columns] = scaler.fit_transform(data[columns].values.reshape(-1,1))
X_dt = data.drop('RiskPerformance_Good',axis = 1)
y_dt = data['RiskPerformance_Good']

In [27]:
random.seed(123)
ST_max = []
n_ST = []
Bagging_max = []
n_Bagging = []
RF_max = []
n_RF = []
df_tree = pd.DataFrame()

In [28]:
# use for loops to try singletree, tree(bagging) and random forest by using many different hyperparameters
# find the best model by tring tree depth from 1 to 5
for tree_depth in range(1,6,1):
    st_model = DecisionTreeClassifier(max_depth=tree_depth)
    st_model = st_model.fit(X_dt, y_dt)    
    st_scores = cross_val_score(st_model, X_dt, y_dt, cv=5)

# create a Bagging and RF classifier
    base_clf = DecisionTreeClassifier(max_depth=tree_depth) # base classifier
    res = []
    n_range = range(1,40,1)
    # find the best model by tring estimators from 1 to 40
    for n in n_range:
        # bagging classifier with n bootstrapped data sets
        bagging_model = BaggingClassifier(n_estimators=n, base_estimator=base_clf)
        bagging_scores = cross_val_score(bagging_model, X_dt, y_dt, cv=5)
    
        # random forest classifier with n bootstrapped data sets m=sqrt(p)
        rf_model = RandomForestClassifier(max_features="sqrt", n_estimators=n, max_depth=tree_depth)
        rf_scores = cross_val_score(rf_model, X_dt, y_dt, cv=5)
        
        res.append((n, bagging_scores.mean(), bagging_scores.std(), rf_scores.mean(), rf_scores.std(), st_scores.mean()))
    
    df_accuracy = pd.DataFrame(data=res,columns=['n','Bagging accuracy','Bagging error','RF accuracy','RF error','ST accuracy'])
    df_accuracy.index=df_accuracy['n']
    df_accuracy = df_accuracy[['Bagging accuracy','RF accuracy','ST accuracy']]
    Bagging_max.append(max(df_accuracy['Bagging accuracy']))
    RF_max.append(max(df_accuracy['RF accuracy']))
    ST_max.append(max(df_accuracy['ST accuracy']))
    
    n_Bagging.append(df_accuracy.idxmax()['Bagging accuracy'])
    n_RF.append(df_accuracy.idxmax()['RF accuracy'])
    n_ST.append(df_accuracy.idxmax()['ST accuracy'])
    
df_accuracy.idxmax()['Bagging accuracy']
df_tree['RF_max'] = RF_max
df_tree['n_RF'] = n_RF 
df_tree['Bagging_max'] = Bagging_max
df_tree['n_Bagging'] = n_Bagging
df_tree['ST_max'] = ST_max
Tree_Depth = df_tree.index+1
df_tree['Tree_Depth'] = Tree_Depth
df_tree

Unnamed: 0,RF_max,n_RF,Bagging_max,n_Bagging,ST_max,Tree_Depth
0,0.712605,31,0.708852,1,0.70429,1
1,0.719707,20,0.709257,4,0.70429,2
2,0.724572,24,0.720819,24,0.705609,3
3,0.727716,35,0.723964,15,0.714227,4
4,0.729947,38,0.727207,33,0.707737,5


In [29]:
bagging_score = max(df_tree['Bagging_max'])
bagging_score

0.7272074065330979

In [30]:
st_score = max(df_tree['ST_max'])
st_score

0.7142270988121477

## Boosting

In [31]:
# tree(boosting)
boost_model = AdaBoostClassifier()
boost_model = boost_model.fit(X_dt, y_dt)
boost_score = boost_model.score(X_dt, y_dt)
boost_score

0.7349153229895549

In [32]:
# use GridSearchCV to find the best hyperparameters for tree(boosting)
param_grid_boost = [{'n_estimators':[10,20,30,40,50],'learning_rate':[0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1]}]
boost_model_reg = AdaBoostRegressor()
boost_model_reg
boost_gs = GridSearchCV(boost_model_reg, param_grid_boost, cv=5, scoring='neg_mean_squared_error')
boost_gs.fit(X_dt,y_dt)
boost_cvres = boost_gs.cv_results_
for mean_score, params in zip(boost_cvres["mean_test_score"], boost_cvres["params"]):  
    print(np.sqrt(-mean_score), params)

0.4389928057500797 {'learning_rate': 0.1, 'n_estimators': 10}
0.43638880670300784 {'learning_rate': 0.1, 'n_estimators': 20}
0.4367048139793892 {'learning_rate': 0.1, 'n_estimators': 30}
0.4362628496429353 {'learning_rate': 0.1, 'n_estimators': 40}
0.4374553634271099 {'learning_rate': 0.1, 'n_estimators': 50}
0.4380469462942017 {'learning_rate': 0.2, 'n_estimators': 10}
0.43889608622100723 {'learning_rate': 0.2, 'n_estimators': 20}
0.4383067040710816 {'learning_rate': 0.2, 'n_estimators': 30}
0.4377906082470449 {'learning_rate': 0.2, 'n_estimators': 40}
0.4375001425260739 {'learning_rate': 0.2, 'n_estimators': 50}
0.437332992570143 {'learning_rate': 0.3, 'n_estimators': 10}
0.437685660610373 {'learning_rate': 0.3, 'n_estimators': 20}
0.4396000010957063 {'learning_rate': 0.3, 'n_estimators': 30}
0.43824468882297146 {'learning_rate': 0.3, 'n_estimators': 40}
0.4375906172635625 {'learning_rate': 0.3, 'n_estimators': 50}
0.44078562158691276 {'learning_rate': 0.4, 'n_estimators': 10}
0.4395

In [33]:
# from the result we can see that when learning rate = 0.9 and n_estimators = 20, tree(boosting) has the highest accuracy
boost_model = AdaBoostClassifier(n_estimators = 20, learning_rate = 0.9)
boost_model = boost_model.fit(X_dt, y_dt)
boost_model.score(X_dt, y_dt)
# The accuracy of tree(boosting) is 0.72893

0.7289321569820505

In [34]:
# By comparing single tree, tree(bagging) and random forest, we find that When random forest has the highest accuracy
# when tree depth = 5, estimators = 32. The accuracy is 0.739783 using the entire dataset.

# Best Model:
rf_model = RandomForestClassifier(max_features="sqrt", n_estimators=28, max_depth=5)
rf_model.fit(X_dt, y_dt)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=5, max_features='sqrt', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=28,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [35]:
rf_score = rf_model.score(X_dt, y_dt)
rf_score

0.7397829834702363

## Model results

In [45]:
models_res = pd.DataFrame()
models_res['model_names'] = ['Logistic regression', 'KNN', 'SVM', 'Single decision tree', 'Bagging', 'Boosting', 'Random forest']
models_res['model_score'] = [lr_score, knn_score, svm_score, st_score, bagging_score, boost_score, rf_score]
models_res

Unnamed: 0,model_names,model_score
0,Logistic regression,0.718196
1,KNN,0.719209
2,SVM,0.734415
3,Single decision tree,0.714227
4,Bagging,0.727207
5,Boosting,0.734915
6,Random forest,0.739783


In [54]:
print('The best model is:', models_res['model_names'][6], '\n')
print('Its accuracy (i.e. model score)', round(max(models_res['model_score']), 2), 'is the highest among the models.')

The best model is: Random forest 

Its accuracy (i.e. model score) 0.74 is the highest among the models.


In [None]:
#generate file
pickle.dump(X_train, open('X_train.sav', 'wb'))
pickle.dump(X_dt, open('X_dt.sav', 'wb'))
pickle.dump(y_dt, open('y_dt.sav', 'wb'))
pickle.dump(lr_model, open('LogR.sav', 'wb'))
pickle.dump(knn_model, open('KNN.sav', 'wb'))
pickle.dump(svm_model, open('SVMrbf.sav', 'wb'))
pickle.dump(rf_model, open('RF.sav', 'wb'))
pickle.dump(boost_model, open('Boosting.sav', 'wb'))
pickle.dump(X_test, open('X_test.sav', 'wb'))
pickle.dump(y_test, open('y_test.sav', 'wb'))
pickle.dump(df_tree, open('df_tree.sav', 'wb'))