In [14]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

df = pd.read_csv("bank_dataset.csv")

#Deleting ID as useless variable

del df['ID']

#Dropping education variables 0,5,6 & Marriage status 0
#since we don't know what these are per UCI page and these are such a small portion of data less than 3%

df_new = df[(df.EDUCATION !=0)&(df.EDUCATION !=5) &
            (df.EDUCATION !=6) & (df.MARRIAGE!= 0)
           ]
#Sex, Education & Marriage are already int values, so need to transform them
df_new.info()
round(df.describe(),4)

#This is concerning that we have an imbalanced dataset
#We will need to address this imbalance by uppersampling.
print('---------------------------------------------')
df_new = df_new.rename(columns={'default payment next month' : 'default_payment_next_month'})
print("default payment next month")
df_new['default_payment_next_month'].value_counts(normalize=True)


<class 'pandas.core.frame.DataFrame'>
Int64Index: 29601 entries, 0 to 29999
Data columns (total 24 columns):
 #   Column                      Non-Null Count  Dtype
---  ------                      --------------  -----
 0   LIMIT_BAL                   29601 non-null  int64
 1   SEX                         29601 non-null  int64
 2   EDUCATION                   29601 non-null  int64
 3   MARRIAGE                    29601 non-null  int64
 4   AGE                         29601 non-null  int64
 5   PAY_0                       29601 non-null  int64
 6   PAY_2                       29601 non-null  int64
 7   PAY_3                       29601 non-null  int64
 8   PAY_4                       29601 non-null  int64
 9   PAY_5                       29601 non-null  int64
 10  PAY_6                       29601 non-null  int64
 11  BILL_AMT1                   29601 non-null  int64
 12  BILL_AMT2                   29601 non-null  int64
 13  BILL_AMT3                   29601 non-null  int64
 14  BILL_A

0    0.776866
1    0.223134
Name: default_payment_next_month, dtype: float64

In [17]:
#Training and testing split.
from sklearn.model_selection import train_test_split
from sklearn.model_selection import ShuffleSplit
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold
from sklearn import metrics as mt
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from collections import Counter
import imblearn
import numpy as np
import random


#Setting Seed
#will use this in CV portion
random.seed(10)
seed = random.randint(1,500)
print("seed is:",seed)

#Before we do any model building, we need to upper sample "1" which is defaults.

from imblearn.over_sampling import SMOTE

if 'default_payment_next_month' in df_new:
    y = df_new['default_payment_next_month'].values
    del df_new['default_payment_next_month']
    X = df_new.values

#Saving out the column names , so we can make dataframes later on 
col = ['LIMIT_BAL','SEX','EDUCATION','MARRIAGE','AGE','PAY_0',
       'PAY_2','PAY_3','PAY_4','PAY_5','PAY_6','BILL_AMT1','BILL_AMT2',
       'BILL_AMT3','BILL_AMT4','BILL_AMT5','BILL_AMT6','PAY_AMT1','PAY_AMT2',
       'PAY_AMT3','PAY_AMT4','PAY_AMT5','PAY_AMT6']


#Code followed from https://machinelearningmastery.com/smote-oversampling-for-imbalanced-classification/
os = SMOTE(random_state=99)


#Our new datasets to use will be X_res,y_res
#X_res is the resample dataset that is now more balance
#Y_res is the target column that is now more balance.

X_res, y_res = os.fit_resample(X, y)

#Previous class distribtion
counter = Counter(y)
print(counter)

# summarize the new class distribution
counter_res = Counter(y_res)
print("OS_breakout",counter_res)
print("Since we OS, we now have a more balance dataset.This is clear by the counter counts")



#Setting up  the CV, code utlized from 04 Logits & SVM notebook
num_cv_iterations = 10
num_instances = len(y)
cv_object = ShuffleSplit(n_splits=num_cv_iterations,
                         random_state = seed,
                         test_size  = 0.2)

#code utlized from 04 Logits & SVM notebook
std_scl = StandardScaler()
lr_clf = LogisticRegression(penalty='l2', C=0.05, solver='liblinear') 

# create the pipline
#code utlized from 04 Logits & SVM notebook
piped_object = Pipeline([('scale', std_scl),  # do this
                         ('logit_model', lr_clf)]) # and then do this


#Calc the weights
#code utlized from 04 Logits & SVM notebook
#We trained our model(including all variables at the moment) on 10 CV with random state set to the seed variable
#We are looping through and calcing ACC,Spec and Sensitivty.
#This section utliztied code from unit 4 notebook &
#https://statinfer.com/204-4-2-calculating-sensitivity-and-specificity-in-python/



weights = []
for iter_num, (train_indices, test_indices) in enumerate(cv_object.split(X_res,y_res)):
    piped_object.fit(X_res[train_indices],y_res[train_indices])  # train object
    y_hat = piped_object.predict(X_res[test_indices]) # get test set precitions
    weights.append(piped_object.named_steps['logit_model'].coef_[0])
    
    
    cm1 = mt.confusion_matrix(y_res[test_indices],y_hat)
    
    sensitivity1= cm1[0,0]/(cm1[0,0]+cm1[0,1])
    
    specificity1= cm1[1,1]/(cm1[1,0]+cm1[1,1])
    
    # print the accuracy and confusion matrix 
    print("====Iteration",iter_num," ====")
    print("Accuracy", round(mt.accuracy_score(y_res[test_indices],y_hat),2))
   
    
    print('sensitivity:',round(sensitivity1,2))
    print('specificity:',round(specificity1,2))

weights = np.array(weights)


seed is: 293
Counter({0: 22996, 1: 6605})
OS_breakout Counter({1: 22996, 0: 22996})
Since we OS, we now have a more balance dataset.This is clear by the counter counts
====Iteration 0  ====
Accuracy 0.73
sensitivity: 0.73
specificity: 0.72
====Iteration 1  ====
Accuracy 0.72
sensitivity: 0.72
specificity: 0.72
====Iteration 2  ====
Accuracy 0.72
sensitivity: 0.73
specificity: 0.71
====Iteration 3  ====
Accuracy 0.72
sensitivity: 0.73
specificity: 0.71
====Iteration 4  ====
Accuracy 0.72
sensitivity: 0.72
specificity: 0.72
====Iteration 5  ====
Accuracy 0.73
sensitivity: 0.73
specificity: 0.72
====Iteration 6  ====
Accuracy 0.73
sensitivity: 0.73
specificity: 0.72
====Iteration 7  ====
Accuracy 0.72
sensitivity: 0.73
specificity: 0.71
====Iteration 8  ====
Accuracy 0.72
sensitivity: 0.72
specificity: 0.72
====Iteration 9  ====
Accuracy 0.72
sensitivity: 0.72
specificity: 0.72


# Weights of General Log model

In [18]:
#Is this correct? I'm not sure if it's refering to model after scaling 

zip_vars = zip(lr_clf.coef_.T,df_new.columns) # combine attributes
zip_vars = sorted(zip_vars)
for coef, name in zip_vars:
    print(name, 'has weight of', round(coef[0],4)) 


MARRIAGE has weight of -0.5677
BILL_AMT1 has weight of -0.5167
SEX has weight of -0.3953
EDUCATION has weight of -0.3847
PAY_AMT1 has weight of -0.2727
PAY_AMT2 has weight of -0.2419
LIMIT_BAL has weight of -0.1823
PAY_AMT3 has weight of -0.0929
AGE has weight of -0.0898
PAY_AMT4 has weight of -0.0703
PAY_AMT5 has weight of -0.062
PAY_AMT6 has weight of -0.036
BILL_AMT5 has weight of -0.0037
PAY_6 has weight of 0.008
PAY_4 has weight of 0.0273
BILL_AMT4 has weight of 0.0297
BILL_AMT6 has weight of 0.0358
PAY_3 has weight of 0.071
BILL_AMT3 has weight of 0.0815
PAY_5 has weight of 0.0912
PAY_2 has weight of 0.0987
BILL_AMT2 has weight of 0.3155
PAY_0 has weight of 0.6212


In [19]:
#For building our logic model, we used RFE
# we utlized code from https://towardsdatascience.com/building-a-logistic-regression-in-python-step-by-step-becd4d56c9c8

#Since our model has a bunch of coffeicnets , we're going to use RFE in order to figure out which variables to elimate from our
#model. once we near our list, then we will have completed building a logistic regression model!

from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler


#I picked 10 features as the number of features to select 
#Not sure if we need to make logisticregression equation = previous ones
 
rfe = RFE(lr_clf, 10)


rfe = rfe.fit(X_res, y_res.ravel())

print(rfe.support_)
print(rfe.ranking_)
#This gives us the column names from RFE feature selection.
df_new.columns[rfe.support_]



[False  True  True  True  True  True  True  True  True  True  True False
 False False False False False False False False False False False]
[13  1  1  1  1  1  1  1  1  1  1  4  5 11 14 12 10  2  3  7  6  8  9]


Index(['SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'PAY_0', 'PAY_2', 'PAY_3',
       'PAY_4', 'PAY_5', 'PAY_6'],
      dtype='object')

In [20]:
#The above True/False line up with the columns
#'SEX','EDUCATION','EDUCATION','AGE','PAY_0','PAY_2','PAY_3','PAY_4','PAY_5','PAY_6','PAY_AMT1'
#So we need to reorg the dataframe X_res then continue with the process

X_rs = pd.DataFrame(data=X_res,columns=col)

imp_col = ['SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'PAY_0', 'PAY_2', 'PAY_3',
          'PAY_4', 'PAY_5', 'PAY_6']

X_rs  = X_rs[imp_col]

y_rs = pd.DataFrame(data=y_res,columns=['default_payment_next_month'])

#Checking the heads to make sure it worked
X_rs.head()


Unnamed: 0,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6
0,2,2,1,24,2,2,-1,-1,-2,-2
1,2,2,2,26,-1,2,0,0,0,2
2,2,2,2,34,0,0,0,0,0,0
3,2,2,1,37,0,0,0,0,0,0
4,1,2,1,57,-1,0,-1,0,0,0


In [9]:
#Now we check the P-values of our model to futhur elimate variables!
import statsmodels.api as sm
logit_model=sm.Logit(y_rs,X_rs)
result=logit_model.fit()
print(result.summary2())

Optimization terminated successfully.
         Current function value: 0.605215
         Iterations 5
                              Results: Logit
Model:              Logit                      Pseudo R-squared: 0.127     
Dependent Variable: default_payment_next_month AIC:              55690.1095
Date:               2021-02-07 20:08           BIC:              55777.4717
No. Observations:   45992                      Log-Likelihood:   -27835.   
Df Model:           9                          LL-Null:          -31879.   
Df Residuals:       45982                      LLR p-value:      0.0000    
Converged:          1.0000                     Scale:            1.0000    
No. Iterations:     5.0000                                                 
------------------------------------------------------------------------------
                Coef.     Std.Err.       z        P>|z|      [0.025     0.975]
------------------------------------------------------------------------------
SEX     

In [21]:
#So based on the output above, We are only removing PAY_6 since it isn't signficiant as per pvalue of .1521 &
#PAY_4 0.0862 
p_cols =  ['SEX','EDUCATION','AGE','PAY_0','PAY_2','PAY_3','PAY_5']

X_cols = X_rs[p_cols]


X_cols

Unnamed: 0,SEX,EDUCATION,AGE,PAY_0,PAY_2,PAY_3,PAY_5
0,2,2,24,2,2,-1,-2
1,2,2,26,-1,2,0,0
2,2,2,34,0,0,0,0
3,2,2,37,0,0,0,0
4,1,2,57,-1,0,-1,0
...,...,...,...,...,...,...,...
45987,1,1,49,-1,-2,-2,-1
45988,2,2,27,1,0,0,0
45989,2,1,23,0,0,1,0
45990,1,2,24,-1,-1,-1,-2


# Final Logistic Regression model

In [22]:
#Now we are going to reuse the code from above and score our final logic regression model!


X_res = X_cols.values


#Setting up  the CV, code utlized from 04 Logits & SVM notebook
num_cv_iterations = 10
num_instances = len(y)
cv_object = ShuffleSplit(n_splits=num_cv_iterations,
                         random_state = seed,
                         test_size  = 0.2)

#code utlized from 04 Logits & SVM notebook
std_scl = StandardScaler()
lr_clf = LogisticRegression(penalty='l2', C=0.05, solver='liblinear') 

# create the pipline
#code utlized from 04 Logits & SVM notebook
piped_object = Pipeline([('scale', std_scl),  # do this
                         ('logit_model', lr_clf)]) # and then do this



weights_final = []
for iter_num, (train_indices, test_indices) in enumerate(cv_object.split(X_res,y_res)):
    piped_object.fit(X_res[train_indices],y_res[train_indices])  # train object
    y_hat = piped_object.predict(X_res[test_indices])
    weights_final.append(piped_object.named_steps['logit_model'].coef_[0])
    
    cm1 = mt.confusion_matrix(y_res[test_indices],y_hat)
    
    sensitivity1= cm1[0,0]/(cm1[0,0]+cm1[0,1])
    
    specificity1= cm1[1,1]/(cm1[1,0]+cm1[1,1])
    
    # print the accuracy and confusion matrix 
    print("====Iteration",iter_num," ====")
    print("Accuracy", round(mt.accuracy_score(y_res[test_indices],y_hat),2))
   
    
    print('sensitivity:',round(sensitivity1,2))
    print('specificity:',round(specificity1,2))

weights_final = np.array(weights_final)





====Iteration 0  ====
Accuracy 0.7
sensitivity: 0.71
specificity: 0.69
====Iteration 1  ====
Accuracy 0.69
sensitivity: 0.7
specificity: 0.68
====Iteration 2  ====
Accuracy 0.68
sensitivity: 0.7
specificity: 0.67
====Iteration 3  ====
Accuracy 0.68
sensitivity: 0.69
specificity: 0.67
====Iteration 4  ====
Accuracy 0.69
sensitivity: 0.7
specificity: 0.68
====Iteration 5  ====
Accuracy 0.69
sensitivity: 0.71
specificity: 0.68
====Iteration 6  ====
Accuracy 0.69
sensitivity: 0.7
specificity: 0.68
====Iteration 7  ====
Accuracy 0.68
sensitivity: 0.7
specificity: 0.67
====Iteration 8  ====
Accuracy 0.69
sensitivity: 0.7
specificity: 0.68
====Iteration 9  ====
Accuracy 0.69
sensitivity: 0.7
specificity: 0.68


In [23]:
#Weights of the final model, Again not sure if this code is correct or not.

print('---------------------------------------------')
zip_vars = zip(lr_clf.coef_.T,X_cols.columns) # combine attributes
zip_vars = sorted(zip_vars)
for coef, name in zip_vars:
    print(name, 'has weight of', round(coef[0],4))

---------------------------------------------
SEX has weight of -0.3795
EDUCATION has weight of -0.2841
AGE has weight of 0.0664
PAY_5 has weight of 0.0908
PAY_3 has weight of 0.0963
PAY_2 has weight of 0.1318
PAY_0 has weight of 0.6629


In [24]:
#Put interpation of weights

#Need to finalize ROC curve for final logistric model.


# SVM

In [25]:
%%time
##Just to get an idea, this can be deleted later.
scl_obj = StandardScaler()

# okay, so run through the cross validation loop and set the training and testing variable for one single iteration
for train_indices, test_indices in cv_object.split(X_res,y_res): 
    # I will create new variables here so that it is more obvious what 
    # the code is doing (you can compact this syntax and avoid duplicating memory,
    # but it makes this code less readable)
    X_train = X_res[train_indices]
    y_train = y_res[train_indices]
    
    X_test = X_res[test_indices]
    y_test = y_res[test_indices]
    


from sklearn.svm import SVC

# train the model just as before
svm_clf = SVC(C=0.5, kernel='rbf', degree=3, gamma='auto') # get object
svm_clf.fit(X_train, y_train)  # train object

y_hat = svm_clf.predict(X_test) # get test set precitions

acc = mt.accuracy_score(y_test,y_hat)
conf = mt.confusion_matrix(y_test,y_hat)
print('accuracy:', acc )
print(conf)



accuracy: 0.7175779976084357
[[3511 1074]
 [1524 3090]]
Wall time: 36.6 s


In [None]:
%%time
#followed example from Unit 4 workbook
from sklearn.svm import SVC

#Support vector
svm_clf = SVC(C=0.5, kernel='rbf', degree=3, gamma='auto')

piped_object_svm = Pipeline([('scale', std_scl),  # do this
                         ('logit_model', svm_clf)]) # and then do this


for iter_num, (train_indices, test_indices) in enumerate(cv_object.split(X_res,y_res)):
    piped_object_svm.fit(X_res[train_indices],y_res[train_indices])  
    y_hat = piped_object_svm.predict(X_res[test_indices])
    
    cm1 = mt.confusion_matrix(y_res[test_indices],y_hat)
    
    sensitivity1= cm1[0,0]/(cm1[0,0]+cm1[0,1])
    
    specificity1= cm1[1,1]/(cm1[1,0]+cm1[1,1])
    
    # print the accuracy and confusion matrix 
    print("====Iteration",iter_num," ====")
    print("Accuracy", round(mt.accuracy_score(y_res[test_indices],y_hat),2))
   
    
    print('sensitivity:',round(sensitivity1,2))
    print('specificity:',round(specificity1,2))
                            

Ok so we need to do a stochastic descent to reduce run time, as 10 mins to do above

In [None]:
%%time
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.svm import SVC



# Set the parameters by cross-validation
tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
                     'C': [1, 10, 100, 1000]},
                    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]

scores = ['precision', 'recall']



for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    clf = GridSearchCV(
        SVC(), tuned_parameters, scoring='%s_macro' % score
    )

    piped_object_svm = Pipeline([('scale', std_scl),  
                         ('logit_model', clf)])

  

    piped_object_svm.fit(X_res, y_res)

    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    y_true, y_pred = y_test, clf.predict(X_test)
    print(classification_report(y_true, y_pred))
    print()


In [None]:
%%time

from sklearn.linear_model import SGDClassifier

regularize_const = 0.1
iterations = 5

svm_sgd = SGDClassifier(alpha=regularize_const,
        fit_intercept=True, l1_ratio=0.0, learning_rate='optimal',
        loss='hinge', n_iter_no_change=iterations, n_jobs=-1, penalty='l2')

piped_object_svm = Pipeline([('scale', std_scl),  
                         ('svm', svm_sgd)])


for iter_num, (train_indices, test_indices) in enumerate(cv_object.split(X_res,y_res)):
    piped_object_svm.fit(X_res[train_indices],y_res[train_indices])  
    y_hat = piped_object_svm.predict(X_res[test_indices])

    cm1 = mt.confusion_matrix(y_res[test_indices],y_hat)
    
    sensitivity1= cm1[0,0]/(cm1[0,0]+cm1[0,1])
    
    specificity1= cm1[1,1]/(cm1[1,0]+cm1[1,1])
    
    # print the accuracy and confusion matrix 
    print("====Iteration",iter_num," ====")
    print("Accuracy", round(mt.accuracy_score(y_res[test_indices],y_hat),2))
   
    
    print('sensitivity:',round(sensitivity1,2))
    print('specificity:',round(specificity1,2))

In [None]:
%%time

from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size = 0.20)

param_grid = {'C': [0.1,1, 10, 100], 'gamma': [.001,.01,.1,1],'kernel': ['rbf', 'poly', 'sigmoid']}

grid = GridSearchCV(SVC(),param_grid,refit=True,verbose=2)
piped_object_grid = Pipeline([('scale', std_scl),  
                         ('svm', grid)])


piped_object_grid.fit(X_train,y_train)

print(piped_object_grid.best_estimator_)

Fitting 5 folds for each of 48 candidates, totalling 240 fits
[CV] END .....................C=0.1, gamma=0.001, kernel=rbf; total time=  45.0s
[CV] END .....................C=0.1, gamma=0.001, kernel=rbf; total time=  46.0s
[CV] END .....................C=0.1, gamma=0.001, kernel=rbf; total time=  45.8s
[CV] END .....................C=0.1, gamma=0.001, kernel=rbf; total time=  44.9s
[CV] END .....................C=0.1, gamma=0.001, kernel=rbf; total time=  46.2s
[CV] END ....................C=0.1, gamma=0.001, kernel=poly; total time=  29.7s
[CV] END ....................C=0.1, gamma=0.001, kernel=poly; total time=  29.5s
[CV] END ....................C=0.1, gamma=0.001, kernel=poly; total time=  29.0s
[CV] END ....................C=0.1, gamma=0.001, kernel=poly; total time=  29.3s
[CV] END ....................C=0.1, gamma=0.001, kernel=poly; total time=  29.7s
[CV] END .................C=0.1, gamma=0.001, kernel=sigmoid; total time=  38.8s
[CV] END .................C=0.1, gamma=0.001, k