In [1]:
import numpy as np 
import pandas as pd
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.preprocessing import MinMaxScaler
from matplotlib import pyplot as plt
from scipy import linalg
import matplotlib.pyplot as plt
from scipy import stats
import seaborn as sns
import pymc3 as pm

In [2]:
#reading in and splitting data

Df = pd.read_csv("cs-training.csv", index_col = 0)
print(len(Df))
Df = pd.DataFrame(Df)
Df = Df.dropna()
X_Cols = list(Df.columns)
X_Cols.remove("SeriousDlqin2yrs")
X = Df[X_Cols].to_numpy()
Y = Df["SeriousDlqin2yrs"].to_numpy()

150000


In [3]:
X_Train, X_Test, Y_Train, Y_Test = train_test_split(X, Y, test_size = 0.33, random_state = 42)

In [4]:
#resampling to correct data imbalance

Default = []
for i in range(len(X_Train)):
    if(Y_Train[i] == 1):
        Default.append(X_Train[i])
Default = np.asarray(Default)
        
Resampled = []
Resampled_y = []
Target = X_Train.shape[0]/2
Current = np.sum(Y_Train)

while(Current <= Target):
    Index = np.random.choice(Default.shape[0], 1)  
    Resampled.append(Default[Index].squeeze())
    Resampled_y.append(1)
    Current += 1
    Target += .498
    
X_Train_resamp = np.concatenate((X_Train, np.asarray(Resampled)))
Y_Train_resamp = np.concatenate((Y_Train, np.asarray(Resampled_y)))

# Premilinaries

In [None]:
# Correlation Matrix
plt.figure(figsize = (10, 10))
Corr_Matrix = Df.corr()
round(Corr_Matrix, 2)
Fig = sns.heatmap(Corr_Matrix, annot = True)
Figure = Fig.get_figure()    
Figure.savefig('CorrPlot.pdf', bbox_inches = "tight")

In [None]:
### Two by Two Plots
Plt = sns.pairplot(Df)
Plt
Figure1 = Plt.get_figure()    
Figure1.savefig('PairwisePlots.pdf', bbox_inches = "tight")

# Fisher LDA

In [5]:
mu = np.mean(X_Train)
X_Train_Demeaned = (X_Train - mu).T
X_Test_Demeaned = (X_Test - mu).T
S_t = np.cov(X_Train_Demeaned)
S_w = np.zeros(S_t.shape)
for c in np.unique(Y_Train):
    S_w += np.cov(X_Train_Demeaned[:, Y_Train == c])

S_b = S_t - S_w

Vals, Vecs = linalg.eig(np.linalg.inv(S_w)@S_b)
Vecs = Vecs[:, np.argsort(Vals)]

W_lda = Vecs[:, -1:].real

X_Train_Lda = (W_lda.T@X_Train_Demeaned).T
X_Test_Lda = (W_lda.T@X_Test_Demeaned).T

print("Scores for Train Are:")
print(np.mean(X_Train_Lda[Y_Train == 0]), np.mean(X_Train_Lda[Y_Train == 1]))

print("Scores for Test Are:")
print(np.mean(X_Test_Lda[Y_Test == 0]), np.mean(X_Test_Lda[Y_Test == 1]))

Scores for Train Are:
85.44373421738862 83.43138114569095
Scores for Test Are:
85.49241455142446 83.98570707599899


# Logistic Regression

In [6]:
from sklearn.linear_model import LogisticRegression
Clf = LogisticRegression(random_state = 0, max_iter = 1000).fit(X_Train_resamp, Y_Train_resamp)
Probs = Clf.predict_proba(X_Train_resamp)

Train_Predicted = np.zeros(len(Probs))
for i in range(len(Probs)):
    if (Probs[i, 0] > Probs[i, 1]):  
        Train_Predicted[i] = 0
    else:
        Train_Predicted[i] = 1  
        
Equal = 0
default = 0
non_default = 0
for i in range(len(Y_Train_resamp)):
    if(Y_Train_resamp[i] == 1):
        if(Y_Train_resamp[i] == Train_Predicted[i]):
            Equal += 1
            default += 1
    else:
        if(Y_Train_resamp[i] == Train_Predicted[i]):
            Equal += 1
            non_default += 1        


Score = Probs[:, 1]/(1 - Probs[:, 1])
print("Scores Are:")
print(np.sum(Score[Y_Train_resamp == 0]))
print(np.mean(Score[Y_Train_resamp == 0]), np.mean(Score[Y_Train_resamp == 1]))

Scores Are:
inf
inf inf


  Score = Probs[:, 1]/(1 - Probs[:, 1])


In [9]:
print("Classification Accuracy on Training Set is:")
print(Equal/len(Y_Train_resamp))
print("Classification Accuracy on Non-Default Training Set is:")
print(non_default/(len(Y_Train_resamp) - np.sum(Y_Train_resamp)))
print("Classification Accuracy on Default Training Set is:")
print(default/np.sum(Y_Train_resamp))

Classification Accuracy on Training Set is:
0.7261610589445283
Classification Accuracy on Non-Default Training Set is:
0.8706886207586161
Classification Accuracy on Default Training Set is:
0.5810997069410805


In [10]:
Probs = Clf.predict_proba(X_Test)
Test_Predicted = np.zeros(len(Probs))

for i in range(len(Probs)):
    if (Probs[i, 0] > Probs[i, 1]):  
        Test_Predicted[i] = 0
    else:
        Test_Predicted[i] = 1

        
Equal_test = 0
default_test = 0
non_default_test = 0
for i in range(len(Y_Test)):
    if(Y_Test[i] == 1):
        if(Y_Test[i] == Test_Predicted[i]):
            Equal_test += 1
            default_test += 1
    else:
        if(Y_Test[i] == Test_Predicted[i]):
            Equal_test += 1
            non_default_test += 1
        
        
        
Score = Probs[:, 1]/(1 - Probs[:, 1])
print("Scores Are:")
print(np.mean(Score[Y_Test == 0]), np.mean(Score[Y_Test == 1]))

Scores Are:
inf inf


  Score = Probs[:, 1]/(1 - Probs[:, 1])


In [11]:
print("Classification Accuracy on Test Set is:")
print(Equal_test/len(Y_Test))
print("Classification Accuracy on Non-Default Test Set is:")
print(non_default_test/(len(Y_Test) - np.sum(Y_Test)))
print("Classification Accuracy on Default Test Set is:")
print(default_test/np.sum(Y_Test))

Classification Accuracy on Test Set is:
0.8507143037113558
Classification Accuracy on Non-Default Test Set is:
0.8708104153683583
Classification Accuracy on Default Test Set is:
0.5841121495327103


In [None]:
#comparison with logit for our own checks

import statsmodels.api as sm

Logit_Model = sm.Logit(Y_Train, sm.add_constant(X_Train)).fit()
print (Logit_Model.summary())

In [None]:
Probs = Clf.predict_proba(X_Test)
Test_Predicted = np.zeros(len(Probs))

for i in range(len(Probs)):
    if (Probs[i, 0] > Probs[i, 1]):  
        Test_Predicted[i] = 0
    else:
        Test_Predicted[i] = 1


Equal = 0
for i in range(len(Y_Test)):
    if (Y_Test[i] == Test_Predicted[i]):
        Equal = Equal + 1
        
Score = Probs[:, 1]/(1 - Probs[:, 1])
print("Mean Estimated Odds Ratio for Test Are:")
print(np.mean(Score[Y_Test == 0]), np.mean(Score[Y_Test == 1]))

In [None]:
print("Classification Accuracy on Test Set is:")
Equal/len(Y_Test)

# Random Forest

In [13]:
from sklearn.ensemble import RandomForestClassifier
Clf = RandomForestClassifier(max_depth=5, random_state=0)
Clf.fit(X_Train_resamp, Y_Train_resamp)
Probs = Clf.predict_proba(X_Train_resamp)

Train_Predicted = np.zeros(len(Probs))
for i in range(len(Probs)):
    if (Probs[i, 0] > Probs[i, 1]):  
        Train_Predicted[i] = 0
    else:
        Train_Predicted[i] = 1
        
Equal = 0
default = 0
non_default = 0
for i in range(len(Y_Train_resamp)):
    if(Y_Train_resamp[i] == 1):
        if(Y_Train_resamp[i] == Train_Predicted[i]):
            Equal += 1
            default += 1
    else:
        if(Y_Train_resamp[i] == Train_Predicted[i]):
            Equal += 1
            non_default += 1          
        
        

Score = Probs[:, 1]/(1 - Probs[:, 1])
print("Scores Are:")
print(np.mean(Score[Y_Train_resamp == 0]), np.mean(Score[Y_Train_resamp == 1]))

Scores Are:
0.8313504321271528 4.443956780907908


In [14]:
print("Classification Accuracy on Training Set is:")
print(Equal/len(Y_Train_resamp))
print("Classification Accuracy on Non-Default Training Set is:")
print(non_default/(len(Y_Train_resamp) - np.sum(Y_Train_resamp)))
print("Classification Accuracy on Default Training Set is:")
print(default/np.sum(Y_Train_resamp))

Classification Accuracy on Training Set is:
0.7751011794248467
Classification Accuracy on Non-Default Training Set is:
0.7764815678954736
Classification Accuracy on Default Training Set is:
0.7737156927029667


In [None]:
Probs = Clf.predict_proba(X_Test)

Test_Predicted = np.zeros(len(Probs))

for i in range(len(Probs)):
    if (Probs[i, 0] > Probs[i, 1]):  
        Test_Predicted[i] = 0
    else:
        Test_Predicted[i] = 1

Equal_test = 0
default_test = 0
non_default_test = 0
for i in range(len(Y_Test)):
    if(Y_Test[i] == 1):
        if(Y_Test[i] == Test_Predicted[i]):
            Equal_test += 1
            default_test += 1
    else:
        if(Y_Test[i] == Test_Predicted[i]):
            Equal_test += 1
            non_default_test += 1

Score = Probs[:, 1]/(1 - Probs[:, 1])
print("Scores Are:")
print(np.mean(Score[Y_Test == 0]), np.mean(Score[Y_Test == 1]))

In [None]:
print("Classification Accuracy on Test Set is:")
print(Equal_test/len(Y_Test))
print("Classification Accuracy on Non-Default Test Set is:")
print(non_default_test/(len(Y_Test) - np.sum(Y_Test)))
print("Classification Accuracy on Default Test Set is:")
print(default_test/np.sum(Y_Test))

# Dense NN

In [None]:
# Class_Weight = {1: 0.95,0: 0.05}
import random
random.seed(1)
Model_in = keras.Input(shape = (10, ))
X = layers.Dense(10, activation = "relu")(Model_in)
X2 = layers.Dense(10, activation= "relu")(X)
X3 = layers.Dense(10, activation= "relu")(X2)
X4 = layers.Dense(10, activation= "relu")(X3)

Out = layers.Dense(1, activation= "sigmoid")(X4)

Model = keras.Model(Model_in, Out)
Model.compile(optimizer = 'adam', loss = 'binary_crossentropy')
Model.fit(X_Train_resamp, Y_Train_resamp, epochs = 100,
                batch_size = 128,
                shuffle = True)

In [None]:
Probs = Model.predict(X_Train_resamp)
Probs

In [None]:
Train_Predicted = np.zeros(len(Probs))
for i in range(len(Probs)):
    if (Probs[i] < .5):  
        Train_Predicted[i] = 0
    else:
        Train_Predicted[i] = 1

Equal = 0
default = 0
non_default = 0
for i in range(len(Y_Train_resamp)):
    if(Y_Train_resamp[i] == 1):
        if(Y_Train_resamp[i] == Train_Predicted[i]):
            Equal += 1
            default += 1
    else:
        if(Y_Train_resamp[i] == Train_Predicted[i]):
            Equal += 1
            non_default += 1
        
Probs

Score = Probs/(1 - Probs)
print(np.mean(Score[Y_Train_resamp == 0]), np.mean(Score[Y_Train_resamp == 1]))


Test_Predicted = np.zeros(len(Probs_test))
for i in range(len(Probs_test)):
    if (Probs_test[i] < .5):  
        Test_Predicted[i] = 0
    else:
        Test_Predicted[i] = 1

Equal_test = 0
default_test = 0
non_default_test = 0
for i in range(len(Y_Test)):
    if(Y_Test[i] == 1):
        if(Y_Test[i] == Test_Predicted[i]):
            Equal_test += 1
            default_test += 1
    else:
        if(Y_Test[i] == Test_Predicted[i]):
            Equal_test += 1
            non_default_test += 1
        
Probs

Score_test = Probs_test/(1 - Probs_test)
print(np.mean(Score_test[Y_Test == 0]), np.mean(Score_test[Y_Test == 1]))

In [None]:
np.sum(Train_Predicted)

In [None]:
print("Classification Accuracy on Training Set is:")
print(Equal/len(Y_Train_resamp))
print("Classification Accuracy on Non-Default Training Set is:")
print(Non_Default/(len(Y_Train_resamp) - np.sum(Y_Train_resamp)))
print("Classification Accuracy on Default Training Set is:")
print(Default/np.sum(Y_Train_resamp))

In [None]:
print("Classification Accuracy on Test Set is:")
print(Equal_test/len(Y_Test))
print("Classification Accuracy on Non-Default Test Set is:")
print(non_default_test/(len(Y_Test) - np.sum(Y_Test)))
print("Classification Accuracy on Default Test Set is:")
print(default_test/np.sum(Y_Test))