In [43]:
import numpy as np 
import pandas as pd
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.preprocessing import MinMaxScaler
from matplotlib import pyplot as plt
from scipy import linalg
import matplotlib.pyplot as plt
from scipy import stats
import seaborn as sns
import pymc3 as pm
import random

In [104]:
# Load the training data; the first CSV column is used as the row index.
Df = pd.read_csv("cs-training.csv", index_col = 0)
print(len(Df))

# Keep only complete rows, then add a constant float column at position 1.
# That column lines up with the intercept term when rows are dotted with a
# coefficient vector later in the notebook.
Df = Df.dropna()
Df.insert(1, 'Intercept', 1.0)

# Printed before the renames below, so the listing shows the original names.
print(Df.columns)

# Spell out the digits/hyphens in three column names; the model cells read
# these columns through attribute access (X_Train.<name>).
Renamed_Columns = {
    'NumberOfTime30-59DaysPastDueNotWorse': 'NumberOfTimeThirtyFiftyNineDaysPastDueNotWorse',
    'NumberOfTimes90DaysLate': 'NumberOfTimesNinetyDaysLate',
    'NumberOfTime60-89DaysPastDueNotWorse': 'NumberOfTimeSixtytoEightyNineDaysPastDueNotWorse',
}
Df = Df.rename(columns = Renamed_Columns)

# Every column except the target is a predictor.
X_Cols = [Col for Col in Df.columns if Col != "SeriousDlqin2yrs"]
X = Df[X_Cols]
Y = Df["SeriousDlqin2yrs"]

# Hold out 33% of the rows for testing; the fixed random_state makes the split repeatable.
X_Train, X_Test, Y_Train, Y_Test = train_test_split(X, Y, test_size = 0.33, random_state = 42)

150000
Index(['SeriousDlqin2yrs', 'Intercept', 'RevolvingUtilizationOfUnsecuredLines',
       'age', 'NumberOfTime30-59DaysPastDueNotWorse', 'DebtRatio',
       'MonthlyIncome', 'NumberOfOpenCreditLinesAndLoans',
       'NumberOfTimes90DaysLate', 'NumberRealEstateLoansOrLines',
       'NumberOfTime60-89DaysPastDueNotWorse', 'NumberOfDependents'],
      dtype='object')


In [78]:
# Logistic regression with mixed priors: Normal on the continuous-looking
# coefficients, Poisson (with HalfNormal rates) on the others.
# NOTE: the following cell rebinds `Logistic_Model` to an all-Normal variant,
# so this definition is only in effect until that cell runs.
with pm.Model() as Logistic_Model:

    # HalfNormal hyperpriors; each one is the rate of one Poisson-distributed coefficient.
    Lambda1 = pm.HalfNormal("Lambda1", sigma=100)
    Lambda2 = pm.HalfNormal("Lambda2", sigma=100)
    Lambda3 = pm.HalfNormal("Lambda3", sigma=100)
    Lambda4 = pm.HalfNormal("Lambda4", sigma=100)
    Lambda5 = pm.HalfNormal("Lambda5", sigma=100)
    Lambda6 = pm.HalfNormal("Lambda6", sigma=100)
    Lambda7 = pm.HalfNormal("Lambda7", sigma=100)

    # Coefficient priors, declared in the same order as the terms of the linear predictor.
    Intercept = pm.Normal('Intercept', mu=0, sigma=100)
    Beta_1 = pm.Normal('Beta_1', mu=0, sigma=100)
    Beta_2 = pm.Poisson('Beta_2', mu=Lambda1)
    Beta_3 = pm.Poisson('Beta_3', mu=Lambda2)
    Beta_4 = pm.Normal('Beta_4', mu=0, sigma=100)
    Beta_5 = pm.Normal('Beta_5', mu=0, sigma=100)
    Beta_6 = pm.Poisson('Beta_6', mu=Lambda3)
    Beta_7 = pm.Poisson('Beta_7', mu=Lambda4)
    Beta_8 = pm.Poisson('Beta_8', mu=Lambda5)
    Beta_9 = pm.Poisson('Beta_9', mu=Lambda6)
    Beta_10 = pm.Poisson('Beta_10', mu=Lambda7)

    # Linear predictor over the training columns.
    Linear_Predictor = (
        Intercept
        + Beta_1 * X_Train['RevolvingUtilizationOfUnsecuredLines']
        + Beta_2 * X_Train['age']
        + Beta_3 * X_Train['NumberOfTimeThirtyFiftyNineDaysPastDueNotWorse']
        + Beta_4 * X_Train['DebtRatio']
        + Beta_5 * X_Train['MonthlyIncome']
        + Beta_6 * X_Train['NumberOfOpenCreditLinesAndLoans']
        + Beta_7 * X_Train['NumberOfTimesNinetyDaysLate']
        + Beta_8 * X_Train['NumberRealEstateLoansOrLines']
        + Beta_9 * X_Train['NumberOfTimeSixtytoEightyNineDaysPastDueNotWorse']
        + Beta_10 * X_Train['NumberOfDependents']
    )

    # Inverse-logit maps the linear predictor to a Bernoulli success probability.
    Likelihood = pm.invlogit(Linear_Predictor)

    pm.Bernoulli(name="logit", p=Likelihood, observed=Y_Train)

In [80]:
# Logistic regression with an independent Normal(0, 100) prior on every
# coefficient. Rebinds `Logistic_Model`, which the preceding cell also defines.
with pm.Model() as Logistic_Model:

    # One coefficient per predictor column, plus the intercept.
    Intercept = pm.Normal('Intercept', mu=0, sigma=100)
    Beta_1 = pm.Normal('Beta_1', mu=0, sigma=100)
    Beta_2 = pm.Normal('Beta_2', mu=0, sigma=100)
    Beta_3 = pm.Normal('Beta_3', mu=0, sigma=100)
    Beta_4 = pm.Normal('Beta_4', mu=0, sigma=100)
    Beta_5 = pm.Normal('Beta_5', mu=0, sigma=100)
    Beta_6 = pm.Normal('Beta_6', mu=0, sigma=100)
    Beta_7 = pm.Normal('Beta_7', mu=0, sigma=100)
    Beta_8 = pm.Normal('Beta_8', mu=0, sigma=100)
    Beta_9 = pm.Normal('Beta_9', mu=0, sigma=100)
    Beta_10 = pm.Normal('Beta_10', mu=0, sigma=100)

    # Linear predictor over the training columns.
    Linear_Predictor = (
        Intercept
        + Beta_1 * X_Train['RevolvingUtilizationOfUnsecuredLines']
        + Beta_2 * X_Train['age']
        + Beta_3 * X_Train['NumberOfTimeThirtyFiftyNineDaysPastDueNotWorse']
        + Beta_4 * X_Train['DebtRatio']
        + Beta_5 * X_Train['MonthlyIncome']
        + Beta_6 * X_Train['NumberOfOpenCreditLinesAndLoans']
        + Beta_7 * X_Train['NumberOfTimesNinetyDaysLate']
        + Beta_8 * X_Train['NumberRealEstateLoansOrLines']
        + Beta_9 * X_Train['NumberOfTimeSixtytoEightyNineDaysPastDueNotWorse']
        + Beta_10 * X_Train['NumberOfDependents']
    )

    # Inverse-logit maps the linear predictor to a Bernoulli success probability.
    Likelihood = pm.invlogit(Linear_Predictor)

    pm.Bernoulli(name="logit", p=Likelihood, observed=Y_Train)

In [81]:
# Seed Python's built-in `random` module, then find the maximum a posteriori
# (MAP) point of the model currently bound to `Logistic_Model`.
random.seed(1)
with Logistic_Model:
    MAP_Estimate = pm.find_MAP()

def Print_Map(Res):
    """Turn a mapping of names to single-element arrays into a pandas Series.

    Parameters
    ----------
    Res : mapping
        Maps each name to a value holding exactly one element
        (a 0-d array, a length-1 array, or a plain Python/NumPy scalar).

    Returns
    -------
    pandas.Series
        Indexed by the keys of ``Res`` in iteration order, holding each value
        as a plain Python scalar.

    Raises
    ------
    ValueError
        If any value holds more than one element.
    """
    # np.asscalar was deprecated in NumPy 1.16 and removed in 1.23; `.item()` is
    # its documented replacement. np.asarray lets plain Python scalars through too.
    return pd.Series({k: np.asarray(v).item() for k, v in Res.items()})

# Show the MAP point as a Series with one row per model parameter.
Print_Map(MAP_Estimate)




  


Intercept   -0.672393
Beta_1      -0.000085
Beta_2      -0.040680
Beta_3       0.532546
Beta_4      -0.000119
Beta_5      -0.000042
Beta_6      -0.002517
Beta_7       0.443406
Beta_8       0.028243
Beta_9      -0.941775
Beta_10      0.080549
dtype: float64

In [40]:
# Draw posterior samples with NUTS: 4 chains of 1000 tuning + 1000 kept draws,
# run on 3 worker processes.
# `random.seed` only seeds Python's built-in generator; the sampler takes its own
# seed through `random_seed`, so it is passed explicitly to make the draws repeatable.
random.seed(1)
with Logistic_Model:
    Trace = pm.sample(tune = 1000, draws = 1000, chains = 4, init = 'adapt_diag', cores = 3, random_seed = 1)

  return wrapped_(*args_, **kwargs_)
Auto-assigning NUTS sampler...
Initializing NUTS using adapt_diag...
Multiprocess sampling (4 chains in 3 jobs)
NUTS: [Beta_10, Beta_9, Beta_8, Beta_7, Beta_6, Beta_5, Beta_4, Beta_3, Beta_2, Beta_1, Intercept]


Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 4832 seconds.


In [41]:
# Tabulate posterior summary statistics for the draws stored in `Trace`.
# NOTE(review): the "No model on context stack" message in the saved output suggests
# this ran outside a `with Logistic_Model:` block — presumably harmless here; confirm.
pm.summary(Trace)

Got error No model on context stack. trying to find log_likelihood in translation.


Unnamed: 0,mean,sd,hdi_3%,hdi_97%,mcse_mean,mcse_sd,ess_bulk,ess_tail,r_hat
Intercept,-1.509,0.057,-1.615,-1.403,0.001,0.001,1516.0,1877.0,1.0
Beta_1,-0.0,0.0,-0.0,0.0,0.0,0.0,2101.0,1916.0,1.0
Beta_2,-0.025,0.001,-0.027,-0.023,0.0,0.0,1689.0,1953.0,1.0
Beta_3,0.504,0.015,0.477,0.532,0.0,0.0,2809.0,2524.0,1.0
Beta_4,-0.0,0.0,-0.0,-0.0,0.0,0.0,3174.0,2524.0,1.0
Beta_5,-0.0,0.0,-0.0,-0.0,0.0,0.0,2782.0,2240.0,1.0
Beta_6,-0.004,0.003,-0.011,0.002,0.0,0.0,2252.0,2171.0,1.0
Beta_7,0.42,0.02,0.385,0.459,0.0,0.0,2218.0,2374.0,1.0
Beta_8,0.068,0.013,0.043,0.093,0.0,0.0,2451.0,2747.0,1.0
Beta_9,-0.887,0.023,-0.929,-0.844,0.001,0.0,1999.0,2284.0,1.0


In [114]:
# MAP coefficients copied by hand from the printed `Print_Map(MAP_Estimate)` output.
# Order: Intercept, Beta_1 ... Beta_10, which matches the column order of X_Train
# (the constant 'Intercept' column comes first).
Map_Est_Coef = np.array([-0.672393, -0.000085, -0.040680, 0.532546, -0.000119, -0.000042, -0.002517, 0.443406, 0.028243, -0.941775, 0.080549])

### Training Classification Accuracy

# Inverse-logit of the linear predictor for all training rows in one
# matrix-vector product (replaces a per-row Python loop over .iloc).
Probs = 1/(1 + np.exp(-np.dot(X_Train.to_numpy(dtype = float), Map_Est_Coef)))

# Predict class 1 when the probability exceeds 0.5, otherwise class 0.
# Stored as floats, as in the original zero-initialised array.
Train_Predicted = (Probs > 0.5).astype(float)

In [119]:
# Number of training rows whose thresholded prediction equals the observed label.
Equal = int(np.sum(Y_Train.to_numpy() == Train_Predicted))

# Odds p / (1 - p) implied by each predicted probability, averaged per observed class.
Score = Probs/(1 - Probs)
Is_Negative = (Y_Train == 0)
Is_Positive = (Y_Train == 1)
print("Mean Scores for Train Are:")
print(np.mean(Score[Is_Negative]), np.mean(Score[Is_Positive]))


Mean Scores for Train Are:
0.08943172873376908 0.3181951908261956


In [120]:
# Fraction of training rows where the thresholded prediction matched the label.
# `Equal` is the match count from the preceding cell; the test-set cell reuses
# the same name, so run this before that cell.
print("Classification Accuracy on Training Set is:")
Equal/len(Y_Train)

Classification Accuracy on Training Set is:


0.9313849590469099

In [123]:
### Testing Classification Accuracy

# NOTE: `Probs`, `Equal` and `Score` are reused from the training cells and are
# overwritten here with the test-set values.

# Inverse-logit of the linear predictor for all test rows in one
# matrix-vector product (replaces a per-row Python loop over .iloc).
Probs = 1/(1 + np.exp(-np.dot(X_Test.to_numpy(dtype = float), Map_Est_Coef)))

# Predict class 1 when the probability exceeds 0.5, otherwise class 0.
Test_Predicted = (Probs > 0.5).astype(float)

# Number of test rows whose prediction equals the observed label.
Equal = int(np.sum(Y_Test.to_numpy() == Test_Predicted))

# Odds p / (1 - p) implied by each predicted probability, averaged per observed class.
Score = Probs/(1 - Probs)
print("Mean Scores for Test Are:")
print(np.mean(Score[Y_Test == 0]), np.mean(Score[Y_Test == 1]))


print("Classification Accuracy on Test Set is:")
print(Equal/len(X_Test))

Mean Scores for Test Are:
0.08140064197453614 0.26779875144101095
Classification Accuracy on Test Set is:
0.9303081458338582
