In [1]:
import statsmodels.api as sm

import matplotlib.pyplot as plt
import seaborn as sns

import numpy as np
import pandas as pd

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
import shap

In [2]:
# Load the credit score data and add a squared expenditure-ratio term
credit_score = pd.read_csv("../../data/credit_score.csv")
credit_score['R_EXPENDITURE_SQR'] = credit_score['R_EXPENDITURE']**2

# Numeric model features; .copy() so the indicator columns added below
# do not write into credit_score
X = credit_score[['INCOME','R_DEBT_INCOME','R_EXPENDITURE','R_EXPENDITURE_SQR','R_EXPENDITURE_SAVINGS','R_ENTERTAINMENT']].copy()

# One-hot encode CAT_GAMBLING as two 0/1 indicator columns. Any value other than
# 'Low' or 'High' gets 0 in both columns and therefore acts as the reference level.
X['GAMBLING_LOW'] = (credit_score['CAT_GAMBLING'] == 'Low').astype('int64')
X['GAMBLING_HIGH'] = (credit_score['CAT_GAMBLING'] == 'High').astype('int64')

y = credit_score['DEFAULT']

# Balance the classes: keep every DEFAULT == 1 row and draw an equally sized random
# sample of DEFAULT == 0 rows. The sample is drawn once and its index labels are used
# to select from both X and y, so features and target cannot get out of step
# (previously this relied on two separate .sample() calls sharing a random_state).
n = int(y.sum())
sampled_non_default = y[y == 0].sample(n=n, random_state=0).index
balanced_index = y[y == 1].index.append(sampled_non_default)
X = X.loc[balanced_index]
y = y.loc[balanced_index]

print(n,len(X),len(y))


152 304 304


In [3]:
# Fit a logistic regression of y on the columns of X by maximum likelihood.
# NOTE(review): X is passed without an added constant column, so no intercept
# is estimated (statsmodels does not add one) — confirm this is intended.
logit_spec = sm.Logit(endog=y, exog=X)
model = logit_spec.fit()
model.summary()

Optimization terminated successfully.
         Current function value: 0.570557
         Iterations 6


0,1,2,3
Dep. Variable:,DEFAULT,No. Observations:,304.0
Model:,Logit,Df Residuals:,296.0
Method:,MLE,Df Model:,7.0
Date:,"Tue, 24 Oct 2023",Pseudo R-squ.:,0.1769
Time:,12:36:23,Log-Likelihood:,-173.45
converged:,True,LL-Null:,-210.72
Covariance Type:,nonrobust,LLR p-value:,1.782e-13

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
INCOME,-3.14e-06,1.35e-06,-2.326,0.020,-5.79e-06,-4.94e-07
R_DEBT_INCOME,0.1334,0.026,5.212,0.000,0.083,0.184
R_EXPENDITURE,-14.1720,4.719,-3.003,0.003,-23.421,-4.923
R_EXPENDITURE_SQR,18.8127,5.291,3.556,0.000,8.443,29.182
R_EXPENDITURE_SAVINGS,0.2304,0.075,3.070,0.002,0.083,0.377
R_ENTERTAINMENT,1.9536,1.963,0.995,0.320,-1.894,5.801
GAMBLING_LOW,0.2502,0.442,0.566,0.572,-0.617,1.117
GAMBLING_HIGH,0.5603,0.309,1.815,0.070,-0.045,1.165


In [4]:
# Accuracy is computed on X, the same rows passed to sm.Logit when fitting,
# so this is an in-sample figure.
predicted_prob = model.predict(X)

# Predict class 1 when the fitted probability is strictly above 0.5, else class 0
model_pred = (predicted_prob > 0.5).astype('int64')
model_accuracy = (model_pred == y).mean()
print("Model accuracy: ", model_accuracy)

# Confusion matrix: rows are predicted labels, columns are actual labels
model_cm = pd.crosstab(
    index=model_pred,
    columns=y,
    rownames=['Predicted'],
    colnames=['Actual'],
)
model_cm


Model accuracy:  0.694078947368421


Actual,0,1
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
0,112,53
1,40,99
