In [1]:
import numpy as np
import pandas as pd
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, RepeatedKFold, cross_val_score
import statsmodels.formula.api as smf
import statsmodels.api as sm
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
Credit = pd.read_csv("C:/Users/arab/Downloads/Credit.csv")
print('Dimension of the data: ' + str(Credit.shape))
Credit.head()
     

Dimension of the data: (400, 12)


Unnamed: 0,ID,Income,Limit,Rating,Cards,Age,Education,Gender,Student,Married,Ethnicity,Balance
0,1,14.891,3606,283,2,34,11,Male,No,Yes,Caucasian,333
1,2,106.025,6645,483,3,82,15,Female,Yes,Yes,Asian,903
2,3,104.593,7075,514,4,71,11,Male,No,No,Asian,580
3,4,148.924,9504,681,3,36,11,Female,No,No,Asian,964
4,5,55.882,4897,357,2,68,16,Male,No,Yes,Caucasian,331


In [4]:
Credit.describe()
Credit['Balance_bin'] = pd.cut(Credit.Balance, bins=[-1,600,2000], labels=[0,1])
print(Credit['Balance_bin'].value_counts(normalize=True))
Credit.head()

0    0.59
1    0.41
Name: Balance_bin, dtype: float64


Unnamed: 0,ID,Income,Limit,Rating,Cards,Age,Education,Gender,Student,Married,Ethnicity,Balance,Balance_bin
0,1,14.891,3606,283,2,34,11,Male,No,Yes,Caucasian,333,0
1,2,106.025,6645,483,3,82,15,Female,Yes,Yes,Asian,903,1
2,3,104.593,7075,514,4,71,11,Male,No,No,Asian,580,0
3,4,148.924,9504,681,3,36,11,Female,No,No,Asian,964,1
4,5,55.882,4897,357,2,68,16,Male,No,Yes,Caucasian,331,0


Classification: Fit a logistic regression model with no regularizations on Balance_bin as response:

In [5]:
# data: dropping the credit-card related features (Limit & Rating) to make the classification problem more challenging
X = pd.get_dummies(Credit.drop(['ID', 'Balance', 'Balance_bin', 'Limit', 'Rating'], axis=1))
y = Credit['Balance_bin']

# prepare the cross-validation procedure
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
# create model
model = linear_model.LogisticRegression(penalty='none')
# evaluate model
scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)

# report performance
print('Accuracy: %.3f (%.3f)' % (np.mean(scores), np.std(scores)))
     

Accuracy: 0.664 (0.079)


LR test from ANOVA for each predictor:

In [6]:
from scipy import stats

def calculate_nested_f_statistic(small_model, big_model):
    """Given two fitted GLMs, the larger of which contains the parameter space of the smaller, return the F Stat and P value corresponding to the larger model adding explanatory power"""
    addtl_params = big_model.df_model - small_model.df_model
    f_stat = (small_model.deviance - big_model.deviance) / (addtl_params * big_model.scale)
    df_numerator = addtl_params
    # use fitted values to obtain n_obs from model object:
    df_denom = (big_model.fittedvalues.shape[0] - big_model.df_model)
    p_value = stats.f.sf(f_stat, df_numerator, df_denom)
    return ('F stat ', f_stat, ' P-Val: ', p_value)

from statsmodels.genmod.generalized_linear_model import GLM
from statsmodels.genmod import families

X_norm = pd.DataFrame(MinMaxScaler().fit_transform(X), columns = X.columns)
model_full = GLM(y, X_norm[["Age", "Education", "Income"]], family=families.Binomial(),).fit()
model_red = GLM(y, X_norm[["Age", "Education"]], family=families.Binomial(),).fit()
calculate_nested_f_statistic(model_red, model_full)

('F stat ', 38.940506674167636, ' P-Val: ', 1.1193369874683984e-09)

Logistic regression model with LASSO regularization:

In [9]:
modelCV = linear_model.LogisticRegressionCV(penalty='l1', solver='liblinear', cv=5, random_state=0, max_iter=1000).fit(X, y)
print('Best regularization parameter (inverse of lambda): ', modelCV.C_)
print('Model accuracy: ', modelCV.score(X, y))
y_pred = modelCV.predict(X)
print('Confusion matrix with counts: ')
print(confusion_matrix(y, y_pred))
print('Confusion matrix with proportions: ')
print(confusion_matrix(y, y_pred, normalize='true'))

Best regularization parameter (inverse of lambda):  [0.35938137]
Model accuracy:  0.6875
Confusion matrix with counts: 
[[205  31]
 [ 94  70]]
Confusion matrix with proportions: 
[[0.86864407 0.13135593]
 [0.57317073 0.42682927]]


Multi-class: Create a three-class version of balance

In [10]:
y_multi = pd.cut(Credit.Balance, bins=[-1,300,700,2000], labels=['0','1','2'])
print(y_multi.value_counts(normalize=True))
     

0    0.4000
2    0.3475
1    0.2525
Name: Balance, dtype: float64


Multinomial regression model: no regularization.

In [12]:
# create model
model = linear_model.LogisticRegression(penalty='none')
# evaluate model
scores = cross_val_score(model, X, y_multi, scoring='accuracy', cv=cv, n_jobs=-1)

# report performance
print('Accuracy: %.3f (%.3f)' % (np.mean(scores), np.std(scores)))

Accuracy: 0.513 (0.075)


Multinomial regression model with LASSO regularization.

In [14]:
MultiModelCV = linear_model.LogisticRegressionCV(penalty='l1', solver='liblinear', cv=5, random_state=0, max_iter=1000).fit(X, y_multi)
print('Best regularization parameter: ', MultiModelCV.C_)
print('Model accuracy: ', MultiModelCV.score(X, y_multi))
y_pred = MultiModelCV.predict(X)
print('Confusion matrix with counts: ')
print(confusion_matrix(y_multi, y_pred))
print('Confusion matrix with proportions: ')
print(confusion_matrix(y_multi, y_pred, normalize='true'))

Best regularization parameter:  [5.99484250e-03 1.00000000e-04 3.59381366e-01]
Model accuracy:  0.4175
Confusion matrix with counts: 
[[ 41 106  13]
 [ 17  71  13]
 [ 16  68  55]]
Confusion matrix with proportions: 
[[0.25625    0.6625     0.08125   ]
 [0.16831683 0.7029703  0.12871287]
 [0.11510791 0.48920863 0.39568345]]
