In [1]:
import numpy as np
import pandas as pd
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, RepeatedKFold, cross_val_score, GridSearchCV
import statsmodels.formula.api as smf
import statsmodels.api as sm
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
Credit = pd.read_csv("C:/Users/arab/Downloads/Credit.csv")
Credit['Balance_bin'] = pd.cut(Credit.Balance, bins=[-1,460,2000],labels=['0','1'])
Credit = Credit.drop(['ID', 'Balance', 'Limit', 'Rating'], axis=1)
print('Dimension of the data: ' + str(Credit.shape))
Credit.head()

Dimension of the data: (400, 9)


Unnamed: 0,Income,Cards,Age,Education,Gender,Student,Married,Ethnicity,Balance_bin
0,14.891,2,34,11,Male,No,Yes,Caucasian,0
1,106.025,3,82,15,Female,Yes,Yes,Asian,1
2,104.593,4,71,11,Male,No,No,Asian,1
3,148.924,3,36,11,Female,No,No,Asian,1
4,55.882,2,68,16,Male,No,Yes,Caucasian,0


Classification: Fit an LDA with Balance_bin as response:

In [3]:
# data: dropping the credit-card related features (Limit & Rating) to make the classification problem more challenging
X = pd.get_dummies(Credit.drop(['Balance_bin'], axis=1))
y = Credit['Balance_bin']
# CV
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)

# define model
LDA_mod = LinearDiscriminantAnalysis()
# evaluate model
LDA_scores = cross_val_score(LDA_mod, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
# summarize result
print('Mean Accuracy for LDA: %.3f (%.3f)' % (np.mean(LDA_scores), np.std(LDA_scores)))
     

Mean Accuracy for LDA: 0.629 (0.093)


Classification: Fit an QDA with Balance_bin as response:

In [4]:
# define model
QDA_mod = QuadraticDiscriminantAnalysis()
# evaluate model
QDA_scores = cross_val_score(QDA_mod, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
# summarize result
print('Mean Accuracy for QDA: %.3f (%.3f)' % (np.mean(QDA_scores), np.std(QDA_scores)))

Mean Accuracy for QDA: 0.521 (0.096)


Classification: Fit a Naive Bayes with Balance_bin as response:

In [5]:
# define model
NB_mod = GaussianNB()
# evaluate model
NB_scores = cross_val_score(NB_mod, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
# summarize result
print('Mean Accuracy for QDA: %.3f (%.3f)' % (np.mean(NB_scores), np.std(NB_scores)))
     

Mean Accuracy for QDA: 0.616 (0.085)


Classification: Fit a Naive Bayes on PCs with Balance_bin as response:

In [6]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# PCA
std_scale = StandardScaler().fit(x_train)
X_train_std = std_scale.transform(x_train)
X_test_std = std_scale.transform(x_test)

pca = PCA()
PCs_train = pd.DataFrame(pca.fit_transform(X_train_std))
PCs_test = pd.DataFrame(pca.transform(X_test_std))

# define model
NB_pc_mod = GaussianNB().fit(PCs_train, y_train)
NB_pc_pred = NB_pc_mod.predict(PCs_test)

# summarize result
print(accuracy_score(y_test, NB_pc_pred))
print(confusion_matrix(y_test, NB_pc_pred, normalize='true'))
     

0.6875
[[0.88888889 0.11111111]
 [0.47727273 0.52272727]]
