In [2]:
#Lab 4
#Housekeeping
import numpy as np
import pandas as pd
from matplotlib.pyplot import subplots
import statsmodels.api as sm
from ISLP import load_data
from ISLP.models import (ModelSpec as MS,
                         summarize)
from ISLP import confusion_table
from ISLP.models import contrast
from sklearn.discriminant_analysis import \
     (LinearDiscriminantAnalysis as LDA,
      QuadraticDiscriminantAnalysis as QDA)
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [7]:
# Loading Data and Examining Structure

df = load_data('Default')
print(df.info())
print(df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype   
---  ------   --------------  -----   
 0   default  10000 non-null  category
 1   student  10000 non-null  category
 2   balance  10000 non-null  float64 
 3   income   10000 non-null  float64 
dtypes: category(2), float64(2)
memory usage: 176.2 KB
None
  default student      balance        income
0      No      No   729.526495  44361.625074
1      No     Yes   817.180407  12106.134700
2      No      No  1073.549164  31767.138947
3      No      No   529.250605  35704.493935
4      No      No   785.655883  38463.495879


#### Question 1 - Data Examination

This data set is 10,000 observations, with 4 columns. The columns are 'default', which is a boolean indicator for if the individual defaulted on their loan;'student', which is another boolean indicator for if the individual was a student or not; 'balance', a number indicating the loan balance at the time of observation;and 'income', a number indicating the individual's income at the time of observation.

In [8]:
# 'Default Distribution'

df['default'].value_counts()

default
No     9667
Yes     333
Name: count, dtype: int64

There was 9667 individuals who did not default, and 333 individuals who did default.

In [12]:
# Fitting logistic regression model
allvars = df.columns.drop(['default'])
design = MS(allvars)
X = design.fit_transform(df)
x = sm.add_constant(X, has_constant='add')
y = (df.default == 'Yes').astype(int)
logit = sm.Logit(y,X)
results = logit.fit()
summarize(results)

Optimization terminated successfully.
         Current function value: 0.078577
         Iterations 10


Unnamed: 0,coef,std err,z,P>|z|
intercept,-10.869,0.492,-22.079,0.0
student[Yes],-0.6468,0.236,-2.738,0.006
balance,0.0057,0.0,24.737,0.0
income,3e-06,8e-06,0.37,0.712


In [14]:
e = 2.718281828459045
e**0.0057

1.0057162759095335

### Question 1 Regression Discussion
The coefficient on balance is 0.0057, with a z score of 24.737, indicating statistical significance at the 5% level. The interpretation is that a $1 increase in loan balance will be associated with a  $e^\beta$ increased chance of default by 1.0057, or 0.57%.


In [44]:
# Question 2
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)

lda = LDA(store_covariance=True)
X_train, X_test = [M.drop(columns=['intercept', 'student[Yes]'])
                   for M in [X_train, X_test]]
lda.fit(X_train, y_train)



In [45]:
for label, mean in zip(lda.classes_, lda.means_):
    print(f"Class {label}: Mean = {mean}")

Class 0: Mean = [  802.15837363 33681.79366744]
Class 1: Mean = [ 1768.16582059 31570.35768985]


In [46]:
for label, prior in zip(lda.classes_, lda.priors_):
    print(f"Class {label}: Prior = {prior}")

Class 0: Prior = 0.9658571428571429
Class 1: Prior = 0.03414285714285714


In [54]:
ldapredict = lda.predict(X_test)
ldaaccuracy = lda.score(X_test, y_test)


In [None]:
confusion_table(y_test, lda.predict(X_test))    

Truth,0,1
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
0,2900,6
1,75,19


In [56]:
print(ldaaccuracy)

0.973


In [57]:
qda = QDA(store_covariance=True)
qda.fit(X_train, y_train)

In [58]:
for label, mean in zip(qda.classes_, qda.means_):
    print(f"Class {label}: Mean = {mean}")

Class 0: Mean = [  802.15837363 33681.79366744]
Class 1: Mean = [ 1768.16582059 31570.35768985]


In [59]:
for label, prior in zip(qda.classes_, qda.priors_):
    print(f"Class {label}: Prior = {prior}")

Class 0: Prior = 0.9658571428571429
Class 1: Prior = 0.03414285714285714


In [67]:
qdapredict = qda.predict(X_test)
qdaaccuracy = np.mean(qdapredict == y_test)
confusion_table(y_test, qdapredict)


Truth,0,1
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
0,2898,8
1,70,24


In [68]:
print(qdaaccuracy)

0.974


In [66]:
# Question 3 - Naive Bayes Classifier
gnb = GaussianNB()
gnb.fit(X_train, y_train)
gnbpredict = gnb.predict(X_test)
gnbaccuracy = gnb.score(X_test, y_test)
confusion_table(y_test, gnbpredict)


Truth,0,1
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
0,2893,13
1,75,19


In [65]:
print(gnbaccuracy)

0.9706666666666667


### Naive Bayes and LDA/QDA Accuracy Discussion
The Naive Bayes classifier has slightly lower accuracy than both LDA (0.973), and QDA (0.974).

In [74]:
sample = pd.DataFrame({'balance': [2000], 'income': [40000]})
gnb.predict_proba(sample)

array([[0.48919461, 0.51080539]])

The model predicts an individual with 2000 balance and 40000 income has a %51.1 chance of default

In [None]:
#Question 4 - Feature Scaling and KNN Modeling
scaler = StandardScaler(with_mean=True,
                        with_std=True,
                        copy=True)


In [86]:
featuredf = df[['income', 'balance']]
scaler.fit(featuredf)
X_std = scaler.transform(featuredf)

In [92]:
(X_train,
 X_test,
 y_train,
 y_test) = train_test_split(np.asarray(X_std),
                            df['default'] ,
                            test_size=1000,
                            random_state=42)

knn1 = KNeighborsClassifier(n_neighbors=1)
knn1_pred = knn1.fit(X_train, y_train).predict(X_test)
np.mean(y_test != knn1_pred), np.mean(y_test != "No")

(np.float64(0.049), np.float64(0.042))

In [96]:
knn1accuracy = knn1.score(X_test, y_test)

In [97]:
knn3 = KNeighborsClassifier(n_neighbors=1)
knn3_pred = knn3.fit(X_train, y_train).predict(X_test)
np.mean(y_test != knn3_pred), np.mean(y_test != "No")

(np.float64(0.049), np.float64(0.042))

In [98]:
knn3accuracy = knn3.score(X_test, y_test)

In [99]:
knn5 = KNeighborsClassifier(n_neighbors=5)
knn5_pred = knn5.fit(X_train, y_train).predict(X_test)
np.mean(y_test != knn5_pred), np.mean(y_test != "No")

(np.float64(0.037), np.float64(0.042))

In [100]:
knn5accuracy = knn5.score(X_test, y_test)

In [101]:
knn10 = KNeighborsClassifier(n_neighbors=10)
knn10_pred = knn10.fit(X_train, y_train).predict(X_test)
np.mean(y_test != knn10_pred), np.mean(y_test != "No")

(np.float64(0.036), np.float64(0.042))

In [102]:
knn10accuracy = knn10.score(X_test, y_test)

In [129]:
confusion_table(y_test, knn10_pred)

Truth,No,Yes
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
No,953,5
Yes,31,11


In [105]:
data = {
    'KNN Model': ['KNN 1', 'KNN 3', 'KNN 5', 'KNN 10'],
    'Accuracy': [knn1accuracy, knn3accuracy, knn5accuracy, knn10accuracy]
}

table = pd.DataFrame(data)
print(table)        # prints as plain text


  KNN Model  Accuracy
0     KNN 1     0.951
1     KNN 3     0.951
2     KNN 5     0.963
3    KNN 10     0.964


### KNN Discussion
The table above demonstrates that in this case, using K= 10 is preferred to lesser amounts of K, because the model is able to predict more accurately. In this case, the K=1 specifier is underfit, and so adding more nearest neighbors to the model can help capture more signal and thus is more accurate.

In [113]:
# Question 5 - Model Comparison

#Logit Model Refit
logit = LogisticRegression(C=1e10, solver='liblinear')
logit.fit(X_train, y_train)
logit_pred = logit.predict_proba(X_test)
logit_labels = np.where(logit_pred[:,1] > .5, 'Yes', 'No')
logitaccuracy = logit.score(X_test, y_test)

In [128]:
confusion_table(y_test, logit_labels)

Truth,No,Yes
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
No,954,4
Yes,29,13


In [118]:

data = {
    'Model': ['Logistic Regression', 'LDA', 'QDA', 'Naive Bayes', 'KNN 10'],
    'Accuracy': [logitaccuracy, ldaaccuracy, qdaaccuracy, gnbaccuracy, knn10accuracy]
}
table2 = pd.DataFrame(data)
table2 = table2.sort_values('Accuracy', ascending=False)
print(table2)

                 Model  Accuracy
2                  QDA  0.974000
1                  LDA  0.973000
3          Naive Bayes  0.970667
0  Logistic Regression  0.967000
4               KNN 10  0.964000


In [132]:
data = {
    'Model': ['Logistic Regression', 'LDA', 'QDA', 'Naive Bayes', 'KNN 10'],
    'False Negative Rate': [0.004, 0.002, 0.003, 0.004, 0.005]
}
table3 = pd.DataFrame(data)
table3 = table3.sort_values('False Negative Rate', ascending=False)
print(table3)

                 Model  False Negative Rate
4               KNN 10                0.005
0  Logistic Regression                0.004
3          Naive Bayes                0.004
2                  QDA                0.003
1                  LDA                0.002


In [133]:
# if the false negative rate is the most expensive error, we should choose the method which acheive the lowest ratio of false negatives/total error. 
ldaratio = 0.002 / (1 - ldaaccuracy)
qdaratio = 0.003 / (1 - qdaaccuracy)
knn10ratio = 0.005 / (1 - knn10accuracy)    
logitratio = 0.004 / (1 - logitaccuracy)
naivebayesratio = 0.004 / (1 - gnbaccuracy)
ratios = {
    'Model': ['Logistic Regression', 'LDA', 'QDA', 'Naive Bayes', 'KNN 10'],
    'False Negative to Total Error Ratio': [logitratio, ldaratio, qdaratio, naivebayesratio, knn10ratio]
}
table4 = pd.DataFrame(ratios)   
table4 = table4.sort_values('False Negative to Total Error Ratio', ascending=True)
print(table4)

                 Model  False Negative to Total Error Ratio
1                  LDA                             0.074074
2                  QDA                             0.115385
0  Logistic Regression                             0.121212
3          Naive Bayes                             0.136364
4               KNN 10                             0.138889


Using this ratio, LDA is the best method to limit false negatives.

In [136]:
#Using LDA under a 0.3 probability threshold:
lda_probs = lda.predict_proba(X_test)[:, 1]

# Threshold 0.3
labels_03 = np.where(lda_probs > 0.3, 'Yes', 'No')
print("Confusion matrix for threshold 0.3:")
print(confusion_table(y_test, labels_03))


Confusion matrix for threshold 0.3:
Truth       No  Yes
Predicted          
No         958    0
Yes         42    0




Changing the threshold from o.5 to 0.3 reduced the false negative rate to 0, and raised the false positive rate from 0.025 (2.5%) to 0.042 (4.2%). If false negatives are 10x the cost, this trade off is worth it.