In [117]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
from ISLP import load_data
from ISLP.models import (ModelSpec as MS,
                         summarize)
from ISLP import confusion_table
from ISLP.models import contrast
from sklearn.discriminant_analysis import (LinearDiscriminantAnalysis as LDA,
                                           QuadraticDiscriminantAnalysis as QDA)
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from statsmodels.stats.outliers_influence import variance_inflation_factor as VIF
from statsmodels.stats.anova import anova_lm


In [118]:
Boston = load_data('Boston')
Boston.columns


Index(['crim', 'zn', 'indus', 'chas', 'nox', 'rm', 'age', 'dis', 'rad', 'tax',
       'ptratio', 'lstat', 'medv'],
      dtype='object')

In [119]:
crimemedian = Boston['crim'].median()
print(crimemedian)

0.25651


In [120]:
crim01 = np.where(Boston['crim']>crimemedian,1,0)

In [121]:
Boston['crim01'] = crim01

In [122]:
Boston.corr()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,lstat,medv,crim01
crim,1.0,-0.200469,0.406583,-0.055892,0.420972,-0.219247,0.352734,-0.37967,0.625505,0.582764,0.289946,0.455621,-0.388305,0.409395
zn,-0.200469,1.0,-0.533828,-0.042697,-0.516604,0.311991,-0.569537,0.664408,-0.311948,-0.314563,-0.391679,-0.412995,0.360445,-0.436151
indus,0.406583,-0.533828,1.0,0.062938,0.763651,-0.391676,0.644779,-0.708027,0.595129,0.72076,0.383248,0.6038,-0.483725,0.60326
chas,-0.055892,-0.042697,0.062938,1.0,0.091203,0.091251,0.086518,-0.099176,-0.007368,-0.035587,-0.121515,-0.053929,0.17526,0.070097
nox,0.420972,-0.516604,0.763651,0.091203,1.0,-0.302188,0.73147,-0.76923,0.611441,0.668023,0.188933,0.590879,-0.427321,0.723235
rm,-0.219247,0.311991,-0.391676,0.091251,-0.302188,1.0,-0.240265,0.205246,-0.209847,-0.292048,-0.355501,-0.613808,0.69536,-0.156372
age,0.352734,-0.569537,0.644779,0.086518,0.73147,-0.240265,1.0,-0.747881,0.456022,0.506456,0.261515,0.602339,-0.376955,0.61394
dis,-0.37967,0.664408,-0.708027,-0.099176,-0.76923,0.205246,-0.747881,1.0,-0.494588,-0.534432,-0.232471,-0.496996,0.249929,-0.616342
rad,0.625505,-0.311948,0.595129,-0.007368,0.611441,-0.209847,0.456022,-0.494588,1.0,0.910228,0.464741,0.488676,-0.381626,0.619786
tax,0.582764,-0.314563,0.72076,-0.035587,0.668023,-0.292048,0.506456,-0.534432,0.910228,1.0,0.460853,0.543993,-0.468536,0.608741


In [123]:
#zn, indus, nox, rm, age, dis, rad, tax, lstat, medv, ptratio seem to be relevant variables

In [124]:
x = Boston[['zn','indus','nox','rm','age','dis','rad','tax','lstat','medv','ptratio']].values
y = Boston['crim01'].values

(X_train,X_test,y_train,y_test) = train_test_split(x,y,test_size=0.1)

In [135]:
def ClassifierCompare(testsize):
    (X_train,X_test,y_train,y_test) = train_test_split(x,y,test_size=testsize)
    method = ['LDA','NB','KNN','LR']
    accuracy = []

    #LDA
    lda = LDA(store_covariance=True)
    lda.fit(X_train,y_train)
    lda_pred = lda.predict(X_test)
    accuracy.append(np.mean(lda_pred==y_test))

    #NB
    nb = GaussianNB()
    nb.fit(X_train,y_train)
    nb_pred = nb.predict(X_test)
    accuracy.append(np.mean(nb_pred==y_test))

    #KNN
    kneighbors = []
    for K in range(1,11):
        
        knn = KNeighborsClassifier(n_neighbors=K)
        knn.fit(X_train,y_train)
        knn_pred = knn.predict(X_test)
        kneighbors.append(np.mean(knn_pred==y_test))
    accuracy.append(kneighbors[0])
    method[2] = 'KNN1'
    for k in range(len(kneighbors)):
        if kneighbors[k]>accuracy[2]:
            method[2] = 'KNN' + str(k)
            accuracy[2] = kneighbors[k]
    
    #LR
    logit = LogisticRegression(C=1e10,solver='liblinear')
    logit.fit(X_train,y_train)
    logit_pred = logit.predict(X_test)
    accuracy.append(np.mean(logit_pred==y_test))

    #Summary
    for i in range(4):
        print(method[i],accuracy[i])

for k in range(1,5):
    print(k)
    ClassifierCompare(k*0.1)

1
LDA 0.8627450980392157
NB 0.7843137254901961
KNN2 0.9411764705882353
LR 0.9019607843137255
2
LDA 0.8529411764705882
NB 0.7549019607843137
KNN1 0.8823529411764706
LR 0.8529411764705882
3
LDA 0.8157894736842105
NB 0.7763157894736842
KNN1 0.9473684210526315
LR 0.868421052631579
4
LDA 0.7931034482758621
NB 0.7881773399014779
KNN3 0.9211822660098522
LR 0.8472906403940886


In [125]:
lda = LDA(store_covariance=True)
lda.fit(X_train,y_train)
lda_pred = lda.predict(X_test)
print(np.mean(lda_pred == y_test))
confusion_table(lda_pred,y_test)

0.8235294117647058


Truth,0,1
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
0,21,7
1,2,21


In [126]:
nb = GaussianNB()
nb.fit(X_train,y_train)
nb_pred = nb.predict(X_test)
print(np.mean(nb_pred==y_test))
confusion_table(nb_pred,y_test)

0.8627450980392157


Truth,0,1
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
0,20,4
1,3,24


In [127]:
for K in range(1,11):
    knn = KNeighborsClassifier(n_neighbors=K)
    knn.fit(X_train,y_train)
    knn_pred = knn.predict(X_test)
    print(np.mean(knn_pred==y_test))
#Seems to perform best for K=4

0.9411764705882353
0.9411764705882353
0.9411764705882353
0.9411764705882353
0.9411764705882353
0.9411764705882353
0.9019607843137255
0.9019607843137255
0.9019607843137255
0.9019607843137255


In [128]:
knn = KNeighborsClassifier(n_neighbors=4)
knn.fit(X_train,y_train)
knn_pred = knn.predict(X_test)
print(np.mean(knn_pred==y_test))
confusion_table(knn_pred,y_test)

0.9411764705882353


Truth,0,1
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
0,23,3
1,0,25


In [129]:
logit = LogisticRegression(C=1e10,solver='liblinear')
logit.fit(X_train,y_train)
logit_pred = logit.predict(X_test)
print(np.mean(logit_pred == y_test))
confusion_table(logit_pred,y_test)

0.8235294117647058


Truth,0,1
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
0,20,6
1,3,22
