# CS 178 Project part 1 KNN

In [1]:
import numpy as np

import mltools as ml
import sys
sys.path.append('code')

import matplotlib.pyplot as plt 

In [2]:
def print_error(learner, data):
    """Print the learner's error on the training split, then the testing split.

    ``data`` is unpacked as ``(Xtr, Xte, Ytr, Yte)`` and ``learner.err`` is
    called once per split, training first.
    """
    train_x, test_x, train_y, test_y = data
    reports = (
        ("Trainning error = ", train_x, train_y),
        ("Testing error = ", test_x, test_y),
    )
    for caption, features, targets in reports:
        print(caption, learner.err(features, targets))

In [13]:
def plot_roc(learner, data, file=""):
    """Draw the training and testing ROC curves of ``learner`` on one figure.

    ``data`` is unpacked as ``(Xtr, Xte, Ytr, Yte)``.  When ``file`` is not
    the empty string the figure is also written to ``output/<file>.png``
    before being shown.
    """
    train_x, test_x, train_y, test_y = data
    curves = (
        ("training roc", train_x, train_y),
        ("testing roc", test_x, test_y),
    )
    for caption, features, targets in curves:
        # roc() returns three values; only the first two are plotted.
        false_pos, true_pos, _ = learner.roc(features, targets)
        plt.plot(false_pos, true_pos, label=caption)
    plt.title("ROC curve")
    plt.legend()
    if file != "":
        plt.savefig('output/{}.png'.format(file))
    plt.show()

In [14]:
def generating_output(learner, filename="output", feature_space=None):
    """Write the learner's soft predictions for ``data/X_test.txt`` to a file.

    Parameters
    ----------
    learner : object exposing ``predictSoft(X)``; column 1 of its result is
        written as the prediction.
    filename : output path without extension; ``.txt`` is appended.
    feature_space : optional sequence of column indices to keep from the
        test matrix before predicting.  ``None`` (the default) keeps every
        column.

    The output has a ``Id,Predicted`` header followed by one
    ``<row index>, <prediction with two decimals>`` line per test row.
    """
    Xte = np.genfromtxt('data/X_test.txt', delimiter=',')
    # The default used to be computed from the global training matrix when
    # the function was defined, which fails if that global does not exist
    # yet and silently goes stale if it changes.  Resolve it per call.
    if feature_space is not None:
        Xte = Xte[:, feature_space]
    row_ids = np.arange(Xte.shape[0])
    Yte = np.vstack((row_ids, learner.predictSoft(Xte)[:, 1])).T
    np.savetxt(
        "{}.txt".format(filename),
        Yte,
        '%d, %.2f',
        header='Id,Predicted',
        comments='',
        delimiter=',',
    )

In [6]:
# Seed NumPy's global RNG before the data is shuffled.
np.random.seed(0)
X = np.genfromtxt('data/X_train.txt', delimiter=',')
Y = np.genfromtxt('data/Y_train.txt', delimiter=',')
X, Y = ml.shuffleData(X, Y)
# 75% training and 25% testing; `data` keeps the four pieces bundled for
# the helpers that take them as one argument.
data = ml.splitData(X, Y, 0.75)
Xtr, Xte, Ytr, Yte = data

In [None]:
# Baseline KNN classifier trained on every column of the training split.
naive_knn = ml.knn.knnClassify()
naive_knn.train(Xtr,Ytr)
# Initial neighbour count; the K sweep in a later cell overwrites it.
naive_knn.K = 10

k = 2^4 = 16 seems like a good choice; we will stick with that.

In [None]:
# Sweep K over powers of two (1 .. 512), recording the training and testing
# error of the already-trained baseline classifier for each value.
k_values = [2 ** exponent for exponent in range(10)]
knn_tr_error, knn_te_error = [], []
for k in k_values:
    naive_knn.K = k
    tr_err = naive_knn.err(Xtr, Ytr)
    knn_tr_error.append(tr_err)
    te_err = naive_knn.err(Xte, Yte)
    knn_te_error.append(te_err)
    print("k = {} done.".format(k))

In [None]:
# Error rate of the baseline KNN against log2(K).
# The x positions are taken from the recorded results rather than a
# hard-coded 10 so the plot follows the sweep if its length changes.
exponents = range(len(knn_tr_error))
plt.plot(exponents, knn_tr_error, label="training error")
plt.plot(exponents, knn_te_error, label="testing error")
plt.title("k value vs error rate")
plt.xlabel("K value = 2**x")
plt.ylabel("error rate")
# The legend must exist before savefig, otherwise the PNG is written
# without it.
plt.legend()
plt.savefig('output/kvalue_error.png')
plt.show()

### Well, naive KNN does not seem to be doing well; we need some feature selection

The first thing that comes to mind is the ensemble technique, which passes only a portion of the features to each learner.

In [None]:
# KNN restricted to a random subset of the feature columns
class KNN_f(ml.knn.knnClassify):
    """KNN classifier that trains on ``features`` randomly chosen columns.

    The chosen column indices are kept, sorted, in ``selected_feature`` and
    applied again at prediction time.
    """

    def __init__(self, X=None, Y=None, features=1, K=1):
        """Pick ``features`` distinct columns of ``X`` and train on them."""
        n_columns = X.shape[1]
        picked = np.random.choice(range(n_columns), features, replace=False)
        self.selected_feature = sorted(picked)
        super().__init__(X[:, self.selected_feature], Y, K)

    def predictSoft(self, X):
        """Return the parent's soft predictions on the selected columns.

        An input whose width already equals the number of selected columns
        is passed through unchanged; anything else is sliced first.
        """
        already_reduced = X.shape[1] == len(self.selected_feature)
        columns = X if already_reduced else X[:, self.selected_feature]
        return super().predictSoft(columns)

In [None]:
class E_knn(ml.classifier):
    """Bagged ensemble of plain KNN classifiers.

    Each member is an ``ml.knn.knnClassify`` trained on its own bootstrap
    sample of the training data; the ensemble's soft prediction is the mean
    of the members' soft predictions.

    Parameters
    ----------
    Xtr, Ytr : training features and labels.
    nbags : number of ensemble members.
    features : stored as ``features_number`` but not used when building the
        members (every member sees all columns).
    K : neighbour count passed to every member.
    partition : each bootstrap sample has ``Xtr.shape[0] // partition`` rows.
    """

    def __init__(self, Xtr, Ytr, nbags=1, features=1, K=1, partition=1):
        self.features_number = features
        self.nbags = nbags
        # Size the bootstrap samples from the data that was passed in; the
        # previous code read a module-level `X` instead, which tied the
        # class to whatever global happened to be loaded.
        self.n_boot = Xtr.shape[0] // partition
        self.classes = list(np.unique(Ytr))
        self.classifiers = []
        for _ in range(nbags):
            Xi, Yi = ml.bootstrapData(Xtr, Ytr, self.n_boot)
            self.classifiers.append(ml.knn.knnClassify(Xi, Yi, K))

    def predictSoft(self, X):
        """Average the members' soft predictions for every row of ``X``."""
        # One slot per (row, member, class); the class count comes from the
        # training labels instead of being fixed at 2.
        votes = np.zeros((X.shape[0], self.nbags, len(self.classes)))
        for i, member in enumerate(self.classifiers):
            votes[:, i] = member.predictSoft(X)
        return np.mean(votes, axis=1)

In [None]:
# Test AUC of the bagged KNN as the number of members doubles (1 .. 2048).
nbags = [2 ** exponent for exponent in range(12)]
auc_list = []
for nb in nbags:
    a = E_knn(Xtr, Ytr, nbags=nb, features=107, K=1, partition=100)
    auc_list.append(a.auc(Xte, Yte))
    print("nb = {} done.".format(nb))

In [None]:
# Test AUC against log2(number of ensemble members).
plt.plot(auc_list)
plt.xlabel("number of learner = 2**x")
plt.ylabel("AUC value")
plt.title("number of learner vs AUC value")
plt.show()

In [None]:
# Error rate of the baseline KNN against log2(K), saved as output/knn.png.
plt.plot(range(10), knn_tr_error, label="training error")
plt.plot(range(10), knn_te_error, label="testing error")
plt.legend()
plt.xlabel("K value = 2**x")
plt.ylabel("error rate")
# The title has to be set before savefig; it used to be set afterwards, so
# the PNG on disk had no title.
plt.title("k value vs error rate")
plt.savefig('output/knn.png')
plt.show()

In [None]:
# Test AUC against log2(number of ensemble members), saved as
# output/eknn.png.
plt.plot(auc_list, label="testing auc")
plt.title("number of learner vs AUC value")
plt.ylabel("AUC value")
plt.xlabel("number of learner (2**x)")
plt.legend()
plt.savefig('output/eknn.png')
plt.show()

In [None]:
class E_knn_limited(ml.classifier):
    """Bagged ensemble of KNN classifiers that each see a feature subset.

    Each member is a ``KNN_f`` trained on its own bootstrap sample of the
    training data and its own random choice of ``features`` columns; the
    ensemble's soft prediction is the mean of the members' soft predictions.

    Parameters
    ----------
    Xtr, Ytr : training features and labels.
    nbags : number of ensemble members.
    features : number of columns each member is allowed to use.
    K : neighbour count passed to every member.
    partition : each bootstrap sample has ``Xtr.shape[0] // partition`` rows.
    """

    def __init__(self, Xtr, Ytr, nbags=1, features=1, K=1, partition=1):
        self.features_number = features
        self.nbags = nbags
        # Size the bootstrap samples from the data that was passed in rather
        # than from a module-level `X`.
        self.n_boot = Xtr.shape[0] // partition
        self.classes = list(np.unique(Ytr))
        self.classifiers = []
        for _ in range(nbags):
            Xi, Yi = ml.bootstrapData(Xtr, Ytr, self.n_boot)
            # KNN_f is defined in this file, not inside ml.knn; looking it
            # up as `ml.knn.KNN_f` raised AttributeError.
            self.classifiers.append(KNN_f(Xi, Yi, features, K))

    def predictSoft(self, X):
        """Average the members' soft predictions for every row of ``X``."""
        # One slot per (row, member, class); the class count comes from the
        # training labels instead of being fixed at 2.
        votes = np.zeros((X.shape[0], self.nbags, len(self.classes)))
        for i, member in enumerate(self.classifiers):
            votes[:, i] = member.predictSoft(X)
        return np.mean(votes, axis=1)

In [None]:
# Test AUC of a 256-member ensemble for a coarse range of feature counts
# (107 // 2, 107 // 4, ..., 107 // 18).
# NOTE(review): E_knn stores `features` but builds plain knnClassify
# members, so the feature count has no effect on the model trained here;
# E_knn_limited looks like the intended class - confirm.
features = [107 // (2 * divisor) for divisor in range(1, 10)]
auc_list_limited = []
for f in features:
    a = E_knn(Xtr, Ytr, nbags=256, features=f, K=1, partition=100)
    auc_list_limited.append(a.auc(Xte, Yte))
    print("feature = {} done.".format(f))

In [None]:
# Test AUC against the number of features given to each ensemble member.
plt.plot(features, auc_list_limited, label="testing auc")
# The x values are the feature counts themselves, not exponents, so the
# label no longer claims a 2**x scale.
plt.xlabel("number of features used")
plt.ylabel("AUC value")
plt.title("number of features used vs AUC value")
# The legend must exist before savefig, otherwise the PNG is written
# without it.
plt.legend()
plt.savefig('output/feature_auc.png')
plt.show()

In [None]:
# Finer sweep over feature counts 17..25 with a 256-member ensemble.
# NOTE(review): E_knn stores `features` but builds plain knnClassify
# members, so the feature count has no effect on the model trained here;
# E_knn_limited looks like the intended class - confirm.
new_features = list(range(17, 26))
auc_list_limited_new = []
for f in new_features:
    a = E_knn(Xtr, Ytr, nbags=256, features=f, K=1, partition=100)
    auc_list_limited_new.append(a.auc(Xte, Yte))
    print("feature = {} done.".format(f))

In [None]:
# Test AUC for the fine feature-count sweep, saved as output/auc_2.png.
plt.plot(new_features, auc_list_limited_new, label="testing auc")
# Set the title once, before saving.  A second, different title used to be
# applied after savefig, so the PNG and the displayed figure disagreed.
plt.title("number of features used vs AUC value")
plt.ylabel("AUC value")
plt.legend()
plt.savefig('output/auc_2.png')
plt.show()

In [None]:
# Train the chosen 256-member ensemble and save its ROC curves as
# output/roc_f21.png.
# NOTE(review): E_knn does not use `features` when building its members,
# so the 21 here does not restrict the columns - confirm the intended class.
a = E_knn(Xtr, Ytr, nbags=256, features=21, K=1, partition=100)
plot_roc(a, data, file="roc_f21")

In [None]:
# Write the ensemble's predictions for the test file to knn_out.txt.
generating_output(a, filename="knn_out")

feature #  0 done
feature #  1 done
feature #  2 done
feature #  3 done
feature #  4 done
feature #  5 done
feature #  6 done
feature #  7 done
feature #  8 done
feature #  9 done
feature #  10 done
feature #  11 done
feature #  12 done
feature #  13 done
feature #  14 done
feature #  15 done
feature #  16 done
feature #  17 done
feature #  18 done
feature #  19 done
feature #  20 done
feature #  21 done
feature #  22 done
feature #  23 done
feature #  24 done
feature #  25 done
feature #  26 done
feature #  27 done
feature #  28 done
feature #  29 done
feature #  30 done
feature #  31 done
feature #  32 done
feature #  33 done
feature #  34 done
feature #  35 done
feature #  36 done
feature #  37 done
feature #  38 done
feature #  39 done
feature #  40 done
feature #  41 done
feature #  42 done
feature #  43 done
feature #  44 done
feature #  45 done
feature #  46 done
feature #  47 done
feature #  48 done
feature #  49 done
feature #  50 done
feature #  51 done
feature #  52 done
fea

In [32]:
# View the first training feature as a single-column 2-D array.
Xtr[:, 0].reshape(-1, 1)

array([[ 833.],
       [3472.],
       [1316.],
       ...,
       [ 560.],
       [1449.],
       [ 966.]])

In [44]:
# Load the comma-separated table stored in feature_score.txt.
tr = np.genfromtxt('feature_score.txt', delimiter=',')

In [51]:
# Largest value in column 1 of the loaded score table.
tr[:, 1].max()

0.5237068965517241

In [4]:
from sklearn import preprocessing
# Seed NumPy's global RNG before the data is shuffled.
np.random.seed(0)
X = np.genfromtxt('data/X_train.txt', delimiter=',')
# NOTE(review): the result of this expression is discarded, so X appears to
# stay un-normalized for everything below - confirm whether
# `X = preprocessing.normalize(X, axis=0)` was intended (the test file read
# elsewhere would then need the same scaling).
preprocessing.normalize(X,axis=0)[0]
Y = np.genfromtxt('data/Y_train.txt', delimiter=',')
X,Y = ml.shuffleData(X,Y)
data = Xtr,Xte,Ytr,Yte = ml.splitData(X,Y,0.5) # 50% training and 50% testing

In [22]:
# Column indices of the 18 features this model is restricted to; named once
# so training and evaluation cannot drift apart.
selected_features = [28, 44, 49, 74, 54, 68, 73, 70, 60, 94, 57, 1, 43, 53, 10, 71, 42, 56]
knn = ml.knn.knnClassify()
knn.train(Xtr[:, selected_features], Ytr)
# The neighbour count is the upper-case `K` attribute (as set on the
# baseline classifier earlier in this file); the previous lower-case `k`
# only created an unused attribute.
knn.K = 1
knn.auc(Xte[:, selected_features], Yte)

0.5764513395169264

In [23]:
# Write predictions of the 18-feature KNN to knnout_18_features.txt, using
# the same column subset on the test file.
top_features = [28, 44, 49, 74, 54, 68, 73, 70, 60, 94, 57, 1, 43, 53, 10, 71, 42, 56]
generating_output(knn, "knnout_18_features", feature_space=top_features)