We begin by writing a naive KNN classifier.

In [24]:
import numpy as np
from scipy.stats import mode

class KNN:
    def __init__(self, features, labels):
        self.features = features
        self.labels = labels
    
    def classify(self, k, point):
        dists = np.sum((self.features - point)**2, axis=1)
        neighbor_inds = np.argsort(dists)[:min(k, len(dists))]
        neighbor_labels = self.labels[neighbor_inds]
        return mode(neighbor_labels)[0][0]
        
    def classify_arr(self, k, points):
        npoints = points.shape[0]
        res = np.zeros(npoints, dtype=self.labels.dtype)
        for i in xrange(npoints):
            res[i] = self.classify(k, points[i])
        return res.astype(self.labels.dtype)
    


We evaluate the performance of the classifier on the iris dataset.

In [27]:
from sklearn.datasets import load_iris
iris = load_iris()

features = iris['data']
labels = iris['target']

def split_data(features, labels, split_factor=.6):
    npoints = len(labels)
    cutoff = int(npoints*split_factor)
    mask = np.arange(npoints)
    np.random.shuffle(mask)
    train_mask = mask[:cutoff]
    test_mask = mask[cutoff:]
    return features[train_mask], labels[train_mask], features[test_mask], labels[test_mask]

trainf, trainl, testf, testl = split_data(features, labels)
kmodel = KNN(trainf, trainl)
print "KNN Accuracy on Iris Dataset %f" % (np.sum(kmodel.classify_arr(2, testf) == testl)/float(len(testl)))

KNN Accuracy on Iris Dataset 0.950000


Now that we've destroyed the Iris dataset with KNN, we move onto harder stuff: SVM's and tumor classification.

In [56]:
import cvxopt
from cvxopt.solvers import qp

def solve_SVM(kernel, X, Y):
    n_samples = len(Y)
    K = np.zeros((n_samples,n_samples))
    for i in xrange(n_samples):
        for j in xrange(n_samples):
            K[i,j] = kernel(X[i,:], X[j,:])
    Q = cvxopt.matrix(np.outer(Y, Y) * K)
    q = cvxopt.matrix(np.ones(n_samples) * -1)
    A = cvxopt.matrix(Y, (1, n_samples))
    b = cvxopt.matrix(0.0)
    G = cvxopt.matrix(np.diag(np.ones(n_samples) * -1))
    h = cvxopt.matrix(np.zeros(n_samples))
    solution = qp(Q, q, G, h, A, b)
    return np.ravel(solution['x'])

class SVM:
    def __init__(self, X, Y):
        self.X = X
        self.Y = Y
        self.kernel = None
        self.a = None
    
    def setKernel(self, kernel):
        self.kernel = kernel
        
    def train(self):
        self.a = solve_SVM(kernel, self.X, self.Y)
    
    def classify_arr(self, X):
        npoints = X.shape[0]
        res = np.zeros(npoints)
        for i in xrange(npoints):
            res[i] = self.classify(X[i])
        return res
        
        
    def classify(self, x):
        npoints = len(self.Y)
        tot = 0
        for i in xrange(npoints):
            tot += self.Y[i] * self.a[i] * self.kernel(x, self.X[i])
        if tot > 0: return 1
        else: return -1
        


We now load in the cancer tumor dataset 'provided' for this lab.

In [57]:
import pandas as pd
cancer_data = pd.read_csv("cancer.csv")
#for col in cancer_data.columns: print col, cancer_data[col].dtype
cancer_data['bare-nuclei'] = cancer_data['bare-nuclei'].convert_objects(convert_numeric=True)
cancer_data['bare-nuclei'] = cancer_data['bare-nuclei'].fillna(cancer_data['bare-nuclei'].median())
raw_mat = cancer_data.as_matrix()
X = raw_mat[:,1:-1]
Y = raw_mat[:,-1]
cancer_data.columns

Index([u'Unnamed: 0', u'clump-thickness', u'uniformity-of-cell-size', u'uniformity-of-cell-shape', u'marginal-adhesion', u'single-epithelial-cell-size', u'bare-nuclei', u'bland-chromatin', u'normal-nucleoli', u'mitoses', u'cancerous'], dtype='object')

In [86]:
from numpy.linalg import norm
trainX, trainY, testX, testY = split_data(X,Y)
SVM_model = SVM(trainX, trainY)

cvxopt.solvers.options['show_progress']=False

In [87]:
def test_svm(kernel, kernel_name):
    SVM_model.setKernel(kernel)
    SVM_model.train()
    acc = np.sum(SVM_model.classify_arr(testX) == testY)/float(len(testY))
    print "SVM with kernel "+ kernel_name+" gets an accuracy of %f" % (acc)

gam = .4
kernel = lambda x, y: np.exp(- gam * norm(x-y)**2)
test_svm(kernel, "gaussian, gamma=%f"%gam)


SVM with kernel gaussian, gamma=0.400000 gets an accuracy of 0.950000


In [88]:
a = -10
d = -1.5
test_svm(lambda x , y: (x.dot(y) +a)**d , "polynomial, a=%f, d=%f"%(a,d))

SVM with kernel polynomial, a=-10.000000, d=-1.500000 gets an accuracy of 0.653571


In [89]:
from math import tanh
r = -24
test_svm(lambda x,y : -tanh(x.dot(y) + r), "sigmoid, r=%f"%r)

SVM with kernel sigmoid, r=-24.000000 gets an accuracy of 0.935714


After tuning of parameters, I get 90%+ accuracy with sigmoid and gaussian kernels. However, the polynomial kernel only yields 60-70% accuracy.

In [98]:
knn_model = KNN(trainX, trainY)
k=3
print "KNN accuracy with k= %d: %f " % (k, np.sum(testY==knn_model.classify_arr(k, testX))/float(len(testY)))

KNN accuracy with k= 3: 0.967857 


Interestingly, KNN acheives better accuracy than all of the SVM methods on this dataset!