## Imports

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy import stats
import random

from sklearn.neighbors import KNeighborsClassifier, DistanceMetric
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.preprocessing import normalize
from sklearn.decomposition import PCA

import sys
import time
import math

%config InlineBackend.figure_format='svg'
%matplotlib inline

## Import Data

In [2]:
def readCoverType (filename):
    """Load a comma-separated Cover Type file.

    Each row is one sample; the last column is taken as the label and every
    preceding column as a feature.

    Returns:
        (feature, label): a 2-D array of features and a 1-D array of labels,
        both copies of the parsed table.
    """
    table = np.genfromtxt(filename, delimiter=',')
    return np.array(table[:, :-1]), np.array(table[:, -1])

def readMNIST (filename):
    """Load a comma-separated MNIST file that is stored one sample per column.

    The parsed table is transposed so that each row becomes one sample; the
    last column of the transposed table is the label, the rest are features.

    Returns:
        (feature, label): a 2-D array of features and a 1-D array of labels.
    """
    samples = np.genfromtxt(filename, delimiter = ",").transpose()
    return np.array(samples[:, :-1]), np.array(samples[:, -1])

In [3]:
# Read the MNIST train and test CSV files into (feature, label) array pairs.
print('Loading MNIST data... ', end='')
MNIST_train_feature, MNIST_train_label = readMNIST('./MNIST/train.csv')
MNIST_test_feature, MNIST_test_label = readMNIST('./MNIST/test.csv')
print('done.')

Loading MNIST data... done.


In [4]:
# Pair every MNIST sample with its label so the two stay aligned when the
# sets are merged and shuffled later on.
print('Split MNIST into specific outputs... ', end='')
MNIST_train_list = [
    (sample, target) for sample, target in zip(MNIST_train_feature, MNIST_train_label)
]
MNIST_test_list = [
    (sample, target) for sample, target in zip(MNIST_test_feature, MNIST_test_label)
]
print('done.')

Split MNIST into specific outputs... done.


In [5]:
# Read the Cover Type file and cut it, in file order, into the first 75% of
# rows (training) and the remaining 25% (test).
print('Loading Cover Type data... ', end='')
CovType_total_feature, CovType_total_label = readCoverType('./covtype.data/covtype.data')
split = int(CovType_total_feature.shape[0] * 0.75)

CovType_train_feature, CovType_test_feature = (
    CovType_total_feature[:split],
    CovType_total_feature[split:],
)
CovType_train_label, CovType_test_label = (
    CovType_total_label[:split],
    CovType_total_label[split:],
)
print('done.')

Loading Cover Type data... done.


In [6]:
# Pair every Cover Type sample with its label so the two stay aligned when the
# sets are merged and shuffled later on.
# The progress message used to be a copy of the MNIST cell's text and named the
# wrong dataset; it now reports the dataset this cell actually processes.
sys.stdout.write('Split Cover Type into specific outputs... ')
CovType_train_list = list(zip(CovType_train_feature, CovType_train_label))
CovType_test_list = list(zip(CovType_test_feature, CovType_test_label))
print ('done.')

Split MNIST into specific outputs... done.


In [5]:
def shuffle (featurelist, labellist):
    """Shuffle features and labels together, keeping each pair aligned.

    Args:
        featurelist: sequence of feature vectors.
        labellist: sequence of labels, one per feature vector. If the two
            sequences differ in length, the extra items of the longer one are
            dropped (zip semantics).

    Returns:
        (features, labels) as numpy arrays in the same random order. Two empty
        arrays are returned when there is nothing to shuffle.

    Uses the global `random` module state, so seed it for reproducible output.
    """
    merged = list(zip(featurelist, labellist))
    if not merged:
        # zip(*[]) yields nothing to unpack, so handle the empty case explicitly.
        return np.array([]), np.array([])
    random.shuffle(merged)
    shuffled_features, shuffled_labels = zip(*merged)
    return np.array(shuffled_features), np.array(shuffled_labels)

def mergeAndShuffle(list1, list2):
    """Concatenate two lists of (feature, label) pairs and shuffle the result.

    The pairs of `list1` come first, then those of `list2`; the combined
    features and labels are handed to `shuffle`, whose return value
    (a feature array and a label array) is passed straight back.
    """
    combined = list(list1) + list(list2)
    featurelist = [feature for (feature, _) in combined]
    labellist = [label for (_, label) in combined]
    return shuffle (featurelist, labellist)

# K-NN

## Choose Dataset

In [6]:
# Pool the MNIST test and train (feature, label) pairs and shuffle them into one
# feature array and one label array; the following cells re-split this pool.
feature, label = mergeAndShuffle(MNIST_test_list, MNIST_train_list)

### Select Test Set

In [7]:
# Hold out the first quarter of the shuffled pool as the test set and keep the
# remaining three quarters for training.
quarter = label.shape[0] // 4

test_feature, train_feature = feature[:quarter], feature[quarter:]
test_label, train_label = label[:quarter], label[quarter:]

7058


### Select Training Set

In [10]:
# Randomly select a subset of d% from the training data where d={50,75,100}.
# train_data[0], train_data[1] and train_data[2] hold the 50%, 75% and (up to rounding)
# 100% samples, each as a list of (feature, label) tuples drawn without replacement.
# TODO: the assignment asks for five training sets per d% (one for 100%); only one
# subset per d% is drawn here.
quarter = int (train_label.shape[0] / 4)

# Zip once and reuse: the pair list is the same for every subset size, so there is
# no need to rebuild it inside the comprehension.
train_pairs = list(zip(train_feature, train_label))
train_data = [random.sample (train_pairs, i * quarter) for i in [2, 3, 4]]

5294


## Parameter Estimation

### Determine Best K

In [15]:
# Baseline check: a 3-nearest-neighbour classifier to be fitted on the first training
# subset (train_data[0], built from 2 * quarter samples) and scored on the held-out test split.
clf = KNeighborsClassifier(n_neighbors=3) 
# NOTE(review): this rebinds `train_label` to a tuple of the labels in train_data[0].
# The "Select Training Set" cell reads `train_label.shape`, so re-running that cell
# after this one would fail on a fresh look-up of `.shape`.
train_feat, train_label = zip(*(train_data[0]))
val_feat = test_feature
y_true = test_label

In [17]:
# Fit the baseline classifier on the features/labels unpacked from train_data[0].
clf.fit (train_feat, train_label) # train

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=3, p=2,
           weights='uniform')

In [18]:

# Predict the held-out test split and print the fraction of correct predictions.
y_pred = clf.predict(val_feat) # predict
print(accuracy_score(y_true, y_pred))

0.986681779541


In [39]:
def nFold (train_data, K, n = 2, dist = DistanceMetric.get_metric('euclidean')):
    """Estimate the K-NN classification error with n-fold cross-validation.

    The samples are cut into `n` contiguous folds that together cover every
    sample (fold sizes differ by at most one). Each fold is used once as the
    validation set while a fresh K-nearest-neighbour classifier is trained on
    the remaining folds only.

    Args:
        train_data: sequence of (feature, label) pairs. It is not modified.
        K: number of neighbours used by the classifier.
        n: number of folds; must satisfy 2 <= n <= len(train_data).
        dist: currently unused; kept so existing calls keep working.

    Returns:
        The mean misclassification rate (1 - accuracy) over the n folds, as a
        float in [0, 1]. Callers in this notebook minimise and report this
        value as an error.

    Raises:
        ValueError: if `n` is smaller than 2 or larger than the sample count.
    """
    samples = list(train_data)  # work on a copy so the caller's list never grows
    total = len(samples)
    if n < 2 or n > total:
        raise ValueError(
            'n must be between 2 and len(train_data); got n=%d for %d samples' % (n, total))

    # Fold i covers samples[bounds[i]:bounds[i + 1]]; integer arithmetic spreads
    # any remainder across the folds instead of leaving a leftover chunk unused.
    bounds = [(fold * total) // n for fold in range(n + 1)]

    fold_errors = []
    for fold in range(n):
        start, stop = bounds[fold], bounds[fold + 1]
        val_data = samples[start:stop]
        # Train on everything except the validation fold to avoid leakage.
        fit_data = samples[:start] + samples[stop:]

        fit_feat, fit_label = zip(*fit_data)
        val_feat, y_true = zip(*val_data)

        clf = KNeighborsClassifier(n_neighbors=K)
        clf.fit (fit_feat, fit_label) # train
        y_pred = clf.predict(val_feat) # predict
        fold_errors.append(1.0 - accuracy_score(y_true, y_pred))

    return sum(fold_errors) / n

def leaveOneOut (train_data, K):
    """Placeholder for leave-one-out cross-validation.

    No validation is performed yet: both arguments are ignored and a constant
    0.0 is returned.
    """
    placeholder_error = 0.0
    return placeholder_error

In [None]:

# Search for the K whose averaged cross-validation value is the smallest.
# For every training subset (50%, 75%, 100%) and every candidate K, the values
# returned by nFold for 2, 5 and 4 folds are averaged and treated as an error.
min_error = 1.0
best_k = 1
for percent in [50, 75, 100]:
    d = int(percent / 25 - 2)  # 50 -> 0, 75 -> 1, 100 -> 2: index into train_data
    for K in range (10, 30, 10):
        fold_scores = [nFold (train_data[d], K, n = folds) for folds in (2, 5, 4)]
        error = sum(fold_scores) / 3.0
        if error < min_error: # remember k for the lowest average
            best_k, min_error = K, error

In [None]:
# Report the cross-validation values obtained with the selected best_k.
# NOTE(review): `d` is whatever index the best-K search loop left behind (its last
# value), not necessarily the subset on which best_k was found.
print ('The 2-fold error is: ', nFold (train_data[d], best_k, n = 2))
print ('The 5-fold error is: ', nFold (train_data[d], best_k, n = 5))
print ('The leave-one-out error is: ', leaveOneOut (train_data[d], best_k))

### Determine Best Distance Metric

## Test K-NN

In [None]:
# Now, test the test data with the best K and its associated training data and compare the error rate to cross validation error.
#   Report which F estimated the test error the best.

# Resources

In [None]:
# Assignment: 
#   https://nbviewer.jupyter.org/github/CSE291/CSE291/blob/master/hw/hw3/hw3.ipynb

# Library K-NN:
#   http://scikit-learn.org/stable/modules/neighbors.html
#   http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html#sklearn.neighbors.KNeighborsClassifier

# Custom K-NN:
#   http://www.kdnuggets.com/2016/01/implementing-your-own-knn-using-python.html
#   http://machinelearningmastery.com/tutorial-to-implement-k-nearest-neighbors-in-python-from-scratch/

# Cross-Validation Techniques
#   https://www.cs.cmu.edu/~schneide/tut5/node42.html

# Bugs:
# Description: 
#   solution: