In [1]:
# Author: Miriam Heller <mheller8@gatech.edu>
# OMSCS 7641

# Application to scikit-learn's built-in 0-9 digit dataset
from sklearn.neighbors import KNeighborsClassifier

# Standard scientific Python imports
import numpy as np
import matplotlib.pyplot as plt
# %matplotlib inline
from sklearn import cross_validation
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

# Import digits dataset
from sklearn.datasets import load_digits

In [2]:
#
# Goal: Apply the k-NN classifier to two datasets. Evaluate the models while varying hyperparameter values.
# Learn how to select the best model for each algorithm and dataset. Compare the performance of the k-NN
# algorithm on the different datasets and determine whether one type of data was more amenable to the
# algorithm than the other. Through analysis, speculate on or explain why.
#
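# Dataset 1 = scikit-learn's built-in handwritten digits dataset (digits 0-9, loaded with load_digits and
# referred to as "MNIST" in this notebook; it is a copy of the UCI optical-recognition digits test set
# rather than a subset of MNIST proper). It consists of 1797 samples, with each digit depicted/stored as
# an 8x8 grey-scale image whose pixel intensities range from 0 to 16. Thus each sample has 64 features.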
#
# Parameters to vary for model selection:
#
# • k - number of nearest neighbors to use to estimate target
#
# • Learning (Split on training/CV/test) - 70%/30%, 80%/20%, 85%/15%, 90%/10% training/test set scheme.
#   Reduce overfitting by constraining minimum samples_split to 5% of sample to be fit (90%).
#
# • d(x,q) - definition of distance, e.g., Manhattan, Euclidean, Distance-Weighted, etc.
#
# Models will be evaluated and selected based on the analysis of the results of parameter variation in
# terms of bias, variance and learning curves.
#

In [3]:

# Exploratory data analysis to verify the dataset.
#
# Load the scikit-learn handwritten digits data (called "MNIST" throughout
# this notebook) and verify its dimensions.
data_set_name = "MNIST Digit Dataset"
digits = load_digits()

# digits.data holds one row per sample and one column per feature (pixel).
samples, features = digits.data.shape
classes = digits.target_names.shape[0]

# Single-argument print(...) calls produce the same text under the Python 2
# print statement and the Python 3 print function, so this cell runs on both.
print(samples)
print(features)
print(classes)
print(digits.target_names)
print(digits.data[1:2])  # second sample only, kept as a 2-D (1 x features) slice
print(digits.target.shape[0])

# Sanity check: there must be exactly one target label per sample.
assert digits.target.shape[0] == samples, "sample/label count mismatch"

1797
64
10
[0 1 2 3 4 5 6 7 8 9]
[[  0.   0.   0.  12.  13.   5.   0.   0.   0.   0.   0.  11.  16.   9.
    0.   0.   0.   0.   3.  15.  16.   6.   0.   0.   0.   7.  15.  16.
   16.   2.   0.   0.   0.   0.   1.  16.  16.   3.   0.   0.   0.   0.
    1.  16.  16.   6.   0.   0.   0.   0.   1.  16.  16.   6.   0.   0.
    0.   0.   0.  11.  16.  10.   0.   0.]]
1797


In [4]:
# Randomly partition the digits data into a training set and a held-out set,
# then halve the held-out set into a cross-validation set and a final test
# set. The test set is reserved for final model testing only.
# (An earlier version of this note referred to "the iris data"; this cell
# operates on the digits data. It also mentioned running a K-fold CV in
# addition to the reserved hold-out test.)
#
# NOTE(review): sklearn.cross_validation appears to be deprecated in newer
# scikit-learn releases in favour of sklearn.model_selection -- confirm
# against the installed version before upgrading.
rand = 42       # seed shared by both splits so the partition is reproducible
hold_out = 0.4  # fraction of all samples withheld from training

X_train, X_holdout, y_train, y_holdout = cross_validation.train_test_split(
    digits.data,
    digits.target,
    test_size=hold_out,
    random_state=rand,
)

# Split the held-out samples evenly into a cross-validation set and a test set.
X_CV, X_test, y_CV, y_test = cross_validation.train_test_split(
    X_holdout,
    y_holdout,
    test_size=0.5,
    random_state=rand,
)

In [5]:
# Report the overall dataset dimensions and the shape of every split.
#
# Each print(...) call receives a single, pre-formatted argument so that it
# behaves identically under the Python 2 print statement and the Python 3
# print function. '%s %s' reproduces the separating space that the original
# comma-separated print statement inserted between label and value.
print('%s %s' % ('samples = ', digits.data.shape[0]))
print('%s %s' % ('features = ', digits.data.shape[1]))
print('%s %s' % ('targets = ', digits.target.shape[0]))

# Shapes in the order: training, cross-validation, test (X then y for each).
for part in (X_train, y_train, X_CV, y_CV, X_test, y_test):
    print(part.shape)

# Sanity checks: features and labels stay aligned within each split, and the
# three splits together account for every sample in the dataset.
assert X_train.shape[0] == y_train.shape[0]
assert X_CV.shape[0] == y_CV.shape[0]
assert X_test.shape[0] == y_test.shape[0]
assert (X_train.shape[0] + X_CV.shape[0] + X_test.shape[0]
        == digits.data.shape[0]), "splits do not partition the dataset"

samples =  1797
features =  64
targets =  1797
(1078, 64)
(1078,)
(359, 64)
(359,)
(360, 64)
(360,)
