## Training NMNIST dataset using sklearn 

In [1]:
import numpy as np
import os
import matplotlib.pyplot as plt
import time
import datetime as dt
from past.builtins import xrange

#Classifier
from sklearn import neighbors
#Cross validation 
from sklearn.model_selection import cross_val_score
#colour map for visualisation
from matplotlib.colors import ListedColormap

In [2]:
# Import whole N-MNIST Dataset
def load_NMNIST(path):
    xs_train = []
    ys_train = []
    xs_test = []
    ys_test = []

    for class_index in range(0, 10):
        for (root, dirs, dat_files) in os.walk('{0}/n_Train_3/{1}'.format(path, str(class_index))):
            for file in dat_files:
                single_X = np.fromfile('{0}/n_Train_3/{1}/{2}'.format(path, str(class_index), file), dtype=np.int32)
                xs_train.append(single_X)
                ys_train.append(class_index)

        for (root, dirs, dat_files) in os.walk('{0}/n_Test_3/{1}'.format(path, str(class_index))):
            for file in dat_files:
                xs_test.append(np.fromfile('{0}/n_Test_3/{1}/{2}'.format(path, str(class_index), file), dtype=np.int32))
                ys_test.append(class_index)

    Xtr = np.array(xs_train)
    Ytr = np.array(ys_train)
    Xte = np.array(xs_test)
    Yte = np.array(ys_test)
       
    return Xtr, Ytr, Xte, Yte

In [3]:
# Cleaning up variables to prevent loading data multiple times (which may cause memory issue)
try:
    del X_train, y_train
    del X_test, y_test
    print('Clear previously loaded data.')
except:
    pass

#Load data
data_set_path = 'M:/LowPowerActionRecognition/CNN/NMNIST/datasets'
data = load_NMNIST(data_set_path)

#initialise data

X_train = data[0]
y_train = data[1]
X_test = data[2]
y_test = data[3]

# As a sanity check, we print out the size of the training and test data.
print('Training data shape: ', X_train.shape)
print('Training labels shape: ', y_train.shape)
print('Test data shape: ', X_test.shape)
print('Test labels shape: ', y_test.shape)

Training data shape:  (60000, 2312)
Training labels shape:  (60000,)
Test data shape:  (10000, 2312)
Test labels shape:  (10000,)


In [4]:
# Subsample the data for more efficient code execution in this exercise
num_training = 60000 #60000
mask = list(range(num_training))
X_train = X_train[mask]
y_train = y_train[mask]

num_test = 10000 #10000
mask = list(range(num_test))
X_test = X_test[mask]
y_test = y_test[mask]

In [5]:
#Set the number of neighbor,k
n_neighbors = 5

"""
p is the distance -> L1=1 and L2=2

weights will choose from uniform/distance

algorithm will auto choose from: 
ball_tree’ will use BallTree ;‘kd_tree’ will use KDTree ; ‘brute’ will use a brute-force search.
"""
classifier = neighbors.KNeighborsClassifier(n_neighbors, weights='uniform', algorithm='auto', p=2)

#Set timer
start_time = dt.datetime.now()
print('Start learning at {}'.format(str(start_time)))

#Fit the data
classifier.fit(X_train, y_train)

end_time = dt.datetime.now() 
print('Stop learning {}'.format(str(end_time)))
elapsed_time= end_time - start_time
print('Elapsed learning {}'.format(str(elapsed_time)))

Start learning at 2018-12-05 15:50:17.394522
Stop learning 2018-12-05 15:50:32.384592
Elapsed learning 0:00:14.990070


In [6]:
#Set timer
start_time = dt.datetime.now()
print('Start testing at {}'.format(str(start_time)))    

classifier.predict(X_test)
final_score = classifier.score(X_test, y_test)

end_time = dt.datetime.now() 
print('Stop testing at {}'.format(str(end_time)))
elapsed_time= end_time - start_time
print('Elapsed testing at {}'.format(str(elapsed_time)))
print(">>> SCORE:",final_score,"<<<")

Start testing at 2018-12-05 15:50:32.390577
Stop testing at 2018-12-05 17:00:10.719713
Elapsed testing at 1:09:38.329136
>>> SCORE: 0.924 <<<


In [7]:
#Using L1 distance 

#Set timer
start_time = dt.datetime.now()
print('Start testing at {}'.format(str(start_time)))


# we create an instance of Neighbours Classifier and fit the data.
classifier_L1 = neighbors.KNeighborsClassifier(n_neighbors, weights='uniform', algorithm='auto', p=1)
classifier_L1.fit(X_train, y_train)
#classifier_L1.predict(X_test)
final_score_L1 = classifier.score(X_test, y_test)

end_time = dt.datetime.now() 
print('Stop testing at {}'.format(str(end_time)))
elapsed_time= end_time - start_time
print('Elapsed testing at {}'.format(str(elapsed_time)))

print("For L1 distance:")
print(">>> SCORE:",final_score_L1,"<<<")

Start testing at 2018-12-05 17:00:10.725697
Stop testing at 2018-12-05 18:05:51.107448
Elapsed testing at 1:05:40.381751
For L1 distance:
>>> SCORE: 0.924 <<<


In [8]:
print("The comparision of L1 and L2 distance parameters:")
print("For L1, the accuracy is:",final_score_L1)
print("For L2, the accuracy is:",final_score)

The comparision of L1 and L2 distance parameters:
For L1, the accuracy is: 0.924
For L2, the accuracy is: 0.924


#### Discussion: From the result of comparing the manhattan_distance(L1) and euclidean_distance(L2) there has shown insignificant increase in accuracy for L2 and the time it takes is pretty much the same.

##                                        CROSS-VALIDATION ( with 10% dataset it still takes too long >30mins)

In [None]:
#Set timer
start_time = dt.datetime.now()
print('Start testing at {}'.format(str(start_time)))

n_neighbors_choices = [1, 3, 5, 8, 10, 12, 15, 20, 50, 100]
k_to_accuracies = {}
for n_neighbors in n_neighbors_choices:   
    #create a new KNN model
    #knn_cv = KNeighborsClassifier(n_neighbors)
    knn_cv  = neighbors.KNeighborsClassifier(n_neighbors, weights='uniform', algorithm='auto', p=2)       
    #train model with cv of 5 
    cv_scores = cross_val_score(knn_cv, X_train, y_train, cv=5)
           
    #print each cv score (accuracy) and average them  
    k_to_accuracies.setdefault(n_neighbors, []).append(cv_scores)

# Print out the computed accuracies
for n_neighbors in sorted(k_to_accuracies):
    """for cv_scores in k_to_accuracies[n_neighbors]:
        #print('k = %d, accuracy = '%f' % (k, accuracy))
        print('k = %d, accuracy = %f' % (n_neighbors, cv_scores))"""
    print('mean for k=%d is %f' % (n_neighbors, np.mean(k_to_accuracies[n_neighbors])))
    
end_time = dt.datetime.now() 
print('Stop testing at {}'.format(str(end_time)))
elapsed_time= end_time - start_time
print('Elapsed testing at {}'.format(str(elapsed_time)))


Start testing at 2018-12-05 18:23:09.738027
