In [3]:
import numpy as np

from sklearn.decomposition import IncrementalPCA
from sklearn.model_selection import train_test_split 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

#Data-preparation

images = np.loadtxt("handwritten_digits_images.csv", delimiter=',')
labels = np.loadtxt("handwritten_digits_labels.csv", delimiter=',')

# In order to achieve reasonables execution times we tried to consider only a quarter of the original input
# and we have to perform incremental PCA to reduce images dimensions from 748 to 40

#The main idea of principal component analysis (PCA) is to reduce the dimensionality of a data set consisting
#of many variables correlated with each other, either heavily or lightly, while retaining the variation present 
#in the dataset, up to the maximum extent. 

#Incremental principal component analysis (IPCA) is typically used as a replacement for principal component 
#analysis (PCA) when the dataset to be decomposed is too large to fit in memory. IPCA builds a low-rank 
#approximation for the input data using an amount of memory which is independent of the number of input data samples. It is still dependent on the input data features, but changing the batch size allows for control of memory usage.


#Because of the dataset that is very simple to classify we can get a very good accuracy as well!
images = images[1::4]
labels = labels[1::4]

pca = IncrementalPCA(n_components=40, batch_size=100)
images_pca = pca.fit_transform(images)

print("pca performed")

X_train, X_test, y_train, y_test = train_test_split(images_pca, labels,test_size=0.3, random_state=32)

#Normalization
X_train = X_train.astype('float32')
X_test = X_test.astype('float32')

# Normalizing the RGB codes by dividing it to the max RGB value.
X_train /= 255
X_test /= 255

KnnClassifier=KNeighborsClassifier()

grid_params = {"n_neighbors": range(1, 10)}

grid_search = GridSearchCV(KnnClassifier, grid_params, verbose=2, n_jobs=-1)

grid_search.fit(X_train, y_train)

print()
print("Best parameters set found on development set:")
print()
print(grid_search.best_params_)

predicted = grid_search.predict(X_test)

acc = accuracy_score(y_test, predicted)

print(acc)

pca performed
Fitting 3 folds for each of 9 candidates, totalling 27 fits
[CV] n_neighbors=1 ...................................................
[CV] n_neighbors=1 ...................................................
[CV] n_neighbors=1 ...................................................
[CV] n_neighbors=2 ...................................................
[CV] n_neighbors=2 ...................................................
[CV] n_neighbors=2 ...................................................
[CV] n_neighbors=3 ...................................................
[CV] n_neighbors=3 ...................................................
[CV] .................................... n_neighbors=1, total=   3.3s
[CV] n_neighbors=3 ...................................................
[CV] .................................... n_neighbors=1, total=   3.3s
[CV] n_neighbors=4 ...................................................
[CV] .................................... n_neighbors=1, total=   3.4s
[CV

[Parallel(n_jobs=-1)]: Done  27 out of  27 | elapsed:   41.2s finished



Best parameters set found on development set:

{'n_neighbors': 3}
0.9634285714285714
