### Import Libraries

In [93]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

### Import datasets: training and test

In [94]:
# Load the labelled training data. The frame carries a 'label' column (split off
# in the standardization cell) alongside the feature columns.
file_train = 'train.csv'
df_train = pd.read_csv(file_train)
# Last expression of the cell: displays (rows, columns).
df_train.shape

(42000, 785)

In [95]:
# Load the test data to predict on. Its displayed shape has one column fewer
# than the training frame (784 vs 785), i.e. there is no 'label' column here.
file_test = 'test.csv'
df_test = pd.read_csv(file_test)
# Last expression of the cell: displays (rows, columns).
df_test.shape

(28000, 784)

### Standardize: Training and Test Set

In [96]:
# Separate the target column from the features.
y_training_set = df_train['label']
X_training_set = df_train.drop(columns='label')

# Learn per-feature mean/scale on the training features, then standardize them.
# The fitted scaler is reused later for the test set.
scaler = StandardScaler()
scaler.fit(X_training_set)
Xstd_train = scaler.transform(X_training_set)
Xstd_train.shape

(42000, 784)

In [97]:
# Take exactly the feature columns the scaler was fitted on. Column selection
# returns a new frame, so columns added to df_test later (e.g. the 'Label'
# column written for the submission) cannot leak into X_test_set when this cell
# or the plotting cell is re-run.
X_test_set = df_test[X_training_set.columns]

# Standardize with the statistics learned on the training set. StandardScaler
# uses a scale of 1 for zero-variance features, so constant columns do not
# produce divisions by zero.
Xstd_test = scaler.transform(X_test_set)

# Keep the original safeguard: any NaN (e.g. from missing values in the input)
# is replaced by 0, which is the feature mean after standardization.
Xstd_test = np.where(np.isnan(Xstd_test), 0.0, Xstd_test)
Xstd_test.shape

(28000, 784)

### Split training set into: train and held-out test (validation)

In [98]:
# Hold out 25% of the standardized training data for evaluation.
# The fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    Xstd_train,
    y_training_set,
    test_size=0.25,
    random_state=1,
)

### Check optimum number of dimensions

In [99]:
# Candidate PCA dimensionalities to compare in the sweep below.
m_vec = [1, 5, 10]

# KNN classifier shared by the sweep and the final model; it is re-fitted each
# time `fit` is called.
n_neighbors = 5
neigh = KNeighborsClassifier(n_neighbors=n_neighbors)

In [100]:
# Sweep over the candidate dimensionalities in m_vec: for each m, fit PCA on the
# training split, train KNN in the reduced space and score it on the held-out
# split. `accuracy` collects one score per entry of m_vec, in order.
accuracy = []
for m in m_vec:

    print('m: {:<3}'.format(m), end=' ')

    # Run PCA for m dimensions. random_state (same seed as the train/test split)
    # keeps the result repeatable when sklearn picks its randomized SVD solver.
    pca = PCA(n_components=m, random_state=1)
    pca.fit(X_train)
    print('.', end='')

    # Training set: reduce and train KNN.
    # pca.transform centers the data with the mean learned in fit before
    # projecting, which a bare dot product with components_ skips.
    Xred_train = pca.transform(X_train)
    neigh.fit(Xred_train, y_train)
    print('.', end='')

    # Test set: reduce dimensions with the same fitted PCA and predict
    Xred_test = pca.transform(X_test)
    y_pred = neigh.predict(Xred_test)
    print('.', end='')

    # Calculate accuracy of model
    acc = accuracy_score(y_test, y_pred)
    accuracy.append(acc)
    print('. acc: {:.3f}'.format(acc), end='\n')
    


m: 1   .... acc: 0.252
m: 5   .... acc: 0.741
m: 10  .... acc: 0.906


### Calculate y for final test submission

In [101]:
# Number of PCA components used for the final submission model.
# NOTE(review): 400 is not one of the values evaluated in the sweep over m_vec
# (1, 5, 10) — confirm this choice was validated elsewhere.
m = 400

In [None]:
# Final model: PCA with the chosen m components followed by KNN, then predict
# labels for the standardized submission test set.
# NOTE(review): this fits on X_train, i.e. only the 75% split of the labelled
# data; consider refitting on all of Xstd_train / y_training_set for the
# submission.
# random_state (same seed as the train/test split) keeps the decomposition
# repeatable when sklearn picks its randomized SVD solver.
pca = PCA(n_components=m, random_state=1)
pca.fit(X_train)

# Training set: reduce and train KNN.
# pca.transform centers with the mean learned in fit before projecting, which a
# bare dot product with components_ skips.
Xred_train = pca.transform(X_train)
neigh.fit(Xred_train, y_train)

# Test set: reduce dimensions and predict.
# np.asarray keeps the input type consistent with the ndarray PCA was fitted
# on, whether Xstd_test is a DataFrame or already an array.
Xred_test = pca.transform(np.asarray(Xstd_test))
y_pred = neigh.predict(Xred_test)
y_pred.shape

In [None]:
# Position of the test image to inspect.
n = 1980

# Show the raw (unstandardized) row n as a 28x28 image, titled with the label
# the model predicted for it.
fig, ax = plt.subplots()
ax.imshow(X_test_set.values[n].reshape((28, 28)))
ax.set_title('Prediction: {}'.format(y_pred[n]))
plt.show()

In [None]:
# Attach the predictions to the test frame.
df_test['Label'] = y_pred

# Give the frame a 1-based index named 'ImageId'. Assigning an explicit
# RangeIndex (instead of adding 1 to the current index) makes this cell safe to
# re-run: the ids stay 1..len(df_test) however many times it executes.
df_test.index = pd.RangeIndex(start=1, stop=len(df_test) + 1, name='ImageId')

In [None]:
# Name the submission file after the hyper-parameters that produced it.
file = 'm_{}_knn_{}.csv'.format(m, n_neighbors)

# Write only the 'Label' column; the frame's index is written as the first
# column of the CSV.
df_test.to_csv(file, columns=['Label'])