In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import warnings
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import time

from sklearn.model_selection import cross_val_score, KFold
from skopt import BayesSearchCV
from sklearn.decomposition import PCA

from sklearn.neighbors import KNeighborsClassifier

warnings.filterwarnings("ignore")

### Creating the train and test dataset

In [2]:
df_train = pd.read_csv('train.csv')
df_test  = pd.read_csv('test.csv')

In [3]:
df_train.head()

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Spliting X_train, y_train and X_test

In [4]:
X_train = df_train.drop(['label'], axis=1)
y_train = df_train['label']
X_test = df_test

# Free memory space

del df_train
del df_test

print('Shape of X_train:', X_train.shape)
print('Shape of y_train:', y_train.shape)
print('Shape of X_test :', X_test.shape)

Shape of X_train: (42000, 784)
Shape of y_train: (42000,)
Shape of X_test : (28000, 784)


In [5]:
# Counting the sample training

unique, count = np.unique(y_train, return_counts=True)
print("Counting each label: {0}".format(dict(zip(unique, count))))

Counting each label: {0: 4132, 1: 4684, 2: 4177, 3: 4351, 4: 4072, 5: 3795, 6: 4137, 7: 4401, 8: 4063, 9: 4188}


### Functions to performance measures

In [6]:
# Validation of the model with Kfold stratified splitting the data into n_splits parts

random_seed = 42

def get_accuracy(model, x, y, seed):
    n_folds = 5
    kfold = KFold(n_folds, shuffle=True, random_state=seed).get_n_splits(x)
    accuracy = cross_val_score(model, 
                               x, 
                               y=y, 
                               scoring="accuracy",
                               n_jobs=-1,
                               cv=kfold)
    return accuracy

In [7]:
def get_results(models, algorithms, X, y, seed):
    # Getting all results from n_splits validations for each classifier

    clf_results = []
    for clf in models:
        clf_results.append(get_accuracy(clf, X, y, seed))

    # Getting the mean and standard deviation from each classifier's result after few validations

    clf_means = []
    clf_std = []
    for clf_result in clf_results:
        clf_means.append(clf_result.mean())
        clf_std.append(clf_result.std())

    # Let's see the best scores of each algorithm

    df_result = pd.DataFrame({"Means":clf_means, 
                              "Stds": clf_std, 
                              "Algorithm": algorithms})
    
    return df_result.sort_values(by=['Means'], ascending=True)

### Normalizing the X values

In [8]:
X_train = X_train / 255
X_test = X_test / 255

### Decreasing dimensionalities using PCA

In [9]:
n_components = 40
pca = PCA(n_components=n_components, whiten=True)
pca.fit(X_train)
X_train_PCA = pd.DataFrame(pca.transform(X_train))

print('Shape of X_train:', X_train_PCA.shape)

Shape of X_train: (42000, 40)


### Creating models

In [10]:
knn = KNeighborsClassifier()

In [11]:
get_results([knn], ['KNN'], X_train_PCA, y_train, random_seed)

Unnamed: 0,Algorithm,Means,Stds
0,KNN,0.963881,0.000985


### After tuning the hyperparameters

In [12]:
from skopt.space import Real, Integer

def tuning(model, param_grid, X, y, seed):
    
    n_folds = 5
    kfold = KFold(n_folds, shuffle=True, random_state=seed).get_n_splits(X)
    
    opt = BayesSearchCV(model,
                        param_grid,
                        n_iter=30,
                        cv=kfold,
                        scoring='accuracy',
                        verbose=0)
    
    opt.fit(X, y)
    
    model_best = opt.best_estimator_

    # Best score
    print('Best score....:', np.round(opt.best_score_, 4))

    # Best estimator
    print('Best estimator:', model_best)
    
    return model_best

In [13]:
## Search grid for optimal parameters (KNN)

parameters = {'n_neighbors': (1, 3),
        'weights': ('uniform', 'distance'),
        'leaf_size': (35, 40),
        'p': (1, 3)}

start = time.time()
knn_best = tuning(knn, parameters, X_train_PCA, y_train, random_seed)
end = time.time()
print('-------------------------------------')
print('Start:', start)
print('End  :', end)
print('-------------------------------------')
print('Duration: {0:.0f} h {1:.0f} min'.format(((end-start)//3600), ((end-start)%60)))

Best score....: 0.9662
Best estimator: KNeighborsClassifier(algorithm='auto', leaf_size=35, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=3, p=1,
           weights='distance')
-------------------------------------
Start: 1539440930.96403
End  : 1539511750.7459345
-------------------------------------
Duration: 19 h 20 min


### Predicting the labels

In [15]:
knn_best.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=35, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=3, p=1,
           weights='distance')

In [16]:
label_predicted = knn_best.predict(X_test)

In [18]:
label_predicted = pd.Series(label_predicted, name="Label")

In [19]:
solution = pd.concat([pd.Series(range(1, 28001),name = "ImageId"), label_predicted], axis = 1)
solution.to_csv("solution_knn.csv",index=False)

In [20]:
solution.head()

Unnamed: 0,ImageId,Label
0,1,2
1,2,0
2,3,9
3,4,9
4,5,3
