In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.preprocessing import normalize
from sklearn.feature_selection import VarianceThreshold

from sklearn import cross_validation
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.grid_search import GridSearchCV

from operator import itemgetter

In [2]:
%matplotlib inline

# Import data

In [3]:
# load data
rawTrain = pd.read_csv("../input/train.csv").astype(float)
rawTest = pd.read_csv("../input/test.csv").astype(float)

# extract labels and features 
Y = rawTrain.ix[:,0].values
rawX = rawTrain.ix[:,1:].values

# Prepare data

In [4]:
normX = normalize(rawX)
normTest = normalize(rawTest.values)
normX.shape

(42000L, 784L)

# Remove features with zero variance

In [5]:
selector = VarianceThreshold()
X = selector.fit_transform(normX)
X.shape

(42000L, 708L)

# Tuning hyperparameters

In [6]:
def report(grid_scores, n_top=3):
    """Print top N best combinations"""
    top_scores = sorted(grid_scores, key=itemgetter(1), reverse=True)[:n_top]
    for i, score in enumerate(top_scores):
        print("Model with rank: {0}".format(i + 1))
        print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
              score.mean_validation_score,
              np.std(score.cv_validation_scores)))
        print("Parameters: {0}".format(score.parameters))
        print("")
        
def tuneRandomForest(X, Y):
    """Tune hyperparameters for Random Forest"""
    param_grid = {
                    "n_estimators": [100],
                    "max_features": [30, 40, 50, 60, 80],
                    "min_samples_split": [1, 3, 10],
                    "min_samples_leaf": [1, 3, 10],
                    "bootstrap": [False]
                }
    grid = GridSearchCV(RandomForestClassifier(), param_grid=param_grid, cv=5, n_jobs =-1)
    grid.fit(X, Y)
    report(grid.grid_scores_)
    
def tuneKNeighbors(X, Y):
    """Tune hyperparameters for KNeighbors"""
    param_grid = {
                    "n_neighbors": [3, 4, 5],
                    "weights": ["uniform", "distance"]
                }
    grid = GridSearchCV(KNeighborsClassifier(), param_grid=param_grid, cv=5, n_jobs =-1)
    grid.fit(X, Y)
    report(grid.grid_scores_)

In [7]:
tuneKNeighbors(X, Y)

Model with rank: 1
Mean validation score: 0.973 (std: 0.001)
Parameters: {'n_neighbors': 4, 'weights': 'distance'}

Model with rank: 2
Mean validation score: 0.972 (std: 0.002)
Parameters: {'n_neighbors': 3, 'weights': 'distance'}

Model with rank: 3
Mean validation score: 0.972 (std: 0.002)
Parameters: {'n_neighbors': 5, 'weights': 'distance'}



In [8]:
tuneRandomForest(X, Y)

Model with rank: 1
Mean validation score: 0.968 (std: 0.001)
Parameters: {'max_features': 30, 'min_samples_split': 3, 'bootstrap': False, 'n_estimators': 100, 'min_samples_leaf': 1}

Model with rank: 2
Mean validation score: 0.968 (std: 0.001)
Parameters: {'max_features': 40, 'min_samples_split': 1, 'bootstrap': False, 'n_estimators': 100, 'min_samples_leaf': 1}

Model with rank: 3
Mean validation score: 0.968 (std: 0.001)
Parameters: {'max_features': 30, 'min_samples_split': 1, 'bootstrap': False, 'n_estimators': 100, 'min_samples_leaf': 1}

