In [1]:
import pandas as pd
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.cross_validation import LeaveOneLabelOut
from sklearn.linear_model import LogisticRegression
from skimage.transform import rescale, resize
from sklearn.feature_selection import SelectKBest

from matplotlib import pyplot as plt
%matplotlib inline

import numpy as np

In [12]:
# Read the training and test tables from the data directory one level up.
train, test = (pd.read_csv(csv_path)
               for csv_path in ('../data/train.csv', '../data/test.csv'))

In [8]:
# Target is the 'label' column; features are every column from 'pixel0'
# through the last column, cast to float64.
y = train['label']
X = train.loc[:, 'pixel0':].astype(np.float64)

In [9]:
# Model: keep the 200 best-scoring features, standardise them, then fit a
# logistic regression on the result.
sk = SelectKBest(k=200)
std = StandardScaler()
# The grid below searches the 'l1' penalty, so the solver is pinned to
# 'liblinear' instead of relying on the library default: the default differs
# between scikit-learn releases and not every solver accepts 'l1'.
lr = LogisticRegression(random_state=0, max_iter=20, solver='liblinear')

pipe = Pipeline([('sk', sk),
                 ('std', std),
                 ('lr', lr)])

# Keys use the '<step name>__<parameter>' form so the search reaches into the
# 'lr' step of the pipeline. 2 penalties x 2 values of C = 4 candidates.
param_grid = {'lr__penalty': ['l1', 'l2'],
              'lr__C': [1., 10.]}

# NOTE: despite its name, `cv` is the grid-search estimator itself, not a
# cross-validation splitter; no splitter is passed, so the library default
# splitting strategy is used.
cv = GridSearchCV(pipe, param_grid=param_grid, scoring='accuracy')

In [10]:
# Run the grid search on the full training features/labels. `fit` returns the
# estimator, so the cell displays its repr after the captured warnings.
# NOTE(review): the "... are constant" warnings presumably come from the
# feature-scoring step seeing pixels that never vary within a training fold --
# confirm before silencing them.
cv.fit(X, y)

  22  23  24  25  26  27  28  29  30  31  52  53  54  55  56  57  82  83
  84  85 110 111 112 139 140 141 167 168 196 224 392 420 421 448 476 532
 560 588 616 644 645 671 672 673 698 699 700 701 725 726 727 728 729 730
 731 754 755 756 757 758 759 760 761 780 781 782 783] are constant.
  18  19  20  21  22  23  24  25  26  27  28  29  30  31  52  53  54  55
  56  57  58  82  83  84  85 111 112 113 139 140 141 168 196 336 364 392
 420 421 448 476 532 560 615 643 644 645 671 672 673 699 700 701 727 728
 729 730 731 754 755 756 757 758 759 760 780 781 782 783] are constant.
  22  23  24  25  26  27  28  29  30  31  32  33  52  53  54  55  56  57
  59  82  83  84  85 111 112 139 140 141 168 169 196 392 393 420 421 448
 476 504 532 560 587 644 645 671 672 673 699 700 701 727 728 729 730 731
 753 754 755 756 757 758 759 760 779 780 781 782 783] are constant.
  22  23  24  25  26  27  28  29  30  31  52  53  54  55  56  57  82  83
  84  85 110 111 112 139 140 141 167 168 196 224 392 420 421 4

GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(steps=[('sk', SelectKBest(k=200, score_func=<function f_classif at 0x7fb51f766598>)), ('std', StandardScaler(copy=True, with_mean=True, with_std=True)), ('lr', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=20, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'lr__C': [1.0, 10.0], 'lr__penalty': ['l1', 'l2']},
       pre_dispatch='2*n_jobs', refit=True, scoring='accuracy', verbose=0)

In [11]:
# Cross-validated score of the best parameter combination found by the search
# (the search was configured with scoring='accuracy').
cv.best_score_

0.88807142857142862

In [16]:
# Take the same column slice and dtype that were used for the training
# features ('pixel0' onwards, float64), then predict with the fitted search.
X_test = test.loc[:, 'pixel0':].astype(np.float64)
prediction = cv.predict(X_test)

In [17]:
# Build the submission table: one row per test row, with ids counting up
# from 1 and the predicted label next to each id.
# Column order is pinned with `columns=` because the order of a dict-built
# DataFrame depends on the pandas version (sorted keys in older releases,
# insertion order in newer ones); this keeps 'ImageId' first everywhere.
submission = pd.DataFrame(
    {'ImageId': np.arange(1, test.shape[0] + 1), 'Label': prediction},
    columns=['ImageId', 'Label'],
)

In [18]:
# Write the submission to disk without the DataFrame index column.
# NOTE(review): assumes the '../submission' directory already exists -- confirm.
submission.to_csv('../submission/200KBestStdLogReg.csv', index=False)