In this script I will try to understand how GridSearchCV works

In [1]:
from sklearn import datasets, svm
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold
import numpy as np
from sklearn.pipeline import Pipeline

digits = datasets.load_digits()
X_digits = digits.data
y_digits = digits.target
svc = svm.SVC(C=1, kernel='linear')
parameters = {'C': 10 ** np.linspace(-1, 3, 3)}

First we use the basic default parameter.

In [2]:
grid = GridSearchCV(svc, parameters, cv=3)
grid.fit(X_digits, y_digits)
print(grid.cv_results_)

{'mean_fit_time': array([ 0.03057806,  0.02522691,  0.02462053]), 'std_fit_time': array([ 0.00431713,  0.0014018 ,  0.00097326]), 'mean_score_time': array([ 0.01617161,  0.01420116,  0.01381707]), 'std_score_time': array([ 0.00091624,  0.00049559,  0.00022903]), 'param_C': masked_array(data = [0.10000000000000001 10.0 1000.0],
             mask = [False False False],
       fill_value = ?)
, 'params': [{'C': 0.10000000000000001}, {'C': 10.0}, {'C': 1000.0}], 'split0_test_score': array([ 0.93521595,  0.93521595,  0.93521595]), 'split1_test_score': array([ 0.95826377,  0.95826377,  0.95826377]), 'split2_test_score': array([ 0.93791946,  0.93791946,  0.93791946]), 'mean_test_score': array([ 0.94379521,  0.94379521,  0.94379521]), 'std_test_score': array([ 0.01029018,  0.01029018,  0.01029018]), 'rank_test_score': array([1, 1, 1], dtype=int32), 'split0_train_score': array([ 1.,  1.,  1.]), 'split1_train_score': array([ 1.,  1.,  1.]), 'split2_train_score': array([ 1.,  1.,  1.]), 'mean_tra

Second we use StratifiedKFold to split our data.

In [11]:
strat_k_fold = StratifiedKFold(random_state=10, n_splits=3)
grid = GridSearchCV(svc, parameters, cv=strat_k_fold)
grid.fit(X_digits, y_digits)
print(grid.cv_results_)

{'mean_fit_time': array([ 0.02780326,  0.0239656 ,  0.02332878]), 'std_fit_time': array([ 0.00496753,  0.00120074,  0.00080675]), 'mean_score_time': array([ 0.01519457,  0.01382144,  0.01330884]), 'std_score_time': array([ 0.00204237,  0.00058719,  0.00011681]), 'param_C': masked_array(data = [0.01 0.5 1],
             mask = [False False False],
       fill_value = ?)
, 'params': [{'C': 0.01}, {'C': 0.5}, {'C': 1}], 'split0_test_score': array([ 0.93521595,  0.93521595,  0.93521595]), 'split1_test_score': array([ 0.95826377,  0.95826377,  0.95826377]), 'split2_test_score': array([ 0.93791946,  0.93791946,  0.93791946]), 'mean_test_score': array([ 0.94379521,  0.94379521,  0.94379521]), 'std_test_score': array([ 0.01029018,  0.01029018,  0.01029018]), 'rank_test_score': array([1, 1, 1], dtype=int32), 'split0_train_score': array([ 0.99832636,  1.        ,  1.        ]), 'split1_train_score': array([ 1.,  1.,  1.]), 'split2_train_score': array([ 1.,  1.,  1.]), 'mean_train_score': array([

Now with a pipeline.

In [5]:
pca = PCA()
pipeline = Pipeline(steps = [('pca', pca), ('svc', svc)])
#parameters2 = {'pca__n_components': [5, 10, 20], 'svc__C': 10 ** np.linspace(-2, 3)}
parameters2 = {'pca__n_components': [5, 10, 20], 'svc__C': 10 ** np.linspace(start=-2, stop=1, num=4)}

In [6]:
grid2 = GridSearchCV(pipeline, parameters2, cv=3)

In [7]:
grid2.fit(X_digits, y_digits)
print(grid2.best_params_)
print(grid2.best_score_)

{'pca__n_components': 20, 'svc__C': 0.10000000000000001}
0.936004451864


Let's try to print the 3 best sets of parameters.

Since the attribute 'params' is a list one has to use comprehension list to subset it.

In [23]:
grid_results = grid2.cv_results_
print(type(grid_results))
print(grid_results['mean_test_score'])
print(type(grid_results['mean_test_score']))

<class 'dict'>
[ 0.84028937  0.83750696  0.83361157  0.83305509  0.92487479  0.92376183
  0.91819699  0.91708403  0.93377852  0.93600445  0.93600445  0.93600445]
<class 'numpy.ndarray'>


In [24]:
ranking_results = np.argsort(grid_results['mean_test_score'])
print(ranking_results)
print(type(grid_results['params']))
print(type(ranking_results[-5:]))
best_params = [grid_results['params'][i] for i in ranking_results[-5:]]
print(best_params)

[ 3  2  1  0  7  6  5  4  8  9 10 11]
<class 'list'>
<class 'numpy.ndarray'>
[{'pca__n_components': 10, 'svc__C': 0.01}, {'pca__n_components': 20, 'svc__C': 0.01}, {'pca__n_components': 20, 'svc__C': 0.10000000000000001}, {'pca__n_components': 20, 'svc__C': 1.0}, {'pca__n_components': 20, 'svc__C': 10.0}]
