# Sklearn

## sklearn.model_selection (ранее sklearn.grid_search)

документация: http://scikit-learn.org/stable/modules/grid_search.html

In [2]:
from sklearn import datasets, linear_model, metrics, model_selection

import numpy as np
import pandas as pd

### Генерация датасета

In [3]:
# Load the Iris dataset that ships with scikit-learn; its `.data` and `.target`
# attributes are split into train/test parts in the next cell.
iris = datasets.load_iris()

In [4]:
# Hold out 30% of the samples as a test set; the fixed seed keeps the split
# reproducible between runs.
train_data, test_data, train_labels, test_labels = model_selection.train_test_split(
    iris.data,
    iris.target,
    test_size=0.3,
    random_state=0,
)

### Задание модели

In [5]:
# Base estimator whose hyperparameters are tuned below; every other setting
# stays at its library default, and the seed is fixed for reproducibility.
classifier = linear_model.SGDClassifier(random_state=0)

### Генерация сетки

In [15]:
# Show every hyperparameter of the estimator together with its current value.
# The dictionary keys are the names that may be used as keys of a search grid.
# (Calling `.keys()` here would display only the names, not the name -> value
# mapping that this cell is meant to show.)
classifier.get_params()

{'alpha': 0.0001,
 'average': False,
 'class_weight': None,
 'epsilon': 0.1,
 'eta0': 0.0,
 'fit_intercept': True,
 'l1_ratio': 0.15,
 'learning_rate': 'optimal',
 'loss': 'hinge',
 'max_iter': None,
 'n_iter': None,
 'n_jobs': 1,
 'penalty': 'l2',
 'power_t': 0.5,
 'random_state': 0,
 'shuffle': True,
 'tol': None,
 'verbose': 0,
 'warm_start': False}

In [7]:
# Search space for the classifier: each key is an estimator parameter name and
# each value lists the candidate settings to try for that parameter.
parameters_grid = dict(
    # candidate loss functions
    loss=['hinge', 'log', 'squared_hinge', 'squared_loss'],
    # candidate regularisation penalties
    penalty=['l1', 'l2'],
    # iteration limits 5, 6, 7, 8, 9
    max_iter=range(5, 10),
    # five evenly spaced values from 1e-4 to 1e-3 (both ends included)
    alpha=np.linspace(0.0001, 0.001, num=5),
)

In [8]:
# Cross-validation scheme shared by both searches: 10 random stratified
# splits, each holding out 20% of the training data, with a fixed seed.
cv = model_selection.StratifiedShuffleSplit(
    n_splits=10,
    test_size=0.2,
    random_state=0,
)

### Подбор параметров и оценка качества

#### Grid search

In [27]:
# Exhaustive search: every combination from `parameters_grid` is evaluated
# with the `cv` splitter and scored by accuracy. Train-fold scores are kept
# as well so they can be inspected in `cv_results_`.
grid_cv = model_selection.GridSearchCV(
    estimator=classifier,
    param_grid=parameters_grid,
    scoring='accuracy',
    cv=cv,
    return_train_score=True,
)

In [28]:
%%time
grid_cv.fit(train_data, train_labels)

CPU times: user 3.28 s, sys: 9.57 ms, total: 3.29 s
Wall time: 3.29 s


GridSearchCV(cv=StratifiedShuffleSplit(n_splits=10, random_state=0, test_size=0.2,
            train_size=None),
       error_score='raise',
       estimator=SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=None, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=0, shuffle=True,
       tol=None, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'loss': ['hinge', 'log', 'squared_hinge', 'squared_loss'], 'penalty': ['l1', 'l2'], 'max_iter': range(5, 10), 'alpha': array([0.0001 , 0.00032, 0.00055, 0.00078, 0.001  ])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='accuracy', verbose=0)

In [29]:
# Display the estimator configured with the best parameter combination found
# by the grid search.
grid_cv.best_estimator_

SGDClassifier(alpha=0.000325, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=9, n_iter=None,
       n_jobs=1, penalty='l1', power_t=0.5, random_state=0, shuffle=True,
       tol=None, verbose=0, warm_start=False)

In [30]:
# Best mean cross-validated accuracy and the parameter combination behind it,
# one per line.
print(grid_cv.best_score_, grid_cv.best_params_, sep='\n')

0.8952380952380953
{'alpha': 0.000325, 'loss': 'hinge', 'max_iter': 9, 'penalty': 'l1'}


In [33]:
# Per-combination timings and scores as a table; show only the first 5 rows.
cv_results_frame = pd.DataFrame(grid_cv.cv_results_)
cv_results_frame.head(5)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_alpha,param_loss,param_max_iter,param_penalty,params,split0_test_score,...,split2_train_score,split3_train_score,split4_train_score,split5_train_score,split6_train_score,split7_train_score,split8_train_score,split9_train_score,mean_train_score,std_train_score
0,0.000837,0.000129,0.000181,2.2e-05,0.0001,hinge,5,l1,"{'alpha': 0.0001, 'loss': 'hinge', 'max_iter':...",0.904762,...,0.630952,0.690476,0.690476,0.690476,0.809524,0.690476,0.630952,0.630952,0.728571,0.107644
1,0.000657,1.9e-05,0.000152,5e-06,0.0001,hinge,5,l2,"{'alpha': 0.0001, 'loss': 'hinge', 'max_iter':...",0.809524,...,0.630952,0.702381,0.690476,0.690476,0.309524,0.690476,0.416667,0.928571,0.616667,0.177521
2,0.000821,0.00014,0.000195,7.7e-05,0.0001,hinge,6,l1,"{'alpha': 0.0001, 'loss': 'hinge', 'max_iter':...",0.666667,...,0.642857,0.690476,0.630952,0.690476,0.75,0.392857,0.321429,0.369048,0.582143,0.149114
3,0.000824,0.000147,0.000182,2.4e-05,0.0001,hinge,6,l2,"{'alpha': 0.0001, 'loss': 'hinge', 'max_iter':...",0.904762,...,0.619048,0.702381,0.630952,0.690476,0.690476,0.607143,0.333333,0.928571,0.682143,0.15999
4,0.000717,6.3e-05,0.000152,7e-06,0.0001,hinge,7,l1,"{'alpha': 0.0001, 'loss': 'hinge', 'max_iter':...",0.571429,...,0.809524,0.630952,0.97619,0.690476,0.97619,0.880952,0.964286,0.690476,0.784524,0.146912


#### Randomized grid search

In [37]:
# Randomized search: instead of trying every combination, sample 20 parameter
# settings from the same grid. Estimator, splitter and metric are the same as
# in the exhaustive search; the seed fixes which settings get sampled.
randomized_grid_cv = model_selection.RandomizedSearchCV(
    estimator=classifier,
    param_distributions=parameters_grid,
    scoring='accuracy',
    cv=cv,
    n_iter=20,
    random_state=0,
)

In [41]:
%%time
randomized_grid_cv.fit(train_data, train_labels)

CPU times: user 341 ms, sys: 2.28 ms, total: 343 ms
Wall time: 342 ms


RandomizedSearchCV(cv=StratifiedShuffleSplit(n_splits=10, random_state=0, test_size=0.2,
            train_size=None),
          error_score='raise',
          estimator=SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=None, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=0, shuffle=True,
       tol=None, verbose=0, warm_start=False),
          fit_params=None, iid=True, n_iter=20, n_jobs=1,
          param_distributions={'loss': ['hinge', 'log', 'squared_hinge', 'squared_loss'], 'penalty': ['l1', 'l2'], 'max_iter': range(5, 10), 'alpha': array([0.0001 , 0.00032, 0.00055, 0.00078, 0.001  ])},
          pre_dispatch='2*n_jobs', random_state=0, refit=True,
          return_train_score='warn', scoring='accuracy', verbose=0)

In [42]:
# Best mean cross-validated accuracy among the sampled settings and the
# parameters that produced it, one per line.
print(randomized_grid_cv.best_score_, randomized_grid_cv.best_params_, sep='\n')

0.8142857142857143
{'penalty': 'l1', 'max_iter': 9, 'loss': 'log', 'alpha': 0.00055}
