In [1]:
import numpy as np
import pandas as pd

# Scikit-learn

## Grid Search

Learning how to implement grid search from [Source code: scikit-learn's model selection](https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/model_selection/_search.py).

In [2]:
# Grid search
from itertools import product
from collections import Mapping
from sklearn.model_selection import ParameterGrid

params_grid = {'a': [1, 2], 'b': [True, False]}

# ensures that it also supports list of dictionary,
# Mapping ensures a object has keys, values, items, etc. methods
# which matches a dictionary
# https://docs.python.org/3/library/collections.abc.html
if isinstance(params_grid, Mapping):
    params_grid = [params_grid]
    
for p in params_grid:
    # for reproducibility, always sort the keys of a dictionary
    # this will become a list of paired tuples
    items = sorted(p.items())
    print('sorted parameters, values: ', items)
    print()
    
    # unpack the list of tuples into two lists tuples, so what's originally 
    # a list of items [('a', [1, 2]), ('b', [True, False])], becomes
    # two lists ('a', 'b'), ([1, 2], [True, False]), with all the keys being the parameter
    # and the value being the list of possible values that the parameter can take
    # http://stackoverflow.com/questions/7558908/unpacking-a-list-tuple-of-pairs-into-two-lists-tuples
    key, value = zip(*items)
    print('parameters: ', key)
    print('values', value)
    print()
    
    # unpack the list of values to compute the cartesian product
    # [(1, True), (1, False), (2, True), (2, False)], and zip it
    # back to the original key
    print('grid search parameters')
    cartesian = product(*value)
    for v in cartesian:
        params = dict(zip(key, v))
        print(params)

sorted parameters, values:  [('a', [1, 2]), ('b', [True, False])]

parameters:  ('a', 'b')
values ([1, 2], [True, False])

grid search parameters
{'a': 1, 'b': True}
{'a': 1, 'b': False}
{'a': 2, 'b': True}
{'a': 2, 'b': False}


In [3]:
# confirm with scikit-learn's output
list( ParameterGrid(params_grid) )

[{'a': 1, 'b': True},
 {'a': 1, 'b': False},
 {'a': 2, 'b': True},
 {'a': 2, 'b': False}]

In [4]:
# making our function
def _get_params_grid(params_grid):
    """
    create cartesian product of parameters (grid search),
    this will be a generator that will allow looping through
    all possible parameter combination, note if we want to
    expand this to cross validation we'll have to turn it to a list
    """
    # for reproducibility, always sort the keys of a dictionary
    items = sorted(params_grid.items())
    
    # unpack parameter and the range of values
    # into separate list; then unpack the range 
    # of values to compute the cartesian product
    # and zip it back to the original key
    key, value = zip(*items)
    cartesian = product(*value)
    for v in cartesian:
        params = dict(zip(key, v))
        yield params

params_grid = {'a': [1, 2], 'b': [True, False]}
params = _get_params_grid(params_grid)
for p in params:
    print(p)

{'a': 1, 'b': True}
{'a': 1, 'b': False}
{'a': 2, 'b': True}
{'a': 2, 'b': False}


In [5]:
# self._fit(X, y, groups, ParameterGrid(self.param_grid))

In [31]:
class KFolds:
    """
    K-Folds cross-validation
    Provides train/test indices to split data in train/test sets. Split
    dataset into k consecutive folds; Each fold is then used once as 
    a validation while the k - 1 remaining folds form the training set

    Parameters
    ----------
    n_splits : int
        number of folds. Must be at least 2
    
    shuffle : boolean, default True
        whether to shuffle the data before splitting into batches
    
    seed : int, default 4321
        When shuffle = True, pseudo-random number generator state used for
        shuffling; this ensures reproducibility
    """
    def __init__(self, n_splits, shuffle = True, seed = 4321):
        self.seed = seed
        self.shuffle = shuffle
        self.n_splits = n_splits
        
    def split(self, X):
        """pass in the data to create train/test split for k fold"""
        # shuffle modifies indices inplace
        indices = np.arange(X.shape[0])
        if self.shuffle:
            rstate = np.random.RandomState(self.seed)
            rstate.shuffle(indices)

        for test_mask in self._iter_test_masks(X, indices):
            train_index = indices[np.logical_not(test_mask)]
            test_index = indices[test_mask]
            yield train_index, test_index
        
    def _iter_test_masks(self, X, indices):
        """
        create the mask for the test set, then the indices that
        are not in the test set belongs in the training set
        """
        # indicate the number of samples in each fold, and also
        # make sure the ones that are not evenly splitted also
        # gets assigned to a fold (e.g. if we do 2 fold on a
        # dataset that has 5 samples, then 1 will be left out,
        # and has to be assigned to one of the other fold)
        n_samples = X.shape[0]
        fold_sizes = (n_samples // self.n_splits) * np.ones(self.n_splits, dtype = np.int)
        fold_sizes[:n_samples % self.n_splits] += 1

        current = 0
        for fold_size in fold_sizes:
            start, stop = current, current + fold_size
            test_indices = indices[start:stop]
            test_mask = np.zeros(n_samples, dtype = np.bool)
            test_mask[test_indices] = True
            yield test_mask
            current = stop

In [6]:
from sklearn.model_selection import KFold

# create some sample data
X = np.array([[1, 2], [3, 4], [1, 2], [3, 4], [5, 6]])
y = np.array([1, 2, 3, 4, 5])

kf = KFolds(n_splits = 2, shuffle = False, seed = 4312)
for train_index, test_index in kf.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
print('\nconfirm results with scikit-learn')
kf = KFold(n_splits = 2, random_state = 4312)
for train_index, test_index in kf.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

TRAIN: [3 4] TEST: [0 1 2]
TRAIN: [0 1 2] TEST: [3 4]

confirm results with scikit-learn
TRAIN: [3 4] TEST: [0 1 2]
TRAIN: [0 1 2] TEST: [3 4]


In [7]:
from time import time
from scipy.stats import randint
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# load the data
iris = load_iris()
X = iris.data
y = iris.target

# build a classifier
clf = RandomForestClassifier(n_estimators = 20)

def report(results):
    """report best scores and corresponding parameters"""
    print( 'Best score obtained: {0}'.format(results.best_score_) )
    print('Parameters:')
    for param, value in results.best_params_.items():
        print( '\t{}: {}'.format(param, value) )


# specify parameters and distributions to sample from
param_dist = {'max_depth': [3, None],
              'min_samples_split': randint(1, 11),
              'criterion': ['gini', 'entropy']}

# run randomized search
n_iter_search = 6
random_search = RandomizedSearchCV(clf, param_distributions = param_dist,
                                   n_iter = n_iter_search)
start = time()
random_search.fit(X, y)
print('RandomizedSearchCV took %.2f seconds for %d candidates'
      ' parameter settings.' % ((time() - start), n_iter_search))
report(random_search)


# use a full grid over all parameters
param_grid = {'max_depth': [3, None],
              'min_samples_split': [1, 3, 10],
              'criterion': ['gini', 'entropy']}

# run grid search
grid_search = GridSearchCV(clf, param_grid = param_grid)
start = time()
grid_search.fit(X, y)
print('GridSearchCV took %.2f seconds for %d candidate parameter settings.'
      % (time() - start, len(grid_search.cv_results_['params'])))
report(grid_search)

RandomizedSearchCV took 0.52 seconds for 6 candidates parameter settings.
Best score obtained: 0.9733333333333334
Parameters:
	criterion: entropy
	max_depth: None
	min_samples_split: 5
GridSearchCV took 1.12 seconds for 12 candidate parameter settings.
Best score obtained: 0.9666666666666667
Parameters:
	criterion: entropy
	max_depth: 3
	min_samples_split: 1


- https://zacharyst.com/2016/03/31/parallelize-a-multifunction-argument-in-python/
- https://pythonhosted.org/joblib/parallel.html

In [41]:
from math import sqrt
from joblib import Parallel, delayed
Parallel(n_jobs = 2, verbose = 1)( delayed(sqrt)(i ** 2) for i in range(10) )

[Parallel(n_jobs=2)]: Done  10 out of  10 | elapsed:    0.0s finished


[0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0]

## Grid Search

In [9]:
def _fit_and_score(estimator, X, y, scorer, 
                   train_index, test_index,
                   parameters, fit_params):
    
    # create the train/test split
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    # fit the model
    estimator.set_params(**parameters)
    estimator.fit(X_train, y_train, **fit_params)

    # obtain the train/test score
    y_pred_train = estimator.predict(X_train)
    y_pred_test  = estimator.predict(X_test)
    train_score = scorer(y_train, y_pred_train)
    test_score  = scorer(y_test, y_pred_test)
    output = [train_score, test_score]
    return output

In [34]:
class GridSearchCV:
    
    def __init__(self, estimator, scorer, cv, param_grid,
                 fit_params = None, verbose = True, n_jobs = -1, 
                 pre_dispatch = '2*n_jobs', refit = True):
        self.cv = cv
        self.refit = refit
        self.n_jobs = n_jobs
        self.scorer = scorer
        self.verbose = verbose
        self.estimator = estimator
        self.param_grid = param_grid
        self.fit_params = fit_params
        self.pre_dispatch = pre_dispatch     
        
    def fit(self, X, y):
        
        # object used as a cross-validation generator
        # is passed without any modification
        if isinstance(self.cv, int):
            cv = KFolds(n_splits = self.cv)
        else:
            cv = self.cv
        
        # obtain the train/test set index, the parameters
        # and perform cross validation
        cv_iter = cv.split(X)
        params_iterable = list(_get_params_grid(self.param_grid))
        fit_params = self.fit_params if self.fit_params is not None else {}
        
        parallel = Parallel(n_jobs = self.n_jobs, verbose = self.verbose, 
                            pre_dispatch = self.pre_dispatch)
        output = parallel(delayed(_fit_and_score)(deepcopy(self.estimator), 
                                                  X, y, self.scorer,
                                                  train_index, test_index, 
                                                  parameters, fit_params)
                          for train_index, test_index in cv_iter
                          for parameters in params_iterable)

        # unpack training/testing scores
        n_splits = cv.n_splits
        n_candidates = len(params_iterable)
        train_score, test_score = zip(*output)
        train_score = np.array(train_score, dtype = np.float64).reshape(n_splits, n_candidates)
        test_score = np.array(test_score, dtype = np.float64).reshape(n_splits, n_candidates)
        
        # obtain the best score and parameter using the 
        # best mean test scores across all folds, where
        # best here means the higher the better
        mean_test_score = np.mean(test_score, axis = 0)
        best_index = np.argmax(mean_test_score)
        self.best_score = mean_test_score[best_index]
        self.best_param = params_iterable[best_index]

        # list the mean, std train and test score
        # for each parameters combination;
        # not sure if 'params', the column with the
        # values in the dictionary format is useful or not
        mean_train_score = np.mean(train_score, axis = 0)
        std_test_score = np.std(test_score, axis = 0)
        std_train_score = np.std(train_score, axis = 0)
        cv_results = {
            'mean_train_score': mean_train_score,
            'std_train_score': std_train_score,
            'mean_test_score': mean_test_score,
            'std_test_score': std_test_score
        }

        # ensure the columns appear in this order (train score, test score, parameters)
        # and order by the best test score
        cols = ['mean_train_score', 'std_train_score', 'mean_test_score', 'std_test_score']
        cv_results = pd.DataFrame(cv_results, columns = cols)
        df_params  = pd.DataFrame(params_iterable)
        cv_results = pd.concat([cv_results, df_params], axis = 1)
        cv_results['params'] = params_iterable
        cv_results = cv_results.sort_values(['mean_test_score', 'std_test_score'], ascending = False)
        cv_results = cv_results.reset_index(drop = True)
        self.cv_results = cv_results
        
        # refit on the entire dataset after performing cross validation
        if self.refit:
            best_estimator = deepcopy(self.estimator)
            best_estimator.set_params(**self.best_param)
            best_estimator.fit(X, y, **fit_params)
            self.best_estimator = best_estimator
        
        return self
    
    def predict(self, X):
        """call predict on the estimator with the best found parameter"""
        if not self.refit:
            raise ValueError('Only available if refit=True')
        
        return self.best_estimator.predict(X)

In [42]:
#n_jobs = -1
#verbose = True
#pre_dispatch = '2*n_jobs'

# dictionary of
# additional parameters pass to fit
# or just None
#fit_params = None 

#n_splits = 3
#kf = KFolds(n_splits = n_splits, shuffle = True, seed = 4312)
#cv_iter = kf.split(X)
#params_iterable = list(_get_params_grid(param_grid))

# load the data
iris = load_iris()
X = iris.data
y = iris.target

# options
cv = 3
scorer = accuracy_score
clf = RandomForestClassifier()

# fit grid search
grid_search = GridSearchCV(estimator = clf, scorer = scorer, cv = cv, param_grid = param_grid)
grid_search.fit(X, y)
grid_search.cv_results

[Parallel(n_jobs=-1)]: Done  36 out of  36 | elapsed:    0.2s finished


Unnamed: 0,mean_train_score,std_train_score,mean_test_score,std_test_score,criterion,max_depth,min_samples_split,params
0,0.966667,0.004714045,0.953333,0.009428,gini,3.0,1,"{'criterion': 'gini', 'max_depth': 3, 'min_sam..."
1,0.973333,0.004714045,0.953333,0.009428,gini,3.0,3,"{'criterion': 'gini', 'max_depth': 3, 'min_sam..."
2,0.97,0.008164966,0.946667,0.018856,gini,3.0,10,"{'criterion': 'gini', 'max_depth': 3, 'min_sam..."
3,0.983333,0.004714045,0.946667,0.018856,gini,,10,"{'criterion': 'gini', 'max_depth': None, 'min_..."
4,0.973333,0.00942809,0.946667,0.009428,entropy,,10,"{'criterion': 'entropy', 'max_depth': None, 'm..."
5,0.97,0.008164966,0.94,0.03266,entropy,3.0,10,"{'criterion': 'entropy', 'max_depth': 3, 'min_..."
6,0.993333,0.004714045,0.94,0.028284,gini,,3,"{'criterion': 'gini', 'max_depth': None, 'min_..."
7,1.0,0.0,0.933333,0.024944,gini,,1,"{'criterion': 'gini', 'max_depth': None, 'min_..."
8,0.973333,0.004714045,0.933333,0.009428,entropy,3.0,1,"{'criterion': 'entropy', 'max_depth': 3, 'min_..."
9,0.97,1.110223e-16,0.933333,0.009428,entropy,3.0,3,"{'criterion': 'entropy', 'max_depth': 3, 'min_..."


## Reference

- [Github: scikit-learn's KFold](https://github.com/scikit-learn/scikit-learn/blob/14031f6/sklearn/model_selection/_split.py#L347)
- [Github: scikit-learn's GridSearch](https://github.com/scikit-learn/scikit-learn/blob/14031f6/sklearn/model_selection/_search.py#L685)
- [Scikit-learn Documentation: Comparing randomized search and grid search for hyperparameter estimation](http://scikit-learn.org/stable/auto_examples/model_selection/randomized_search.html#sphx-glr-auto-examples-model-selection-randomized-search-py)