# Grid Search and Grid Search CV

Z. W. Miller - Copyright 2018

In [1]:
import numpy as np
import random
import matplotlib.pyplot as plt
import pandas as pd
import math
import scipy
%matplotlib inline
plt.style.use('seaborn')

In [2]:
import numpy as np
import sklearn
import matplotlib
import pandas as pd
import sys
# (display name, imported module) pairs whose versions are reported below,
# so the environment the notebook was run in is recorded with its output.
libraries = (('Matplotlib', matplotlib), ('Numpy', np), ('Pandas', pd))

print("Python Version:", sys.version, '\n')
for label, module in libraries:
    print('{0} Version: {1}'.format(label, module.__version__))

Python Version: 3.6.2 |Anaconda custom (64-bit)| (default, Sep 21 2017, 18:29:43) 
[GCC 4.2.1 Compatible Clang 4.0.1 (tags/RELEASE_401/final)] 

Matplotlib Version: 2.0.2
Numpy Version: 1.12.1
Pandas Version: 0.20.3


In [3]:
from itertools import product

class grid_search():
    """
    Exhaustive search over a parameter grid, scored on the training data.

    Candidate models are described as constructor-call strings (e.g.
    "my_model(a=1, b=2)") and only turned into objects, via eval, when
    fit() is called.
    """

    def __init__(self, model_name, param_grid):
        """
        Given a base model and a parameter grid of params
        for that model, builds the constructor call for every
        combination of parameters. Nothing is fit until fit() is called.
        ---
        Inputs:
        model_name : the constructor call of the model, with parentheses,
        as a string. Any parameters you wish to set for all
        models can be set inside the parentheses.
        param_grid: dictionary with parameter names as keys,
        and list of param values to test as value for each key
        """
        self._base_model = self._strip_closing_paren(model_name)
        self._param_grid = param_grid if param_grid is not None else {}
        self.models = self.get_models()
        # Filled in by fit(). Defined up front so print_results() can be
        # called safely on an unfitted search.
        self.all_results = []
        self.best_model = None
        self.best_score = None

    @staticmethod
    def _strip_closing_paren(model_name):
        """
        Drops only the final ')' of the constructor call so more keyword
        arguments can be appended. Parentheses inside the argument list
        (e.g. a tuple-valued fixed parameter) are left intact.
        """
        name = str(model_name).rstrip()
        if name.endswith(')'):
            name = name[:-1].rstrip()
        return name

    @staticmethod
    def _format_arg(name, value):
        """
        Renders one "name=value" keyword argument as source text.
        Strings go through repr so embedded quotes/backslashes survive eval.
        """
        if isinstance(value, str):
            return '{0}={1!r}'.format(name, str(value))
        return '{0}={1}'.format(name, value)

    def get_models(self):
        """
        Finds every combination of parameters from the param grid.
        Uses the string base name to create a list of model
        constructor calls with the proper parameters. This list is
        still in string form until we're ready to test the models.
        ---
        Returns:
        list of strings, one constructor call per parameter combination.
        An empty grid yields just the base model call.
        """
        order = list(self._param_grid)
        # A base that does not end in '(' already carries fixed arguments.
        has_fixed_args = not self._base_model.endswith('(')

        command_list = []
        for option in product(*self._param_grid.values()):
            args = [self._format_arg(k, v) for k, v in zip(order, option)]
            # Only add a separator when there is something on both sides.
            separator = ', ' if (has_fixed_args and args) else ''
            command_list.append(
                self._base_model + separator + ', '.join(args) + ')')
        return command_list

    def fit(self, X, y):
        """
        Uses the "eval" function in Python to convert each model
        string into an actual model. Fits each model on (X, y)
        and scores it on the same (X, y), i.e. a training-set score.
        Stores all [model, score, model_string] results sorted best first
        in all_results, and the top entry in best_model / best_score.
        ---
        Raises:
        ValueError if the parameter grid produced no combinations
        (one of the value lists is empty).
        """
        if not self.models:
            raise ValueError(
                "param_grid produced no parameter combinations "
                "(is one of the value lists empty?)")

        results = []
        for model_name in self.models:
            # NOTE(security): eval executes arbitrary code and resolves the
            # model class from this module's namespace. Only pass trusted
            # model_name / param_grid values.
            model = eval(model_name)
            model.fit(X, y)
            results.append([model, model.score(X, y), model_name])
        self.all_results = sorted(results, key=lambda r: r[1], reverse=True)
        self.best_model = self.all_results[0][0]
        self.best_score = self.all_results[0][1]

    def print_results(self):
        """
        Method to print the results in a nice readable format,
        best score first. Prints nothing if fit() has not been run.
        """
        if self.all_results:
            print("Model    |    Score\n--------------------\n")
            for result in self.all_results:
                print(result[2], "   |   ", result[1], "\n")

In [4]:
import sys 
sys.path.append('../../')
from zwml.tree_models import random_forest_classifier
# 3 values x 3 values -> 9 candidate models.
param_grid = {"n_trees": [1,10,20],"max_depth":[1,5,10]}

# n_features=3 is written into the base string, so it is shared by every candidate.
gs = grid_search('random_forest_classifier(n_features=3)',param_grid)
# Show the generated constructor strings (nothing has been fit yet).
gs.models

['random_forest_classifier(n_features=3, n_trees=1, max_depth=1)',
 'random_forest_classifier(n_features=3, n_trees=1, max_depth=5)',
 'random_forest_classifier(n_features=3, n_trees=1, max_depth=10)',
 'random_forest_classifier(n_features=3, n_trees=10, max_depth=1)',
 'random_forest_classifier(n_features=3, n_trees=10, max_depth=5)',
 'random_forest_classifier(n_features=3, n_trees=10, max_depth=10)',
 'random_forest_classifier(n_features=3, n_trees=20, max_depth=1)',
 'random_forest_classifier(n_features=3, n_trees=20, max_depth=5)',
 'random_forest_classifier(n_features=3, n_trees=20, max_depth=10)']

In [5]:
from sklearn.datasets import load_iris
# Load the dataset once and reuse it for both the features and the labels
# instead of calling load_iris() a second time.
iris = load_iris()
X = iris.data
y = iris.target

In [11]:
# Fit every candidate. grid_search.fit scores each model on the same X, y it
# was trained on, so the scores below are training-set scores.
gs.fit(X,y)

In [12]:
# Results are stored sorted by score, so this prints best to worst.
gs.print_results()

Model    |    Score
--------------------

random_forest_classifier(n_features=3, n_trees=10, max_depth=10)    |    1.0 

random_forest_classifier(n_features=3, n_trees=20, max_depth=10)    |    1.0 

random_forest_classifier(n_features=3, n_trees=1, max_depth=10)    |    0.9933333333333333 

random_forest_classifier(n_features=3, n_trees=20, max_depth=5)    |    0.9933333333333333 

random_forest_classifier(n_features=3, n_trees=10, max_depth=5)    |    0.9866666666666667 

random_forest_classifier(n_features=3, n_trees=1, max_depth=5)    |    0.9733333333333334 

random_forest_classifier(n_features=3, n_trees=10, max_depth=1)    |    0.6666666666666666 

random_forest_classifier(n_features=3, n_trees=20, max_depth=1)    |    0.66 

random_forest_classifier(n_features=3, n_trees=1, max_depth=1)    |    0.6533333333333333 



In [14]:
from itertools import product
import sys 
sys.path.append('../../')
from zwml.utilities import cross_val

class grid_search_cv():
    """
    Exhaustive search over a parameter grid, scored with k-fold
    cross validation.

    Candidate models are described as constructor-call strings (e.g.
    "my_model(a=1, b=2)") and only turned into objects, via eval, when
    fit() is called.
    """

    def __init__(self, model_name, param_grid=None, k=5):
        """
        Given a base model and a parameter grid of params
        for that model, builds the constructor call for every
        combination of parameters. Nothing is fit until fit() is called.
        ---
        Inputs:
        model_name : the constructor call of the model, with parentheses,
        as a string. Any parameters you wish to set for all
        models can be set inside the parentheses.
        param_grid: dictionary with parameter names as keys,
        and list of param values to test as value for each key.
        Omitted/None means "no grid": only the base model is tested.
        k: number of folds for cross val
        """
        self._base_model = self._strip_closing_paren(model_name)
        # None instead of a shared mutable {} default.
        self._param_grid = param_grid if param_grid is not None else {}
        self.k = k
        self.models = self.get_models()
        # Filled in by fit(). Defined up front so print_results() can be
        # called safely on an unfitted search.
        self.all_results = []
        self.best_model = None
        self.best_score = None

    @staticmethod
    def _strip_closing_paren(model_name):
        """
        Drops only the final ')' of the constructor call so more keyword
        arguments can be appended. Parentheses inside the argument list
        (e.g. a tuple-valued fixed parameter) are left intact.
        """
        name = str(model_name).rstrip()
        if name.endswith(')'):
            name = name[:-1].rstrip()
        return name

    @staticmethod
    def _format_arg(name, value):
        """
        Renders one "name=value" keyword argument as source text.
        Strings go through repr so embedded quotes/backslashes survive eval.
        """
        if isinstance(value, str):
            return '{0}={1!r}'.format(name, str(value))
        return '{0}={1}'.format(name, value)

    def get_models(self):
        """
        Finds every combination of parameters from the param grid.
        Uses the string base name to create a list of model
        constructor calls with the proper parameters. This list is
        still in string form until we're ready to test the models.
        ---
        Returns:
        list of strings, one constructor call per parameter combination.
        An empty grid yields just the base model call.
        """
        order = list(self._param_grid)
        # A base that does not end in '(' already carries fixed arguments.
        has_fixed_args = not self._base_model.endswith('(')

        command_list = []
        for option in product(*self._param_grid.values()):
            args = [self._format_arg(k, v) for k, v in zip(order, option)]
            # Only add a separator when there is something on both sides.
            separator = ', ' if (has_fixed_args and args) else ''
            command_list.append(
                self._base_model + separator + ', '.join(args) + ')')
        return command_list

    def fit(self, X, y):
        """
        Uses the "eval" function in Python to convert each model
        string into an actual model, then runs it through
        cross_val.cross_validation_scores with self.k folds.
        Stores all [model, fold_scores, model_string] results in
        all_results, sorted by mean fold score, best first.
        best_model / best_score hold the top entry; note that best_score
        is the list of per-fold scores, not their mean.
        ---
        Raises:
        ValueError if the parameter grid produced no combinations
        (one of the value lists is empty).
        """
        if not self.models:
            raise ValueError(
                "param_grid produced no parameter combinations "
                "(is one of the value lists empty?)")

        results = []
        for model_name in self.models:
            # NOTE(security): eval executes arbitrary code and resolves the
            # model class from this module's namespace. Only pass trusted
            # model_name / param_grid values.
            model = eval(model_name)
            # A fresh cross_val per candidate; its score_folds attribute is
            # read right after the run.
            # NOTE(review): presumably score_folds holds one score per
            # fold -- confirm against zwml.utilities.cross_val.
            cv = cross_val()
            cv.cross_validation_scores(model, X, y, self.k)
            results.append([model, cv.score_folds, model_name])
        self.all_results = sorted(
            results, key=lambda r: np.mean(r[1]), reverse=True)
        self.best_model = self.all_results[0][0]
        self.best_score = self.all_results[0][1]

    def print_results(self, coefs=False, mean=False):
        """
        Method to print the results in a nice readable format.
        If the user asks for mean, only show the average score
        across all folds. If the user asks for coefficients
        show coefficients if the model has them.
        Prints nothing if fit() has not been run.
        """
        if self.all_results:
            print("Model    |    Scores\n--------------------")
            for result in self.all_results:
                if mean:
                    print(result[2], "   |   ", np.mean(result[1]))
                else:
                    print(result[2], "   |   ", result[1])
                if coefs:
                    try:
                        print("Coefs: ", result[0].coefs_)
                    except AttributeError:
                        print("No Coefficients in model!")
                print()
In [15]:
# 2 x 2 x 2 -> 8 candidates; the string-valued 'mode' entries are quoted
# in the generated constructor calls.
param_grid = {"n_trees": [1,10],"max_depth":[1,5], 'mode':['rfnode', 'rftree']}
# k is not passed, so the default of 5 folds is used.
gs = grid_search_cv('random_forest_classifier()',param_grid)
gs.fit(X,y)

In [16]:
# mean=False prints the full list of fold scores per candidate; rows are
# ordered by mean fold score, best first.
gs.print_results(mean=False, coefs=False)

Model    |    Scores
--------------------
random_forest_classifier(n_trees=10, max_depth=5, mode='rfnode')    |    [0.9666666666666667, 0.9666666666666667, 0.9666666666666667, 0.9333333333333333, 1.0]

random_forest_classifier(n_trees=10, max_depth=5, mode='rftree')    |    [0.9333333333333333, 0.9333333333333333, 0.9666666666666667, 0.9333333333333333, 1.0]

random_forest_classifier(n_trees=1, max_depth=5, mode='rftree')    |    [0.9, 0.9, 0.9666666666666667, 0.8333333333333334, 0.9333333333333333]

random_forest_classifier(n_trees=10, max_depth=1, mode='rfnode')    |    [0.43333333333333335, 0.9666666666666667, 0.9666666666666667, 0.8666666666666667, 0.9]

random_forest_classifier(n_trees=10, max_depth=1, mode='rftree')    |    [0.43333333333333335, 0.9333333333333333, 0.9, 0.8666666666666667, 0.9]

random_forest_classifier(n_trees=1, max_depth=5, mode='rfnode')    |    [0.6333333333333333, 0.7, 0.9333333333333333, 0.7666666666666667, 0.9666666666666667]

random_forest_classifier(n_t