In [2]:
import numpy as np

from time import time
from scipy.stats import randint as sp_randint

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.datasets import load_digits
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.svm import SVC

  return f(*args, **kwds)


In [9]:
print('hello')

hello


In [3]:
# Seed the random number generator:
np.random.seed(1)

def load_data(filename, skiprows = 1):
    """
    Function loads data stored in the file filename and returns it as a numpy ndarray.
    
    Inputs:
        filename: given as a string.
        
    Outputs:
        Data contained in the file, returned as a numpy ndarray
    """
    return np.loadtxt(filename, skiprows=skiprows, delimiter=' ')

In [4]:
X = load_data('training_data.txt')
y = X[:, 0]
X = X[:, 1:]

In [7]:
# build a classifier
clf = RandomForestClassifier(n_estimators=20)

# Utility function to report best scores
def report(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                  results['mean_test_score'][candidate],
                  results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")

In [5]:
# specify parameters and distributions to sample from
param_dist = {"max_depth": [10, None],
              "max_features": sp_randint(1, 10),
              "min_samples_split": sp_randint(2, 10),
              "min_samples_leaf": sp_randint(1, 10),
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}

# run randomized search
n_iter_search = 20
random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                   n_iter=n_iter_search)

start = time()
random_search.fit(X, y)
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), n_iter_search))
report(random_search.cv_results_)

RandomizedSearchCV took 71.81 seconds for 20 candidates parameter settings.
Model with rank: 1
Mean validation score: 0.821 (std: 0.005)
Parameters: {'bootstrap': False, 'criterion': 'entropy', 'max_depth': None, 'max_features': 8, 'min_samples_leaf': 5, 'min_samples_split': 5}

Model with rank: 2
Mean validation score: 0.818 (std: 0.007)
Parameters: {'bootstrap': False, 'criterion': 'entropy', 'max_depth': None, 'max_features': 8, 'min_samples_leaf': 7, 'min_samples_split': 3}

Model with rank: 3
Mean validation score: 0.817 (std: 0.005)
Parameters: {'bootstrap': False, 'criterion': 'entropy', 'max_depth': None, 'max_features': 9, 'min_samples_leaf': 8, 'min_samples_split': 5}



In [None]:
# build a classifier
clf = RandomForestClassifier()

# use a full grid over all parameters
param_grid = {"max_depth": [None],
              "max_features": [i for i in range(10, 20, 5)],
              "min_samples_split": [i for i in range(2, 20, 5)],
              "min_samples_leaf": [2],
              "criterion": ["gini", "entropy"],
             "n_estimators" : [200, 400, 800]}

# run grid search
grid_search = GridSearchCV(clf, param_grid=param_grid, verbose=1)
start = time()
grid_search.fit(X, y)

print("GridSearchCV took %.2f seconds for %d candidate parameter settings."
      % (time() - start, len(grid_search.cv_results_['params'])))
report(grid_search.cv_results_)

Fitting 3 folds for each of 48 candidates, totalling 144 fits


In [None]:
# build a classifier
clf = ExtraTreesClassifier(criterion="entropy", max_features=int(np.sqrt(1000)))

# use a full grid over all parameters
param_grid = {"min_samples_split": [i for i in range(2, 20, 4)],
             "n_estimators" : [200, 400, 800]}

# run grid search
grid_search = GridSearchCV(clf, param_grid=param_grid, verbose=1)
start = time()
grid_search.fit(X, y)

print("GridSearchCV took %.2f seconds for %d candidate parameter settings."
      % (time() - start, len(grid_search.cv_results_['params'])))
report(grid_search.cv_results_)

Fitting 3 folds for each of 15 candidates, totalling 45 fits


In [8]:
report(grid_search.cv_results_)

Model with rank: 1
Mean validation score: 0.846 (std: 0.005)
Parameters: {'n_estimators': 800, 'criterion': 'entropy', 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 31, 'max_depth': None}

Model with rank: 2
Mean validation score: 0.846 (std: 0.006)
Parameters: {'n_estimators': 800, 'criterion': 'entropy', 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_features': 31, 'max_depth': None}

Model with rank: 3
Mean validation score: 0.846 (std: 0.005)
Parameters: {'n_estimators': 800, 'criterion': 'entropy', 'min_samples_split': 6, 'min_samples_leaf': 2, 'max_features': 31, 'max_depth': None}



In [15]:
grid_search.cv_results_



{'mean_fit_time': array([ 64.10927955, 123.20454582, 280.26738739, 103.41097903,
        181.29714259, 310.16634234,  74.20762404, 142.78002834,
        276.1756657 ,  60.90540902, 114.18001866, 215.61855563,
         57.00969577, 107.60875169, 212.85640931,  55.33712133,
        111.2439909 , 223.433122  ,  55.63524365, 110.448656  ,
        230.28141729,  56.15693061, 115.6877013 , 301.64345996,
         73.09571862, 124.07052294, 267.37661942,  66.01728503,
        138.58766596, 270.03549178]),
 'mean_score_time': array([0.83331943, 1.65936724, 4.7248373 , 1.28930227, 2.76133609,
        4.06434973, 1.09644628, 1.86372201, 3.63883861, 0.8135066 ,
        1.50826693, 2.89331834, 0.75284839, 1.40972567, 2.80638568,
        0.73773964, 1.48190204, 3.00956861, 0.75511758, 1.46741335,
        3.11868866, 0.72307698, 1.51910996, 4.07811928, 0.9573257 ,
        1.78784871, 3.42611472, 0.84042501, 1.69648902, 3.39087796]),
 'mean_test_score': array([0.84175, 0.84325, 0.8431 , 0.84085, 0.843

In [14]:
def plot_grid_search(cv_results, grid_param_1, grid_param_2, name_param_1, name_param_2):

    # Get Test Scores Mean and std for each grid search
    scores_mean = cv_results['mean_test_score']
    scores_mean = np.array(scores_mean).reshape(len(grid_param_2),len(grid_param_1))

    scores_sd = cv_results['std_test_score']
    scores_sd = np.array(scores_sd).reshape(len(grid_param_2),len(grid_param_1))

    # Plot Grid search scores
    _, ax = plt.subplots(1,1)

    # Param1 is the X-axis, Param 2 is represented as a different curve (color line)
    for idx, val in enumerate(grid_param_2):
        ax.plot(grid_param_1, scores_mean[idx,:], '-o', label= name_param_2 + ': ' + str(val))

    ax.set_title("Grid Search Scores", fontsize=20, fontweight='bold')
    ax.set_xlabel(name_param_1, fontsize=16)
    ax.set_ylabel('CV Average Score', fontsize=16)
    ax.legend(loc="best", fontsize=15)
    ax.grid('on')

# Calling Method 
plot_grid_search(grid_search.cv_results_, 'n_estimators', 'max_features', 'N Estimators', 'Max Features')

ValueError: cannot reshape array of size 30 into shape (12,12)