In [45]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import classification_report
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.base import BaseEstimator as est
import warnings
import time

start_time = time.time()
warnings.filterwarnings('ignore')

print(__doc__)

# Loading the Digits dataset
digits = datasets.load_digits()

# To apply an classifier on this data, we need to flatten the image, to
# turn the data in a (samples, feature) matrix:
n_samples = len(digits.images)
X = digits.images.reshape((n_samples, -1))
y = digits.target

# Split the dataset in two equal parts
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=0)

# Set the parameters by cross-validation
tuned_parameters = {'splitter':['best', 'random'],'max_depth':[1,2,3,4,5,6,7,8,9,10],
                    'criterion':['gini', 'entropy']}

scores = ['precision', 'recall','f1']

print('\n' + 'parameters:' + "\n" + str(est.get_params(DecisionTreeClassifier)) + '\n')
for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    clf = GridSearchCV(
        DecisionTreeClassifier(), tuned_parameters, scoring='%s_macro' % score
    )
    clf.fit(X_train, y_train)

    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    y_true, y_pred = y_test, clf.predict(X_test)
    print(classification_report(y_true, y_pred))
    print()
print("Time Consumed: " + str(time.time()-start_time))

Automatically created module for IPython interactive environment

parameters:
{'ccp_alpha': None, 'class_weight': None, 'criterion': None, 'max_depth': None, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': None, 'min_impurity_split': None, 'min_samples_leaf': None, 'min_samples_split': None, 'min_weight_fraction_leaf': None, 'presort': None, 'random_state': None, 'splitter': None}

# Tuning hyper-parameters for precision

Best parameters set found on development set:

{'criterion': 'entropy', 'max_depth': 9, 'splitter': 'random'}

Grid scores on development set:

0.070 (+/-0.023) for {'criterion': 'gini', 'max_depth': 1, 'splitter': 'best'}
0.056 (+/-0.038) for {'criterion': 'gini', 'max_depth': 1, 'splitter': 'random'}
0.232 (+/-0.070) for {'criterion': 'gini', 'max_depth': 2, 'splitter': 'best'}
0.177 (+/-0.024) for {'criterion': 'gini', 'max_depth': 2, 'splitter': 'random'}
0.425 (+/-0.141) for {'criterion': 'gini', 'max_depth': 3, 'splitter': 'best'}
0.394 (+

In [46]:
start_time = time.time()
warnings.filterwarnings('ignore')

print(__doc__)

# Loading the Digits dataset
digits = datasets.load_digits()

# To apply an classifier on this data, we need to flatten the image, to
# turn the data in a (samples, feature) matrix:
n_samples = len(digits.images)
X = digits.images.reshape((n_samples, -1))
y = digits.target

# Split the dataset in two equal parts
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=0)

# Set the parameters by cross-validation
tuned_parameters = {'splitter':['best', 'random'],'max_depth':[1,2,3,4,5,6,7,8,9,10],
                    'criterion':['gini', 'entropy']}

scores = ['precision', 'recall','f1']

print('\n' + 'parameters:' + "\n" + str(est.get_params(DecisionTreeClassifier)) + '\n')
for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    clf = RandomizedSearchCV(
        DecisionTreeClassifier(), tuned_parameters, scoring='%s_macro' % score
    )
    clf.fit(X_train, y_train)

    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    y_true, y_pred = y_test, clf.predict(X_test)
    print(classification_report(y_true, y_pred))
    print()
print("Time Consumed: " + str(time.time()-start_time))

Automatically created module for IPython interactive environment

parameters:
{'ccp_alpha': None, 'class_weight': None, 'criterion': None, 'max_depth': None, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': None, 'min_impurity_split': None, 'min_samples_leaf': None, 'min_samples_split': None, 'min_weight_fraction_leaf': None, 'presort': None, 'random_state': None, 'splitter': None}

# Tuning hyper-parameters for precision

Best parameters set found on development set:

{'splitter': 'random', 'max_depth': 8, 'criterion': 'entropy'}

Grid scores on development set:

0.741 (+/-0.042) for {'splitter': 'best', 'max_depth': 5, 'criterion': 'entropy'}
0.793 (+/-0.079) for {'splitter': 'random', 'max_depth': 10, 'criterion': 'gini'}
0.425 (+/-0.141) for {'splitter': 'best', 'max_depth': 3, 'criterion': 'gini'}
0.798 (+/-0.033) for {'splitter': 'best', 'max_depth': 9, 'criterion': 'gini'}
0.776 (+/-0.069) for {'splitter': 'random', 'max_depth': 7, 'criterion': 'gini'}
0.18

In [None]:
'''
Grid Search time consumed: 3.06325626373291
Randomized Search time consumed: 0.9388930797576904

grid search essentially brute forces its way through all possible combinations of hyperparameters 
and saves the metrics for the combination with the best performance.

A randomized search provides an alternative to the exhaustive grid search method. 
As the name suggests, it randomly selects combinations of hyperparameters and tests them to find the 
optimal hyperparameter values out of the randomly selected group. 
This method is typically faster than a grid search since it doesn’t test the full range of possibilities.
'''