Ref: https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74

In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.metrics import confusion_matrix

#PCA dimension reduced files
X_train = pd.read_pickle("X_train_pca.npk")
X_test = pd.read_pickle("X_test_pca.npk")

#keeps the totalRansactionRevenue, class_pred = 0 if didnt buy anythin and 1 if did
y_test = pd.read_pickle("y_test.pkl")
y_train = pd.read_pickle("y_train.pkl")

In [3]:
def evaluate(model, X_test, y_test):
    predictions = model.predict(X_test)
    errors = abs(predictions - y_test)
    mape = 100 * np.mean(errors / y_test)
    accuracy = 100 - mape
    print('Model Performance')
    print('Average Error: {:0.4f} degrees.'.format(np.mean(errors)))
    print('Accuracy = {:0.2f}%.'.format(accuracy))
    
    return accuracy

base_model = RandomForestClassifier(n_estimators = 10, random_state = 42)
base_model.fit(X_train, y_train['class_pred'])
base_accuracy = evaluate(base_model, X_test, y_test['class_pred'])

#best_random = rf_random.best_estimator_
#random_accuracy = evaluate(best_random, X_test, y_test['class_pred'])

#print('Improvement of {:0.2f}%.'.format( 100 * (random_accuracy - base_accuracy) / base_accuracy))

Model Performance
Average Error: 0.0117 degrees.
Accuracy = -inf%.


In [10]:
from sklearn.model_selection import GridSearchCV
# Create the parameter grid based on the results of random search 
param_grid = {
    'bootstrap': [True],
    'max_depth': [50,100,200],
    'max_features': [2, 3],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [10, 100]
}
# Create a based model
rf = RandomForestClassifier()
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 5, verbose = 2)

In [11]:
# Fit the grid search to the data
grid_search.fit(X_train, y_train['class_pred'])
grid_search.best_params_
{'bootstrap': True,
 'max_depth': 80,
 'max_features': 3,
 'min_samples_leaf': 5,
 'min_samples_split': 12,
 'n_estimators': 100}
best_grid = grid_search.best_estimator_
grid_accuracy = evaluate(best_grid, X_test, y_test['class_pred'])

print('Improvement of {:0.2f}%.'.format( 100 * (grid_accuracy - base_accuracy) / base_accuracy))

Fitting 5 folds for each of 108 candidates, totalling 540 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] bootstrap=True, max_depth=50, max_features=2, min_samples_leaf=3, min_samples_split=8, n_estimators=10 
[CV]  bootstrap=True, max_depth=50, max_features=2, min_samples_leaf=3, min_samples_split=8, n_estimators=10, total=  27.5s
[CV] bootstrap=True, max_depth=50, max_features=2, min_samples_leaf=3, min_samples_split=8, n_estimators=10 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   29.4s remaining:    0.0s


[CV]  bootstrap=True, max_depth=50, max_features=2, min_samples_leaf=3, min_samples_split=8, n_estimators=10, total=  27.4s
[CV] bootstrap=True, max_depth=50, max_features=2, min_samples_leaf=3, min_samples_split=8, n_estimators=10 
[CV]  bootstrap=True, max_depth=50, max_features=2, min_samples_leaf=3, min_samples_split=8, n_estimators=10, total=  31.7s
[CV] bootstrap=True, max_depth=50, max_features=2, min_samples_leaf=3, min_samples_split=8, n_estimators=10 
[CV]  bootstrap=True, max_depth=50, max_features=2, min_samples_leaf=3, min_samples_split=8, n_estimators=10, total=  26.7s
[CV] bootstrap=True, max_depth=50, max_features=2, min_samples_leaf=3, min_samples_split=8, n_estimators=10 
[CV]  bootstrap=True, max_depth=50, max_features=2, min_samples_leaf=3, min_samples_split=8, n_estimators=10, total=  28.1s
[CV] bootstrap=True, max_depth=50, max_features=2, min_samples_leaf=3, min_samples_split=8, n_estimators=100 
[CV]  bootstrap=True, max_depth=50, max_features=2, min_samples_lea

[CV]  bootstrap=True, max_depth=50, max_features=2, min_samples_leaf=4, min_samples_split=8, n_estimators=100, total= 2.9min
[CV] bootstrap=True, max_depth=50, max_features=2, min_samples_leaf=4, min_samples_split=8, n_estimators=100 
[CV]  bootstrap=True, max_depth=50, max_features=2, min_samples_leaf=4, min_samples_split=8, n_estimators=100, total= 2.9min
[CV] bootstrap=True, max_depth=50, max_features=2, min_samples_leaf=4, min_samples_split=8, n_estimators=100 
[CV]  bootstrap=True, max_depth=50, max_features=2, min_samples_leaf=4, min_samples_split=8, n_estimators=100, total= 2.9min
[CV] bootstrap=True, max_depth=50, max_features=2, min_samples_leaf=4, min_samples_split=8, n_estimators=100 
[CV]  bootstrap=True, max_depth=50, max_features=2, min_samples_leaf=4, min_samples_split=8, n_estimators=100, total= 2.9min
[CV] bootstrap=True, max_depth=50, max_features=2, min_samples_leaf=4, min_samples_split=10, n_estimators=10 
[CV]  bootstrap=True, max_depth=50, max_features=2, min_samp

[CV]  bootstrap=True, max_depth=50, max_features=2, min_samples_leaf=5, min_samples_split=10, n_estimators=10, total=  17.7s
[CV] bootstrap=True, max_depth=50, max_features=2, min_samples_leaf=5, min_samples_split=10, n_estimators=10 
[CV]  bootstrap=True, max_depth=50, max_features=2, min_samples_leaf=5, min_samples_split=10, n_estimators=10, total=  17.1s
[CV] bootstrap=True, max_depth=50, max_features=2, min_samples_leaf=5, min_samples_split=10, n_estimators=10 
[CV]  bootstrap=True, max_depth=50, max_features=2, min_samples_leaf=5, min_samples_split=10, n_estimators=10, total=  17.3s
[CV] bootstrap=True, max_depth=50, max_features=2, min_samples_leaf=5, min_samples_split=10, n_estimators=10 
[CV]  bootstrap=True, max_depth=50, max_features=2, min_samples_leaf=5, min_samples_split=10, n_estimators=10, total=  17.9s
[CV] bootstrap=True, max_depth=50, max_features=2, min_samples_leaf=5, min_samples_split=10, n_estimators=100 
[CV]  bootstrap=True, max_depth=50, max_features=2, min_sam

[CV]  bootstrap=True, max_depth=50, max_features=3, min_samples_leaf=3, min_samples_split=10, n_estimators=100, total= 4.0min
[CV] bootstrap=True, max_depth=50, max_features=3, min_samples_leaf=3, min_samples_split=10, n_estimators=100 
[CV]  bootstrap=True, max_depth=50, max_features=3, min_samples_leaf=3, min_samples_split=10, n_estimators=100, total= 4.1min
[CV] bootstrap=True, max_depth=50, max_features=3, min_samples_leaf=3, min_samples_split=10, n_estimators=100 
[CV]  bootstrap=True, max_depth=50, max_features=3, min_samples_leaf=3, min_samples_split=10, n_estimators=100, total= 4.1min
[CV] bootstrap=True, max_depth=50, max_features=3, min_samples_leaf=3, min_samples_split=10, n_estimators=100 
[CV]  bootstrap=True, max_depth=50, max_features=3, min_samples_leaf=3, min_samples_split=10, n_estimators=100, total= 4.0min
[CV] bootstrap=True, max_depth=50, max_features=3, min_samples_leaf=3, min_samples_split=12, n_estimators=10 
[CV]  bootstrap=True, max_depth=50, max_features=3, m

[CV]  bootstrap=True, max_depth=50, max_features=3, min_samples_leaf=4, min_samples_split=12, n_estimators=10, total=  26.2s
[CV] bootstrap=True, max_depth=50, max_features=3, min_samples_leaf=4, min_samples_split=12, n_estimators=10 
[CV]  bootstrap=True, max_depth=50, max_features=3, min_samples_leaf=4, min_samples_split=12, n_estimators=10, total=  26.2s
[CV] bootstrap=True, max_depth=50, max_features=3, min_samples_leaf=4, min_samples_split=12, n_estimators=10 
[CV]  bootstrap=True, max_depth=50, max_features=3, min_samples_leaf=4, min_samples_split=12, n_estimators=10, total=  25.8s
[CV] bootstrap=True, max_depth=50, max_features=3, min_samples_leaf=4, min_samples_split=12, n_estimators=10 
[CV]  bootstrap=True, max_depth=50, max_features=3, min_samples_leaf=4, min_samples_split=12, n_estimators=10, total=  26.6s
[CV] bootstrap=True, max_depth=50, max_features=3, min_samples_leaf=4, min_samples_split=12, n_estimators=100 
[CV]  bootstrap=True, max_depth=50, max_features=3, min_sam

[CV]  bootstrap=True, max_depth=50, max_features=3, min_samples_leaf=5, min_samples_split=12, n_estimators=100, total= 4.1min
[CV] bootstrap=True, max_depth=50, max_features=3, min_samples_leaf=5, min_samples_split=12, n_estimators=100 
[CV]  bootstrap=True, max_depth=50, max_features=3, min_samples_leaf=5, min_samples_split=12, n_estimators=100, total= 4.2min
[CV] bootstrap=True, max_depth=50, max_features=3, min_samples_leaf=5, min_samples_split=12, n_estimators=100 
[CV]  bootstrap=True, max_depth=50, max_features=3, min_samples_leaf=5, min_samples_split=12, n_estimators=100, total= 4.2min
[CV] bootstrap=True, max_depth=50, max_features=3, min_samples_leaf=5, min_samples_split=12, n_estimators=100 
[CV]  bootstrap=True, max_depth=50, max_features=3, min_samples_leaf=5, min_samples_split=12, n_estimators=100, total= 4.1min
[CV] bootstrap=True, max_depth=100, max_features=2, min_samples_leaf=3, min_samples_split=8, n_estimators=10 
[CV]  bootstrap=True, max_depth=100, max_features=2, 

KeyboardInterrupt: 