In [1]:
# manual nested cross-validation for random forest on a classification dataset
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [2]:
# synthetic classification dataset: 1000 samples with 20 features,
# of which 10 are informative and 10 are redundant; seeded for repeatability
X, y = make_classification(
	n_samples=1000,
	n_features=20,
	n_informative=10,
	n_redundant=10,
	random_state=1,
)

In [3]:
# accumulator for the accuracy measured on each outer hold-out fold
outer_results = []
# outer cross-validation procedure: 4 shuffled folds with a fixed seed
cv_outer = KFold(shuffle=True, n_splits=4, random_state=1)

In [4]:
# Nested cross-validation of a random forest: each outer fold holds out a test
# split, and a grid search with its own inner folds tunes the hyperparameters
# using only the matching outer training split.

# Reset the score list in this cell so that re-running the cell starts from
# scratch instead of appending to scores left in the kernel by an earlier run.
outer_results = list()
# define search space (identical for every outer fold, so it is built once)
space = {
	'n_estimators': [10, 100, 500],
	'max_features': [2, 4, 6],
}
for train_ix, test_ix in cv_outer.split(X):
	# split data into the outer training and hold-out parts
	X_train, X_test = X[train_ix, :], X[test_ix, :]
	y_train, y_test = y[train_ix], y[test_ix]
	# configure the inner cross-validation procedure used for tuning
	cv_inner = KFold(n_splits=3, shuffle=True, random_state=1)
	# define the model
	model = RandomForestClassifier(random_state=1)
	# define search: every configuration in `space` is scored by accuracy on the inner folds
	search = GridSearchCV(model, space, scoring='accuracy', cv=cv_inner, refit=True, verbose=True)
	# execute search on the outer training split only
	print('==GridSearchCV==')
	print(search)
	result = search.fit(X_train, y_train)
	# get the best performing model fit on the whole training set
	best_model = result.best_estimator_
	print('==Best Model==')
	print(best_model)
	print('==Inner Score==')
	print('Accuracy:', result.best_score_)
	# evaluate model on the hold out dataset
	yhat = best_model.predict(X_test)
	# outer score: accuracy on data the search never saw
	acc = accuracy_score(y_test, yhat)
	print('==Outer Score==')
	print('Accuracy:', acc)
	# store the result
	outer_results.append(acc)
	# report progress: outer accuracy, inner (search) estimate and chosen configuration
	print('===REPORT===')
	print('>acc=%.3f, est=%.3f, cfg=%s' % (acc, result.best_score_, result.best_params_))
	print('+++++CYCLE END+++++')
# summarize the estimated performance of the model: mean (std) of the outer-fold accuracies
print('Accuracy: %.3f (%.3f)' % (mean(outer_results), std(outer_results)))

==GridSearchCV==
GridSearchCV(cv=KFold(n_splits=3, random_state=1, shuffle=True),
             estimator=RandomForestClassifier(random_state=1),
             param_grid={'max_features': [2, 4, 6],
                         'n_estimators': [10, 100, 500]},
             scoring='accuracy', verbose=True)
Fitting 3 folds for each of 9 candidates, totalling 27 fits
==Best Model==
RandomForestClassifier(max_features=4, n_estimators=500, random_state=1)
==Inner Score==
Accuracy: 0.9293333333333332
==Outer Score==
Accuracy: 0.908
===REPORT===
>acc=0.908, est=0.929, cfg={'max_features': 4, 'n_estimators': 500}
+++++CYCLE END+++++
==GridSearchCV==
GridSearchCV(cv=KFold(n_splits=3, random_state=1, shuffle=True),
             estimator=RandomForestClassifier(random_state=1),
             param_grid={'max_features': [2, 4, 6],
                         'n_estimators': [10, 100, 500]},
             scoring='accuracy', verbose=True)
Fitting 3 folds for each of 9 candidates, totalling 27 fits
==Best Mod