In [1]:
# imports

import matplotlib.pyplot as plt
import numpy as np
from matplotlib.colors import ListedColormap
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.metrics import accuracy_score # for nested cv, we can optimize this

In [2]:
def plot_feature_importances(model, data):
    """Draw a horizontal bar chart of ``model.feature_importances_``.

    Parameters
    ----------
    model : object
        Fitted estimator exposing a ``feature_importances_`` attribute.
    data : object
        Table the model was trained on; ``data.shape[1]`` gives the number of
        bars and ``data.columns`` supplies the tick labels.

    The chart is drawn on the current pyplot figure; nothing is returned and
    ``plt.show()`` is left to the caller.
    """
    feature_count = data.shape[1]
    bar_positions = range(feature_count)
    importances = model.feature_importances_

    # One bar per feature, each labelled with its column name.
    plt.barh(bar_positions, importances, align='center')
    plt.yticks(np.arange(feature_count), data.columns)

    plt.xlabel('Feature importance')
    plt.ylabel('Feature')

In [3]:
def plot_classification_results(features, target, model, title, xlabel, ylabel, normalizer=None,
                                padding=(10, 1000), step=0.25):
    """Plot a classifier's decision regions over two features, with the samples on top.

    Parameters
    ----------
    features : array-like of shape (n_samples, 2)
        Feature matrix in the space the model was trained on. Only columns 0
        and 1 are plotted.
    target : array-like of shape (n_samples,)
        Class label of each sample.
    model : object
        Fitted classifier exposing ``predict``.
    title, xlabel, ylabel : str
        Figure title and axis labels.
    normalizer : object, optional
        Fitted transformer exposing ``transform`` and ``inverse_transform``.
        When given, samples and grid are drawn in the original (un-scaled)
        units and the grid is scaled with this same object before prediction.
        When ``None``, ``features`` are plotted and predicted on as-is.
    padding : tuple of two numbers, optional
        Margin added on each side of the first and second feature range.
    step : float, optional
        Grid resolution. The grid has roughly
        ``(x_range / step) * (y_range / step)`` points, so increase ``step``
        for features with a large numeric range.

    The figure is displayed with ``plt.show()``; nothing is returned.
    """
    if normalizer is not None:
        X_set = np.asarray(normalizer.inverse_transform(features))
    else:
        X_set = np.asarray(features)
    # Flatten so column-vector targets still yield a 1-D boolean mask below.
    y_set = np.asarray(target).ravel()

    pad_x, pad_y = padding
    X1, X2 = np.meshgrid(
        np.arange(start=X_set[:, 0].min() - pad_x, stop=X_set[:, 0].max() + pad_x, step=step),
        np.arange(start=X_set[:, 1].min() - pad_y, stop=X_set[:, 1].max() + pad_y, step=step),
    )

    # The grid lives in plotting units; map it back into the model's feature
    # space with the *same* normalizer that produced X_set (not a global scaler).
    grid_points = np.column_stack([X1.ravel(), X2.ravel()])
    if normalizer is not None:
        grid_points = normalizer.transform(grid_points)
    predictions = np.asarray(model.predict(grid_points)).reshape(X1.shape)

    # The colour map has two entries: the first for the first label, the second for the next.
    cmap = ListedColormap(('red', 'green'))
    plt.contourf(X1, X2, predictions, alpha=0.75, cmap=cmap)

    plt.xlim(X1.min(), X1.max())
    plt.ylim(X2.min(), X2.max())
    for i, label in enumerate(np.unique(y_set)):
        mask = y_set == label
        # `color=` (not `c=`) is the documented way to pass a single RGBA value.
        plt.scatter(X_set[mask, 0], X_set[mask, 1], color=cmap(i), label=label)

    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.legend()
    plt.show()

In [4]:
def _select_rows(data, positions):
    """Return the rows of ``data`` at integer ``positions`` (pandas object or array-like)."""
    if hasattr(data, 'iloc'):
        # KFold yields positional indices, so label-based `.loc` / `[]` would be wrong.
        return data.iloc[positions]
    return np.asarray(data)[positions]


def nested_cross_validation(model, grid, train_set, test_set):
    """Estimate tuned-model accuracy with nested cross-validation, then refit.

    An outer 5-fold split scores a grid search that is itself tuned with an
    inner 5-fold split, so the reported accuracy is not biased by the
    hyper-parameter selection. Per-fold results and the mean/std of the outer
    accuracies are printed.

    Parameters
    ----------
    model : estimator
        Estimator passed to ``GridSearchCV``.
    grid : dict or list of dict
        Parameter grid passed to ``GridSearchCV``.
    train_set : tuple
        ``(X_train, y_train)``; pandas objects or array-likes.
    test_set : tuple
        Accepted for backward compatibility. It is not used here: every fold
        is drawn from ``train_set`` only.

    Returns
    -------
    GridSearchCV
        A grid search fitted on the whole of ``train_set`` using the inner
        splitter.
    """
    cv_outer = KFold(n_splits=5, shuffle=True, random_state=123)
    cv_inner = KFold(n_splits=5, shuffle=True, random_state=11)

    X_train, y_train = train_set
    outer_results = []

    for train_idx, test_idx in cv_outer.split(X_train):
        xtrain, xtest = _select_rows(X_train, train_idx), _select_rows(X_train, test_idx)
        # Validation labels come from the training labels of this fold.
        ytrain, ytest = _select_rows(y_train, train_idx), _select_rows(y_train, test_idx)

        search = GridSearchCV(model, grid, scoring='accuracy', cv=cv_inner, refit=True)
        search.fit(xtrain, ytrain)

        yhat = search.predict(xtest)
        accuracy = accuracy_score(ytest, yhat)
        outer_results.append(accuracy)

        print(' >> accuracy_outer = %.3f, accuracy_inner = %.3f, cfg=%s' % (accuracy, search.best_score_, search.best_params_))

    print('accuracy outer: %.3f +- %.3f' % (np.mean(outer_results), np.std(outer_results)))

    # Final model: tune on all training data with a fresh search object.
    final_search = GridSearchCV(model, grid, scoring='accuracy', cv=cv_inner, refit=True)
    return final_search.fit(X_train, y_train)