# Supplementary Material

Additional material helpful during the demo.

## Explore Impact of Model Parameter Settings / Over- and Underfitting

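What happens when we adjust the model parameters? Use the interactive widgets below to change the hyperparameters of a decision tree or a random forest and watch how the fitted model moves between underfitting and overfitting.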

In [1]:
import inspect

from ipywidgets import interactive
from matplotlib.colors import ListedColormap
import matplotlib.pyplot as plt

import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor

from util import DATA_PATH_PREPROCESSED, TARGET, FEATURES

%matplotlib inline
sns.set_style('whitegrid')

In [2]:
# Maps the name shown in the `clf` dropdown to the regressor class to instantiate.
REGRESSORS = dict(
    DecisionTree=DecisionTreeRegressor,
    RandomForest=RandomForestRegressor,
)

In [3]:
def create_noisy_sine(n=100, seed=123, noise=0.1):
    """Create a noisy sample of one sine period plus a noise-free reference curve.

    Parameters
    ----------
    n : int
        Number of noisy samples, and also the number of reference points.
    seed : int
        Seed of the private ``RandomState``. The default reproduces the data
        set that was previously hard-coded.
    noise : float
        Standard deviation of the Gaussian noise added to every sample.

    Returns
    -------
    X_true, y_true : ndarray of shape (n,)
        Evenly spaced grid on [0, 1] and the exact ``sin(2*pi*x)`` values.
    X, y : ndarray of shape (n,)
        Sorted uniform random inputs and the corresponding noisy targets.
    """
    prng = np.random.RandomState(seed)

    # NOTE: the sequence of draws from `prng` determines the data set;
    # reordering the calls below would change the generated sample.
    X = np.sort(prng.rand(n))
    y = np.sin(2 * np.pi * X)
    y += prng.normal(0, noise, len(y))

    # Every 10th sample, starting at index 5, gets an additional uniform
    # offset from (-1, 1] so that the data contains a few clear outliers.
    y[5::10] += 2*(0.5 - prng.rand(len(y[5::10])))

    X_true = np.linspace(0, 1, n)
    y_true = np.sin(2 * np.pi * X_true)
    return X_true, y_true, X, y

In [4]:
def regression(clf='DecisionTree', n_estimators=1, min_samples_leaf=1, max_depth=None):
    """Fit a tree-based regressor to the noisy sine data and plot its predictions.

    Parameters
    ----------
    clf : str
        Key into ``REGRESSORS`` selecting the estimator class.
    n_estimators : int
        Forwarded to the estimator only if its constructor accepts it.
    min_samples_leaf : int
        Forwarded to the estimator only if its constructor accepts it.
    max_depth : int or None
        Forwarded to the estimator only if its constructor accepts it.

    The function returns nothing; it shows a figure with the noisy data, the
    true sine curve and the model's predictions at the training inputs.
    """
    candidate_kws = {
        'max_depth': max_depth,
        'min_samples_leaf': min_samples_leaf,
        'n_estimators': n_estimators,
    }

    regr_cls = REGRESSORS[clf]
    # `inspect.getfullargspec(...).args` omits keyword-only parameters, and
    # current scikit-learn releases declare most constructor parameters as
    # keyword-only. Filtering on `.args` therefore silently dropped settings
    # such as `max_depth`. `inspect.signature` lists every parameter.
    accepted = inspect.signature(regr_cls).parameters
    supported_kws = {k: v for k, v in candidate_kws.items() if k in accepted}
    regr = regr_cls(**supported_kws, random_state=42)

    X_true, y_true, X, y = create_noisy_sine(n=80)
    # The estimator is given a 2-D feature matrix: one column, one row per sample.
    X_col = X.reshape((-1, 1))
    regr.fit(X_col, y)
    # Predictions are evaluated on the training inputs themselves.
    yp = regr.predict(X_col)

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.scatter(X, y, s=20, color='green', edgecolor='black', label='noisy data')
    ax.plot(X_true, y_true, label='true')
    ax.plot(X, yp, color='red', label='pred')
    ax.legend()
    ax.set_xlabel('x')
    ax.set_ylabel('y = sin(2pi x)')
    plt.show()

    
# Each list is the set of choices offered for the `regression` argument of the same name.
interactive_plot = interactive(regression, **{
    'clf': ['DecisionTree', 'RandomForest'],
    'max_depth': [1, 3, 5, 10, None],
    'n_estimators': [1, 2, 5, 10, 50, 100, 200, 500, 1000],
    'min_samples_leaf': [1, 3, 5, 10],
})
interactive_plot

interactive(children=(Dropdown(description='clf', options=('DecisionTree', 'RandomForest'), value='DecisionTre…

In [5]:
# Synthetic two-feature classification data sets, keyed by the name shown in
# the `data` dropdown. Each value is the (X, y) tuple returned by the generator.
DATASETS = dict(
    moons=make_moons(noise=0.3, random_state=42),
    circles=make_circles(noise=0.2, factor=0.5, random_state=42),
    linear=make_classification(
        n_features=2,
        n_redundant=0,
        n_informative=2,
        random_state=42,
        n_clusters_per_class=1,
        flip_y=0.1,
    ),
)

# Maps the name shown in the `clf` dropdown to the classifier class to instantiate.
CLASSIFIERS = dict(
    DecisionTree=DecisionTreeClassifier,
    RandomForest=RandomForestClassifier,
)

In [6]:
def classification(clf='DecisionTree', data='linear', max_depth=None, n_estimators=1, min_samples_leaf=1, max_features=1):
    """Fit a tree-based classifier on a 2-D toy data set and plot its decision surface.

    Parameters
    ----------
    clf : str
        Key into ``CLASSIFIERS`` selecting the estimator class.
    data : str
        Key into ``DATASETS`` selecting the ``(X, y)`` data set.
    max_depth : int or None
        Forwarded to the estimator only if its constructor accepts it.
    n_estimators : int
        Forwarded to the estimator only if its constructor accepts it.
    min_samples_leaf : int
        Forwarded to the estimator only if its constructor accepts it.
    max_features : int
        Forwarded to the estimator only if its constructor accepts it.

    The function returns nothing; it shows a figure with the predicted
    probability surface and the train/test points.
    """
    candidate_kws = {
        'max_depth': max_depth,
        'min_samples_leaf': min_samples_leaf,
        'max_features': max_features,
        'n_estimators': n_estimators,
    }

    # prepare model
    clf_cls = CLASSIFIERS[clf]
    # `inspect.getfullargspec(...).args` omits keyword-only parameters, and
    # current scikit-learn releases declare most constructor parameters as
    # keyword-only. Filtering on `.args` therefore silently dropped settings
    # such as `max_depth`. `inspect.signature` lists every parameter.
    accepted = inspect.signature(clf_cls).parameters
    supported_kws = {k: v for k, v in candidate_kws.items() if k in accepted}
    # Use a separate name for the estimator instead of rebinding the `clf` string argument.
    model = clf_cls(**supported_kws, random_state=42)

    # prepare data: 40% of the samples are held out as the test set
    X, y = DATASETS[data]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.4, random_state=42)

    # grid covering the data range plus a 0.5 margin, with step size h
    x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
    y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
    h = .02
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))

    # define color scheme
    cm = plt.cm.RdBu
    cm_bright = ListedColormap(['#FF0000', '#0000FF'])
    cm_bright_other = ListedColormap(['#FF8000', '#0080FF'])

    # fit on the training split, then evaluate the second column of
    # `predict_proba` on every grid point
    model.fit(X_train, y_train)
    Z = model.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]
    Z = Z.reshape(xx.shape)

    # plot
    fig, ax = plt.subplots(figsize=(10, 10))
    ax.contourf(xx, yy, Z, cmap=cm, alpha=0.6)
    ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright,
               edgecolors='k', label='train')
    ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright_other,
               edgecolors='k', label='test')

    ax.set_xlim(xx.min(), xx.max())
    ax.set_ylim(yy.min(), yy.max())
    ax.set_xticks(())
    ax.set_yticks(())
    ax.legend()
    plt.show()

    
# Each list is the set of choices offered for the `classification` argument of the same name.
interactive_plot = interactive(classification, **{
    'data': ['moons', 'circles', 'linear'],
    'clf': ['DecisionTree', 'RandomForest'],
    'max_depth': [1, 3, 5, None],
    'n_estimators': [1, 10, 100, 500],
    'min_samples_leaf': [1, 2, 5],
    'max_features': [1, 2],
})
interactive_plot

interactive(children=(Dropdown(description='clf', options=('DecisionTree', 'RandomForest'), value='DecisionTre…