# Miscellaneous plots

This notebook contains miscellaneous plots I used to present tree-based models.

In [None]:
from __future__ import print_function

import numpy as np
import pandas as pd
from sklearn import linear_model
from sklearn import tree
from sklearn import ensemble
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

from sklearn import datasets
import pydotplus

import matplotlib.pyplot as plt
import matplotlib

# used to display trees
from IPython.display import Image
%matplotlib inline
plt.style.use('ggplot')

## Boosting example

In [None]:


# Synthetic error curves used to illustrate boosting (no model is fitted here).
xi = np.arange(51)

# saturating exponential: starts at 0 and approaches 1 as xi grows
yi = 1 - np.exp(-0.05 * xi)

plt.figure(figsize=[12, 8])
plt.plot(xi, 0.3 * yi + 0.2, lw=4, label='Weak learner')
plt.plot(xi, -0.15 * yi + 0.2, lw=4, label='Overall ensemble')

# axis range and tick marks
plt.ylim([0, 0.5])
plt.yticks([0, 0.5], fontsize=24)
plt.xticks(np.arange(0, 51, 10), fontsize=24)

# one arrow plus caption per curve, drawn at the given height
for height, caption in [(0.3, 'Each tree has higher error'),
                        (0.15, 'Ensemble has lower error')]:
    plt.arrow(20, height, -10, 0, width=0.005, head_length=1, color='k')
    plt.text(20.5, height, caption, fontsize=24, va='center')

plt.ylabel('Error', fontsize=24)
plt.xlabel('Number of trees', fontsize=24)
plt.show()

In [None]:
def make_colormap(seq):
    """Build a matplotlib LinearSegmentedColormap named 'CustomMap'.

    seq: a sequence mixing floats and RGB triples. Each float is an anchor
    position; the floats should be increasing and lie in the interval (0,1).
    The RGB triple immediately before a float is the colour just below that
    anchor and the triple immediately after it is the colour just above.
    Anchors at 0.0 and 1.0 are added automatically around ``seq``.
    """
    channels = ('red', 'green', 'blue')
    # pad with sentinel triples so the 0.0 and 1.0 anchors have neighbours
    padded = [(None,) * 3, 0.0] + list(seq) + [1.0, (None,) * 3]
    cdict = {name: [] for name in channels}
    for pos in range(len(padded)):
        anchor = padded[pos]
        if not isinstance(anchor, float):
            # only float entries define segment boundaries
            continue
        r_below, g_below, b_below = padded[pos - 1]
        r_above, g_above, b_above = padded[pos + 1]
        below = (r_below, g_below, b_below)
        above = (r_above, g_above, b_above)
        for name, lower, upper in zip(channels, below, above):
            cdict[name].append([anchor, lower, upper])
    return matplotlib.colors.LinearSegmentedColormap('CustomMap', cdict)

# Notebook-wide colormap `cm` (a two-tone purple variant lives in plot_model_purple).
#
# Gradient end points, written as hex colours:
#   #e58139f9 - orange
#   #399de5e0 - blue
# to_rgb returns RGB only, so the trailing alpha byte of each code is not used.
lo = np.array(matplotlib.colors.to_rgb('#e5813900'))
hi = np.array(matplotlib.colors.to_rgb('#399de5e0'))

# 255 evenly spaced colours from `lo` towards `hi`.
# Note: the list contains no float anchors, so make_colormap only reads the
# entries next to the 0.0 / 1.0 anchors it adds itself (the first and last colour).
s = [list((hi - lo) * (float(i) / 255) + lo) for i in range(255)]
cm = make_colormap(s)

In [None]:
def plot_model_purple(mdl, X, y, feat):
    """Plot a model's predictions over a 2-D feature space in two purple tones.

    Opens a new 8x5 figure, evaluates ``mdl.predict`` on a 1000x1000 grid
    spanning the range of the two columns of ``X``, draws the predictions as
    filled contours and overlays the observations coloured by ``y``.

    mdl:  fitted estimator exposing ``predict``.
    X:    2-D array; columns 0 and 1 are the plotted features.
    y:    array of true outcomes (flattened and cast to float for colouring).
    feat: sequence whose first two entries label the x and y axes.
    """
    plt.figure(figsize=[8,5])

    # Two-colour map built directly from the colour class. The previous
    # implementation fetched a dummy map with plt.cm.get_cmap (removed in
    # matplotlib 3.9) only to reach this same constructor.
    c1 = [x/256.0 for x in [224,236,244]]
    c2 = [x/256.0 for x in [136,86,167]]
    cm = matplotlib.colors.LinearSegmentedColormap.from_list('custom', [c1,c2], N=2)

    # get minimum and maximum values
    x0_min = X[:, 0].min()
    x0_max = X[:, 0].max()
    x1_min = X[:, 1].min()
    x1_max = X[:, 1].max()

    xx, yy = np.meshgrid(np.linspace(x0_min, x0_max, 1000),
                         np.linspace(x1_min, x1_max, 1000))

    Z = mdl.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)

    # plot the contour - colouring different regions
    # NOTE(review): levels [0,1,2] presumably assume predictions in {0, 1} - confirm
    cs = plt.contourf(xx, yy, Z, cmap=cm, levels=[0,1,2])

    # plot the individual data points - colouring by the *true* outcome
    color = np.asarray(y.ravel(),dtype='float')
    plt.scatter(X[:, 0], X[:, 1], c=color, marker='o',
                s=60, cmap=cm)

    plt.xlabel(feat[0],fontsize=24)
    plt.ylabel(feat[1],fontsize=24)
    plt.axis("tight")

    plt.colorbar(cs)

In [None]:
def plot_model_pred_2d_old(mdl, X, y, feat):
    """Draw a model's predicted regions and the observations on the current axes.

    Based on the scikit-learn tutorial plot_iris.html. Predictions are
    evaluated on a 1000x1000 grid covering the range of X[:, 0] and X[:, 1]
    and drawn with the 'hsv' colormap; observations are coloured by ``y``
    with the 'Blues' colormap. ``feat`` supplies the two axis labels.
    """
    col0, col1 = X[:, 0], X[:, 1]

    # evaluation grid spanning the observed range of each feature
    axis0 = np.linspace(col0.min(), col0.max(), 1000)
    axis1 = np.linspace(col1.min(), col1.max(), 1000)
    xx, yy = np.meshgrid(axis0, axis1)

    grid_points = np.column_stack([xx.ravel(), yy.ravel()])
    Z = mdl.predict(grid_points).reshape(xx.shape)

    # filled contours: one colour band per predicted region
    cs = plt.contourf(xx, yy, Z, cmap='hsv')

    # observations, coloured by the *true* outcome
    plt.scatter(col0, col1, c=y.ravel(), marker='o', s=40, cmap='Blues')

    plt.xlabel(feat[0], fontsize=24)
    plt.ylabel(feat[1], fontsize=24)
    plt.axis("tight")

In [None]:
def plot_model_pred_2d(mdl, X, y, feat, cm=None, plot_colorbar=True):
    """Draw a model's predicted regions and the observations on the current axes.

    Based on the scikit-learn tutorial plot_iris.html.

    mdl:           fitted estimator exposing ``predict``.
    X:             2-D array; columns 0 and 1 are the plotted features.
    y:             true outcomes, used to colour the observations.
    feat:          sequence whose first two entries label the axes.
    cm:            colormap for both contours and points; when falsy, an
                   orange-to-blue map is built with make_colormap.
    plot_colorbar: when true, add a colorbar to the figure.
    """
    col0, col1 = X[:, 0], X[:, 1]

    # 100x100 evaluation grid over the observed range of each feature
    xx, yy = np.meshgrid(np.linspace(col0.min(), col0.max(), 100),
                         np.linspace(col1.min(), col1.max(), 100))

    grid_points = np.column_stack([xx.ravel(), yy.ravel()])
    Z = mdl.predict(grid_points).reshape(xx.shape)

    if not cm:
        # default colormap, end points given as hex colours:
        #   #e58139f9 - orange
        #   #399de5e0 - blue
        lo = np.array(matplotlib.colors.to_rgb('#e5813900'))
        hi = np.array(matplotlib.colors.to_rgb('#399de5e0'))
        s = [list((hi - lo) * (float(i) / 255) + lo) for i in range(255)]
        cm = make_colormap(s)

    # filled contours: one colour band per predicted region
    cs = plt.contourf(xx, yy, Z, cmap=cm)

    # observations, coloured by the *true* outcome
    color = y.ravel()
    plt.scatter(col0, col1, c=color, cmap=cm, s=60, marker='o',
                edgecolor='k', linewidth=2)

    plt.xlabel(feat[0], fontsize=24)
    plt.ylabel(feat[1], fontsize=24)
    plt.axis("tight")
    if plot_colorbar:
        plt.colorbar()

In [None]:
# Real example: load the iris dataset bundled with scikit-learn.
# Later cells read it through the 'data', 'target' and 'feature_names' keys.
df = datasets.load_iris()

In [None]:
# Three large markers stacked at x=0: one in the default colour, one in each
# end colour (`lo`, `hi`) of the custom colormap.
plt.figure(figsize=[10, 8])

swatches = [(1, {}), (2, {'color': lo}), (3, {'color': hi})]
for ypos, style in swatches:
    plt.scatter(0, ypos, s=400, **style)

plt.grid()
plt.show()

In [None]:
# Pairwise scatter plots: one row and one column per feature (4x4 for iris),
# with the feature name written on the diagonal instead of a scatter.
n_feat = df['data'].shape[1]
f, ax = plt.subplots(n_feat, n_feat, figsize=[16,10], sharex='col', sharey='row')

# for this plot, we re-order the data so sepal length is bottom right
data = df['data'][:, ::-1]
feat = df['feature_names'][::-1]

for i in range(n_feat):
    for j in range(n_feat):
        if i==j:
            ax[i, j].grid()
        else:
            # rows 0-49, 50-99 and 100+ are plotted as three separate groups
            ax[i, j].scatter(data[:50,j], data[:50,i])
            ax[i, j].scatter(data[50:100,j], data[50:100,i],color=lo)
            ax[i, j].scatter(data[100:,j], data[100:,i],color=hi)

# add the feature name to the centre of each diagonal panel
for i in range(n_feat):
    xloc = ax[i,i].get_xlim()
    yloc = ax[i,i].get_ylim()
    ax[i, i].text(np.mean(xloc), np.mean(yloc), feat[i],
                  horizontalalignment='center', verticalalignment='center', fontsize=16)

# hide x tick labels on every row except the bottom one
plt.setp([a.get_xticklabels() for a in ax[:-1, :].ravel()], visible=False)

# hide y tick labels on every column except the left-most one
plt.setp([a.get_yticklabels() for a in ax[:, 1:].ravel()], visible=False)

plt.show()

In [None]:
# Pairwise scatter plots: one row and one column per feature (4x4 for iris),
# with the feature name written on the diagonal instead of a scatter.
# Same layout as the full pair plot, but rows 0-49 (the first group) are left out.
n_feat = df['data'].shape[1]
f, ax = plt.subplots(n_feat, n_feat, figsize=[16,10], sharex='col', sharey='row')

# for this plot, we re-order the data so sepal length is bottom right
data = df['data'][:, ::-1]
feat = df['feature_names'][::-1]

for i in range(n_feat):
    for j in range(n_feat):
        if i==j:
            ax[i, j].grid()
        else:
            ax[i, j].scatter(data[50:100,j], data[50:100,i],color=lo)
            ax[i, j].scatter(data[100:,j], data[100:,i],color=hi)

# add the feature name to the centre of each diagonal panel
for i in range(n_feat):
    xloc = ax[i,i].get_xlim()
    yloc = ax[i,i].get_ylim()
    ax[i, i].text(np.mean(xloc), np.mean(yloc), feat[i],
                  horizontalalignment='center', verticalalignment='center', fontsize=16)

# hide x tick labels on every row except the bottom one
plt.setp([a.get_xticklabels() for a in ax[:-1, :].ravel()], visible=False)

# hide y tick labels on every column except the left-most one
plt.setp([a.get_yticklabels() for a in ax[:, 1:].ravel()], visible=False)

plt.show()

In [None]:
# Working subset for the 2-D examples: feature columns 0 and 2, rows 50 onwards.
idx = [0,2]
X = df['data'][50:,idx]

# Relabel the targets so that class 1 becomes -1 and everything else (class 2) becomes +1.
# np.where builds a new array: the previous in-place assignment wrote through a
# slice view into df['target'], so re-running the cell turned every label into -1.
y = np.where(df['target'][50:] == 1, -1, 1)

feat = [df['feature_names'][x] for x in idx]

In [None]:
def plot_cleanup():
    """Tidy the current axes: hide all four spines, keep ticks on the bottom
    and left only, and set both tick label font sizes to 16."""
    axes = plt.gca()
    for side in ("top", "bottom", "right", "left"):
        axes.spines[side].set_visible(False)
    axes.get_xaxis().tick_bottom()
    axes.get_yaxis().tick_left()
    for set_ticks in (plt.xticks, plt.yticks):
        set_ticks(fontsize=16)

In [None]:
# Fit a logistic regression on the two selected features.
# Only the observations are drawn in this cell; the model is not plotted.
mdl = linear_model.LogisticRegression().fit(X, y)

plt.figure(figsize=[12, 8])

# observations, coloured by the *true* outcome
color = y.ravel().astype(float)
f = plt.scatter(X[:, 0], X[:, 1], c=color, cmap=cm, s=60,
                marker='o', edgecolor='k', linewidth=2)

plt.xlabel(feat[0], fontsize=24)
plt.ylabel(feat[1], fontsize=24)
plt.axis("tight")

plt.colorbar(f)

# strip spines / enlarge tick labels
plot_cleanup()

plt.show()

In [None]:
# Regress the second plotted feature on the first one (the class label is not used for the fit).
mdl = linear_model.LinearRegression()
# reshape(-1, 1) yields column vectors for any number of rows
# (previously hard-coded to exactly 100 samples)
x0 = X[:,0].reshape(-1,1)
x1 = X[:,1].reshape(-1,1)
mdl = mdl.fit(x0, x1)

# end points of the fitted line, pulled 0.2 inside the observed x range
x0_min = X[:, 0].min()+0.2
x0_max = X[:, 0].max()-0.2

Z = mdl.predict([[x0_min], [x0_max]])

plt.figure(figsize=[12,8])

# plot the line
plt.plot([x0_min, x0_max], Z, 'k--', linewidth=3)
# plot the individual data points - colouring by the *true* outcome
color = np.asarray(y.ravel(),dtype='float')
f = plt.scatter(X[:, 0], X[:, 1], c=color, edgecolor='k',
            marker='o', linewidth=2,
            s=60, cmap=cm)

plt.xlabel(feat[0],fontsize=24)
plt.ylabel(feat[1],fontsize=24)
plt.axis("tight")

plt.colorbar(f)

# cleanup plot
plot_cleanup()
plt.show()

In [None]:
# Print the fitted line as "<feat[1]> = <slope> * <feat[0]> + <intercept>".
# The coef_[0][0] / intercept_[0] indexing matches a model fitted on a 2-D target,
# i.e. the single-feature regression in the previous cell - run this right after it.
print('{} = {:3.2f} * {} + {:3.1f}'.format(feat[1], mdl.coef_[0][0], feat[0], mdl.intercept_[0]))

In [None]:
# Linear regression of the label on both features; the continuous prediction
# is drawn as filled contours behind the observations.
mdl = linear_model.LinearRegression().fit(X, y)

plt.figure(figsize=[12, 8])

# bounding box of the observations
x0_min, x0_max = X[:, 0].min(), X[:, 0].max()
x1_min, x1_max = X[:, 1].min(), X[:, 1].max()
vmin = np.min([x0_min, x1_min])
vmax = np.max([x0_max, x1_max])

# 1000x1000 evaluation grid over that box
xx, yy = np.meshgrid(np.linspace(x0_min, x0_max, 1000),
                     np.linspace(x1_min, x1_max, 1000))
Z = mdl.predict(np.column_stack([xx.ravel(), yy.ravel()])).reshape(xx.shape)

# filled contours of the prediction
cs = plt.contourf(xx, yy, Z, cmap=cm)

# observations, coloured by the *true* outcome
color = y.ravel().astype(float)
f = plt.scatter(X[:, 0], X[:, 1], c=color, cmap=cm, s=60,
                marker='o', edgecolor='k', linewidth=2)

plt.xlabel(feat[0], fontsize=24)
plt.ylabel(feat[1], fontsize=24)
plt.axis("tight")

# colorbar is tied to the scatter points, not the contours
plt.colorbar(f)

plot_cleanup()
plt.show()

In [None]:
# Same linear regression as before, but the prediction is thresholded at 0
# so the surface shows the two resulting class regions.
mdl = linear_model.LinearRegression().fit(X, y)

plt.figure(figsize=[12, 8])

# bounding box of the observations
x0_min, x0_max = X[:, 0].min(), X[:, 0].max()
x1_min, x1_max = X[:, 1].min(), X[:, 1].max()
vmin = np.min([x0_min, x1_min])
vmax = np.max([x0_max, x1_max])

# 1000x1000 evaluation grid over that box
xx, yy = np.meshgrid(np.linspace(x0_min, x0_max, 1000),
                     np.linspace(x1_min, x1_max, 1000))
Z = mdl.predict(np.column_stack([xx.ravel(), yy.ravel()]))

# snap predictions to the two labels: >= 0 becomes +1, < 0 becomes -1
Z[Z >= 0] = 1
Z[Z < 0] = -1
Z = Z.reshape(xx.shape)

# filled contours of the thresholded prediction
cs = plt.contourf(xx, yy, Z, cmap=cm)

# observations, coloured by the *true* outcome
color = y.ravel().astype(float)
plt.scatter(X[:, 0], X[:, 1], c=color, cmap=cm, s=60,
            marker='o', edgecolor='k', linewidth=2)

plt.xlabel(feat[0], fontsize=24)
plt.ylabel(feat[1], fontsize=24)
plt.axis("tight")

plt.colorbar(cs)

plot_cleanup()
plt.show()

# decision tree

In [None]:
# Depth-1 decision tree (a single split) and the regions it predicts.
mdl = tree.DecisionTreeClassifier(max_depth=1).fit(X, y)

plt.figure(figsize=[12, 8])

# bounding box of the observations
x0_min, x0_max = X[:, 0].min(), X[:, 0].max()
x1_min, x1_max = X[:, 1].min(), X[:, 1].max()
vmin = np.min([x0_min, x1_min])
vmax = np.max([x0_max, x1_max])

# 1000x1000 evaluation grid over that box
xx, yy = np.meshgrid(np.linspace(x0_min, x0_max, 1000),
                     np.linspace(x1_min, x1_max, 1000))
Z = mdl.predict(np.column_stack([xx.ravel(), yy.ravel()]))

# snap predictions to the two labels: >= 0 becomes +1, < 0 becomes -1
Z[Z >= 0] = 1
Z[Z < 0] = -1
Z = Z.reshape(xx.shape)

# filled contours of the predicted regions
cs = plt.contourf(xx, yy, Z, cmap=cm)

# observations, coloured by the *true* outcome
color = y.ravel().astype(float)
plt.scatter(X[:, 0], X[:, 1], c=color, cmap=cm, s=60,
            marker='o', edgecolor='k', linewidth=2)

plt.xlabel(feat[0], fontsize=24)
plt.ylabel(feat[1], fontsize=24)
plt.axis("tight")

plt.colorbar(cs)

plot_cleanup()
plt.show()

In [None]:
# Render the current `mdl` as a graphviz diagram and show it inline as a PNG.
export_opts = dict(out_file=None, feature_names=feat, filled=True, rounded=True)
tree_graph = tree.export_graphviz(mdl, **export_opts)
graph = pydotplus.graphviz.graph_from_dot_data(tree_graph)
Image(graph.create_png())

# fitting a sinusoid

In [None]:
# Decision tree regression on a noisy sine wave.
rng = np.random.RandomState(777)
N = 30

# N random, sorted sample locations in [0, 2*pi) - one full period of the sine
x = np.sort(2 * np.pi * rng.rand(N))
y_true = np.sin(x)
# the same curve with uniform noise in [-0.15, 0.15) added
y_noise = y_true + (rng.rand(N) - 0.5) * 0.3

# scikit-learn expects a 2-D design matrix: one column, N rows
x = x.reshape(-1, 1)

# depth-5 regression tree fitted to the noisy samples
mdl = tree.DecisionTreeRegressor(max_depth=5).fit(x, y_noise)

# evenly spaced evaluation points across the same interval
x_test = np.linspace(0, 2 * np.pi, 100).reshape(-1, 1)
y_test_pred = mdl.predict(x_test)

plt.figure(figsize=[12, 8])

# noise-free sine evaluated at the sample locations
plt.plot(x, y_true, 'k--', linewidth=2, label='Truth')

# noisy samples the tree was fitted on
plt.scatter(x, y_noise, s=75, marker='o', color='b', alpha=0.8,
            linewidth=2, label='Data')

# piecewise-constant prediction of the tree
plt.plot(x_test, y_test_pred, 'r-', linewidth=2, label='Decision tree')

plot_cleanup()
plt.legend(fontsize=20)
plt.show()

In [None]:
# Unrestricted-depth decision tree (entropy criterion) and the regions it predicts.
mdl = tree.DecisionTreeClassifier(criterion='entropy', splitter='best')
mdl = mdl.fit(X,y)

plt.figure(figsize=[12,8])

# get minimum and maximum values
x0_min = X[:, 0].min()
x0_max = X[:, 0].max()
x1_min = X[:, 1].min()
x1_max = X[:, 1].max()

xx, yy = np.meshgrid(np.linspace(x0_min, x0_max, 1000),
                     np.linspace(x1_min, x1_max, 1000))

Z = mdl.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

# plot the contour - colouring different regions
cs = plt.contourf(xx, yy, Z, cmap=cm)

# plot the individual data points - colouring by the *true* outcome
color = np.asarray(y.ravel(),dtype='float')
# keep the handle: the colorbar below must be tied to THIS figure's points.
# Previously `f` was whatever an earlier cell had left behind (a scatter
# belonging to a different, already shown figure).
f = plt.scatter(X[:, 0], X[:, 1], c=color, edgecolor='k',
                marker='o', linewidth=2,
                s=60, cmap=cm)

plt.xlabel(feat[0],fontsize=24)
plt.ylabel(feat[1],fontsize=24)
plt.axis("tight")

plt.colorbar(f)

# cleanup plot
plot_cleanup()

plt.show()

In [None]:
# Render the current `mdl` as a graphviz diagram and show it inline as a PNG.
export_opts = dict(out_file=None, feature_names=feat, filled=True, rounded=True)
tree_graph = tree.export_graphviz(mdl, **export_opts)
graph = pydotplus.graphviz.graph_from_dot_data(tree_graph)
Image(graph.create_png())

# best splits

In [None]:
# Starting point for the "best splits" walkthrough: the raw observations only.
# The tree is fitted but nothing from it is drawn here, so the 1000x1000 grid
# prediction this cell used to compute (and never use) has been dropped.
mdl = tree.DecisionTreeClassifier(criterion='entropy', splitter='best')
mdl = mdl.fit(X,y)

plt.figure(figsize=[12,8])

# plot the individual data points - colouring by the *true* outcome
color = np.asarray(y.ravel(),dtype='float')
plt.scatter(X[:, 0], X[:, 1], c=color, edgecolor='k',
            marker='o', linewidth=2,
            s=60, cmap=cm)

plt.xlabel(feat[0],fontsize=24)
plt.ylabel(feat[1],fontsize=24)
plt.axis("tight")

# cleanup plot
plot_cleanup()

plt.show()

In [None]:
# Raw observations plus the first split drawn by hand.
# The tree is fitted but its predictions are not drawn, so the 1000x1000 grid
# prediction this cell used to compute (and never use) has been dropped.
mdl = tree.DecisionTreeClassifier(criterion='entropy', splitter='best')
mdl = mdl.fit(X,y)

plt.figure(figsize=[12,8])

# horizontal extent of the data, used for the split line below
x0_min = X[:, 0].min()
x0_max = X[:, 0].max()

# plot the individual data points - colouring by the *true* outcome
color = np.asarray(y.ravel(),dtype='float')
plt.scatter(X[:, 0], X[:, 1], c=color, edgecolor='k',
            marker='o', linewidth=2,
            s=60, cmap=cm)

plt.xlabel(feat[0],fontsize=24)
plt.ylabel(feat[1],fontsize=24)
plt.axis("tight")

# dashed line at 4.75 on the y-axis feature
# NOTE(review): hard-coded; presumably the root split of the fitted tree - verify against the exported tree
plt.plot([x0_min, x0_max],[4.75,4.75],'k--',linewidth=3)

# cleanup plot
plot_cleanup()

plt.show()

In [None]:
# Depth-1 entropy tree: predicted regions, observations and the first split line.
mdl = tree.DecisionTreeClassifier(criterion='entropy', splitter='best',
                                  max_depth=1).fit(X, y)

plt.figure(figsize=[12, 8])

# bounding box of the observations
x0_min, x0_max = X[:, 0].min(), X[:, 0].max()
x1_min, x1_max = X[:, 1].min(), X[:, 1].max()
vmin = np.min([x0_min, x1_min])
vmax = np.max([x0_max, x1_max])

# 1000x1000 evaluation grid over that box
xx, yy = np.meshgrid(np.linspace(x0_min, x0_max, 1000),
                     np.linspace(x1_min, x1_max, 1000))
Z = mdl.predict(np.column_stack([xx.ravel(), yy.ravel()])).reshape(xx.shape)

# filled contours of the predicted regions
cs = plt.contourf(xx, yy, Z, cmap=cm)

# observations, coloured by the *true* outcome
color = y.ravel().astype(float)
plt.scatter(X[:, 0], X[:, 1], c=color, cmap=cm, s=60,
            marker='o', edgecolor='k', linewidth=2)

plt.xlabel(feat[0], fontsize=24)
plt.ylabel(feat[1], fontsize=24)
plt.axis("tight")

# hand-drawn dashed line at 4.75 on the y-axis feature
plt.plot([x0_min, x0_max], [4.75, 4.75], 'k--', linewidth=3)

plot_cleanup()

plt.show()

In [None]:
# Render the current `mdl` as a graphviz diagram and show it inline as a PNG.
export_opts = dict(out_file=None, feature_names=feat, filled=True, rounded=True)
tree_graph = tree.export_graphviz(mdl, **export_opts)
graph = pydotplus.graphviz.graph_from_dot_data(tree_graph)
Image(graph.create_png())

In [None]:
# Depth-1 tree surface with the lower-left rectangle blanked out, to show the
# region whose prediction is still to be decided by further splits.
mdl = tree.DecisionTreeClassifier(criterion='entropy', splitter='best',
                                  max_depth=1).fit(X, y)

plt.figure(figsize=[12, 8])

# bounding box of the observations
x0_min, x0_max = X[:, 0].min(), X[:, 0].max()
x1_min, x1_max = X[:, 1].min(), X[:, 1].max()
vmin = np.min([x0_min, x1_min])
vmax = np.max([x0_max, x1_max])

# 1000x1000 evaluation grid over that box
xx, yy = np.meshgrid(np.linspace(x0_min, x0_max, 1000),
                     np.linspace(x1_min, x1_max, 1000))

X_grid = np.column_stack([xx.ravel(), yy.ravel()])
Z = mdl.predict(X_grid)

# override the model inside the rectangle
#   x-axis feature <= 4.95  and  y-axis feature <= 4.75
# with the value 0 (neither of the two labels)
idxUnk = (X_grid[:, 0] <= 4.95) & (X_grid[:, 1] <= 4.75)
Z[idxUnk] = 0
Z = Z.reshape(xx.shape)

# filled contours of the (partly overridden) predictions
cs = plt.contourf(xx, yy, Z, cmap=cm)

# observations, coloured by the *true* outcome
color = y.ravel().astype(float)
plt.scatter(X[:, 0], X[:, 1], c=color, cmap=cm, s=60,
            marker='o', edgecolor='k', linewidth=2)

plt.xlabel(feat[0], fontsize=24)
plt.ylabel(feat[1], fontsize=24)
plt.axis("tight")

# hand-drawn split lines bounding the rectangle
plt.plot([x0_min, x0_max], [4.75, 4.75], 'k--', linewidth=3)
plt.plot([4.95, 4.95], [x1_min, 4.75], 'k--', linewidth=3)

plot_cleanup()

plt.show()

In [None]:
# Depth-1 tree surface with the lower-left rectangle filled in by hand,
# split once more at 3.9 on the y-axis feature.
mdl = tree.DecisionTreeClassifier(criterion='entropy', splitter='best',
                                  max_depth=1).fit(X, y)

plt.figure(figsize=[12, 8])

# bounding box of the observations
x0_min, x0_max = X[:, 0].min(), X[:, 0].max()
x1_min, x1_max = X[:, 1].min(), X[:, 1].max()
vmin = np.min([x0_min, x1_min])
vmax = np.max([x0_max, x1_max])

# 1000x1000 evaluation grid over that box
xx, yy = np.meshgrid(np.linspace(x0_min, x0_max, 1000),
                     np.linspace(x1_min, x1_max, 1000))

X_grid = np.column_stack([xx.ravel(), yy.ravel()])
Z = mdl.predict(X_grid)

# lower-left rectangle: y-axis feature <= 4.75 and x-axis feature <= 4.95
lower_left = (X_grid[:, 1] <= 4.75) & (X_grid[:, 0] <= 4.95)
# inside it: at or below 3.9 -> label -1, above 3.9 -> label +1
Z[lower_left & (X_grid[:, 1] <= 3.9)] = -1
idxUnk = lower_left & (X_grid[:, 1] > 3.9)
Z[idxUnk] = 1
Z = Z.reshape(xx.shape)

# filled contours of the (partly overridden) predictions
cs = plt.contourf(xx, yy, Z, cmap=cm)

# observations, coloured by the *true* outcome
color = y.ravel().astype(float)
plt.scatter(X[:, 0], X[:, 1], c=color, cmap=cm, s=60,
            marker='o', edgecolor='k', linewidth=2)

plt.xlabel(feat[0], fontsize=24)
plt.ylabel(feat[1], fontsize=24)
plt.axis("tight")

# hand-drawn split lines
plt.plot([x0_min, x0_max], [4.75, 4.75], 'k--', linewidth=3)
plt.plot([4.95, 4.95], [x1_min, 4.75], 'k--', linewidth=3)
plt.plot([x0_min, 4.95], [3.9, 3.9], 'k--', linewidth=3)

plot_cleanup()

plt.show()

In [None]:
# Depth-1 tree surface with both sides of the first split (4.75 on the y-axis
# feature) filled in by hand to mimic the next level of splits.
mdl = tree.DecisionTreeClassifier(criterion='entropy', splitter='best', max_depth=1)
mdl = mdl.fit(X,y)

plt.figure(figsize=[12,8])

# get minimum and maximum values
x0_min = X[:, 0].min()
x0_max = X[:, 0].max()
x1_min = X[:, 1].min()
x1_max = X[:, 1].max()

xx, yy = np.meshgrid(np.linspace(x0_min, x0_max, 1000),
                     np.linspace(x1_min, x1_max, 1000))

X_grid = np.c_[xx.ravel(), yy.ravel()]
Z = mdl.predict(X_grid)
# float so that the intermediate value 0.5 below is not truncated
Z = Z.astype(float)

# left side of tree: y-axis feature <= 4.75 and x-axis feature <= 4.95
idxUnk = (X_grid[:,1] <= 4.75) & (X_grid[:,0] <= 4.95) & (X_grid[:,1] <= 3.9)
Z[idxUnk] = -1
idxUnk = (X_grid[:,1] <= 4.75) & (X_grid[:,0] <= 4.95) & (X_grid[:,1] > 3.9)
Z[idxUnk] = 1
# right side of tree: y-axis feature > 4.75
idxUnk = (X_grid[:,1] > 4.75) & (X_grid[:,1] <= 5.15)
Z[idxUnk] = 0.5
# "> 5.15" already implies "> 4.75", so one condition is enough
idxUnk = X_grid[:,1] > 5.15
Z[idxUnk] = 1

Z = Z.reshape(xx.shape)

# plot the contour - colouring different regions
cs = plt.contourf(xx, yy, Z, cmap=cm)

# plot the individual data points - colouring by the *true* outcome
color = np.asarray(y.ravel(),dtype='float')
plt.scatter(X[:, 0], X[:, 1], c=color, edgecolor='k',
            marker='o', linewidth=2,
            s=60, cmap=cm)

plt.xlabel(feat[0],fontsize=24)
plt.ylabel(feat[1],fontsize=24)
plt.axis("tight")

# hand-drawn split lines (the 4.75 line was previously drawn twice)
plt.plot([x0_min, x0_max],[4.75,4.75],'k--',linewidth=3)
plt.plot([4.95, 4.95],[x1_min, 4.75],'k--',linewidth=3)
plt.plot([x0_min, 4.95],[3.9, 3.9],'k--',linewidth=3)
plt.plot([x0_min, x0_max],[5.15,5.15],'k--',linewidth=3)

# cleanup plot
plot_cleanup()

plt.show()

In [None]:
# Fit a depth-3 entropy tree and show it inline as a graphviz PNG.
mdl = tree.DecisionTreeClassifier(criterion='entropy', splitter='best',
                                  max_depth=3).fit(X, y)

export_opts = dict(out_file=None, feature_names=feat, filled=True, rounded=True)
tree_graph = tree.export_graphviz(mdl, **export_opts)
graph = pydotplus.graphviz.graph_from_dot_data(tree_graph)
Image(graph.create_png())

In [None]:
# Depth-3 tree surface with several regions overridden by hand-picked values
# (including fractional ones) instead of the hard class prediction.
mdl = tree.DecisionTreeClassifier(criterion='entropy', splitter='best', max_depth=3)
mdl = mdl.fit(X,y)

plt.figure(figsize=[12,8])

# get minimum and maximum values
x0_min = X[:, 0].min()
x0_max = X[:, 0].max()
x1_min = X[:, 1].min()
x1_max = X[:, 1].max()

xx, yy = np.meshgrid(np.linspace(x0_min, x0_max, 1000),
                     np.linspace(x1_min, x1_max, 1000))

X_grid = np.c_[xx.ravel(), yy.ravel()]
Z = mdl.predict(X_grid)
# float so that the fractional values below are not truncated
Z = Z.astype(float)

# left side of tree: y-axis feature <= 4.75 and x-axis feature <= 4.95
idxUnk = (X_grid[:,1] <= 4.75) & (X_grid[:,0] <= 4.95) & (X_grid[:,1] <= 3.9)
Z[idxUnk] = -1
idxUnk = (X_grid[:,1] <= 4.75) & (X_grid[:,0] <= 4.95) & (X_grid[:,1] > 3.9)
Z[idxUnk] = 1
# right side of tree: band 4.75 < y-axis feature <= 5.15, split at 6.6 on the x-axis feature
# NOTE(review): 0.85 / -0.6 are hard-coded; presumably leaf purities read off the exported tree - confirm
idxUnk = (X_grid[:,1] > 4.75) & (X_grid[:,1] <= 5.15) & (X_grid[:,0] <= 6.6)
Z[idxUnk] = 0.85
idxUnk = (X_grid[:,1] > 4.75) & (X_grid[:,1] <= 5.15) & (X_grid[:,0] > 6.6)
Z[idxUnk] = -0.6

Z = Z.reshape(xx.shape)

# plot the contour - colouring different regions
cs = plt.contourf(xx, yy, Z, cmap=cm)

# plot the individual data points - colouring by the *true* outcome
color = np.asarray(y.ravel(),dtype='float')
plt.scatter(X[:, 0], X[:, 1], c=color, edgecolor='k',
            marker='o', linewidth=2,
            s=60, cmap=cm)

plt.xlabel(feat[0],fontsize=24)
plt.ylabel(feat[1],fontsize=24)
plt.axis("tight")

# hand-drawn split lines (the 4.75 line was previously drawn twice)
plt.plot([x0_min, x0_max],[4.75,4.75],'k--',linewidth=3)
plt.plot([4.95, 4.95],[x1_min, 4.75],'k--',linewidth=3)
plt.plot([x0_min, 4.95],[3.9, 3.9],'k--',linewidth=3)
plt.plot([x0_min, x0_max],[5.15,5.15],'k--',linewidth=3)

# cleanup plot
plot_cleanup()

plt.show()

# algorithm at different spots in the tree

In [None]:
# Depth-3 entropy tree, drawn with the shared 2-D helper (no colorbar).
depth3_params = dict(criterion='entropy', splitter='best', max_depth=3)
mdl = tree.DecisionTreeClassifier(**depth3_params).fit(X, y)

plt.figure(figsize=[12, 8])
plot_model_pred_2d(mdl, X, y, feat, plot_colorbar=False)

plot_cleanup()

plt.show()

In [None]:
# Fit a depth-7 entropy tree and show it inline as a graphviz PNG.
depth7_params = dict(criterion='entropy', splitter='best', max_depth=7)
mdl = tree.DecisionTreeClassifier(**depth7_params).fit(X, y)

export_opts = dict(out_file=None, feature_names=feat, filled=True, rounded=True)
tree_graph = tree.export_graphviz(mdl, **export_opts)
graph = pydotplus.graphviz.graph_from_dot_data(tree_graph)
Image(graph.create_png())

In [None]:
# Predicted regions of the current `mdl` (fitted in the previous cell),
# drawn with the shared 2-D helper and without a colorbar.
plt.figure(figsize=[12, 8])
plot_model_pred_2d(mdl, X, y, feat, cm=None, plot_colorbar=False)

plot_cleanup()

plt.show()

# bootstrapping CDF vs. PDF

In [None]:

# Illustration of a PDF and its CDF, first with and then without histogram bars.
# NOTE: this cell rebinds the notebook-level names X and Y to 1-D curves.

# Create some test data
dx = .1
X  = np.arange(-2,2,dx)
Y  = np.exp(-X**2)

# Normalize the data to a proper PDF
Y /= (dx*Y).sum()

# Compute the CDF
CY = np.cumsum(Y*dx)

# shift the axis over, set the histogram widths
factor = 10
X = (X*factor)+70
hist_w = 1

colors = plt.cm.Set1([x/7.0 for x in range(7)])

# --- figure 1: PDF, histogram bars and CDF ---
plt.figure(figsize=[12,7])
# plot pdf
plt.plot(X,Y, linewidth=3, color=colors[1])
# plot hist - bars are centred on the sample positions so they line up with
# the PDF curve (the old "X - half width" offset, combined with matplotlib's
# centre alignment, drew every bar half a bin too far left)
plt.bar(X,Y,width=dx*factor,align='center',linewidth=0.1, facecolor=colors[1],alpha=0.5)
# plot CDF
plt.plot(X,CY,'--', linewidth=3,color=colors[3])

plot_cleanup()
plt.show()

# --- figure 2: no histogram ---
# X, Y, CY and colors are unchanged since figure 1, so they are reused
# rather than recomputed
plt.figure(figsize=[12,7])
# plot pdf
plt.plot(X,Y, linewidth=3, color=colors[1])
# plot CDF
plt.plot(X,CY,'--', linewidth=3,color=colors[3])

plot_cleanup()
plt.show()

In [None]:
# Histogram of a small random sample, then the same histogram with its
# empirical CDF overlaid.
# `factor` (axis scaling) is defined in the PDF/CDF cell above.
colors = plt.cm.Set1([x/7.0 for x in range(7)])

# a sample of 50 draws from a standard normal, seeded for reproducibility
np.random.seed(123)
Y = np.random.normal(loc=0.0, scale=1.0, size=[50,])
plt.figure(figsize=[12,7])
# density=True replaces the `normed` keyword, which matplotlib 3.1 removed
n, bins, patches = plt.hist(Y,bins=np.linspace(-5,5,50),color=colors[1],density=True)


plt.xlim([-3,3])
plt.ylim([0,1])

# reset xticks to be in the "weight" range
locs, labels = plt.xticks()
plt.xticks( locs, [x*factor+70 for x in locs] )

plot_cleanup()
plt.show()

# plot the CDF of the above
plt.figure(figsize=[12,7])

n, bins, patches = plt.hist(Y,bins=np.linspace(-5,5,50),color=colors[1],density=True)
n, bins, patches = plt.hist(Y,bins=np.linspace(-5,5,50),color=colors[2],density=True,
                            linewidth=4,histtype='step',cumulative=True)


plt.xlim([-3,3])
plt.ylim([0,1])

# reset xticks to be in the "weight" range
locs, labels = plt.xticks()
plt.xticks( locs, [x*factor+70 for x in locs] )

plot_cleanup()

plt.show()

In [None]:

# Empirical CDF of a random sample overlaid with a reference CDF curve.
# `factor` (axis scaling) is defined in the PDF/CDF cell above.

# Create some test data
hist_w = 1

dx = .1
X  = np.arange(-3,3,dx)
Y  = np.exp(-X**2)

# Normalize the data to a proper PDF
Y /= (dx*Y).sum()

# Compute the CDF
# NOTE(review): exp(-x^2) has variance 1/2, whereas the sample below is drawn
# with scale=1.0, so the dashed reference curve is narrower than the sample's
# true distribution - confirm this is intended for the illustration.
CY = np.cumsum(Y*dx)

# a sample of data using rand + normrnd (from here on Y is the sample, not the curve)
np.random.seed(123)
Y = np.random.normal(loc=0.0, scale=1.0, size=[50,])

colors = plt.cm.Set1([x/7.0 for x in range(7)])
plt.figure(figsize=[12,7])

# plot CDF
# density=True replaces the `normed` keyword, which matplotlib 3.1 removed
n, bins, patches = plt.hist(Y,bins=np.linspace(-5,5,50),color=colors[2],density=True,
                            linewidth=4,histtype='step',cumulative=True)
plt.plot(X,CY,'--', linewidth=3,color=colors[3])

plt.xlim([-3,3])
plt.ylim([0,1])

# reset xticks to be in the "weight" range
locs, labels = plt.xticks()
plt.xticks( locs, [int(x*factor+70) for x in locs] )

plot_cleanup()

plt.show()

In [None]:
# One figure per m = 0..4: the original sample's CDF plus the first m bootstrap
# resamples. The seed is reset at the top of every iteration, so each figure
# shows the same sample and repeats the resamples of the previous figure.
# `factor` (axis scaling) is defined in the PDF/CDF cell above.
for m in range(5):
    np.random.seed(123)
    Y = np.random.normal(loc=0.0, scale=1.0, size=[50,])

    colors = plt.cm.Set1([x/7.0 for x in range(7)])
    plt.figure(figsize=[12,7])

    # plot original CDF
    # density=True replaces the `normed` keyword, which matplotlib 3.1 removed
    n, bins, patches = plt.hist(Y,bins=np.linspace(-5,5,50),color=colors[2],density=True,
                                linewidth=4,histtype='step',cumulative=True)

    # drop colors[2] (used for the original CDF above) from the resample colours
    colors = [colors[i] for i in range(colors.shape[0]) if i!=2]

    # plot the first m bootstrap resamples
    for i in range(m):
        # bootstrap sample: draw len(Y) indices with replacement
        idx = np.random.randint(0, high=Y.shape[0],size=Y.shape[0])
        n, bins, patches = plt.hist(Y[idx],bins=np.linspace(-5,5,50),color=colors[i],density=True,
                                    linewidth=0.1,histtype='bar',cumulative=False,alpha=0.5)
        n, bins, patches = plt.hist(Y[idx],bins=np.linspace(-5,5,50),color=colors[i],density=True,
                                    linewidth=2,histtype='step',cumulative=True)

    plt.xlim([-3,3])
    plt.ylim([0,1])

    # reset xticks to be in the "weight" range
    locs, labels = plt.xticks()
    plt.xticks( locs, [int(x*factor+70) for x in locs] )

    plot_cleanup()

    plt.show()

In [None]:
# Bagging demo: load fisher-iris, fit one tree per bootstrap sample, show each
# individual tree and finally the aggregated (majority vote) result.
df = datasets.load_iris()

idx = [0,2]
X = df['data'][50:,idx]
y = df['target'][50:]

feat = [df['feature_names'][x] for x in idx]

# number of bootstrap samples / trees in the ensemble
n_trees = 5

# get minimum and maximum values for dataset
# these are used in the plotting
x0_min = X[:, 0].min()
x0_max = X[:, 0].max()
x1_min = X[:, 1].min()
x1_max = X[:, 1].max()

xx, yy = np.meshgrid(np.linspace(x0_min, x0_max, 1000),
                     np.linspace(x1_min, x1_max, 1000))


# plot the original data
fig = plt.figure(figsize=[8,5])

# plot the individual data points - colouring by the *true* outcome
color = np.asarray(y.ravel(),dtype='float')
plt.scatter(X[:, 0], X[:, 1], c=color, marker='o',
            s=60, cmap=cm)
plt.xlabel(feat[0],fontsize=24)
plt.ylabel(feat[1],fontsize=24)
plt.axis("tight")
# cleanup plot
plot_cleanup()
# disable ticks
plt.xticks([])
plt.yticks([])
plt.show()


np.random.seed(321)

mdls = list()
# running sum of the per-tree predictions over the grid
Z_all = np.zeros(xx.shape)

for i in range(n_trees):
    fig = plt.figure(figsize=[8,5])

    # bootstrap sample: draw len(X) row indices with replacement
    idx = np.random.randint(0, X.shape[0], X.shape[0])
    # out-of-bag rows: those never drawn into this sample
    idxOOB = np.setdiff1d(np.arange(X.shape[0]), idx)

    # A fresh estimator per sample: re-fitting one shared classifier returns
    # the same object every time, which left `mdls` holding n_trees references
    # to the last fitted tree.
    mdl = tree.DecisionTreeClassifier(max_depth=5).fit(X[idx,:],y[idx])
    mdls.append(mdl)

    Z = mdl.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    Z_all += Z.astype(float)

    # plot the contour - colouring different regions according to class
    cs = plt.contourf(xx, yy, Z, cmap=cm)

    # plot the individual data points - colouring by the *true* outcome
    color = np.asarray(y[idx].ravel(),dtype='float')
    plt.scatter(X[idx, 0], X[idx, 1], c=color, edgecolor='k',
                marker='o', linewidth=2,
                s=60, cmap=cm)

    # plot "s" (square markers) for data points which weren't included
    color = np.asarray(y[idxOOB].ravel(),dtype='float')
    plt.scatter(X[idxOOB, 0], X[idxOOB, 1], c=color, edgecolor='gray',
                marker='s', linewidth=2,
                s=60, cmap=cm)

    plt.xlabel(feat[0],fontsize=24)
    plt.ylabel(feat[1],fontsize=24)
    plt.axis("tight")

    # cleanup plot
    plot_cleanup()

    # disable ticks
    plt.xticks([])
    plt.yticks([])

    txt = 'Tree {}'.format(i+1)
    plt.text(7.0, 3.5, txt, fontdict={'fontsize':12})
    plt.show()


print('Final aggregation')
# mean of the predicted labels, rounded to the nearest label
Z_all = Z_all / float(n_trees)
Z_all = np.round(Z_all)

fig = plt.figure(figsize=[8,5])
# plot the contour - colouring different regions according to class
cs = plt.contourf(xx, yy, Z_all, cmap=cm)

# plot the individual data points - colouring by the *true* outcome
color = np.asarray(y.ravel(),dtype='float')

plt.scatter(X[:, 0], X[:, 1], c=color, edgecolor='k',
            marker='o', linewidth=2,
            s=60, cmap=cm)

plt.xlabel(feat[0],fontsize=24)
plt.ylabel(feat[1],fontsize=24)
plt.axis("tight")

# cleanup plot
plot_cleanup()

# disable ticks
plt.xticks([])
plt.yticks([])

txt = 'All trees'
plt.text(7.0, 3.5, txt, fontdict={'fontsize':12})
plt.show()


# random forest

In [None]:
# load fisher-iris and build a small random forest by hand: every tree is fit
# on a bootstrap sample of the rows AND a random pair of features, and each
# tree's decision surface is drawn in its own panel
from sklearn.base import clone

df = datasets.load_iris()

# drop the first 50 rows
# NOTE(review): presumably this removes one class to leave a two-class problem - confirm
X = df['data'][50:,:]
y = df['target'][50:]
n_obs, n_feat = X.shape

feat = df['feature_names']

# fixed square plotting window shared by every panel
x_min = 0
x_max = 8
xx, yy = np.meshgrid(np.linspace(x_min, x_max, 1000),
                     np.linspace(x_min, x_max, 1000))


np.random.seed(321)
clf = tree.DecisionTreeClassifier(max_depth=5)

n_trees = 6  # one panel per tree in the 2x3 grid below
mdls = list()
fig = plt.figure(figsize=[14,10])

for i in range(n_trees):
    ax = fig.add_subplot(2,3,i+1)

    # bootstrap sample: draw n_obs row indices with replacement
    idx = np.random.randint(0, n_obs, n_obs)
    # out-of-bag rows: the ones that were never drawn for this tree
    idxOOB = np.setdiff1d(np.arange(n_obs), idx)

    # random pair of features for this tree
    idxFeat = np.random.permutation(n_feat)[:2]

    # fit a fresh copy of the estimator: scikit-learn's fit() returns the
    # estimator itself, so fitting `clf` directly would leave `mdls` holding
    # n_trees references to one and the same (last fitted) tree
    mdl = clone(clf).fit(X[idx,:][:,idxFeat],y[idx])
    mdls.append(mdl)

    # predicted class at every grid point (x axis = first chosen feature,
    # y axis = second chosen feature), reshaped back onto the grid
    Z = mdl.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)

    # plot the contour - colouring different regions according to class
    cs = plt.contourf(xx, yy, Z, cmap=cm)

    # plot the sampled data points - colouring by the *true* outcome
    color = np.asarray(y[idx].ravel(),dtype='float')
    plt.scatter(X[idx, idxFeat[0]], X[idx, idxFeat[1]], c=color, marker='o', edgecolors='k',
                s=60, cmap=cm)

    # gray-edged squares mark the out-of-bag points this tree never saw
    color = np.asarray(y[idxOOB].ravel(),dtype='float')
    plt.scatter(X[idxOOB, idxFeat[0]], X[idxOOB, idxFeat[1]], c=color, marker='s',
                linewidth=2, edgecolors='gray',
                s=60, cmap=cm)

    plt.xlabel(feat[idxFeat[0]],fontsize=24)
    plt.ylabel(feat[idxFeat[1]],fontsize=24)
    plt.axis("tight")

    # cleanup plot
    plot_cleanup()

    # disable ticks
    plt.xticks([])
    plt.yticks([])

    txt = 'Tree {}'.format(i+1)
    plt.text(3.0, 4.0, txt, fontdict={'fontsize':12,'fontweight':'bold'})
plt.show()

# random forest overview

In [None]:
# Step-by-step view of how one tree of a random forest is built on fisher-iris:
#   1. the bootstrap sample together with the out-of-bag (OOB) observations
#   2. the bootstrap sample alone
#   3. the bootstrap sample on top of the fitted tree's decision surface
df = datasets.load_iris()

# drop the first 50 rows
# NOTE(review): presumably this removes one class to leave a two-class problem - confirm
X = df['data'][50:,:]
y = df['target'][50:]
n_obs, n_feat = X.shape

feat = df['feature_names']

# fixed square plotting window, also used as the axis limits of every figure
x_min = 0
x_max = 8
xx, yy = np.meshgrid(np.linspace(x_min, x_max, 1000),
                     np.linspace(x_min, x_max, 1000))


np.random.seed(172631)
# bootstrap sample: draw n_obs row indices with replacement
idx = np.random.randint(0, n_obs, n_obs)
# out-of-bag rows: the ones that were never drawn
idxOOB = np.setdiff1d(np.arange(n_obs), idx)

# random pair of features
idxFeat = np.random.permutation(n_feat)[:2]

# build the estimator inside this cell (depth-5 tree) instead of refitting a
# `clf` object and appending to a `mdls` list created by a different cell, so
# the cell runs on its own and re-running it does not accumulate state
mdl = tree.DecisionTreeClassifier(max_depth=5)
mdl = mdl.fit(X[idx,:][:,idxFeat],y[idx])

# predicted class at every grid point, reshaped back onto the grid
Z = mdl.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)


def plot_bootstrap_sample(show_oob=False, surface=None, grid=False):
    """Draw the bootstrap sample of this cell in a new figure and show it.

    show_oob: also draw the out-of-bag observations as gray-edged squares.
    surface:  grid of predicted classes (same shape as xx) drawn underneath
              the points with contourf; nothing is drawn when None.
    grid:     call plt.grid() after plot_cleanup().
    """
    plt.figure(figsize=[14,10])

    if surface is not None:
        plt.contourf(xx, yy, surface, cmap=cm)

    # sampled data points - coloured by the *true* outcome
    color = np.asarray(y[idx].ravel(),dtype='float')
    plt.scatter(X[idx, idxFeat[0]], X[idx, idxFeat[1]], c=color, marker='o',
                edgecolor='k', linewidth=2,
                s=60, cmap=cm)

    if show_oob:
        # squares for data points which weren't included in the sample
        color = np.asarray(y[idxOOB].ravel(),dtype='float')
        plt.scatter(X[idxOOB, idxFeat[0]], X[idxOOB, idxFeat[1]], c=color, marker='s',
                    linewidth=2, edgecolors='gray',
                    s=60, cmap=cm)

    plt.xlabel(feat[idxFeat[0]],fontsize=24)
    plt.ylabel(feat[idxFeat[1]],fontsize=24)
    plt.axis("tight")
    # cleanup plot
    plot_cleanup()
    if grid:
        plt.grid()
    # same window for every figure so the three plots line up
    plt.xlim([x_min, x_max])
    plt.ylim([x_min, x_max])
    plt.show()


# plot the bootstrap sample with the OOB observations
plot_bootstrap_sample(show_oob=True)

# plot the bootstrap sample w/o OOB
plot_bootstrap_sample()

# plot the bootstrap sample w/o OOB and with decision surface
plot_bootstrap_sample(surface=Z, grid=True)

# performance curve of random forest

## on training set (out-of-bag estimate)

In [None]:
# Out-of-bag (OOB) error of a random forest as its trees are added one by one.
#
# _generate_unsampled_indices is a private scikit-learn helper. Newer releases
# keep it in sklearn.ensemble._forest and take the bootstrap size as a third
# argument; older releases expose it from sklearn.ensemble.forest with two.
try:
    from sklearn.ensemble._forest import _generate_unsampled_indices
    oob_takes_bootstrap_size = True
except ImportError:
    from sklearn.ensemble.forest import _generate_unsampled_indices
    oob_takes_bootstrap_size = False

df = datasets.load_iris()

X = df['data']
y = df['target']

n_trees = 50
np.random.seed(321)
mdl = ensemble.RandomForestClassifier(n_estimators=n_trees, oob_score=True)
mdl = mdl.fit(X,y)

n_samples = X.shape[0]
n_classes = len(mdl.classes_)

err = list()
pred = np.zeros([n_samples,n_trees])        # per-tree prediction, filled for OOB rows only
roll_pred = np.zeros([n_samples,n_trees])   # ensemble label using trees 0..i
idx = np.zeros([n_samples,n_trees],dtype=bool)  # True where the row was OOB for the tree
votes = np.zeros([n_samples,n_classes])     # running OOB vote count per class

for i, estimator in enumerate(mdl.estimators_):
    # rows that were not part of this tree's bootstrap sample
    if oob_takes_bootstrap_size:
        # the forest is fit without max_samples, so each bootstrap draws n_samples rows
        idxOOB = _generate_unsampled_indices(estimator.random_state, n_samples, n_samples)
    else:
        idxOOB = _generate_unsampled_indices(estimator.random_state, n_samples)

    # update predictions
    curr_pred = estimator.predict(X[idxOOB,:])
    pred[idxOOB,i] = curr_pred
    idx[idxOOB,i] = True

    # the tree's output is used as a position into mdl.classes_; count it as
    # one vote. Averaging the positions and rounding is not a vote once there
    # are more than two classes (votes for 0 and 2 would average to 1).
    votes[idxOOB, curr_pred.astype(int)] += 1

    # number of trees so far for which each row was out-of-bag
    idxKeep = np.sum(idx[:, :i+1],axis=1)
    seen = idxKeep > 0

    # majority vote, mapped to the class labels (ties go to the first class)
    roll_pred[seen,i] = mdl.classes_[np.argmax(votes[seen],axis=1)]

    # calculate current error over the rows that have at least one OOB vote
    err.append( 1.0-np.mean(roll_pred[seen,i] == y[seen] ) )

err = np.asarray(err)
plt.figure(figsize=[10,7])
# the first entry of err corresponds to a forest of one tree, so start at 1
plt.plot(np.arange(1, err.shape[0]+1),err*100.0,color=colors[1],linewidth=4)
plt.ylabel('Out-of-bag error (%)',fontsize=20)
plt.xlabel('Number of trees',fontsize=20)
plot_cleanup()
plt.show()

In [None]:
# Out-of-bag (OOB) error of a random forest as its trees are added one by one.
#
# _generate_unsampled_indices is a private scikit-learn helper. Newer releases
# keep it in sklearn.ensemble._forest and take the bootstrap size as a third
# argument; older releases expose it from sklearn.ensemble.forest with two.
try:
    from sklearn.ensemble._forest import _generate_unsampled_indices
    oob_takes_bootstrap_size = True
except ImportError:
    from sklearn.ensemble.forest import _generate_unsampled_indices
    oob_takes_bootstrap_size = False

df = datasets.load_iris()

X = df['data']
y = df['target']

n_trees = 50
np.random.seed(321)
mdl = ensemble.RandomForestClassifier(n_estimators=n_trees, oob_score=True)
mdl = mdl.fit(X,y)

n_samples = X.shape[0]
n_classes = len(mdl.classes_)

err = list()
pred = np.zeros([n_samples,n_trees])        # per-tree prediction, filled for OOB rows only
roll_pred = np.zeros([n_samples,n_trees])   # ensemble label using trees 0..i
idx = np.zeros([n_samples,n_trees],dtype=bool)  # True where the row was OOB for the tree
votes = np.zeros([n_samples,n_classes])     # running OOB vote count per class

for i, estimator in enumerate(mdl.estimators_):
    # rows that were not part of this tree's bootstrap sample
    if oob_takes_bootstrap_size:
        # the forest is fit without max_samples, so each bootstrap draws n_samples rows
        idxOOB = _generate_unsampled_indices(estimator.random_state, n_samples, n_samples)
    else:
        idxOOB = _generate_unsampled_indices(estimator.random_state, n_samples)

    # update predictions
    curr_pred = estimator.predict(X[idxOOB,:])
    pred[idxOOB,i] = curr_pred
    idx[idxOOB,i] = True

    # the tree's output is used as a position into mdl.classes_; count it as
    # one vote. Averaging the positions and rounding is not a vote once there
    # are more than two classes (votes for 0 and 2 would average to 1).
    votes[idxOOB, curr_pred.astype(int)] += 1

    # number of trees so far for which each row was out-of-bag
    idxKeep = np.sum(idx[:, :i+1],axis=1)
    seen = idxKeep > 0

    # majority vote, mapped to the class labels (ties go to the first class)
    roll_pred[seen,i] = mdl.classes_[np.argmax(votes[seen],axis=1)]

    # calculate current error over the rows that have at least one OOB vote
    err.append( 1.0-np.mean(roll_pred[seen,i] == y[seen] ) )

err = np.asarray(err)
plt.figure(figsize=[10,7])
# the first entry of err corresponds to a forest of one tree, so start at 1
plt.plot(np.arange(1, err.shape[0]+1),err*100.0,color=colors[1],linewidth=4)
plt.ylabel('Out-of-bag error (%)',fontsize=20)
plt.xlabel('Number of trees',fontsize=20)
plot_cleanup()
plt.show()

In [None]:
# Horizontal bar chart of the fitted forest's feature importances, with the
# spread across the individual trees drawn as error bars.
importances = mdl.feature_importances_

# one row of importances per tree -> standard deviation per feature
per_tree = np.array([est.feature_importances_ for est in mdl.estimators_])
std = per_tree.std(axis=0)

# feature positions ordered from largest to smallest importance
indices = np.argsort(importances)[::-1]

n_feat = X.shape[1]
bar_pos = range(n_feat)
bar_labels = [feat[k] for k in indices]

# Plot the feature importances of the forest
plt.figure(figsize=[10,7])
plt.barh(bar_pos, importances[indices],
         xerr=std[indices], color=colors[0], align="center")
plt.yticks(bar_pos, bar_labels)
# leave half a bar of padding below the first and above the last bar
plt.ylim([-1, n_feat])
plot_cleanup()
plt.show()