### Decision Trees

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

%matplotlib notebook

#### Three mechanisms for calculating impurity
<ol>
    <li>Entropy</li>
    <li>Gini Index</li>
    <li>Misclassification Error</li>
</ol>

In [2]:
def gini(p):
    """Return the Gini impurity of a two-class node.

    `p` is the proportion of samples belonging to one class; the other
    class therefore has proportion ``1 - p``. The result is the sum of
    ``share * (1 - share)`` over the two classes.
    """
    q = 1 - p  # share of the complementary class
    return p * (1 - p) + q * (1 - q)

def entropy(p):
    """Return the binary Shannon entropy, in bits, for class proportion `p`.

    Parameters
    ----------
    p : float or array-like of float
        Proportion of samples in one class (the other class has ``1 - p``).

    Returns
    -------
    numpy.float64 or numpy.ndarray
        ``-p*log2(p) - (1-p)*log2(1-p)``, evaluated element-wise for
        array-like input. At the boundaries ``p == 0`` and ``p == 1`` the
        limit value ``0`` is returned instead of ``nan``. Values outside
        ``[0, 1]`` still produce ``nan``.
    """
    p = np.asarray(p, dtype=float)
    q = 1.0 - p
    # 0 * log2(0) evaluates to nan (and emits warnings) in floating point;
    # silence the warnings and substitute the limit 0 for exactly those terms.
    with np.errstate(divide='ignore', invalid='ignore'):
        p_term = np.where(p == 0, 0.0, p * np.log2(p))
        q_term = np.where(q == 0, 0.0, q * np.log2(q))
    # Subtract from 0.0 rather than negate so a pure node yields +0.0, not -0.0.
    return 0.0 - (p_term + q_term)

def error(p):
    """Return the misclassification error of a two-class node.

    `p` is the proportion of one class; the error is one minus the share
    of whichever class is larger.
    """
    majority_share = np.asarray([p, 1 - p]).max()
    return 1 - majority_share

In [3]:
# Sample class proportions on [0, 1) in steps of 0.01 and evaluate every
# impurity measure at each sample.
x = np.arange(0, 1.0, 0.01)
# The entropy call is skipped at p == 0; a None placeholder is stored there.
ent = [None if p == 0.0 else entropy(p) for p in x]
gini_values = [gini(p) for p in x]
err = [error(p) for p in x]

In [4]:
# Plot the three impurity measures against the class proportion on one axes.
# The explicit Axes object is used for every call so the figure being drawn
# on is unambiguous.
fig, ax = plt.subplots()

labels = ['Entropy', 'Gini Index', 'Misclassification Errors']
linestyle = ['--', '-', '-.']
colors = ['red', 'blue', 'green']
for values, lab, ls, c in zip([ent, gini_values, err], labels, linestyle, colors):
    ax.plot(x, values, label=lab, color=c, linestyle=ls)
ax.legend()
ax.set_xlabel('p (proportion of samples in one class)')
ax.set_ylabel('Impurity Index')
# Assign the last call's return value so the cell shows no text repr.
_ = ax.set_title('Nature of Impurities', fontsize=10)

<IPython.core.display.Javascript object>

#### Decision Trees using scikit-learn

In [5]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import load_iris

# Load the iris dataset and keep only feature columns 2 and 3 (the plots
# further down label them petal length and petal width) plus the labels.
iris = load_iris()
X, y = iris.data[:, [2, 3]], iris.target

In [6]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the samples as the test split; random_state=0 is passed so
# the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [7]:
from sklearn.preprocessing import StandardScaler
# Fit the scaler on the training split only, then apply that same fitted
# transformation to both the training and the test split.
sc = StandardScaler()
X_train_std = sc.fit(X_train).transform(X_train)
X_test_std = sc.transform(X_test)

In [8]:
from matplotlib.colors import ListedColormap
import matplotlib.pyplot as plt

def plot_decision_region(X, y, classifier, test_idx=None, resolution=0.2):
    """Plot a classifier's decision regions over two features, plus the samples.

    Parameters
    ----------
    X : array of shape (n_samples, 2)
        Feature matrix; column 0 is drawn on the x-axis, column 1 on the y-axis.
    y : array of shape (n_samples,)
        Class label for each row of ``X``.
    classifier : object
        Fitted estimator exposing ``predict`` for an ``(n_points, 2)`` array.
    test_idx : sequence of int, optional
        Row indices of ``X`` to highlight as test samples. ``None`` or an
        empty sequence disables the highlight.
    resolution : float, default 0.2
        Step size of the grid on which the classifier is evaluated.

    Notes
    -----
    Draws on the current matplotlib figure and returns ``None``. Only five
    colour/marker pairs are defined, so at most five distinct classes can be
    drawn.
    """
    colors = ('red', 'blue', 'lightgreen', 'gray', 'cyan')
    markers = ('s', 'x', 'o', '^', 'v')

    classes = np.unique(y)
    cmap = ListedColormap(colors[:len(classes)])

    # Decision surface: predict every point of a regular grid spanning the
    # observed range of both features and draw the result as filled contours.
    x1_min, x1_max = X[:, 0].min(), X[:, 0].max()
    x2_min, x2_max = X[:, 1].min(), X[:, 1].max()

    xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, resolution),
                           np.arange(x2_min, x2_max, resolution))

    Z = classifier.predict(np.array([np.ravel(xx1), np.ravel(xx2)]).T)
    Z = Z.reshape(xx1.shape)
    _ = plt.contourf(xx1, xx2, Z, alpha=0.4, cmap=cmap)
    _ = plt.xlim(xx1.min(), xx1.max())
    _ = plt.ylim(xx2.min(), xx2.max())

    # Sample points, one scatter series per class.
    for idx, c1 in enumerate(classes):
        members = y == c1
        # `color=` (not `c=`) is used for the single RGBA tuple: passed as `c`
        # it would be read as per-point values whenever a class has exactly
        # four samples.
        _ = plt.scatter(x=X[members, 0], y=X[members, 1], marker=markers[idx],
                        color=cmap(idx), label=c1)

    # Highlight test points. An explicit None/length check is used because the
    # truth value of a multi-element numpy array is ambiguous and would raise.
    if test_idx is not None and len(test_idx) > 0:
        X_test = X[np.asarray(test_idx), :]
        plt.scatter(X_test[:, 0], X_test[:, 1], c='gray', alpha=0.8, linewidth=1, marker='o', s=55, label='test set')

In [9]:
# Stack the standardized training rows above the standardized test rows, and
# join the labels in the same order, so the test samples occupy the trailing
# positions of the combined arrays.
X_combined_std = np.concatenate((X_train_std, X_test_std), axis=0)
y_combined_std = np.concatenate((y_train, y_test))

In [10]:
# Decision tree limited to depth 3 that uses entropy as its split criterion,
# fitted on the standardized training split.
tree = DecisionTreeClassifier(criterion='entropy', max_depth=3, random_state=0)
tree.fit(X_train_std, y_train)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=3,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=0,
            splitter='best')

In [11]:
# Start a fresh figure so this plot is not drawn over a previous figure that
# is still active.
plt.figure()
# The test rows were stacked after the training rows, so their indices are
# derived from the array lengths instead of being hard-coded.
test_rows = range(len(X_train_std), len(X_combined_std))
plot_decision_region(X_combined_std, y_combined_std, test_idx=test_rows, classifier=tree, resolution=0.02)
_ = plt.legend()
_ = plt.xlabel('Petal Length')
_ = plt.ylabel('Petal Width')
_ = plt.title('Decision Tree Classifier using entropy mechanism', fontsize=10)

<IPython.core.display.Javascript object>

In [12]:
# Same depth-3 tree configuration, but with the Gini index as the split
# criterion; fitted on the standardized training split.
tree_gini = DecisionTreeClassifier(criterion='gini', max_depth=3, random_state=0)
tree_gini.fit(X_train_std, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=3,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=0,
            splitter='best')

In [13]:
# Start a fresh figure so this plot is not drawn over a previous figure that
# is still active.
plt.figure()
# The test rows were stacked after the training rows, so their indices are
# derived from the array lengths instead of being hard-coded.
test_rows = range(len(X_train_std), len(X_combined_std))
plot_decision_region(X_combined_std, y_combined_std, test_idx=test_rows, classifier=tree_gini, resolution=0.02)
_ = plt.legend()
_ = plt.xlabel('Petal Length')
_ = plt.ylabel('Petal Width')
_ = plt.title('Decision Tree Classifier using gini mechanism', fontsize=10)

<IPython.core.display.Javascript object>

In [14]:
# NOTE(review): this tree is built with criterion='gini', exactly like
# `tree_gini`, although the title below says "Error mechanism". The
# scikit-learn documentation for DecisionTreeClassifier lists no
# misclassification-error criterion, so this model presumably duplicates the
# Gini tree -- confirm the intent before relying on the comparison.
tree_error = DecisionTreeClassifier(criterion='gini', max_depth=3, random_state=0)
tree_error.fit(X_train_std, y_train)
# Start a fresh figure so this plot is not drawn over a previous figure that
# is still active.
plt.figure()
# The test rows were stacked after the training rows, so their indices are
# derived from the array lengths instead of being hard-coded.
test_rows = range(len(X_train_std), len(X_combined_std))
plot_decision_region(X_combined_std, y_combined_std, test_idx=test_rows, classifier=tree_error, resolution=0.02)
_ = plt.legend()
_ = plt.xlabel('Petal Length')
_ = plt.ylabel('Petal Width')
_ = plt.title('Decision Tree Classifier using Error mechanism', fontsize=10)

<IPython.core.display.Javascript object>

In [15]:
from sklearn.metrics import accuracy_score
# Predict the standardized test split with each of the three fitted trees.
y_pred, y_pred_gini, y_pred_error = (
    fitted.predict(X_test_std) for fitted in (tree, tree_gini, tree_error)
)

In [16]:
# Report the test accuracy of every tree as a percentage with three decimals.
for template, predictions in (
    ("Accuracy Entropy: %.3f", y_pred),
    ("Accuracy Gini Index: %.3f", y_pred_gini),
    ("Accuracy Error: %.3f", y_pred_error),
):
    print(template % (accuracy_score(y_test, predictions) * 100))

Accuracy Entropy: 97.778
Accuracy Gini Index: 97.778
Accuracy Error: 97.778


### Random Forests
#### Ensemble of Decision Trees

In [17]:
from sklearn.ensemble import RandomForestClassifier

In [18]:
# Random forest of 10 trees with entropy as the split criterion;
# random_state=1 fixes the seed and n_jobs=2 requests two parallel jobs.
rf = RandomForestClassifier(criterion='entropy', n_estimators=10, random_state=1, n_jobs=2)

In [19]:
# Fit the forest on the standardized training split.
rf.fit(X_train_std, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=2,
            oob_score=False, random_state=1, verbose=0, warm_start=False)

In [20]:
# Predict the standardized test split with the fitted forest.
y_pred_rf = rf.predict(X_test_std)

In [22]:
# Report the forest's test accuracy as a percentage with three decimals.
print("Accuracy: %.3f" %(accuracy_score(y_test, y_pred_rf)*100))

Accuracy: 95.556


In [24]:
# Start a fresh figure so this plot is not drawn over a previous figure that
# is still active.
plt.figure()
# The test rows were stacked after the training rows, so their indices are
# derived from the array lengths instead of being hard-coded.
test_rows = range(len(X_train_std), len(X_combined_std))
_ = plot_decision_region(X_combined_std, y_combined_std, test_idx=test_rows, classifier=rf, resolution=0.02)
_ = plt.legend()
# Axis labels match the decision-tree plots so the figure stands on its own.
_ = plt.xlabel('Petal Length')
_ = plt.ylabel('Petal Width')
_ = plt.title('Random Forest', fontsize=10)

<IPython.core.display.Javascript object>