- [8.1.1 Regression Trees](#8.1.1-Regression-Trees)
- [8.1.2 Classification Trees](#8.1.2-Classification-Trees)
- [Lab: 8.3.1 Fitting Classification Trees](#8.3.1-Fitting-Classification-Trees)
- [Lab: 8.3.2 Fitting Regression Trees](#8.3.2-Fitting-Regression-Trees)
- [Lab: 8.3.3 Bagging and Random Forests](#8.3.3-Bagging-and-Random-Forests)
- [Lab: 8.3.4 Boosting](#8.3.4-Boosting)

In [None]:
# !pip install pydotplus

# Chapter 8 - Tree-based Methods

In [None]:
%matplotlib inline

# %load ../standard_import.txt
from collections import Counter

import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

import pydotplus
from IPython.display import Image

from sklearn.model_selection import train_test_split, cross_val_score
try:
    # Old scikit-learn releases bundled `six`; newer ones removed it.
    from sklearn.externals.six import StringIO  
except ImportError:
    from io import StringIO
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier, export_graphviz
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, BaggingRegressor, RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import confusion_matrix, classification_report, mean_squared_error

# Newer Matplotlib releases renamed the bundled seaborn styles to
# 'seaborn-v0_8-*' and later dropped the old names, so pick whichever
# spelling this installation actually provides.
plt.style.use('seaborn-white' if 'seaborn-white' in plt.style.available else 'seaborn-v0_8-white')

In [None]:
def tree_plot(clf, features, labels=None):
    """Render a fitted decision tree as an inline PNG image.

    The tree is exported to Graphviz DOT text with ``export_graphviz`` and
    converted to PNG bytes through ``pydotplus``.

    Parameters
    ----------
    clf : estimator
        Fitted tree estimator, passed unchanged to ``export_graphviz``.
    features : iterable of str or None
        Feature names shown in the split nodes. Any iterable (for example a
        pandas ``Index``) is accepted; it is copied into a plain list.
    labels : iterable of str, bool or None, optional
        Class names shown in the nodes. ``None`` and booleans are forwarded
        untouched; any other iterable is copied into a plain list.

    Returns
    -------
    IPython.display.Image
        Image object wrapping the PNG bytes produced by pydotplus.
    """
    # Imports are local so the helper works regardless of which top-level
    # names the notebook happens to have in scope.
    from IPython.display import Image
    from pydotplus import graph_from_dot_data
    from sklearn.tree import export_graphviz

    # Hand plain lists to export_graphviz: some scikit-learn releases validate
    # these arguments strictly and reject other sequence types.
    feature_names = None if features is None else list(features)
    if labels is None or isinstance(labels, bool):
        class_names = labels
    else:
        class_names = list(labels)

    dot_data = export_graphviz(
        clf,
        out_file=None,  # return the DOT source as a string instead of writing a file
        feature_names=feature_names,
        class_names=class_names,
        filled=True,
        rounded=True,
        special_characters=True,
        impurity=True,
        proportion=False,
    )
    return Image(graph_from_dot_data(dot_data).create_png())

### 8.1.1 Regression Trees

In [None]:
# Read the Hitters data, then discard every row that has a missing value.
df = pd.read_csv('../../_data/Hitters.csv')
df = df.dropna()
df.info()

In [None]:
# Predictors and log-transformed response for the regression tree.
# `.as_matrix()` was removed in pandas 1.0; `.values` works on every release.
X = df[['Years', 'Hits']].values
y = np.log(df.Salary.values)

# Raw salary next to its log transform.
fig, (ax1, ax2) = plt.subplots(1,2, figsize=(11,4))
ax1.hist(df.Salary.values)
ax1.set_xlabel('Salary')
ax2.hist(y)
ax2.set_xlabel('Log(Salary)');

In [None]:
# Three-leaf regression tree on Years and Hits; `fit` returns the estimator itself.
regr = DecisionTreeRegressor(max_leaf_nodes=3).fit(X, y)
regr

### Figure 8.1

In [None]:
# Draw the fitted three-leaf tree.
tree_plot(regr, features=['Years', 'Hits'])

###  Figure 8.2

In [None]:
# Scatter of Hits against Years with the tree's two split lines overlaid.
ax = df.plot(x='Years', y='Hits', kind='scatter', color='orange', figsize=(7,6))
ax.set_xlim(0, 25)
ax.set_ylim(bottom=-5)
ax.set_xticks([1, 4.5, 24])
ax.set_yticks([1, 117.5, 238])
# Region boundaries: Years = 4.5, and Hits = 117.5 to the right of it.
ax.vlines(4.5, ymin=-5, ymax=250)
ax.hlines(117.5, xmin=4.5, xmax=25)
for region, position in (('R1', (2, 117.5)), ('R2', (11, 60)), ('R3', (11, 170))):
    ax.annotate(region, xy=position, fontsize='xx-large');

### Pruning
Older scikit-learn releases did not support pruning; see the first point under 'disadvantages of decision trees' in the <A href='http://scikit-learn.github.io/dev/modules/tree.html#'>documentation</A>. An implementation was <A href='https://github.com/scikit-learn/scikit-learn/pull/941'>discussed</A>, and cost-complexity pruning has since been added through the `ccp_alpha` parameter of the tree estimators (scikit-learn 0.22 and later). In any case, Random Forests generally have better predictive qualities than a single pruned tree.
    

### 8.1.2 Classification Trees

Dataset available on http://www-bcf.usc.edu/~gareth/ISL/data.html

In [None]:
# Read the Heart data, drop the unnamed leading column, then drop incomplete rows.
df2 = pd.read_csv('../../_data/Heart.csv')
df2 = df2.drop(columns='Unnamed: 0').dropna()
df2.info()

In [None]:
# A cell renders only its final expression, so gather all three tallies into
# one mapping instead of silently discarding the first two.
{col: Counter(df2[col]) for col in ('ChestPain', 'Thal', 'AHD')}

In [None]:
# Replace the two categorical columns with integer codes.
for col in ('ChestPain', 'Thal'):
    df2[col] = pd.factorize(df2[col])[0]

In [None]:
# Show both tallies; as two bare expressions only the second one was rendered.
{col: Counter(df2[col]) for col in ('ChestPain', 'Thal')}

### Set labels

In [None]:
# Every column except AHD is a predictor; AHD becomes an integer-coded target.
X2 = df2.drop(columns='AHD')
y2 = pd.factorize(df2['AHD'])[0]
y2

### Decision Tree

In [None]:
# max_features=3 makes each split consider a random subset of the columns, so
# the seed is pinned to keep the tree (and the score and plot below) reproducible.
clf = DecisionTreeClassifier(max_depth=None, max_leaf_nodes=6, max_features=3, random_state=0)
clf.fit(X2, y2)

In [None]:
# Score on the same data the classifier was fitted on.
clf.score(X=X2, y=y2)

In [None]:
# Draw the classification tree with the column names as feature labels.
tree_plot(clf, features=list(X2.columns), labels=['No', 'Yes'])

## Lab

### 8.3.1 Fitting Classification Trees

In [None]:
# Read the Carseats data and drop the unnamed leading column.
df3 = pd.read_csv('../../_data/Carseats.csv')
df3 = df3.drop(columns='Unnamed: 0')
df3.head()

### Feature wrangling/engineering

In [None]:
# Binary target: 1 when Sales exceeds 8, otherwise 0.
df3['High'] = (df3['Sales'] > 8).astype('int64')
# Integer codes for the shelf-location categories.
df3['ShelveLoc'] = pd.factorize(df3['ShelveLoc'])[0]

# Both Yes/No columns share the same 0/1 encoding.
yes_no_codes = {'No':0, 'Yes':1}
df3['Urban'] = df3['Urban'].map(yes_no_codes)
df3['US'] = df3['US'].map(yes_no_codes)
df3.info()

In [None]:
# First five rows after encoding (five is the default for head()).
df3.head()

### Set labels

In [None]:
# Predictors exclude Sales (the source of the target) and the target itself.
X = df3.drop(columns=['Sales', 'High'])
y = df3['High']

# Half of the rows are held out for testing; the split is seeded.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=0
)

### Decision Tree

In [None]:
# Depth-limited classifier fitted on the full data set; `fit` returns the estimator.
clf = DecisionTreeClassifier(max_depth=6).fit(X, y)
clf

In [None]:
# Classification metrics on the data the tree was fitted on.
print(classification_report(y_true=y, y_pred=clf.predict(X)))

In [None]:
# Confusion matrix on the data the tree was fitted on.
confusion_matrix(y_true=y, y_pred=clf.predict(X))

In [None]:
# Draw the depth-6 tree with the column names as feature labels.
tree_plot(clf, features=list(X.columns), labels=['No', 'Yes'])

### Decision Tree train-test split

In [None]:
# Fit a separate estimator on the training half so that `clf`, which was
# fitted on the full data and already reported and plotted, is not overwritten.
clf_split = DecisionTreeClassifier(max_depth=6)
clf_split.fit(X_train, y_train)
pred = clf_split.predict(X_test)

In [None]:
# Transpose so rows are predictions and columns are true classes, then label both axes.
cm = pd.DataFrame(
    confusion_matrix(y_test, pred).T,
    index=pd.Index(['No', 'Yes'], name='Predicted'),
    columns=pd.Index(['No', 'Yes'], name='True'),
)
cm

In [None]:
# Classification metrics on the held-out test half.
# NOTE(review): the original author reported 74% precision here; that figure
# is not pinned by the code, so compare it with the output printed below.
print(classification_report(y_test, pred))

### 8.3.2 Fitting Regression Trees

In [None]:
# Read the Boston housing data and summarise its columns and dtypes.
boston_df = pd.read_csv('../../_data/Boston.csv')
boston_df.info()

### Set labels

In [None]:
# medv is the response; every other column is a predictor.
X = boston_df.drop(columns='medv')
y = boston_df['medv']

# Half of the rows are held out for testing; the split is seeded.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=0
)

### Decision Tree Regression

In [None]:
# No pruning is applied here; the tree is kept small by capping its depth at 3.
regr2 = DecisionTreeRegressor(max_depth=3).fit(X_train, y_train)
pred = regr2.predict(X_test)

In [None]:
# Draw the depth-3 regression tree with the column names as feature labels.
tree_plot(regr2, features=list(X.columns))

In [None]:
fig, ax = plt.subplots()
ax.scatter(pred, y_test, label='medv')
# Draw the y = x reference in data coordinates. A corner-to-corner line in
# axes coordinates only equals y = x when both axes have identical limits.
lo = min(pred.min(), y_test.min())
hi = max(pred.max(), y_test.max())
ax.plot([lo, hi], [lo, hi], '--k')
ax.set_xlabel('pred')
ax.set_ylabel('y_test');

In [None]:
# Test-set mean squared error of the single regression tree.
mean_squared_error(y_true=y_test, y_pred=pred)

### 8.3.3 Bagging and Random Forests

In [None]:
# (rows, columns) of the predictor matrix; the notebook expects 13 predictor
# columns here, so confirm that against the displayed shape.
X.shape

### Bagging: using all features

In [None]:
# Bagging is a random forest that considers every predictor at each split, so
# take the count from the training data instead of hard-coding 13.
regr1 = RandomForestRegressor(max_features=X_train.shape[1], random_state=1)
regr1.fit(X_train, y_train)

In [None]:
pred = regr1.predict(X_test)

fig, ax = plt.subplots()
ax.scatter(pred, y_test, label='medv')
# Draw the y = x reference in data coordinates. A corner-to-corner line in
# axes coordinates only equals y = x when both axes have identical limits.
lo = min(pred.min(), y_test.min())
hi = max(pred.max(), y_test.max())
ax.plot([lo, hi], [lo, hi], '--k')
ax.set_xlabel('pred')
ax.set_ylabel('y_test');

In [None]:
# Test-set mean squared error of the bagged model.
mean_squared_error(y_true=y_test, y_pred=pred)

### Random forests: using 6 features

In [None]:
# Random forest that considers 6 predictors per split.
# Note: this rebinds `regr2`, which earlier held the single regression tree.
regr2 = RandomForestRegressor(max_features=6, random_state=1).fit(X_train, y_train)
regr2

In [None]:
# Test-set predictions and mean squared error of the random forest.
pred = regr2.predict(X_test)
mean_squared_error(y_true=y_test, y_pred=pred)

### Feature importance

In [None]:
# Feature importances of the random forest, expressed as percentages.
Importance = pd.DataFrame({'Importance': regr2.feature_importances_*100}, index=X.columns)

# Hide the legend through the public `legend=False` option rather than by
# overwriting the Axes' private `legend_` attribute after plotting.
Importance.sort_values('Importance', axis=0, ascending=True).plot(kind='barh', color='b', legend=False)
plt.xlabel('Variable Importance');

### 8.3.4 Boosting

In [None]:
# Gradient boosting with 500 estimators and a 0.01 learning rate, seeded.
regr = GradientBoostingRegressor(
    n_estimators=500, learning_rate=0.01, random_state=1
).fit(X_train, y_train)
regr

In [None]:
# Feature importances of the boosted model as percentages, smallest first.
feature_importance = regr.feature_importances_*100
rel_imp = pd.Series(feature_importance, index=X.columns).sort_values(ascending=True)

print(rel_imp)
# A Series is its own transpose and its plot has no legend by default, so the
# former `.T` and the private `legend_` reset were no-ops and are removed.
rel_imp.plot(kind='barh', color='b')
plt.xlabel('Variable Importance');

In [None]:
# Test-set mean squared error of the boosted model.
mean_squared_error(y_true=y_test, y_pred=regr.predict(X_test))