In [None]:
%pylab inline
%run helper_functions.py

import pandas as pd
import seaborn as sns
import numpy as np
import sklearn
sns.set_style("whitegrid")

In [None]:
# A helper function to display the tree.
# NOTE: requires the pydotplus and graphviz libraries.
#       On macOS/Linux, install them by typing "conda install pydotplus" at the terminal.
#       On Windows it's trickier, so we can skip this.
    
from IPython.display import Image 
import pydotplus
def plot_tree(clf, feature_names, target_names):
    """Render a decision tree as PNG image data via Graphviz.

    Parameters
    ----------
    clf
        Tree estimator to draw; it is handed straight to ``export_graphviz``.
        Callers in this notebook pass classifiers that were already fitted.
    feature_names
        Names used to label the features in the split nodes.
    target_names
        Names used to label the classes (forwarded as ``class_names``).

    Returns
    -------
    Whatever ``create_png()`` returns for the pydotplus graph built from the
    exported DOT source; callers wrap it in ``IPython.display.Image``.
    """
    # Import the exporter explicitly. A bare ``import sklearn`` does not
    # guarantee that the ``sklearn.tree`` submodule is loaded, so the former
    # ``sklearn.tree.export_graphviz`` lookup only worked when another cell
    # had already imported something from ``sklearn.tree``.
    from sklearn.tree import export_graphviz

    dot_data = export_graphviz(
        clf,
        out_file=None,  # get the DOT source back as a string instead of writing a file
        feature_names=feature_names,
        class_names=target_names,
        filled=True,
        rounded=True,
        special_characters=True,
    )
    return pydotplus.graph_from_dot_data(dot_data).create_png()



# LOAD IRIS DATASET

In [None]:
from sklearn import datasets

# Load the Iris dataset that ships with scikit-learn.
iris = datasets.load_iris()

# Feature matrix and target vector used by the cells below.
X, y = iris.data, iris.target

# Names used to label features and classes in tables and plots.
feature_names, class_labels = iris.feature_names, iris.target_names

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

# Unconstrained decision tree with a fixed seed.
tree = DecisionTreeClassifier(random_state=0)

# Accuracy on the very samples the tree was fitted on.
# (Despite its name, `score_test` holds the *training* accuracy.)
tree.fit(X, y)
score_test = tree.score(X, y)
print("Accuracy Training         :", score_test)

# Cross-validated accuracy (cv=10), reported as mean (+/- two standard deviations).
scores = cross_val_score(tree, X, y, cv=10)
cv_mean, cv_spread = scores.mean(), scores.std() * 2
print("Accuracy Cross Validation : %0.2f (+/- %0.2f)" % (cv_mean, cv_spread))

plot_boundaries_iris_dataset(tree, iris)

# Fit once more on the full feature matrix before drawing the tree.
# NOTE(review): presumably needed because plot_boundaries_iris_dataset may
# refit the estimator it receives -- confirm in helper_functions.py.
tree.fit(X, y)
Image(plot_tree(tree, iris.feature_names, iris.target_names))

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ShuffleSplit

# Hyper-parameter grid: every combination of the values 2..9 for both settings.
params = {
    'min_samples_leaf': range(2, 10),
    'max_leaf_nodes': range(2, 10),
}

# Seed the tree as well as the splitter so the search gives the same result on
# every run; the baseline tree in the earlier cell also uses random_state=0.
tree = DecisionTreeClassifier(random_state=0)

# Ten random train/test splits, reproducible through the fixed seed.
cv = ShuffleSplit(n_splits=10, random_state=0)
grid = GridSearchCV(estimator=tree, cv=cv, param_grid=params)
grid.fit(X, y)

# Summarize the results of the grid search.
print("Best parameters: ", grid.best_estimator_)
print("Best score: ", grid.best_score_)

plot_boundaries_iris_dataset(grid.best_estimator_, iris)

In [None]:
# Draw the tree selected by the grid search, labelled with the Iris feature and class names.
best_tree_png = plot_tree(grid.best_estimator_, iris.feature_names, iris.target_names)
Image(best_tree_png)

## Feature importance

In [None]:
# Importance of each feature in the tree chosen by the grid search, largest first.
best_tree = grid.best_estimator_
important_features = pd.Series(best_tree.feature_importances_, index=feature_names)
important_features.sort_values(ascending=False)

# Random Forests

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ShuffleSplit
from sklearn.ensemble import RandomForestClassifier
from IPython.display import display

# Random forest whose trees are limited to 4 leaves each. The seed makes the
# scores and importances below repeatable across runs.
tree = RandomForestClassifier(max_leaf_nodes=4, random_state=0)
scores = cross_val_score(tree, X, y, cv=5)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

# Feature importances of the forest fitted on the full dataset, largest first.
important_features = pd.Series(data=tree.fit(X, y).feature_importances_, index=feature_names)
# Display explicitly: this is not the last expression of the cell, so without
# display() the sorted ranking would be computed and then thrown away.
display(important_features.sort_values(ascending=False))

plot_boundaries_iris_dataset(tree, iris)

# GradientBoostingClassifier

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ShuffleSplit
from sklearn.ensemble import GradientBoostingClassifier
from IPython.display import display

# Gradient-boosted trees with default settings; seeded so the run is repeatable.
tree = GradientBoostingClassifier(random_state=0)
scores = cross_val_score(tree, X, y, cv=5)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

# Feature importances of the model fitted on the full dataset, largest first.
important_features = pd.Series(data=tree.fit(X, y).feature_importances_, index=feature_names)
# Display explicitly: this is not the last expression of the cell, so without
# display() the sorted ranking would be computed and then thrown away.
display(important_features.sort_values(ascending=False))

plot_boundaries_iris_dataset(tree, iris)

# Titanic

In [None]:
# Load the Titanic dataset from the titanic_clean.csv file.
dataset = pd.read_csv("../data/titanic_clean.csv")

# The `survived` column is the target; every remaining column is a feature.
y = dataset['survived']
X = dataset.drop(columns='survived')

# Labels used when printing stats and drawing trees.
feature_names = X.columns
# NOTE(review): presumably 0 = died and 1 = survived -- confirm against the CSV.
class_labels = ["Died", "Survived"]

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ShuffleSplit

# Hyper-parameter grid: only the number of leaves (2..6) is searched here.
params = {
    #'min_samples_leaf':  range(2,10),
    'max_leaf_nodes': range(2, 7),
}

# Seed the tree as well as the splitter so the search gives the same result on
# every run.
tree = DecisionTreeClassifier(random_state=0)

# Ten random train/test splits, reproducible through the fixed seed.
cv = ShuffleSplit(n_splits=10, random_state=0)
grid = GridSearchCV(estimator=tree, cv=cv, param_grid=params)
grid.fit(X, y)

# Summarize the results of the grid search.
print("Best parameters: ", grid.best_estimator_)
print("Best score: ", grid.best_score_)

# Draw the selected tree; as the last expression of the cell the image is displayed.
Image(plot_tree(grid.best_estimator_, feature_names, class_labels))


In [None]:
# Report the importances of the tuned tree selected by the grid search -- the
# same tree that is plotted in the preceding cell -- rather than refitting the
# unconstrained global `tree`. That global is also reassigned by other cells,
# so it depended on execution order. This matches how the Iris section
# computes its importances.
important_features = pd.Series(data=grid.best_estimator_.feature_importances_, index=feature_names)
important_features.sort_values(ascending=False)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient-boosted trees with default settings; seeded so the run is repeatable.
tree = GradientBoostingClassifier(random_state=0)
scores = cross_val_score(tree, X, y, cv=5)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

# Feature importances of the model fitted on the full dataset, largest first.
# As the last expression of the cell, the sorted Series is displayed.
important_features = pd.Series(data=tree.fit(X, y).feature_importances_, index=feature_names)
important_features.sort_values(ascending=False)


# Regression

In [None]:
from sklearn.tree import DecisionTreeRegressor
pylab.rcParams['figure.figsize'] = (16, 12)

# Synthetic 1-D regression problem: 200 sorted points in [0, 5) with y = sin(x).
rng = np.random.RandomState(1)
X = np.sort(5 * rng.rand(200, 1), axis=0)
y = np.sin(X).ravel()
# Perturb every fifth target (200 / 5 = 40 values) with uniform noise in (-1.5, 1.5].
y[::5] += 3 * (0.5 - rng.rand(40))

# Two trees of very different capacity: 2 leaves versus `num_leafs` leaves.
num_leafs = 20
regr_1 = DecisionTreeRegressor(max_leaf_nodes=2).fit(X, y)
regr_2 = DecisionTreeRegressor(max_leaf_nodes=num_leafs).fit(X, y)

# Evaluate both models on a dense grid over [0, 5), shaped as a single column.
X_test = np.arange(0.0, 5.0, 0.01).reshape(-1, 1)
y_1 = regr_1.predict(X_test)
y_2 = regr_2.predict(X_test)

# Scatter the training data and overlay both fitted step functions.
plt.figure()
plt.scatter(X, y, c="darkorange", label="data")
for prediction, colour, caption in (
    (y_1, "yellowgreen", "max_leaf_nodes=2"),
    (y_2, "red", "max_leaf_nodes=%s"%num_leafs),
):
    plt.plot(X_test, prediction, color=colour, label=caption, linewidth=2)
plt.xlabel("data")
plt.ylabel("target")
plt.title("Decision Tree Regression")
plt.legend()
plt.show()

