# HSST B10m Decision Trees


In [None]:
import numpy as np
from numpy import pi
from numpy.random import randint
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score, train_test_split,RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Synthetic two-class problem: two noisy, interleaved spiral arms
# (adapted from https://gist.github.com/45deg/e731d9e7f478de134def5668324c44c5)
noise = 1.0   # scale of the Gaussian jitter added to every point
N = 200       # number of samples per class
split = 0.25  # fraction of the samples held out as the test set

# Random angles in [0, 2*pi); the square root pushes more samples towards
# larger angles (a deterministic alternative would be np.linspace(0, 2*pi, 100)).
theta = np.sqrt(np.random.rand(N)) * 2 * pi

# Class 0: radius grows linearly with the angle.
r_a = 2 * theta + pi
data_a = np.column_stack((np.cos(theta) * r_a, np.sin(theta) * r_a))
x_a = data_a + noise * np.random.randn(N, 2)

# Class 1: same arm with the radius negated, i.e. reflected through the origin.
r_b = -r_a
data_b = np.column_stack((np.cos(theta) * r_b, np.sin(theta) * r_b))
x_b = data_b + noise * np.random.randn(N, 2)

# Labelled (x0, x1, class) table, shuffled in place along its rows.
# Nothing else in this cell reads it; the split below uses X and y.
res_a = np.hstack((x_a, np.zeros((N, 1))))
res_b = np.hstack((x_b, np.ones((N, 1))))
res = np.vstack((res_a, res_b))
np.random.shuffle(res)

# Feature matrix and label vector: class 0 rows first, then class 1 rows.
X = np.vstack((x_a, x_b))
y = np.concatenate((np.zeros(N), np.ones(N)))

# One scatter call per class so each arm gets its own default colour.
plt.scatter(*x_a.T)
plt.scatter(*x_b.T)
print("Input data")
plt.show()

# Unseeded hold-out split: a different partition is drawn on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=split, random_state=None
)


In [None]:
# Baseline model: a single decision tree with default (unrestricted) growth
# settings and a fixed seed.
clf = DecisionTreeClassifier(random_state=0)
print("10 fold cross validation")
# Per-fold ROC AUC estimated on the training split only; the test split is
# not touched here.
print(cross_val_score(clf, X_train, y_train, scoring='roc_auc', cv=10))

clf = clf.fit(X_train, y_train)

# ROC AUC is a ranking metric, so score it with the predicted probability of
# the positive class (column 1 <-> label 1) rather than the hard 0/1 output
# of predict(), which collapses the ROC curve to a single operating point.
print("AUC", roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1]))

# Draw the fitted tree structure.
print("Decision Tree")
plt.figure(figsize=(10, 10))
plot_tree(clf)
plt.show()

In [None]:
# Visualise the regions a single decision tree assigns to each class.
print("Decision Boundary Plot")
# Regular mesh covering the bounding box of all samples
# (np.linspace uses its default number of steps on each axis).
feature_1, feature_2 = np.meshgrid(
    np.linspace(X[:, 0].min(), X[:, 0].max()),
    np.linspace(X[:, 1].min(), X[:, 1].max())
)
# One row per mesh node, columns in the same order as the columns of X.
grid = np.vstack([feature_1.ravel(), feature_2.ravel()]).T
# Seed the tree exactly like the evaluated baseline (random_state=0):
# an unseeded tree may break ties between equally good splits differently,
# so the plotted boundary could differ from the scored model and from run
# to run.
tree = DecisionTreeClassifier(random_state=0).fit(X_train, y_train)
# Predicted class at every mesh node, folded back to the mesh shape.
y_pred = np.reshape(tree.predict(grid), feature_1.shape)
display = DecisionBoundaryDisplay(
    xx0=feature_1, xx1=feature_2, response=y_pred
)
display.plot(alpha=0.5, cmap="plasma")
# Overlay every sample (train and test), coloured by its true label.
display.ax_.scatter(
    X[:, 0], X[:, 1], c=y, edgecolor="black"
)
plt.show()

In [None]:
# Hyperparameter search space sampled by the randomized search below
param_dist={
    'n_estimators':range(50,100), # number of trees in the forest (50..99)
    'max_depth':range(1,20) # maximum depth of each tree (1..19) - "pruning"
}

# perform a randomized search of hyperparameters:
# n_iter=5 sampled candidates, each evaluated with 5-fold cross validation
print("Optimising hyperparameters")
rf = RandomForestClassifier()
rand_search=RandomizedSearchCV(rf,
                               param_distributions=param_dist,
                               n_iter=5,
                               cv=5)
rand_search.fit(X_train, y_train)

# get best model and parameters
best_rf = rand_search.best_estimator_
print(rand_search.best_params_)

# evaluate best model on the held-out test split.
# ROC AUC is a ranking metric, so score it with the predicted probability of
# the positive class (column 1 <-> label 1); hard labels from predict() would
# throw away the forest's graded vote and reduce the curve to a single point.
print("AUC", roc_auc_score(y_test, best_rf.predict_proba(X_test)[:, 1]))

# Visualise the regions the tuned random forest assigns to each class.
print("Decision Boundary Plot")
# Regular mesh covering the bounding box of all samples
# (np.linspace uses its default number of steps on each axis).
feature_1, feature_2 = np.meshgrid(
    np.linspace(np.min(X[:, 0]), np.max(X[:, 0])),
    np.linspace(np.min(X[:, 1]), np.max(X[:, 1])),
)
# One row per mesh node, columns in the same order as the columns of X.
grid = np.column_stack((feature_1.ravel(), feature_2.ravel()))
# Predicted class at every mesh node, folded back to the mesh shape.
y_pred = best_rf.predict(grid).reshape(feature_1.shape)
display = DecisionBoundaryDisplay(xx0=feature_1, xx1=feature_2, response=y_pred)
display.plot(alpha=0.5, cmap="plasma")
# Overlay every sample (train and test), coloured by its true label.
display.ax_.scatter(*X.T, c=y, edgecolor="black")
plt.show()