# HSST B10m Decision Trees


In [None]:
import numpy as np
from numpy import pi
from numpy.random import randint
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score, RocCurveDisplay
from sklearn.model_selection import cross_val_score, train_test_split,RandomizedSearchCV,GridSearchCV
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier

### Data generator
Experiment with changing the noise level, the test/train split fraction and the number of samples.

In [None]:
# Two interleaved noisy spirals, one per class
# (see https://gist.github.com/45deg/e731d9e7f478de134def5668324c44c5)
noise = 1.0   # scale of the Gaussian jitter added to every point
N = 200       # number of points generated for EACH class
split = 0.25  # fraction of all points held out as test data

# Angle along the spiral. Taking the square root of uniform draws biases the
# angles towards the outer end of the spiral.
theta = np.sqrt(np.random.rand(N)) * 2 * pi

# Class 0: radius grows linearly with the angle.
r_a = 2*theta + pi
data_a = np.column_stack((np.cos(theta)*r_a, np.sin(theta)*r_a))
x_a = data_a + noise*np.random.randn(N, 2)

# Class 1: same spiral with a negated radius, i.e. reflected through the origin.
r_b = -2*theta - pi
data_b = np.column_stack((np.cos(theta)*r_b, np.sin(theta)*r_b))
x_b = data_b + noise*np.random.randn(N, 2)

# Combined (x0, x1, label) table, shuffled in place.
# NOTE(review): `res` is not read by any later cell shown here; it is kept so
# the notebook-level names and the global RNG state stay as before.
res_a = np.hstack((x_a, np.zeros((N, 1))))
res_b = np.hstack((x_b, np.ones((N, 1))))
res = np.vstack((res_a, res_b))
np.random.shuffle(res)

# Feature matrix and label vector actually used by the models below.
X = np.vstack((x_a, x_b))
y = np.concatenate((np.zeros(N), np.ones(N)))

# One scatter call per class so each gets its own default colour.
plt.scatter(*x_a.T)
plt.scatter(*x_b.T)
print("Input data")
plt.show()

# random_state=None: a different random train/test partition on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=split, random_state=None
)


## Basic Decision Tree
Experiment with changing the maximum depth of the tree. How does this affect the AUC?
Does the average AUC predicted by cross validation agree with the value from the test data?

In [None]:
max_depth = None  # how deep the tree generated can go, None is uncapped, experiment with lower values

clf = DecisionTreeClassifier(random_state=0, max_depth=max_depth)

# Estimate the AUC on the training data alone with 10-fold cross validation.
print("10 fold cross validation")
cv_auc = cross_val_score(clf, X_train, y_train, scoring='roc_auc', cv=10)
print(cv_auc)
print("Mean AUC", cv_auc.mean())

# Fit on the full training set and score on the held-out test set.
print("Final model")
clf = clf.fit(X_train, y_train)
# Score with the class-1 probability, not the hard 0/1 labels from predict():
# ROC AUC ranks samples by a continuous score, which is what the 'roc_auc'
# cross-validation scorer and RocCurveDisplay use. Hard labels collapse the
# curve to a single threshold and make the number incomparable with the CV
# values whenever the leaves are impure (e.g. with a capped max_depth).
print("AUC", roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1]))

clf_disp = RocCurveDisplay.from_estimator(clf, X_test, y_test)
plt.show()

# Draw the fitted tree structure.
print("Decision Tree")
plt.figure(figsize=(10, 10))
plot_tree(clf)
plt.show()

### Decision boundary plot
This is a helper that shows the decision boundaries for the classifier.

In [None]:
def plot_decision_boundaries(_X, _y, tree):
    """Plot a fitted classifier's predicted regions with the samples on top.

    Parameters
    ----------
    _X : array indexed as ``_X[:, 0]`` and ``_X[:, 1]``
        The two feature columns; they set the plotted range and are drawn
        as scatter points.
    _y : array-like
        Values used to colour the scatter points (one per row of ``_X``).
    tree : object with a ``predict`` method
        Classifier evaluated on the mesh. Despite the name, any estimator
        exposing ``predict`` works.

    The figure is shown with ``plt.show()``; nothing is returned.
    """
    x0, x1 = _X[:, 0], _X[:, 1]

    # Regular mesh spanning the bounding box of the data
    # (np.linspace's default number of points along each axis).
    xx0, xx1 = np.meshgrid(
        np.linspace(x0.min(), x0.max()),
        np.linspace(x1.min(), x1.max()),
    )

    # Predict once for every mesh node, then fold back into the mesh shape.
    mesh_points = np.column_stack((xx0.ravel(), xx1.ravel()))
    mesh_labels = np.reshape(tree.predict(mesh_points), xx0.shape)

    boundary = DecisionBoundaryDisplay(xx0=xx0, xx1=xx1, response=mesh_labels)
    boundary.plot(alpha=0.5, cmap="plasma")
    boundary.ax_.scatter(x0, x1, c=_y, edgecolor="black")
    plt.show()
    
# Show the single decision tree's regions against the training and test sets.
print("Decision Boundary Plot")
for _caption, _features, _labels in (
    ("Training data", X_train, y_train),
    ("Test data", X_test, y_test),
):
    print(_caption)
    plot_decision_boundaries(_features, _labels, clf)

## Bagging - Random forest

In [None]:
# Hyperparameter dictionary: candidate values sampled by the random search
param_dist = {
    'n_estimators': range(50, 100),  # size of forest
    'max_depth': range(1, 20),       # depth of forest - "pruning"
}

# perform a "randomised" grid search of hyperparameters:
# 5 random combinations, each evaluated with 5-fold cross validation.
# No `scoring` is given, so candidates are ranked by the estimator's default
# score (accuracy for classifiers).
print("Picking hyperparameters")
rf = RandomForestClassifier()
rand_search = RandomizedSearchCV(rf,
                                 param_distributions=param_dist,
                                 n_iter=5,
                                 cv=5)
rand_search.fit(X_train, y_train)

# get best model and parameters
best_rf = rand_search.best_estimator_
print(rand_search.best_params_)

# evaluate best model
# Use the class-1 probability rather than hard predict() labels: ROC AUC is
# a ranking metric, and thresholded 0/1 labels discard the forest's vote
# fractions and reduce the curve to a single operating point.
print("AUC", roc_auc_score(y_test, best_rf.predict_proba(X_test)[:, 1]))

# print decision boundary plot.
print("Decision Boundary Plot")
print("Training data")
plot_decision_boundaries(X_train, y_train, best_rf)
print("Test data")
plot_decision_boundaries(X_test, y_test, best_rf)

## Boosting

In [None]:
max_depth = 4  # depth cap of each weak learner in the boosted ensemble

# AdaBoost over 300 depth-limited decision trees; both seeds are fixed so the
# ensemble is reproducible for a given training set.
ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(random_state=0, max_depth=max_depth),
    n_estimators=300,
    random_state=0,
)

ada_clf.fit(X_train, y_train)

# Score with the class-1 probability rather than hard predict() labels:
# ROC AUC needs a continuous ranking score, and 0/1 labels reduce the curve
# to a single threshold.
print("AUC", roc_auc_score(y_test, ada_clf.predict_proba(X_test)[:, 1]))

# print decision boundary plot.
print("Decision Boundary Plot")
print("Training data")
plot_decision_boundaries(X_train, y_train, ada_clf)
print("Test data")
plot_decision_boundaries(X_test, y_test, ada_clf)