# Decision Trees for Regression

## But first, more on ML fundamentals

Some of this will repeat content, but it will set us up for looking again at:
* test/train split
* cross-validation

As well as at:
* pipelines
* grid-search cross-validation 

In [None]:
import numpy as np
import matplotlib.pyplot as plt

### Get the data
Here we're manufacturing it.

We're going to generate some fictitious data that follows an equation of our choosing:

$$ y(x) = 4 + 2x - x^2 + 0.075x^3 $$

In [None]:
# Seed NumPy's global generator so the noise, and every number derived from it,
# is reproducible from run to run.
np.random.seed(42)

# 50 evenly spaced sample points on [0, 10].
x = np.linspace(0, 10, 50)

# Gaussian noise with mean 0 and standard deviation 1.5, one draw per sample
# point (sized from x so the two can never disagree).
noise = np.random.normal(0, 1.5, x.size)

# Noise-free "theory" curve: y(x) = 4 + 2x - x^2 + 0.075x^3
y_underlying = 4 + 2*x - x**2 + 0.075*x**3

# Observed data = theory curve + noise. Built from y_underlying so the
# polynomial is written in exactly one place.
y = y_underlying + noise

In [None]:
# Noise-free theory curve, drawn as a black line
plt.plot(x, y_underlying, color='k')

# Noisy samples generated from the theory curve, drawn as black circles
plt.scatter(x, y, marker='o', color='k')

plt.show()

## ML training process

* get the data and pre-process if needed
* choose the model
* train the model
* evaluate the model

In [None]:
import sklearn.linear_model
import sklearn.neighbors
import sklearn.model_selection
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error, r2_score

Preparing the data

In [None]:
# Hold out 20% of the samples for testing; the fixed random_state keeps the
# split identical from run to run.
x_train, x_test, y_train, y_test = sklearn.model_selection.train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

# scikit-learn estimators expect a 2-D feature matrix, so turn each 1-D x
# array into a single column.
x_train_transformed = x_train[:, np.newaxis]
x_test_transformed = x_test[:, np.newaxis]

Using cross-validation to identify the optimum hyperparameter values for our model 

In [None]:
# Candidate neighbour counts: k = 1 .. 19
k_range = range(1, 20)

# For each k, run 5-fold cross-validation on the training data. The scorer
# returns *negative* MSE per fold, so flip the sign, take the square root to
# get a per-fold RMSE, and average over the folds.
k_scores = [
    np.sqrt(
        -cross_val_score(
            sklearn.neighbors.KNeighborsRegressor(n_neighbors=k),
            x_train_transformed,
            y_train,
            cv=5,
            scoring='neg_mean_squared_error',
        )
    ).mean()
    for k in k_range
]

plt.scatter(k_range, k_scores)
plt.xlabel('Value of K for KNN')
plt.ylabel('Cross-Validated RMSE')
plt.show()

Now we have our best guess for a hyperparameter

In [None]:
# Train KNN on the training split with the neighbour count chosen for this
# notebook (k = 4).
model = sklearn.neighbors.KNeighborsRegressor(n_neighbors=4).fit(
    x_train_transformed, y_train)

# Dense grid over [0, 10] used only to draw the fitted curve smoothly.
x_model_vals = np.linspace(0, 10, 500).reshape(-1, 1)
y_model_vals = model.predict(x_model_vals)

plt.plot(x, y_underlying, 'k')                  # theory curve
plt.scatter(x_train, y_train, color='black')    # training points
plt.scatter(x_test, y_test, color='blue')       # held-out test points
plt.plot(x_model_vals, y_model_vals, 'green')   # model prediction
plt.show()

# Error on the held-out test points.
y_pred = model.predict(x_test_transformed)
print('MSE_knn = ', mean_squared_error(y_test, y_pred))

# New content for Decision Tree

In [None]:
from sklearn.tree import DecisionTreeRegressor

In [None]:
# Train a decision tree with no constructor arguments (library defaults).
model = sklearn.tree.DecisionTreeRegressor().fit(x_train_transformed, y_train)

# Dense grid over [0, 10] used only to draw the fitted curve smoothly.
x_model_vals = np.linspace(0, 10, 500).reshape(-1, 1)
y_model_vals = model.predict(x_model_vals)

plt.plot(x, y_underlying, 'k')                  # theory curve
plt.scatter(x_train, y_train, color='black')    # training points
plt.scatter(x_test, y_test, color='blue')       # held-out test points
plt.plot(x_model_vals, y_model_vals, 'green')   # model prediction
plt.show()

# Error on the held-out test points.
y_pred = model.predict(x_test_transformed)
print('MSE_tree = ', mean_squared_error(y_test, y_pred))

### We can see the overfitting

Temper this with regularization: use cross-validation to search for a good value of `max_depth`.

In [None]:
# Candidate tree depths: max_depth = 1 .. 19
h_range = range(1, 20)

# For each depth, run 5-fold cross-validation on the training data. The scorer
# returns *negative* MSE per fold, so flip the sign, take the square root to
# get a per-fold RMSE, and average over the folds.
h_scores = [
    np.sqrt(
        -cross_val_score(
            sklearn.tree.DecisionTreeRegressor(max_depth=h),
            x_train_transformed,
            y_train,
            cv=5,
            scoring='neg_mean_squared_error',
        )
    ).mean()
    for h in h_range
]

plt.scatter(h_range, h_scores)
plt.xlabel('Value of Max_Depth for Tree')
plt.ylabel('Cross-Validated RMSE')
plt.show()

In [None]:
# Train a depth-limited tree (max_depth = 4) on the training split.
model = sklearn.tree.DecisionTreeRegressor(max_depth=4).fit(
    x_train_transformed, y_train)

# Dense grid over [0, 10] used only to draw the fitted curve smoothly.
x_model_vals = np.linspace(0, 10, 500).reshape(-1, 1)
y_model_vals = model.predict(x_model_vals)

plt.plot(x, y_underlying, 'k')                  # theory curve
plt.scatter(x_train, y_train, color='black')    # training points
plt.scatter(x_test, y_test, color='blue')       # held-out test points
plt.plot(x_model_vals, y_model_vals, 'green')   # model prediction
plt.show()

# Error on the held-out test points.
y_pred = model.predict(x_test_transformed)
print('MSE_tree = ', mean_squared_error(y_test, y_pred))

# Pipeline

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor

In [None]:
# The whole workflow, spelled out step by step

# Step 0 - reshape the 1-D x arrays into single-column feature matrices
x_train_shaped = x_train.reshape(-1, 1)
x_test_shaped = x_test.reshape(-1, 1)

# Step 1 - learn the scaling from the training data only, then apply that
# same scaling to both the training and the test features
scaler = StandardScaler().fit(x_train_shaped)
x_train_scaled = scaler.transform(x_train_shaped)
x_test_scaled = scaler.transform(x_test_shaped)

# Step 2 - train a decision tree with default settings
tree_reg = DecisionTreeRegressor().fit(x_train_scaled, y_train)

# Step 3 - score on the training data first, then on the held-out test data
print(tree_reg.score(x_train_scaled, y_train))
print(tree_reg.score(x_test_scaled, y_test))

That's a clean fit and assessment, but the first score (the one computed on the training data) is 1.0 because we're overfitting to our training data and then using that same training data to assess the score.

We could pass test data into score to get a better assessment of generalization error, or if we want to hold onto the test data for later use, we could hold out some training data to assess the generalization error.

Cross-validation can assess scores using multiple validation sets taken from our training data.

In [None]:
# Same manual workflow, but scored with cross-validation so we get an estimate
# of the generalization error without touching the test set.

x_train_shaped = x_train.reshape(-1, 1)

# NOTE: the scaler is fitted on *all* training rows before the folds are
# split, so each validation fold has already influenced the scaling.
scaler = StandardScaler().fit(x_train_shaped)
x_train_scaled = scaler.transform(x_train_shaped)

# Previously we created a DecisionTreeRegressor and called
# .fit(x_train_scaled, y_train) on it ourselves. Here an unfitted estimator is
# handed to cross_validate instead, which does the fitting and scoring.
cv_results = cross_validate(
    estimator=DecisionTreeRegressor(),
    X=x_train_scaled,
    y=y_train,
)

# No cv argument is given; by default that means 5-fold cross-validation,
# i.e. 5 scores
print('Keys:', cv_results.keys())
print('Test score results:', cv_results['test_score'])

We can be even a little bit more meticulous in our up-front specification of steps.  This helps to document our workflow, and later it makes it easier to change the workflow.

In [None]:
# Declare the processing steps and the algorithm up front as a Pipeline

x_train_shaped = x_train.reshape(-1, 1)

# The step names ('normalizer', 'reg') are how later cells address the steps.
steps = [
    ('normalizer', StandardScaler()),   # Step 1 - normalize data
    ('reg', DecisionTreeRegressor()),   # Step 2 - regression algorithm
]
pipeline = Pipeline(steps)

# The unscaled features go in this time: scaling is now a step of the
# estimator being cross-validated.
cv_results = cross_validate(
    estimator=pipeline,
    X=x_train_shaped,
    y=y_train,
)

# No cv argument is given; by default that means 5-fold cross-validation,
# i.e. 5 scores
print('Keys:', cv_results.keys())
print('Test score results:', cv_results['test_score'])

In [None]:
# Mean of the per-fold test scores (bare expression so the notebook displays it)
cv_results['test_score'].mean()

Now that we have our pipeline, we can use it to our advantage.  For example, we can use it to loop over a few different regression algorithms:

In [None]:
# Candidate regressors, each constructed with no arguments, to swap into the
# pipeline one at a time.
regs = [
    LinearRegression(),
    KNeighborsRegressor(),
    DecisionTreeRegressor(),
]

In [None]:
for r in regs:
    # Swap the candidate into the 'reg' step. set_params changes the shared
    # `pipeline` object itself, so once the loop ends the pipeline still holds
    # the last entry of `regs`.
    pipeline.set_params(reg=r)
    scores = cross_validate(pipeline, x_train_shaped, y_train)

    # Header: which regressor this is and the parameters it is running with.
    print('---------------------------------')
    print(r)
    print(r.get_params())
    print('-----------------------------------')

    # Summarise every array that cross_validate returned across the folds.
    for metric, per_fold in scores.items():
        print(metric, ' mean ', per_fold.mean())
        print(metric, ' std ', per_fold.std())

We can also use our pipeline as a way to iteratively assess performance for hyperparameters.

## GridSearchCV

https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html

In [None]:
# Search every combination of tree depth and minimum leaf size.
# A key of the form 'reg__<name>' sets parameter <name> on the pipeline step
# called 'reg', so that step must hold a decision tree when this cell runs.
cv_grid = GridSearchCV(
    estimator=pipeline,
    param_grid={
        'reg__max_depth': range(1, 40),
        'reg__min_samples_leaf': [1, 2, 3, 5],
    },
)

cv_grid.fit(x_train_shaped, y_train)

In [None]:
# Show the parameter combination the grid search selected
# (bare expression so the notebook displays it)
cv_grid.best_params_

In [None]:
# Single-column feature matrix for the held-out test points.
x_test_shaped = x_test[:, np.newaxis]

# Predict through the fitted grid-search object and score against the test targets.
y_predict = cv_grid.predict(x_test_shaped)
r2score = r2_score(y_true=y_test, y_pred=y_predict)
print('R2 of the best regressor after CV is %.2f' % r2score)

In [None]:
# Display the pipeline as it currently stands (bare expression so the notebook renders it)
pipeline

In [None]:
# Replace the 'reg' step with a fresh KNeighborsRegressor; this changes `pipeline` itself
pipeline.set_params(reg = KNeighborsRegressor())

In [None]:
# Display the pipeline again to confirm the 'reg' step was swapped
pipeline

In [None]:
# Search over the number of neighbours, k = 1 .. 19.
# The key 'reg__n_neighbors' sets n_neighbors on the pipeline step called
# 'reg', so that step must hold a KNN regressor when this cell runs.
cv_grid = GridSearchCV(
    estimator=pipeline,
    param_grid={'reg__n_neighbors': range(1, 20)},
)

cv_grid.fit(x_train_shaped, y_train)

In [None]:
# Show the neighbour count the grid search selected
# (bare expression so the notebook displays it)
cv_grid.best_params_

In [None]:
# Predict through the fitted grid-search object and score against the test targets.
y_predict = cv_grid.predict(x_test_shaped)
r2score = r2_score(y_true=y_test, y_pred=y_predict)
print('R2 of the best regressor after CV is %.2f' % r2score)

## Just to double-check, let's do it ourselves

In [None]:
# Rebuild the grid search's winning KNN model by hand and confirm it gives the
# same test R2. Read k from the search result instead of hard-coding it, so
# this check stays valid if the data, the split, or the search range changes.
best_k = cv_grid.best_params_['reg__n_neighbors']
testmodel = KNeighborsRegressor(n_neighbors=best_k)

# Same preprocessing as the pipeline: fit the scaler on the training features
# only, then apply it to both training and test features.
s = StandardScaler()
xtrain_trans = s.fit_transform(x_train_shaped)
xtest_trans = s.transform(x_test_shaped)

# Train on the full training split.
testmodel.fit(xtrain_trans, y_train)

y_predict = testmodel.predict(xtest_trans)
r2score = r2_score(y_test, y_predict)
print('R2 of the best regressor after CV is %.2f' % (r2score))

# Back to normal

In [None]:
# Back to the depth-limited tree. To look at KNN here instead, swap in
# sklearn.neighbors.KNeighborsRegressor(n_neighbors=6).
model = sklearn.tree.DecisionTreeRegressor(max_depth=4).fit(
    x_train_transformed, y_train)

# Dense grid over [0, 10] used only to draw the fitted curve smoothly.
x_model_vals = np.linspace(0, 10, 500).reshape(-1, 1)
y_model_vals = model.predict(x_model_vals)

plt.plot(x, y_underlying, 'k')                  # theory curve
plt.scatter(x_train, y_train, color='black')    # training points
plt.scatter(x_test, y_test, color='blue')       # held-out test points
plt.plot(x_model_vals, y_model_vals, 'green')   # model prediction
plt.show()

# Both error metrics on the held-out test points.
y_pred = model.predict(x_test_transformed)
print('MSE_tree = ', mean_squared_error(y_test, y_pred))
print('R2_tree = ', r2_score(y_test, y_pred))

In [None]:
# Print the fitted tree's decision rules as plain text
print(sklearn.tree.export_text(model))

In [None]:
# Draw the fitted tree as a node diagram on a larger-than-default canvas.
plt.figure(figsize=(12, 8))

# class_names is omitted on purpose: it labels target classes for classifiers
# and has no meaning for a regression tree.
# The trailing semicolon stops the notebook from echoing the returned value.
sklearn.tree.plot_tree(model,
                       feature_names=['x'],
                       filled=True);

In [None]:
import dtreeviz

In [None]:
%%capture --no-display

# Build a dtreeviz visualisation wrapper around the fitted tree.
# NOTE(review): this passes the full x / y (training and test points together)
# — confirm that showing all points, rather than only the training split, is
# intended.
# NOTE(review): class_names looks like a classifier-oriented argument; it is
# presumably ignored for a regressor — confirm against the dtreeviz docs.
vizmodel = dtreeviz.model(model, 
         x.reshape(-1,1), 
         y,
         feature_names=['x'],
         class_names=['y'],
         target_name="y")

# Render the visualisation
vizmodel.view()