# Decision Tree
[Decision Trees (DTs)](https://scikit-learn.org/stable/modules/tree.html#tree) are a non-parametric supervised learning method used for classification and regression. The goal is to create a model that predicts the value of a target variable by learning simple decision rules inferred from the data features. A tree can be seen as a piecewise constant approximation.

## A simple 1D regression with decision tree.
A decision tree is used to fit a sine curve with additional noisy observations. As a result, it learns local piecewise-constant fits that approximate the sine curve.

We can see that if the maximum depth of the tree (controlled by the max_depth parameter) is set too high, the decision trees learn too fine details of the training data and learn from the noise, i.e. they overfit.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sys
import io

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import learning_curve

### Create a random dataset


In [None]:
# Seeded generator so the synthetic dataset is reproducible.
rng = np.random.RandomState(1)

# 80 sample locations drawn uniformly from [0, 5), ordered along the x-axis.
X = 5 * rng.rand(80, 1)
X.sort(axis=0)

# Clean sine targets, flattened to a 1-D vector.
y = np.sin(X).ravel()

# Perturb every 5th target (16 of the 80) with uniform noise in (-1.5, 1.5].
noise = 3 * (0.5 - rng.rand(16))
y[::5] += noise

# Quick look at the raw samples.
plt.figure()
plt.scatter(X, y, label="data", c="darkorange", edgecolor="black", s=20)

In [None]:
# Train a shallow (depth 2) and a deeper (depth 5) regression tree on the
# same samples; `fit` returns the estimator itself, so it can be chained.
regr_1 = DecisionTreeRegressor(max_depth=2).fit(X, y)
regr_2 = DecisionTreeRegressor(max_depth=5).fit(X, y)

# Evaluate both trees on a dense grid over [0, 5) with step 0.01,
# shaped as a single-feature column.
X_test = np.arange(0.0, 5.0, 0.01).reshape(-1, 1)
y_1 = regr_1.predict(X_test)
y_2 = regr_2.predict(X_test)

In [None]:
# Overlay the two tree predictions on the noisy training samples.
plt.figure()
plt.scatter(X, y, label="data", c="darkorange", edgecolor="black", s=20)

# One curve per tree depth, drawn in the same order as they were fitted.
for prediction, colour, depth_label in (
    (y_1, "cornflowerblue", "max_depth=2"),
    (y_2, "yellowgreen", "max_depth=5"),
):
    plt.plot(X_test, prediction, color=colour, label=depth_label, linewidth=2)

plt.title("Decision Tree Regression")
plt.xlabel("data")
plt.ylabel("target")
plt.legend()
plt.show()

## Same dataset with Random Forests
A random forest is a meta estimator that fits a number of decision tree regressors on various sub-samples of the dataset and uses averaging to **improve the predictive accuracy and control over-fitting**.

In [None]:
# Train two random forests whose trees are capped at depth 2 and depth 5.
# No random_state is given, so the fitted forests vary from run to run.
regr_1 = RandomForestRegressor(max_depth=2).fit(X, y)
regr_2 = RandomForestRegressor(max_depth=5).fit(X, y)

# Dense single-feature evaluation grid over [0, 5) with step 0.01.
X_test = np.arange(0.0, 5.0, 0.01).reshape(-1, 1)
y_1 = regr_1.predict(X_test)
y_2 = regr_2.predict(X_test)

# Overlay both forest predictions on the noisy training samples.
plt.figure()
plt.scatter(X, y, label="data", c="darkorange", edgecolor="black", s=20)
for prediction, colour, depth_label in (
    (y_1, "cornflowerblue", "max_depth=2"),
    (y_2, "yellowgreen", "max_depth=5"),
):
    plt.plot(X_test, prediction, color=colour, label=depth_label, linewidth=2)

plt.title("Random Forest Regression")
plt.xlabel("data")
plt.ylabel("target")
plt.legend()
plt.show()

## Decision tree and random forests
In this lab, we will apply a decision tree and a random forest to predict the latent heat flux.

In [None]:
# Read the flux-tower CSV (the file must sit next to the notebook).
# NOTE(review): missing-value sentinels, if the file uses any, are not
# filtered here - confirm against the dataset documentation.
data = pd.read_csv(
    'FLX_US-Ne1_FLUXNET2015_SUBSET_DD_2001-2013_1-4.csv',
    delimiter=",",
    skipinitialspace=True,
    parse_dates=True,
)

# Short working names for the six predictor columns plus the target ("lh").
column_map = {
    "sw": "SW_IN_F",
    "lw": "LW_IN_F",
    "tmp": "TA_F",
    "pre": "PA_F",
    "u10": "WS_F",
    "vpd": "VPD_F",
    "lh": "LE_CORR",
}
meteo = pd.DataFrame({short: data[long] for short, long in column_map.items()})

# Plain array view for scikit-learn: the first six columns are the
# features, the seventh (last) column is the target.
data_all = np.array(meteo)
X = data_all[:, :6]
y = data_all[:, -1]

In [None]:
# 5-fold cross-validation comparing a single regression tree with a random
# forest (both capped at depth 5). Folds are contiguous, unshuffled blocks.
kf = KFold(n_splits=5)

# Test RMSE of each model on every fold. Without these, the results of all
# folds but the last would be overwritten and never evaluated.
rmse_tree = []
rmse_forest = []

for train, test in kf.split(X):
    X_train, X_test, y_train, y_test = X[train], X[test], y[train], y[test]

    # Fresh estimators per fold so nothing leaks between folds.
    tree1 = DecisionTreeRegressor(max_depth=5)
    tree2 = RandomForestRegressor(max_depth=5)
    tree1.fit(X_train, y_train)
    tree2.fit(X_train, y_train)

    y_pred1 = tree1.predict(X_test)
    y_pred2 = tree2.predict(X_test)

    # sqrt(MSE) works on every scikit-learn version, unlike `squared=False`.
    rmse_tree.append(np.sqrt(mean_squared_error(y_test, y_pred1)))
    rmse_forest.append(np.sqrt(mean_squared_error(y_test, y_pred2)))

# Cross-validated summary over all folds.
print(f"Decision tree  RMSE per fold: {np.round(rmse_tree, 3)}, mean = {np.mean(rmse_tree):.3f}")
print(f"Random forest  RMSE per fold: {np.round(rmse_forest, 3)}, mean = {np.mean(rmse_forest):.3f}")

# After the loop, X_test, y_test, y_pred1 and y_pred2 hold the LAST fold
# only; the plotting cells below rely on those values.

In [None]:
# Compare both models' predictions with the held-out targets, sample by
# sample, for the test fold left over from the cross-validation loop.
fig, ax = plt.subplots()
ax.plot(y_pred1, c='b', label='Decision Tree')
ax.plot(y_pred2, marker='o', c='black', label='Random Forests')
ax.plot(y_test, marker='x', c='r', label='True Value')

# The target column is LE_CORR (latent heat flux), not an evaporation rate.
# NOTE(review): W m^-2 is the unit FLUXNET2015 documents for LE_CORR -
# confirm against the dataset's variable description.
ax.set_ylabel('latent heat flux (W m$^{-2}$)')
ax.set_xlabel('time (day)')
ax.legend()
plt.show()

In [None]:
# True-vs-predicted scatter for the decision tree, with a least-squares
# trend line and the model's skill scores on the test fold.
fig, ax = plt.subplots(1, 1, figsize=(6, 6))
ax.scatter(y_test, y_pred1, c='b')

# Linear trend of the predictions as a function of the true values.
z = np.polyfit(y_test, y_pred1, 1)

# Evaluate the trend on the scatter's own x-axis (true values). Two end
# points are enough for a straight line and keep the dash pattern clean.
x_line = np.array([y_test.min(), y_test.max()])
y_hat = np.poly1d(z)(x_line)
ax.plot(x_line, y_hat, "r--", lw=2)

# Score the model's predictions against the truth (not the trend line).
# sqrt(MSE) replaces `squared=False`, which newer scikit-learn removed.
rmse = np.sqrt(mean_squared_error(y_test, y_pred1))
text = (
    f"$y={z[0]:0.3f}\\;x{z[1]:+0.3f}$\n"
    f"$R^2 = {r2_score(y_test, y_pred1):0.3f}$\n"
    f"$RMSE = {rmse:0.3f} $ "
)
ax.text(0.05, 0.95, text, transform=ax.transAxes,
        fontsize=14, verticalalignment='top')

ax.set_xlabel('True Value')
ax.set_ylabel('Predict Value')
ax.set_title('Decision Tree')
plt.show()

In [None]:
# True-vs-predicted scatter for the random forest, with a least-squares
# trend line and the model's skill scores on the test fold.
fig, ax = plt.subplots(1, 1, figsize=(6, 6))
ax.scatter(y_test, y_pred2, c='b')

# Linear trend of the predictions as a function of the true values.
z = np.polyfit(y_test, y_pred2, 1)

# Evaluate the trend on the scatter's own x-axis (true values). Two end
# points are enough for a straight line and keep the dash pattern clean.
x_line = np.array([y_test.min(), y_test.max()])
y_hat = np.poly1d(z)(x_line)
ax.plot(x_line, y_hat, "r--", lw=2)

# Score the model's predictions against the truth (not the trend line).
# sqrt(MSE) replaces `squared=False`, which newer scikit-learn removed.
rmse = np.sqrt(mean_squared_error(y_test, y_pred2))
text = (
    f"$y={z[0]:0.3f}\\;x{z[1]:+0.3f}$\n"
    f"$R^2 = {r2_score(y_test, y_pred2):0.3f}$\n"
    f"$RMSE = {rmse:0.3f} $ "
)
ax.text(0.05, 0.95, text, transform=ax.transAxes,
        fontsize=14, verticalalignment='top')

ax.set_ylabel('Predict Value')
ax.set_xlabel('True Value')
ax.set_title('Random Forests')
plt.show()

## Please try plotting a learning curve here, following the Jupyter notebook regression_linear_polynomial.ipynb