<a href="https://colab.research.google.com/github/ccwilliamsut/machine_learning/blob/master/MLAB_04_Decision_Tree_Regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Build a Decision Tree Model (Regression)

> Websites Referenced:
- [Scikit-Learn Documentation: Decision Tree Regressor](https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeRegressor.html)
- [Excellent explanation of Decision Tree Regression with sample code](https://gdcoder.com/decision-tree-regressor-explained-in-depth/)

## A. Setup Environment

In [0]:
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> SETUP ENVIRONMENT <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
# Import libraries
import pandas as pd
import seaborn as sns; sns.set()
import numpy as np
import matplotlib.pyplot as plt
import sklearn.metrics as metrics
import missingno as msno
from IPython.display import display
from sklearn.model_selection import train_test_split 
from sklearn import ensemble
from scipy import stats
from sklearn import linear_model
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import learning_curve,GridSearchCV
from sklearn.model_selection import cross_val_predict
from sklearn import preprocessing
from collections import Counter
import warnings

# Silence DeprecationWarning noise from the imported libraries so the notebook output stays readable
warnings.filterwarnings("ignore", category=DeprecationWarning)

# Location of the (modified) California housing CSV used by this notebook
url = 'https://raw.githubusercontent.com/ccwilliamsut/machine_learning/master/absolute_beginners/data_files/modified/CaliforniaHousingDataModified.csv'

df = pd.read_csv(url)
#df = pd.read_csv('~/Downloads/CaliforniaHousingDataModified.csv')  # Alternative: load a local copy instead of the URL


# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> SETUP DATA <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<

# --------------------------------------------- RENAME FEATURES ---------------------------------------------
# Correct the misspelled / abbreviated column names that ship with the CSV
df.rename(columns = {'lattitude':'latitude', 't_rooms':'total_rooms'}, inplace=True)


# --------------------------------------------- ONE-HOT ENCODING ---------------------------------------------
df['ocean_proximity'] = pd.Categorical(df['ocean_proximity'])
# Ask for integer (0/1) indicator columns explicitly. Newer pandas releases default to boolean
#   dummies, which do not mix cleanly with the numeric z-score and quantile steps further down.
# drop_first = True omits the first category so the indicators are not perfectly collinear.
df_dummies = pd.get_dummies(df['ocean_proximity'], drop_first = True, dtype = int)
df.drop(['ocean_proximity'], axis = 1, inplace = True)
df = pd.concat([df, df_dummies], axis=1)


# --------------------------------------------- DROP UNWANTED FEATURES ---------------------------------------------
df.drop(['population', 'households', 'proximity_to_store', 'ISLAND'], axis = 1, inplace = True)


# ------------------------------------- FIX MISSING DATA -------------------------------------
# Replace missing 'total_bedrooms' values with the column median, then store the column as integers
tb_med = df['total_bedrooms'].median(axis=0)
df['total_bedrooms'] = df['total_bedrooms'].fillna(value = tb_med)
df['total_bedrooms'] = df['total_bedrooms'].astype(int)
df.name = 'df'  # Ad-hoc label read later by the display helpers (via dfx.name)

# ------------------------------------- Z-SCORE -------------------------------------
# Keep only the rows where every column lies within 3 standard deviations of its mean
z = np.abs(stats.zscore(df))
dfz = df[(z < 3).all(axis = 1)]
dfz.name = 'dfz'

# ------------------------------------- INTERQUARTILE RANGE -------------------------------------
# Keep only the rows where every column lies inside [Q1 - 1.5*IQR, Q3 + 1.5*IQR]
# NOTE: the fences are also computed for the 0/1 indicator columns, so a sparse indicator can have
#   every row where it equals 1 removed (see the commented-out drop below).
q1 = df.quantile(0.25)
q3 = df.quantile(0.75)
iqr = q3 - q1
lower = q1 - (1.5 * iqr)
upper = q3 + (1.5 * iqr)
dfi = df[~((df < lower) | (df > upper)).any(axis = 1)]
dfi.name = 'dfi'
#dfi = dfi.drop(['NEAR BAY', 'NEAR OCEAN'], axis = 1)  # After applying IQR, the following features are now empty and can be dropped

# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> FUNCTIONS <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
def make_heatmap(df = 'df'):
  """Draw an annotated correlation heatmap for a dataframe.

  Args:
    df: Object exposing ``.corr()`` (a pandas DataFrame in this notebook).
      NOTE(review): the default is the placeholder string 'df', which has no
      ``.corr()`` method, so callers need to pass a dataframe explicitly.
  """
  correlations = df.corr()

  # Fixed colour scale of -1..1 centred on 0 so plots of different dataframes are comparable
  heatmap_options = dict(annot=True, vmin=-1, vmax=1, center=0, fmt='.1g', cmap='coolwarm')

  plt.figure(figsize=(15, 9))
  sns.heatmap(correlations, **heatmap_options)
  plt.show()
  plt.close()


def make_heading(heading = 'heading'):
  """Print ``heading`` followed by a colon, then a 30-dash rule on the next line.

  Args:
    heading: Text (or any formattable object) to show as the section title.
  """
  title_line = '{0}:\n'.format(heading)
  rule = '-' * 30
  # print() joins its arguments with a single space, so the rule is preceded by one space
  print(title_line, rule)


def show_coef(df_in, model):
  """Display a model's coefficients (one row per feature) and its intercept.

  Args:
    df_in: Dataframe whose column labels become the row index of the coefficient table.
    model: Fitted estimator. It is read through ``coef_`` and ``intercept_``, so it
      must expose both attributes.
  """
  coefficient_table = pd.DataFrame(
      data=model.coef_,
      index=df_in.columns,
      columns=['Coefficient'],
  )

  make_heading('\n\nCoefficients')
  display(coefficient_table)

  make_heading('\n\nIntercept')
  display(model.intercept_)


def drop_features(df_in):
  """Return a copy of ``df_in`` without the features excluded from modelling.

  Removed columns: 'longitude', 'latitude', 'total_rooms', 'total_bedrooms', 'INLAND'.
  All other columns (e.g. 'median_house_value', 'housing_median_age', 'median_income',
  'NEAR BAY', 'NEAR OCEAN') are left in place. To keep one of the removed features,
  take it out of the list below.

  Args:
    df_in: Dataframe containing at least the columns listed above. It is not modified.

  Returns:
    A new dataframe with the listed columns removed.

  Raises:
    KeyError: If any of the listed columns is missing from ``df_in``.
  """
  unwanted = [
      'longitude',
      'latitude',
      'total_rooms',
      'total_bedrooms',
      'INLAND',
  ]
  # Return the reduced frame itself; the previous version returned the unrelated
  #   module-level 'features_dfx' and silently discarded this result.
  return df_in.drop(unwanted, axis = 1)


def plot_test_predictions(y_test, y_pred):
  """Scatter-plot actual vs. predicted target values for the test set.

  A dashed green diagonal spanning the range of ``y_test`` marks where a perfect
  prediction would fall; points far from it are poorly predicted samples.

  Args:
    y_test: Actual target values; must provide ``.min()`` and ``.max()`` (e.g. a numpy array).
    y_pred: Predicted target values, aligned element-for-element with ``y_test``.
  """
  make_heading('Prediction Performance')  # Make a heading to separate output

  # plt.subplots() already returns the axes to draw on, so no further plt.subplot() call is needed
  fig, ax = plt.subplots()
  ax.scatter(y_test, y_pred, edgecolors=(0, 0, 0), alpha=0.5)

  # Reference line y = x across the observed range of actual values
  lowest, highest = y_test.min(), y_test.max()
  ax.plot([lowest, highest], [lowest, highest], 'g--', lw=4)

  ax.set_xlabel('Actual')
  ax.set_ylabel('Predicted')
  ax.set_title("Ground Truth vs Predicted")
  plt.show()
  plt.close(fig)  # Release this figure explicitly


def plot_feature_importance(feature_importance, feature_names = None):
  """Plot feature importances as a horizontal bar chart, scaled relative to the largest.

  Args:
    feature_importance: 1-D sequence of importance values, one per feature
      (e.g. ``model.feature_importances_``).
    feature_names: Optional labels, one per feature, in the same order as
      ``feature_importance``. When omitted, the column labels of the module-level
      ``features_dfx`` dataframe are used.

  Raises:
    ValueError: If the number of labels differs from the number of importance values.
  """
  if feature_names is None:
    feature_names = features_dfx.columns
  feature_names = np.asarray(feature_names)
  feature_importance = np.asarray(feature_importance, dtype = float)

  if feature_names.shape[0] != feature_importance.shape[0]:
    raise ValueError('Got {0} feature names for {1} importance values'.format(
        feature_names.shape[0], feature_importance.shape[0]))

  # Make importances relative to max importance
  relative_importance = 100.0 * (feature_importance / feature_importance.max())
  sorted_idx = np.argsort(relative_importance)
  pos = np.arange(sorted_idx.shape[0]) + .5

  make_heading('Feature Importance')  # Make a heading to separate output
  plt.subplot(1, 2, 2)
  plt.barh(pos, relative_importance[sorted_idx], align='center')
  # The bars are drawn in sorted order, so the labels must be reordered with the same
  #   index; otherwise each bar is tagged with the wrong feature name.
  plt.yticks(pos, feature_names[sorted_idx])
  plt.xlabel('Relative Importance')
  plt.title('Variable Importance')
  plt.show()
  plt.close()
  

def show_metrics(X_train, X_test, y_train, y_test, y_pred):
  """Print the accuracy / error summary of the most recently evaluated model.

  The five array arguments are accepted but not read here. Every reported figure
  comes from module-level variables that must already be set: ``model_name``,
  ``dfx`` (for its ``name``), ``model_train_score``, ``model_test_score``,
  ``predictive_accuracy``, ``ev``, ``mean_ae``, ``median_ae``, ``rmse`` and
  ``model_depth``.

  Args:
    X_train: Training features (unused).
    X_test: Test features (unused).
    y_train: Training targets (unused).
    y_test: Test targets (unused).
    y_pred: Predictions for the test set (unused).
  """
  # Section title naming the model type and the dataframe it was trained on
  heading_template = '\n\nAccuracy and Error for training/testing on the {0} model for dataframe \"{1}\"'
  make_heading(heading_template.format(model_name, dfx.name))

  # Scores (higher is better)
  print("Training Accuracy (score)                 (X_train, y_train):  {:.2f}".format(model_train_score))
  print("Test Accuracy (score)                     (X_test, y_test):    {:.2f}".format(model_test_score))
  print("Predictive Accuracy (R^2 score)           (y_test, y_pred):    {:.2f}".format(predictive_accuracy))
  print("Explained Variance (1 is best) (loss)     (y_test, y_pred):    {:.2f}".format(ev))

  # Error values on the test set (lower is better)
  print("Mean Absolute Error on Test Set (loss)    (y_test, y_pred):    {:.2f}".format(mean_ae))
  print("Median Absolute Error on Test Set (loss)  (y_test, y_pred):    {:.2f}".format(median_ae))
  print("RMSE on Test Set (loss)                   (y_test, y_pred):    {:.2f}".format(rmse))

  # Depth of the fitted tree (formatted as an integer)
  print("Model Depth                            (model.get_depth()):    {:d}".format(model_depth))

## B. Choose a dataset to use
- Three datasets have been created:
  1. **df** -- the **original dataset** ('NaN' values have been replaced with median, so there are no null values)
  2. **dfz** -- a dataset that has used the **z-score method** for removing outlier data
  3. **dfi** -- a dataset that has used the **IQR method** for removing outlier data

```

```

**NOTE:** When choosing your dataset, you will need to change two commands to reflect the dataset that you want to use. In the following example, the dataset is switched from 'dfz' to 'dfi'.

**Original code:** --------------------------------------->  **New code** (For example, to change **from** 'dfz' **to** 'dfi' dataset):
```
dfx = dfz.copy()      -->   dfx = dfi.copy()
dfx.name = dfz.name   -->   dfx.name = dfi.name
```

In [0]:
# *************************** DETERMINE DATA FOR USE ***************************
# Choose which dataframe you would like to use (dfi: IQR, dfz: z-score, df: original (with replaced NaN values))
# Work on a copy so that changes to 'dfx' never alter the source dataframe.
dfx = df.copy()
# Re-attach the label explicitly: show_metrics() reads dfx.name when printing its heading.
#   (presumably .copy() does not carry this ad-hoc attribute over -- TODO confirm)
dfx.name = df.name

# Show a correlation heatmap for the chosen dataframe
make_heatmap(dfx)

## C. Step by Step: Building a Decision Tree Model

- In this section, we walk through building a **Decision Tree Regression model** using the dataframe chosen in the previous step.

### The basic steps are:
  1. Define the **features** and the **target variable**.
  2. **Split** the dataset into **training** and **testing** partitions
  3. **Define the model:**
    - **Select Hyperparameters**
    - **Train the model** - run the model against our training set
    - Use the trained model to **predict against the test set**
  4. **Capture metrics** for the model (done during training, testing and prediction)
    - Training accuracy
    - Testing accuracy
    - Prediction error
    - Error values (Mean Absolute Error, Median Absolute Error, RMSE, etc.)
  5. **Analyze metrics**
    - Look at the various accuracy and error scores to **determine how well the model fits** (overfit, underfit, good, bad, etc.)
  6. **Plot graphs** to visualize the various accuracy / error metrics
  7. **Predict new values** to see if the model performs as expected (given the accuracy and coefficient values)

---
### Setting Hyperparameters
Please see this link for more information: [Scikit-Learn Documentation: Decision Tree Regressor](https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeRegressor.html)
- **```criterion```** : string, optional (default=”mse”)
- The function to measure the quality of a split. Supported criteria are:
  - ***mse*** for the mean squared error, which is equal to variance reduction as feature selection criterion and minimizes the L2 loss using the mean of each terminal node
  - ***friedman_mse***, which uses mean squared error with Friedman’s improvement score for potential splits
  - ***mae*** for the mean absolute error, which minimizes the L1 loss using the median of each terminal node.
- **```min_samples_leaf```** : int, float, optional (default=1)
  - It is the minimum number of samples for a terminal node.
  - In other words, a final node must have this many samples in order to be established. Any less, and it will not be created.

- **```min_samples_split```** : int, float, optional (default=2)
  - It is the minimum number of samples that a node must contain before it can be split.
  - In other words, a node cannot be split unless it has at least this many samples. Any less, and it will become a terminal node.

- **```max_features```** : int, float, string or None, optional (default=”auto”)
  - The number of features to consider when looking for the best split
  - In other words, this determines the number (precisely defined or given as a percentage) of features that the model will contemplate when deriving the best means by which to split a node.

- **```max_depth```** : integer or None, optional (default=None)
  - The maximum depth of the tree. 
  - In other words, what is the maximum "depth" that the tree should grow?

In [0]:
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> STEP BY STEP DECISION TREE CONSTRUCTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<

# joblib is used at the end of this cell to save the trained model, but the setup cell never
#   imports it (calling joblib.dump would raise NameError). It is installed as a dependency of scikit-learn.
import joblib

# ---------------------------- Identify Target Variable ----------------------------
# Create a copy of the dataframe with only the desired features (dropping the target feature)
features_dfx = dfx.drop(['median_house_value'], axis = 1)

# Determine additional features to keep/drop (# means that we want to keep that variable)
features_dfx = features_dfx.drop([#'longitude',
                                  #'latitude',
                                  #'housing_median_age',
                                  #'total_rooms',
                                  #'total_bedrooms',
                                  #'median_income',
                                  #'INLAND',
                                  'NEAR BAY',
                                  'NEAR OCEAN'
                                  ],
                                 axis = 1
                                 )


errvals = np.array([])              # Create an array to hold our error values during training (used for plotting results)
rs = 20                             # Set our "Random State" variable (so the split and the tree are reproducible between runs)


# ---------------------------- SPLIT THE DATASET ----------------------------
# We extract the values of our two datasets into "X" and "y" variables for use in later functions
# The X variables are the "explanatory" numbers (or images). They are the numbers that we use to try and predict
#   what the "y" variable is.

# "X" are the independent variables, the "features" that we will use to try and predict "y"
X = features_dfx.values

# "y" is the dependent variable (the "label"); the actual value that we are trying to learn to predict.
#   Each time we make a prediction, it is compared to the actual values, and the "loss" (the difference between
#   the prediction and real number) is added to previous "loss".
y = dfx['median_house_value'].values

# Split the dataset into training and testing arrays
#   We take our X and y variables that we have just created and now split each one into a training set and testing
#   set: (X_train, X_test, y_train, y_test)
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.3,    # reserve this percentage of our data for testing, use the rest for training
                                                    shuffle = True,
                                                    random_state = rs
                                                    )


# ---------------------------- CREATE THE MODEL  ----------------------------
# Define the desired algorithm and configure the hyperparameters
# (The string below is an inactive, alternative configuration kept for experimentation.)
"""model = DecisionTreeRegressor(criterion='mae',  # can also be 'mse' or 'friedman_mse'
                              max_depth = 500, 
                              min_samples_split = 4,
                              min_samples_leaf = 4,
                              max_features = 0.6,
                              random_state = rs, 
                              )
"""

model = DecisionTreeRegressor(criterion='friedman_mse',  # can also be 'mse' or other values
                              max_depth = None,
                              min_samples_split = 64,
                              min_samples_leaf = 32,
                              max_features = 0.9,
                              random_state = rs,
                              )

model_name = type(model).__name__   # Get the name of the model (for use in our display functions)


# ---------------------------- FIT / TRAIN THE MODEL ----------------------------
# Train the model using our training sets (X_train, y_train)
model.fit(X_train, y_train)

# Here we want to input our X_test array to make predictions based upon our trained model
y_pred = model.predict(X_test)

# ---------------------------- GATHER METRICS ----------------------------
# NOTE: show_metrics() reads these module-level names directly, so they must keep these exact names
model_test_score = model.score(X_test, y_test)  # Accuracy of our model on test data
model_train_score = model.score(X_train, y_train)  # Accuracy of our model on training data
mean_ae = metrics.mean_absolute_error(y_test, y_pred)  # Mean absolute error on the test set (how well the trained model performs on new data: y_test against y_pred)
median_ae = metrics.median_absolute_error(y_test, y_pred)  # Median absolute error
rmse = np.sqrt(metrics.mean_squared_error(y_test, y_pred))  # Root mean squared error
ev = metrics.explained_variance_score(y_test, y_pred) # Explained Variance Score (y_test, y_pred): Measures the proportion to which a mathematical model accounts for the variation (dispersion) of a given data set
predictive_accuracy = metrics.r2_score(y_test, y_pred)  # Determine the predictive accuracy of the model by getting the R^2 score (how well future samples are likely to be predicted by the model)
model_depth = model.get_depth()


# ---------------------------- ANALYSIS / VISUALIZATION ----------------------------
show_metrics(X_train, X_test, y_train, y_test, y_pred)  # Display the scores and loss for the model
plot_test_predictions(y_test, y_pred)                   # Create a scatterplot of real values against predicted ones for the test set
feature_importance = model.feature_importances_         # Gather feature importance values
plot_feature_importance(feature_importance)             # Plot feature importance
joblib.dump(model, 'ca_housing_dt_model.pkl')           # Save the model so that we can use it later
