# Classification and Regression Trees


In [None]:
%matplotlib inline

from pathlib import Path

import pandas as pd
import numpy as np

from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor

from sklearn.model_selection import train_test_split, GridSearchCV

import missingno as msno

import matplotlib.pylab as plt

from dmba import plotDecisionTree, classificationSummary, regressionSummary

#### The Home Loan dataset allows us to predict who will have their loans approved.

In [None]:
# Read the loan CSV into a DataFrame.
data_df = pd.read_csv(filepath_or_buffer='../resource/lib/public/Loan_data.csv')

In [None]:
# Show the first five lines of the dataset



![Pix%201.png](attachment:Pix%201.png)

In [None]:
# Remove the non-informational variable(s) from the data and show the remaining variables



![Pix%202.png](attachment:Pix%202.png)

In [None]:
# Show the variable names, non-null counts, and datatypes



![Pix%203.png](attachment:Pix%203.png)

In [None]:
# Create the X and y objects

X = data_df.???(???, axis=1)

y = data_df[???]

In [None]:
# Show the count of values in the response variable



![Pix%204.png](attachment:Pix%204.png)

In [None]:
# One-hot encode the predictors for tree models



In [None]:
# Show the number of samples and predictors in the X object after dummy coding



![Pix%205.png](attachment:Pix%205.png)

In [None]:
# Impute missing values in the X object and save it into a new object name

from sklearn.impute import KNNImputer

imputer = KNNImputer()

X_imp = ???.fit_transform(???)

X_imp = pd.DataFrame(X_imp, columns=X.columns) # we do this to preserve the original variable names

In [None]:
# Verify that all missing values are imputed by using a missing value bar chart

%matplotlib inline

msno.???(???, color="darkorange")

![Pix%206.png](attachment:Pix%206.png)

#### The fit function accepts the predictors (X) first, followed by the response variable (y). In this notebook both objects were already created in the cells above, so they are passed to fit directly.

In [None]:
classTree = DecisionTreeClassifier(random_state=1, max_depth=3)

classTree.???(???, ???)

#### The first line of code prints the classes of the response variable. The second line generates a plot of the decision tree structure. Note how feature names and class names are required to create this diagram.

In [None]:
# List the class labels known to the fitted tree, then draw the tree with
# both the feature names and the class names attached.
class_labels = ', '.join(classTree.classes_)
print("Classes: {}".format(class_labels))

plotDecisionTree(
    classTree,
    feature_names=X.columns,
    class_names=classTree.classes_,
)

The order of the `values` vector in the boxes is the same as `classTree.classes_`.

![Pix%207.png](attachment:Pix%207.png)

## Figure 9.8
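Grow a much deeper tree by changing the `max_depth=3` parameter to `max_depth=6` (or by omitting `max_depth` altogether, as the starter code below does, to grow the tree fully); this will allow the tree to overfit. It also shows how a complex tree can be difficult to interpret.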

In [None]:
classTree = DecisionTreeClassifier(random_state=1)

???.fit(???, ???)

print("Classes: {}".format(', '.join(classTree.classes_)))

plotDecisionTree(classTree, feature_names=X.columns, class_names=classTree.classes_, rotate=True)

![Pix%208.png](attachment:Pix%208.png)

#### Create a train/test split and check model performance

In [None]:
# Split the data into training and test with 50% for each, random seed = 1, and stratified y



In [None]:
# Now we impute missing values in the training predictors and map those to the test data

from sklearn.impute import KNNImputer

imputer = KNNImputer()

train_X = imputer.???(train_X)
test_X = imputer.???(test_X)

# Rebuild DataFrames from the imputer output to preserve the original variable
# names. Both splits are converted: a tree fitted on a DataFrame with column
# names emits a feature-name warning when later asked to predict on an
# unnamed array, so the test predictors need the same columns as the training
# predictors.
train_X = pd.DataFrame(train_X, columns=X.columns)
test_X = pd.DataFrame(test_X, columns=X.columns)

#### There is no depth parameter, so this tree will overfit the training data.

In [None]:
fullClassTree = DecisionTreeClassifier()

fullClassTree.???(???, ???)

plotDecisionTree(fullClassTree, feature_names=X.columns)

![Pix%209.png](attachment:Pix%209.png)

#### The first confusion matrix is for the training data; the second is for the test data

In [None]:
classificationSummary(train_y, fullClassTree.???(???))
print()
classificationSummary(???, ???.predict(test_X))

![Pix%2010.png](attachment:Pix%2010.png)

#### Now we use hyperparameters to control stopping (and thereby prevent overfitting): we set a maximum depth, a minimum number of samples a node must contain before it can be split, and a minimum impurity decrease required before another split is allowed.

In [None]:
smallClassTree = DecisionTreeClassifier(max_depth=10, min_samples_split=10, min_impurity_decrease=0.01)

smallClassTree.fit(???, ???)

plotDecisionTree(smallClassTree, feature_names=train_X.columns)

![Pix%2011.png](attachment:Pix%2011.png)

In [None]:
classificationSummary(???, ???.predict(???))
print()
classificationSummary(???, smallClassTree.???(???))

![Pix%2012.png](attachment:Pix%2012.png)

### Grid search for Early Stopping

#### Grid search combines cross-validation with hyperparameter searching; grid search works well when you know the ranges of the hyperparameters that you want to check. 

In [None]:
# First-pass grid: a handful of widely spaced candidate values per stopping rule.
param_grid = dict(
    max_depth=[10, 20, 30],
    min_samples_split=[10, 15, 20],
    min_impurity_decrease=[0, 0.0001, 0.001, 0.01],
    random_state=[1],  # single value, so every candidate uses the same seed
)

#### Using a DecisionTreeClassifier with 10-fold CV

In [None]:
gridSearch = GridSearchCV(???, ???, cv=???)

gridSearch.fit(train_X, train_y)

![Pix%2013.png](attachment:Pix%2013.png)

In [None]:
print('Initial score: ', gridSearch.???)

print('Initial parameters: ', gridSearch.???)

InitialTree = gridSearch.???

![Pix%2014.png](attachment:Pix%2014.png)

In [None]:
classificationSummary(train_y, ???.predict(train_X))
print()
classificationSummary(test_y, InitialTree.???(test_X))

![Pix%2015.png](attachment:Pix%2015.png)

#### Based on the initial results, we can refine our hyperparameter grid search to look for values around the best estimates.

In [None]:
# Second-pass grid: search densely around the values picked by the initial search.
param_grid = dict(
    max_depth=[*range(2, 12)],
    min_samples_split=[*range(2, 12)],
    min_impurity_decrease=[0.009, 0.01, 0.02],
    random_state=[1],
)

In [None]:
gridSearch = ???(DecisionTreeClassifier(), ???, cv=???)

gridSearch.fit(train_X, train_y)

![Pix%2016.png](attachment:Pix%2016.png)

In [None]:
print('Improved score: ', gridSearch.???)

print('Improved parameters: ', gridSearch.???)

bestClassTree = gridSearch.???

![Pix%2017.png](attachment:Pix%2017.png)

In [None]:
# Draw the tree selected by the grid search, labelled with the training column names.
plotDecisionTree(
    bestClassTree,
    feature_names=train_X.columns,
)

![Pix%2018.png](attachment:Pix%2018.png)

In [None]:
# Show the classification summaries (confusion matrices) for training and test data



![Pix%2019.png](attachment:Pix%2019.png)

### Grid search for Cost-Complexity Pruning

In [None]:

# Candidate cost-complexity pruning strengths, listed in increasing order.
# The former entries 0.15 and 0.25 broke the ascending sequence and were an
# order of magnitude away from their neighbours; they are corrected to 0.015
# and 0.025, matching the spacing of the regression pruning grid used later.
param_grid = {
    'ccp_alpha': [0.001, 0.005, 0.01, 0.015, 0.02, 0.025],
}

In [None]:
# 10-fold cross-validated search over the pruning grid.
# The seed is pinned on the estimator (the early-stopping grids pin it through
# 'random_state': [1]); without it, tie-breaking between equally good splits
# can differ from run to run, so the selected alpha and tree may not be
# reproducible.
gridSearch = GridSearchCV(DecisionTreeClassifier(random_state=1), param_grid, cv=10)

gridSearch.fit(train_X, train_y)

In [None]:
# Keep the best estimator found by the search, then report its score and alpha.
bestClassTree = gridSearch.best_estimator_

print('Cost-Complexity Prune Score: ', gridSearch.best_score_)
print('Best Alpha: ', gridSearch.best_params_)

![Pix%2020a.png](attachment:Pix%2020a.png)

In [None]:
# Draw the pruned tree, labelled with the training column names.
plotDecisionTree(
    bestClassTree,
    feature_names=train_X.columns,
)

![Pix%2020.png](attachment:Pix%2020.png)

In [None]:
# Confusion matrix on the training data first, then on the test data.
train_pred = bestClassTree.predict(train_X)
classificationSummary(train_y, train_pred)
print()
test_pred = bestClassTree.predict(test_X)
classificationSummary(test_y, test_pred)

![Pix%2021.png](attachment:Pix%2021.png)

## Regression Trees

#### Instead of using a classification voting scheme, regression trees predict the mean response value of the training records in each terminal (leaf) node.

In [None]:
# ToyotaCorolla data: sales of used Toyota Corollas in the Netherlands.
# Read the CSV into a DataFrame.
car_df = pd.read_csv(filepath_or_buffer='../resource/lib/public/ToyotaCorolla.csv')

In [None]:
# Give two verbosely named columns shorter names.
shorter_names = {'Age_08_04': 'Age', 'Quarterly_Tax': 'Tax'}
car_df = car_df.rename(columns=shorter_names)

In [None]:
# Show the variable names, non-null counts, and datatypes



![Pix%2022.png](attachment:Pix%2022.png)

In [None]:
# We'll create X and y object and one-hot encode X in this cell

X = car_df.drop(columns = [???, 'Id', "Mfg_Month", "Age"])

X = ???.get_dummies(???, drop_first=???)

y = car_df[???]

In [None]:
# Split predictors and response in half (test_size=0.5) with a fixed seed.
train_X, test_X, train_y, test_y = train_test_split(
    X,
    y,
    test_size=0.5,
    random_state=1,
)

### Grid search for Early Stopping

#### Grid search combines cross-validation with hyperparameter searching; grid search works well when you know the ranges of the hyperparameters that you want to check. 

In [None]:
# Use grid search to find an optimized tree; this is the initial search pattern.
param_grid = dict(
    max_depth=[10, 20, 30],
    min_impurity_decrease=[0, 0.001, 0.005, 0.01],
    min_samples_split=[10, 20, 30],
    random_state=[1],
)

In [None]:
gridSearch = GridSearchCV(???, ???, cv=10)

gridSearch.fit(???, ???)

print('Initial parameters: ', gridSearch.best_params_)

![Pix%2023.png](attachment:Pix%2023.png)

In [None]:
# Error metrics on the training data first, then on the test data.
train_pred = gridSearch.predict(train_X)
regressionSummary(train_y, train_pred)
print()
test_pred = gridSearch.predict(test_X)
regressionSummary(test_y, test_pred)

![Pix%2024.png](attachment:Pix%2024.png)

In [None]:
# Refined grid, using the parameters found by the initial search as guides.
param_grid = dict(
    max_depth=[*range(3, 12)],
    min_impurity_decrease=[0],
    min_samples_split=[*range(2, 12)],
    random_state=[1],
)

In [None]:
# Run the refined grid with 10-fold cross-validation on the training data,
# keep the best tree, and report the winning parameter combination.
gridSearch = GridSearchCV(
    estimator=DecisionTreeRegressor(),
    param_grid=param_grid,
    cv=10,
)
gridSearch.fit(train_X, train_y)

regTree = gridSearch.best_estimator_
print('Improved parameters: ', gridSearch.best_params_)

![Pix%2025.png](attachment:Pix%2025.png)

In [None]:
# Error metrics on the training data first, then on the test data.
train_pred = gridSearch.predict(train_X)
regressionSummary(train_y, train_pred)
print()
test_pred = gridSearch.predict(test_X)
regressionSummary(test_y, test_pred)

![Pix%2026.png](attachment:Pix%2026.png)

In [None]:
# Draw the regression tree in the default orientation.
tree_features = train_X.columns
plotDecisionTree(regTree, feature_names=tree_features)

# Draw it again rotated, so that all elements fit.
plotDecisionTree(regTree, feature_names=tree_features, rotate=True)

![Pix%2027a.png](attachment:Pix%2027a.png)

### Grid search for Cost-Complexity Pruning

In [None]:
# Candidate cost-complexity pruning strengths for the regression tree.
param_grid = dict(ccp_alpha=[0.001, 0.005, 0.01, 0.015, 0.02])

In [None]:
# This section fits regression trees and scores the result with
# regressionSummary, so the pruning search must tune a DecisionTreeRegressor;
# a classifier would treat every distinct response value as its own class.
# The seed is pinned, as in the early-stopping grids, to keep runs reproducible.
gridSearch = GridSearchCV(DecisionTreeRegressor(random_state=1), param_grid, cv=10)

gridSearch.???(train_X, train_y)

![Pix%2027.png](attachment:Pix%2027.png)

In [None]:
print('Cost-Complexity Prune Score: ', gridSearch.???)

print('Best Alpha: ', gridSearch.???)

bestClassTree = gridSearch.???

![Pix%2028.png](attachment:Pix%2028.png)

In [None]:
# Draw the tree chosen by the pruning search, labelled with the training column names.
plotDecisionTree(
    bestClassTree,
    feature_names=train_X.columns,
)

![Pix%2028a.png](attachment:Pix%2028a.png)

In [None]:
# Error metrics on the training data first, then on the test data.
train_pred = gridSearch.predict(train_X)
regressionSummary(train_y, train_pred)
print()
test_pred = gridSearch.predict(test_X)
regressionSummary(test_y, test_pred)

![Pix%2029.png](attachment:Pix%2029.png)