# Mini Project: Decision Trees

Written by Adam Ten Hoeve  
COMP 4448: Data Science Tools 2  
Summer 2021

In [15]:
# Load necesary libraries
import numpy as np
import pandas as pd

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

Find your own dataset suitable for classification with at least three input variables and 200 cases: You will build a decision tree classifier and a random forest classifier. Find some interesting dataset instead of the popular iris data, etc. 

Address the following and include code/output snippets from b) to f). Include the response under each sub question. 
1.	State your research question, for example: Are decision trees and random forest good models for predicting whether someone will default on a loan or not based on their age and income level? This is just an example, your dataset does not have to be (or should not be) about loans. 
2.	Data pre-processing (to the extent deemed necessary)
3.	Data splitting 
4.	Model construction (a decision tree and a random forest)
5.	Hyperparameter turning (for each model, tune the hyperparameter that is important to you and use any of the methods for hyperparameter tuning learned in class such as cross validation with for loop, gridsearch cross validation, etc. You could tune more than a single parameter for each model if you want). 
6.	Use the best or optimal parameter values to build a model, then compute the accuracy score for the decision tree and for the random forest). 
7.	Discuss about overfitting for both models and, also discuss which model is better for classification for your dataset and why? 

In [3]:
# Load in the smoking data
df_smoking = pd.read_csv("SmokeBan.csv")
# Drop the extra index column
df_smoking.drop("Unnamed: 0", axis=1, inplace=True)
df_smoking.head()

Unnamed: 0,smoker,ban,age,education,afam,hispanic,gender
0,yes,yes,41,hs,no,no,female
1,yes,yes,44,some college,no,no,female
2,no,no,19,some college,no,no,female
3,yes,no,29,hs,no,no,female
4,no,yes,28,some college,no,no,female


In [4]:
# Check for missing values
np.sum(df_smoking.isna(), axis=0)

smoker       0
ban          0
age          0
education    0
afam         0
hispanic     0
gender       0
dtype: int64

In [7]:
# Convert categorical variables to Dummy variables
df_clean = pd.get_dummies(df_smoking[["smoker", "ban", "education", "afam", "hispanic", "gender"]], drop_first=True)
# Add a scaled version of Age to the data
scaler = MinMaxScaler()
df_clean["age"] = scaler.fit_transform(df_smoking["age"].to_numpy().reshape(-1, 1))
df_clean.head()

Unnamed: 0,smoker_yes,ban_yes,education_hs,education_hs drop out,education_master,education_some college,afam_yes,hispanic_yes,gender_male,age
0,1,1,1,0,0,0,0,0,0,0.328571
1,1,1,0,0,0,1,0,0,0,0.371429
2,0,0,0,0,0,1,0,0,0,0.014286
3,1,0,1,0,0,0,0,0,0,0.157143
4,0,1,0,0,0,1,0,0,0,0.142857


In [10]:
# Split into training and test sets
X_smoke = df_clean.drop("smoker_yes", axis=1)
y_smoke = df_clean["smoker_yes"]
X_train_smoke, X_test_smoke, y_train_smoke, y_test_smoke = \
    train_test_split(X_smoke, y_smoke, test_size=0.3, random_state=42)
# Print the sizes of each set
print(X_train_smoke.shape)
print(X_test_smoke.shape)
print(y_train_smoke.shape)
print(y_test_smoke.shape)

(7000, 9)
(3000, 9)
(7000,)
(3000,)


In [11]:
# Create and fit a decision tree classifier
dt_clf_smoke = DecisionTreeClassifier().fit(X_train_smoke, y_train_smoke)
# Create and fit a random forest classifier
rf_clf_smoke = RandomForestClassifier().fit(X_train_smoke, y_train_smoke)

In [12]:
# Hyperparameter Tuning for the Decision Tree, using GridSearch
# Set the parameters to be explored by the gridsearch
param_grid = {"max_depth": range(1, dt_clf_smoke.tree_.max_depth),
              "max_features": [0.2, 0.4, 0.6, 0.8]}
# Use the grid search with the designated parameters
grid_clf_smoke = GridSearchCV(dt_clf_smoke, param_grid, cv=8)
# Fit the grid search to the training set
grid_clf_smoke.fit(X_train_smoke, y_train_smoke)
# Find the best parameters found by the gridsearch
print("Best parameters found by the gridsearch:", grid_clf_smoke.best_params_)

Best parameters found by the gridsearch: {'max_depth': 6, 'max_features': 0.6}


In [13]:
# Hyperparameter Tuning for the Random Forest, using GridSearch
# Find the maximum tree depth of the decision trees within the random forest
max_depth = max([estimator.tree_.max_depth for estimator in rf_clf_smoke.estimators_])
# Set the parameters to be explored by the gridsearch
rf_param_grid = {"max_depth": range(1, max_depth),
                 "bootstrap": [True, False]}
# Use the grid search with the designated parameters
grid_rf_smoke = GridSearchCV(rf_clf_smoke, rf_param_grid, cv=8)
# Fit the grid search to the training set
grid_rf_smoke.fit(X_train_smoke, y_train_smoke)
# Find the best parameters found by the gridsearch
print("Best parameters found by the gridsearch:", grid_rf_smoke.best_params_)

Best parameters found by the gridsearch: {'bootstrap': True, 'max_depth': 7}


In [16]:
# Create a decision tree and random forest using the best parameters
best_dt_clf = DecisionTreeClassifier(max_depth=4, max_features=0.8)
best_dt_clf.fit(X_train_smoke, y_train_smoke)

best_rf_clf = RandomForestClassifier(max_depth=6, bootstrap=True)
best_rf_clf.fit(X_train_smoke, y_train_smoke)

# Predict on the training set with both models
dt_preds_train = best_dt_clf.predict(X_train_smoke)
rf_preds_train = best_rf_clf.predict(X_train_smoke)

# Compute the accuracy of both models on the training set
print("Decision Tree Accuracy on the training set:", accuracy_score(dt_preds_train, y_train_smoke))
print("Random Forest Accuracy on the training set:", accuracy_score(rf_preds_train, y_train_smoke))

# Predict on the test set with both models
dt_preds_test = best_dt_clf.predict(X_test_smoke)
rf_preds_test = best_rf_clf.predict(X_test_smoke)

# Compute the accuracy of both classifiers on the test set
print("Decision Tree Accuracy on the test set:", accuracy_score(dt_preds_test, y_test_smoke))
print("Random Forest Accuracy on the test set:", accuracy_score(rf_preds_test, y_test_smoke))

Decision Tree Accuracy on the training set: 0.7615714285714286
Random Forest Accuracy on the training set: 0.7622857142857142
Decision Tree Accuracy on the test set: 0.759
Random Forest Accuracy on the test set: 0.7596666666666667
