# Decision Tree

In [6]:
# dependencies loading
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from tqdm import tqdm
from scipy.stats import uniform
from sklearn.inspection import permutation_importance
from sklearn.feature_selection import SequentialFeatureSelector as SFS
from sklearn.feature_selection import RFECV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree  import DecisionTreeClassifier
import sklearn_relief as sr
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.mixture import GaussianMixture
from sklearn.model_selection import train_test_split
from sklearn.linear_model import (
    LinearRegression,
    Lasso,
    Ridge,
    ElasticNet
)
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.dummy import DummyRegressor
from sklearn.model_selection import (
    KFold,
    ShuffleSplit,
    cross_val_score,
    cross_validate,
    RandomizedSearchCV,
    GridSearchCV
)
from sklearn.metrics import (
    mean_squared_error,
    mean_absolute_error,
    median_absolute_error,
    mean_absolute_percentage_error,
    r2_score,
    make_scorer
)

import warnings
from sklearn.exceptions import FitFailedWarning
warnings.filterwarnings("ignore", category=FitFailedWarning)
warnings.filterwarnings("ignore", category=UserWarning)

sns.set_style("whitegrid")

#set Pandas display option 
pd.set_option('display.max_columns', None)
pd.set_option("display.max_rows", 500)

#set figure size
plt.rcParams['figure.figsize'] = (8, 6)

#ensure that code is reproducible by setting random seed
np.random.seed(1916) 

In [3]:
#project setup
input_data_path = "../data/input"
output_data_path = "../data/output"

#load data
df = pd.read_csv(f"{output_data_path}/after_encoding_train.csv", index_col=0)
df_test = pd.read_csv(f"{output_data_path}/after_encoding_test.csv", index_col=0)

fr = pd.read_csv(f"{output_data_path}/feature_ranking.csv", index_col=0)

We can omit feature engineering in the case of Decision tTree

#### Searching for "good enough" model to feature selection

In [4]:
var = fr.mi_score.sort_values(ascending=False).index.tolist()[0:10]
print(var)  

["('never',).5", "('Kid(s)',)", "('Partner',)", "('gt8',)", "('2h',)", "('Restaurant(<20)',)", "('Legal',)", "('7AM',)", "('Single',)", "('Healthcare Practitioners & Technical',)"]


In [5]:
df.shape[0] ** (0.5)

100.7323185477233

In [8]:
# Define the parameter grid
param = {
    "max_depth": [None, 5, 10, 15, 20],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "max_features": ["auto", "sqrt", "log2"],
}

# Define the mean squared error scorer
mse = make_scorer(mean_squared_error, greater_is_better=False)

# Create the DecisionTreeClassifier model
model = DecisionTreeClassifier()

# Create the GridSearchCV object
grid_CV = GridSearchCV(
    model, param, cv=5, scoring=mse, return_train_score=True, n_jobs=-1
)

# Fit the grid search to your data
grid_CV.fit(df.loc[:, var].values, df.loc[:, "target"].values.ravel())

225 fits failed out of a total of 675.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
143 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\adria\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\adria\anaconda3\Lib\site-packages\sklearn\base.py", line 1144, in wrapper
    estimator._validate_params()
  File "C:\Users\adria\anaconda3\Lib\site-packages\sklearn\base.py", line 637, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\adria\anaconda3\Lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParame

In [9]:
# Get the best parameters
best_params = grid_CV.best_params_
print(best_params)

{'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 10}


In [10]:
# Access the full results
cv_results = grid_CV.cv_results_
print(cv_results)

{'mean_fit_time': array([0.        , 0.        , 0.        , 0.00312467, 0.        ,
       0.        , 0.00100408, 0.00055423, 0.        , 0.00559831,
       0.00527802, 0.00788269, 0.00807538, 0.00483165, 0.00312781,
       0.01174707, 0.00503349, 0.0070972 , 0.00188599, 0.00786147,
       0.00446286, 0.01138425, 0.00779557, 0.01078596, 0.00452995,
       0.00625396, 0.00938139, 0.        , 0.        , 0.        ,
       0.00312858, 0.00313911, 0.00313911, 0.        , 0.        ,
       0.        , 0.00387444, 0.00073795, 0.00626707, 0.00632668,
       0.00073795, 0.00625925, 0.00625982, 0.00558929, 0.        ,
       0.00625029, 0.00625629, 0.00624547, 0.00938072, 0.00312543,
       0.0093751 , 0.00625167, 0.00937529, 0.01250081, 0.00312576,
       0.        , 0.        , 0.        , 0.0031239 , 0.00312543,
       0.        , 0.00312543, 0.        , 0.00755649, 0.0065619 ,
       0.00679851, 0.00825839, 0.00592213, 0.00861406, 0.00808296,
       0.00537424, 0.00851474, 0.00744281, 0

In [11]:
# Right now (temporary) we will this hyperparameters as the best one:
print(best_params)

{'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 10}


#### Feature selection for Decision Tree

In [12]:
#Feature ranking
fr.sort_values("mi_score", ascending=False, inplace=True)
fr.head()

Unnamed: 0,mi_score,Correlation
"('never',).5",0.046534,0.312901
"('Kid(s)',)",0.025818,0.041959
"('Partner',)",0.017346,0.017552
"('gt8',)",0.016947,0.0
"('2h',)",0.01685,0.136174


In [14]:
mi_features = fr.iloc[0:20].index.tolist()
mi_features_25 = fr.iloc[0:25].index.tolist()
mi_features_35 = fr.iloc[0:35].index.tolist()
mi_features_50 = fr.iloc[0:50].index.tolist()
fr["corr_abs"] = np.abs(fr["Correlation"])
fr.sort_values("corr_abs", ascending=False, inplace=True)
corr_features = fr.iloc[0:20].index.tolist()

Forward elimination

In [15]:
forward_elimination = [
 "('Home',)",
 "('No Urgent Place',)",
 "('Work',)",
 "('Alone',)",
 "('Friend(s)',)",
 "('Kid(s)',)",
 "('Partner',)",
 "('Rainy',)",
 "('Snowy',)",
 "('Sunny',)",
 '(30,)',
 '(55,)',
 '(80,)',
 "('10AM',)",
 "('10PM',)",
 "('2PM',)",
 "('6PM',)",
 "('7AM',)",
 "('Bar',)",
 "('Carry out & Take away',)",
 "('Coffee House',)",
 "('Restaurant(20-50)',)",
 "('Restaurant(<20)',)",
 "('1d',)",
 "('2h',)",
 "('Female',)",
 "('Male',)",
 "('21',)",
 "('26',)",
 "('31',)",
 "('36',)",
 "('41',)",
 "('46',)",
 "('50plus',)",
 "('below21',)",
 "('Divorced',)",
 "('Married partner',)",
 "('Single',)",
 "('Unmarried partner',)",
 "('Widowed',)",
 "('Associates degree',)",
 "('Bachelors degree',)",
 "('Graduate degree (Masters or Doctorate)',)",
 "('High School Graduate',)",
 "('Some High School',)",
 "('Some college - no degree',)",
 "('Architecture & Engineering',)",
 "('Arts Design Entertainment Sports & Media',)",
 "('Building & Grounds Cleaning & Maintenance',)",
 "('Business & Financial',)",
 "('Community & Social Services',)",
 "('Computer & Mathematical',)",
 "('Construction & Extraction',)",
 "('Education&Training&Library',)",
 "('Farming Fishing & Forestry',)",
 "('Food Preparation & Serving Related',)",
 "('Healthcare Practitioners & Technical',)",
 "('Healthcare Support',)",
 "('Installation Maintenance & Repair',)",
 "('Legal',)",
 "('Life Physical Social Science',)",
 "('Management',)",
 "('Office & Administrative Support',)",
 "('Personal Care & Service',)",
 "('Production Occupations',)",
 "('Protective Service',)",
 "('Retired',)",
 "('Sales & Related',)",
 "('Student',)",
 "('Transportation & Material Moving',)",
 "('Unemployed',)",
 "('$100000 or More',)",
 "('$12500 - $24999',)",
 "('$25000 - $37499',)",
 "('$37500 - $49999',)",
 "('$50000 - $62499',)",
 "('$62500 - $74999',)",
 "('$75000 - $87499',)",
 "('$87500 - $99999',)",
 "('Less than $12500',)",
 "('1~3',)",
 "('4~8',)",
 "('gt8',)",
 "('less1',)",
 "('never',)",
 "('1~3',).1",
 "('4~8',).1",
 "('gt8',).1",
 "('less1',).1",
 "('never',).1",
 "('1~3',).2",
 "('4~8',).2",
 "('gt8',).2",
 "('less1',).2",
 "('never',).2",
 "('1~3',).3",
 "('4~8',).3",
 "('gt8',).3",
 "('less1',).3",
 "('never',).3",
 "('1~3',).4",
 "('4~8',).4",
 "('gt8',).4",
 "('less1',).4",
 "('never',).4",
 "('1~3',).5",
 "('4~8',).5",
 "('gt8',).5",
 "('less1',).5",
 "('never',).5",
 "('High_Acceptance',)",
 "('Low_Acceptance',)",
 "('Medium_Acceptance',)",
 "('Medium_High_Acceptance',)",
 "('Medium_Low_Acceptance',)",
 'has_children',
 'toCoupon_GEQ15min',
 'toCoupon_GEQ25min',
 'direction_same',
 'to_Coupon',
]

In [16]:
candidates = [i for i in forward_elimination if "]" not in i]

In [17]:
grid_CV.best_params_

{'max_depth': 10,
 'max_features': 'sqrt',
 'min_samples_leaf': 1,
 'min_samples_split': 10}

In [25]:
model = DecisionTreeClassifier(**grid_CV.best_params_)

sf = SFS(
    model,
    n_features_to_select=(15),
    direction='forward',
    scoring=mse,
    cv=5,
    n_jobs=-1,
)

In [26]:
sffit = sf.fit(
    df.loc[:, candidates].values, df.loc[:, "target"].values.ravel()
)

sf_features = df.loc[:, candidates].columns[sffit.support_]

print(sf_features)

Index(['('Home',)', '('Work',)', '('Alone',)', '(80,)', '('6PM',)', '('Bar',)',
       '('Coffee House',)', '('Restaurant(20-50)',)', '('1d',)', '('2h',)',
       '('Architecture & Engineering',)', '('less1',).5', '('never',).5',
       'toCoupon_GEQ25min', 'direction_same'],
      dtype='object')


In [27]:
model = DecisionTreeClassifier(**grid_CV.best_params_)

sf = SFS(
    model,
    n_features_to_select=(10),
    direction='forward',
    scoring=mse,
    cv=5,
    n_jobs=-1,
)

In [28]:
sffit = sf.fit(
    df.loc[:, candidates].values, df.loc[:, "target"].values.ravel()
)

sf_features2 = df.loc[:, candidates].columns[sffit.support_]

print(sf_features2)

Index(['('Work',)', '('Kid(s)',)', '(80,)', '('10PM',)', '('Coffee House',)',
       '('Restaurant(20-50)',)', '('1d',)', '('less1',).5', '('never',).5',
       'toCoupon_GEQ25min'],
      dtype='object')


In [29]:
model = DecisionTreeClassifier(**grid_CV.best_params_)

sf = SFS(
    model,
    n_features_to_select=(5),
    direction='forward',
    scoring=mse,
    cv=5,
    n_jobs=-1,
)

In [30]:
sffit = sf.fit(
    df.loc[:, candidates].values, df.loc[:, "target"].values.ravel()
)

sf_features3 = df.loc[:, candidates].columns[sffit.support_]

print(sf_features3)

Index(['('Work',)', '('Restaurant(20-50)',)', '('1d',)', '('never',).5',
       'toCoupon_GEQ25min'],
      dtype='object')


#### Hyperparameters tunning for each group of variables

In [31]:
param = {
    "max_depth": [None, 5, 10, 15, 20],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "max_features": ["auto", "sqrt", "log2"],
}

mse = make_scorer(mean_squared_error, greater_is_better=False)

In [44]:
def cv_proc(var):
    model = DecisionTreeClassifier()
    grid_CV = GridSearchCV(
        model, param, cv=5, scoring=mse, return_train_score=True, n_jobs=-1
    )
    grid_CV.fit(df.loc[:, var].values, df.loc[:, "target"].values.ravel())
    print(grid_CV.best_params_)
    print(grid_CV.best_score_)

In [45]:
cv_proc(mi_features_25)

{'max_depth': 15, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 2}
-0.30275051166946276


In [46]:
cv_proc(mi_features_35)

{'max_depth': 15, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 5}
-0.30708908025744924


In [47]:
cv_proc(mi_features_50)

{'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5}
-0.3102412069329695


In [48]:
cv_proc(mi_features)

{'max_depth': None, 'max_features': 'log2', 'min_samples_leaf': 4, 'min_samples_split': 5}
-0.3003857854217297


In [49]:
cv_proc(corr_features)

{'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 10}
-0.3023584138368048


In [50]:
cv_proc(sf_features)

{'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 10}
-0.2786048600708447


In [51]:
cv_proc(sf_features2)

{'max_depth': 15, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 5}
-0.2863915588498787


In [52]:
cv_proc(sf_features3)

{'max_depth': 20, 'max_features': 'log2', 'min_samples_leaf': 4, 'min_samples_split': 5}
-0.30738309293568383


#### Final models comparison

In [55]:
def proper_CV(x, y, model, display_res=False):
    train_score = list()
    valid_score = list()
    kf = KFold(n_splits=6, shuffle=True, random_state=42)

    for train_index, valid_index in kf.split(x):
        train_x, valid_x = x.iloc[train_index], x.iloc[valid_index]
        train_y, valid_y = y.iloc[train_index], y.iloc[valid_index]

        model.fit(train_x.values, train_y.values.ravel())

        pred_y_train = model.predict(train_x.values)
        rmse_train = np.sqrt(mean_squared_error(train_y, pred_y_train))
        train_score.append(rmse_train)

        pred_y_val = model.predict(valid_x.values)
        rmse_val = np.sqrt(mean_squared_error(valid_y, pred_y_val))
        valid_score.append(rmse_val)

    if display_res:
        view = pd.DataFrame([train_score, valid_score]).T.rename(
            columns={0: "cv_train", 1: "cv_val"}
        )
        display(view)
        return train_score, valid_score, view
    else:
        return train_score, valid_score

In [56]:
hp = [
{'max_depth': 15, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 2},
{'max_depth': 15, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 5},
{'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5},
{'max_depth': None, 'max_features': 'log2', 'min_samples_leaf': 4, 'min_samples_split': 5},
{'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 10},
{'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 10},
{'max_depth': 15, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 5},
{'max_depth': 20, 'max_features': 'log2', 'min_samples_leaf': 4, 'min_samples_split': 5},
]

In [57]:
model = DecisionTreeClassifier(**hp[0])
var = mi_features_25
cv_output0 = proper_CV(df.loc[:, var], df.loc[:, "target"], model, display_res=True)

Unnamed: 0,cv_train,cv_val
0,0.525967,0.547399
1,0.522326,0.566667
2,0.52323,0.559314
3,0.522439,0.543223
4,0.528627,0.589681
5,0.52616,0.541588


In [58]:
model = DecisionTreeClassifier(**hp[1])
var = mi_features_35
cv_output1 = proper_CV(df.loc[:, var], df.loc[:, "target"], model, display_res=True)

Unnamed: 0,cv_train,cv_val
0,0.504855,0.562836
1,0.508443,0.561425
2,0.506813,0.537753
3,0.511805,0.556665
4,0.505995,0.561425
5,0.51457,0.542134


In [59]:
model = DecisionTreeClassifier(**hp[2])
var = mi_features_50
cv_output2 = proper_CV(df.loc[:, var], df.loc[:, "target"], model, display_res=True)

Unnamed: 0,cv_train,cv_val
0,0.525629,0.576326
1,0.513535,0.556133
2,0.515374,0.56927
3,0.516062,0.560898
4,0.51192,0.553469
5,0.522891,0.547561


In [60]:
model = DecisionTreeClassifier(**hp[3])
var = mi_features
cv_output3 = proper_CV(df.loc[:, var], df.loc[:, "target"], model, display_res=True)

Unnamed: 0,cv_train,cv_val
0,0.537643,0.550628
1,0.533193,0.562477
2,0.537611,0.554003
3,0.5409,0.563528
4,0.535407,0.54864
5,0.54417,0.542679


In [61]:
model = DecisionTreeClassifier(**hp[4])
var = corr_features
cv_output4 = proper_CV(df.loc[:, var], df.loc[:, "target"], model, display_res=True)

Unnamed: 0,cv_train,cv_val
0,0.512528,0.554905
1,0.511689,0.565623
2,0.516634,0.552399
3,0.521079,0.569789
4,0.515144,0.563003
5,0.521079,0.530557


In [62]:
model = DecisionTreeClassifier(**hp[5])
var = sf_features
cv_output5 = proper_CV(df.loc[:, var], df.loc[:, "target"], model, display_res=True)

Unnamed: 0,cv_train,cv_val
0,0.508938,0.541973
1,0.508792,0.534996
2,0.510417,0.52439
3,0.510648,0.529441
4,0.509605,0.532226
5,0.516291,0.508357


In [63]:
model = DecisionTreeClassifier(**hp[6])
var = sf_features2
cv_output6 = proper_CV(df.loc[:, var], df.loc[:, "target"], model, display_res=True)

Unnamed: 0,cv_train,cv_val
0,0.525742,0.553305
1,0.526834,0.542679
2,0.528515,0.540495
3,0.527955,0.5394
4,0.527507,0.541042
5,0.533858,0.514715


In [64]:
model = DecisionTreeClassifier(**hp[7])
var = sf_features3
cv_output7 = proper_CV(df.loc[:, var], df.loc[:, "target"], model, display_res=True)

Unnamed: 0,cv_train,cv_val
0,0.554856,0.552771
1,0.552902,0.562477
2,0.555569,0.549178
3,0.551938,0.567189
4,0.553543,0.559314
5,0.558223,0.535549


In [65]:
pd.DataFrame(
    [
        cv_output0[2].mean().tolist(),
        cv_output1[2].mean().tolist(),
        cv_output2[2].mean().tolist(),
        cv_output3[2].mean().tolist(),
        cv_output4[2].mean().tolist(),
        cv_output5[2].mean().tolist(),
        cv_output6[2].mean().tolist(),
        cv_output7[2].mean().tolist(),
         
    ],
    columns=["train_mean", "test_mean"],
)

Unnamed: 0,train_mean,test_mean
0,0.524792,0.557979
1,0.508747,0.553706
2,0.517569,0.560609
3,0.538154,0.553659
4,0.516359,0.556046
5,0.510782,0.528564
6,0.528402,0.538606
7,0.554505,0.554413


In [66]:
pd.DataFrame(
    [
        cv_output0[2].std().tolist(),
        cv_output1[2].std().tolist(),
        cv_output2[2].std().tolist(),
        cv_output3[2].std().tolist(),
        cv_output4[2].std().tolist(),
        cv_output5[2].std().tolist(),
        cv_output6[2].std().tolist(),
        cv_output7[2].std().tolist(),
         
    ],
    columns=["train_std", "test_std"],
)

Unnamed: 0,train_std,test_std
0,0.002531,0.018334
1,0.003737,0.010952
2,0.005451,0.010615
3,0.00391,0.008127
4,0.004064,0.014095
5,0.002802,0.011499
6,0.00284,0.012756
7,0.002243,0.011298


Second model seems to be the best one 

In [67]:
print(mi_features_35)

["('never',).5", "('Kid(s)',)", "('Partner',)", "('gt8',)", "('2h',)", "('Restaurant(<20)',)", "('Legal',)", "('7AM',)", "('Single',)", "('Healthcare Practitioners & Technical',)", "('Sales & Related',)", "('Carry out & Take away',)", "('1~3',).2", "('1d',)", "('2PM',)", "('1~3',).1", "('Bar',)", "('below21',)", "('Installation Maintenance & Repair',)", "('10PM',)", "('1~3',).5", "('10AM',)", "('4~8',).5", "('Food Preparation & Serving Related',)", "('21',)", "('Sunny',)", "('less1',).5", "('Student',)", "('never',).1", "('No Urgent Place',)", "('1~3',)", 'toCoupon_GEQ25min', "('Friend(s)',)", "('Female',)", "('Less than $12500',)"]


#### Fitting the final model and saving it

In [68]:
model = DecisionTreeClassifier(**hp[1]) 
model.fit(df.loc[:, mi_features_35].values, df.loc[:, "target"].values.ravel())

In [69]:
filename = "../models/dt.sav"

In [70]:
pickle.dump(model, open(filename, "wb"))