# Random Forest

In [1]:
# dependencies loading
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from tqdm import tqdm
from scipy.stats import uniform
from sklearn.inspection import permutation_importance
from sklearn.feature_selection import SequentialFeatureSelector as SFS
from sklearn.feature_selection import RFECV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree  import DecisionTreeClassifier
import sklearn_relief as sr
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.mixture import GaussianMixture
from sklearn.model_selection import train_test_split
from sklearn.linear_model import (
    LinearRegression,
    Lasso,
    Ridge,
    ElasticNet
)
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.dummy import DummyRegressor
from sklearn.model_selection import (
    KFold,
    ShuffleSplit,
    cross_val_score,
    cross_validate,
    RandomizedSearchCV,
    GridSearchCV
)
from sklearn.metrics import (
    mean_squared_error,
    mean_absolute_error,
    median_absolute_error,
    mean_absolute_percentage_error,
    r2_score,
    make_scorer
)

import warnings
from sklearn.exceptions import FitFailedWarning
warnings.filterwarnings("ignore", category=FitFailedWarning)
warnings.filterwarnings("ignore", category=UserWarning)

sns.set_style("whitegrid")

#set Pandas display option 
pd.set_option('display.max_columns', None)
pd.set_option("display.max_rows", 500)

#set figure size
plt.rcParams['figure.figsize'] = (8, 6)

#ensure that code is reproducible by setting random seed
np.random.seed(1916) 

In [2]:
#project setup
input_data_path = "../data/input"
output_data_path = "../data/output"

#load data
df = pd.read_csv(f"{output_data_path}/after_encoding_train.csv", index_col=0)
df_test = pd.read_csv(f"{output_data_path}/after_encoding_test.csv", index_col=0)

fr = pd.read_csv(f"{output_data_path}/feature_ranking.csv", index_col=0)

We can omit feature engineering in the case of Random Forest

#### Searching for "good enough" model to feature selection

In [3]:
var = fr.mi_score.sort_values(ascending=False).index.tolist()[0:10]
print(var)  

["('never',).5", "('Kid(s)',)", "('Partner',)", "('gt8',)", "('2h',)", "('Restaurant(<20)',)", "('Legal',)", "('7AM',)", "('Single',)", "('Healthcare Practitioners & Technical',)"]


In [4]:
df.shape[0] ** (0.5)

100.7323185477233

In [8]:
# Define the parameter grid
param = {
    "n_estimators": [50],
    "max_depth": [None, 5, 10, 15, 20],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "max_features": ["auto", "sqrt", "log2"],
}

# Define the mean squared error scorer
mse = make_scorer(mean_squared_error, greater_is_better=False)

# Create the DecisionTreeClassifier model
model = RandomForestClassifier()

# Create the GridSearchCV object
grid_CV = GridSearchCV(
    model, param, cv=5, scoring=mse, return_train_score=True, n_jobs=-1
)

# Fit the grid search to your data
grid_CV.fit(df.loc[:, var].values, df.loc[:, "target"].values.ravel())

In [9]:
# Get the best parameters
best_params = grid_CV.best_params_
print(best_params)

{'max_depth': 15, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 50}


In [10]:
# Access the full results
cv_results = grid_CV.cv_results_
print(cv_results)

{'mean_fit_time': array([1.74274445e-03, 0.00000000e+00, 0.00000000e+00, 2.70748138e-04,
       0.00000000e+00, 3.12795639e-03, 0.00000000e+00, 0.00000000e+00,
       3.12495232e-03, 2.94903183e-01, 3.07238626e-01, 2.91497993e-01,
       2.78373194e-01, 2.61255407e-01, 2.53727245e-01, 2.86253500e-01,
       2.81092453e-01, 2.64605093e-01, 2.75424385e-01, 3.24624777e-01,
       3.06955528e-01, 3.08353853e-01, 2.89120150e-01, 2.67473984e-01,
       2.55692720e-01, 2.63543224e-01, 3.37488174e-01, 3.14550400e-03,
       1.20105743e-03, 1.79896355e-03, 1.86128616e-03, 2.22558975e-03,
       1.77531242e-03, 0.00000000e+00, 3.12442780e-03, 3.12581062e-03,
       2.43155098e-01, 2.33345270e-01, 2.22977781e-01, 2.39626551e-01,
       2.28034878e-01, 2.27886486e-01, 2.48024511e-01, 2.54713345e-01,
       2.28662920e-01, 2.41120338e-01, 2.56620789e-01, 2.71909857e-01,
       2.52991247e-01, 2.31096888e-01, 2.52436304e-01, 3.06188345e-01,
       2.82153797e-01, 3.16240883e-01, 3.12538147e-03, 0.00

In [12]:
# Right now (temporary) we will this hyperparameters as the best one:
print(best_params)

{'max_depth': 15, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 50}


#### Feature selection

In [13]:
#Feature ranking
fr.sort_values("mi_score", ascending=False, inplace=True)
fr.head()

Unnamed: 0,mi_score,Correlation
"('never',).5",0.046534,0.312901
"('Kid(s)',)",0.025818,0.041959
"('Partner',)",0.017346,0.017552
"('gt8',)",0.016947,0.0
"('2h',)",0.01685,0.136174


In [14]:
mi_features = fr.iloc[0:20].index.tolist()
mi_features_25 = fr.iloc[0:25].index.tolist()
mi_features_35 = fr.iloc[0:35].index.tolist()
mi_features_50 = fr.iloc[0:50].index.tolist()
fr["corr_abs"] = np.abs(fr["Correlation"])
fr.sort_values("corr_abs", ascending=False, inplace=True)
corr_features = fr.iloc[0:20].index.tolist()

Forward elimination

In [15]:
forward_elimination = [
 "('Home',)",
 "('No Urgent Place',)",
 "('Work',)",
 "('Alone',)",
 "('Friend(s)',)",
 "('Kid(s)',)",
 "('Partner',)",
 "('Rainy',)",
 "('Snowy',)",
 "('Sunny',)",
 '(30,)',
 '(55,)',
 '(80,)',
 "('10AM',)",
 "('10PM',)",
 "('2PM',)",
 "('6PM',)",
 "('7AM',)",
 "('Bar',)",
 "('Carry out & Take away',)",
 "('Coffee House',)",
 "('Restaurant(20-50)',)",
 "('Restaurant(<20)',)",
 "('1d',)",
 "('2h',)",
 "('Female',)",
 "('Male',)",
 "('21',)",
 "('26',)",
 "('31',)",
 "('36',)",
 "('41',)",
 "('46',)",
 "('50plus',)",
 "('below21',)",
 "('Divorced',)",
 "('Married partner',)",
 "('Single',)",
 "('Unmarried partner',)",
 "('Widowed',)",
 "('Associates degree',)",
 "('Bachelors degree',)",
 "('Graduate degree (Masters or Doctorate)',)",
 "('High School Graduate',)",
 "('Some High School',)",
 "('Some college - no degree',)",
 "('Architecture & Engineering',)",
 "('Arts Design Entertainment Sports & Media',)",
 "('Building & Grounds Cleaning & Maintenance',)",
 "('Business & Financial',)",
 "('Community & Social Services',)",
 "('Computer & Mathematical',)",
 "('Construction & Extraction',)",
 "('Education&Training&Library',)",
 "('Farming Fishing & Forestry',)",
 "('Food Preparation & Serving Related',)",
 "('Healthcare Practitioners & Technical',)",
 "('Healthcare Support',)",
 "('Installation Maintenance & Repair',)",
 "('Legal',)",
 "('Life Physical Social Science',)",
 "('Management',)",
 "('Office & Administrative Support',)",
 "('Personal Care & Service',)",
 "('Production Occupations',)",
 "('Protective Service',)",
 "('Retired',)",
 "('Sales & Related',)",
 "('Student',)",
 "('Transportation & Material Moving',)",
 "('Unemployed',)",
 "('$100000 or More',)",
 "('$12500 - $24999',)",
 "('$25000 - $37499',)",
 "('$37500 - $49999',)",
 "('$50000 - $62499',)",
 "('$62500 - $74999',)",
 "('$75000 - $87499',)",
 "('$87500 - $99999',)",
 "('Less than $12500',)",
 "('1~3',)",
 "('4~8',)",
 "('gt8',)",
 "('less1',)",
 "('never',)",
 "('1~3',).1",
 "('4~8',).1",
 "('gt8',).1",
 "('less1',).1",
 "('never',).1",
 "('1~3',).2",
 "('4~8',).2",
 "('gt8',).2",
 "('less1',).2",
 "('never',).2",
 "('1~3',).3",
 "('4~8',).3",
 "('gt8',).3",
 "('less1',).3",
 "('never',).3",
 "('1~3',).4",
 "('4~8',).4",
 "('gt8',).4",
 "('less1',).4",
 "('never',).4",
 "('1~3',).5",
 "('4~8',).5",
 "('gt8',).5",
 "('less1',).5",
 "('never',).5",
 "('High_Acceptance',)",
 "('Low_Acceptance',)",
 "('Medium_Acceptance',)",
 "('Medium_High_Acceptance',)",
 "('Medium_Low_Acceptance',)",
 'has_children',
 'toCoupon_GEQ15min',
 'toCoupon_GEQ25min',
 'direction_same',
 'to_Coupon',
]

In [16]:
candidates = [i for i in forward_elimination if "]" not in i]

In [17]:
grid_CV.best_params_

{'max_depth': 15,
 'max_features': 'log2',
 'min_samples_leaf': 1,
 'min_samples_split': 10,
 'n_estimators': 50}

In [19]:
model = RandomForestClassifier(**grid_CV.best_params_)

sf = SFS(
    model,
    n_features_to_select=15,
    direction='forward',
    scoring=mse,
    cv=5,
    n_jobs=-1,
)

In [20]:
sffit = sf.fit(
    df.loc[:, candidates].values, df.loc[:, "target"].values.ravel()
)

sf_features = df.loc[:, candidates].columns[sffit.support_]

print(sf_features)

Index(['('Home',)', '('Work',)', '(80,)', '('6PM',)', '('Bar',)',
       '('Coffee House',)', '('Restaurant(20-50)',)', '('1d',)',
       '('Healthcare Practitioners & Technical',)', '('$50000 - $62499',)',
       '('4~8',)', '('never',).3', '('less1',).5', '('never',).5',
       'toCoupon_GEQ25min'],
      dtype='object')


In [21]:
model = RandomForestClassifier(**grid_CV.best_params_)

sf = SFS(
    model,
    n_features_to_select=(10),
    direction='forward',
    scoring=mse,
    cv=5,
    n_jobs=-1,
)

In [22]:
sffit = sf.fit(
    df.loc[:, candidates].values, df.loc[:, "target"].values.ravel()
)

sf_features2 = df.loc[:, candidates].columns[sffit.support_]

print(sf_features2)

Index(['('Friend(s)',)', '('Kid(s)',)', '('10PM',)', '('7AM',)',
       '('Restaurant(20-50)',)', '('1d',)', '('less1',).3', '('less1',).5',
       '('never',).5', 'to_Coupon'],
      dtype='object')


In [23]:
model = RandomForestClassifier(**grid_CV.best_params_)

sf = SFS(
    model,
    n_features_to_select=5,
    direction='forward',
    scoring=mse,
    cv=5,
    n_jobs=-1,
)

In [24]:
sffit = sf.fit(
    df.loc[:, candidates].values, df.loc[:, "target"].values.ravel()
)

sf_features3 = df.loc[:, candidates].columns[sffit.support_]

print(sf_features3)

Index(['('Work',)', '('Restaurant(20-50)',)', '('1d',)', '('never',).5',
       'to_Coupon'],
      dtype='object')


#### Hyperparameters tunning for each group of variables

In [25]:
param = {
    "n_estimators": [50],
    "max_depth": [None, 5, 10, 15, 20],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "max_features": ["auto", "sqrt", "log2"],
}

mse = make_scorer(mean_squared_error, greater_is_better=False)

In [28]:
def cv_proc(var):
    model = RandomForestClassifier()
    grid_CV = GridSearchCV(
        model, param, cv=5, scoring=mse, return_train_score=True, n_jobs=-1
    )
    grid_CV.fit(df.loc[:, var].values, df.loc[:, "target"].values.ravel())
    print(grid_CV.best_params_)
    print(grid_CV.best_score_)

In [29]:
cv_proc(mi_features_25)

{'max_depth': 15, 'max_features': 'log2', 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 50}
-0.2800851689905241


In [30]:
cv_proc(mi_features_35)

{'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 50}
-0.27298802826988955


In [31]:
cv_proc(mi_features_50)

{'max_depth': 20, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 50}
-0.26678064614809405


In [32]:
cv_proc(mi_features)

{'max_depth': 15, 'max_features': 'log2', 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 50}
-0.2921074955024072


In [33]:
cv_proc(corr_features)

{'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 5, 'n_estimators': 50}
-0.28954587059072023


In [34]:
cv_proc(sf_features)

{'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 50}
-0.27160857225404056


In [35]:
cv_proc(sf_features2)

{'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 50}
-0.28570156377841494


In [37]:
cv_proc(sf_features3)

{'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 50}
-0.30275143425259843


#### Final models comparison

In [38]:
def proper_CV(x, y, model, display_res=False):
    train_score = list()
    valid_score = list()
    kf = KFold(n_splits=6, shuffle=True, random_state=42)

    for train_index, valid_index in kf.split(x):
        train_x, valid_x = x.iloc[train_index], x.iloc[valid_index]
        train_y, valid_y = y.iloc[train_index], y.iloc[valid_index]

        model.fit(train_x.values, train_y.values.ravel())

        pred_y_train = model.predict(train_x.values)
        rmse_train = np.sqrt(mean_squared_error(train_y, pred_y_train))
        train_score.append(rmse_train)

        pred_y_val = model.predict(valid_x.values)
        rmse_val = np.sqrt(mean_squared_error(valid_y, pred_y_val))
        valid_score.append(rmse_val)

    if display_res:
        view = pd.DataFrame([train_score, valid_score]).T.rename(
            columns={0: "cv_train", 1: "cv_val"}
        )
        display(view)
        return train_score, valid_score, view
    else:
        return train_score, valid_score

In [39]:
hp = [
{'max_depth': 15, 'max_features': 'log2', 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 50},
{'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 50},
{'max_depth': 20, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 50},
{'max_depth': 15, 'max_features': 'log2', 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 50},
{'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 5, 'n_estimators': 50},
{'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 50},
{'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 50},
{'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 50},
]

In [40]:
model = RandomForestClassifier(**hp[0])
var = mi_features_25
cv_output0 = proper_CV(df.loc[:, var], df.loc[:, "target"], model, display_res=True)

Unnamed: 0,cv_train,cv_val
0,0.484289,0.527046
1,0.483649,0.543223
2,0.486575,0.523261
3,0.485236,0.532226
4,0.482915,0.538851
5,0.485114,0.508938


In [41]:
model = RandomForestClassifier(**hp[1])
var = mi_features_35
cv_output1 = proper_CV(df.loc[:, var], df.loc[:, "target"], model, display_res=True)

Unnamed: 0,cv_train,cv_val
0,0.416825,0.5304
1,0.416659,0.520428
2,0.417934,0.516436
3,0.419769,0.518151
4,0.414382,0.534443
5,0.420614,0.498371


In [42]:
model = RandomForestClassifier(**hp[2])
var = mi_features_50
cv_output2 = proper_CV(df.loc[:, var], df.loc[:, "target"], model, display_res=True)

Unnamed: 0,cv_train,cv_val
0,0.390302,0.521409
1,0.396591,0.51758
2,0.394648,0.520428
3,0.394199,0.518721
4,0.393147,0.523261
5,0.39164,0.491802


In [43]:
model = RandomForestClassifier(**hp[3])
var = mi_features
cv_output3 = proper_CV(df.loc[:, var], df.loc[:, "target"], model, display_res=True)

Unnamed: 0,cv_train,cv_val
0,0.51091,0.552236
1,0.510532,0.556133
2,0.511805,0.537203
3,0.510069,0.54864
4,0.51434,0.537203
5,0.516405,0.517008


In [44]:
model = RandomForestClassifier(**hp[4])
var = corr_features
cv_output4 = proper_CV(df.loc[:, var], df.loc[:, "target"], model, display_res=True)

Unnamed: 0,cv_train,cv_val
0,0.52111,0.552771
1,0.518006,0.549716
2,0.522665,0.5394
3,0.522439,0.550254
4,0.522326,0.549716
5,0.527171,0.509519


In [45]:
model = RandomForestClassifier(**hp[5])
var = sf_features
cv_output5 = proper_CV(df.loc[:, var], df.loc[:, "target"], model, display_res=True)

Unnamed: 0,cv_train,cv_val
0,0.504738,0.537044
1,0.500473,0.537203
2,0.506579,0.518151
3,0.504825,0.526079
4,0.505878,0.524954
5,0.508211,0.510678


In [46]:
model = RandomForestClassifier(**hp[6])
var = sf_features2
cv_output6 = proper_CV(df.loc[:, var], df.loc[:, "target"], model, display_res=True)

Unnamed: 0,cv_train,cv_val
0,0.527763,0.542518
1,0.525373,0.549178
2,0.527171,0.539948
3,0.525823,0.544854
4,0.527283,0.544854
5,0.532972,0.507775


In [47]:
model = RandomForestClassifier(**hp[7])
var = sf_features3
cv_output7 = proper_CV(df.loc[:, var], df.loc[:, "target"], model, display_res=True)

Unnamed: 0,cv_train,cv_val
0,0.549394,0.554372
1,0.548176,0.560371
2,0.551402,0.544311
3,0.546772,0.567189
4,0.550329,0.549716
5,0.555249,0.52439


In [48]:
pd.DataFrame(
    [
        cv_output0[2].mean().tolist(),
        cv_output1[2].mean().tolist(),
        cv_output2[2].mean().tolist(),
        cv_output3[2].mean().tolist(),
        cv_output4[2].mean().tolist(),
        cv_output5[2].mean().tolist(),
        cv_output6[2].mean().tolist(),
        cv_output7[2].mean().tolist(),
         
    ],
    columns=["train_mean", "test_mean"],
)

Unnamed: 0,train_mean,test_mean
0,0.48463,0.528924
1,0.417697,0.519705
2,0.393421,0.515534
3,0.512344,0.541404
4,0.522286,0.541896
5,0.505117,0.525685
6,0.527731,0.538188
7,0.55022,0.550058


In [49]:
pd.DataFrame(
    [
        cv_output0[2].std().tolist(),
        cv_output1[2].std().tolist(),
        cv_output2[2].std().tolist(),
        cv_output3[2].std().tolist(),
        cv_output4[2].std().tolist(),
        cv_output5[2].std().tolist(),
        cv_output6[2].std().tolist(),
        cv_output7[2].std().tolist(),
         
    ],
    columns=["train_std", "test_std"],
)

Unnamed: 0,train_std,test_std
0,0.001296,0.012239
1,0.002266,0.012652
2,0.002242,0.011796
3,0.002501,0.014263
4,0.002957,0.016522
5,0.002611,0.010434
6,0.002727,0.015208
7,0.002948,0.014902


Third model seems to be the best one 

In [50]:
print(mi_features_50)

["('never',).5", "('Kid(s)',)", "('Partner',)", "('gt8',)", "('2h',)", "('Restaurant(<20)',)", "('Legal',)", "('7AM',)", "('Single',)", "('Healthcare Practitioners & Technical',)", "('Sales & Related',)", "('Carry out & Take away',)", "('1~3',).2", "('1d',)", "('2PM',)", "('1~3',).1", "('Bar',)", "('below21',)", "('Installation Maintenance & Repair',)", "('10PM',)", "('1~3',).5", "('10AM',)", "('4~8',).5", "('Food Preparation & Serving Related',)", "('21',)", "('Sunny',)", "('less1',).5", "('Student',)", "('never',).1", "('No Urgent Place',)", "('1~3',)", 'toCoupon_GEQ25min', "('Friend(s)',)", "('Female',)", "('Less than $12500',)", "('Architecture & Engineering',)", "('Computer & Mathematical',)", "('Married partner',)", "('Personal Care & Service',)", "('Protective Service',)", "('Divorced',)", "('Coffee House',)", "('$25000 - $37499',)", "('gt8',).2", "('Business & Financial',)", "('Transportation & Material Moving',)", 'to_Coupon', "('Snowy',)", "('Some college - no degree',)", "('

#### Fitting the final model and saving it

In [51]:
model = RandomForestClassifier(**hp[1]) 
model.fit(df.loc[:, mi_features_50].values, df.loc[:, "target"].values.ravel())

In [52]:
filename = "../models/rf.sav"

In [53]:
pickle.dump(model, open(filename, "wb"))