# Preprocessing and Training Data

In [1]:
# import necessary packages and load dataset

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import LinearSVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from tabulate import tabulate

import shap

In [2]:
df = pd.read_csv('../data/hotel_data_cleaned.csv')

In [3]:
df.head()

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day,stays_in_weekend_nights,stays_in_week_nights,adults,...,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date_year,reservation_status_date_month,reservation_status_date_day,reservation_status_day_of_week,arrival_date_day_of_week
0,Resort Hotel,0,7,2015,7,27,1,0,1,1,...,Transient,75.0,0,0,Check-Out,2015,7,2,Thursday,Wednesday
1,Resort Hotel,0,13,2015,7,27,1,0,1,1,...,Transient,75.0,0,0,Check-Out,2015,7,2,Thursday,Wednesday
2,Resort Hotel,0,14,2015,7,27,1,0,2,2,...,Transient,98.0,0,1,Check-Out,2015,7,3,Friday,Wednesday
3,Resort Hotel,0,14,2015,7,27,1,0,2,2,...,Transient,98.0,0,1,Check-Out,2015,7,3,Friday,Wednesday
4,Resort Hotel,0,0,2015,7,27,1,0,2,2,...,Transient,107.0,0,0,Check-Out,2015,7,3,Friday,Wednesday


In [4]:
df.dtypes

hotel                              object
is_canceled                         int64
lead_time                           int64
arrival_date_year                   int64
arrival_date_month                  int64
arrival_date_week_number            int64
arrival_date_day                    int64
stays_in_weekend_nights             int64
stays_in_week_nights                int64
adults                              int64
children                          float64
babies                              int64
meal                               object
country                            object
market_segment                     object
distribution_channel               object
is_repeated_guest                   int64
previous_cancellations              int64
previous_bookings_not_canceled      int64
reserved_room_type                 object
assigned_room_type                 object
booking_changes                     int64
deposit_type                       object
agent                             

In [5]:
df=df.drop(columns='reservation_status')

## Transform categorical, text data

In [6]:
df1= pd.get_dummies(df, columns=['hotel','arrival_date_day_of_week','meal', 'country', 
                                 'market_segment', 'distribution_channel', 'reserved_room_type', 
                                 'assigned_room_type', 'deposit_type', 'customer_type',
                                 'reservation_status_day_of_week'], drop_first=False)
df1.shape

(116951, 260)

## Train/Test Split (Churn)

In [7]:
len(df1) * .7, len(df1) * .3

(81865.7, 35085.299999999996)

In [8]:
X_train, X_test, y_train, y_test = train_test_split(df1.drop(columns='adr'), 
                                                    df1.adr, test_size=0.3, 
                                                    random_state=46)

In [9]:
X_train.shape, X_test.shape

((81865, 259), (35086, 259))

In [10]:
y_train.shape, y_test.shape

((81865,), (35086,))

In [25]:
# columns with numeric values to scale

num_cols = df.drop(columns=['adr','is_canceled']).select_dtypes([np.number]).columns.values
num_cols

array(['lead_time', 'arrival_date_year', 'arrival_date_month',
       'arrival_date_week_number', 'arrival_date_day',
       'stays_in_weekend_nights', 'stays_in_week_nights', 'adults',
       'children', 'babies', 'is_repeated_guest',
       'previous_cancellations', 'previous_bookings_not_canceled',
       'booking_changes', 'agent', 'days_in_waiting_list',
       'required_car_parking_spaces', 'total_of_special_requests',
       'reservation_status_date_year', 'reservation_status_date_month',
       'reservation_status_date_day'], dtype=object)

In [12]:
X_tr_num = X_train[num_cols]
X_te_num = X_test[num_cols]

In [13]:
X_tr_num

Unnamed: 0,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,...,previous_cancellations,previous_bookings_not_canceled,booking_changes,agent,days_in_waiting_list,required_car_parking_spaces,total_of_special_requests,reservation_status_date_year,reservation_status_date_month,reservation_status_date_day
60587,41,2016,12,53,27,1,5,2,0.0,0,...,0,0,1,220.0,0,0,0,2016,12,27
40867,11,2015,8,35,28,0,2,2,0.0,0,...,0,0,0,9.0,0,0,2,2015,8,30
37114,170,2017,7,27,4,0,5,2,0.0,0,...,0,0,0,241.0,0,0,0,2017,7,9
62255,61,2017,2,7,13,1,2,2,0.0,0,...,0,0,0,9.0,0,0,0,2016,12,14
30397,4,2016,12,51,17,0,1,2,0.0,0,...,0,0,0,241.0,0,0,1,2016,12,18
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
84410,11,2016,3,13,26,2,2,2,0.0,0,...,0,0,1,9.0,0,0,1,2016,3,30
56692,285,2016,10,41,3,1,3,2,0.0,0,...,0,0,0,31.0,24,0,0,2016,1,16
30248,34,2016,12,50,7,1,4,2,0.0,0,...,0,0,3,240.0,0,0,1,2016,12,12
88901,113,2016,6,24,9,0,3,2,0.0,0,...,0,0,0,7.0,0,0,0,2016,6,12


In [14]:
scaler = StandardScaler()
scaler.fit(X_tr_num)
X_tr_scaled = scaler.transform(X_tr_num)
X_te_scaled = scaler.transform(X_te_num)


In [15]:
X_train_scaled = X_train
X_train_scaled.loc[:, num_cols]=X_tr_scaled

X_test_scaled = X_test
X_test_scaled.loc[:, num_cols]=X_te_scaled

In [16]:
X_train_scaled.head()

Unnamed: 0,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day,stays_in_weekend_nights,stays_in_week_nights,adults,children,...,customer_type_Group,customer_type_Transient,customer_type_Transient-Party,reservation_status_day_of_week_Friday,reservation_status_day_of_week_Monday,reservation_status_day_of_week_Saturday,reservation_status_day_of_week_Sunday,reservation_status_day_of_week_Thursday,reservation_status_day_of_week_Tuesday,reservation_status_day_of_week_Wednesday
60587,1,-0.601039,-0.227628,1.76872,1.904861,1.27393,0.067879,1.32045,0.28817,-0.26135,...,0,0,1,0,0,0,0,0,1,0
40867,0,-0.882236,-1.642786,0.471472,0.579049,1.387756,-0.940825,-0.277555,0.28817,-0.26135,...,0,0,0,0,0,0,1,0,0,0
37114,0,0.608109,1.187529,0.14716,-0.0102,-1.344086,-0.940825,1.32045,0.28817,-0.26135,...,0,1,0,0,0,0,1,0,0,0
62255,1,-0.413574,1.187529,-1.4744,-1.483324,-0.319645,0.067879,-0.277555,0.28817,-0.26135,...,0,1,0,0,0,0,0,0,0,1
30397,0,-0.947849,-0.227628,1.76872,1.757548,0.135662,-0.940825,-0.810223,0.28817,-0.26135,...,0,1,0,0,0,0,1,0,0,0


# Regression Models 

1. Linear Regression
2. KNN Regression
3. SVM Regression
4. Decision Tree Regression
5. Gradient Boosting Regression
6. Random Forest Regression


In [20]:
X_tr_scaled

array([[-0.60103925, -0.2276283 ,  1.76871951, ..., -0.13571078,
         1.69746969,  1.29176763],
       [-0.88223647, -1.64278559,  0.47147155, ..., -1.53406991,
         0.50042299,  1.63333515],
       [ 0.60810882,  1.18752898,  0.14715956, ...,  1.26264835,
         0.20116131, -0.7576375 ],
       ...,
       [-0.66665193, -0.2276283 ,  1.76871951, ..., -0.13571078,
         1.69746969, -0.41606998],
       [ 0.07383409, -0.2276283 , -0.17715243, ..., -0.13571078,
        -0.09810036, -0.41606998],
       [-0.34796174, -0.2276283 , -1.47440039, ..., -0.13571078,
        -1.59440874, -1.09920502]])

### Linear Regression

In [18]:
linreg= LinearRegression()

linreg.fit(X_train_scaled, y_train)
y_pred_lr = linreg.predict(X_test_scaled)

In [19]:
print('R2:',r2_score(y_test, y_pred_lr))
print('Root Mean Squared Error:', np.sqrt(mean_squared_error(y_test, y_pred_lr)))
print('Mean Absolute Error:',mean_absolute_error(y_test, y_pred_lr))

R2: -4538919097154.504
Root Mean Squared Error: 99213625.48559132
Mean Absolute Error: 1156328.6430559894


### KNN Regression

In [20]:
knn = KNeighborsRegressor(n_neighbors=5)

knn.fit(X_train_scaled, y_train)
y_pred_knn = knn.predict(X_test_scaled)

In [21]:
print('R2:',r2_score(y_test, y_pred_knn))
print('Root Mean Squared Error:', np.sqrt(mean_squared_error(y_test, y_pred_knn)))
print('Mean Absolute Error:',mean_absolute_error(y_test, y_pred_knn))

R2: 0.7427542356430072
Root Mean Squared Error: 23.61941927025236
Mean Absolute Error: 14.272897622983526


### Linear SVM

In [22]:
svm_reg = LinearSVR()

svm_reg.fit(X_train_scaled, y_train)
y_pred_svm = svm_reg.predict(X_test_scaled)


In [23]:
print('R2:',r2_score(y_test, y_pred_svm))
print('Mean Squared Error:', np.sqrt(mean_squared_error(y_test, y_pred_svm)))
print('Mean Absolute Error:',mean_absolute_error(y_test, y_pred_svm))

R2: 0.4684636595648617
Mean Squared Error: 33.95170432447404
Mean Absolute Error: 23.219194313146115


### Decision Tree

In [24]:
dt_reg = DecisionTreeRegressor(random_state=1234)

dt_reg.fit(X_train_scaled, y_train)
y_pred_dt = dt_reg.predict(X_test_scaled)

In [25]:
print('R2:',r2_score(y_test, y_pred_dt))
print('Mean Squared Error:', np.sqrt(mean_squared_error(y_test, y_pred_dt)))
print('Mean Absolute Error:',mean_absolute_error(y_test, y_pred_dt))

R2: 0.8617846487075161
Mean Squared Error: 17.31303631718084
Mean Absolute Error: 8.086168094848984


In [26]:
headers = ["name", "score"]
values_dt = sorted(zip(X_train.columns, dt_reg.feature_importances_), key=lambda x: x[1] * -1)
print('Feature Importance - decision Tree')
print(tabulate(values_dt[:10], headers, tablefmt="plain"))

Feature Importance - decision Tree
name                          score
arrival_date_month        0.184632
reserved_room_type_A      0.151834
arrival_date_week_number  0.0741171
children                  0.0599835
hotel_City Hotel          0.0543883
agent                     0.0536485
lead_time                 0.0505949
adults                    0.0421657
hotel_Resort Hotel        0.0363952
market_segment_Online TA  0.0279116


### Gradient Boosting

In [27]:
gb_reg = GradientBoostingRegressor(random_state = 47)

gb_reg.fit(X_train_scaled, y_train)
y_pred_gb = gb_reg.predict(X_test_scaled)

In [28]:
print('R2:',r2_score(y_test, y_pred_gb))
print('Mean Squared Error:', np.sqrt(mean_squared_error(y_test, y_pred_gb)))
print('Mean Absolute Error:',mean_absolute_error(y_test, y_pred_gb))

R2: 0.7643719394340762
Mean Squared Error: 22.605212990854806
Mean Absolute Error: 16.20387838328969


### Random Forest Regression

In [17]:
rf = RandomForestRegressor(random_state=47)
rf.fit(X_train_scaled, y_train)
y_pred_rf = rf.predict(X_test_scaled)

In [30]:
print('R2:',r2_score(y_test, y_pred_rf))
print('Mean Squared Error:', np.sqrt(mean_squared_error(y_test, y_pred_rf)))
print('Mean Absolute Error:',mean_absolute_error(y_test, y_pred_rf))

R2: 0.9251736360104076
Mean Squared Error: 12.738621103863492
Mean Absolute Error: 6.534024286716394


In [31]:
headers = ["name", "score"]
values_rf = sorted(zip(X_train.columns, rf.feature_importances_), key=lambda x: x[1] * -1)
print('Feature Importance - Random Forest')
print(tabulate(values_rf[:10], headers, tablefmt="plain"))

Feature Importance - Random Forest
name                              score
arrival_date_month            0.15495
reserved_room_type_A          0.151652
arrival_date_week_number      0.0871773
children                      0.0599263
agent                         0.056724
lead_time                     0.0543627
hotel_City Hotel              0.0541777
adults                        0.0435173
hotel_Resort Hotel            0.0334113
reservation_status_date_year  0.0251267


### Refine the Decision Tree Model (Tuning Parameters)

In [32]:
param_grid_dt ={
    
    "splitter":['best','random'],
    "max_depth": list(range(0, 211, 10)) + [None],
    "min_samples_split":range(1,10),
    "min_samples_leaf":range(1,10)

}

In [33]:
# Randomized Search

random_dt = RandomizedSearchCV(dt_reg, param_distributions = param_grid_dt, cv=5, verbose=1, n_jobs=-1, n_iter=200)

random_dt.fit(X_train_scaled, y_train)

Fitting 5 folds for each of 200 candidates, totalling 1000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  5.3min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed: 11.9min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed: 21.1min
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed: 26.6min finished


RandomizedSearchCV(cv=5, estimator=DecisionTreeRegressor(random_state=1234),
                   n_iter=200, n_jobs=-1,
                   param_distributions={'max_depth': [0, 10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, 110, 120,
                                                      130, 140, 150, 160, 170,
                                                      180, 190, 200, 210,
                                                      None],
                                        'min_samples_leaf': range(1, 10),
                                        'min_samples_split': range(1, 10),
                                        'splitter': ['best', 'random']},
                   verbose=1)

In [34]:
random_dt.best_params_

{'splitter': 'best',
 'min_samples_split': 7,
 'min_samples_leaf': 6,
 'max_depth': 30}

In [35]:
# param_grid_dt2 ={
#     "max_depth": list(range(30, 130, 10)),
#     "min_samples_split":range(3,10),
#     "min_samples_leaf":range(5,10)
# }

In [36]:
# Grid Search 

# grid_dt = GridSearchCV(dt_reg, param_grid = param_grid_dt2, cv=5, verbose=1, n_jobs=-1)

# grid_dt.fit(X_train_scaled, y_train)

In [37]:
# grid_dt.best_params_

In [44]:
dt_reg2 = DecisionTreeRegressor(splitter = 'best', min_samples_split = 7, min_samples_leaf=6, max_depth=30 ,random_state=1234)

dt_reg2.fit(X_train_scaled, y_train)
y_pred_dt2 = dt_reg2.predict(X_test_scaled)

In [45]:
print('R2:',r2_score(y_test, y_pred_dt2))
print('Mean Squared Error:', np.sqrt(mean_squared_error(y_test, y_pred_dt2)))
print('Mean Absolute Error:',mean_absolute_error(y_test, y_pred_dt2))

R2: 0.8772156802532639
Mean Squared Error: 16.31798589239602
Mean Absolute Error: 8.737636558573538


In [40]:
headers = ["name", "score"]
values_dt2 = sorted(zip(X_train.columns, dt_reg2.feature_importances_), key=lambda x: x[1] * -1)
print('Feature Importance - decision tree (tuning)')
print(tabulate(values_dt2[:10], headers, tablefmt="plain"))

Feature Importance - decision tree (tuning)
name                          score
arrival_date_month        0.19704
reserved_room_type_A      0.163094
arrival_date_week_number  0.0767929
children                  0.0628499
hotel_City Hotel          0.0559667
agent                     0.0526621
lead_time                 0.0462107
adults                    0.0424099
hotel_Resort Hotel        0.0405521
market_segment_Online TA  0.029981


### Refine the Random Forest Model (Tuning Parameters)

In [18]:
param_grid_rf = {
    
    'n_estimators':[100,500,1000],
    'max_depth':list(range(0, 211, 50)) + [None],
    'max_features':['auto','sqrt','log2']

}

In [19]:
grid_rf2 = GridSearchCV(rf, param_grid = param_grid_rf, cv=5, verbose= 1, n_jobs=-1)

grid_rf2.fit(X_train_scaled, y_train)

Fitting 5 folds for each of 54 candidates, totalling 270 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   17.8s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed: 628.5min
[Parallel(n_jobs=-1)]: Done 270 out of 270 | elapsed: 852.1min finished


GridSearchCV(cv=5, estimator=RandomForestRegressor(random_state=47), n_jobs=-1,
             param_grid={'max_depth': [0, 50, 100, 150, 200, None],
                         'max_features': ['auto', 'sqrt', 'log2'],
                         'n_estimators': [100, 500, 1000]},
             verbose=1)

In [20]:
grid_rf2.best_params_

{'max_depth': 50, 'max_features': 'auto', 'n_estimators': 1000}

In [21]:
rf2 = RandomForestRegressor(n_estimators=1000, max_depth=50, max_features='auto',random_state=47)
rf2.fit(X_train_scaled, y_train)
y_pred_rf2 = rf2.predict(X_test_scaled)

In [22]:
print('R2:',r2_score(y_test, y_pred_rf2))
print('Mean Squared Error:', np.sqrt(mean_squared_error(y_test, y_pred_rf2)))
print('Mean Absolute Error:',mean_absolute_error(y_test, y_pred_rf2))

R2: 0.9259167066882537
Mean Squared Error: 12.67521221469838
Mean Absolute Error: 6.474431611611048


In [23]:
headers = ["name", "score"]
values_rf2= sorted(zip(X_train.columns, rf2.feature_importances_), key=lambda x: x[1] * -1)
print('Feature Importance - random forest (tuning)')
print(tabulate(values_rf2[:10], headers, tablefmt="plain"))

Feature Importance - random forest (tuning)
name                              score
arrival_date_month            0.155967
reserved_room_type_A          0.151775
arrival_date_week_number      0.0861456
children                      0.0601974
agent                         0.0560885
lead_time                     0.0543344
hotel_City Hotel              0.0534511
adults                        0.0429299
hotel_Resort Hotel            0.0338831
reservation_status_date_year  0.0259936
