# Necessary Imports

In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import LeaveOneOut, cross_val_score, KFold
from sklearn.compose import ColumnTransformer

# New Section

In [15]:
data = pd.read_csv('../data.csv')
data.head(10)

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
0,Resort Hotel,0,342,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
1,Resort Hotel,0,737,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
2,Resort Hotel,0,7,2015,July,27,1,0,1,1,...,No Deposit,,,0,Transient,75.0,0,0,Check-Out,2015-07-02
3,Resort Hotel,0,13,2015,July,27,1,0,1,1,...,No Deposit,304.0,,0,Transient,75.0,0,0,Check-Out,2015-07-02
4,Resort Hotel,0,14,2015,July,27,1,0,2,2,...,No Deposit,240.0,,0,Transient,98.0,0,1,Check-Out,2015-07-03
5,Resort Hotel,0,14,2015,July,27,1,0,2,2,...,No Deposit,240.0,,0,Transient,98.0,0,1,Check-Out,2015-07-03
6,Resort Hotel,0,0,2015,July,27,1,0,2,2,...,No Deposit,,,0,Transient,107.0,0,0,Check-Out,2015-07-03
7,Resort Hotel,0,9,2015,July,27,1,0,2,2,...,No Deposit,303.0,,0,Transient,103.0,0,1,Check-Out,2015-07-03
8,Resort Hotel,1,85,2015,July,27,1,0,3,2,...,No Deposit,240.0,,0,Transient,82.0,0,1,Canceled,2015-05-06
9,Resort Hotel,1,75,2015,July,27,1,0,3,2,...,No Deposit,15.0,,0,Transient,105.5,0,0,Canceled,2015-04-22


In [16]:
# We can fill NaN values for number of children with 0 (only 4 NaN values)
data['children'] = data['children'].fillna(0)

In [17]:
data['arrival_date_month'].replace(
                                  ['July', 'August', 'September', 'October', 'November', 'December', 'January', 'February', 'March', 'April', 'May', 'June'],
                                  [7,8,9,10,11,12,1,2,3,4,5,6],
                                  inplace=True
                              )

data.rename(columns={'arrival_date_year': 'year', 
                  'arrival_date_month': 'month',
                  'arrival_date_day_of_month': 'day'
                  }, 
               inplace=True
              )

data.insert(3,'arrival_date',pd.to_datetime(data[['year', 'month', 'day']]))

In [18]:
# Removing country column for now or we can use target encoding
data = data.drop(['country'], axis=1)

# Removing agent and company columns since market_segment gives same information
data = data.drop(['agent','company'], axis=1)

# Removing all columns relating to arrival date except week number and year for now
data = data.drop(['month','day'], axis=1)

In [19]:
# We can combine reserved_room_type and assigned_room_type columns into one that has boolean values for whether reserved and assigned matched
# 0 -> Reserved and assigned room types matched
# 1 -> Reserved and assigned room types didn't match
data['reserved_assigned_match'] = np.where(data['reserved_room_type'] == data['assigned_room_type'], 0, 1)
data = data.drop(['reserved_room_type','assigned_room_type'], axis=1)

In [20]:
# We can change the values in the meal types column. 0, 0.33, 0.67, 1 since the meal types are incremental
data['meal'].replace(['Undefined', 'SC', 'BB', 'HB', 'FB'], [0, 0, 0.33, 0.67, 1], inplace=True)

In [21]:
# We can remove the distribution channels column as market_segment column gives us same information with more granularity
data = data.drop(['distribution_channel'], axis=1)

In [22]:
# data.corr()['is_canceled']

In [23]:
#Remove records where adults, babies, children, stays_in_weekend_nights, stays_in_week_nights, is_canceled values 0,
data.drop(
     data[(data['adults']==0) &
        (data['children']==0) &
        (data['babies']==0) &
        (data['stays_in_weekend_nights']==0) & 
        (data['stays_in_week_nights']==0) &
        (data['is_canceled']==0)].index,
     inplace=True
     )

In [24]:
#Remove records with adr, is_canceled 0, whose market segment is neither Complementary, Corporate nor Aviation
data.drop(data[
    (data["adr"]==0) &  
    (data["is_canceled"]==0)  &
    (data["market_segment"] != "Complementary") &
    (data["market_segment"] != "Corporate") &
    (data["market_segment"] != "Aviation") 
  ].index, 
  inplace = True
  )

In [25]:
#Remove records with stays_in_weekend_nights, stays_in_week_nights 0, arrival and reservation status date are same, reservation status not Check Out
data.drop(data[
    (data["stays_in_week_nights"]==0) &
    (data["stays_in_weekend_nights"]==0) &
    (data["arrival_date"] == data["reservation_status_date"]) &
    (data["reservation_status"]!="Check-Out")
  ].index,
  inplace=True
)

In [26]:
data['reservation_status_date'] = pd.to_datetime(data['reservation_status_date'],
                                            format = '%Y-%m-%d'
                                            )
data['cancellation_days'] = data['arrival_date'] - data['reservation_status_date']
data['cancellation_days'] = data['cancellation_days'].dt.days
data.head(10)

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date,year,arrival_date_week_number,stays_in_weekend_nights,stays_in_week_nights,adults,children,...,deposit_type,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date,reserved_assigned_match,cancellation_days
2,Resort Hotel,0,7,2015-07-01,2015,27,0,1,1,0.0,...,No Deposit,0,Transient,75.0,0,0,Check-Out,2015-07-02,1,-1
3,Resort Hotel,0,13,2015-07-01,2015,27,0,1,1,0.0,...,No Deposit,0,Transient,75.0,0,0,Check-Out,2015-07-02,0,-1
4,Resort Hotel,0,14,2015-07-01,2015,27,0,2,2,0.0,...,No Deposit,0,Transient,98.0,0,1,Check-Out,2015-07-03,0,-2
5,Resort Hotel,0,14,2015-07-01,2015,27,0,2,2,0.0,...,No Deposit,0,Transient,98.0,0,1,Check-Out,2015-07-03,0,-2
6,Resort Hotel,0,0,2015-07-01,2015,27,0,2,2,0.0,...,No Deposit,0,Transient,107.0,0,0,Check-Out,2015-07-03,0,-2
7,Resort Hotel,0,9,2015-07-01,2015,27,0,2,2,0.0,...,No Deposit,0,Transient,103.0,0,1,Check-Out,2015-07-03,0,-2
8,Resort Hotel,1,85,2015-07-01,2015,27,0,3,2,0.0,...,No Deposit,0,Transient,82.0,0,1,Canceled,2015-05-06,0,56
9,Resort Hotel,1,75,2015-07-01,2015,27,0,3,2,0.0,...,No Deposit,0,Transient,105.5,0,0,Canceled,2015-04-22,0,70
10,Resort Hotel,1,23,2015-07-01,2015,27,0,4,2,0.0,...,No Deposit,0,Transient,123.0,0,0,Canceled,2015-06-23,0,8
11,Resort Hotel,0,35,2015-07-01,2015,27,0,4,2,0.0,...,No Deposit,0,Transient,145.0,0,0,Check-Out,2015-07-05,0,-4


In [27]:
# Reservation Status can be ignored since it is being predicted     -> classification
# Reservation Status can be ignored since we will only be training  -> regression
# on those points where the booking has been cancelled
# regression_data = data.copy()
data = data.drop(['reservation_status','reservation_status_date','arrival_date'], axis=1)

In [28]:
data.head(10)

Unnamed: 0,hotel,is_canceled,lead_time,year,arrival_date_week_number,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,...,previous_bookings_not_canceled,booking_changes,deposit_type,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reserved_assigned_match,cancellation_days
2,Resort Hotel,0,7,2015,27,0,1,1,0.0,0,...,0,0,No Deposit,0,Transient,75.0,0,0,1,-1
3,Resort Hotel,0,13,2015,27,0,1,1,0.0,0,...,0,0,No Deposit,0,Transient,75.0,0,0,0,-1
4,Resort Hotel,0,14,2015,27,0,2,2,0.0,0,...,0,0,No Deposit,0,Transient,98.0,0,1,0,-2
5,Resort Hotel,0,14,2015,27,0,2,2,0.0,0,...,0,0,No Deposit,0,Transient,98.0,0,1,0,-2
6,Resort Hotel,0,0,2015,27,0,2,2,0.0,0,...,0,0,No Deposit,0,Transient,107.0,0,0,0,-2
7,Resort Hotel,0,9,2015,27,0,2,2,0.0,0,...,0,0,No Deposit,0,Transient,103.0,0,1,0,-2
8,Resort Hotel,1,85,2015,27,0,3,2,0.0,0,...,0,0,No Deposit,0,Transient,82.0,0,1,0,56
9,Resort Hotel,1,75,2015,27,0,3,2,0.0,0,...,0,0,No Deposit,0,Transient,105.5,0,0,0,70
10,Resort Hotel,1,23,2015,27,0,4,2,0.0,0,...,0,0,No Deposit,0,Transient,123.0,0,0,0,8
11,Resort Hotel,0,35,2015,27,0,4,2,0.0,0,...,0,0,No Deposit,0,Transient,145.0,0,0,0,-4


In [29]:
data = data[data['is_canceled'] == 1]
data = data.drop(['is_canceled'], axis = 1)
data.corr()['cancellation_days']

  data.corr()['cancellation_days']


lead_time                         0.724791
year                              0.079013
arrival_date_week_number          0.088579
stays_in_weekend_nights          -0.065427
stays_in_week_nights              0.046672
adults                            0.061568
children                         -0.042173
babies                           -0.019502
meal                              0.030825
is_repeated_guest                 0.040845
previous_cancellations            0.060475
previous_bookings_not_canceled   -0.021188
booking_changes                  -0.064055
days_in_waiting_list              0.010432
adr                              -0.104598
required_car_parking_spaces            NaN
total_of_special_requests        -0.143320
reserved_assigned_match          -0.106531
cancellation_days                 1.000000
Name: cancellation_days, dtype: float64

In [30]:
X = data.drop('cancellation_days', axis='columns')
y = data.cancellation_days
print(type(X))

<class 'pandas.core.frame.DataFrame'>


## Linear Regression Model

In [31]:
from sklearn import svm
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression

In [32]:
def get_LR_regressor(features):
    # Deposit Type, Customer Type, Hotel and Market Segment will be one hot encoded
    categorical_features = list(set(features) & set([
        'deposit_type',
        'customer_type',
        'hotel',
        'market_segment',
    ]))
    numerical_features = list(set(features) - set(categorical_features))

    columnTransformer = ColumnTransformer(
        transformers = [
            ('categorial', OneHotEncoder(handle_unknown="ignore"), categorical_features),
            ('numerical', StandardScaler(), numerical_features)
        ]
    )

    regressor = make_pipeline(
        columnTransformer,
        LinearRegression()
    )
    
    return regressor

In [33]:
from itertools import combinations
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold, GridSearchCV
from sklearn.metrics import mean_squared_error
import numpy as np
from numpy.typing import ArrayLike
from sklearn.model_selection import KFold
from typing import List

In [298]:
features = X.columns
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

kfold = KFold(n_splits=5, shuffle=True, random_state=0)

regressor = get_LR_regressor(features)
regressor.fit(X=X_train, y=y_train)
print("EQN: ", regressor[1].coef_)
print("Score: ", regressor.score(X_test, y_test))
print(cross_val_score(estimator=get_LR_regressor(features), X=X, y=y, cv=kfold))

EQN:  [ 2.41247118e+12  2.41247118e+12  2.41247118e+12  2.41247118e+12
  3.09920284e+13  3.09920284e+13  3.09920284e+13 -2.02384395e+12
 -2.02384395e+12 -2.02384395e+12 -2.02384395e+12 -2.02384395e+12
 -2.02384395e+12 -2.02384395e+12 -2.02384395e+12 -2.51463531e+13
 -2.51463531e+13  1.95249271e-01  8.43337284e+00 -6.64103497e+00
 -1.46331800e+00 -2.91447183e-01  1.27688441e+01 -4.22404235e-01
  4.18717349e-01 -7.99976206e+00  9.44165508e-01 -4.08113533e-01
  4.54382010e+00 -2.10426890e+00 -3.68232990e+05  6.38087536e+01
 -5.18963979e+00  5.02651978e+00 -3.22676086e+00]
Score:  0.5750420156456886
[0.60231397 0.60471644 0.61312731 0.6080899  0.60786354]


## Feature selection by complete enumeration

In [299]:
from sklearn.model_selection import KFold
kfold = KFold(n_splits=5, shuffle=True, random_state=0)

In [300]:
class Solution:
    features: List[int]
    mse: float

    def __init__(self, y: ArrayLike):
        self.features = list()
        self.mse = mean_squared_error(y, [y.mean()]*len(y))

    def update(self, features: List[int], mse: float) -> bool:
        if(mse < self.mse):
            print(', '.join(str(x) for x in features))
            print(f"\tNew Error: {mse:.3f} better than {self.mse:.3f}")
            self.features = features
            self.mse = mse
            return True

        return False

In [301]:
all_features = features
current_features = all_features
best = Solution(y)

In [302]:
while len(current_features)>0:
    selected_feature = None

    for feature in current_features:
        new_features = current_features.drop(feature)
        Xr = X[new_features]
        mses = cross_val_score(estimator=get_LR_regressor(new_features), X=Xr, y=y, cv=kfold)
        mse = -np.average(mses)

        if(best.update(features=new_features, mse=mse)):
            selected_feature = feature
        
    if(selected_feature):
        current_features = current_features.drop(selected_feature)
    else:
        break

lead_time, year, arrival_date_week_number, stays_in_weekend_nights, stays_in_week_nights, adults, children, babies, meal, market_segment, is_repeated_guest, previous_cancellations, previous_bookings_not_canceled, booking_changes, deposit_type, days_in_waiting_list, customer_type, adr, required_car_parking_spaces, total_of_special_requests, reserved_assigned_match
	New Error: -0.607 better than 8226.009
hotel, lead_time, year, arrival_date_week_number, stays_in_weekend_nights, stays_in_week_nights, children, babies, meal, market_segment, is_repeated_guest, previous_cancellations, previous_bookings_not_canceled, booking_changes, deposit_type, days_in_waiting_list, customer_type, adr, required_car_parking_spaces, total_of_special_requests, reserved_assigned_match
	New Error: -0.607 better than -0.607
hotel, lead_time, year, arrival_date_week_number, stays_in_weekend_nights, stays_in_week_nights, adults, children, babies, meal, market_segment, is_repeated_guest, previous_cancellations, pre

In [303]:
best_features = list(best.features)


In [304]:
features = X.columns
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

regressor = get_LR_regressor(features)
regressor.fit(X=X_train, y=y_train)
print("EQN: ", regressor[1].coef_)

EQN:  [-2.84736108e+13 -2.84736108e+13 -2.84736108e+13 -2.84736108e+13
 -6.38588035e+12 -6.38588035e+12 -6.38588035e+12 -7.71582751e+11
 -7.71582751e+11 -7.71582751e+11 -7.71582751e+11 -7.71582751e+11
 -7.71582751e+11 -7.71582751e+11 -7.71582751e+11  8.87254509e+11
  8.87254509e+11  8.98753684e-02  8.08934393e+00 -6.50188023e+00
 -1.40171244e+00 -4.22003561e-01  1.28209816e+01 -4.95422071e-01
  4.44743562e-01 -8.34245682e+00  7.76214551e-01 -1.87817457e-01
  4.53456559e+00 -2.18577901e+00 -4.89129845e+02  6.33457841e+01
 -5.14649956e+00  5.10668945e+00 -3.22949219e+00]


In [305]:
print(cross_val_score(estimator=get_LR_regressor(new_features), X=Xr, y=y, cv=kfold))


[0.60064134 0.60402501 0.61207874 0.60708777 0.60652665]


In [306]:
print("Score: ", regressor.score(X_test,y_test))

Score:  0.6067210397885243


## Feature Selection by polynomial regression

In [307]:
def get_LR_regressor(features):
    # Deposit Type, Customer Type, Hotel and Market Segment will be one hot encoded
    categorical_features = list(set(features) & set([
        'deposit_type',
        'customer_type',
        'hotel',
        'market_segment',
    ]))
    numerical_features = list(set(features) - set(categorical_features))

    columnTransformer = ColumnTransformer(
        transformers = [
            ('categorial', OneHotEncoder(handle_unknown="ignore"), categorical_features),
            ('numerical', StandardScaler(), numerical_features)
        ]
    )
    regressor = make_pipeline(
        columnTransformer,
        LinearRegression(n_estimators = n_estimators, max_depth = max_depth)
    )
    
    return regressor

In [308]:
while len(current_features)>0:
    selected_feature = None

    for feature in current_features:
        new_features = current_features.drop(feature)
        Xr = X[new_features]
        mses = cross_val_score(estimator=get_LR_regressor(new_features), X=Xr, y=y, cv=kfold)
        mse = -np.average(mses)

        if(best.update(features=new_features, mse=mse)):
            selected_feature = feature
        
    if(selected_feature):
        current_features = current_features.drop(selected_feature)
    else:
        break

In [310]:
n_estimators = [100,150,200]
max_depth = [20,30,40]

features = X.columns
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

for number in n_estimators:
    scores = []
    for max_val in max_depth:
        regressor = get_polynomial_regressor(features, number, max_val)
        regressor.fit(X=X_train, y=y_train)
        scores.append(regressor.score(X_test,y_test))
    # plot_data = {'x':np.asarray(list_string),'y':np.asarray(scores)}
    # plot_data = pd.DataFrame(plot_data)
    # plt.figure()
    # sns.scatterplot(data=plot_data, x='x', y='y')
    print(scores)
        

TypeError: LinearRegression.__init__() got an unexpected keyword argument 'n_estimators'

In [274]:
from sklearn.linear_model import Lasso, Ridge

In [277]:
lasso_alpha = np.logspace(start=-3, stop=0, num=10)

lasso = make_pipeline(
    PolynomialFeatures(degree=2),
    StandardScaler(),
    Lasso(max_iter=100000)
)

lasso_cv = GridSearchCV(
    estimator=lasso,
    param_grid={
        'lasso__alpha': lasso_alpha
    },
    cv=5
)

In [278]:
lasso_cv.fit(X_train, y_train)

ValueError: 
All the 50 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Aravind\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Aravind\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\pipeline.py", line 378, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "c:\Users\Aravind\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\pipeline.py", line 336, in _fit
    X, fitted_transformer = fit_transform_one_cached(
  File "c:\Users\Aravind\AppData\Local\Programs\Python\Python310\lib\site-packages\joblib\memory.py", line 349, in __call__
    return self.func(*args, **kwargs)
  File "c:\Users\Aravind\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\pipeline.py", line 870, in _fit_transform_one
    res = transformer.fit_transform(X, y, **fit_params)
  File "c:\Users\Aravind\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\base.py", line 870, in fit_transform
    return self.fit(X, y, **fit_params).transform(X)
  File "c:\Users\Aravind\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\preprocessing\_polynomial.py", line 287, in fit
    _, n_features = self._validate_data(X, accept_sparse=True).shape
  File "c:\Users\Aravind\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\base.py", line 577, in _validate_data
    X = check_array(X, input_name="X", **check_params)
  File "c:\Users\Aravind\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\utils\validation.py", line 856, in check_array
    array = np.asarray(array, order=order, dtype=dtype)
  File "c:\Users\Aravind\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\core\generic.py", line 2070, in __array__
    return np.asarray(self._values, dtype=dtype)
ValueError: could not convert string to float: 'City Hotel'

--------------------------------------------------------------------------------
40 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Aravind\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Aravind\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\pipeline.py", line 378, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "c:\Users\Aravind\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\pipeline.py", line 336, in _fit
    X, fitted_transformer = fit_transform_one_cached(
  File "c:\Users\Aravind\AppData\Local\Programs\Python\Python310\lib\site-packages\joblib\memory.py", line 349, in __call__
    return self.func(*args, **kwargs)
  File "c:\Users\Aravind\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\pipeline.py", line 870, in _fit_transform_one
    res = transformer.fit_transform(X, y, **fit_params)
  File "c:\Users\Aravind\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\base.py", line 870, in fit_transform
    return self.fit(X, y, **fit_params).transform(X)
  File "c:\Users\Aravind\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\preprocessing\_polynomial.py", line 287, in fit
    _, n_features = self._validate_data(X, accept_sparse=True).shape
  File "c:\Users\Aravind\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\base.py", line 577, in _validate_data
    X = check_array(X, input_name="X", **check_params)
  File "c:\Users\Aravind\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\utils\validation.py", line 856, in check_array
    array = np.asarray(array, order=order, dtype=dtype)
  File "c:\Users\Aravind\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\core\generic.py", line 2070, in __array__
    return np.asarray(self._values, dtype=dtype)
ValueError: could not convert string to float: 'Resort Hotel'
