In [51]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import LeaveOneOut, cross_val_score, KFold
from sklearn.compose import ColumnTransformer

In [52]:
data = pd.read_csv('../data.csv')
data.head(10)

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
0,Resort Hotel,0,342,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
1,Resort Hotel,0,737,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
2,Resort Hotel,0,7,2015,July,27,1,0,1,1,...,No Deposit,,,0,Transient,75.0,0,0,Check-Out,2015-07-02
3,Resort Hotel,0,13,2015,July,27,1,0,1,1,...,No Deposit,304.0,,0,Transient,75.0,0,0,Check-Out,2015-07-02
4,Resort Hotel,0,14,2015,July,27,1,0,2,2,...,No Deposit,240.0,,0,Transient,98.0,0,1,Check-Out,2015-07-03
5,Resort Hotel,0,14,2015,July,27,1,0,2,2,...,No Deposit,240.0,,0,Transient,98.0,0,1,Check-Out,2015-07-03
6,Resort Hotel,0,0,2015,July,27,1,0,2,2,...,No Deposit,,,0,Transient,107.0,0,0,Check-Out,2015-07-03
7,Resort Hotel,0,9,2015,July,27,1,0,2,2,...,No Deposit,303.0,,0,Transient,103.0,0,1,Check-Out,2015-07-03
8,Resort Hotel,1,85,2015,July,27,1,0,3,2,...,No Deposit,240.0,,0,Transient,82.0,0,1,Canceled,2015-05-06
9,Resort Hotel,1,75,2015,July,27,1,0,3,2,...,No Deposit,15.0,,0,Transient,105.5,0,0,Canceled,2015-04-22


In [53]:
# We can fill NaN values for number of children with 0 (only 4 NaN values)
data['children'] = data['children'].fillna(0)

In [60]:
def get_LR_classifier(features, solver, penalty, c_val):
    # Deposit Type, Customer Type, Hotel and Market Segment will be one hot encoded
    categorical_features = list(set(features) & set([
        'deposit_type',
        'customer_type',
        'hotel',
        'market_segment',
        'meal',
        'country',
        'arrival_date_month',
        'reserved_room_type',
        'assigned_room_type',
        'market_segment',
        'distribution_channel',
        'agent',
        'company'
    ]))
    numerical_features = list(set(features) - set(categorical_features))

    columnTransformer = ColumnTransformer(
        transformers = [
            ('categorial', OneHotEncoder(handle_unknown="ignore"), categorical_features),
            ('numerical', StandardScaler(), numerical_features)
        ]
    )

    classifier = make_pipeline(
        columnTransformer,
        LogisticRegression(max_iter=10000, solver=solver, penalty=penalty, C=c_val)
    )
    
    return classifier

In [61]:
from sklearn.model_selection import KFold
kfold = KFold(n_splits=5, shuffle=True, random_state=0)

In [62]:
from itertools import combinations
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold, GridSearchCV
from sklearn.metrics import mean_squared_error
import numpy as np
from numpy.typing import ArrayLike
from typing import List
from sklearn.model_selection import train_test_split


In [63]:
class Solution:
    features: List[int]
    mse: float

    def __init__(self, y: ArrayLike):
        self.features = list()
        self.mse = mean_squared_error(y, [y.mean()]*len(y))

    def update(self, features: List[int], mse: float) -> bool:
        if(mse < self.mse):
            print(', '.join(str(x) for x in features))
            print(f"\tNew Error: {mse:.3f} better than {self.mse:.3f}")
            self.features = features
            self.mse = mse
            return True

        return False

In [64]:
X = data.drop(['is_canceled','reservation_status','reservation_status_date'], axis='columns')
y = data.is_canceled

In [65]:
features = X.columns
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

kfold = KFold(n_splits=5, shuffle=True, random_state=0)

regressor = get_LR_classifier(features, 'liblinear', 'l2', 10000)
regressor.fit(X=X_train, y=y_train)
print("EQN: ", regressor[1].coef_)
print("Score: ", regressor.score(X_test, y_test))
print(cross_val_score(estimator=get_LR_classifier(features, 'liblinear', 'l2', 10000), X=X, y=y, cv=kfold))

EQN:  [[-2.53281795e+00  1.94817561e+00 -1.34245724e+00 -5.93629105e-01
  -7.96699704e-01  3.06512254e-01 -8.43283026e-01 -2.96333376e-01
   1.02938293e-01 -3.64466343e-01 -1.80982359e-01 -1.18825580e+00
  -4.11013841e+00  2.33366211e+00 -3.40107260e+00  4.65491789e-01
   7.54218812e+00  2.97930149e+00  6.98558721e-01  6.19028293e-01
  -2.07254574e+00 -4.32990393e+00 -5.28871302e+00  4.01665927e-01
   2.00102959e-02  1.74802752e+00 -1.95887224e+00 -1.80363289e-02
   5.88775806e+00 -3.24025706e+00  1.04678190e+00 -2.70698144e-01
   1.94186550e+00 -2.20927433e+00  5.90456956e-01  8.29408264e-01
  -3.15922176e+00  1.18220151e+00 -3.44741839e+00 -1.03144068e+00
  -3.52408811e+00  3.27195474e-01  7.71940852e-01  1.42324774e+00
   3.82848286e-01 -6.20393055e+00  4.03592395e-01  9.49079373e-01
  -5.41360067e+00  1.48366241e+00 -8.84982781e-01 -4.93181486e+00
   5.03934376e-01  2.91455525e-01 -1.30001395e-01 -4.88996127e+00
  -4.38765462e+00  1.99246501e-01  5.22339734e-01  6.25340190e-01
   5

In [66]:
all_features = features
current_features = all_features
best = Solution(y)

In [69]:
while len(current_features)>0:
    selected_feature = None

    for feature in current_features:
        new_features = current_features.drop(feature)
        Xr = X[new_features]
        mses = cross_val_score(estimator=get_LR_classifier(new_features, 'liblinear', 'l2', 10000), X=Xr, y=y, cv=kfold)
        mse = -np.average(mses)

        if(best.update(features=new_features, mse=mse)):
            selected_feature = feature
        
    if(selected_feature):
        current_features = current_features.drop(selected_feature)
    else:
        break

lead_time, arrival_date_year, arrival_date_month, arrival_date_week_number, arrival_date_day_of_month, stays_in_weekend_nights, stays_in_week_nights, adults, children, babies, meal, country, market_segment, distribution_channel, is_repeated_guest, previous_cancellations, previous_bookings_not_canceled, reserved_room_type, assigned_room_type, booking_changes, deposit_type, agent, company, days_in_waiting_list, customer_type, adr, required_car_parking_spaces, total_of_special_requests
	New Error: -0.834 better than 0.233
hotel, lead_time, arrival_date_year, arrival_date_month, arrival_date_day_of_month, stays_in_weekend_nights, stays_in_week_nights, adults, children, babies, meal, country, market_segment, distribution_channel, is_repeated_guest, previous_cancellations, previous_bookings_not_canceled, reserved_room_type, assigned_room_type, booking_changes, deposit_type, agent, company, days_in_waiting_list, customer_type, adr, required_car_parking_spaces, total_of_special_requests
	New E