# Necessary Imports

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import LeaveOneOut, cross_val_score, KFold
from sklearn.compose import ColumnTransformer

# New Section

In [4]:
data = pd.read_csv('../data.csv')
data.head(10)

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
0,Resort Hotel,0,342,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
1,Resort Hotel,0,737,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
2,Resort Hotel,0,7,2015,July,27,1,0,1,1,...,No Deposit,,,0,Transient,75.0,0,0,Check-Out,2015-07-02
3,Resort Hotel,0,13,2015,July,27,1,0,1,1,...,No Deposit,304.0,,0,Transient,75.0,0,0,Check-Out,2015-07-02
4,Resort Hotel,0,14,2015,July,27,1,0,2,2,...,No Deposit,240.0,,0,Transient,98.0,0,1,Check-Out,2015-07-03
5,Resort Hotel,0,14,2015,July,27,1,0,2,2,...,No Deposit,240.0,,0,Transient,98.0,0,1,Check-Out,2015-07-03
6,Resort Hotel,0,0,2015,July,27,1,0,2,2,...,No Deposit,,,0,Transient,107.0,0,0,Check-Out,2015-07-03
7,Resort Hotel,0,9,2015,July,27,1,0,2,2,...,No Deposit,303.0,,0,Transient,103.0,0,1,Check-Out,2015-07-03
8,Resort Hotel,1,85,2015,July,27,1,0,3,2,...,No Deposit,240.0,,0,Transient,82.0,0,1,Canceled,2015-05-06
9,Resort Hotel,1,75,2015,July,27,1,0,3,2,...,No Deposit,15.0,,0,Transient,105.5,0,0,Canceled,2015-04-22


In [5]:
# We can fill NaN values for number of children with 0 (only 4 NaN values)
data['children'] = data['children'].fillna(0)

In [135]:
# Removing country column for now or we can use target encoding
data = data.drop(['country'], axis=1)

# Removing agent and company columns since market_segment gives same information
data = data.drop(['agent','company'], axis=1)

# Removing all columns relating to arrival date except week number and year for now
data = data.drop(['arrival_date_month','arrival_date_day_of_month'], axis=1)

In [136]:
# We can combine reserved_room_type and assigned_room_type columns into one that has boolean values for whether reserved and assigned matched
# 0 -> Reserved and assigned room types matched
# 1 -> Reserved and assigned room types didn't match
data['reserved_assigned_match'] = np.where(data['reserved_room_type'] == data['assigned_room_type'], 0, 1)
data = data.drop(['reserved_room_type','assigned_room_type'], axis=1)

In [137]:
# We can change the values in the meal types column. 0, 0.33, 0.67, 1 since the meal types are incremental
data['meal'].replace(['Undefined', 'SC', 'BB', 'HB', 'FB'], [0, 0, 0.33, 0.67, 1], inplace=True)

In [138]:
# We can remove the distribution channels column as market_segment column gives us same information with more granularity
data = data.drop(['distribution_channel'], axis=1)

In [139]:
# Reservation Status can be ignored since it is being predicted     -> classification
# Reservation Status can be ignored since we will only be training  -> regression
# on those points where the booking has been cancelled
regression_data = data.copy()
data = data.drop(['reservation_status','reservation_status_date'], axis=1)

In [141]:
# data = data.drop(data[(data.adr==0) & (data.market_segment=='Complementary')].index)
# data = data.drop(data[(data.adr==0) & (data['stays_in_week_nights']==0) & (data['stays_in_weekend_nights']==0)].index)
test = data.loc[(data['adr']==0)]
test2 = data.loc[(data['adr']==0) & (data['deposit_type']=='No Deposit')]
# test4 = data.loc[(data['stays_in_week_nights']==0) & (data['stays_in_weekend_nights']==0)]
test3 = data.loc[(data['deposit_type']=='No Deposit') & (data['stays_in_week_nights']==0) & (data['stays_in_weekend_nights']==0)]
# test5 = data.loc[(data['adr']==0) & (data['stays_in_week_nights']==0) & (data['stays_in_weekend_nights']==0)]
print(test2.shape)
# print(test4.shape)
print(test3.shape)
# print(test5.shape)
print(test.compare(test3))

# data1 = data.drop(data[(data.stays_in_week_nights == 0) & (data.stays_in_weekend_nights == 0)].index)
# print(data1.shape)
# data2 = data1.loc[(data1['adr']==0)]
# print(data2.market_segment.unique())


# data3 = data.loc[(data['stays_in_week_nights'] == 0) & (data['stays_in_weekend_nights'] == 0) & (data['deposit_type'] == 'No Deposit')]
# print(data3.market_segment.unique())

data6 = data.loc[(data['adr']==0) & (data['deposit_type'] == 'No Deposit')]
print(data6.market_segment.unique())
#data.loc[(data['adr']==0) & (data['market_segment']=='Complementary')]      #680
#data.loc[(data['adr']==0) & (data['market_segment']!='Complementary') & (data['stays_in_week_nights']==0) & (data['stays_in_weekend_nights']==0)] 


(1959, 23)
(715, 23)


ValueError: Can only compare identically-labeled DataFrame objects

In [27]:
test_data = data.copy()

test_data['arrival_date_month'].replace(
                                  ['July', 'August', 'September', 'October', 'November', 'December', 'January', 'February', 'March', 'April', 'May', 'June'],
                                  [7,8,9,10,11,12,1,2,3,4,5,6],
                                  inplace=True
                              )

test_data.rename(columns={'arrival_date_year': 'year', 
                  'arrival_date_month': 'month',
                  'arrival_date_day_of_month': 'day'
                  }, 
               inplace=True
              )

test_data.insert(3,'arrival_date',pd.to_datetime(test_data[['year', 'month', 'day']]))

test = test_data.loc[(test_data['deposit_type'] == 'No Deposit') & (test_data['adr'] > 0)]
test.shape[0]
# stays_zero = test_data.loc[(test_data['stays_in_week_nights']==0) & (test_data['stays_in_weekend_nights']==0) & (test_data['reservation_status'] == 'Check-Out')]
# dates_match = test_data.loc[(test_data['arrival_date'] == test_data['reservation_status_date']) & (test_data['reservation_status'] == 'Check-Out')]
# dates_match_1 = test_data.loc[(test_data['arrival_date'] == test_data['reservation_status_date']) & (test_data['reservation_status'] == 'Check-Out')]

# # stays_zero = stays_zero.sort_index(axis=1)
# # dates_match = dates_match.sort_index(axis=1)
# # stays_zero = stays_zero.sort_index()
# # dates_match = dates_match.sort_index()
# # print(stays_zero.compare(dates_match))
# print("Size of stays_zero: ", stays_zero.shape[0])
# print("Size of dates_match: ", dates_match.shape[0])


# df_diff1 = pd.concat([stays_zero,dates_match]).drop_duplicates(keep=False)
# df_diff2 = pd.concat([dates_match,stays_zero]).drop_duplicates(keep=False)
# print("Diff1: ", df_diff1.shape[0])
# print("Diff2: ", df_diff2.shape[0])


# adr_zero = test_data.loc[(test_data['adr']==0)]
# print("No. of records with 0 ADR: ",adr_zero.shape[0])
# print("--------------------------------------")

# stays_zero_adr_zero = test_data.loc[(test_data['adr']==0) & (test_data['stays_in_week_nights']==0) & (test_data['stays_in_weekend_nights']==0)]
# stays_zero = test_data.loc[(test_data['stays_in_week_nights']==0) & (test_data['stays_in_weekend_nights']==0)]
# stays_zero_transient = test_data.loc[(test_data['stays_in_week_nights']==0) & (test_data['stays_in_weekend_nights']==0) & (test_data['customer_type']=='Transient')]
# transient = test_data.loc[(test_data['customer_type']=='Transient')]
# print("No. of records with 0 ADR and 0 stays: ", stays_zero_adr_zero.shape[0])
# print("No. of records with 0 stays: ", stays_zero.shape[0])
# print("No. of records with 0 stays and transient: ", stays_zero_transient.shape[0])
# print("No. of records with transient: ", transient.shape[0])
# print("Difference between the two: ", stays_zero.compare(stays_zero_adr_zero))

# lead_vs_waiting = test_data.loc[(test_data['lead_time']<test_data['days_in_waiting_list'])]
# print("No. of records with lead time less than waiting time: ",lead_vs_waiting.shape[0])

# print("--------------------------------------")

# print("Removing 0 stays records...")
# test_data = test_data.drop(test_data[(test_data.stays_in_week_nights==0) & (test_data.stays_in_weekend_nights==0)].index)
# adr_zero = test_data.loc[(test_data['adr']==0)]
# stays_zero = test_data.loc[(test_data['stays_in_week_nights']==0) & (test_data['stays_in_weekend_nights']==0)]
# print("No. of records with 0 ADR now: ",adr_zero.shape[0])
# print("No. of records with 0 stays: ", stays_zero.shape[0])
# print("--------------------------------------")

# complementary_adr_zero = test_data.loc[(test_data['adr']==0) & (test_data['market_segment']=='Complementary')]
# complementary = test_data.loc[(test_data['market_segment']=='Complementary')]
# no_deposit = test_data.loc[(test_data['deposit_type']=='No Deposit')]

# print("No. of records with 0 ADR and complementary: ", complementary_adr_zero.shape[0])
# print("No. of records with complementary: ", complementary.shape[0])
# print("No. of records with no deposit: ", no_deposit.shape[0])

# print("Difference between the two: ", complementary.compare(complementary_adr_zero))
# print("--------------------------------------")

# stays_zero = test_data.loc[(test_data['stays_in_week_nights']==0) & (test_data['stays_in_weekend_nights']==0)]
# date_match = 



102681

In [None]:
data.corr()['is_canceled']

is_canceled                       1.000000
lead_time                         0.293123
arrival_date_year                 0.016660
arrival_date_week_number          0.008148
stays_in_weekend_nights          -0.001791
stays_in_week_nights              0.024765
adults                            0.060017
children                          0.005036
babies                           -0.032491
meal                              0.003845
is_repeated_guest                -0.084793
previous_cancellations            0.110133
previous_bookings_not_canceled   -0.057358
booking_changes                  -0.144381
days_in_waiting_list              0.054186
adr                               0.047557
required_car_parking_spaces      -0.195498
total_of_special_requests        -0.234658
reserved_assigned_match          -0.247770
Name: is_canceled, dtype: float64

In [None]:
X = data.drop('is_canceled', axis='columns')
y = data.is_canceled

## SVM Model

In [None]:
from sklearn import svm
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline

In [None]:
def get_classifier(features):
    # Deposit Type, Customer Type, Hotel and Market Segment will be one hot encoded
    categorical_features = list(set(features) & set([
        'deposit_type',
        'customer_type',
        'hotel',
        'market_segment',
    ]))
    numerical_features = list(set(features) - set(categorical_features))

    columnTransformer = ColumnTransformer(
        transformers = [
            ('categorial', OneHotEncoder(handle_unknown="ignore"), categorical_features),
            ('numerical', StandardScaler(), numerical_features)
        ]
    )

    classifier = make_pipeline(
        columnTransformer,
        svm.SVC()
    )
    
    return classifier

In [None]:
from sklearn.model_selection import KFold
kfold = KFold(n_splits=5, shuffle=True, random_state=0)

In [None]:
from itertools import combinations
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
import numpy as np
from numpy.typing import ArrayLike
from typing import List

In [None]:
print("features: ", X.columns)

features:  Index(['hotel', 'lead_time', 'arrival_date_year', 'arrival_date_week_number',
       'stays_in_weekend_nights', 'stays_in_week_nights', 'adults', 'children',
       'babies', 'meal', 'market_segment', 'is_repeated_guest',
       'previous_cancellations', 'previous_bookings_not_canceled',
       'booking_changes', 'deposit_type', 'days_in_waiting_list',
       'customer_type', 'adr', 'required_car_parking_spaces',
       'total_of_special_requests', 'reserved_assigned_match'],
      dtype='object')


In [None]:
features = X.columns
classifier = get_classifier(features)
# X = X[:500]
# y = y[:500]
print(cross_val_score(classifier, X, y, cv=10))

[0.70784823 0.5301114  0.57710026 0.46737583 0.73096574 0.68715973
 0.68984002 0.45121032 0.38361672 0.52609096]


## Regression

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
def get_regressor(features):
    # Deposit Type, Customer Type, Hotel and Market Segment will be one hot encoded
    categorical_features = list(set(features) & set([
        'deposit_type',
        'customer_type',
        'hotel',
        'market_segment',
    ]))
    numerical_features = list(set(features) - set(categorical_features))

    columnTransformer = ColumnTransformer(
        transformers = [
            ('categorial', OneHotEncoder(handle_unknown="ignore"), categorical_features),
            ('numerical', StandardScaler(), numerical_features)
        ]
    )

    regressor = make_pipeline(
        columnTransformer,
        LinearRegression()        
    )
    
    return regressor

In [None]:
canceled_data = regression_data[regression_data["is_canceled"]==1]

In [None]:
X = canceled_data.drop('reservation_status_date',axis='columns')
y = canceled_data.reservation_status
features = X.columns
classifier = get_regressor(features)
# X = X[:500]
# y = y[:500]
print(cross_val_score(classifier, X, y, cv=10))

## Vaidehi's work

In [None]:
transformer = ColumnTransformer(transformers = [
    ('encoder', OneHotEncoder(sparse = False), ['hotel', 'arrival_date_year', 'arrival_date_month','meal', 'market_segment',
 'reserved_room_type', 'assigned_room_type', 'deposit_type', 'customer_type', 'reservation_status'])
] ,  remainder = 'passthrough')
X = np.array(transformer.fit_transform(hotel_dataset))

In [None]:
df2 = pd.DataFrame(X)
df2

In [None]:
hotel_dataset_post_ohe = df2.drop(columns = [74, 75, 76, 77, 78, 79, 80, 81, 82, 83])
hotel_dataset_post_ohe

In [None]:
sc_hotel_dataset_post_ohe = StandardScaler()
sc_Y = StandardScaler()
hotel_dataset_post_ohe = sc_hotel_dataset_post_ohe.fit_transform(hotel_dataset_post_ohe)
Y = Y.reshape(-1,1)
Y = sc_y.fit_transform(Y)

In [None]:

hotel_dataset_post_ohe_train, hotel_dataset_post_ohe_test, Y_train, Y_test = train_test_split(hotel_dataset_post_ohe, Y, test_size = 0.2, random_state = 1)

In [None]:
regressor = SVR(kernel = 'rbf')
regressor.fit(hotel_dataset_post_ohe, Y)