In [71]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OrdinalEncoder


In [72]:
# Read the raw bookings CSV into a DataFrame and preview its first five rows.
dataset = pd.read_csv('hotel_bookings.csv')
dataset.head(n=5)

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
0,Resort Hotel,0,342,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
1,Resort Hotel,0,737,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
2,Resort Hotel,0,7,2015,July,27,1,0,1,1,...,No Deposit,,,0,Transient,75.0,0,0,Check-Out,2015-07-02
3,Resort Hotel,0,13,2015,July,27,1,0,1,1,...,No Deposit,304.0,,0,Transient,75.0,0,0,Check-Out,2015-07-02
4,Resort Hotel,0,14,2015,July,27,1,0,2,2,...,No Deposit,240.0,,0,Transient,98.0,0,1,Check-Out,2015-07-03


In [73]:
# Discard five columns from the bookings frame; rebinding the name avoids
# mutating the frame in place.
dataset = dataset.drop(
    columns=[
        'company',
        'arrival_date_day_of_month',
        'reservation_status_date',
        'reservation_status',
        'arrival_date_year',
    ]
)

In [74]:
# Remove every row that contains a missing value, then renumber the index from 0
# so it is contiguous again.
dataset = dataset.dropna().reset_index(drop=True)

In [75]:
# NOTE(review): this rebinds `dataset` to a separately saved CSV, so the frame
# prepared in the preceding cells is discarded. The code that wrote
# 'hotel_bookings_clean.csv' is not visible in this notebook — confirm how the
# file was produced.
dataset = pd.read_csv('hotel_bookings_clean.csv')

In [76]:
# Preview the first five rows of the frame that was just loaded.
dataset.head(n=5)

Unnamed: 0,hotel,is_canceled,lead_time,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,country,...,assigned_room_type,booking_changes,deposit_type,agent,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,is_room_same
0,Resort Hotel,0,342,0,0,2,0.0,0,BB,PRT,...,C,3,No Deposit,,0,Transient,0.0,0,0,1
1,Resort Hotel,0,737,0,0,2,0.0,0,BB,PRT,...,C,4,No Deposit,,0,Transient,0.0,0,0,1
2,Resort Hotel,0,7,0,1,1,0.0,0,BB,GBR,...,C,0,No Deposit,,0,Transient,75.0,0,0,0
3,Resort Hotel,0,13,0,1,1,0.0,0,BB,GBR,...,A,0,No Deposit,536.0,0,Transient,75.0,0,0,1
4,Resort Hotel,0,14,0,2,2,0.0,0,BB,GBR,...,A,0,No Deposit,240.0,0,Transient,98.0,0,1,1


In [77]:
# Create a test set stratified by deposit_type

from sklearn.model_selection import StratifiedShuffleSplit

# One shuffled 90/10 partition; the fixed seed makes it reproducible.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.1, random_state=42)

# n_splits=1, so the generator yields exactly one (train, test) pair.
train_index, test_index = next(split.split(dataset, dataset['deposit_type']))

# split() returns positional row numbers, so select with .iloc. Using .loc would
# treat them as index labels, which only matches when the index happens to be a
# default 0..n-1 RangeIndex.
strat_train_set = dataset.iloc[train_index]
strat_test_set = dataset.iloc[test_index]

 

In [78]:
# Separate the target column (is_canceled) from the predictors in each partition.
y_train = strat_train_set['is_canceled']
X_train = strat_train_set.drop(columns='is_canceled')

y_test = strat_test_set['is_canceled']
X_test = strat_test_set.drop(columns='is_canceled')

In [79]:
# Use OrdinalEncoder to turn the categorical (object-dtype) columns into numeric codes.
cat_cols = X_train.select_dtypes(include=['object']).columns

# A single encoder handles all categorical columns at once, so the fitted object
# keeps the mapping for every column (not just the last one processed).
# Categories that were not seen during fitting are encoded as -1 instead of raising.
encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

if len(cat_cols) > 0:
    # Learn the category-to-code mapping from the training rows only, so the
    # held-out rows do not influence preprocessing.
    train_codes = encoder.fit_transform(X_train[cat_cols])
    test_codes = encoder.transform(X_test[cat_cols])

    # Write the codes back one column at a time so each column is replaced
    # outright and takes the numeric dtype of the encoded values.
    for pos, col in enumerate(cat_cols):
        X_train[col] = train_codes[:, pos]
        X_test[col] = test_codes[:, pos]

In [80]:
from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier

# Baseline classifier with the library's default hyper-parameters.
model = XGBClassifier()

# 5-fold cross-validation on the training partition; the cell displays the mean
# of the per-fold scores.
scores = cross_val_score(estimator=model, X=X_train, y=y_train, cv=5)
scores.mean()

0.8765909728062798

In [81]:
from sklearn.metrics import accuracy_score

# Fit on the whole training partition, then predict the held-out rows.
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Accuracy of the predictions against the held-out labels.
accuracy_score(y_true=y_test, y_pred=y_pred)
 

0.8741064670759398

In [82]:
# Importance score of each input column, as reported by the fitted model.
feature_importances = model.feature_importances_

# Column positions ordered from the highest importance to the lowest.
sorted_idx = np.flip(np.argsort(feature_importances))

# Print one line per feature, most important first.
for i in sorted_idx:
    print(f"Importance of feature {X_train.columns[i]}: {feature_importances[i]:.4f}")

Importance of feature deposit_type: 0.5923
Importance of feature required_car_parking_spaces: 0.0981
Importance of feature is_room_same: 0.0785
Importance of feature market_segment: 0.0443
Importance of feature previous_cancellations: 0.0345
Importance of feature total_of_special_requests: 0.0292
Importance of feature country: 0.0179
Importance of feature customer_type: 0.0132
Importance of feature agent: 0.0118
Importance of feature previous_bookings_not_canceled: 0.0113
Importance of feature booking_changes: 0.0113
Importance of feature lead_time: 0.0082
Importance of feature distribution_channel: 0.0077
Importance of feature is_repeated_guest: 0.0055
Importance of feature days_in_waiting_list: 0.0048
Importance of feature meal: 0.0044
Importance of feature adults: 0.0042
Importance of feature stays_in_weekend_nights: 0.0038
Importance of feature children: 0.0034
Importance of feature assigned_room_type: 0.0032
Importance of feature hotel: 0.0032
Importance of feature adr: 0.0029
Imp

In [83]:
# Classification report: per-class precision, recall and F1 on the held-out rows.
from sklearn.metrics import classification_report

report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.89      0.91      0.90      7504
           1       0.84      0.81      0.83      4387

    accuracy                           0.87     11891
   macro avg       0.87      0.86      0.86     11891
weighted avg       0.87      0.87      0.87     11891

