In [10]:
import sys
import os

data_folder = os.path.join('..','data')
file_to_open = os.path.join(data_folder,"hotel_bookings.csv")
sys.path.append(os.path.join('..','src'))
from utils import DataLoader
from data_preprocessing import DataPreprocessor
from feature_engineering import HotelBookingFeatures

data_loader = DataLoader()
hotel_bookings = data_loader.load_data(file_to_open)

# create an instance of the HotelBookingFeatures class
booking_features = HotelBookingFeatures(hotel_bookings)

# add the new columns to the DataFrame
booking_features.is_weekend_stay()
booking_features.num_days_stayed()
booking_features.booking_lead_time()

# create an instance of the DataPreprocessor class
data_preprocessor = DataPreprocessor(hotel_bookings)

# preprocess the data using the various methods
data_preprocessor.drop_na_columns()
data_preprocessor.convert_datetime()
data_preprocessor.encode_categorical_variables()
data_preprocessor.drop_duplicates()

Dropping rows with missing values...


100%|██████████| 1/1 [00:00<00:00, 19.09it/s]


Rows with missing values dropped.
Converting date columns to datetime...


100%|██████████| 3/3 [00:00<00:00, 104.77it/s]


Date columns converted to datetime.
Encoding categorical variables...


100%|██████████| 10/10 [00:00<00:00, 157.42it/s]

Categorical variables encoded.
hotel_name encoding:
Original values: {0: 'Algarve Retreat', 1: 'Braga City Hotel', 2: 'Duro Valley Resort', 3: 'Lisbon City Hotel', 4: 'Porto City Hotel'}
meal encoding:
Original values: {0: 'BB', 1: 'HB', 2: 'SC', 3: 'Undefined', 4: 'FB'}
source_country encoding:
Original values: {0: 'PRT', 1: 'FRA', 2: 'JPN', 3: 'DEU', 4: 'GBR', 5: 'ESP', 6: 'POL', 7: 'BRA', 8: 'AUT', 9: 'FIN', 10: 'NLD', 11: 'CPV', 12: 'ITA', 13: 'CHN', 14: 'GRC', 15: 'ARG', 16: 'IRL', 17: 'RUS', 18: 'CHE', 19: 'BEL', 20: 'USA', 21: 'THA', 22: 'LTU', 23: 'TWN', 24: 'BHR', 25: 'CN', 26: 'SAU', 27: 'AGO', 28: 'NOR', 29: 'LUX', 30: 'EST', 31: 'ROU', 32: 'SWE', 33: 'MKD', 34: 'ISR', 35: 'ZAF', 36: 'COL', 37: 'MEX', 38: 'OMN', 39: 'GIB', 40: 'MNE', 41: 'AND', 42: 'AUS', 43: 'DNK', 44: 'IRN', 45: 'CZE', 46: 'KOR', 47: 'KEN', 48: 'MYS', 49: 'SVN', 50: 'SMR', 51: 'UKR', 52: 'BGR', 53: 'SRB', 54: 'TUN', 55: 'TUR', 56: 'LVA', 57: 'HRV', 58: 'HUN', 59: 'IDN', 60: 'DZA', 61: 'BLR', 62: 'ATA', 63:




market_segment encoding:
Original values: {0: 'Online TA', 1: 'Groups', 2: 'Direct', 3: 'Corporate', 4: 'Offline TA/TO', 5: 'Complementary', 6: 'Aviation', 7: 'Undefined'}
distribution_channel encoding:
Original values: {0: 'Direct', 1: 'TA/TO', 2: 'Corporate', 3: 'GDS', 4: 'Undefined'}
assigned_room_type encoding:
Original values: {0: 'A', 1: 'B', 2: 'C', 3: 'D', 4: 'E', 5: 'F', 6: 'G', 7: 'H', 8: 'I', 9: 'K', 10: 'L'}
guest_type encoding:
Original values: {0: 'Group', 1: 'Family', 2: 'Couple', 3: 'Single'}
customer_type encoding:
Original values: {0: 'Transient-Party', 1: 'Transient', 2: 'Group', 3: 'Contract'}
season encoding:
Original values: {0: 'Spring', 1: 'Summer', 2: 'Autumn', 3: 'Winter'}
company encoding:
Original values: {0: 40.0, 1: 130.0, 2: 174.0, 3: 154.0, 4: 186.0, 5: 223.0, 6: 102.0, 7: 67.0, 8: 153.0, 9: 351.0, 10: 62.0, 11: 233.0, 12: 110.0, 13: 448.0, 14: 291.0, 15: 94.0, 16: 45.0, 17: 219.0, 18: 51.0, 19: 268.0, 20: 270.0, 21: 38.0, 22: 39.0, 23: 240.0, 24: 20.0, 

100%|██████████| 1/1 [00:00<00:00, 19.00it/s]

33 rows dropped.





In [12]:
import pandas as pd
from sklearn.metrics import mean_squared_error


# calculate the mean value of stay_days
mean_stay_days = hotel_bookings['num_days_stayed'].mean()

# create a list of predictions with the same length as the number of instances
predictions = [mean_stay_days] * len(hotel_bookings)

# calculate the mean squared error of the predictions
mse = mean_squared_error(hotel_bookings['num_days_stayed'], predictions)

# print the mean squared error
print('Mean Squared Error:', mse)

Mean Squared Error: 60.72233154989432


In [15]:
import pandas as pd
import statsmodels.api as sm
from sklearn.metrics import mean_squared_error
# define a new dataframe with the arrival_date column as the index
sm_df = hotel_bookings.set_index('arrival_date')

# resample the data to a weekly frequency and fill missing values with the mean
weekly_bookings = sm_df['num_days_stayed'].resample('W').mean().fillna(sm_df['num_days_stayed'].mean())

# fit a SARIMA model to the data
model = sm.tsa.statespace.SARIMAX(weekly_bookings, order=(1, 1, 1), seasonal_order=(1, 1, 1, 52))
results = model.fit()

# make predictions for the next 52 weeks
forecast = results.forecast(steps=52)

# calculate the mean squared error of the predictions
mse = mean_squared_error(weekly_bookings[-52:], forecast)

# print the mean squared error
print('Mean Squared Error:', mse)

  warn('Too few observations to estimate starting parameters%s.'
 This problem is unconstrained.


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =            5     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  8.67942D-01    |proj g|=  3.93286D-02

At iterate    5    f=  8.61337D-01    |proj g|=  1.22496D-02

At iterate   10    f=  8.55363D-01    |proj g|=  2.75375D-03

At iterate   15    f=  8.55264D-01    |proj g|=  1.42057D-04

At iterate   20    f=  8.55259D-01    |proj g|=  1.73905D-03

At iterate   25    f=  8.55229D-01    |proj g|=  1.02611D-03

At iterate   30    f=  8.55210D-01    |proj g|=  1.69180D-03

At iterate   35    f=  8.55195D-01    |proj g|=  8.31375D-04

At iterate   40    f=  8.55188D-01    |proj g|=  1.70254D-04

At iterate   45    f=  8.55185D-01    |proj g|=  1.40838D-04

At iterate   50    f=  8.55183D-01    |proj g|=  8.87099D-05

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cau



Mean Squared Error: 71.65085506123651


In [None]:
import pandas as pd
import numpy as np
from keras.models import Sequential
from keras.layers import LSTM, Dense
from sklearn.metrics import mean_squared_error

# read the data into a pandas DataFrame
hotel_bookings = pd.read_csv('your_data_file.csv')

# convert the arrival_date column to a datetime object and set it as the index
hotel_bookings['arrival_date'] = pd.to_datetime(hotel_bookings['arrival_date'])
hotel_bookings = hotel_bookings.set_index('arrival_date')

# resample the data to a weekly frequency and fill missing values with the mean
weekly_bookings = hotel_bookings['stay_days'].resample('W').mean().fillna(hotel_bookings['stay_days'].mean())

# split the data into training and testing sets
train_size = int(len(weekly_bookings) * 0.8)
train_data, test_data = weekly_bookings[:train_size], weekly_bookings[train_size:]

# normalize the data
train_mean, train_std = train_data.mean(), train_data.std()
train_data = (train_data - train_mean) / train_std
test_data = (test_data - train_mean) / train_std

# create sequences of length 52 for training and testing
def create_sequences(data, seq_length):
    X, y = [], []
    for i in range(len(data) - seq_length):
        X.append(data[i:i+seq_length])
        y.append(data[i+seq_length])
    return np.array(X), np.array(y)

seq_length = 52
X_train, y_train = create_sequences(train_data, seq_length)
X_test, y_test = create_sequences(test_data, seq_length)

# create an LSTM model
model = Sequential()
model.add(LSTM(64, input_shape=(seq_length, 1)))
model.add(Dense(1))
model.compile(loss='mse', optimizer='adam')

# train the model
model.fit(X_train, y_train, epochs=50, batch_size=32)

# make predictions for the test set
y_pred = model.predict(X_test)

# calculate the mean squared error of the predictions
mse = mean_squared_error(y_test, y_pred)

# print the mean squared error
print('Mean Squared Error:', mse)