In [26]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
import numpy as np
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV

import warnings
warnings.filterwarnings('ignore')

The below cell is used to import the required libraries.

In [2]:
train_data = pd.read_csv('../data/train_data.csv')
test_data = pd.read_csv('../data/sample_submission.csv')
bookings_data = pd.read_csv('../data/bookings_data.csv')
bookings = pd.read_csv('../data/bookings.csv')
hotel_data = pd.read_csv('../data/hotels_data.csv')
customer_data = pd.read_csv('../data/customer_data.csv')
payments_data = pd.read_csv('../data/payments_data.csv')

Converting the payment type to a categorical variable and encoding using sklearn’s LabelEncoder.

In [3]:
# convert payment type to numeric using sk preprocessing label encoder
le = LabelEncoder()
le.fit(payments_data['payment_type'])
payments_data['payment_type'] = le.transform(payments_data['payment_type'])

payments_data['payment_type'] = payments_data['payment_type'] + 1

Seperating unique values from repeat values

In [4]:
# keep only entries with payment_sequential as 1
payments_data_unique = payments_data[payments_data['payment_sequential'] == 1]
payments_data_repeat = payments_data[payments_data['payment_sequential'] > 1]

Sorting the repeat values based on payment_sequential

In [5]:
# sort payments_data_repeat by payment_sequential
payments_data_repeat = payments_data_repeat.sort_values(by=['payment_sequential'], ascending=True)

Combining the repeated data by adding up the payment_value, payment_installments and payment_type

In [6]:
# making payment data unique for each booking_id by adding the payments made by other methods to primary payment method
columns = ['payment_value', 'payment_installments', 'payment_type']

for payment_data_repeat in payments_data_repeat.itertuples():
    booking_id = payment_data_repeat.booking_id
    payment_data_unique = payments_data_unique[payments_data_unique['booking_id'] == booking_id]
    for column in columns:
        new_value = payment_data_unique[column] + payment_data_repeat.__getattribute__(column)
        payments_data_unique.loc[payments_data_unique['booking_id'] == booking_id, column] = new_value
    payments_data_unique.loc[payments_data_unique['booking_id'] == booking_id, 'payment_sequential'] = payment_data_repeat.payment_sequential

payments_data_unique['payment_type'] = payments_data_unique['payment_type'] / payments_data_unique['payment_sequential']

Merging the hotel data with the booking data

In [7]:
bookings_data_new = bookings_data.merge(hotel_data, on='hotel_id', how='left')

Changing the expiry date to seconds

In [8]:
# changing the date format to datetime
bookings_data_new['booking_expiry_date'] = pd.to_datetime(bookings_data_new['booking_expiry_date'])

# change to seconds
bookings_data_new['booking_expiry_date'] = bookings_data_new['booking_expiry_date'].astype(np.int64) // 10 ** 9

Splitting the repeated data in bookings_data into unique and repeat

In [9]:
# split bookings_data into unique and repeat bookings
bookings_data_unique = bookings_data_new[bookings_data_new['booking_sequence_id'] == 1]
bookings_data_repeat = bookings_data_new[bookings_data_new['booking_sequence_id'] > 1]

Sorting the repeat data based on booking_sequence_id

In [10]:
# sort bookings_data_repeat by booking_sequence_id
bookings_data_repeat = bookings_data_repeat.sort_values(by=['booking_sequence_id'], ascending=True)

Merging bookings_data for each booking_id

In [11]:
# merging bookings_data for each booking_id
columns = ['price', 'agent_fees', 'hotel_category', 'hotel_name_length', 'hotel_description_length', 'hotel_photos_qty', 'booking_expiry_date']

for booking_data_repeat in bookings_data_repeat.itertuples():
    bookings_id = booking_data_repeat.booking_id
    booking_data_unique = bookings_data_unique[bookings_data_unique['booking_id'] == bookings_id]
    for column in columns:
        new_value = booking_data_unique[column] + booking_data_repeat.__getattribute__(column)
        bookings_data_unique.loc[bookings_data_unique['booking_id'] == bookings_id, column] = new_value
    bookings_data_unique.loc[bookings_data_unique['booking_id'] == bookings_id, 'booking_sequence_id'] = booking_data_repeat.booking_sequence_id

Taking average for a few columns

In [12]:
# make entries in bookings_data_unique by taking average of the values based in booking_sequence_id
columns = ['hotel_category', 'hotel_name_length', 'hotel_description_length', 'hotel_photos_qty', 'booking_expiry_date']

for column in columns:
    bookings_data_unique[column] = bookings_data_unique[column] / bookings_data_unique['booking_sequence_id']

Merging the booking data from both sides

In [13]:
# merge bookings and bookings_data as bookings_df
bookings_df = pd.merge(bookings, bookings_data_unique, on='booking_id', how='left')

Merging customer and payment data

In [14]:
# merge bookings_df and customer_data as bookings_customer_df
bookings_customer_df = pd.merge(bookings_df, customer_data, on='customer_id', how='left')

# merge bookings_hotel_df and payments_data as bookings_payment_df
bookings_payment_df = pd.merge(bookings_customer_df, payments_data_unique, on='booking_id', how='left')

bookings_payment_df.drop(['customer_id'], axis=1, inplace=True)

Making numerical encodings for the categorical variables

In [15]:
cat_columns = ['seller_agent_id', 'booking_status', 'country', 'customer_unique_id', 'hotel_id']

for column in cat_columns:
    le = LabelEncoder()
    le.fit(bookings_payment_df[column])
    bookings_payment_df[column] = le.transform(bookings_payment_df[column])
    if column == 'booking_status' or column == 'country':
        bookings_payment_df[column] = bookings_payment_df[column] + 1

Changing the other date columns to seconds and getting the differences between them

In [16]:
# change date columns to seconds
date_columns = ['booking_create_timestamp', 'booking_approved_at', 'booking_checkin_customer_date']

for column in date_columns:
    bookings_payment_df[column] = pd.to_datetime(bookings_payment_df[column])
    # change to seconds
    bookings_payment_df[column] = bookings_payment_df[column].astype(np.int64) // 10 ** 9

# change approved-at to approved_at - create_timestamp
bookings_payment_df['booking_approved_at'] = bookings_payment_df['booking_approved_at'] - bookings_payment_df['booking_create_timestamp']

# change expiry to expiry - checkin
bookings_payment_df['booking_expiry_date'] = bookings_payment_df['booking_expiry_date'] - bookings_payment_df['booking_checkin_customer_date']

# create new column for expiry - create
bookings_payment_df['booking_expiry_create'] = bookings_payment_df['booking_expiry_date'] - bookings_payment_df['booking_approved_at']

In [17]:
# take all columns
columns = bookings_payment_df.columns

# remove booking_id
columns = columns.drop(['booking_id'])

# change all null or nan values to mean of respective columns
for column in columns:
    mean = bookings_payment_df[column].mean()
    bookings_payment_df[column].fillna(mean, inplace=True)

In [18]:
# scale date columns using StandardScaler
date_columns = ['booking_approved_at', 'booking_expiry_date']

scaled_columns = StandardScaler().fit_transform(bookings_payment_df[date_columns])

bookings_payment_df[date_columns] = scaled_columns

In [19]:
# assert no null values
assert bookings_payment_df.isnull().sum().sum() == 0

In [27]:
# train_booking_df contains bookings_df with booking_id in train_data
train_booking_df = bookings_payment_df[bookings_payment_df['booking_id'].isin(train_data['booking_id'])]

# create X_train and Y_train
train_booking_df = train_booking_df.sort_values(by=['booking_id'])
X_train = train_booking_df.drop(['booking_id'], axis=1)
train_data = train_data.sort_values(by=['booking_id'])

# take only unique values
train_data = train_data.drop_duplicates(subset=['booking_id'])
Y_train = train_data['rating_score']

print(X_train.shape)
print(Y_train.shape)

(49868, 21)
(49868,)


In [29]:
params = {'early_stopping': False, 'l2_regularization': 0.2, 'learning_rate': 0.04, 'loss': 'squared_error', 
'max_depth': 4, 'max_iter': 500, 'max_leaf_nodes': 15, 'validation_fraction': 0.2}

# grid_params around the above params for better results
cv_params = {
    'max_depth': [3, 4, 5],
    'max_leaf_nodes': [10, 15, 20],
    'learning_rate': [0.03, 0.04, 0.05],
    'l2_regularization': [0.1, 0.2, 0.3],
    'max_iter': [400, 500, 600],
    'validation_fraction': [0.2],
    'early_stopping': [True, False],
    'loss': ['squared_error']
}

# grid search
grid = GridSearchCV(HistGradientBoostingRegressor(), cv_params, scoring='neg_mean_squared_error', cv=5, n_jobs=6, verbose=100)

# fit grid search
grid.fit(X_train, Y_train)

# print best params
print(grid.best_params_)

Fitting 5 folds for each of 486 candidates, totalling 2430 fits
{'early_stopping': False, 'l2_regularization': 0.2, 'learning_rate': 0.05, 'loss': 'squared_error', 'max_depth': 5, 'max_iter': 400, 'max_leaf_nodes': 15, 'validation_fraction': 0.2}


In [30]:
X_train_2, X_valid, Y_train_2, Y_valid = train_test_split(X_train, Y_train, test_size=0.2, random_state=42)

rf = HistGradientBoostingRegressor(**params)
rf.fit(X_train_2, Y_train_2)

# mse of valid and train
print(mean_squared_error(Y_valid, rf.predict(X_valid)))
print(mean_squared_error(Y_train_2, rf.predict(X_train_2)))

1.3241385583434275
1.2485873591595589


In [31]:
# use best parameters to train model
# use best params
model = HistGradientBoostingRegressor(**params)
# fit model
model.fit(X_train, Y_train)

train_mse = mean_squared_error(Y_train, model.predict(X_train))
print("train_mse: {}".format(train_mse))

test_booking_df = bookings_payment_df[bookings_payment_df['booking_id'].isin(test_data['booking_id'])]

# create X_test
test_booking_df = test_booking_df.sort_values(by=['booking_id'])
X_test = test_booking_df.drop(['booking_id'], axis=1)

Y_test_pred = model.predict(X_test)

# prepare submission file
submission = pd.DataFrame()
submission['booking_id'] = test_booking_df['booking_id']
submission['rating_score'] = Y_test_pred

# change ratings below 0 to 0 and above 5 to 5
submission['rating_score'] = submission['rating_score'].apply(lambda x: 1 if x < 1 else x)
submission['rating_score'] = submission['rating_score'].apply(lambda x: 5 if x > 5 else x)

submission.to_csv('main_submission-2.csv', index=False)
submission.describe()

train_mse: 1.256188186359262


Unnamed: 0,rating_score
count,49079.0
mean,4.088837
std,0.681466
min,1.0
25%,4.064135
50%,4.33555
75%,4.449554
max,4.933109
