In [2]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
import numpy as np
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler

import warnings
warnings.filterwarnings('ignore')

In [3]:
train_data = pd.read_csv('../data/train_data.csv')
test_data = pd.read_csv('../data/sample_submission.csv')
bookings_data = pd.read_csv('../data/bookings_data.csv')
bookings = pd.read_csv('../data/bookings.csv')
hotel_data = pd.read_csv('../data/hotels_data.csv')
customer_data = pd.read_csv('../data/customer_data.csv')
payments_data = pd.read_csv('../data/payments_data.csv')

In [4]:
# convert payment type to numeric using sk preprocessing label encoder
le = LabelEncoder()
le.fit(payments_data['payment_type'])
payments_data['payment_type'] = le.transform(payments_data['payment_type'])

# add 1 to the payment type to avoid 0 values
payments_data['payment_type'] = payments_data['payment_type'] + 1

In [5]:
# keep only entries with payment_sequential as 1
payments_data_unique = payments_data[payments_data['payment_sequential'] == 1]
payments_data_repeat = payments_data[payments_data['payment_sequential'] > 1]

In [6]:
# sort payments_data_repeat by payment_sequential
payments_data_repeat = payments_data_repeat.sort_values(by=['payment_sequential'], ascending=True)

In [7]:
# making payment data unique for each booking_id by adding the payments made by other methods to primary payment method
columns = ['payment_value', 'payment_installments', 'payment_type']

for payment_data_repeat in payments_data_repeat.itertuples():
    booking_id = payment_data_repeat.booking_id
    payment_data_unique = payments_data_unique[payments_data_unique['booking_id'] == booking_id]
    for column in columns:
        new_value = payment_data_unique[column] + payment_data_repeat.__getattribute__(column)
        payments_data_unique.loc[payments_data_unique['booking_id'] == booking_id, column] = new_value
    payments_data_unique.loc[payments_data_unique['booking_id'] == booking_id, 'payment_sequential'] = payment_data_repeat.payment_sequential

In [8]:
bookings_data_new = bookings_data.merge(hotel_data, on='hotel_id', how='left')

In [9]:
bookings_data_new['booking_expiry_date'] = pd.to_datetime(bookings_data_new['booking_expiry_date'])
# change to seconds
bookings_data_new['booking_expiry_date'] = bookings_data_new['booking_expiry_date'].astype(np.int64) // 10 ** 9

In [10]:
# split bookings_data into unique and repeat bookings
bookings_data_unique = bookings_data_new[bookings_data_new['booking_sequence_id'] == 1]
bookings_data_repeat = bookings_data_new[bookings_data_new['booking_sequence_id'] > 1]

In [11]:
# sort bookings_data_repeat by booking_sequence_id
bookings_data_repeat = bookings_data_repeat.sort_values(by=['booking_sequence_id'], ascending=True)

In [12]:
# merging bookings_data for each booking_id
columns = ['price', 'agent_fees', 'hotel_category', 'hotel_name_length', 'hotel_description_length', 'hotel_photos_qty', 'booking_expiry_date']

for booking_data_repeat in bookings_data_repeat.itertuples():
    bookings_id = booking_data_repeat.booking_id
    booking_data_unique = bookings_data_unique[bookings_data_unique['booking_id'] == bookings_id]
    for column in columns:
        new_value = booking_data_unique[column] + booking_data_repeat.__getattribute__(column)
        bookings_data_unique.loc[bookings_data_unique['booking_id'] == bookings_id, column] = new_value
    bookings_data_unique.loc[bookings_data_unique['booking_id'] == bookings_id, 'booking_sequence_id'] = booking_data_repeat.booking_sequence_id

In [13]:
# make entries in bookings_data_unique by taking average of the values based in booking_sequence_id
columns = ['hotel_category', 'hotel_name_length', 'hotel_description_length', 'hotel_photos_qty', 'booking_expiry_date']

for column in columns:
    bookings_data_unique[column] = bookings_data_unique[column] / bookings_data_unique['booking_sequence_id']

In [14]:
# merge bookings and bookings_data as bookings_df
bookings_df = pd.merge(bookings, bookings_data_unique, on='booking_id', how='left')

In [15]:
# merge bookings_df and customer_data as bookings_customer_df
bookings_customer_df = pd.merge(bookings_df, customer_data, on='customer_id', how='left')

# merge bookings_hotel_df and payments_data as bookings_payment_df
bookings_payment_df = pd.merge(bookings_customer_df, payments_data_unique, on='booking_id', how='left')

bookings_payment_df.drop(['customer_id'], axis=1, inplace=True)

In [16]:
cat_columns = ['seller_agent_id', 'booking_status', 'country', 'customer_unique_id', 'hotel_id']

for column in cat_columns:
    le = LabelEncoder()
    le.fit(bookings_payment_df[column])
    bookings_payment_df[column] = le.transform(bookings_payment_df[column])
    if column == 'booking_status' or column == 'country':
        bookings_payment_df[column] = bookings_payment_df[column] + 1

In [17]:
# change date columns to seconds 
# booking_create_timestamp
# booking_approved_at 
# booking_checkin_customer_date
date_columns = ['booking_create_timestamp', 'booking_approved_at', 'booking_checkin_customer_date']

for column in date_columns:
    bookings_payment_df[column] = pd.to_datetime(bookings_payment_df[column])
    # change to seconds
    bookings_payment_df[column] = bookings_payment_df[column].astype(np.int64) // 10 ** 9

# change approved-at to approved_at - create_timestamp
bookings_payment_df['booking_approved_at'] = bookings_payment_df['booking_approved_at'] - bookings_payment_df['booking_create_timestamp']

# change expiry to expiry - checkin
bookings_payment_df['booking_expiry_date'] = bookings_payment_df['booking_expiry_date'] - bookings_payment_df['booking_checkin_customer_date']

In [18]:
bookings_payment_df.drop(['booking_create_timestamp', 'booking_checkin_customer_date'], axis=1, inplace=True)

In [19]:
# take all columns
columns = bookings_payment_df.columns

# remove booking_id
columns = columns.drop(['booking_id'])

# change all null or nan values to mean of respective columns
for column in columns:
    mean = bookings_payment_df[column].mean()
    bookings_payment_df[column].fillna(mean, inplace=True)

In [20]:
print(bookings_payment_df.dtypes)

booking_id                   object
booking_status                int32
booking_approved_at           int64
booking_sequence_id         float64
hotel_id                      int32
seller_agent_id               int32
booking_expiry_date         float64
price                       float64
agent_fees                  float64
hotel_category              float64
hotel_name_length           float64
hotel_description_length    float64
hotel_photos_qty            float64
customer_unique_id            int32
country                       int32
payment_sequential          float64
payment_type                float64
payment_installments        float64
payment_value               float64
dtype: object


In [21]:
# scale date columns using StandardScaler
date_columns = ['booking_approved_at', 'booking_expiry_date']

scaled_columns = StandardScaler().fit_transform(bookings_payment_df[date_columns])

bookings_payment_df[date_columns] = scaled_columns

bookings_payment_df.describe(include='all')

Unnamed: 0,booking_id,booking_status,booking_approved_at,booking_sequence_id,hotel_id,seller_agent_id,booking_expiry_date,price,agent_fees,hotel_category,hotel_name_length,hotel_description_length,hotel_photos_qty,customer_unique_id,country,payment_sequential,payment_type,payment_installments,payment_value
count,99441,99441.0,99441.0,99441.0,99441.0,99441.0,99441.0,99441.0,99441.0,99441.0,99441.0,99441.0,99441.0,99441.0,99441.0,99441.0,99441.0,99441.0,99441.0
unique,99441,,,,,,,,,,,,,,,,,,
top,c54678b7cc49136f2d6af7e481f51cbd,,,,,,,,,,,,,,,,,,
freq,1,,,,,,,,,,,,,,,,,,
mean,,3.076166,-4.001407e-18,1.141731,16030.302461,1417.288995,6.148591000000001e-17,137.754076,22.823562,28.857976,48.843101,794.016654,2.249931,48049.895224,5.011484,1.044726,1.690378,2.98131,160.984584
std,,0.561226,1.000005,0.53635,9287.168077,935.610297,1.000005,209.822693,21.566375,22.373529,9.846012,645.565179,1.720124,27758.278975,2.580036,0.381138,2.148253,2.740687,221.907616
min,,1.0,-24.96703,1.0,0.0,0.0,-0.1625356,0.85,0.0,1.0,5.0,4.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0
25%,,3.0,0.04005658,1.0,7995.0,526.0,-0.1513949,45.99,13.9,10.0,43.0,357.0,1.0,23986.0,3.0,1.0,1.0,1.0,62.01
50%,,3.0,0.04005768,1.0,16249.0,1342.0,-0.1511389,88.0,17.27,28.0,51.0,621.0,2.0,48053.0,5.0,1.0,1.0,2.0,105.37
75%,,3.0,0.04017999,1.0,23920.0,2260.0,-0.1509277,149.9,23.92,38.0,57.0,982.0,3.0,72088.0,7.0,1.0,2.0,4.0,176.86


In [22]:
# assert no null values
assert bookings_payment_df.isnull().sum().sum() == 0

In [23]:
# train_booking_df contains bookings_df with booking_id in train_data
train_booking_df = bookings_payment_df[bookings_payment_df['booking_id'].isin(train_data['booking_id'])]

# create X_train and Y_train
train_booking_df = train_booking_df.sort_values(by=['booking_id'])
X_train = train_booking_df.drop(['booking_id'], axis=1)
train_data = train_data.sort_values(by=['booking_id'])

# take only unique values
train_data = train_data.drop_duplicates(subset=['booking_id'])
Y_train = train_data['rating_score']

print(X_train.shape)
print(Y_train.shape)

(49868, 18)
(49868,)


In [24]:
# # data processing 
# 'learning_rate': [0.05],
#     'max_iter': [500],
#     'max_leaf_nodes': [31],
#     'max_depth': [7],
#     'l2_regularization': [0.2],
#     'early_stopping': [False],
#     'validation_fraction': [0.2],
#     'loss': ['squared_error']
# histgradboosting

# use gridsearch cv to find best parameters for HistGradientBoostingRegressor
param_grid = {
    'learning_rate': [0.01, 0.02, 0.03, 0.04, 0.1],
    'max_iter': [500, 600, 700, 800],
    'max_leaf_nodes': [10, 12, 15, 31, None],
    'max_depth': [3, 4, 5, 6, 7],
    'l2_regularization': [ 0.15, 0.2, 0.25, 0.3],
    'early_stopping': [True, False],
    'validation_fraction': [0.2],
    'loss': ['squared_error']
}

grid = GridSearchCV(estimator=HistGradientBoostingRegressor(), param_grid=param_grid, cv=5, n_jobs=6, verbose=3)
grid.fit(X_train, Y_train)

print(grid.best_params_)

Fitting 5 folds for each of 4000 candidates, totalling 20000 fits


In [None]:
params = grid.best_params_

X_train_2, X_valid, Y_train_2, Y_valid = train_test_split(X_train, Y_train, test_size=0.2, random_state=42)

# create ElasticNetCV model with best parameters
elastic_net_cv = HistGradientBoostingRegressor(**params)

# fit the model
elastic_net_cv.fit(X_train_2, Y_train_2)

train_mse = mean_squared_error(Y_train_2, elastic_net_cv.predict(X_train_2))
val_mse = mean_squared_error(Y_valid, elastic_net_cv.predict(X_valid))

print('Train MSE: ', train_mse)
print('Validation MSE: ', val_mse)

Train MSE:  1.2562290011297357
Validation MSE:  1.3318898764664076


In [None]:
# use best parameters to train model
# use best params
model = HistGradientBoostingRegressor(**params)
# fit model
model.fit(X_train, Y_train)

train_mse = mean_squared_error(Y_train, model.predict(X_train))
print("train_mse: {}".format(train_mse))

test_booking_df = bookings_payment_df[bookings_payment_df['booking_id'].isin(test_data['booking_id'])]

# create X_test
test_booking_df = test_booking_df.sort_values(by=['booking_id'])
X_test = test_booking_df.drop(['booking_id'], axis=1)

Y_test_pred = model.predict(X_test)

# prepare submission file
submission = pd.DataFrame()
submission['booking_id'] = test_booking_df['booking_id']
submission['rating_score'] = Y_test_pred

# change ratings below 0 to 0 and above 5 to 5
submission['rating_score'] = submission['rating_score'].apply(lambda x: 1 if x < 1 else x)
submission['rating_score'] = submission['rating_score'].apply(lambda x: 5 if x > 5 else x)

submission.to_csv('HistGrad-best-something-5.csv', index=False)
submission.describe()

train_mse: 1.2664811461469232


Unnamed: 0,rating_score
count,49079.0
mean,4.089499
std,0.673513
min,1.0
25%,4.077994
50%,4.342336
75%,4.446725
max,4.986167
