In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
import numpy as np
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load every raw table from the shared ../data directory.
# train_data supplies booking_id / rating_score pairs used as labels further down.
train_data = pd.read_csv('../data/train_data.csv')
# The sample submission doubles as the test set: only its booking_id column is
# read later, to select the rows to predict.
test_data = pd.read_csv('../data/sample_submission.csv')
# Booking line items (keyed by booking_id + booking_sequence_id).
bookings_data = pd.read_csv('../data/bookings_data.csv')
# One record per booking; joined to the other tables on booking_id / customer_id.
bookings = pd.read_csv('../data/bookings.csv')
# Hotel attributes, joined on hotel_id.
hotel_data = pd.read_csv('../data/hotels_data.csv')
# Customer attributes, joined on customer_id.
customer_data = pd.read_csv('../data/customer_data.csv')
# Payment records (keyed by booking_id + payment_sequential).
payments_data = pd.read_csv('../data/payments_data.csv')

In [None]:
# Replace the payment_type labels with integer codes.
# LabelEncoder numbers classes from 0, so shift by one to keep every code >= 1.
le = LabelEncoder()
payments_data['payment_type'] = le.fit_transform(payments_data['payment_type']) + 1

payments_data.describe(include='all')

In [None]:
# Split payments into the primary record of each booking (payment_sequential == 1)
# and the follow-up records (payment_sequential > 1).
# Both results are taken as explicit copies: later cells assign into
# payments_data_unique, and writing into a filtered slice is ambiguous
# (SettingWithCopy) - the global warnings filter would hide that warning.
payments_data_unique = payments_data[payments_data['payment_sequential'] == 1].copy()
payments_data_repeat = payments_data[payments_data['payment_sequential'] > 1].copy()

payments_data_unique.describe(include='all')

In [None]:
# Order the follow-up payments by their sequence number (lowest first).
payments_data_repeat = payments_data_repeat.sort_values('payment_sequential')

payments_data_repeat.describe(include='all')

In [None]:
# Collapse payments to one row per booking_id: the values of every follow-up
# payment are added onto the booking's primary (payment_sequential == 1) row,
# and payment_sequential is set to the highest sequence number seen so the next
# cell can turn the sums into averages.
columns = ['payment_value', 'payment_installments', 'payment_type']

# Work on an explicit copy so the assignments below never target a filtered slice.
payments_data_unique = payments_data_unique.copy()

# Aggregate the follow-up rows once per booking instead of rescanning the whole
# frame for every follow-up row.
# Note: groupby sum skips missing values rather than propagating them.
repeat_groups = payments_data_repeat.groupby('booking_id')
repeat_totals = repeat_groups[columns].sum()
repeat_last_sequential = repeat_groups['payment_sequential'].max()

# Only bookings that actually have follow-up payments are touched; follow-up
# rows without a primary row have nothing to be added to and are ignored.
has_repeat = payments_data_unique['booking_id'].isin(repeat_totals.index)
repeat_booking_ids = payments_data_unique.loc[has_repeat, 'booking_id']

for column in columns:
    payments_data_unique.loc[has_repeat, column] = (
        payments_data_unique.loc[has_repeat, column]
        + repeat_booking_ids.map(repeat_totals[column])
    )
payments_data_unique.loc[has_repeat, 'payment_sequential'] = repeat_booking_ids.map(repeat_last_sequential)

payments_data_unique.describe(include='all')

In [None]:
# Turn the per-booking sums into averages by dividing each summed column by
# the booking's payment_sequential value.
payment_count = payments_data_unique['payment_sequential']
payments_data_unique = payments_data_unique.assign(
    **{name: payments_data_unique[name] / payment_count for name in columns}
)

payments_data_unique.describe(include='all')

In [None]:
# Summary statistics of the raw booking line items (all dtypes included).
bookings_data.describe(include='all')

In [None]:
# Attach the hotel attributes to every booking line item; line items whose
# hotel_id has no match in hotel_data are kept (left join).
bookings_data_new = pd.merge(bookings_data, hotel_data, how='left', on='hotel_id')

bookings_data_new.describe(include='all')

In [None]:
# Split booking line items into the first item of each booking
# (booking_sequence_id == 1) and the additional items (booking_sequence_id > 1).
# Both results are taken as explicit copies: later cells assign into
# bookings_data_unique, and writing into a filtered slice is ambiguous
# (SettingWithCopy) - the global warnings filter would hide that warning.
bookings_data_unique = bookings_data_new[bookings_data_new['booking_sequence_id'] == 1].copy()
bookings_data_repeat = bookings_data_new[bookings_data_new['booking_sequence_id'] > 1].copy()

bookings_data_unique.describe(include='all')

In [None]:
# Order the additional line items by their sequence number (lowest first).
bookings_data_repeat = bookings_data_repeat.sort_values('booking_sequence_id')

bookings_data_repeat.describe(include='all')

In [None]:
# Collapse booking line items to one row per booking_id: the values of every
# additional item are added onto the booking's first (booking_sequence_id == 1)
# row, and booking_sequence_id is overwritten with the sequence number of the
# item just merged (items arrive in ascending order, so the last one wins).
columns = ['price', 'agent_fees', 'hotel_category', 'hotel_name_length', 'hotel_description_length', 'hotel_photos_qty', 'booking_expiry_date']

# Work on an explicit copy so the assignments below never target a filtered slice.
bookings_data_unique = bookings_data_unique.copy()

# NOTE(review): 'booking_expiry_date' is accumulated with `+` here but is only
# parsed with pd.to_datetime in a later cell - confirm it is numeric at this
# point, otherwise `+` concatenates strings. The row-wise update is kept
# (rather than a groupby sum) so the per-element `+` semantics stay unchanged.
for booking_data_repeat in bookings_data_repeat.itertuples():
    # Build the row mask once per item instead of once per column.
    same_booking = bookings_data_unique['booking_id'] == booking_data_repeat.booking_id
    if not same_booking.any():
        # No first-item row to add onto; nothing to update.
        continue
    for column in columns:
        bookings_data_unique.loc[same_booking, column] = (
            bookings_data_unique.loc[same_booking, column] + getattr(booking_data_repeat, column)
        )
    bookings_data_unique.loc[same_booking, 'booking_sequence_id'] = booking_data_repeat.booking_sequence_id

In [None]:
# Summary statistics of the per-booking rows after the additional items were merged in.
bookings_data_unique.describe(include='all')

In [None]:
# Convert the summed hotel / expiry columns into per-item averages by dividing
# by booking_sequence_id. 'price' and 'agent_fees' are not in this list, so
# they are left as they are.
columns = ['hotel_category', 'hotel_name_length', 'hotel_description_length', 'hotel_photos_qty', 'booking_expiry_date']

sequence_count = bookings_data_unique['booking_sequence_id']
bookings_data_unique = bookings_data_unique.assign(
    **{name: bookings_data_unique[name] / sequence_count for name in columns}
)

bookings_data_unique.describe(include='all')

In [None]:
# Summary statistics of the raw bookings table (all dtypes included).
bookings.describe(include='all')

In [None]:
# Join the per-booking aggregates onto the bookings table; bookings without a
# matching aggregate row are kept (left join on booking_id).
bookings_df = bookings.merge(bookings_data_unique, how='left', on='booking_id')

bookings_df.describe(include='all')

In [None]:
# Add the customer attributes (left join on customer_id).
bookings_customer_df = bookings_df.merge(customer_data, how='left', on='customer_id')

# Add the per-booking payment aggregates (left join on booking_id), then drop
# customer_id, which was only needed as the join key above.
bookings_payment_df = (
    bookings_customer_df
    .merge(payments_data_unique, how='left', on='booking_id')
    .drop(columns=['customer_id'])
)

bookings_payment_df.describe(include='all')

In [None]:
# Replace each categorical / identifier column with integer codes, using a
# fresh LabelEncoder per column.
cat_columns = ['seller_agent_id', 'booking_status', 'country', 'customer_unique_id', 'hotel_id']

# Columns whose codes are shifted by one so they start at 1 instead of 0.
one_based_columns = ('booking_status', 'country')

for column in cat_columns:
    le = LabelEncoder()
    encoded = le.fit_transform(bookings_payment_df[column])
    bookings_payment_df[column] = (encoded + 1) if column in one_based_columns else encoded

print(bookings_payment_df.dtypes)

In [None]:
# Parse the timestamp columns, then replace two of them with durations in seconds.
date_columns = ['booking_create_timestamp', 'booking_approved_at', 'booking_checkin_customer_date', 'booking_expiry_date']

bookings_payment_df = bookings_payment_df.assign(
    **{name: pd.to_datetime(bookings_payment_df[name]) for name in date_columns}
)

bookings_payment_df = bookings_payment_df.assign(
    # booking_approved_at becomes: approval time minus creation time
    booking_approved_at=lambda df: (
        df['booking_approved_at'] - df['booking_create_timestamp']
    ).dt.total_seconds(),
    # booking_expiry_date becomes: expiry time minus customer check-in time
    booking_expiry_date=lambda df: (
        df['booking_expiry_date'] - df['booking_checkin_customer_date']
    ).dt.total_seconds(),
)

print(bookings_payment_df.dtypes)
bookings_payment_df.describe(include='all')

In [None]:
# The raw creation / check-in timestamps were only needed to derive the
# durations above; remove them from the feature table.
bookings_payment_df = bookings_payment_df.drop(columns=['booking_create_timestamp', 'booking_checkin_customer_date'])

In [None]:
# Impute missing values: every column except the booking_id key gets its
# NaNs replaced by that column's mean.
columns = bookings_payment_df.columns.drop(['booking_id'])

for column in columns:
    # Assign the filled Series back to the frame. Calling
    # `df[column].fillna(..., inplace=True)` operates on a temporary column
    # selection and does not update the frame under pandas Copy-on-Write
    # (and the deprecation warning is hidden by the global warnings filter).
    bookings_payment_df[column] = bookings_payment_df[column].fillna(bookings_payment_df[column].mean())

In [None]:
# Show the dtype of every column of the feature table after imputation.
print(bookings_payment_df.dtypes)

In [None]:
# Summary statistics of the feature table after imputation.
bookings_payment_df.describe(include='all')

In [None]:
# Standardise the two duration features (seconds) with a StandardScaler that
# is fitted on the full feature table.
date_columns = ['booking_approved_at', 'booking_expiry_date']

scaler = StandardScaler()
scaled_columns = scaler.fit_transform(bookings_payment_df[date_columns])
bookings_payment_df[date_columns] = scaled_columns

bookings_payment_df.describe(include='all')

In [None]:
# Guard: the feature table must not contain a single missing value from here on.
assert not bookings_payment_df.isnull().any().any()

In [None]:
# Labels: one rating per booking_id, ordered by booking_id.
train_data = train_data.sort_values(by=['booking_id']).drop_duplicates(subset=['booking_id'])

# train_booking_df contains the feature rows whose booking_id appears in train_data
train_booking_df = bookings_payment_df[bookings_payment_df['booking_id'].isin(train_data['booking_id'])]
train_booking_df = train_booking_df.sort_values(by=['booking_id'])

# Pair features and labels through an explicit join on booking_id rather than
# by sorted position, so each feature row is guaranteed to carry its own
# rating even if a booking_id is missing from, or repeated in, the feature
# table. Labels without a feature row are dropped by the inner join.
train_labelled_df = train_booking_df.merge(
    train_data[['booking_id', 'rating_score']], on='booking_id', how='inner'
)

# create X_train and Y_train
X_train = train_labelled_df.drop(['booking_id', 'rating_score'], axis=1)
Y_train = train_labelled_df['rating_score']

print(X_train.shape)
print(Y_train.shape)

In [None]:
# Exhaustive grid search over HistGradientBoostingRegressor hyper-parameters
# with 5-fold cross-validation.
# Grid size: 7 * 5 * 3 * 5 * 3 * 2 = 3150 candidates, i.e. 15750 fits.
param_grid = {
    'learning_rate': [0.03, 0.04, 0.05, 0.06, 0.07, 0.1, 0.2],
    'max_iter': [300, 400, 500, 600, 700],
    'max_leaf_nodes': [15, 31, 63],
    'max_depth': [3, 5, 7, 9, 11],
    'l2_regularization': [0.1, 0.2, 0.3],
    'early_stopping': [True, False],
    'validation_fraction': [0.2],
    'loss': ['squared_error'],
    # Fixed seed: with early_stopping=True the estimator holds out a random
    # validation split, so without a seed the search is not reproducible.
    # Passing it through the grid also puts it into best_params_, so models
    # rebuilt from those parameters are seeded as well.
    'random_state': [42]
}

grid = GridSearchCV(estimator=HistGradientBoostingRegressor(), param_grid=param_grid, cv=5, n_jobs=5, verbose=3)
grid.fit(X_train, Y_train)

print(grid.best_params_)

In [None]:
# Sanity-check the best grid-search parameters on a fixed 80/20 hold-out split.
params = grid.best_params_

X_train_2, X_valid, Y_train_2, Y_valid = train_test_split(
    X_train, Y_train, test_size=0.2, random_state=42
)

# Despite its name, this is a HistGradientBoostingRegressor built from the
# best parameters; fit() returns the fitted estimator itself.
elastic_net_cv = HistGradientBoostingRegressor(**params).fit(X_train_2, Y_train_2)

# Mean squared error on the fitted part and on the held-out part.
train_predictions = elastic_net_cv.predict(X_train_2)
valid_predictions = elastic_net_cv.predict(X_valid)
train_mse = mean_squared_error(Y_train_2, train_predictions)
val_mse = mean_squared_error(Y_valid, valid_predictions)

print('Train MSE: ', train_mse)
print('Validation MSE: ', val_mse)

In [None]:
# Refit on the complete training set with the best parameters and report the
# in-sample error.
model = HistGradientBoostingRegressor(**params)
model.fit(X_train, Y_train)

train_predictions = model.predict(X_train)
train_mse = mean_squared_error(Y_train, train_predictions)
print("train_mse: {}".format(train_mse))

# Feature rows of the bookings listed in the test file, ordered by booking_id.
is_test_booking = bookings_payment_df['booking_id'].isin(test_data['booking_id'])
test_booking_df = bookings_payment_df[is_test_booking]
test_booking_df = test_booking_df.sort_values(by=['booking_id'])

# create X_test
X_test = test_booking_df.drop(['booking_id'], axis=1)
Y_test_pred = model.predict(X_test)

# prepare submission file: booking_id alongside its predicted rating
submission = pd.DataFrame({
    'booking_id': test_booking_df['booking_id'],
    'rating_score': Y_test_pred,
})

# clamp predictions to the range [1, 5]: below 1 becomes 1, above 5 becomes 5
submission['rating_score'] = submission['rating_score'].apply(lambda score: 1 if score < 1 else score)
submission['rating_score'] = submission['rating_score'].apply(lambda score: 5 if score > 5 else score)

submission.to_csv('HistGrad-best-something-3.csv', index=False)
submission.describe()