In [94]:
import pandas as pd
import numpy as np
import sklearn as sk
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
from lightgbm import LGBMRegressor

In [48]:
train_data = pd.read_csv('../data/train_data.csv')
test_data = pd.read_csv('../data/sample_submission.csv')
bookings_data = pd.read_csv('../data/bookings_data.csv')
bookings = pd.read_csv('../data/bookings.csv')
hotel_data = pd.read_csv('../data/hotels_data.csv')
customer_data = pd.read_csv('../data/customer_data.csv')
payments_data = pd.read_csv('../data/payments_data.csv')

In [49]:
# keep only entries with payment_sequential as 1
payments_data_unique = payments_data[payments_data['payment_sequential'] == 1]
payments_data_repeat = payments_data[payments_data['payment_sequential'] != 1]

In [50]:
# making payment data unique for each booking_id
for payment_data_repeat in payments_data_repeat.itertuples():
    booking_id = payment_data_repeat.booking_id
    payment_value_new = payments_data_unique[payments_data_unique['booking_id'] == booking_id]['payment_value'] + payment_data_repeat.payment_value
    payments_data_unique.loc[payments_data_unique['booking_id'] == booking_id, 'payment_value'] = payment_value_new

In [51]:
# train_data count
train_data_count = train_data.shape[0]
print('train_data count: ', train_data_count)

# test_data count
test_data_count = test_data.shape[0]
print('test_data count: ', test_data_count)

train_data count:  50000
test_data count:  49079


In [52]:
bookings_data_unique = bookings_data[bookings_data['booking_sequence_id'] == 1]
bookings_data_repeat = bookings_data[bookings_data['booking_sequence_id'] > 1]

In [53]:
# merging bookings_data for each booking_id
for booking_data_repeat in bookings_data_repeat.itertuples():
    bookings_id = booking_data_repeat.booking_id
    booking_data_unique = bookings_data_unique[bookings_data_unique['booking_id'] == bookings_id]
    new_price = booking_data_repeat.price + booking_data_unique.price
    new_agent_fees = booking_data_repeat.agent_fees + booking_data_unique.agent_fees
    bookings_data_unique.loc[bookings_data_unique['booking_id'] == bookings_id, 'price'] = new_price
    bookings_data_unique.loc[bookings_data_unique['booking_id'] == bookings_id, 'agent_fees'] = new_agent_fees

In [31]:
# merge bookings and bookings_data as bookings_df
bookings_df = pd.merge(bookings, bookings_data_unique, on='booking_id', how='left')

In [32]:
# merge bookings_df and customer_data as bookings_customer_df
bookings_customer_df = pd.merge(bookings_df, customer_data, on='customer_id', how='left')

# merge bookings_customer_merged and hotel_data as bookings_hotel_merged
bookings_hotel_df = pd.merge(bookings_customer_df, hotel_data, on='hotel_id', how='left')

bookings_hotel_df.drop(['customer_id', 'hotel_id', 'booking_sequence_id'], axis=1, inplace=True)

In [33]:
bookings_payment_df = pd.merge(bookings_hotel_df, payments_data_unique, on='booking_id', how='left')

bookings_payment_df.drop(['payment_sequential'], axis=1, inplace=True)

cat_columns = ['seller_agent_id', 'booking_status', 'country', 'payment_type', 'customer_unique_id']

for column in cat_columns:
    bookings_payment_df[column] = bookings_payment_df[column].astype('category')
    bookings_payment_df[column] = bookings_payment_df[column].cat.codes

print(bookings_payment_df.dtypes)

booking_id                        object
booking_status                      int8
booking_create_timestamp          object
booking_approved_at               object
booking_checkin_customer_date     object
seller_agent_id                    int16
booking_expiry_date               object
price                            float64
agent_fees                       float64
customer_unique_id                 int32
country                             int8
hotel_category                   float64
hotel_name_length                float64
hotel_description_length         float64
hotel_photos_qty                 float64
payment_type                        int8
payment_installments             float64
payment_value                    float64
dtype: object


In [34]:
date_columns = ['booking_create_timestamp', 'booking_approved_at', 'booking_checkin_customer_date','booking_expiry_date']

base_date = pd.to_datetime('2007-06-01')
base_minutes = base_date.value / 10**9 / 60
for date_column in date_columns:
    bookings_payment_df[date_column] = pd.to_datetime(bookings_payment_df[date_column])
    bookings_payment_df[date_column] = bookings_payment_df[date_column].apply(lambda x: (x.value / 10**9 / 60) - base_minutes)

print(bookings_payment_df.dtypes)
# bookings_payment_df.describe(include='all')

booking_id                        object
booking_status                      int8
booking_create_timestamp         float64
booking_approved_at              float64
booking_checkin_customer_date    float64
seller_agent_id                    int16
booking_expiry_date              float64
price                            float64
agent_fees                       float64
customer_unique_id                 int32
country                             int8
hotel_category                   float64
hotel_name_length                float64
hotel_description_length         float64
hotel_photos_qty                 float64
payment_type                        int8
payment_installments             float64
payment_value                    float64
dtype: object


In [35]:
# train_booking_df contains bookings_df with booking_id in train_data
train_booking_df = bookings_payment_df[bookings_payment_df['booking_id'].isin(train_data['booking_id'])]

# create X_train and Y_train
train_booking_df = train_booking_df.sort_values(by=['booking_id'])
X_train = train_booking_df.drop(['booking_id'], axis=1)
train_data = train_data.sort_values(by=['booking_id'])
# take only unique values
train_data = train_data.drop_duplicates(subset=['booking_id'])
Y_train = train_data['rating_score']

print(X_train.shape)
print(Y_train.shape)

(49868, 17)
(49868,)


In [101]:
X_train_actual = X_train[: int(0.8 * len(X_train))]
Y_train_actual = Y_train[: int(0.8 * len(Y_train))]
X_valid = X_train[int(0.8 * len(X_train)) :]
Y_valid = Y_train[int(0.8 * len(Y_train)) :]

  Y_train_actual = Y_train[: int(0.8 * len(Y_train))]
  Y_valid = Y_train[int(0.8 * len(Y_train)) :]


In [102]:
regressor1 = CatBoostRegressor(iterations=1000, learning_rate=0.08, depth=12, loss_function='RMSE', eval_metric='RMSE', random_seed=49, od_type='Iter', od_wait=100)
regressor2 = XGBRegressor(n_estimators=1000, learning_rate=0.08, max_depth=12, random_state=49, n_jobs=-1, objective='reg:squarederror', eval_metric='rmse')
regressor3 = LGBMRegressor(n_estimators=1000, learning_rate=0.08, max_depth=12, random_state=49, n_jobs=-1, objective='regression', metric='rmse')

regressor1.fit(X_train_actual, Y_train_actual, eval_set=(X_valid, Y_valid), use_best_model=True, verbose=100)
regressor2.fit(X_train_actual, Y_train_actual, eval_set=[(X_valid, Y_valid)], early_stopping_rounds=100, verbose=100)
regressor3.fit(X_train_actual, Y_train_actual, eval_set=[(X_valid, Y_valid)], early_stopping_rounds=100, verbose=100)


0:	learn: 1.3328133	test: 1.3324366	best: 1.3324366 (0)	total: 71.5ms	remaining: 1m 11s
100:	learn: 1.1104983	test: 1.2242339	best: 1.2241424 (89)	total: 6.52s	remaining: 58.1s
200:	learn: 1.0059021	test: 1.2187004	best: 1.2186467 (195)	total: 12.8s	remaining: 50.8s
300:	learn: 0.9134255	test: 1.2187568	best: 1.2181422 (257)	total: 19.1s	remaining: 44.4s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 1.21814217
bestIteration = 257

Shrink model to first 258 iterations.
[0]	validation_0-rmse:3.56207




[100]	validation_0-rmse:1.20606
[163]	validation_0-rmse:1.20947




[100]	valid_0's rmse: 1.20131
[200]	valid_0's rmse: 1.19789
[300]	valid_0's rmse: 1.19803


In [103]:
Y_train_pred1 = regressor1.predict(X_train)
Y_valid_pred1 = regressor1.predict(X_valid)

Y_train_pred2 = regressor2.predict(X_train)
Y_valid_pred2 = regressor2.predict(X_valid)

Y_train_pred3 = regressor3.predict(X_train)
Y_valid_pred3 = regressor3.predict(X_valid)

Y_train_pred = (Y_train_pred1 + Y_train_pred2 + Y_train_pred3) / 3
Y_valid_pred = (Y_valid_pred1 + Y_valid_pred2 + Y_valid_pred3) / 3

# calculate RMSE
rmse = mean_squared_error(Y_train, Y_train_pred)
print("RMSE: %f" % (rmse))

rmse_valid = mean_squared_error(Y_valid, Y_valid_pred)
print("RMSE: %f" % (rmse_valid))

RMSE: 1.037394
RMSE: 1.430192


In [104]:
test_booking_df = bookings_payment_df[bookings_payment_df['booking_id'].isin(test_data['booking_id'])]

# create X_test
test_booking_df = test_booking_df.sort_values(by=['booking_id'])
X_test = test_booking_df.drop(['booking_id'], axis=1)

Y_test_pred = regressor1.predict(X_test)
Y_test_pred2 = regressor2.predict(X_test)
Y_test_pred3 = regressor3.predict(X_test)

Y_test_pred = (Y_test_pred + Y_test_pred2 + Y_test_pred3) / 3

# prepare submission file
submission = pd.DataFrame()
submission['booking_id'] = test_booking_df['booking_id']
submission['rating_score'] = Y_test_pred

# change ratings below 0 to 0 and above 5 to 5
submission['rating_score'] = submission['rating_score'].apply(lambda x: 1 if x < 1 else x)
submission['rating_score'] = submission['rating_score'].apply(lambda x: 5 if x > 5 else x)

submission.to_csv('submission_1.csv', index=False)
submission.describe()

Unnamed: 0,rating_score
count,49079.0
mean,4.083279
std,0.574748
min,1.0
25%,4.043376
50%,4.251229
75%,4.377894
max,4.904847
