In [30]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
import numpy as np
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')

In [31]:
train_data = pd.read_csv('train_data.csv')
test_data = pd.read_csv('sample_submission.csv')
bookings_data = pd.read_csv('bookings_data.csv')
bookings = pd.read_csv('bookings.csv')
hotel_data = pd.read_csv('hotels_data.csv')
customer_data = pd.read_csv('customer_data.csv')
payments_data = pd.read_csv('payments_data.csv')

In [32]:
# keep only entries with payment_sequential as 1
payments_data_unique = payments_data[payments_data['payment_sequential'] == 1]
payments_data_repeat = payments_data[payments_data['payment_sequential'] != 1]
payments_data_unique.describe(include='all')

Unnamed: 0,booking_id,payment_sequential,payment_type,payment_installments,payment_value
count,99360,99360.0,99360,99360.0,99360.0
unique,99360,,5,,
top,6f3fe1789b1e8b2acac839d17b81ef22,,credit_card,,
freq,1,,76476,,
mean,,1.0,,2.92964,158.336774
std,,0.0,,2.714947,220.511857
min,,1.0,,1.0,0.0
25%,,1.0,,1.0,59.9475
50%,,1.0,,2.0,103.33
75%,,1.0,,4.0,175.11


In [33]:
# sort payments_data_repeat by payment_sequential
payments_data_repeat = payments_data_repeat.sort_values(by=['payment_sequential'], ascending=True)
payments_data_repeat.describe(include='all')

Unnamed: 0,booking_id,payment_sequential,payment_type,payment_installments,payment_value
count,4526,4526.0,4526,4526.0,4526.0
unique,3039,,4,,
top,d1b0e818e3ccc5cb0e39231352fa65da,,voucher,,
freq,28,,4154,,
mean,,3.127265,,1.178524,61.098164
std,,2.670762,,0.989643,96.149183
min,,2.0,,0.0,0.0
25%,,2.0,,1.0,14.905
50%,,2.0,,1.0,31.92
75%,,3.0,,1.0,71.23


In [34]:
# making payment data unique for each booking_id by adding the payments made by other methods to primary payment method
for payment_data_repeat in payments_data_repeat.itertuples():
    booking_id = payment_data_repeat.booking_id
    payment_value_new = payments_data_unique[payments_data_unique['booking_id'] == booking_id]['payment_value'] + payment_data_repeat.payment_value
    payment_installments_new = payments_data_unique[payments_data_unique['booking_id'] == booking_id]['payment_installments'] + payment_data_repeat.payment_installments
    payments_data_unique.loc[payments_data_unique['booking_id'] == booking_id, 'payment_value'] = payment_value_new
    payments_data_unique.loc[payments_data_unique['booking_id'] == booking_id, 'payment_installments'] = payment_installments_new
    payments_data_unique.loc[payments_data_unique['booking_id'] == booking_id, 'payment_sequential'] = payment_data_repeat.payment_sequential

payments_data_unique.describe(include='all')

Unnamed: 0,booking_id,payment_sequential,payment_type,payment_installments,payment_value
count,99360,99360.0,99360,99360.0,99360.0
unique,99360,,5,,
top,6f3fe1789b1e8b2acac839d17b81ef22,,credit_card,,
freq,1,,76476,,
mean,,1.044726,,2.98131,160.984584
std,,0.381293,,2.741804,221.99805
min,,1.0,,1.0,0.0
25%,,1.0,,1.0,62.01
50%,,1.0,,2.0,105.29
75%,,1.0,,4.0,176.9325


In [35]:
payments_data_unique.drop(['payment_type'], axis=1, inplace=True)
table_1 = pd.merge(bookings_data,hotel_data, on='hotel_id', how='left')
table_1.drop(['hotel_id','seller_agent_id','hotel_category'], axis=1, inplace=True)

In [36]:
date_columns = ['booking_expiry_date']

for date_column in date_columns:
    # convert date to datetime
    table_1[date_column] = pd.to_datetime(table_1[date_column])
    # change the date time entries of column to number of seconds from base date
    table_1[date_column] = table_1[date_column].astype(np.int64) // 10**9  
    # take the average date as the start date
    start_date_seconds =table_1[date_column].mean()
    # subtract the start date from the date column
    table_1[date_column] = table_1[date_column] - start_date_seconds

print(table_1.dtypes)
table_1.describe(include='all')

booking_id                   object
booking_sequence_id           int64
booking_expiry_date         float64
price                       float64
agent_fees                  float64
hotel_name_length           float64
hotel_description_length    float64
hotel_photos_qty            float64
dtype: object


Unnamed: 0,booking_id,booking_sequence_id,booking_expiry_date,price,agent_fees,hotel_name_length,hotel_description_length,hotel_photos_qty
count,112650,112650.0,112650.0,112650.0,112650.0,111047.0,111047.0,111047.0
unique,98666,,,,,,,
top,f5f79c56e9e4120aec44ef8272b63d03,,,,,,,
freq,21,,,,,,,
mean,,1.197834,-5.719929e-08,120.653739,19.99032,48.775978,787.867029,2.209713
std,,0.705124,13201820.0,183.633928,15.806405,10.025581,652.135608,1.721438
min,,1.0,-41095280.0,0.85,0.0,5.0,4.0,1.0
25%,,1.0,-9398365.0,39.9,13.08,42.0,348.0,1.0
50%,,1.0,1635763.0,74.99,16.26,52.0,603.0,1.0
75%,,1.0,10623430.0,134.9,21.15,57.0,987.0,3.0


In [37]:
# split bookings_data into unique and repeat bookings
bookings_data_unique =table_1[table_1['booking_sequence_id'] == 1]
bookings_data_repeat =table_1[table_1['booking_sequence_id'] > 1]

bookings_data_unique.describe(include='all')

Unnamed: 0,booking_id,booking_sequence_id,booking_expiry_date,price,agent_fees,hotel_name_length,hotel_description_length,hotel_photos_qty
count,98666,98666.0,98666.0,98666.0,98666.0,97250.0,97250.0,97250.0
unique,98666,,,,,,,
top,242fe8c5a6d1ba2dd792cb1621400010,,,,,,,
freq,1,,,,,,,
mean,,1.0,-10478.85,125.964327,20.201927,48.846386,794.161398,2.250591
std,,0.0,13220970.0,191.375106,15.909873,9.999239,654.751953,1.747095
min,,1.0,-41095280.0,0.85,0.0,5.0,4.0,1.0
25%,,1.0,-9463282.0,41.505,13.31,42.0,349.0,1.0
50%,,1.0,1635331.0,79.0,16.36,52.0,607.0,2.0
75%,,1.0,10625610.0,139.9,21.23,57.0,996.0,3.0


In [38]:
# sort bookings_data_repeat by booking_sequence_id
bookings_data_repeat = bookings_data_repeat.sort_values(by=['booking_sequence_id'], ascending=True)
bookings_data_repeat.describe(include='all')

Unnamed: 0,booking_id,booking_sequence_id,booking_expiry_date,price,agent_fees,hotel_name_length,hotel_description_length,hotel_photos_qty
count,13984,13984.0,13984.0,13984.0,13984.0,13797.0,13797.0,13797.0
unique,9803,,,,,,,
top,f5f79c56e9e4120aec44ef8272b63d03,,,,,,,
freq,20,,,,,,,
mean,,2.593678,73934.95,83.184167,18.497299,48.279698,743.500326,1.921577
std,,1.334476,13066140.0,107.874536,14.97197,10.195912,631.63373,1.49715
min,,2.0,-41012720.0,0.85,0.0,6.0,8.0,1.0
25%,,2.0,-8767644.0,30.0,11.85,42.0,338.0,1.0
50%,,2.0,1901641.0,56.0,15.56,51.0,557.0,1.0
75%,,3.0,10605510.0,99.9,20.16,57.0,919.0,2.0


In [39]:
# merging bookings_data for each booking_id
for booking_data_repeat in bookings_data_repeat.itertuples():
    bookings_id = booking_data_repeat.booking_id
    booking_data_unique = bookings_data_unique[bookings_data_unique['booking_id'] == bookings_id]
    new_price = booking_data_repeat.price + booking_data_unique.price
    new_hlength = booking_data_repeat.hotel_name_length+booking_data_unique.hotel_name_length
    new_hdesc = booking_data_repeat.hotel_description_length+booking_data_unique.hotel_description_length
    new_hqty = booking_data_repeat.hotel_photos_qty+booking_data_unique.hotel_photos_qty 
    new_agent_fees = booking_data_repeat.agent_fees + booking_data_unique.agent_fees
    new_exp = booking_data_repeat.booking_expiry_date + booking_data_unique.booking_expiry_date
    bookings_data_unique.loc[bookings_data_unique['booking_id'] == bookings_id, 'booking_expiry_date'] = new_exp
    bookings_data_unique.loc[bookings_data_unique['booking_id'] == bookings_id, 'hotel_name_length'] = new_hlength
    bookings_data_unique.loc[bookings_data_unique['booking_id'] == bookings_id, 'hotel_description_length'] = new_hdesc
    bookings_data_unique.loc[bookings_data_unique['booking_id'] == bookings_id, 'hotel_photos_qty'] = new_hqty
    bookings_data_unique.loc[bookings_data_unique['booking_id'] == bookings_id, 'price'] = new_price
    bookings_data_unique.loc[bookings_data_unique['booking_id'] == bookings_id, 'agent_fees'] = new_agent_fees
    bookings_data_unique.loc[bookings_data_unique['booking_id'] == bookings_id, 'booking_sequence_id'] = booking_data_repeat.booking_sequence_id


In [40]:
columns = ['price', 'agent_fees', 'hotel_category', 'hotel_name_length', 'hotel_description_length', 'hotel_photos_qty']

for column in columns:
    bookings_data_unique[column] = bookings_data_unique[column] / bookings_data_unique['booking_sequence_id']

In [41]:
bookings_data_unique.describe(include='all')

Unnamed: 0,booking_id,booking_sequence_id,booking_expiry_date,price,agent_fees,hotel_name_length,hotel_description_length,hotel_photos_qty
count,98666,98666.0,98666.0,98666.0,98666.0,97215.0,97215.0,97215.0
unique,98666,,,,,,,
top,242fe8c5a6d1ba2dd792cb1621400010,,,,,,,
freq,1,,,,,,,
mean,,1.141731,-10355.93,137.754076,22.823562,48.843101,794.016654,2.249931
std,,0.538452,13221070.0,210.645145,21.650909,9.958101,652.914402,1.739707
min,,1.0,-41095280.0,0.85,0.0,5.0,4.0,1.0
25%,,1.0,-9463282.0,45.9,13.85,42.0,351.0,1.0
50%,,1.0,1635331.0,86.9,17.17,52.0,608.0,2.0
75%,,1.0,10625610.0,149.9,24.04,57.0,996.0,3.0


In [42]:
# merge bookings and bookings_data as bookings_df
bookings_df = pd.merge(bookings, bookings_data_unique, on='booking_id', how='left')

bookings_df.describe(include='all')

Unnamed: 0,booking_id,customer_id,booking_status,booking_create_timestamp,booking_approved_at,booking_checkin_customer_date,booking_sequence_id,booking_expiry_date,price,agent_fees,hotel_name_length,hotel_description_length,hotel_photos_qty
count,99441,99441,99441,99441,99281,96476,98666.0,98666.0,98666.0,98666.0,97215.0,97215.0,97215.0
unique,99441,99441,8,98875,90733,95664,,,,,,,
top,c54678b7cc49136f2d6af7e481f51cbd,51297304e76186b10a928d9ef432eb62,completed,2008-04-13 10:31:14,2008-03-01 04:14:10,2008-05-10 23:21:46,,,,,,,
freq,1,1,96478,3,9,3,,,,,,,
mean,,,,,,,1.141731,-10355.93,137.754076,22.823562,48.843101,794.016654,2.249931
std,,,,,,,0.538452,13221070.0,210.645145,21.650909,9.958101,652.914402,1.739707
min,,,,,,,1.0,-41095280.0,0.85,0.0,5.0,4.0,1.0
25%,,,,,,,1.0,-9463282.0,45.9,13.85,42.0,351.0,1.0
50%,,,,,,,1.0,1635331.0,86.9,17.17,52.0,608.0,2.0
75%,,,,,,,1.0,10625610.0,149.9,24.04,57.0,996.0,3.0


In [43]:
# merge bookings_df and customer_data as bookings_customer_df
table_2= pd.merge(bookings_df, customer_data, on='customer_id', how='left')

# merge bookings_hotel_df and payments_data as bookings_payment_df
bookings_payment_df = pd.merge(table_2, payments_data_unique, on='booking_id', how='left')

bookings_payment_df.drop(['customer_id'], axis=1, inplace=True)

bookings_payment_df.describe(include='all')

Unnamed: 0,booking_id,booking_status,booking_create_timestamp,booking_approved_at,booking_checkin_customer_date,booking_sequence_id,booking_expiry_date,price,agent_fees,hotel_name_length,hotel_description_length,hotel_photos_qty,customer_unique_id,country,payment_sequential,payment_installments,payment_value
count,99441,99441,99441,99281,96476,98666.0,98666.0,98666.0,98666.0,97215.0,97215.0,97215.0,99441,99441,99360.0,99360.0,99360.0
unique,99441,8,98875,90733,95664,,,,,,,,96096,9,,,
top,c54678b7cc49136f2d6af7e481f51cbd,completed,2008-04-13 10:31:14,2008-03-01 04:14:10,2008-05-10 23:21:46,,,,,,,,f50201ccdcedfb9e2ac84558d50f5ead,Slovakia,,,
freq,1,96478,3,9,3,,,,,,,,17,11212,,,
mean,,,,,,1.141731,-10355.93,137.754076,22.823562,48.843101,794.016654,2.249931,,,1.044726,2.98131,160.984584
std,,,,,,0.538452,13221070.0,210.645145,21.650909,9.958101,652.914402,1.739707,,,0.381293,2.741804,221.99805
min,,,,,,1.0,-41095280.0,0.85,0.0,5.0,4.0,1.0,,,1.0,1.0,0.0
25%,,,,,,1.0,-9463282.0,45.9,13.85,42.0,351.0,1.0,,,1.0,1.0,62.01
50%,,,,,,1.0,1635331.0,86.9,17.17,52.0,608.0,2.0,,,1.0,2.0,105.29
75%,,,,,,1.0,10625610.0,149.9,24.04,57.0,996.0,3.0,,,1.0,4.0,176.9325


In [44]:
date_columns = ['booking_create_timestamp','booking_approved_at','booking_checkin_customer_date' ]

for date_column in date_columns:
    # convert date to datetime
    bookings_payment_df[date_column] = pd.to_datetime(bookings_payment_df[date_column])
    # change the date time entries of column to number of seconds from base date
    bookings_payment_df[date_column] = bookings_payment_df[date_column].astype(np.int64) // 10**9  
    # take the average date as the start date
    start_date_seconds =bookings_payment_df[date_column].mean()
    # subtract the start date from the date column
    bookings_payment_df[date_column] = bookings_payment_df[date_column] - start_date_seconds

print(bookings_payment_df.dtypes)
bookings_payment_df.describe(include='all')

booking_id                        object
booking_status                    object
booking_create_timestamp         float64
booking_approved_at              float64
booking_checkin_customer_date    float64
booking_sequence_id              float64
booking_expiry_date              float64
price                            float64
agent_fees                       float64
hotel_name_length                float64
hotel_description_length         float64
hotel_photos_qty                 float64
customer_unique_id                object
country                           object
payment_sequential               float64
payment_installments             float64
payment_value                    float64
dtype: object


Unnamed: 0,booking_id,booking_status,booking_create_timestamp,booking_approved_at,booking_checkin_customer_date,booking_sequence_id,booking_expiry_date,price,agent_fees,hotel_name_length,hotel_description_length,hotel_photos_qty,customer_unique_id,country,payment_sequential,payment_installments,payment_value
count,99441,99441,99441.0,99441.0,99441.0,98666.0,98666.0,98666.0,98666.0,97215.0,97215.0,97215.0,99441,99441,99360.0,99360.0,99360.0
unique,99441,8,,,,,,,,,,,96096,9,,,
top,c54678b7cc49136f2d6af7e481f51cbd,completed,,,,,,,,,,,f50201ccdcedfb9e2ac84558d50f5ead,Slovakia,,,
freq,1,96478,,,,,,,,,,,17,11212,,,
mean,,,3.931805e-08,-1.265927e-08,5.524043e-08,1.141731,-10355.93,137.754076,22.823562,48.843101,794.016654,2.249931,,,1.044726,2.98131,160.984584
std,,,13277020.0,417956200.0,1772974000.0,0.538452,13221070.0,210.645145,21.650909,9.958101,652.914402,1.739707,,,0.381293,2.741804,221.99805
min,,,-41686070.0,-10405990000.0,-10113140000.0,1.0,-41095280.0,0.85,0.0,5.0,4.0,1.0,,,1.0,1.0,0.0
25%,,,-9482214.0,7234372.0,300029100.0,1.0,-9463282.0,45.9,13.85,42.0,351.0,1.0,,,1.0,1.0,62.01
50%,,,1606883.0,18350790.0,311940000.0,1.0,1635331.0,86.9,17.17,52.0,608.0,2.0,,,1.0,2.0,105.29
75%,,,10738740.0,27478660.0,321181900.0,1.0,10625610.0,149.9,24.04,57.0,996.0,3.0,,,1.0,4.0,176.9325


In [45]:
for booking_data_repeat in bookings_payment_df.itertuples():
    bookings_id = booking_data_repeat.booking_id
    booking_data_unique = bookings_payment_df[bookings_payment_df['booking_id'] == bookings_id]
    bookings_payment_df.loc[bookings_payment_df['booking_id'] == bookings_id, 'booking_create_timestamp'] = (booking_data_unique.booking_approved_at)-(booking_data_unique.booking_create_timestamp)
    bookings_payment_df.loc[bookings_payment_df['booking_id'] == bookings_id, 'booking_checkin_customer_date'] = (booking_data_unique.booking_expiry_date)-(booking_data_unique.booking_checkin_customer_date) 
bookings_payment_df.drop(['booking_approved_at','booking_expiry_date'], axis=1, inplace=True)
bookings_payment_df.describe(include='all')

Unnamed: 0,booking_id,booking_status,booking_create_timestamp,booking_checkin_customer_date,booking_sequence_id,price,agent_fees,hotel_name_length,hotel_description_length,hotel_photos_qty,customer_unique_id,country,payment_sequential,payment_installments,payment_value
count,99441,99441,99441.0,98666.0,98666.0,98666.0,98666.0,97215.0,97215.0,97215.0,99441,99441,99360.0,99360.0,99360.0
unique,99441,8,,,,,,,,,96096,9,,,
top,c54678b7cc49136f2d6af7e481f51cbd,completed,,,,,,,,,f50201ccdcedfb9e2ac84558d50f5ead,Slovakia,,,
freq,1,96478,,,,,,,,,17,11212,,,
mean,,,-5.570077e-08,-79446840.0,1.141731,137.754076,22.823562,48.843101,794.016654,2.249931,,,1.044726,2.98131,160.984584
std,,,417796100.0,1535109000.0,0.538452,210.645145,21.650909,9.958101,652.914402,1.739707,,,0.381293,2.741804,221.99805
min,,,-10431070000.0,-327981300.0,1.0,0.85,0.0,5.0,4.0,1.0,,,1.0,1.0,0.0
25%,,,16735400.0,-310951900.0,1.0,45.9,13.85,42.0,351.0,1.0,,,1.0,1.0,62.01
50%,,,16735860.0,-310559200.0,1.0,86.9,17.17,52.0,608.0,2.0,,,1.0,2.0,105.29
75%,,,16786960.0,-310236500.0,1.0,149.9,24.04,57.0,996.0,3.0,,,1.0,4.0,176.9325


In [46]:
cat_columns = ['booking_status', 'country', 'customer_unique_id']

for column in cat_columns:
    le = LabelEncoder()
    le.fit(bookings_payment_df[column])
    bookings_payment_df[column] = le.transform(bookings_payment_df[column])
    if column == 'booking_status' or column == 'country':
        bookings_payment_df[column] = bookings_payment_df[column] + 1

print(bookings_payment_df.dtypes)

booking_id                        object
booking_status                     int64
booking_create_timestamp         float64
booking_checkin_customer_date    float64
booking_sequence_id              float64
price                            float64
agent_fees                       float64
hotel_name_length                float64
hotel_description_length         float64
hotel_photos_qty                 float64
customer_unique_id                 int64
country                            int64
payment_sequential               float64
payment_installments             float64
payment_value                    float64
dtype: object


In [47]:
# take all columns
columns = bookings_payment_df.columns

# remove booking_id
columns = columns.drop(['booking_id'])

# change all null or nan values to mean of respective columns
for column in columns:
    mean = bookings_payment_df[column].mean()
    bookings_payment_df[column].fillna(mean, inplace=True)

In [48]:
bookings_payment_df.describe(include='all')

Unnamed: 0,booking_id,booking_status,booking_create_timestamp,booking_checkin_customer_date,booking_sequence_id,price,agent_fees,hotel_name_length,hotel_description_length,hotel_photos_qty,customer_unique_id,country,payment_sequential,payment_installments,payment_value
count,99441,99441.0,99441.0,99441.0,99441.0,99441.0,99441.0,99441.0,99441.0,99441.0,99441.0,99441.0,99441.0,99441.0,99441.0
unique,99441,,,,,,,,,,,,,,
top,c54678b7cc49136f2d6af7e481f51cbd,,,,,,,,,,,,,,
freq,1,,,,,,,,,,,,,,
mean,,3.076166,-5.570077e-08,-79446840.0,1.141731,137.754076,22.823562,48.843101,794.016654,2.249931,48049.895224,5.011484,1.044726,2.98131,160.984584
std,,0.561226,417796100.0,1529115000.0,0.53635,209.822693,21.566375,9.846012,645.565179,1.720124,27758.278975,2.580036,0.381138,2.740687,221.907616
min,,1.0,-10431070000.0,-327981300.0,1.0,0.85,0.0,5.0,4.0,1.0,0.0,1.0,1.0,1.0,0.0
25%,,3.0,16735400.0,-310946000.0,1.0,45.99,13.9,43.0,357.0,1.0,23986.0,3.0,1.0,1.0,62.01
50%,,3.0,16735860.0,-310554500.0,1.0,88.0,17.27,51.0,621.0,2.0,48053.0,5.0,1.0,2.0,105.37
75%,,3.0,16786960.0,-310231500.0,1.0,149.9,23.92,57.0,982.0,3.0,72088.0,7.0,1.0,4.0,176.86


In [49]:
# assert no null values
assert bookings_payment_df.isnull().sum().sum() == 0

In [50]:
# train_booking_df contains bookings_df with booking_id in train_data
train_booking_df = bookings_payment_df[bookings_payment_df['booking_id'].isin(train_data['booking_id'])]

# create X_train and Y_train
train_booking_df = train_booking_df.sort_values(by=['booking_id'])
X_train = train_booking_df.drop(['booking_id'], axis=1)
train_data = train_data.sort_values(by=['booking_id'])
# take only unique values
train_data = train_data.drop_duplicates(subset=['booking_id'])
Y_train = train_data['rating_score']

print(X_train.shape)
print(Y_train.shape)

(49868, 14)
(49868,)


In [51]:
# use gridsearch cv to find best parameters for HistGradientBoostingRegressor
param_grid = {
    'learning_rate': [0.05],
    'max_iter': [500],
    'max_leaf_nodes': [31],
    'max_depth': [7],
    'l2_regularization': [0.2],
    'early_stopping': [False],
    'validation_fraction': [0.2],
    'loss': ['squared_error']
}

grid = GridSearchCV(estimator=HistGradientBoostingRegressor(), param_grid=param_grid, cv=5, n_jobs=5, verbose=3)
grid.fit(X_train, Y_train)

print(grid.best_params_)
best_params = grid.best_params_

# split train to train and validation
X_train_2, X_val, Y_train_2, Y_val = train_test_split(X_train, Y_train, test_size=0.2, random_state=42)

# create model
model = HistGradientBoostingRegressor(**best_params)
# fit model
model.fit(X_train_2, Y_train_2)

train_mse = mean_squared_error(Y_train_2, model.predict(X_train_2))
val_mse = mean_squared_error(Y_val, model.predict(X_val))

print('Train MSE: ', train_mse)
print('Validation MSE: ', val_mse)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
{'early_stopping': False, 'l2_regularization': 0.2, 'learning_rate': 0.05, 'loss': 'squared_error', 'max_depth': 7, 'max_iter': 500, 'max_leaf_nodes': 31, 'validation_fraction': 0.2}
Train MSE:  1.1100834369073929
Validation MSE:  1.3511098130131414


In [52]:
# use best parameters to train model
# use best params
best_params = grid.best_params_
# create model
model = HistGradientBoostingRegressor(**best_params)
# fit model
model.fit(X_train, Y_train)

train_mse = mean_squared_error(Y_train, model.predict(X_train))
print("train_mse: {}".format(train_mse))

test_booking_df = bookings_payment_df[bookings_payment_df['booking_id'].isin(test_data['booking_id'])]

# create X_test
test_booking_df = test_booking_df.sort_values(by=['booking_id'])
X_test = test_booking_df.drop(['booking_id'], axis=1)

Y_test_pred = model.predict(X_test)

# prepare submission file
submission = pd.DataFrame()
submission['booking_id'] = test_booking_df['booking_id']
submission['rating_score'] = Y_test_pred

# change ratings below 0 to 0 and above 5 to 5
submission['rating_score'] = submission['rating_score'].apply(lambda x: 1 if x < 1 else x)
submission['rating_score'] = submission['rating_score'].apply(lambda x: 5 if x > 5 else x)

submission.to_csv('okay-something.csv', index=False)
submission.describe()

train_mse: 1.1409147109454039


Unnamed: 0,rating_score
count,49079.0
mean,4.089064
std,0.690904
min,1.0
25%,4.06354
50%,4.337441
75%,4.458963
max,5.0
