In [175]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
import numpy as np
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor, VotingRegressor, BaggingRegressor
from sklearn.model_selection import GridSearchCV

import warnings
warnings.filterwarnings('ignore')

In [158]:
train_data = pd.read_csv('../data/train_data.csv')
test_data = pd.read_csv('../data/sample_submission.csv')
bookings_data = pd.read_csv('../data/bookings_data.csv')
bookings = pd.read_csv('../data/bookings.csv')
hotel_data = pd.read_csv('../data/hotels_data.csv')
customer_data = pd.read_csv('../data/customer_data.csv')
payments_data = pd.read_csv('../data/payments_data.csv')

In [159]:
# convert payment type to numeric using sk preprocessing label encoder
le = LabelEncoder()
le.fit(payments_data['payment_type'])
payments_data['payment_type'] = le.transform(payments_data['payment_type'])

# add 1 to the payment type to avoid 0 values
payments_data['payment_type'] = payments_data['payment_type'] + 1

payments_data.describe(include='all')

Unnamed: 0,booking_id,payment_sequential,payment_type,payment_installments,payment_value
count,103886,103886.0,103886.0,103886.0,103886.0
unique,99440,,,,
top,d1b0e818e3ccc5cb0e39231352fa65da,,,,
freq,29,,,,
mean,,1.092679,1.618043,2.853349,154.10038
std,,0.706584,1.133229,2.687051,217.494064
min,,1.0,1.0,0.0,0.0
25%,,1.0,1.0,1.0,56.79
50%,,1.0,1.0,1.0,100.0
75%,,1.0,2.0,4.0,171.8375


In [160]:
# keep only entries with payment_sequential as 1
payments_data_unique = payments_data[payments_data['payment_sequential'] == 1]
payments_data_repeat = payments_data[payments_data['payment_sequential'] != 1]

payments_data_unique.describe(include='all')

Unnamed: 0,booking_id,payment_sequential,payment_type,payment_installments,payment_value
count,99360,99360.0,99360.0,99360.0,99360.0
unique,99360,,,,
top,6f3fe1789b1e8b2acac839d17b81ef22,,,,
freq,1,,,,
mean,,1.0,1.478422,2.92964,158.336774
std,,0.0,0.918534,2.714947,220.511857
min,,1.0,1.0,1.0,0.0
25%,,1.0,1.0,1.0,59.9475
50%,,1.0,1.0,2.0,103.33
75%,,1.0,1.0,4.0,175.11


In [161]:
# sort payments_data_repeat by payment_sequential
payments_data_repeat = payments_data_repeat.sort_values(by=['payment_sequential'], ascending=True)

payments_data_repeat.describe(include='all')

Unnamed: 0,booking_id,payment_sequential,payment_type,payment_installments,payment_value
count,4526,4526.0,4526.0,4526.0,4526.0
unique,3039,,,,
top,d1b0e818e3ccc5cb0e39231352fa65da,,,,
freq,28,,,,
mean,,3.127265,4.683164,1.178524,61.098164
std,,2.670762,1.063888,0.989643,96.149183
min,,2.0,1.0,0.0,0.0
25%,,2.0,5.0,1.0,14.905
50%,,2.0,5.0,1.0,31.92
75%,,3.0,5.0,1.0,71.23


In [162]:
# making payment data unique for each booking_id by adding the payments made by other methods to primary payment method
for payment_data_repeat in payments_data_repeat.itertuples():
    booking_id = payment_data_repeat.booking_id
    payment_value_new = payments_data_unique[payments_data_unique['booking_id'] == booking_id]['payment_value'] + payment_data_repeat.payment_value
    payment_installments_new = payments_data_unique[payments_data_unique['booking_id'] == booking_id]['payment_installments'] + payment_data_repeat.payment_installments
    payments_data_unique.loc[payments_data_unique['booking_id'] == booking_id, 'payment_value'] = payment_value_new
    payments_data_unique.loc[payments_data_unique['booking_id'] == booking_id, 'payment_installments'] = payment_installments_new
    payments_data_unique.loc[payments_data_unique['booking_id'] == booking_id, 'payment_sequential'] = payment_data_repeat.payment_sequential

payments_data_unique.describe(include='all')

Unnamed: 0,booking_id,payment_sequential,payment_type,payment_installments,payment_value
count,99360,99360.0,99360.0,99360.0,99360.0
unique,99360,,,,
top,6f3fe1789b1e8b2acac839d17b81ef22,,,,
freq,1,,,,
mean,,1.044726,1.478422,2.98131,160.984584
std,,0.381293,0.918534,2.741804,221.99805
min,,1.0,1.0,1.0,0.0
25%,,1.0,1.0,1.0,62.01
50%,,1.0,1.0,2.0,105.29
75%,,1.0,1.0,4.0,176.9325


In [163]:
# split bookings_data into unique and repeat bookings
bookings_data_unique = bookings_data[bookings_data['booking_sequence_id'] == 1]
bookings_data_repeat = bookings_data[bookings_data['booking_sequence_id'] > 1]

bookings_data_unique.describe(include='all')

Unnamed: 0,booking_id,booking_sequence_id,hotel_id,seller_agent_id,booking_expiry_date,price,agent_fees
count,98666,98666.0,98666,98666,98666,98666.0,98666.0
unique,98666,,31881,3088,93001,,
top,242fe8c5a6d1ba2dd792cb1621400010,,856965c36a24e339b605899a4788cb24,19b47992c3666cc44a7e94c06560211a,2017-12-07 04:39:05,,
freq,1,,439,1844,6,,
mean,,1.0,,,,125.964327,20.201927
std,,0.0,,,,191.375106,15.909873
min,,1.0,,,,0.85,0.0
25%,,1.0,,,,41.505,13.31
50%,,1.0,,,,79.0,16.36
75%,,1.0,,,,139.9,21.23


In [164]:
# sort bookings_data_repeat by booking_sequence_id
bookings_data_repeat = bookings_data_repeat.sort_values(by=['booking_sequence_id'], ascending=True)

bookings_data_repeat.describe(include='all')

Unnamed: 0,booking_id,booking_sequence_id,hotel_id,seller_agent_id,booking_expiry_date,price,agent_fees
count,13984,13984.0,13984,13984,13984,13984.0,13984.0
unique,9803,,6593,1548,9828,,
top,f5f79c56e9e4120aec44ef8272b63d03,,46682990de24d770e7f83d422879e10f,0176fa81dab994f90235231001f50f92,2017-07-21 18:25:23,,
freq,20,,144,542,20,,
mean,,2.593678,,,,83.184167,18.497299
std,,1.334476,,,,107.874536,14.97197
min,,2.0,,,,0.85,0.0
25%,,2.0,,,,30.0,11.85
50%,,2.0,,,,56.0,15.56
75%,,3.0,,,,99.9,20.16


In [165]:
# merging bookings_data for each booking_id
for booking_data_repeat in bookings_data_repeat.itertuples():
    bookings_id = booking_data_repeat.booking_id
    booking_data_unique = bookings_data_unique[bookings_data_unique['booking_id'] == bookings_id]
    new_price = booking_data_repeat.price + booking_data_unique.price
    new_agent_fees = booking_data_repeat.agent_fees + booking_data_unique.agent_fees
    bookings_data_unique.loc[bookings_data_unique['booking_id'] == bookings_id, 'price'] = new_price
    bookings_data_unique.loc[bookings_data_unique['booking_id'] == bookings_id, 'agent_fees'] = new_agent_fees
    bookings_data_unique.loc[bookings_data_unique['booking_id'] == bookings_id, 'booking_sequence_id'] = booking_data_repeat.booking_sequence_id

In [166]:
bookings_data_unique.describe(include='all')

Unnamed: 0,booking_id,booking_sequence_id,hotel_id,seller_agent_id,booking_expiry_date,price,agent_fees
count,98666,98666.0,98666,98666,98666,98666.0,98666.0
unique,98666,,31881,3088,93001,,
top,242fe8c5a6d1ba2dd792cb1621400010,,856965c36a24e339b605899a4788cb24,19b47992c3666cc44a7e94c06560211a,2017-12-07 04:39:05,,
freq,1,,439,1844,6,,
mean,,1.141731,,,,137.754076,22.823562
std,,0.538452,,,,210.645145,21.650909
min,,1.0,,,,0.85,0.0
25%,,1.0,,,,45.9,13.85
50%,,1.0,,,,86.9,17.17
75%,,1.0,,,,149.9,24.04


In [167]:
# merge bookings and bookings_data as bookings_df
bookings_df = pd.merge(bookings, bookings_data_unique, on='booking_id', how='left')

bookings_df.describe(include='all')

Unnamed: 0,booking_id,customer_id,booking_status,booking_create_timestamp,booking_approved_at,booking_checkin_customer_date,booking_sequence_id,hotel_id,seller_agent_id,booking_expiry_date,price,agent_fees
count,99441,99441,99441,99441,99281,96476,98666.0,98666,98666,98666,98666.0,98666.0
unique,99441,99441,8,98875,90733,95664,,31881,3088,93001,,
top,c54678b7cc49136f2d6af7e481f51cbd,51297304e76186b10a928d9ef432eb62,completed,2008-04-13 10:31:14,2008-03-01 04:14:10,2008-05-10 23:21:46,,856965c36a24e339b605899a4788cb24,19b47992c3666cc44a7e94c06560211a,2018-06-11 03:31:04,,
freq,1,1,96478,3,9,3,,439,1844,6,,
mean,,,,,,,1.141731,,,,137.754076,22.823562
std,,,,,,,0.538452,,,,210.645145,21.650909
min,,,,,,,1.0,,,,0.85,0.0
25%,,,,,,,1.0,,,,45.9,13.85
50%,,,,,,,1.0,,,,86.9,17.17
75%,,,,,,,1.0,,,,149.9,24.04


In [168]:
# merge bookings_df and customer_data as bookings_customer_df
bookings_customer_df = pd.merge(bookings_df, customer_data, on='customer_id', how='left')

# merge bookings_customer_df and hotel_data as bookings_hotel_df
bookings_hotel_df = pd.merge(bookings_customer_df, hotel_data, on='hotel_id', how='left')

# merge bookings_hotel_df and payments_data as bookings_payment_df
bookings_payment_df = pd.merge(bookings_hotel_df, payments_data_unique, on='booking_id', how='left')

bookings_payment_df.drop(['customer_id', 'hotel_id'], axis=1, inplace=True)

bookings_payment_df.describe(include='all')

Unnamed: 0,booking_id,booking_status,booking_create_timestamp,booking_approved_at,booking_checkin_customer_date,booking_sequence_id,seller_agent_id,booking_expiry_date,price,agent_fees,customer_unique_id,country,hotel_category,hotel_name_length,hotel_description_length,hotel_photos_qty,payment_sequential,payment_type,payment_installments,payment_value
count,99441,99441,99441,99281,96476,98666.0,98666,98666,98666.0,98666.0,99441,99441,97250.0,97250.0,97250.0,97250.0,99360.0,99360.0,99360.0,99360.0
unique,99441,8,98875,90733,95664,,3088,93001,,,96096,9,,,,,,,,
top,c54678b7cc49136f2d6af7e481f51cbd,completed,2008-04-13 10:31:14,2008-03-01 04:14:10,2008-05-10 23:21:46,,19b47992c3666cc44a7e94c06560211a,2018-06-11 03:31:04,,,f50201ccdcedfb9e2ac84558d50f5ead,Slovakia,,,,,,,,
freq,1,96478,3,9,3,,1844,6,,,17,11212,,,,,,,,
mean,,,,,,1.141731,,,137.754076,22.823562,,,28.856422,48.846386,794.161398,2.250591,1.044726,1.478422,2.98131,160.984584
std,,,,,,0.538452,,,210.645145,21.650909,,,22.666001,9.999239,654.751953,1.747095,0.381293,0.918534,2.741804,221.99805
min,,,,,,1.0,,,0.85,0.0,,,1.0,5.0,4.0,1.0,1.0,1.0,1.0,0.0
25%,,,,,,1.0,,,45.9,13.85,,,10.0,42.0,349.0,1.0,1.0,1.0,1.0,62.01
50%,,,,,,1.0,,,86.9,17.17,,,28.0,52.0,607.0,2.0,1.0,1.0,2.0,105.29
75%,,,,,,1.0,,,149.9,24.04,,,38.0,57.0,996.0,3.0,1.0,1.0,4.0,176.9325


In [169]:
cat_columns = ['seller_agent_id', 'booking_status', 'country', 'customer_unique_id']

for column in cat_columns:
    le = LabelEncoder()
    le.fit(bookings_payment_df[column])
    bookings_payment_df[column] = le.transform(bookings_payment_df[column])
    if column == 'booking_status' or column == 'country':
        bookings_payment_df[column] = bookings_payment_df[column] + 1

print(bookings_payment_df.dtypes)

booking_id                        object
booking_status                     int32
booking_create_timestamp          object
booking_approved_at               object
booking_checkin_customer_date     object
booking_sequence_id              float64
seller_agent_id                    int32
booking_expiry_date               object
price                            float64
agent_fees                       float64
customer_unique_id                 int32
country                            int32
hotel_category                   float64
hotel_name_length                float64
hotel_description_length         float64
hotel_photos_qty                 float64
payment_sequential               float64
payment_type                     float64
payment_installments             float64
payment_value                    float64
dtype: object


In [170]:
date_columns = ['booking_create_timestamp', 'booking_approved_at', 'booking_checkin_customer_date','booking_expiry_date']

for date_column in date_columns:
    # convert date to datetime
    bookings_payment_df[date_column] = pd.to_datetime(bookings_payment_df[date_column])
    # change the date time entries of column to number of seconds from base date
    bookings_payment_df[date_column] = bookings_payment_df[date_column].astype(np.int64) // 10**9  
    # take the average date as the start date
    start_date_seconds = bookings_payment_df[date_column].mean()
    # subtract the start date from the date column
    bookings_payment_df[date_column] = bookings_payment_df[date_column] - start_date_seconds

print(bookings_payment_df.dtypes)
bookings_payment_df.describe(include='all')

booking_id                        object
booking_status                     int32
booking_create_timestamp         float64
booking_approved_at              float64
booking_checkin_customer_date    float64
booking_sequence_id              float64
seller_agent_id                    int32
booking_expiry_date              float64
price                            float64
agent_fees                       float64
customer_unique_id                 int32
country                            int32
hotel_category                   float64
hotel_name_length                float64
hotel_description_length         float64
hotel_photos_qty                 float64
payment_sequential               float64
payment_type                     float64
payment_installments             float64
payment_value                    float64
dtype: object


Unnamed: 0,booking_id,booking_status,booking_create_timestamp,booking_approved_at,booking_checkin_customer_date,booking_sequence_id,seller_agent_id,booking_expiry_date,price,agent_fees,customer_unique_id,country,hotel_category,hotel_name_length,hotel_description_length,hotel_photos_qty,payment_sequential,payment_type,payment_installments,payment_value
count,99441,99441.0,99441.0,99441.0,99441.0,98666.0,99441.0,99441.0,98666.0,98666.0,99441.0,99441.0,97250.0,97250.0,97250.0,97250.0,99360.0,99360.0,99360.0,99360.0
unique,99441,,,,,,,,,,,,,,,,,,,
top,c54678b7cc49136f2d6af7e481f51cbd,,,,,,,,,,,,,,,,,,,
freq,1,,,,,,,,,,,,,,,,,,,
mean,,3.076166,3.931805e-08,-1.265927e-08,5.524043e-08,1.141731,1417.288995,-9.766816e-08,137.754076,22.823562,48049.895224,5.011484,28.856422,48.846386,794.161398,2.250591,1.044726,1.478422,2.98131,160.984584
std,,0.561226,13277020.0,417956200.0,1772974000.0,0.538452,935.610297,944420200.0,210.645145,21.650909,27758.278975,2.580036,22.666001,9.999239,654.751953,1.747095,0.381293,0.918534,2.741804,221.99805
min,,1.0,-41686070.0,-10405990000.0,-10113140000.0,1.0,0.0,-10655010000.0,0.85,0.0,0.0,1.0,1.0,5.0,4.0,1.0,1.0,1.0,1.0,0.0
25%,,3.0,-9482214.0,7234372.0,300029100.0,1.0,526.0,73843250.0,45.9,13.85,23986.0,3.0,10.0,42.0,349.0,1.0,1.0,1.0,1.0,62.01
50%,,3.0,1606883.0,18350790.0,311940000.0,1.0,1342.0,85235110.0,86.9,17.17,48053.0,5.0,28.0,52.0,607.0,2.0,1.0,1.0,2.0,105.29
75%,,3.0,10738740.0,27478660.0,321181900.0,1.0,2260.0,94308760.0,149.9,24.04,72088.0,7.0,38.0,57.0,996.0,3.0,1.0,1.0,4.0,176.9325


In [171]:
# take all columns
columns = bookings_payment_df.columns

# remove booking_id
columns = columns.drop(['booking_id'])

# change all null or nan values to mean of respective columns
for column in columns:
    mean = bookings_payment_df[column].mean()
    bookings_payment_df[column].fillna(mean, inplace=True)

In [172]:
bookings_payment_df.describe(include='all')

Unnamed: 0,booking_id,booking_status,booking_create_timestamp,booking_approved_at,booking_checkin_customer_date,booking_sequence_id,seller_agent_id,booking_expiry_date,price,agent_fees,customer_unique_id,country,hotel_category,hotel_name_length,hotel_description_length,hotel_photos_qty,payment_sequential,payment_type,payment_installments,payment_value
count,99441,99441.0,99441.0,99441.0,99441.0,99441.0,99441.0,99441.0,99441.0,99441.0,99441.0,99441.0,99441.0,99441.0,99441.0,99441.0,99441.0,99441.0,99441.0,99441.0
unique,99441,,,,,,,,,,,,,,,,,,,
top,c54678b7cc49136f2d6af7e481f51cbd,,,,,,,,,,,,,,,,,,,
freq,1,,,,,,,,,,,,,,,,,,,
mean,,3.076166,3.931805e-08,-1.265927e-08,5.524043e-08,1.141731,1417.288995,-9.766816e-08,137.754076,22.823562,48049.895224,5.011484,28.856422,48.846386,794.161398,2.250591,1.044726,1.478422,2.98131,160.984584
std,,0.561226,13277020.0,417956200.0,1772974000.0,0.53635,935.610297,944420200.0,209.822693,21.566375,27758.278975,2.580036,22.414906,9.888467,647.498575,1.727741,0.381138,0.91816,2.740687,221.907616
min,,1.0,-41686070.0,-10405990000.0,-10113140000.0,1.0,0.0,-10655010000.0,0.85,0.0,0.0,1.0,1.0,5.0,4.0,1.0,1.0,1.0,1.0,0.0
25%,,3.0,-9482214.0,7234372.0,300029100.0,1.0,526.0,73843250.0,45.99,13.9,23986.0,3.0,10.0,43.0,354.0,1.0,1.0,1.0,1.0,62.01
50%,,3.0,1606883.0,18350790.0,311940000.0,1.0,1342.0,85235110.0,88.0,17.27,48053.0,5.0,28.0,51.0,620.0,2.0,1.0,1.0,2.0,105.37
75%,,3.0,10738740.0,27478660.0,321181900.0,1.0,2260.0,94308760.0,149.9,23.92,72088.0,7.0,38.0,57.0,982.0,3.0,1.0,1.0,4.0,176.86


In [173]:
# assert no null values
assert bookings_payment_df.isnull().sum().sum() == 0

In [174]:
# train_booking_df contains bookings_df with booking_id in train_data
train_booking_df = bookings_payment_df[bookings_payment_df['booking_id'].isin(train_data['booking_id'])]

# create X_train and Y_train
train_booking_df = train_booking_df.sort_values(by=['booking_id'])
X_train = train_booking_df.drop(['booking_id'], axis=1)
train_data = train_data.sort_values(by=['booking_id'])
# take only unique values
train_data = train_data.drop_duplicates(subset=['booking_id'])
Y_train = train_data['rating_score']

print(X_train.shape)
print(Y_train.shape)

(49868, 19)
(49868,)


In [178]:
X_train_actual, Y_train_actual = X_train[:int(len(X_train)*0.8)], Y_train[:int(len(Y_train)*0.8)]
X_valid, Y_valid = X_train[int(len(X_train)*0.8):], Y_train[int(len(Y_train)*0.8):]

In [179]:
n_estimators = [100, 200, 300, 400, 500, 600]
max_depth = [8, 10, 12, 14, 16, 18, 20, None]

least_mse = np.inf
best_n_estimators, best_max_depth = 0, 0

for n_estimator in n_estimators:
    for depth in max_depth:
        model = RandomForestRegressor(n_estimators=n_estimator, max_depth=depth)
        model.fit(X_train_actual, Y_train_actual)
        Y_pred = model.predict(X_valid)
        mse = mean_squared_error(Y_valid, Y_pred)
        print("n_estimator: {}, max_depth: {}, mse: {}".format(n_estimator, depth, mse))
        if mse < least_mse:
            least_mse = mse
            best_n_estimators, best_max_depth = n_estimator, depth

print("Best n_estimators: ", best_n_estimators)
print("Best max_depth: ", best_max_depth)

n_estimator: 100, max_depth: 8, mse: 1.4649372326804329
n_estimator: 100, max_depth: 10, mse: 1.435723303454231
n_estimator: 100, max_depth: 12, mse: 1.4205560746646
n_estimator: 100, max_depth: 14, mse: 1.4148733189258773
n_estimator: 100, max_depth: 16, mse: 1.411884102678888
n_estimator: 100, max_depth: 18, mse: 1.410698083108185
n_estimator: 100, max_depth: 20, mse: 1.419144601246742
n_estimator: 100, max_depth: None, mse: 1.4418387908562262
n_estimator: 200, max_depth: 8, mse: 1.4654824283539998
n_estimator: 200, max_depth: 10, mse: 1.433081291250664
n_estimator: 200, max_depth: 12, mse: 1.4184253184633793
n_estimator: 200, max_depth: 14, mse: 1.4101366630341823
n_estimator: 200, max_depth: 16, mse: 1.408267276284793
n_estimator: 200, max_depth: 18, mse: 1.406602178899228
n_estimator: 200, max_depth: 20, mse: 1.4125374324036264
n_estimator: 200, max_depth: None, mse: 1.4339586224182874
n_estimator: 300, max_depth: 8, mse: 1.466053244131602
n_estimator: 300, max_depth: 10, mse: 1.4

In [187]:
print("Best parameters found!")
print("n_estimators = ", best_n_estimators)
print("max_depth = ", best_max_depth)
print("mse = ", least_mse)

best_max_depth = 18
best_n_estimators = 400

model = RandomForestRegressor(n_estimators=best_n_estimators, max_depth=best_max_depth)
model.fit(X_train, Y_train)

train_mse = mean_squared_error(Y_train, model.predict(X_train))
print("train_mse: {}".format(train_mse))

test_booking_df = bookings_payment_df[bookings_payment_df['booking_id'].isin(test_data['booking_id'])]

# create X_test
test_booking_df = test_booking_df.sort_values(by=['booking_id'])
X_test = test_booking_df.drop(['booking_id'], axis=1)

Y_test_pred = model.predict(X_test)

# prepare submission file
submission = pd.DataFrame()
submission['booking_id'] = test_booking_df['booking_id']
submission['rating_score'] = Y_test_pred

# change ratings below 0 to 0 and above 5 to 5
submission['rating_score'] = submission['rating_score'].apply(lambda x: 1 if x < 1 else x)
submission['rating_score'] = submission['rating_score'].apply(lambda x: 5 if x > 5 else x)

submission.to_csv('RandomForestTest-best-2.csv', index=False)
submission.describe()

Best parameters found!
n_estimators =  400
max_depth =  16
mse =  1.426853564947281
train_mse: 0.7572226358058625


Unnamed: 0,rating_score
count,49079.0
mean,4.086745
std,0.657303
min,1.043336
25%,4.10718
50%,4.318531
75%,4.424646
max,4.811914


In [181]:
# paramters for HistGradientBoostingRegressor
learning_rate = [0.1, 0.2, 0.3]
max_iter = [300, 400, 500, 600, 800, 1000]
max_depth = [8, 10, 12, 14, 16, 18, 20, None]
l2_regularization = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 1, 5]

least_mse = np.inf
best_learning_rate, best_max_iter, best_max_depth, best_l2_regularization = 0, 0, 0, 0

for lr in learning_rate:
    for it in max_iter:
        for depth in max_depth:
            for reg in l2_regularization:
                model = HistGradientBoostingRegressor(learning_rate=lr, max_iter=it, max_depth=depth, l2_regularization=reg)
                model.fit(X_train_actual, Y_train_actual)
                Y_pred = model.predict(X_valid)
                mse = mean_squared_error(Y_valid, Y_pred)
                print("learning_rate: {}, max_iter: {}, max_depth: {}, l2_regularization: {}, mse: {}".format(lr, it, depth, reg, mse))
                if mse < least_mse:
                    least_mse = mse
                    best_learning_rate, best_max_iter, best_max_depth, best_l2_regularization = lr, it, depth, reg
                    print("New best parameters found!")
                    print("learning_rate = ", lr)
                    print("max_iter = ", it)
                    print("max_depth = ", depth)
                    print("l2_regularization = ", reg)
                    print("mse = ", mse)

learning_rate: 0.01, max_iter: 100, max_depth: 8, l2_regularization: 0.0, min_samples_leaf: 1, mse: 1.519380106872787
New best parameters found!
learning_rate =  0.01
max_iter =  100
max_depth =  8
l2_regularization =  0.0
min_samples_leaf =  1
mse =  1.519380106872787
learning_rate: 0.01, max_iter: 100, max_depth: 8, l2_regularization: 0.0, min_samples_leaf: 2, mse: 1.5225921099648418
learning_rate: 0.01, max_iter: 100, max_depth: 8, l2_regularization: 0.0, min_samples_leaf: 4, mse: 1.5179407544880885
New best parameters found!
learning_rate =  0.01
max_iter =  100
max_depth =  8
l2_regularization =  0.0
min_samples_leaf =  4
mse =  1.5179407544880885
learning_rate: 0.01, max_iter: 100, max_depth: 8, l2_regularization: 0.1, min_samples_leaf: 1, mse: 1.5176357007993877
New best parameters found!
learning_rate =  0.01
max_iter =  100
max_depth =  8
l2_regularization =  0.1
min_samples_leaf =  1
mse =  1.5176357007993877
learning_rate: 0.01, max_iter: 100, max_depth: 8, l2_regularization

In [182]:
print("Best parameters found!")
print("learning_rate = ", best_learning_rate)
print("max_iter = ", best_max_iter)
print("max_depth = ", best_max_depth)
print("l2_regularization = ", best_l2_regularization)
print("min_samples_leaf = ", best_min_samples_leaf)
print("mse = ", least_mse)

model = HistGradientBoostingRegressor(learning_rate=best_learning_rate, 
                                    max_iter=best_max_iter, 
                                    max_depth=best_max_depth, 
                                    l2_regularization=best_l2_regularization, 
                                    min_samples_leaf=best_min_samples_leaf, 
                                    random_state=69)
model.fit(X_train, Y_train)

train_mse = mean_squared_error(Y_train, model.predict(X_train))
print("train_mse: {}".format(train_mse))

test_booking_df = bookings_payment_df[bookings_payment_df['booking_id'].isin(test_data['booking_id'])]

# create X_test
test_booking_df = test_booking_df.sort_values(by=['booking_id'])
X_test = test_booking_df.drop(['booking_id'], axis=1)

Y_test_pred = model.predict(X_test)

# prepare submission file
submission = pd.DataFrame()
submission['booking_id'] = test_booking_df['booking_id']
submission['rating_score'] = Y_test_pred

# change ratings below 0 to 0 and above 5 to 5
submission['rating_score'] = submission['rating_score'].apply(lambda x: 1 if x < 1 else x)
submission['rating_score'] = submission['rating_score'].apply(lambda x: 5 if x > 5 else x)

submission.to_csv('HistGrad-best.csv', index=False)
submission.describe()

Best parameters found!
learning_rate =  0.1
max_iter =  300
max_depth =  10
l2_regularization =  0.3
min_samples_leaf =  4
mse =  1.4076139544323836
train_mse: 1.2482135133312457


Unnamed: 0,rating_score
count,49079.0
mean,4.089698
std,0.587997
min,1.0
25%,4.113473
50%,4.289222
75%,4.376285
max,4.807569


In [185]:
# paramters for voting regressor
learning_rate = [0.05, 0.1, 0.2, 0.3]
max_iter = [100, 200, 300, 400, 500, 600]
max_depth = [8, 10, 12, 14, 16, 18, 20, None]
l2_regularization = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5]
n_estimators = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]

least_mse = np.inf
best_learning_rate, best_max_iter, best_max_depth, best_l2_regularization, best_n_estimators = 0, 0, 0, 0, 0

for lr in learning_rate:
    for it in max_iter:
        for depth in max_depth:
            for reg in l2_regularization:
                for est in n_estimators:
                    model = VotingRegressor([('rf', RandomForestRegressor(n_estimators=est, random_state=69)),
                                            ('hgbr', HistGradientBoostingRegressor(learning_rate=lr, max_iter=it, max_depth=depth, l2_regularization=reg, random_state=69)),
                                            ('bg', BaggingRegressor(n_estimators=est, random_state=69))])
                    model.fit(X_train_actual, Y_train_actual)
                    Y_pred = model.predict(X_valid)
                    mse = mean_squared_error(Y_valid, Y_pred)
                    print("learning_rate: {}, max_iter: {}, max_depth: {}, l2_regularization: {}, n_estimators: {}, mse: {}".format(lr, it, depth, reg, est, mse))
                    if mse < least_mse:
                        least_mse = mse
                        best_learning_rate, best_max_iter, best_max_depth, best_l2_regularization, best_n_estimators = lr, it, depth, reg, est
                        print("New best parameters found!")
                        print("learning_rate = ", lr)
                        print("max_iter = ", it)
                        print("max_depth = ", depth)
                        print("l2_regularization = ", reg)
                        print("n_estimators = ", est)
                        print("mse = ", mse)

learning_rate: 0.01, max_iter: 100, max_depth: 8, l2_regularization: 0.0, n_estimators: 10, mse: 1.4750684455951277
New best parameters found!
learning_rate =  0.01
max_iter =  100
max_depth =  8
l2_regularization =  0.0
n_estimators =  10
mse =  1.4750684455951277
learning_rate: 0.01, max_iter: 100, max_depth: 8, l2_regularization: 0.0, n_estimators: 20, mse: 1.4483419662484005
New best parameters found!
learning_rate =  0.01
max_iter =  100
max_depth =  8
l2_regularization =  0.0
n_estimators =  20
mse =  1.4483419662484005
learning_rate: 0.01, max_iter: 100, max_depth: 8, l2_regularization: 0.0, n_estimators: 30, mse: 1.4359219746547787
New best parameters found!
learning_rate =  0.01
max_iter =  100
max_depth =  8
l2_regularization =  0.0
n_estimators =  30
mse =  1.4359219746547787
learning_rate: 0.01, max_iter: 100, max_depth: 8, l2_regularization: 0.0, n_estimators: 40, mse: 1.431199730009461
New best parameters found!
learning_rate =  0.01
max_iter =  100
max_depth =  8
l2_regu

KeyboardInterrupt: 

In [None]:
print("Best parameters found!")
print("learning_rate = ", best_learning_rate)
print("max_iter = ", best_max_iter)
print("max_depth = ", best_max_depth)
print("l2_regularization = ", best_l2_regularization)
print("n_estimators = ", best_n_estimators)
print("mse = ", least_mse)

model = VotingRegressor([('rf', RandomForestRegressor(n_estimators=best_n_estimators, random_state=69)),
                        ('hgbr', HistGradientBoostingRegressor(learning_rate=best_learning_rate, max_iter=best_max_iter, max_depth=best_max_depth, l2_regularization=best_l2_regularization, random_state=69)),
                        ('bg', BaggingRegressor(n_estimators=best_n_estimators, random_state=69))])
model.fit(X_train, Y_train)

train_mse = mean_squared_error(Y_train, model.predict(X_train))
print("train_mse: {}".format(train_mse))

test_booking_df = bookings_payment_df[bookings_payment_df['booking_id'].isin(test_data['booking_id'])]

# create X_test
test_booking_df = test_booking_df.sort_values(by=['booking_id'])
X_test = test_booking_df.drop(['booking_id'], axis=1)

Y_test_pred = model.predict(X_test)

# prepare submission file
submission = pd.DataFrame()
submission['booking_id'] = test_booking_df['booking_id']
submission['rating_score'] = Y_test_pred

# change ratings below 0 to 0 and above 5 to 5
submission['rating_score'] = submission['rating_score'].apply(lambda x: 1 if x < 1 else x)
submission['rating_score'] = submission['rating_score'].apply(lambda x: 5 if x > 5 else x)

submission.to_csv('HistGrad-best.csv', index=False)
submission.describe()