In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import time
import xgboost as xgb
from preprocess import preprocess_data
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import datetime

In [2]:
pd.set_option('max_columns', None)
pd.set_option('max_rows', None)
tqdm.pandas()

In [3]:
start = time.time()
df_train = pd.read_csv("raw_data/train_data.csv", parse_dates=[1])
df_test = pd.read_csv("raw_data/test_data.csv", parse_dates=[1])
end = time.time()
print("Data loaded in ", end - start, " seconds.")

Data loaded in  23.54608154296875  seconds.


In [35]:
rates = pd.read_csv("preprocessed_data/df_rates.csv")
rates['prop_id'] = rates['prop_id'].astype(int)

In [4]:
df_train.head(1)

Unnamed: 0,srch_id,date_time,site_id,visitor_location_country_id,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_id,prop_starrating,prop_review_score,prop_brand_bool,prop_location_score1,prop_location_score2,prop_log_historical_price,position,price_usd,promotion_flag,srch_destination_id,srch_length_of_stay,srch_booking_window,srch_adults_count,srch_children_count,srch_room_count,srch_saturday_night_bool,srch_query_affinity_score,orig_destination_distance,random_bool,comp1_rate,comp1_inv,comp1_rate_percent_diff,comp2_rate,comp2_inv,comp2_rate_percent_diff,comp3_rate,comp3_inv,comp3_rate_percent_diff,comp4_rate,comp4_inv,comp4_rate_percent_diff,comp5_rate,comp5_inv,comp5_rate_percent_diff,comp6_rate,comp6_inv,comp6_rate_percent_diff,comp7_rate,comp7_inv,comp7_rate_percent_diff,comp8_rate,comp8_inv,comp8_rate_percent_diff,click_bool,gross_bookings_usd,booking_bool
0,1,2013-04-04 08:32:15,12,187,,,219,893,3,3.5,1,2.83,0.0438,4.95,27,104.77,0,23246,1,0,4,0,1,1,,,1,,,,0.0,0.0,,0.0,0.0,,,,,0.0,0.0,,,,,,,,0.0,0.0,,0,,0


In [5]:
df_test.head(1)

Unnamed: 0,srch_id,date_time,site_id,visitor_location_country_id,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_id,prop_starrating,prop_review_score,prop_brand_bool,prop_location_score1,prop_location_score2,prop_log_historical_price,price_usd,promotion_flag,srch_destination_id,srch_length_of_stay,srch_booking_window,srch_adults_count,srch_children_count,srch_room_count,srch_saturday_night_bool,srch_query_affinity_score,orig_destination_distance,random_bool,comp1_rate,comp1_inv,comp1_rate_percent_diff,comp2_rate,comp2_inv,comp2_rate_percent_diff,comp3_rate,comp3_inv,comp3_rate_percent_diff,comp4_rate,comp4_inv,comp4_rate_percent_diff,comp5_rate,comp5_inv,comp5_rate_percent_diff,comp6_rate,comp6_inv,comp6_rate_percent_diff,comp7_rate,comp7_inv,comp7_rate_percent_diff,comp8_rate,comp8_inv,comp8_rate_percent_diff
0,1,2013-02-02 15:27:40,24,216,,,219,3180,3,4.5,1,2.94,0.0691,5.03,119.0,0,19222,1,10,2,0,1,0,,,0,,,,,,,,,,,,,,,,,,,,,,,,


## Preprocess Data

In [6]:
start = time.time()
preprocessed_train = preprocess_data(df_train, split='train')
print('------')
preprocessed_test = preprocess_data(df_test, split='test')
end = time.time()
print("Data preprocessing took ", end - start, " seconds.")

Adding mean, median and std of:  prop_starrating
Adding mean, median and std of:  prop_review_score
Adding mean, median and std of:  prop_log_historical_price
Adding mean, median and std of:  prop_location_score1
Adding mean, median and std of:  prop_location_score2
------
Adding mean, median and std of:  prop_starrating
Adding mean, median and std of:  prop_review_score
Adding mean, median and std of:  prop_log_historical_price
Adding mean, median and std of:  prop_location_score1
Adding mean, median and std of:  prop_location_score2
Data preprocessing took  215.02858328819275  seconds.


In [7]:
preprocessed_test.to_csv('preprocessed_data/test1.csv')
preprocessed_train.to_csv('preprocessed_data/train1.csv')

In [8]:
preprocessed_train.head(1)

Unnamed: 0,srch_id,site_id,visitor_location_country_id,prop_country_id,prop_id,prop_starrating,prop_review_score,prop_brand_bool,prop_location_score1,prop_location_score2,prop_log_historical_price,position,price_usd,promotion_flag,srch_destination_id,srch_length_of_stay,srch_booking_window,srch_adults_count,srch_children_count,srch_room_count,srch_saturday_night_bool,srch_query_affinity_score,orig_destination_distance,random_bool,click_bool,gross_bookings_usd,booking_bool,day_of_the_week,day,month,year,week,meanprop_starrating,medianprop_starrating,stdprop_starrating,meanprop_review_score,medianprop_review_score,stdprop_review_score,meanprop_log_historical_price,medianprop_log_historical_price,stdprop_log_historical_price,meanprop_location_score1,medianprop_location_score1,stdprop_location_score1,meanprop_location_score2,medianprop_location_score2,stdprop_location_score2
0,1,12,187,219,893,3,3.5,1,2.83,0.0438,4.95,27,104.77,0,23246,1,0,4,0,1,1,-330.0,1301.234406,1,0,0.0,0,3,4,4,2013,14,3.071429,3.0,0.766356,3.482143,3.5,1.109572,4.870714,4.93,0.304022,2.299643,2.3,0.518734,0.045493,0.02255,0.047216


In [9]:
preprocessed_test.head(1)

Unnamed: 0,srch_id,site_id,visitor_location_country_id,prop_country_id,prop_id,prop_starrating,prop_review_score,prop_brand_bool,prop_location_score1,prop_location_score2,prop_log_historical_price,price_usd,promotion_flag,srch_destination_id,srch_length_of_stay,srch_booking_window,srch_adults_count,srch_children_count,srch_room_count,srch_saturday_night_bool,srch_query_affinity_score,orig_destination_distance,random_bool,day_of_the_week,day,month,year,week,meanprop_starrating,medianprop_starrating,stdprop_starrating,meanprop_review_score,medianprop_review_score,stdprop_review_score,meanprop_log_historical_price,medianprop_log_historical_price,stdprop_log_historical_price,meanprop_location_score1,medianprop_location_score1,stdprop_location_score1,meanprop_location_score2,medianprop_location_score2,stdprop_location_score2
0,1,24,216,219,3180,3,4.5,1,2.94,0.0691,5.03,119.0,0,19222,1,10,2,0,1,0,-330.0,1312.801653,0,5,2,2,2013,5,2.689655,3.0,0.712313,4.017241,4.5,0.940011,4.638966,4.66,0.281721,2.66069,2.77,0.300962,0.125383,0.0908,0.076598


In [10]:
print('Features Used')
preprocessed_test.isnull().sum()

Features Used


srch_id                            0
site_id                            0
visitor_location_country_id        0
prop_country_id                    0
prop_id                            0
prop_starrating                    0
prop_review_score                  0
prop_brand_bool                    0
prop_location_score1               0
prop_location_score2               0
prop_log_historical_price          0
price_usd                          0
promotion_flag                     0
srch_destination_id                0
srch_length_of_stay                0
srch_booking_window                0
srch_adults_count                  0
srch_children_count                0
srch_room_count                    0
srch_saturday_night_bool           0
srch_query_affinity_score          0
orig_destination_distance          0
random_bool                        0
day_of_the_week                    0
day                                0
month                              0
year                               0
w

In [36]:
preprocessed_test = pd.merge(preprocessed_test, rates, on='prop_id', how='left')
preprocessed_train = pd.merge(preprocessed_train, rates, on='prop_id', how='left')

In [40]:
preprocessed_test.head(1)

Unnamed: 0.1,srch_id,site_id,visitor_location_country_id,prop_country_id,prop_id,prop_starrating,prop_review_score,prop_brand_bool,prop_location_score1,prop_location_score2,prop_log_historical_price,price_usd,promotion_flag,srch_destination_id,srch_length_of_stay,srch_booking_window,srch_adults_count,srch_children_count,srch_room_count,srch_saturday_night_bool,srch_query_affinity_score,orig_destination_distance,random_bool,day_of_the_week,day,month,year,week,meanprop_starrating,medianprop_starrating,stdprop_starrating,meanprop_review_score,medianprop_review_score,stdprop_review_score,meanprop_log_historical_price,medianprop_log_historical_price,stdprop_log_historical_price,meanprop_location_score1,medianprop_location_score1,stdprop_location_score1,meanprop_location_score2,medianprop_location_score2,stdprop_location_score2,Unnamed: 0,click_rate,booking_rate
0,1,24,216,219,3180,3,4.5,1,2.94,0.0691,5.03,119.0,0,19222,1,10,2,0,1,0,-330.0,1312.801653,0,5,2,2,2013,5,2.689655,3.0,0.712313,4.017241,4.5,0.940011,4.638966,4.66,0.281721,2.66069,2.77,0.300962,0.125383,0.0908,0.076598,100663.0,0.043011,0.043011


## Make Relevance Targets

Create a column called score which would be 
df['score'] = 4 * df.booking_bool + df.click_bool

In [37]:
preprocessed_train['rel'] = 4 * preprocessed_train.booking_bool + preprocessed_train.click_bool

In [38]:
X = preprocessed_train.drop(['booking_bool', 'click_bool', 'gross_bookings_usd', 'position'], axis=1)

In [42]:
print(f'num_features:{X.shape[1]-2}') # -2 as we donot use srch_id and rel
X = X.drop(columns = ['Unnamed: 0'], axis=1)


num_features:45


In [43]:
X.head(1)

Unnamed: 0,srch_id,site_id,visitor_location_country_id,prop_country_id,prop_id,prop_starrating,prop_review_score,prop_brand_bool,prop_location_score1,prop_location_score2,prop_log_historical_price,price_usd,promotion_flag,srch_destination_id,srch_length_of_stay,srch_booking_window,srch_adults_count,srch_children_count,srch_room_count,srch_saturday_night_bool,srch_query_affinity_score,orig_destination_distance,random_bool,day_of_the_week,day,month,year,week,meanprop_starrating,medianprop_starrating,stdprop_starrating,meanprop_review_score,medianprop_review_score,stdprop_review_score,meanprop_log_historical_price,medianprop_log_historical_price,stdprop_log_historical_price,meanprop_location_score1,medianprop_location_score1,stdprop_location_score1,meanprop_location_score2,medianprop_location_score2,stdprop_location_score2,rel,click_rate,booking_rate
0,1,12,187,219,893,3,3.5,1,2.83,0.0438,4.95,104.77,0,23246,1,0,4,0,1,1,-330.0,1301.234406,1,3,4,4,2013,14,3.071429,3.0,0.766356,3.482143,3.5,1.109572,4.870714,4.93,0.304022,2.299643,2.3,0.518734,0.045493,0.02255,0.047216,0,0.026144,0.01634


# Train Test Split

In [44]:
from sklearn.model_selection import GroupShuffleSplit
gss = GroupShuffleSplit(test_size=0.2, n_splits=1, random_state = 7).split(X, groups=X['srch_id'])
X_train_inds, X_val_inds = next(gss)

In [45]:
assert len(X) == len(X_train_inds) + len(X_val_inds)

In [46]:
train_data= X.iloc[X_train_inds]
X_train = train_data.loc[:, ~train_data.columns.isin(['srch_id','rel'])]
y_train = train_data.loc[:, train_data.columns.isin(['rel'])]
train_groups = train_data.groupby('srch_id').size().to_frame('size')['size'].to_numpy()

In [47]:
val_data= X.iloc[X_val_inds]
#We need to keep the id for later predictions
X_val = val_data.loc[:, ~val_data.columns.isin(['srch_id','rel'])]
X_val_with_srch_id = val_data.loc[:, ~val_data.columns.isin(['rel'])]
y_val = val_data.loc[:, val_data.columns.isin(['rel'])]
eval_groups = X_val_with_srch_id.groupby('srch_id').size().to_frame('size')['size'].to_numpy()

In [48]:
len(X_val)

990089

In [49]:
import xgboost as xgb

model = xgb.XGBRanker(  
    tree_method='gpu_hist',
    booster='gbtree',
    objective='rank:ndcg',
    random_state=42, 
    learning_rate=0.1,
    colsample_bytree=0.9, 
    eta=0.05, 
    max_depth=6, 
    n_estimators=500, 
    subsample=0.75,
    gamma = 0.0,
    predictor='gpu_predictor'
    )

model = model.fit(X_train, y_train, group=train_groups, eval_set=[(X_val, y_val)], 
          eval_group = [eval_groups], eval_metric='ndcg@5', verbose=True, early_stopping_rounds=20)

[0]	validation_0-ndcg@5:0.38170
[1]	validation_0-ndcg@5:0.39123
[2]	validation_0-ndcg@5:0.39776
[3]	validation_0-ndcg@5:0.40002
[4]	validation_0-ndcg@5:0.40400
[5]	validation_0-ndcg@5:0.40579
[6]	validation_0-ndcg@5:0.40664
[7]	validation_0-ndcg@5:0.40780
[8]	validation_0-ndcg@5:0.40817
[9]	validation_0-ndcg@5:0.40987
[10]	validation_0-ndcg@5:0.41029
[11]	validation_0-ndcg@5:0.41034
[12]	validation_0-ndcg@5:0.41062
[13]	validation_0-ndcg@5:0.41264
[14]	validation_0-ndcg@5:0.41216
[15]	validation_0-ndcg@5:0.41258
[16]	validation_0-ndcg@5:0.41262
[17]	validation_0-ndcg@5:0.41289
[18]	validation_0-ndcg@5:0.41296
[19]	validation_0-ndcg@5:0.41445
[20]	validation_0-ndcg@5:0.41484
[21]	validation_0-ndcg@5:0.41496
[22]	validation_0-ndcg@5:0.41549
[23]	validation_0-ndcg@5:0.41573
[24]	validation_0-ndcg@5:0.41611
[25]	validation_0-ndcg@5:0.41668
[26]	validation_0-ndcg@5:0.41730
[27]	validation_0-ndcg@5:0.41757
[28]	validation_0-ndcg@5:0.41809
[29]	validation_0-ndcg@5:0.41827
[30]	validation_0-nd

[245]	validation_0-ndcg@5:0.45528
[246]	validation_0-ndcg@5:0.45536
[247]	validation_0-ndcg@5:0.45538
[248]	validation_0-ndcg@5:0.45553
[249]	validation_0-ndcg@5:0.45568
[250]	validation_0-ndcg@5:0.45579
[251]	validation_0-ndcg@5:0.45590
[252]	validation_0-ndcg@5:0.45573
[253]	validation_0-ndcg@5:0.45572
[254]	validation_0-ndcg@5:0.45578
[255]	validation_0-ndcg@5:0.45579
[256]	validation_0-ndcg@5:0.45570
[257]	validation_0-ndcg@5:0.45573
[258]	validation_0-ndcg@5:0.45567
[259]	validation_0-ndcg@5:0.45599
[260]	validation_0-ndcg@5:0.45601
[261]	validation_0-ndcg@5:0.45582
[262]	validation_0-ndcg@5:0.45569
[263]	validation_0-ndcg@5:0.45602
[264]	validation_0-ndcg@5:0.45604
[265]	validation_0-ndcg@5:0.45629
[266]	validation_0-ndcg@5:0.45608
[267]	validation_0-ndcg@5:0.45628
[268]	validation_0-ndcg@5:0.45631
[269]	validation_0-ndcg@5:0.45637
[270]	validation_0-ndcg@5:0.45640
[271]	validation_0-ndcg@5:0.45643
[272]	validation_0-ndcg@5:0.45635
[273]	validation_0-ndcg@5:0.45627
[274]	validati

In [50]:
model.feature_importances_.round(decimals=2)

array([0.  , 0.  , 0.01, 0.03, 0.02, 0.01, 0.01, 0.01, 0.06, 0.02, 0.02,
       0.03, 0.  , 0.01, 0.01, 0.  , 0.  , 0.  , 0.  , 0.01, 0.  , 0.03,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.01, 0.01, 0.  , 0.01, 0.01, 0.  , 0.01, 0.08, 0.02, 0.48, 0.04],
      dtype=float32)

In [None]:
def recommend_properties(model, data):
    result = pd.DataFrame(columns=['srch_id', 'prop_id', 'scores'])
    df_list = []
    grouped_data = data.groupby('srch_id')
    for group_name, group in tqdm(grouped_data):
        scores = model.predict(group.loc[:, ~group.columns.isin(['srch_id'])])
        sorted_group = sort_properties(group_name, scores, group['prop_id'])
        df_list.append(sorted_group)
    result = pd.concat(df_list)
    return result

In [None]:
def sort_properties(srch_id, scores, prop_ids):
    sorted_group = pd.DataFrame(columns=['srch_id', 'prop_id', 'scores'])
    sorted_group['srch_id'] = len(scores)*[srch_id]
    sorted_group['prop_id'] = prop_ids.values
    sorted_group['scores'] = scores
    sorted_group = sorted_group.sort_values(by='scores', ascending=False)
    return sorted_group 

In [None]:
start = time.time()
recommendations = recommend_properties(model, preprocessed_test)
end = time.time()
print(f'Total Time for validation prediction:{end - start}')
submission = recommendations.drop(columns=['scores'])
submission.to_csv(f'submission1.csv', index=False)

In [None]:
def recommend_properties_group(model, group):
    scores = model.predict(group.loc[:, ~group.columns.isin(['srch_id'])])
    sorted_group = sort_properties(group.name, scores, group['prop_id'])
    return sorted_group