In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import time
import xgboost as xgb
from preprocess import preprocess_data
from feature_engineer import add_combined_mean_prop_features, add_features, add_normalised_prop_features
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GroupShuffleSplit
from tqdm import tqdm
import datetime
import xgboost as xgb

In [2]:
# Show every column and every row when a frame is displayed.
# The fully-qualified option names are used on purpose: the bare patterns
# 'max_columns' / 'max_rows' also match the 'styler.render.*' options in newer
# pandas releases, which makes set_option raise OptionError (multiple keys matched).
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
# Register tqdm's progress-bar integration with pandas.
tqdm.pandas()

In [3]:
# Read the raw train/test CSVs and report the wall-clock load time.
# parse_dates=[1] asks pandas to parse the column at position 1 as datetimes.
start = time.time()
df_train = pd.read_csv("raw_data/train_data.csv", parse_dates=[1])
df_test = pd.read_csv("raw_data/test_data.csv", parse_dates=[1])
end = time.time()
print("Data loaded in ", end - start, " seconds.")

Data loaded in  22.92912769317627  seconds.


In [4]:
# Load df_rates.csv from preprocessed_data/ and cast its prop_id column to int
# in the same expression.
rates = pd.read_csv("preprocessed_data/df_rates.csv").astype({'prop_id': int})

In [None]:
# Display the first raw training row to eyeball the available columns.
df_train.head(1)

In [None]:
# Display the first raw test row to eyeball the available columns.
df_test.head(1)

## Preprocess Data

In [5]:
# Run the project-local preprocess_data helper on both splits and time the
# whole step. The helper's behaviour is defined in preprocess.py, not here.
start = time.time()
preprocessed_train = preprocess_data(df_train, split='train')
print('------')  # visual separator between the train and test log lines
preprocessed_test = preprocess_data(df_test, split='test')
end = time.time()
print("Data preprocessing took ", end - start, " seconds.")

Done imputing
------
Done imputing
Data preprocessing took  5.895792007446289  seconds.


In [None]:
# Display the first preprocessed training row.
preprocessed_train.head(1)

In [None]:
# Display the first preprocessed test row.
preprocessed_test.head(1)

## Adding and Dropping Features

In [None]:
# Apply add_normalised_prop_features (from feature_engineer.py) to each split
# independently.
# NOTE(review): this cell has no execution count and the column listing printed
# further down shows no additional columns, so it appears not to have been run
# for the saved outputs - confirm whether it should be part of the pipeline.
preprocessed_train = add_normalised_prop_features(preprocessed_train)
preprocessed_test = add_normalised_prop_features(preprocessed_test)

In [None]:
#preprocessed_train, preprocessed_test = add_combined_mean_prop_features(preprocessed_train, preprocessed_test)

In [6]:
# Columns removed from both splits before modelling.
drop_columns = ['date_time', 'promotion_flag']
# errors='ignore' keeps this cell re-runnable: the drop happens in place, so a
# second execution would otherwise raise KeyError because the columns are
# already gone.
preprocessed_train.drop(columns=drop_columns, inplace=True, errors='ignore')
preprocessed_test.drop(columns=drop_columns, inplace=True, errors='ignore')

In [7]:
print('Features Used')
# Per-column null counts for the test split; the index of the result doubles as
# the list of columns that are still present.
preprocessed_test.isnull().sum()

Features Used


srch_id                        0
site_id                        0
visitor_location_country_id    0
prop_country_id                0
prop_id                        0
prop_starrating                0
prop_review_score              0
prop_brand_bool                0
prop_location_score1           0
prop_location_score2           0
prop_log_historical_price      0
price_usd                      0
srch_destination_id            0
srch_length_of_stay            0
srch_booking_window            0
srch_adults_count              0
srch_children_count            0
srch_room_count                0
srch_saturday_night_bool       0
srch_query_affinity_score      0
orig_destination_distance      0
random_bool                    0
dtype: int64

## Make Relevance Targets

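Create a relevance column called `rel`, computed as
`rel = 4 * booking_bool + click_bool`, so a row that was both clicked and booked scores 5, a row that was only clicked scores 1, and a row with neither scores 0.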

In [8]:
# Graded relevance label: a click contributes 1 and a booking contributes 4.
preprocessed_train['rel'] = 4 * preprocessed_train.booking_bool + preprocessed_train.click_bool

In [9]:
# Remove the raw outcome columns from the training frame; srch_id and rel stay
# in X because the split helpers below need them for grouping and labels.
# None of these four columns appears in the test-frame column listing, so they
# are presumably unavailable at prediction time - confirm.
X = preprocessed_train.drop(['booking_bool', 'click_bool', 'gross_bookings_usd', 'position'], axis=1)

In [10]:
print(f'num_features:{X.shape[1]-2}') # -2 because srch_id and rel are not used as model features

num_features:21


In [11]:
# Display the first row of the modelling frame (features plus srch_id and rel).
X.head(1)

Unnamed: 0,srch_id,site_id,visitor_location_country_id,prop_country_id,prop_id,prop_starrating,prop_review_score,prop_brand_bool,prop_location_score1,prop_location_score2,prop_log_historical_price,price_usd,srch_destination_id,srch_length_of_stay,srch_booking_window,srch_adults_count,srch_children_count,srch_room_count,srch_saturday_night_bool,srch_query_affinity_score,orig_destination_distance,random_bool,rel
0,1,12,187,219,893,3,3.5,1,2.83,0.0438,4.95,104.77,23246,1,0,4,0,1,1,-330.0,1301.234406,1,0


## Train Test Split

In [12]:
def train(X_train, y_tarin, train_groups, X_val=None, y_val=None, eval_groups=None):
    """Fit an XGBoost ranker with the 'rank:ndcg' objective and return it.

    Args:
        X_train: feature matrix for the training rows.
        y_tarin: relevance labels for ``X_train``. The misspelled parameter
            name is kept so existing callers keep working.
        train_groups: query-group sizes for ``X_train`` (passed as ``group``).
        X_val: optional validation features. When omitted, the training data
            is the only evaluation set.
        y_val: relevance labels for ``X_val``.
        eval_groups: query-group sizes for ``X_val``.

    Returns:
        The fitted ``xgb.XGBRanker``.
    """
    # The body used to read `y_train`, which is not a parameter of this
    # function, so it silently picked up whatever global of that name existed
    # in the notebook. Bind the argument explicitly instead.
    y_train = y_tarin

    # NOTE(review): `learning_rate` and `eta` name the same XGBoost setting but
    # are given different values here; which one takes effect is not obvious
    # from this code. Both are kept as-is - decide on one and drop the other.
    model = xgb.XGBRanker(
        tree_method='gpu_hist',
        booster='gbtree',
        objective='rank:ndcg',
        random_state=42,
        learning_rate=0.1,
        colsample_bytree=0.8,
        eta=0.05,
        max_depth=6,
        n_estimators=250,
        subsample=0.8,
        predictor='gpu_predictor'
        )

    if X_val is None:
        eval_set = [(X_train, y_train)]
        eval_group = [train_groups]
    else:
        # Validation first, so its metrics are logged under 'validation_0'.
        # NOTE(review): XGBoost documents that early stopping watches the LAST
        # eval_set entry, which here is the training data. Monitoring the
        # validation fold instead means putting it last, and updating any code
        # that reads evals_result()['validation_0'] accordingly.
        eval_set = [(X_val, y_val), (X_train, y_train)]
        eval_group = [eval_groups, train_groups]

    # fit() returns the estimator itself.
    return model.fit(X_train, y_train, group=train_groups, eval_set=eval_set,
                     eval_group=eval_group, eval_metric='ndcg@5', verbose=True,
                     early_stopping_rounds=20)
    

In [13]:
def _features_labels_groups(fold):
    """Split one fold's rows into (features, labels, query-group sizes)."""
    features = fold.loc[:, ~fold.columns.isin(['srch_id', 'rel'])]
    labels = fold.loc[:, fold.columns.isin(['rel'])]
    # sort=False returns the sizes in order of first appearance, i.e. in the
    # same order as the rows themselves. The default (sorted by srch_id) only
    # lines up with the rows when the ids happen to be ascending.
    # Assumes the rows of each srch_id are contiguous in `fold`.
    group_sizes = fold.groupby('srch_id', sort=False).size().to_numpy()
    return features, labels, group_sizes


def get_cross_val_splits(df_train, train_inds, val_inds):
    """Build the training and validation inputs for one cross-validation fold.

    Args:
        df_train: frame containing the feature columns plus 'srch_id' and 'rel'.
        train_inds: positional row indices of the training fold.
        val_inds: positional row indices of the validation fold.

    Returns:
        ``(X_train, y_train, train_groups, X_val, y_val, eval_groups)`` where
        the ``X_*`` frames exclude 'srch_id' and 'rel', the ``y_*`` frames hold
        only the 'rel' column, and the ``*_groups`` arrays hold the number of
        rows per 'srch_id' in row order.
    """
    X_train, y_train, train_groups = _features_labels_groups(df_train.iloc[train_inds])
    X_val, y_val, eval_groups = _features_labels_groups(df_train.iloc[val_inds])
    return X_train, y_train, train_groups, X_val, y_val, eval_groups

In [14]:
# Five shuffled 90/10 splits, grouped by srch_id so that all rows of a search
# land on the same side of the split.
gss = GroupShuffleSplit(test_size=0.1, n_splits=5, random_state = 7)
best_model = None
best_ndcg = 0
cross_val_score = []
for idx, (train_inds, val_inds) in enumerate(gss.split(X, groups=X['srch_id'])):
    X_train, y_train, train_groups, X_val, y_val, eval_groups = get_cross_val_splits(X, train_inds, val_inds)
    print(f'Training on fold:{idx}')
    model = train(X_train, y_train, train_groups, X_val, y_val, eval_groups)
    # 'validation_0' is the first eval_set entry, which train() sets to the
    # validation fold; [-1] takes the value logged for the last boosting round.
    res_ndcg = model.evals_result()['validation_0']['ndcg@5'][-1]
    cross_val_score.append(res_ndcg)
    # Keep the model only when it beats the best fold so far. The previous
    # check (`res_ndcg > 0`) was always true, so best_model was simply the
    # model of the last fold.
    if best_model is None or res_ndcg > best_ndcg:
        best_ndcg = res_ndcg
        best_model = model
    print('fold:'+str(idx)+' ndcg@5 ', res_ndcg)

print('cross validation performance:', sum(cross_val_score)/len(cross_val_score))
    


Training on fold:0
[0]	validation_0-ndcg@5:0.24011	validation_1-ndcg@5:0.24062
[1]	validation_0-ndcg@5:0.24704	validation_1-ndcg@5:0.24757
[2]	validation_0-ndcg@5:0.25318	validation_1-ndcg@5:0.25461
[3]	validation_0-ndcg@5:0.28776	validation_1-ndcg@5:0.28845
[4]	validation_0-ndcg@5:0.30672	validation_1-ndcg@5:0.30681
[5]	validation_0-ndcg@5:0.30152	validation_1-ndcg@5:0.30255
[6]	validation_0-ndcg@5:0.31446	validation_1-ndcg@5:0.31548
[7]	validation_0-ndcg@5:0.31142	validation_1-ndcg@5:0.31254
[8]	validation_0-ndcg@5:0.30863	validation_1-ndcg@5:0.30979
[9]	validation_0-ndcg@5:0.31774	validation_1-ndcg@5:0.31852
[10]	validation_0-ndcg@5:0.31522	validation_1-ndcg@5:0.31613
[11]	validation_0-ndcg@5:0.32020	validation_1-ndcg@5:0.32050
[12]	validation_0-ndcg@5:0.32397	validation_1-ndcg@5:0.32432
[13]	validation_0-ndcg@5:0.32659	validation_1-ndcg@5:0.32699
[14]	validation_0-ndcg@5:0.32667	validation_1-ndcg@5:0.32709
[15]	validation_0-ndcg@5:0.32883	validation_1-ndcg@5:0.32933
[16]	validation

[134]	validation_0-ndcg@5:0.36378	validation_1-ndcg@5:0.36930
[135]	validation_0-ndcg@5:0.36382	validation_1-ndcg@5:0.36955
[136]	validation_0-ndcg@5:0.36380	validation_1-ndcg@5:0.36966
[137]	validation_0-ndcg@5:0.36382	validation_1-ndcg@5:0.36980
[138]	validation_0-ndcg@5:0.36385	validation_1-ndcg@5:0.36997
[139]	validation_0-ndcg@5:0.36388	validation_1-ndcg@5:0.36995
[140]	validation_0-ndcg@5:0.36414	validation_1-ndcg@5:0.37010
[141]	validation_0-ndcg@5:0.36429	validation_1-ndcg@5:0.37031
[142]	validation_0-ndcg@5:0.36441	validation_1-ndcg@5:0.37043
[143]	validation_0-ndcg@5:0.36469	validation_1-ndcg@5:0.37057
[144]	validation_0-ndcg@5:0.36501	validation_1-ndcg@5:0.37069
[145]	validation_0-ndcg@5:0.36498	validation_1-ndcg@5:0.37079
[146]	validation_0-ndcg@5:0.36511	validation_1-ndcg@5:0.37087
[147]	validation_0-ndcg@5:0.36499	validation_1-ndcg@5:0.37089
[148]	validation_0-ndcg@5:0.36538	validation_1-ndcg@5:0.37094
[149]	validation_0-ndcg@5:0.36567	validation_1-ndcg@5:0.37105
[150]	va

[16]	validation_0-ndcg@5:0.32845	validation_1-ndcg@5:0.33180
[17]	validation_0-ndcg@5:0.32976	validation_1-ndcg@5:0.33265
[18]	validation_0-ndcg@5:0.33070	validation_1-ndcg@5:0.33357
[19]	validation_0-ndcg@5:0.33213	validation_1-ndcg@5:0.33454
[20]	validation_0-ndcg@5:0.33244	validation_1-ndcg@5:0.33536
[21]	validation_0-ndcg@5:0.33342	validation_1-ndcg@5:0.33594
[22]	validation_0-ndcg@5:0.33490	validation_1-ndcg@5:0.33736
[23]	validation_0-ndcg@5:0.33565	validation_1-ndcg@5:0.33853
[24]	validation_0-ndcg@5:0.33581	validation_1-ndcg@5:0.33913
[25]	validation_0-ndcg@5:0.33670	validation_1-ndcg@5:0.33998
[26]	validation_0-ndcg@5:0.33747	validation_1-ndcg@5:0.34061
[27]	validation_0-ndcg@5:0.33780	validation_1-ndcg@5:0.34067
[28]	validation_0-ndcg@5:0.33820	validation_1-ndcg@5:0.34145
[29]	validation_0-ndcg@5:0.33915	validation_1-ndcg@5:0.34217
[30]	validation_0-ndcg@5:0.34021	validation_1-ndcg@5:0.34304
[31]	validation_0-ndcg@5:0.34034	validation_1-ndcg@5:0.34277
[32]	validation_0-ndcg@5

[150]	validation_0-ndcg@5:0.36480	validation_1-ndcg@5:0.37132
[151]	validation_0-ndcg@5:0.36470	validation_1-ndcg@5:0.37135
[152]	validation_0-ndcg@5:0.36474	validation_1-ndcg@5:0.37153
[153]	validation_0-ndcg@5:0.36467	validation_1-ndcg@5:0.37164
[154]	validation_0-ndcg@5:0.36472	validation_1-ndcg@5:0.37179
[155]	validation_0-ndcg@5:0.36483	validation_1-ndcg@5:0.37197
[156]	validation_0-ndcg@5:0.36523	validation_1-ndcg@5:0.37208
[157]	validation_0-ndcg@5:0.36492	validation_1-ndcg@5:0.37225
[158]	validation_0-ndcg@5:0.36484	validation_1-ndcg@5:0.37229
[159]	validation_0-ndcg@5:0.36495	validation_1-ndcg@5:0.37237
[160]	validation_0-ndcg@5:0.36484	validation_1-ndcg@5:0.37245
[161]	validation_0-ndcg@5:0.36499	validation_1-ndcg@5:0.37246
[162]	validation_0-ndcg@5:0.36532	validation_1-ndcg@5:0.37265
[163]	validation_0-ndcg@5:0.36537	validation_1-ndcg@5:0.37271
[164]	validation_0-ndcg@5:0.36556	validation_1-ndcg@5:0.37274
[165]	validation_0-ndcg@5:0.36547	validation_1-ndcg@5:0.37280
[166]	va

[33]	validation_0-ndcg@5:0.34195	validation_1-ndcg@5:0.34382
[34]	validation_0-ndcg@5:0.34296	validation_1-ndcg@5:0.34475
[35]	validation_0-ndcg@5:0.34278	validation_1-ndcg@5:0.34489
[36]	validation_0-ndcg@5:0.34336	validation_1-ndcg@5:0.34523
[37]	validation_0-ndcg@5:0.34327	validation_1-ndcg@5:0.34536
[38]	validation_0-ndcg@5:0.34377	validation_1-ndcg@5:0.34556
[39]	validation_0-ndcg@5:0.34409	validation_1-ndcg@5:0.34572
[40]	validation_0-ndcg@5:0.34449	validation_1-ndcg@5:0.34628
[41]	validation_0-ndcg@5:0.34535	validation_1-ndcg@5:0.34726
[42]	validation_0-ndcg@5:0.34549	validation_1-ndcg@5:0.34715
[43]	validation_0-ndcg@5:0.34583	validation_1-ndcg@5:0.34737
[44]	validation_0-ndcg@5:0.34646	validation_1-ndcg@5:0.34782
[45]	validation_0-ndcg@5:0.34682	validation_1-ndcg@5:0.34842
[46]	validation_0-ndcg@5:0.34725	validation_1-ndcg@5:0.34907
[47]	validation_0-ndcg@5:0.34740	validation_1-ndcg@5:0.34949
[48]	validation_0-ndcg@5:0.34743	validation_1-ndcg@5:0.34959
[49]	validation_0-ndcg@5

[167]	validation_0-ndcg@5:0.36894	validation_1-ndcg@5:0.37272
[168]	validation_0-ndcg@5:0.36872	validation_1-ndcg@5:0.37284
[169]	validation_0-ndcg@5:0.36874	validation_1-ndcg@5:0.37292
[170]	validation_0-ndcg@5:0.36908	validation_1-ndcg@5:0.37297
[171]	validation_0-ndcg@5:0.36879	validation_1-ndcg@5:0.37299
[172]	validation_0-ndcg@5:0.36885	validation_1-ndcg@5:0.37300
[173]	validation_0-ndcg@5:0.36919	validation_1-ndcg@5:0.37318
[174]	validation_0-ndcg@5:0.36894	validation_1-ndcg@5:0.37337
[175]	validation_0-ndcg@5:0.36926	validation_1-ndcg@5:0.37349
[176]	validation_0-ndcg@5:0.36926	validation_1-ndcg@5:0.37358
[177]	validation_0-ndcg@5:0.36930	validation_1-ndcg@5:0.37367
[178]	validation_0-ndcg@5:0.36904	validation_1-ndcg@5:0.37378
[179]	validation_0-ndcg@5:0.36902	validation_1-ndcg@5:0.37384
[180]	validation_0-ndcg@5:0.36904	validation_1-ndcg@5:0.37386
[181]	validation_0-ndcg@5:0.36894	validation_1-ndcg@5:0.37392
[182]	validation_0-ndcg@5:0.36897	validation_1-ndcg@5:0.37403
[183]	va

[50]	validation_0-ndcg@5:0.34780	validation_1-ndcg@5:0.34934
[51]	validation_0-ndcg@5:0.34827	validation_1-ndcg@5:0.34977
[52]	validation_0-ndcg@5:0.34787	validation_1-ndcg@5:0.34999
[53]	validation_0-ndcg@5:0.34842	validation_1-ndcg@5:0.35050
[54]	validation_0-ndcg@5:0.34863	validation_1-ndcg@5:0.35095
[55]	validation_0-ndcg@5:0.34902	validation_1-ndcg@5:0.35142
[56]	validation_0-ndcg@5:0.34964	validation_1-ndcg@5:0.35206
[57]	validation_0-ndcg@5:0.35024	validation_1-ndcg@5:0.35245
[58]	validation_0-ndcg@5:0.35071	validation_1-ndcg@5:0.35275
[59]	validation_0-ndcg@5:0.35063	validation_1-ndcg@5:0.35297
[60]	validation_0-ndcg@5:0.35120	validation_1-ndcg@5:0.35353
[61]	validation_0-ndcg@5:0.35176	validation_1-ndcg@5:0.35406
[62]	validation_0-ndcg@5:0.35164	validation_1-ndcg@5:0.35413
[63]	validation_0-ndcg@5:0.35185	validation_1-ndcg@5:0.35459
[64]	validation_0-ndcg@5:0.35250	validation_1-ndcg@5:0.35487
[65]	validation_0-ndcg@5:0.35279	validation_1-ndcg@5:0.35548
[66]	validation_0-ndcg@5

[183]	validation_0-ndcg@5:0.36683	validation_1-ndcg@5:0.37414
[184]	validation_0-ndcg@5:0.36645	validation_1-ndcg@5:0.37424
[185]	validation_0-ndcg@5:0.36663	validation_1-ndcg@5:0.37434
[186]	validation_0-ndcg@5:0.36682	validation_1-ndcg@5:0.37445
[187]	validation_0-ndcg@5:0.36652	validation_1-ndcg@5:0.37453
[188]	validation_0-ndcg@5:0.36656	validation_1-ndcg@5:0.37461
[189]	validation_0-ndcg@5:0.36654	validation_1-ndcg@5:0.37467
[190]	validation_0-ndcg@5:0.36644	validation_1-ndcg@5:0.37477
[191]	validation_0-ndcg@5:0.36684	validation_1-ndcg@5:0.37480
[192]	validation_0-ndcg@5:0.36684	validation_1-ndcg@5:0.37495
[193]	validation_0-ndcg@5:0.36671	validation_1-ndcg@5:0.37495
[194]	validation_0-ndcg@5:0.36717	validation_1-ndcg@5:0.37502
[195]	validation_0-ndcg@5:0.36722	validation_1-ndcg@5:0.37503
[196]	validation_0-ndcg@5:0.36739	validation_1-ndcg@5:0.37522
[197]	validation_0-ndcg@5:0.36737	validation_1-ndcg@5:0.37529
[198]	validation_0-ndcg@5:0.36755	validation_1-ndcg@5:0.37544
[199]	va

[66]	validation_0-ndcg@5:0.34837	validation_1-ndcg@5:0.35563
[67]	validation_0-ndcg@5:0.34912	validation_1-ndcg@5:0.35623
[68]	validation_0-ndcg@5:0.34931	validation_1-ndcg@5:0.35650
[69]	validation_0-ndcg@5:0.34971	validation_1-ndcg@5:0.35716
[70]	validation_0-ndcg@5:0.35017	validation_1-ndcg@5:0.35766
[71]	validation_0-ndcg@5:0.35066	validation_1-ndcg@5:0.35808
[72]	validation_0-ndcg@5:0.35100	validation_1-ndcg@5:0.35837
[73]	validation_0-ndcg@5:0.35137	validation_1-ndcg@5:0.35845
[74]	validation_0-ndcg@5:0.35150	validation_1-ndcg@5:0.35856
[75]	validation_0-ndcg@5:0.35166	validation_1-ndcg@5:0.35884
[76]	validation_0-ndcg@5:0.35187	validation_1-ndcg@5:0.35924
[77]	validation_0-ndcg@5:0.35188	validation_1-ndcg@5:0.35929
[78]	validation_0-ndcg@5:0.35219	validation_1-ndcg@5:0.35957
[79]	validation_0-ndcg@5:0.35229	validation_1-ndcg@5:0.35965
[80]	validation_0-ndcg@5:0.35210	validation_1-ndcg@5:0.35969
[81]	validation_0-ndcg@5:0.35237	validation_1-ndcg@5:0.35970
[82]	validation_0-ndcg@5

[199]	validation_0-ndcg@5:0.36332	validation_1-ndcg@5:0.37562
[200]	validation_0-ndcg@5:0.36322	validation_1-ndcg@5:0.37576
[201]	validation_0-ndcg@5:0.36288	validation_1-ndcg@5:0.37565
[202]	validation_0-ndcg@5:0.36300	validation_1-ndcg@5:0.37578
[203]	validation_0-ndcg@5:0.36332	validation_1-ndcg@5:0.37589
[204]	validation_0-ndcg@5:0.36330	validation_1-ndcg@5:0.37594
[205]	validation_0-ndcg@5:0.36362	validation_1-ndcg@5:0.37600
[206]	validation_0-ndcg@5:0.36390	validation_1-ndcg@5:0.37602
[207]	validation_0-ndcg@5:0.36393	validation_1-ndcg@5:0.37609
[208]	validation_0-ndcg@5:0.36374	validation_1-ndcg@5:0.37614
[209]	validation_0-ndcg@5:0.36362	validation_1-ndcg@5:0.37627
[210]	validation_0-ndcg@5:0.36395	validation_1-ndcg@5:0.37636
[211]	validation_0-ndcg@5:0.36399	validation_1-ndcg@5:0.37640
[212]	validation_0-ndcg@5:0.36386	validation_1-ndcg@5:0.37640
[213]	validation_0-ndcg@5:0.36372	validation_1-ndcg@5:0.37640
[214]	validation_0-ndcg@5:0.36403	validation_1-ndcg@5:0.37654
[215]	va

In [None]:
# Manual construction of the training fold; repeats the logic of
# get_cross_val_splits.
# NOTE(review): X_train_inds is not defined in any cell visible in this
# notebook, so this cell would fail on a fresh kernel - confirm whether it is a
# leftover that should be removed.
train_data= X.iloc[X_train_inds]
X_train = train_data.loc[:, ~train_data.columns.isin(['srch_id','rel'])]
y_train = train_data.loc[:, train_data.columns.isin(['rel'])]
# Number of rows per srch_id, in the (sorted) order groupby returns them.
train_groups = train_data.groupby('srch_id').size().to_frame('size')['size'].to_numpy()

In [None]:
# Manual construction of the validation fold; repeats the logic of
# get_cross_val_splits.
# NOTE(review): X_val_inds is not defined in any cell visible in this notebook,
# so this cell would fail on a fresh kernel - confirm whether it is a leftover
# that should be removed.
val_data= X.iloc[X_val_inds]
# Keep a copy that still has srch_id so the rows can be grouped per search.
X_val = val_data.loc[:, ~val_data.columns.isin(['srch_id','rel'])]
X_val_with_srch_id = val_data.loc[:, ~val_data.columns.isin(['rel'])]
y_val = val_data.loc[:, val_data.columns.isin(['rel'])]
# Number of rows per srch_id, in the (sorted) order groupby returns them.
eval_groups = X_val_with_srch_id.groupby('srch_id').size().to_frame('size')['size'].to_numpy()

In [None]:
# Feature importances rounded to two decimals. `model` is whatever was trained
# last (the cross-validation loop rebinds it on every fold).
model.feature_importances_.round(decimals=2)

In [None]:
# Label each importance with its feature name and plot the ten largest as a
# horizontal bar chart. X_train supplies the column names, so it must be the
# frame `model` was fitted on.
feat_importances = pd.Series(model.feature_importances_, index=X_train.columns)
feat_importances.nlargest(10).plot(kind='barh')

In [None]:
def recommend_properties(model, data):
    """Rank the properties of every search by predicted score.

    Args:
        model: fitted estimator exposing ``predict(features)``, returning one
            score per row.
        data: frame with a 'srch_id' column, a 'prop_id' column and the
            feature columns. Every column except 'srch_id' is passed to the
            model ('prop_id' included).

    Returns:
        DataFrame with columns ['srch_id', 'prop_id', 'scores'], ordered by
        ascending 'srch_id' and, within a search, by descending score. Empty
        input yields an empty frame with those columns.
    """
    columns = ['srch_id', 'prop_id', 'scores']
    if len(data) == 0:
        # pd.concat over zero per-search frames used to raise here.
        return pd.DataFrame(columns=columns)

    # Scores do not depend on the grouping, so one predict call over all rows
    # replaces one call per search.
    features = data.loc[:, ~data.columns.isin(['srch_id'])]
    ranked = pd.DataFrame({
        'srch_id': data['srch_id'].to_numpy(),
        'prop_id': data['prop_id'].to_numpy(),
        'scores': model.predict(features),
    })

    # Two stable sorts: best score first, then group by search. Stability
    # preserves the score order inside each search and makes ties deterministic.
    ranked = ranked.sort_values(by='scores', ascending=False, kind='mergesort')
    ranked = ranked.sort_values(by='srch_id', kind='mergesort')
    return ranked.reset_index(drop=True)

In [None]:
def sort_properties(srch_id, scores, prop_ids):
    """Return one search's properties ordered from highest to lowest score.

    The result has the columns 'srch_id' (the given id repeated on every row),
    'prop_id' and 'scores'.
    """
    ranking = pd.DataFrame({'srch_id': len(scores)*[srch_id]}).assign(
        prop_id=prop_ids.values,
        scores=scores,
    )
    return ranking.sort_values(by='scores', ascending=False)

In [None]:
# Rank the properties of every test search and write the submission file.
# `model` is whatever was trained last in the notebook.
start = time.time()
recommendations = recommend_properties(model, preprocessed_test)
end = time.time()
# This cell scores the test split, so the message says "test" (it previously
# said "validation").
print(f'Total Time for test prediction:{end - start}')
# The submission keeps srch_id/prop_id in ranked order; raw scores are dropped.
submission = recommendations.drop(columns=['scores'])
submission.to_csv('submission3.csv', index=False)

In [None]:
def recommend_properties_group(model, group):
    """Score the rows of a single search and return them ranked by score.

    Every column except 'srch_id' is passed to ``model.predict``. The search
    id is read from ``group.name``.
    NOTE(review): presumably meant for ``groupby('srch_id').apply(...)``,
    which sets ``.name`` on each sub-frame - no caller is visible here, confirm.
    """
    features = group.drop(columns=['srch_id'], errors='ignore')
    scores = model.predict(features)
    return sort_properties(group.name, scores, group['prop_id'])

In [None]:
# Display the first validation feature rows (X_val comes from the most recent
# split that was built).
X_val.head()