In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import time
import xgboost as xgb
from sklearn.model_selection import train_test_split

In [2]:
pd.set_option('max_columns', None)

In [3]:
start = time.time()
df_train = pd.read_csv("raw_data/train_data.csv")
end = time.time()
print("Train set loaded in ", end - start, " seconds.")

Train set loaded in  11.412056684494019  seconds.


In [4]:
df_train.head()

Unnamed: 0,srch_id,date_time,site_id,visitor_location_country_id,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_id,prop_starrating,prop_review_score,prop_brand_bool,prop_location_score1,prop_location_score2,prop_log_historical_price,position,price_usd,promotion_flag,srch_destination_id,srch_length_of_stay,srch_booking_window,srch_adults_count,srch_children_count,srch_room_count,srch_saturday_night_bool,srch_query_affinity_score,orig_destination_distance,random_bool,comp1_rate,comp1_inv,comp1_rate_percent_diff,comp2_rate,comp2_inv,comp2_rate_percent_diff,comp3_rate,comp3_inv,comp3_rate_percent_diff,comp4_rate,comp4_inv,comp4_rate_percent_diff,comp5_rate,comp5_inv,comp5_rate_percent_diff,comp6_rate,comp6_inv,comp6_rate_percent_diff,comp7_rate,comp7_inv,comp7_rate_percent_diff,comp8_rate,comp8_inv,comp8_rate_percent_diff,click_bool,gross_bookings_usd,booking_bool
0,1,2013-04-04 08:32:15,12,187,,,219,893,3,3.5,1,2.83,0.0438,4.95,27,104.77,0,23246,1,0,4,0,1,1,,,1,,,,0.0,0.0,,0.0,0.0,,,,,0.0,0.0,,,,,,,,0.0,0.0,,0,,0
1,1,2013-04-04 08:32:15,12,187,,,219,10404,4,4.0,1,2.2,0.0149,5.03,26,170.74,0,23246,1,0,4,0,1,1,,,1,,,,,,,0.0,0.0,,,,,0.0,1.0,,,,,,,,0.0,0.0,,0,,0
2,1,2013-04-04 08:32:15,12,187,,,219,21315,3,4.5,1,2.2,0.0245,4.92,21,179.8,0,23246,1,0,4,0,1,1,,,1,,,,0.0,0.0,,0.0,0.0,,,,,0.0,0.0,,,,,,,,0.0,0.0,,0,,0
3,1,2013-04-04 08:32:15,12,187,,,219,27348,2,4.0,1,2.83,0.0125,4.39,34,602.77,0,23246,1,0,4,0,1,1,,,1,,,,-1.0,0.0,5.0,-1.0,0.0,5.0,,,,0.0,1.0,,,,,,,,-1.0,0.0,5.0,0,,0
4,1,2013-04-04 08:32:15,12,187,,,219,29604,4,3.5,1,2.64,0.1241,4.93,4,143.58,0,23246,1,0,4,0,1,1,,,1,,,,0.0,0.0,,0.0,0.0,,,,,0.0,0.0,,,,,,,,0.0,0.0,,0,,0


In [5]:
df_train.shape

(4958347, 54)

In [6]:
df_train.isnull().sum()

srch_id                              0
date_time                            0
site_id                              0
visitor_location_country_id          0
visitor_hist_starrating        4706481
visitor_hist_adr_usd           4705359
prop_country_id                      0
prop_id                              0
prop_starrating                      0
prop_review_score                 7364
prop_brand_bool                      0
prop_location_score1                 0
prop_location_score2           1090348
prop_log_historical_price            0
position                             0
price_usd                            0
promotion_flag                       0
srch_destination_id                  0
srch_length_of_stay                  0
srch_booking_window                  0
srch_adults_count                    0
srch_children_count                  0
srch_room_count                      0
srch_saturday_night_bool             0
srch_query_affinity_score      4640941
orig_destination_distance

In [7]:
df_train['booking_bool'].value_counts()

0    4819957
1     138390
Name: booking_bool, dtype: int64

In [8]:
counts = pd.DataFrame(df_train['prop_id'].value_counts(sort=False).reset_index())
counts.columns = ['prop_id', 'counts']

In [9]:
counts

Unnamed: 0,prop_id,counts
0,4098,64
1,8196,1
2,12294,19
3,16392,12
4,20490,6
...,...,...
129108,24564,16
129109,16376,50
129110,12282,10
129111,8188,2


In [10]:
clicks = pd.DataFrame(df_train.groupby('prop_id')['click_bool'].sum().reset_index())
clicks.columns = ['prop_id', 'clicks']


In [11]:
clicks

Unnamed: 0,prop_id,clicks
0,1,1
1,2,1
2,3,2
3,4,1
4,5,2
...,...,...
129108,140817,0
129109,140818,0
129110,140819,0
129111,140820,0


In [12]:
bookings = pd.DataFrame(df_train.groupby('prop_id')['booking_bool'].sum().reset_index())
bookings.columns = ['prop_id', 'bookings']

In [13]:
bookings

Unnamed: 0,prop_id,bookings
0,1,0
1,2,1
2,3,2
3,4,1
4,5,0
...,...,...
129108,140817,0
129109,140818,0
129110,140819,0
129111,140820,0


In [14]:
temp = pd.merge(counts, clicks, left_on='prop_id', right_on='prop_id')
df_rates = pd.merge(temp, bookings, left_on='prop_id', right_on='prop_id')

In [15]:
df_rates

Unnamed: 0,prop_id,counts,clicks,bookings
0,4098,64,1,1
1,8196,1,0,0
2,12294,19,1,1
3,16392,12,0,0
4,20490,6,0,0
...,...,...,...,...
129108,24564,16,1,0
129109,16376,50,4,3
129110,12282,10,1,1
129111,8188,2,0,0


# Handle Missing Values

In [16]:
#fill NaNs in 'gross_bookings_usd with 0.
df_train.loc[df_train['gross_bookings_usd'].isnull(), 'gross_bookings_usd'] = 0

In [17]:
#fill NaNs with 0 for compet
comp_descr = {i: 0 for i in df_train.columns[27:51]}
df_train.fillna(comp_descr, inplace=True)

In [18]:
df_train[['visitor_hist_starrating','prop_starrating']]

Unnamed: 0,visitor_hist_starrating,prop_starrating
0,,3
1,,4
2,,3
3,,2
4,,4
...,...,...
4958342,,3
4958343,,3
4958344,,3
4958345,,3


In [19]:
df_train['visitor_hist_starrating'].value_counts()

4.00    23416
3.00    20879
3.50    17088
2.50    10534
2.00     9157
        ...  
2.01       16
1.96       11
1.92        7
4.96        6
4.65        6
Name: visitor_hist_starrating, Length: 312, dtype: int64

In [20]:
df_train['visitor_hist_starrating'].fillna(0, inplace=True)
df_train['visitor_hist_adr_usd'].fillna(0, inplace=True)

In [21]:
df_train['prop_review_score'].fillna(0, inplace=True)
df_train['prop_location_score2'].fillna(0, inplace=True)

In [22]:
df_train['orig_destination_distance'].mean()

1301.234405997199

If the mean is greater than the median, the distribution is positively skewed.

In [23]:
df_train['orig_destination_distance'].median()

386.6

In [24]:
df_train['orig_destination_distance'].fillna(df_train['orig_destination_distance'].mean(), inplace=True)

In [25]:
df_train['srch_query_affinity_score'].describe()

count    317406.000000
mean        -24.146418
std          15.743238
min        -326.567500
25%         -30.774775
50%         -20.451300
75%         -13.350625
max          -2.494100
Name: srch_query_affinity_score, dtype: float64

In [26]:
df_train['srch_query_affinity_score'].fillna(-330, inplace=True)

In [27]:
#Check if any more missing values in the dataset
df_train.isnull().sum()

srch_id                        0
date_time                      0
site_id                        0
visitor_location_country_id    0
visitor_hist_starrating        0
visitor_hist_adr_usd           0
prop_country_id                0
prop_id                        0
prop_starrating                0
prop_review_score              0
prop_brand_bool                0
prop_location_score1           0
prop_location_score2           0
prop_log_historical_price      0
position                       0
price_usd                      0
promotion_flag                 0
srch_destination_id            0
srch_length_of_stay            0
srch_booking_window            0
srch_adults_count              0
srch_children_count            0
srch_room_count                0
srch_saturday_night_bool       0
srch_query_affinity_score      0
orig_destination_distance      0
random_bool                    0
comp1_rate                     0
comp1_inv                      0
comp1_rate_percent_diff        0
comp2_rate

# Feature Engineering

Create a column called score which would be 
df['score'] = 4 * df.booking_bool + df.click_bool

In [28]:
df_train['score'] = 4 * df_train.booking_bool + df_train.click_bool

In [29]:
df_train[df_train['srch_id'] == 1].score.value_counts()

0    27
5     1
Name: score, dtype: int64

In [30]:
#df_train['starrating_diff'] = np.abs(df_train['visitor_hist_starrating'] - df_train['prop_starrating'])
#df_train['usd_diff'] = np.abs(df_train['visitor_hist_adr_usd'] - df_train['price_usd'])

In [31]:
del df_train['date_time']

In [32]:
df_train.shape

(4958347, 54)

In [33]:
y = df_train.score
y.shape

(4958347,)

In [46]:
X = df_train.drop(['score', 'booking_bool', 'click_bool'], axis=1)

# Train Test Split

In [36]:
#groups = np.array(df_train.srch_id.value_counts(sort=False).sort_index())

In [37]:
#groups

In [38]:
#xgb_rank = xgb.XGBRanker()

In [39]:
#model = xgb_rank.fit(X_train,y_train,groups[0:100], eval_set=[(X_test,y_test)], eval_group=[groups[149765:149865].tolist()],eval_metric='ndcg',early_stopping_rounds=20)

In [40]:
from sklearn.model_selection import GroupShuffleSplit

gss = GroupShuffleSplit(test_size=.40, n_splits=1, random_state = 7).split(X, groups=df_train['srch_id'])

X_train_inds, X_test_inds = next(gss)

In [41]:
X_train_inds

array([      0,       1,       2, ..., 4958344, 4958345, 4958346],
      dtype=int64)

In [50]:
train_data= df_train.iloc[X_train_inds]
X_train = train_data.loc[:, ~train_data.columns.isin(['srch_id','score'])]
y_train = train_data.loc[:, train_data.columns.isin(['score'])]
groups = train_data.groupby('srch_id').size().to_frame('size')['size'].to_numpy()

In [54]:
test_data= df_train.iloc[X_test_inds]
#We need to keep the id for later predictions
X_test = test_data.loc[:, ~test_data.columns.isin(['score'])]
y_test = test_data.loc[:, test_data.columns.isin(['score'])]

In [55]:
import xgboost as xgb

model = xgb.XGBRanker(  
    tree_method='gpu_hist',
    booster='gbtree',
    objective='rank:pairwise',
    random_state=42, 
    learning_rate=0.1,
    colsample_bytree=0.9, 
    eta=0.05, 
    max_depth=6, 
    n_estimators=110, 
    subsample=0.75 
    )

model.fit(X_train, y_train, group=groups, verbose=True)

XGBRanker(base_score=0.5, booster='gbtree', colsample_bylevel=1,
          colsample_bynode=1, colsample_bytree=0.9, enable_categorical=False,
          eta=0.05, gamma=0, gpu_id=0, importance_type=None,
          interaction_constraints='', learning_rate=0.1, max_delta_step=0,
          max_depth=6, min_child_weight=1, missing=nan,
          monotone_constraints='()', n_estimators=110, n_jobs=16,
          num_parallel_tree=1, predictor='auto', random_state=42, reg_alpha=0,
          reg_lambda=1, scale_pos_weight=None, subsample=0.75,
          tree_method='gpu_hist', validate_parameters=1, verbosity=None)

In [56]:
def predict(model, df):
    return model.predict(df.loc[:, ~df.columns.isin(['srch_id'])])
  
predictions = (X_test.groupby('srch_id')
               .apply(lambda x: predict(model, x)))

pandas.core.series.Series