## Imports

In [10]:
import pandas as pd
import numpy as np
from collections import Counter, OrderedDict
from xgboost import XGBRanker, XGBClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle


## Ranking with XGBoost

In [3]:
%%time
df_train = pd.read_csv('~/Desktop/DMT/train_reduced.csv', index_col=0)
# df_valid = pd.read_csv('~/Desktop/DMT/valid_pc.csv', index_col=0)
df_test = pd.read_csv('~/Desktop/DMT/test_set_VU_DM.csv')

CPU times: user 26.7 s, sys: 7.35 s, total: 34.1 s
Wall time: 33.2 s


In [11]:
df_train = shuffle(df_train)

In [16]:
X_train, X_valid, y_train, y_valid = train_test_split(
    df_train, df_train.click_bool, test_size=.33)

In [17]:
groups_size_train = OrderedDict(
    sorted(dict(X_train.srch_id.value_counts()).items()))

groups_size_valid = OrderedDict(
    sorted(dict(X_valid.srch_id.value_counts()).items()))


The main idea is to normalize/standardize (mean = 0 and standard deviation = 1) all features before applying classification. Standard Scaler from scikit-learn will normalize the features individually so that each feature column have mean = 0 and standard deviation = 1.

[Importance of Feature Scaling](https://scikit-learn.org/stable/auto_examples/preprocessing/plot_scaling_importance.html#sphx-glr-auto-examples-preprocessing-plot-scaling-importance-py)

In [18]:
# Normalizing training set

model = StandardScaler()
y_train = X_train.click_bool.values

X_train = X_train.drop([
    'click_bool', 'gross_bookings_usd', 'booking_bool', 'date_time', 'position'
],
                        axis=1).values

X_train = model.fit_transform(X_train)


In [20]:
# Normalizing validation set

y_valid = X_valid.click_bool.values

X_valid = X_valid.drop([
    'click_bool', 'gross_bookings_usd', 'booking_bool', 'date_time', 'position'
],
                        axis=1).values
# X = X.replace([np.inf, -np.inf], np.nan).fillna(0).values

X_valid = model.transform(X_valid)


In [21]:
# Test set for predictions

X_test = df_test.drop(['date_time'], axis=1).values

X_test = model.transform(X_test)

Using LambdaMART to perform list-wise ranking where Normalized Discounted Cumulative Gain (NDCG) is maximized, evaluation with NDCG@5

In [22]:
# Gradient boosting algorithm

model = XGBRanker(
    objective='rank:ndcg', learning_rate=.1, max_depth=5, n_estimators=1500)

model.fit(
    X_train,
    y_train,
    group=list(groups_size_train.values()),
    eval_metric='ndcg@5',
    eval_set=[(X_train, y_train), (X_valid, y_valid)],
    eval_group=[list(groups_size_train.values()),
                list(groups_size_valid.values())],
    early_stopping_rounds=100)


XGBRanker(base_score=0.5, booster='gbtree', colsample_bylevel=1,
     colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
     max_depth=5, min_child_weight=1, missing=None, n_estimators=1500,
     n_jobs=-1, nthread=None, objective='rank:ndcg', random_state=0,
     reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None, silent=True,
     subsample=1)

In [23]:
model.best_score

0.981743

In [24]:
y_pred = model.predict(X_test)

In [25]:
df_pred = pd.DataFrame(y_pred, columns=['score'], index=df_test.index)

In [26]:
df_pred['srch_id'] = df_test.srch_id
df_pred['prop_id'] = df_test.prop_id

In [27]:
df_pred.sort_values(['srch_id', 'score'], ascending=[True, False], inplace=True)

In [28]:
df_pred.drop(['score'], axis=1, inplace=True)
df_pred.reset_index(drop=True, inplace=True)

In [30]:
df_pred.to_csv('~/Desktop/DMT/submission03.csv', index=False)