# Config.py

In [3]:
# Alternative input paths (currently disabled); swap these in for the
# assignments below to read from the data/ directory instead.
# TRAINING_PATH = "data/training_set_VU_DM.csv"
# TEST_PATH = "data/test_set_VU_DM.csv"

# CSV files read by load_train_val() and load_test() respectively.
TRAINING_PATH = "train_subset.csv"
TEST_PATH = "test_subset.csv"

# Features.py

In [4]:
def add_datetime_features(df):
    """
    Derive calendar features from the 'date_time' column.

    Adds the columns 'month', 'dayofweek' and 'hour' to `df` in place
    (each read from the matching `.dt` accessor attribute) and returns
    the same DataFrame object.
    """
    timestamps = df['date_time'].dt
    for part in ('month', 'dayofweek', 'hour'):
        df[part] = getattr(timestamps, part)
    return df

def remove_travel_agents(df, min_bookings=20):
    """
    Remove travel agents, defined as users with at least `min_bookings` bookings.

    A row counts as a booking when its 'is_booking' value is not 0. Every
    row (booking or not) belonging to a user whose booking count reaches
    the threshold is dropped.

    Parameters
    ----------
    df : DataFrame with 'user_id' and 'is_booking' columns.
    min_bookings : booking count from which a user is treated as a travel
        agent (inclusive). Defaults to 20.

    Returns
    -------
    A filtered DataFrame; `df` itself is not modified and the surviving
    rows keep their original index labels.
    """
    # Count bookings per user in one pass instead of re-filtering the
    # frame once per user.
    booked_users = df.loc[df['is_booking'] != 0, 'user_id']
    booking_counts = booked_users.value_counts()
    agents = booking_counts.index[booking_counts >= min_bookings]
    return df[~df['user_id'].isin(agents)]

# Preprocess.py

In [5]:
# Import relevant packages
import numpy as np
import pandas as pd
from sklearn.model_selection import GroupShuffleSplit


def add_features(df):
    """
    Hook for feature engineering; currently returns `df` unchanged.

    Both candidate steps below are disabled.
    """
    # df = remove_travel_agents(df)
    # df = add_datetime_features(df)
    return df

def normalize(df):
    """
    Hook for feature normalization; currently returns `df` unchanged.
    """
    return df

def calculate_score(df):
    """
    Calculates the relevance score used as the ranking target.

    For each row the score is the larger of `booking_bool * 5.0` and
    `click_bool`, i.e. 5.0 for a booking, 1.0 for a click without a
    booking and 0.0 otherwise (for 0/1 inputs).

    The input DataFrame is left untouched: the weighting is applied to a
    copy, so calling this function repeatedly gives the same result and
    the caller's 'booking_bool' column keeps its original values.

    Returns
    -------
    A float Series aligned with `df`'s index.
    """
    weighted = df[['booking_bool', 'click_bool']].copy()
    weighted['booking_bool'] = weighted['booking_bool'] * 5.0
    return weighted.max(axis=1)

def train_val_split(X, y, groups, val_size=.7):
    """
    Splits training data based on groups, so that all rows sharing a group
    value end up on the same side of the split.

    Parameters
    ----------
    X, y : pandas objects with the same number of rows, in the same order.
    groups : group label per row (same length as X).
    val_size : NOTE: despite its name, this value is passed to
        GroupShuffleSplit as `train_size`, i.e. it is the fraction of
        groups assigned to the *training* set (default 0.7). The name is
        kept for backward compatibility.

    Returns
    -------
    X_train, y_train, X_val, y_val
    """
    gss = GroupShuffleSplit(n_splits=1, train_size=val_size, random_state=42)
    train_positions, val_positions = next(gss.split(X, y, groups))

    # split() yields positional indices, so select with .iloc; .loc would
    # look them up as index labels and break on a non-default index
    # (for example after rows have been filtered out).
    X_train, y_train = X.iloc[train_positions], y.iloc[train_positions]
    X_val, y_val = X.iloc[val_positions], y.iloc[val_positions]

    print("Training / Validation shape:")
    print((X_train.shape, y_train.shape), (X_val.shape, y_val.shape))
    return X_train, y_train, X_val, y_val

def load_train_val():
    """
    Read the training CSV and return a grouped train/validation split.

    Steps: parse TRAINING_PATH (with 'date_time' as datetime), attach the
    'score' target, run the feature and normalization hooks, separate the
    feature columns from the target frame and split by 'srch_id'.

    Returns whatever train_val_split returns:
    X_train, y_train, X_val, y_val.
    """
    frame = pd.read_csv(TRAINING_PATH, parse_dates=['date_time'])
    frame['score'] = calculate_score(frame)

    # Feature engineering first, then normalization.
    frame = normalize(add_features(frame))

    # Columns removed from the feature matrix.
    excluded = [
        'date_time',
        'position',
        'score',
        'click_bool',
        'booking_bool',
        'gross_bookings_usd',
    ]
    features = frame.drop(excluded, axis=1)
    targets = frame[['srch_id', 'score']]
    search_ids = frame['srch_id']
    return train_val_split(features, targets, search_ids)

def load_test():
    """
    Read the test CSV and return its feature matrix.

    The test data goes through the same add_features -> normalize hooks as
    the training data in load_train_val, so that both sets are always
    preprocessed identically. Only 'date_time' is dropped afterwards.
    """
    df = pd.read_csv(TEST_PATH, parse_dates=['date_time'])
    # Add engineered features
    df = add_features(df)
    # Normalize features (kept in step with the training pipeline)
    df = normalize(df)
    X = df.drop(['date_time'], axis=1)
    return X

# Main.py

In [28]:
from sklearn.metrics import ndcg_score
from xgboost import XGBRanker

def make_submission(model):
    """
    Rank the properties of every test search and write 'submission.csv'.

    The file has the header 'srch_id,prop_id' followed by one line per
    (search, property) pair, with the properties of each search in the
    order returned by make_ranking.

    Parameters
    ----------
    model : fitted model, passed through to make_ranking.
    """
    X_test = load_test()
    with open('submission.csv', 'w') as fout:
        # No space after the comma: it would become part of the column name.
        fout.write("srch_id,prop_id\n")
        # Group by the plain column name (not a one-element list) so the
        # group key is the scalar srch_id rather than a 1-tuple.
        for srch_id, group in X_test.groupby('srch_id'):
            prop_ids = make_ranking(group, model)
            fout.writelines(f"{srch_id},{prop_id}\n" for prop_id in prop_ids)
    print("Made submission")

def make_ranking(X_test, model):
    """
    Return the 'prop_id' values of `X_test` ordered best-first.

    Parameters
    ----------
    X_test : DataFrame holding the rows of one search, including 'prop_id'.
    model : fitted model, passed to predict().

    Returns
    -------
    Series of prop_ids sorted by predicted score, highest score first.
    """
    predictions = np.asarray(predict(model, X_test))
    # argsort is ascending, so sort the negated scores to put the
    # highest-scored property first; 'stable' keeps input order on ties.
    best_first = np.argsort(-predictions, kind='stable')
    prop_ids = X_test['prop_id'].iloc[best_first]
    return prop_ids

def evaluate_model(X_data, y_data, model=None, label="Train"):
    """
    Print and return the mean NDCG@5 of `model` over all searches.

    Parameters
    ----------
    X_data : feature DataFrame including 'srch_id'.
    y_data : DataFrame with 'srch_id' and 'score', row-aligned with X_data
        (same rows in the same order).
    model : fitted model handed to predict(). When omitted, a module-level
        variable named `model` is used, which keeps the old two-argument
        call working.
    label : name of the data set shown in the printed line.

    Returns
    -------
    The mean NDCG@5 over the searches.

    Raises
    ------
    NameError : no model was passed and no global `model` exists.
    ValueError : X_data and y_data have different lengths.
    """
    if model is None:
        try:
            model = globals()['model']
        except KeyError:
            raise NameError(
                "name 'model' is not defined; pass it as evaluate_model(..., model=...)"
            ) from None
    if len(X_data) != len(y_data):
        raise ValueError("X_data and y_data must have the same number of rows")

    # Predict once for all rows, then slice per search.
    predictions = np.asarray(predict(model, X_data))
    gt_values = y_data['score'].to_numpy()

    ndcg_score_list = []
    # .indices maps each srch_id to the positional row numbers of its group.
    for rows in X_data.groupby('srch_id').indices.values():
        score = ndcg_score(gt_values[rows].reshape(1, -1), predictions[rows].reshape(1, -1), k=5)
        ndcg_score_list.append(score)
    mean_score = np.mean(np.array(ndcg_score_list))
    print(f"NDCG@5 {label} Score:", mean_score)
    return mean_score

def predict(model, df):
    """
    Return model.predict() on every column of `df` except 'srch_id'.
    """
    return model.predict(df.loc[:, ~df.columns.isin(['srch_id'])])

def get_gt_values(df):
    """
    Return the 'score' column of `df`.
    """
    return df['score']

def run():
    """
    Train an XGBRanker on the training split, report NDCG@5 on the
    training and validation splits, and return the fitted model.
    """
    # Load the training, validation data
    X_train, y_train, X_val, y_val = load_train_val()

    # The `group` sizes below are listed in sorted srch_id order, and the
    # ranker interprets them as consecutive row blocks. Stable-sort the
    # rows by srch_id so the data is guaranteed to match that layout.
    order = np.argsort(X_train['srch_id'].to_numpy(), kind='stable')
    X_train, y_train = X_train.iloc[order], y_train.iloc[order]

    groups = X_train.groupby('srch_id').size().to_numpy()
    x_train_values = X_train.drop(['srch_id'], axis=1)
    # Train LambdaMART model
    model = XGBRanker(eval_metric='ndcg@5')
    y_train_scores = y_train['score']
    model.fit(x_train_values, y_train_scores, group=groups, verbose=True)

    # Evaluate training predictions
    evaluate_model(X_train, y_train, model=model, label="Train")
    # Evaluate validation predictions
    evaluate_model(X_val, y_val, model=model, label="Validation")

    return model

# Run main script ---------------------------------------------------------------------------------------

In [8]:
load_train_val()

Training / Validation shape:
((174605, 49), (174605, 2)) ((74329, 49), (74329, 2))


In [10]:
X_train.groupby('srch_id').size().to_frame('size')['size'].to_numpy()

array([32,  5, 33, ..., 13, 32, 21], dtype=int64)

In [12]:
X_train.drop(['srch_id'], axis=1)

In [13]:
XGBRanker(eval_metric='ndcg@5')

In [16]:
y_train['score']

In [17]:
model.fit(x_train_values, y_train_scores, group=groups, verbose=True)

XGBRanker(base_score=0.5, booster='gbtree', callbacks=None, colsample_bylevel=1,
          colsample_bynode=1, colsample_bytree=1, early_stopping_rounds=None,
          enable_categorical=False, eval_metric='ndcg@5', gamma=0, gpu_id=-1,
          grow_policy='depthwise', importance_type=None,
          interaction_constraints='', learning_rate=0.300000012, max_bin=256,
          max_cat_to_onehot=4, max_delta_step=0, max_depth=6, max_leaves=0,
          min_child_weight=1, missing=nan, monotone_constraints='()',
          n_estimators=100, n_jobs=0, num_parallel_tree=1, predictor='auto',
          random_state=0, reg_alpha=0, reg_lambda=1, ...)

In [29]:
evaluate_model(X_train, y_train)

NDCG@5 Train Score: 0.5451838479277122


In [30]:
evaluate_model(X_val, y_val)

NDCG@5 Train Score: 0.34703839717622303


In [37]:
load_test()

In [39]:
predict(model, X_test)

In [51]:
np.argsort(predictions)

array([ 60780, 124320, 227419, ..., 169657, 158138, 196192], dtype=int64)

In [55]:
X_test['prop_id'].iloc[sorted_indices]

60780      23856
124320     23512
227419    138309
133040     39748
26201      60832
           ...  
132636    137997
147771     21886
169657    137997
158138     62726
196192     62726
Name: prop_id, Length: 247567, dtype: int64

# ---------------------------------------------------------------------------------------------------------------

In [56]:
model = run()
make_submission(model)

Training / Validation shape:
((174605, 49), (174605, 2)) ((74329, 49), (74329, 2))
NDCG@5 Train Score: 0.5451838479277122
NDCG@5 Train Score: 0.34703839717622303


XGBRanker(base_score=0.5, booster='gbtree', callbacks=None, colsample_bylevel=1,
          colsample_bynode=1, colsample_bytree=1, early_stopping_rounds=None,
          enable_categorical=False, eval_metric='ndcg@5', gamma=0, gpu_id=-1,
          grow_policy='depthwise', importance_type=None,
          interaction_constraints='', learning_rate=0.300000012, max_bin=256,
          max_cat_to_onehot=4, max_delta_step=0, max_depth=6, max_leaves=0,
          min_child_weight=1, missing=nan, monotone_constraints='()',
          n_estimators=100, n_jobs=0, num_parallel_tree=1, predictor='auto',
          random_state=0, reg_alpha=0, reg_lambda=1, ...)

In [59]:
make_submission(model)

Made submission
