In [1]:
from __future__ import annotations

import argparse
import json
import os
import pickle as pkl

import numpy as np
import pandas as pd
from sklearn.datasets import load_svmlight_file
from utils.common import CommonUtils

import xgboost as xgb
from xgboost.testing.data import RelDataCV, simulate_clicks, sort_ltr_samples

In [2]:
"""
Fold 1  - data
"""

train_path = "../data/MSLR-WEB10K/Fold1/train.txt"
valid_path = "../data/MSLR-WEB10K/Fold1/valid.txt"
test_path = "../data/MSLR-WEB10K/Fold1/test.txt"


def _load_ltr_split(path: str):
    """Read one svmlight-format split and return ``(X, y, qid)``.

    Parameters
    ----------
    path
        Location of the svmlight/libsvm text file to read.

    Returns
    -------
    tuple
        ``(X, y, qid)`` exactly as produced by ``load_svmlight_file`` with
        ``query_id=True`` and ``dtype=np.float32``, except that ``y`` and
        ``qid`` are cast to ``np.int32``.
    """
    X, y, qid = load_svmlight_file(path, query_id=True, dtype=np.float32)
    return X, y.astype(np.int32), qid.astype(np.int32)


# One call per split replaces three copy-pasted load/cast sequences.
X_train, y_train, qid_train = _load_ltr_split(train_path)
# NOTE: the validation split is loaded but is not passed to RelDataCV below.
X_valid, y_valid, qid_valid = _load_ltr_split(valid_path)
X_test, y_test, qid_test = _load_ltr_split(test_path)

# Bundle the train and test splits into the container used by later cells.
data = RelDataCV(
    train=(X_train, y_train, qid_train),
    test=(X_test, y_test, qid_test),
    max_rel=4,
)

In [3]:
"""
curate pandas dataframe - train data
"""

# Densify the training feature matrix via .toarray(), then append the label
# column ("y") and the query-id column ("qid") in that order.
train_data = pd.DataFrame(data.train[0].toarray()).assign(
    y=data.train[1],
    qid=data.train[2],
)
CommonUtils.show_df(train_data, 5)

+----+-----+-----+-----+-----+-----+-----+-----+----------+-----+-----+------+------+------+------+------+---------+---------+---------+---------+---------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+---------+------+----------+------+----------+----------+------+----------+------+----------+----------+------+----------+------+----------+----------+------+------+------+----------+----------+------+----------+------+----------+----------+------+----------+------+----------+----------+------+----------+------+----------+----------+---------+---------+------+---------+----------+---------+---------+------+----------+----------+---------+---------+------+----------+----------+---------+---------+------+----------+-----------+---------+---------+------+----------+------+------+------+------+------+----------+-------+----------+-------+----------+----------+---------+---------+-------+----------+----------+-----------+-----------+-

In [4]:
# Distinct query ids present in the training frame, and how many there are.

qids = {query_id for query_id in train_data["qid"]}
print(f"unique qids: {len(qids)}")

unique qids: 6000


In [5]:
"""
check count of items for every relevance score
"""

# Row count per (qid, relevance label) pair; only qid == 1 is displayed.
item_count = (
    train_data.groupby(["qid", "y"])
    .agg(nrows=("y", "count"))
    .reset_index()
)
is_first_query = item_count["qid"] == 1
CommonUtils.show_df(item_count.loc[is_first_query])

# Row count per relevance label over the whole training frame.
# Note: this rebinds `item_count`, replacing the per-query table above.
item_count = (
    train_data.groupby("y")
    .agg(nrows=("y", "count"))
    .reset_index()
)
CommonUtils.show_df(item_count)

+----+-------+-----+---------+
|    |   qid |   y |   nrows |
|----+-------+-----+---------|
|  0 |     1 |   0 |      57 |
|  1 |     1 |   1 |      16 |
|  2 |     1 |   2 |      12 |
|  3 |     1 |   3 |       1 |
+----+-------+-----+---------+
+----+-----+---------+
|    |   y |   nrows |
|----+-----+---------|
|  0 |   0 |  377957 |
|  1 |   1 |  232569 |
|  2 |   2 |   95082 |
|  3 |   3 |   12658 |
|  4 |   4 |    5146 |
+----+-----+---------+


In [24]:
# Order training rows by ascending query id so each query's rows are contiguous.
# Note: this rebinds `train_data` to the sorted frame.

train_data = train_data.sort_values("qid")
CommonUtils.show_df(train_data, 10)


+----+-----+-----+-----+-----+-----+-----+-----+-----+----------+-----+------+------+------+------+------+---------+---------+---------+---------+---------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+----------+------+------+----------+----------+-----------+------+------+----------+-----------+----------+------+------+----------+----------+----------+------+----------+-------+----------+----------+------+----------+----------+----------+----------+------+----------+----------+----------+---------+------+------+----------+---------+-----------+---------+---------+----------+----------+----------+---------+---------+---------+----------+----------+---------+---------+---------+----------+----------+---------+---------+---------+----------+------------+---------+---------+---------+------------+------+------+------+------+------+----------+-------+-------+----------+----------+---------+---------+---------+----------+---------+

In [26]:
"""
curate pandas dataframe - test data
"""

# Densify the test feature matrix, append label ("y") and query-id ("qid")
# columns, and order rows by ascending query id.
test_data = (
    pd.DataFrame(data.test[0].toarray())
    .assign(y=data.test[1], qid=data.test[2])
    .sort_values("qid")
)
CommonUtils.show_df(test_data, 5)

+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+------+------+------+------+------+---------+---------+---------+---------+---------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+--------+------+------+------+--------+----------+----------+----------+----------+----------+----------+----------+----------+------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+---------+----------+----------+----------+----------+--------+---------+------+----------+-----------+---------+---------+----------+----------+-----------+---------+---------+---------+----------+-------------+---------+-----------+---------+------------+------+------+------+------+------+----------+----------+---------+----------+----------+---------+---------+----------+----------

In [36]:
# Every column except the label ("y") and query id ("qid") is a model feature;
# original column order is preserved.
features = [name for name in train_data.columns if name not in {"y", "qid"}]


In [41]:
# Split each frame into feature matrix, labels, and query ids.
X_train, y_train, qid_train = (
    train_data[features],
    train_data["y"],
    train_data["qid"],
)
X_test, y_test, qid_test = (
    test_data[features],
    test_data["y"],
    test_data["qid"],
)

# Ranker configuration: histogram tree method on a CUDA device, top-k pair
# construction with 13 pairs per sample, reporting NDCG@1 and NDCG@8.
ranker = xgb.XGBRanker(
    tree_method="hist",
    device="cuda",
    lambdarank_pair_method="topk",
    lambdarank_num_pair_per_sample=13,
    eval_metric=["ndcg@1", "ndcg@8"],
)

# Evaluation sets are passed test-first, then train; eval_qid follows the
# same order. Progress is logged every 10 rounds.
ranker.fit(
    X_train,
    y_train,
    qid=qid_train,
    eval_set=[(X_test, y_test), (X_train, y_train)],
    eval_qid=[qid_test, qid_train],
    verbose=10,
)

[0]	validation_0-ndcg@1:0.38893	validation_0-ndcg@8:0.40805	validation_1-ndcg@1:0.39670	validation_1-ndcg@8:0.41129
[10]	validation_0-ndcg@1:0.46002	validation_0-ndcg@8:0.45898	validation_1-ndcg@1:0.50918	validation_1-ndcg@8:0.48017
[20]	validation_0-ndcg@1:0.48268	validation_0-ndcg@8:0.47900	validation_1-ndcg@1:0.54836	validation_1-ndcg@8:0.50738
[30]	validation_0-ndcg@1:0.48238	validation_0-ndcg@8:0.48600	validation_1-ndcg@1:0.56960	validation_1-ndcg@8:0.52347
[40]	validation_0-ndcg@1:0.49014	validation_0-ndcg@8:0.49161	validation_1-ndcg@1:0.58553	validation_1-ndcg@8:0.53628
[50]	validation_0-ndcg@1:0.49171	validation_0-ndcg@8:0.49360	validation_1-ndcg@1:0.59937	validation_1-ndcg@8:0.54734
[60]	validation_0-ndcg@1:0.49039	validation_0-ndcg@8:0.49470	validation_1-ndcg@1:0.60775	validation_1-ndcg@8:0.55509
[70]	validation_0-ndcg@1:0.49460	validation_0-ndcg@8:0.49595	validation_1-ndcg@1:0.61728	validation_1-ndcg@8:0.56256
[80]	validation_0-ndcg@1:0.49333	validation_0-ndcg@8:0.49597	vali