In [1]:
"""
Imports
"""

import xgboost as xgb
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.datasets import make_classification
from utils.ranking_metrics import RankingMetrics
from utils.common import CommonUtils

## synthetic data

In [2]:
"""
data
"""

seed = 1234
n_query_groups = 3

# Synthetic binary-classification features/labels.
X, y = make_classification(random_state=seed)

# Assign every sample to one of `n_query_groups` query ids (0 .. n_query_groups - 1).
rng = np.random.default_rng(seed=seed)
qid = rng.integers(0, n_query_groups, size=X.shape[0])

# sort the inputs based on query index
# qid must be sorted in non-decreasing order for the training process to work
# (not in the documentation), so X and y are reordered with the same permutation.
sorted_idx = np.argsort(qid)
X = X[sorted_idx, :]
y = y[sorted_idx]
qid = qid[sorted_idx]

In [3]:
# wrap the feature matrix in a dataframe and attach the bookkeeping columns:
#   index - position of the row before the sort by query id
#   y     - label
#   qid   - query group id

df = pd.DataFrame(data=X).assign(index=sorted_idx, y=y, qid=qid)
CommonUtils.show_df(df)
print(df["y"].value_counts())

+----+----------+-----------+-----------+-----------+------------+-----------+-----------+------------+------------+------------+------------+-----------+-----------+------------+-----------+------------+------------+-----------+-----------+-----------+---------+-----+-------+
|    |        0 |         1 |         2 |         3 |          4 |         5 |         6 |          7 |          8 |          9 |         10 |        11 |        12 |         13 |        14 |         15 |         16 |        17 |        18 |        19 |   index |   y |   qid |
|----+----------+-----------+-----------+-----------+------------+-----------+-----------+------------+------------+------------+------------+-----------+-----------+------------+-----------+------------+------------+-----------+-----------+-----------+---------+-----+-------|
|  0 | 1.21112  |  0.663858 |  0.344323 | -1.02793  |  1.16554   |  0.903031 | -1.80463  |  0.633436  |  0.159223  |  0.457997  |  0.413614  | -0.176121 |  0.156078 |

In [4]:
# number of rows per (query, label) pair

label_counts = (
    df.groupby(["qid", "y"])
    .agg(nrows=("y", "count"))
    .reset_index()
)
CommonUtils.show_df(label_counts, 10)

+----+-------+-----+---------+
|    |   qid |   y |   nrows |
|----+-------+-----+---------|
|  0 |     0 |   0 |      16 |
|  1 |     0 |   1 |      15 |
|  2 |     1 |   0 |      13 |
|  3 |     1 |   1 |      13 |
|  4 |     2 |   0 |      21 |
|  5 |     2 |   1 |      22 |
+----+-------+-----+---------+


## ranking model

In [5]:
# model training: LambdaRank with an NDCG objective, fitted per query group

ranker = xgb.XGBRanker(
    objective="rank:ndcg",
    tree_method="hist",
    lambdarank_pair_method="topk",
    lambdarank_num_pair_per_sample=8,
)
ranker.fit(X, y, qid=qid)

In [6]:
# prediction: score every row with the ranker and keep the scores next to the labels

scores = ranker.predict(X)
df = df.assign(scores=scores)
CommonUtils.show_df(df)

+----+----------+-----------+-----------+-----------+------------+-----------+-----------+------------+------------+------------+------------+-----------+-----------+------------+-----------+------------+------------+-----------+-----------+-----------+---------+-----+-------+-----------+
|    |        0 |         1 |         2 |         3 |          4 |         5 |         6 |          7 |          8 |          9 |         10 |        11 |        12 |         13 |        14 |         15 |         16 |        17 |        18 |        19 |   index |   y |   qid |    scores |
|----+----------+-----------+-----------+-----------+------------+-----------+-----------+------------+------------+------------+------------+-----------+-----------+------------+-----------+------------+------------+-----------+-----------+-----------+---------+-----+-------+-----------|
|  0 | 1.21112  |  0.663858 |  0.344323 | -1.02793  |  1.16554   |  0.903031 | -1.80463  |  0.633436  |  0.159223  |  0.457997  | 

In [7]:
# check prediction accuracy

# The label is cast to str only for the plot call; df["y"] itself is left
# untouched so that re-running this cell, or any cell that reads the label,
# always sees the same dtype.
fig = px.violin(
    x=df["y"].astype(str),
    y=df["scores"],
    box=True,
    title="Ranker score distribution per label",
)
fig.show()

In [8]:
"""
get ranking metric - mean average precision at 10
"""

top_k = 10

# Rows ordered by query, best score first within each query.
sorted_df = df.sort_values(by=["qid", "scores"], ascending=[True, False])
# Same rows in their original order; the label is forced to int because the
# plotting step may have left it as a string.
actual_df = df.assign(y=df["y"].astype(int))

# NOTE: the scores were predicted on the very rows the ranker was fitted on,
# so this is an in-sample figure.
average_precision = {}
# `query_id` (not `qid`) so the qid array used for training is not overwritten.
for query_id in sorted(set(df["qid"])):
    in_query = actual_df["qid"] == query_id
    actual_index = actual_df.loc[in_query, "index"].tolist()
    clicked_flag = actual_df.loc[in_query, "y"].tolist()
    sorted_index = sorted_df.loc[sorted_df["qid"] == query_id, "index"].tolist()
    avg_precision, _ = RankingMetrics.average_precision_at_k(actual_index, sorted_index, clicked_flag, top_k)
    average_precision[query_id] = avg_precision

# `mean_ap` rather than `map`, which would shadow the builtin.
mean_ap = np.mean(list(average_precision.values()))
print(f"Mean Average Precision @ {top_k}: {mean_ap}")


Mean Average Precision @ 10: 1.0


## classification model

In [9]:
# model training: plain binary classifier with library defaults, fitted
# without any query-group information

classifier = xgb.XGBClassifier()
classifier.fit(X=X, y=y)

In [10]:
# prediction: keep the second predict_proba column as the per-row score

class_probabilities = classifier.predict_proba(X)
pred = class_probabilities[:, 1]
df = df.assign(pred=pred)
CommonUtils.show_df(df, 5)

+----+----------+-----------+-----------+-----------+------------+-----------+-----------+------------+------------+------------+------------+-----------+-----------+------------+-----------+------------+------------+-----------+-----------+-----------+---------+-----+-------+-----------+-----------+
|    |        0 |         1 |         2 |         3 |          4 |         5 |         6 |          7 |          8 |          9 |         10 |        11 |        12 |         13 |        14 |         15 |         16 |        17 |        18 |        19 |   index |   y |   qid |    scores |      pred |
|----+----------+-----------+-----------+-----------+------------+-----------+-----------+------------+------------+------------+------------+-----------+-----------+------------+-----------+------------+------------+-----------+-----------+-----------+---------+-----+-------+-----------+-----------|
|  0 | 1.21112  |  0.663858 |  0.344323 | -1.02793  |  1.16554   |  0.903031 | -1.80463  |  0.

In [11]:
# check prediction accuracy

# The label is cast to str only for the plot call; df["y"] itself is left
# untouched so that re-running this cell, or any cell that reads the label,
# always sees the same dtype.
fig = px.violin(
    x=df["y"].astype(str),
    y=df["pred"],
    box=True,
    title="Classifier probability distribution per label",
)
fig.show()

In [12]:
"""
get ranking metric - mean average precision at 10
"""

top_k = 10

# Rows ordered by query, highest predicted probability first within each query.
sorted_df = df.sort_values(by=["qid", "pred"], ascending=[True, False])
# Same rows in their original order; the label is forced to int because the
# plotting step may have left it as a string.
actual_df = df.assign(y=df["y"].astype(int))

# NOTE: the probabilities were predicted on the very rows the classifier was
# fitted on, so this is an in-sample figure.
average_precision = {}
# `query_id` (not `qid`) so the qid array used for training is not overwritten.
for query_id in sorted(set(df["qid"])):
    in_query = actual_df["qid"] == query_id
    actual_index = actual_df.loc[in_query, "index"].tolist()
    clicked_flag = actual_df.loc[in_query, "y"].tolist()
    sorted_index = sorted_df.loc[sorted_df["qid"] == query_id, "index"].tolist()
    avg_precision, _ = RankingMetrics.average_precision_at_k(actual_index, sorted_index, clicked_flag, top_k)
    average_precision[query_id] = avg_precision

# `mean_ap` rather than `map`, which would shadow the builtin.
mean_ap = np.mean(list(average_precision.values()))
print(f"Mean Average Precision @ {top_k}: {mean_ap}")


Mean Average Precision @ 10: 1.0
