In [69]:
"""
Imports
"""

import xgboost as xgb
import pandas as pd
import numpy as np
import plotly.express as px
from tabulate import tabulate
from sklearn.datasets import make_classification

## synthetic data

In [27]:
"""
data
"""

seed = 1234
X, y = make_classification(random_state=seed)
rng = np.random.default_rng(seed=seed)
n_query_groups = 3
qid = rng.integers(0, 3, size=X.shape[0])

# sort the inputs based on query index
sorted_idx = np.argsort(qid)
X = X[sorted_idx, :]
y = y[sorted_idx]
qid=qid[sorted_idx] # must be sorted in non-decreasing order for training process to work (not in the documentation)

In [47]:
# create dataframe from ndarray

df = pd.DataFrame(data=X)
df["y"] = y
df["qid"] = qid[sorted_idx]
print(tabulate(df.head(5), headers="keys", tablefmt="grid"))
print(df["y"].value_counts())

+----+----------+-----------+-----------+-----------+------------+-----------+-----------+------------+------------+------------+------------+-----------+-----------+------------+-----------+------------+------------+-----------+-----------+-----------+-----+-------+
|    |        0 |         1 |         2 |         3 |          4 |         5 |         6 |          7 |          8 |          9 |         10 |        11 |        12 |         13 |        14 |         15 |         16 |        17 |        18 |        19 |   y |   qid |
|  0 | 1.21112  |  0.663858 |  0.344323 | -1.02793  |  1.16554   |  0.903031 | -1.80463  |  0.633436  |  0.159223  |  0.457997  |  0.413614  | -0.176121 |  0.156078 | -0.11388   |  0.942438 | -1.87078   |  0.509758  |  1.63681  | -0.647094 |  0.6611   |   1 |     0 |
+----+----------+-----------+-----------+-----------+------------+-----------+-----------+------------+------------+------------+------------+-----------+-----------+------------+-----------+-----

In [50]:
# get 1's and 0's for every query

df.groupby(["qid", "y"]).agg(nrows = ("y", "count")).reset_index()

Unnamed: 0,qid,y,nrows
0,0,0,16
1,0,1,15
2,1,0,13
3,1,1,13
4,2,0,21
5,2,1,22


## ranking model

In [54]:
# model training

ranker = xgb.XGBRanker(tree_method="hist", lambdarank_num_pair_per_sample=8, objective="rank:ndcg", lambdarank_pair_method="topk")
ranker.fit(X, y, qid=qid)

In [68]:
# prediction

scores=ranker.predict(X)
df["scores"] = scores
print(tabulate(df.head(10), headers="keys", tablefmt="grid"))


+----+-----------+------------+------------+-----------+------------+------------+------------+------------+------------+------------+------------+-----------+-----------+------------+-----------+------------+------------+-----------+------------+------------+-----+-------+-----------+
|    |         0 |          1 |          2 |         3 |          4 |          5 |          6 |          7 |          8 |          9 |         10 |        11 |        12 |         13 |        14 |         15 |         16 |        17 |         18 |         19 |   y |   qid |    scores |
|  0 |  1.21112  |  0.663858  |  0.344323  | -1.02793  |  1.16554   |  0.903031  | -1.80463   |  0.633436  |  0.159223  |  0.457997  |  0.413614  | -0.176121 |  0.156078 | -0.11388   |  0.942438 | -1.87078   |  0.509758  |  1.63681  | -0.647094  |  0.6611    |   1 |     0 |  0.942984 |
+----+-----------+------------+------------+-----------+------------+------------+------------+------------+------------+------------+-----

In [72]:
# check prediction accuracy

df["y"] = df["y"].astype(str)
fig = px.violin(x=df["y"], y=df["scores"], box=True)
fig.show()

## classification model

In [74]:
classifier = xgb.XGBClassifier()
classifier.fit(X, y)

In [78]:
# prediction

pred=classifier.predict_proba(X)[:,1]
df["pred"] = pred
print(tabulate(df.head(10), headers="keys", tablefmt="grid"))


+----+-----------+------------+------------+-----------+------------+------------+------------+------------+------------+------------+------------+-----------+-----------+------------+-----------+------------+------------+-----------+------------+------------+-----+-------+-----------+-----------+
|    |         0 |          1 |          2 |         3 |          4 |          5 |          6 |          7 |          8 |          9 |         10 |        11 |        12 |         13 |        14 |         15 |         16 |        17 |         18 |         19 |   y |   qid |    scores |      pred |
|  0 |  1.21112  |  0.663858  |  0.344323  | -1.02793  |  1.16554   |  0.903031  | -1.80463   |  0.633436  |  0.159223  |  0.457997  |  0.413614  | -0.176121 |  0.156078 | -0.11388   |  0.942438 | -1.87078   |  0.509758  |  1.63681  | -0.647094  |  0.6611    |   1 |     0 |  0.942984 | 0.968692  |
+----+-----------+------------+------------+-----------+------------+------------+------------+--------

In [79]:
# check prediction accuracy

df["y"] = df["y"].astype(str)
fig = px.violin(x=df["y"], y=df["pred"], box=True)
fig.show()