In [47]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from metrics import * 

In [48]:
df = pd.read_parquet("xgboost_dataset_ebnerd_small.parquet")
print(len(df))
train_data, test_data = train_test_split(df, test_size=0.2)


202230


In [49]:
train_data.head()

Unnamed: 0,session_id,user_id,article_id,clicked,sentiment_score,is_premium_user,readtime_avg
144904,62813,2378157,9771151,False,0.9856,True,0.0
88107,168675,1218901,9771223,False,0.7164,False,0.0
200159,94848,477103,9771576,True,0.9863,False,0.0
183042,162934,1845202,9769624,False,0.7449,False,0.0
124011,108143,470094,9771903,False,0.993,False,0.0


In [50]:


# Features and labels for training
feature_list = ["sentiment_score","is_premium_user", "readtime_avg"]
X_train = train_data[feature_list].values
y_train = train_data['clicked'].values

X_test = test_data[feature_list].values
y_test = test_data['clicked'].values

# Group (number of articles per session)
group_train = train_data.groupby('session_id').size().to_list()
group_test = test_data.groupby('session_id').size().to_list()


In [51]:
# Convert training data to DMatrix
dtrain = xgb.DMatrix(X_train, label=y_train)
dtrain.set_group(group_train)

# Convert test data to DMatrix
dtest = xgb.DMatrix(X_test)
dtest.set_group(group_test)


# Set parameters for XGBoost ranking
params = {
    'objective': 'rank:pairwise',  # Pairwise ranking objective
    'eta': 0.1,  # Learning rate
    'max_depth': 6,  # Max tree depth
    'eval_metric': 'ndcg'  # Evaluation metric: Normalized Discounted Cumulative Gain
}

# Train the XGBoost model
bst = xgb.train(params, dtrain, num_boost_round=500)


In [52]:
# Make predictions on the test set
y_pred = bst.predict(dtest)

# Add predictions to the test dataframe
test_data['predicted_score'] = y_pred


In [53]:
# Sort articles by predicted score within each session
test_data = test_data.sort_values(by=['session_id', 'predicted_score'], ascending=[True, False])

# Display the ranked articles
print(test_data[['session_id', 'article_id', 'predicted_score']])




        session_id  article_id  predicted_score
3643            22     9768866         0.955214
3640            22     9775493        -0.429968
3647            22     7213923        -0.551562
77247           24     9776041         0.039638
77244           24     9775673        -0.151200
...            ...         ...              ...
193950      212276     9774020        -0.144959
193951      212276     9770028        -0.478744
193929      212276     9769624        -0.533289
194106      212291     9772433        -0.288509
194108      212291     9391394        -1.306868

[40446 rows x 3 columns]


In [54]:

results = test_data.groupby('session_id', group_keys=False).apply(lambda x: pd.Series({
    'AUC': calculate_auc(x),
    'MRR': calculate_mrr(x),
    'NDCG@5': calculate_ndcg(x, 5),
    'NDCG@10': calculate_ndcg(x, 10)
}))

print(results)


               AUC  MRR    NDCG@5   NDCG@10
session_id                                 
22             NaN  0.0  0.000000  0.000000
24             NaN  0.0  0.000000  0.000000
32             NaN  1.0  1.000000  1.000000
279            NaN  0.0  0.000000  0.000000
280            NaN  0.0  0.000000  0.000000
...            ...  ...       ...       ...
212004      0.4375  0.5  0.386853  0.564092
212185      1.0000  1.0  1.000000  1.000000
212232         NaN  0.0  0.000000  0.000000
212276         NaN  0.0  0.000000  0.000000
212291         NaN  0.0  0.000000  0.000000

[8458 rows x 4 columns]


  results = test_data.groupby('session_id', group_keys=False).apply(lambda x: pd.Series({
