In [82]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split

In [83]:
df = pd.read_csv("xgboost_dataset.csv")

train_data, test_data = train_test_split(df, test_size=0.2)


In [84]:
train_data.head()

Unnamed: 0,session_id,user_id,article_id,clicked,sentiment_score,is_premium_user,readtime_avg
195847,190540,1279115,9772957,0,0.8235,False,61.50575
53324,89359,2471663,9774517,0,0.9955,False,41.028038
12852,24089,1955113,9345280,0,0.9959,True,38.6631
194883,91992,301528,9769497,1,0.8852,False,20.974249
171729,41919,1462963,9770515,0,0.9877,False,179.09302


In [85]:


# Features and labels for training
feature_list = ["sentiment_score","is_premium_user", "readtime_avg"]
X_train = train_data[feature_list].values
y_train = train_data['clicked'].values

X_test = test_data[feature_list].values
y_test = test_data['clicked'].values

# Group (number of articles per session)
group_train = train_data.groupby('session_id').size().to_list()
group_test = test_data.groupby('session_id').size().to_list()


In [86]:
# Convert training data to DMatrix
dtrain = xgb.DMatrix(X_train, label=y_train)
dtrain.set_group(group_train)

# Convert test data to DMatrix
dtest = xgb.DMatrix(X_test)
dtest.set_group(group_test)


In [87]:
# Set parameters for XGBoost ranking
params = {
    'objective': 'rank:pairwise',  # Pairwise ranking objective
    'eta': 0.1,  # Learning rate
    'max_depth': 6,  # Max tree depth
    'eval_metric': 'ndcg'  # Evaluation metric: Normalized Discounted Cumulative Gain
}

# Train the XGBoost model
bst = xgb.train(params, dtrain, num_boost_round=500)


In [88]:
# Make predictions on the test set
y_pred = bst.predict(dtest)

# Add predictions to the test dataframe
test_data['predicted_score'] = y_pred


In [89]:
# Sort articles by predicted score within each session
test_data = test_data.sort_values(by=['session_id', 'predicted_score'], ascending=[True, False])

# Display the ranked articles
print(test_data[['session_id', 'article_id', 'predicted_score']])




        session_id  article_id  predicted_score
57013            1     9770082        -0.042609
57014            1     9771367        -0.101735
57015            1     9771576        -0.215594
190553           8     9772963        -0.038218
65734           13     9773364        -0.227060
...            ...         ...              ...
193931      212276     9773947        -0.047989
193955      212276     9738729        -0.154330
193962      212276     9746360        -1.451408
193956      212276     9746360        -1.451408
194114      212291     9772227         0.207687

[40446 rows x 3 columns]


In [90]:
from sklearn.metrics import ndcg_score

# Ensure the 'clicked' column represents the relevance
ndcg = ndcg_score([test_data['clicked']], [test_data['predicted_score']])
print(f"NDCG Score: {ndcg}")


from sklearn.metrics import average_precision_score

# Calculate MAP for the test set
map_score = average_precision_score(test_data['clicked'], test_data['predicted_score'])
print(f"MAP Score: {map_score}")


def precision_at_k(test_data, k):
    # Sort by predicted score in descending order
    df_sorted = test_data.sort_values(by='predicted_score', ascending=False)
    # Take the top K entries
    top_k = df_sorted.head(k)
    # Calculate precision at K
    precision = sum(top_k['clicked']) / k
    return precision

k = 30  # You can adjust K
p_at_k = precision_at_k(test_data, k)
print(f"Precision at {k}: {p_at_k}")


def reciprocal_rank(test_data):
    df_sorted = test_data.sort_values(by='predicted_score', ascending=False)
    for idx, clicked in enumerate(df_sorted['clicked'], 1):
        if clicked == 1:  # First relevant article
            return 1.0 / idx
    return 0

mrr = reciprocal_rank(test_data)
print(f"MRR Score: {mrr}")



NDCG Score: 0.7605003622392007
MAP Score: 0.11572225535753783
Precision at 30: 0.2
MRR Score: 0.25
