In [37]:
import pandas as pd
import numpy as np

# Build a synthetic learning-to-rank training set: a fixed number of sessions,
# each showing a fixed number of articles, exactly one of which is clicked.
np.random.seed(42)  # For reproducibility

# Constants
num_sessions = 50  # Number of sessions
articles_per_session = 5  # Number of articles per session
total_rows = num_sessions * articles_per_session

# Sequential session IDs (1..num_sessions), each repeated once per article so
# that all rows of a session are contiguous.
session_ids = np.repeat(np.arange(1, num_sessions + 1), articles_per_session)

# User IDs come from a pool of 50 users. A session belongs to one user, so all
# rows of a session must share the same user ID. The draw is still made per row
# (this keeps the random stream, and therefore every value generated below,
# unchanged); the first draw of each session is then broadcast to its rows.
user_pool = [f"User_{i}" for i in range(1, 51)]
per_row_user_draw = np.random.choice(user_pool, size=total_rows)
user_ids = np.repeat(per_row_user_draw[::articles_per_session], articles_per_session)

# Article IDs sampled from 1..200 with replacement, so the same article may
# appear more than once (even inside a single session).
article_ids = np.random.choice(np.arange(1, 201), size=total_rows, replace=True)

# Exactly one clicked article per session: take the first row index of every
# session and add a random offset inside that session.
session_starts = np.arange(0, total_rows, articles_per_session)
clicked_offsets = np.random.randint(0, articles_per_session, num_sessions)
clicked = np.zeros(total_rows, dtype=int)  # integer 0/1 labels
clicked[session_starts + clicked_offsets] = 1

# Three uniform random features in [0, 1) per article
feature_1 = np.random.rand(total_rows)
feature_2 = np.random.rand(total_rows)
feature_3 = np.random.rand(total_rows)

# Assemble everything into one row-per-(session, article) DataFrame
train_data = pd.DataFrame({
    'session_id': session_ids,
    'user_id': user_ids,
    'article_id': article_ids,
    'clicked': clicked,
    'feature_1': feature_1,
    'feature_2': feature_2,
    'feature_3': feature_3
})


In [38]:
import pandas as pd
import xgboost as xgb


# Features and labels for training.
# Only feature_1 and feature_2 are used, matching the two feature columns that
# the example test data provides.
X_train = train_data[['feature_1', 'feature_2']].values
y_train = train_data['clicked'].values

# Group sizes (number of articles per session) must be listed in the same order
# as the rows of X_train. That requires every session to occupy one contiguous
# run of rows, so check it: the number of runs must equal the number of
# distinct sessions.
train_session_runs = (train_data['session_id'] != train_data['session_id'].shift()).sum()
assert train_session_runs == train_data['session_id'].nunique(), (
    "rows of each session must be contiguous before building ranking groups"
)

# sort=False keeps groups in order of first appearance (i.e. row order) instead
# of sorting by session_id, which would misalign the sizes with the rows if the
# sessions were not stored in ascending ID order.
group_train = train_data.groupby('session_id', sort=False).size().to_list()


In [39]:
# Example test data (for prediction): two sessions with 3 and 2 candidate
# articles respectively.
# NOTE(review): the feature_2 values below (1.6-2.0) lie outside the [0, 1)
# range that np.random.rand produces for the training features, so the model is
# scored on values it never saw during training.
test_data = pd.DataFrame({
    'session_id': [125, 125, 125, 126, 126],
    'user_id': ['C', 'C', 'C', 'D', 'D'],
    'article_id': [301, 302, 303, 401, 402],
    'clicked': [0, 0, 0, 0, 0],  # In real test data, you may not know the 'clicked' column
    'feature_1': [0.6, 0.7, 0.8, 0.9, 1.0],
    'feature_2': [1.6, 1.7, 1.8, 1.9, 2.0]
})

# Features for testing (same columns, same order, as for training)
X_test = test_data[['feature_1', 'feature_2']].values

# Group sizes (number of articles per session) must follow the row order of
# X_test, which requires each session to be one contiguous run of rows.
test_session_runs = (test_data['session_id'] != test_data['session_id'].shift()).sum()
assert test_session_runs == test_data['session_id'].nunique(), (
    "rows of each session must be contiguous before building ranking groups"
)

# sort=False keeps the groups in row order rather than sorted by session_id.
group_test = test_data.groupby('session_id', sort=False).size().to_list()

In [40]:
# Wrap the feature matrices in XGBoost DMatrix containers; only the training
# matrix carries labels.
dtrain = xgb.DMatrix(data=X_train, label=y_train)
dtest = xgb.DMatrix(data=X_test)

# Attach the per-session group sizes so rows are ranked within their own
# session.
dtrain.set_group(group_train)
dtest.set_group(group_test)


In [41]:
# Hyper-parameters for the XGBoost ranker.
# NOTE(review): no `evals` watchlist is passed to xgb.train below, so the
# eval_metric is presumably never reported during training -- confirm.
params = dict(
    objective='rank:pairwise',  # pairwise ranking objective
    eta=0.1,                    # learning rate
    max_depth=6,                # maximum tree depth
    eval_metric='ndcg',         # Normalized Discounted Cumulative Gain
)

# Fit the booster for 100 boosting rounds on the grouped training matrix.
bst = xgb.train(params=params, dtrain=dtrain, num_boost_round=100)


In [42]:
# Score every test row with the trained ranker. The scores are only used to
# order the articles within a session (see the sort in the next cell).
y_pred = bst.predict(data=dtest)

# Store the scores next to the rows they were computed for.
test_data['predicted_score'] = y_pred


In [43]:
# Rank the articles: sessions in ascending ID order and, inside each session,
# highest predicted score first.
# NOTE(review): test_data is rebound to the sorted frame here, so re-running the
# prediction cell afterwards would presumably attach y_pred to rows that are no
# longer in their original order -- run the cells top to bottom.
test_data = test_data.sort_values(
    ['session_id', 'predicted_score'],
    ascending=[True, False],
)

# Show only the columns needed to read off the ranking.
ranking_columns = ['session_id', 'article_id', 'predicted_score']
print(test_data[ranking_columns])


   session_id  article_id  predicted_score
0         125         301        -1.539045
1         125         302        -2.029291
2         125         303        -2.162460
3         126         401         1.167572
4         126         402        -1.727297
