In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
import numpy as np
import random
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Concatenate, Dense, Flatten, TextVectorization, Attention
from sklearn.model_selection import train_test_split
from tensorflow.keras import mixed_precision, layers, metrics, backend as K, regularizers
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard
import datetime
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import roc_auc_score
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import precision_recall_curve
# Reproducibility: weight initialisation, dropout and batch shuffling are all
# stochastic, so seed every RNG the notebook imports before building anything.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Run layer computations in float16 while keeping variables in float32.
mixed_precision.set_global_policy('mixed_float16')

In [None]:
# load datasets
# Read the three pre-split parquet files (train / validation / test) in one pass.
train_df, val_df, test_df = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        "train_dataset_ebnerd_small.parquet",
        "val_dataset_ebnerd_small.parquet",
        "test_dataset_ebnerd_small.parquet",
    )
)

In [None]:
# Preview the first rows of the test split.
test_df.head()

In [None]:
# Summarise the training split: column names, dtypes and non-null counts.
train_df.info()

In [None]:
# Preview the first rows of the training split.
train_df.head()

In [None]:
# Preview the first rows of the validation split.
val_df.head()

In [None]:
# Report how many rows each split contains.
for split_label, split_frame in (("Train", train_df), ("Test", test_df), ("Validation", val_df)):
    print(f"{split_label} set size: {len(split_frame)} samples")

In [None]:
# data preparation
target = ["clicked"]
ids = ["impression_id", "session_id", "article_id", "user_id"]
excluded_columns = target + ids

# The target and the identifier columns must never be used as features,
# whatever dtype they happen to be stored with, so every dtype group is
# filtered against `excluded_columns` (previously only the boolean group was).
categorical_columns = [
    col
    for col in train_df.select_dtypes(include=['object', 'category']).columns
    if col not in excluded_columns
]
boolean_columns = [
    col
    for col in train_df.select_dtypes(include=['bool']).columns
    if col not in excluded_columns
]
# Everything that is neither excluded, categorical nor boolean is treated as numerical.
numerical_columns = [col for col in train_df.columns if col not in excluded_columns + categorical_columns + boolean_columns]

# The three groups are disjoint by construction, so plain concatenation gives
# the same features as a set would, but in a deterministic order.
feature_list = categorical_columns + boolean_columns + numerical_columns

print("Features used for modeling:", feature_list)

In [None]:
# encoding categorical columns
# Index 0 is reserved for values that never occur in the training split.
# LabelEncoder.transform raises ValueError on unseen labels, which would abort
# the notebook as soon as the validation or test split contains a new category.
UNKNOWN_CATEGORY_INDEX = 0
for col in categorical_columns:
    le = LabelEncoder()
    le.fit(train_df[col])
    # Known categories are numbered 1..K in the encoder's (sorted) class order.
    category_to_index = {category: index for index, category in enumerate(le.classes_, start=1)}
    for split_df in (train_df, val_df, test_df):
        split_df[col] = (
            split_df[col]
            .astype(object)  # plain objects so `category`-dtype columns accept the fill value
            .map(category_to_index)  # categories missing from the mapping become NaN
            .fillna(UNKNOWN_CATEGORY_INDEX)
            .astype('int64')
        )

In [None]:
# normalize numerical features
# Fit the scaler on the training split only, then apply the same transform
# to every split so validation/test statistics never leak into training.
scaler = StandardScaler().fit(train_df[numerical_columns])
for split_frame in (train_df, val_df, test_df):
    split_frame[numerical_columns] = scaler.transform(split_frame[numerical_columns])

In [None]:
# define model inputs
# One input tensor per feature group; each is as wide as the number of columns in that group.
n_categorical, n_numerical, n_boolean = (
    len(categorical_columns),
    len(numerical_columns),
    len(boolean_columns),
)
categorical_input = layers.Input(name="categorical_input", shape=(n_categorical,))
numerical_input = layers.Input(name="numerical_input", shape=(n_numerical,))
boolean_input = layers.Input(name="boolean_input", shape=(n_boolean,))

In [None]:
# embedding layers for categorical inputs
embedding_dim = 16
# A single embedding table is applied to all categorical columns at once, so
# it is sized for the largest encoded index found in any of them (+1 because
# indices start at 0).
vocabulary_size = train_df[categorical_columns].max().max() + 1
categorical_embedding = layers.Embedding(input_dim=vocabulary_size, output_dim=embedding_dim)
embedded_categorical = categorical_embedding(categorical_input)
# Collapse the per-column embedding vectors into one flat feature vector per row.
flattened_categorical = layers.Flatten()(embedded_categorical)

In [None]:
# dense layers for numerical inputs
# Project the scaled numerical features through a 32-unit ReLU layer.
numerical_projection = layers.Dense(units=32, activation='relu')
numerical_dense = numerical_projection(numerical_input)

In [None]:
# dense layers for boolean inputs (treated as numerical)
# Project the boolean flags through a 16-unit ReLU layer.
boolean_projection = layers.Dense(units=16, activation='relu')
boolean_dense = boolean_projection(boolean_input)

In [None]:
# concatenate all features
concatenated = layers.Concatenate()([flattened_categorical, numerical_dense, boolean_dense])

# fully connected head on top of the merged feature vector
x = layers.Dense(64, activation='relu')(concatenated)
x = layers.Dropout(0.3)(x)
x = layers.Dense(32, activation='relu')(x)
# The notebook runs under the global 'mixed_float16' policy. Keep the final
# sigmoid layer in float32 so the predicted probabilities (and the loss
# computed from them) are not rounded to half precision.
output = layers.Dense(1, activation='sigmoid', dtype='float32')(x)

In [None]:
# define the model
model = Model(inputs=[categorical_input, numerical_input, boolean_input], outputs=output)
# Metrics are named explicitly: Keras otherwise appends a counter ("auc_1",
# "auc_2", ...) every time a new metric object is created in the same kernel,
# which makes the keys in the training history/logs depend on how often this
# cell has been re-run.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=[
        metrics.AUC(name='auc'),
        metrics.Precision(name='precision'),
        metrics.Recall(name='recall'),
    ],
)
model.summary()

In [None]:
def _model_inputs(frame):
    """Return [categorical, numerical, boolean] arrays for `frame`.

    The list order matches the order in which the model's inputs are passed
    to `Model(inputs=[...])`.
    """
    return [
        frame[column_group].values
        for column_group in (categorical_columns, numerical_columns, boolean_columns)
    ]


# prepare training data
X_train, y_train = _model_inputs(train_df), train_df['clicked'].values

# prepare validation data
X_val, y_val = _model_inputs(val_df), val_df['clicked'].values

# prepare test data
X_test, y_test = _model_inputs(test_df), test_df['clicked'].values

In [None]:
# fine tuning
# calculate class weights to balance the dataset
train_classes = np.unique(y_train)
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=train_classes,
    y=y_train
)
# Key every weight by the class label it was computed for rather than by its
# position in `train_classes`; the two only coincide when the labels happen to
# be exactly 0..n-1. Plain int/float values keep the dict free of numpy scalars.
class_weights_dict = {
    int(label): float(weight)
    for label, weight in zip(train_classes, class_weights)
}

In [None]:
# Stop once the validation AUC has not improved for 5 epochs and roll back to
# the best weights seen.
early_stopping = tf.keras.callbacks.EarlyStopping(
    # The monitored key is 'val_' + the AUC metric's name. The former value
    # 'val_auc_9' only existed in a kernel session where nine AUC metric
    # objects had already been created; after Restart & Run All the metric is
    # called 'auc', so the callback never found its key.
    monitor='val_auc',
    patience=5,
    restore_best_weights=True,
    mode='max'  # AUC: higher is better
)

# train the model
history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=20,
    batch_size=64,
    class_weight=class_weights_dict,
    callbacks=[early_stopping],
    verbose=1
)

In [None]:
# evaluate the model
# `evaluate` returns the loss first, followed by the compiled metrics in the
# order they were passed to `compile`; AUC was listed first, hence index 1.
results = model.evaluate(x=X_test, y=y_test, verbose=1)
test_auc = results[1]
print(f"Test AUC: {test_auc:.4f}")

In [None]:
# predict on the test set
# The model emits one score per row with shape (n, 1); flatten to 1-D so it
# can be stored as a DataFrame column.
test_predictions = np.ravel(model.predict(X_test))
test_df['predicted_score'] = test_predictions

In [None]:
test_df['predicted_score'] = test_df['predicted_score'].astype('float32')
# Rank the candidate articles of every impression from highest to lowest score.
test_data = test_df.sort_values(by=['impression_id', 'predicted_score'], ascending=[True, False])
# Show the ranked view; previously the unsorted `test_df` was printed and the
# sorted frame built on the line above was never used.
print(test_data[['impression_id', 'user_id', 'article_id','clicked', 'predicted_score']])

In [None]:
# Rows without a label or without a prediction cannot be scored; drop them
# before computing the ranking metrics.
required_columns = ['clicked', 'predicted_score']
test_df = test_df.dropna(subset=required_columns)

In [None]:
#metrics
# AUC Calculation
def calculate_auc(data):
    """Return the ROC AUC of `predicted_score` against `clicked`.

    `data` must provide the columns 'clicked' and 'predicted_score'.
    NaN is returned when fewer than two distinct labels are present (AUC is
    undefined there) or when the scorer raises ValueError.
    """
    labels = data['clicked']
    if labels.nunique() >= 2:
        try:
            return roc_auc_score(labels, data['predicted_score'])
        except ValueError:
            # fall through to the NaN result below
            pass
    return np.nan

# MRR Calculation
def calculate_mrr(data):
    """Return the reciprocal rank of the first clicked row.

    Rows are ranked by 'predicted_score' in descending order (rank 1 is the
    highest score). Returns 0 when no row has `clicked == 1`.
    """
    ranked_clicks = data.sort_values(by='predicted_score', ascending=False)['clicked'].values
    # The first match in ranking order decides the result; 0 is the fallback.
    return next(
        (1 / position
         for position, was_clicked in enumerate(ranked_clicks, start=1)
         if was_clicked == 1),
        0,
    )

# NDCG Calculation
def dcg(scores, k):
    """Return the Discounted Cumulative Gain of the first `k` entries of `scores`.

    The entry at 1-based rank r contributes `score / log2(r + 1)`.
    """
    top_k = scores[:k]
    return sum(gain / np.log2(rank + 1) for rank, gain in enumerate(top_k, start=1))

def calculate_ndcg(data, k):
    """Return NDCG@k of the ranking induced by 'predicted_score'.

    The DCG of the clicks ordered by descending score is divided by the DCG of
    the ideal ordering (clicks sorted descending). Returns 0 when the ideal
    DCG is not positive, i.e. there is nothing relevant within the cutoff.
    """
    clicks_by_score = data.sort_values(by='predicted_score', ascending=False)['clicked'].values
    clicks_ideal = data.sort_values(by='clicked', ascending=False)['clicked'].values
    achieved = dcg(clicks_by_score, k)
    best_possible = dcg(clicks_ideal, k)
    if best_possible > 0:
        return achieved / best_possible
    return 0


In [None]:
def evaluate_by_impression(test_data):
    """Compute AUC, MRR and NDCG@{3,5,10} separately for every impression.

    `test_data` is grouped by 'impression_id' and each group is scored on its
    own. Returns a DataFrame with one row per impression; rows containing a
    NaN in any column (e.g. an undefined AUC) are dropped.
    """
    per_impression = [
        {
            'impression_id': impression_id,
            'AUC': calculate_auc(impression_rows),
            'MRR': calculate_mrr(impression_rows),
            'NDCG@3': calculate_ndcg(impression_rows, 3),
            'NDCG@5': calculate_ndcg(impression_rows, 5),
            'NDCG@10': calculate_ndcg(impression_rows, 10),
        }
        for impression_id, impression_rows in test_data.groupby('impression_id')
    ]
    return pd.DataFrame(per_impression).dropna()

# Evaluate the model
# Per-impression ranking metrics on the scored test split.
results = evaluate_by_impression(test_data=test_df)
print(results)

In [None]:
# Calculate mean metrics
# Report label -> column of `results` that is averaged for it.
metric_columns = {
    'Mean AUC': 'AUC',
    'Mean MRR': 'MRR',
    'Mean NDCG@3': 'NDCG@3',
    'Mean NDCG@5': 'NDCG@5',
    'Mean NDCG@10': 'NDCG@10',
}
mean_metrics = {
    label: results[column].mean()
    for label, column in metric_columns.items()
}

print("Mean Metrics:")
for label in mean_metrics:
    print(f"{label}: {mean_metrics[label]:.4f}")

In [None]:
# Count how many distinct values each NDCG cutoff takes across impressions.
for k in [3, 5, 10]:
    ndcg_column = f'NDCG@{k}'
    unique_values = results[ndcg_column].nunique()
    print(f"Unique NDCG@{k} scores: {unique_values}")

In [None]:
# plotting Results
def plot_metrics_distribution(results, metric, color, bins=20):
    """Draw a histogram with a KDE overlay for one metric column.

    Parameters:
        results: DataFrame holding the metric columns.
        metric: name of the column to plot; NaN values are ignored.
        color: colour used for the histogram.
        bins: number of histogram bins.
    """
    values = results[metric].dropna()
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.histplot(values, kde=True, color=color, bins=bins, ax=ax)
    ax.set_title(f"{metric} Distribution Across Sessions")
    ax.set_xlabel(metric)
    ax.set_ylabel("Frequency")
    ax.grid(axis='y')  # horizontal guide lines only
    plt.show()

# plot Distributions
# Metrics shown as histograms, each mapped to its plot colour.
hist_plot = {
    'AUC': 'skyblue',
    'MRR': 'salmon',
}

for metric, color in hist_plot.items():
    plot_metrics_distribution(results=results, metric=metric, color=color)

# NDCG metrics shown as bar plots, each mapped to its plot colour.
bar_plot = {
    'NDCG@3': 'lightgreen',
    'NDCG@5': 'orange',
    'NDCG@10': 'purple',
}

for metric, color in bar_plot.items():
    # Frequency of every distinct score of the current metric, ordered by score.
    counts = results[metric].value_counts().sort_index()
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.barplot(x=counts.index, y=counts.values, color=color, ax=ax)
    ax.set_title(f"{metric} Distribution Across Sessions")
    ax.set_xlabel(f"{metric} Score")
    ax.set_ylabel("Frequency")
    plt.xticks(rotation=45)  # tilt the score labels so they do not overlap
    plt.show()
