In [88]:
# imports
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import numpy as np
import random
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Concatenate, Dense, Flatten
from sklearn.model_selection import train_test_split
from tensorflow.keras import mixed_precision
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard
import datetime
mixed_precision.set_global_policy('mixed_float16')
log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = TensorBoard(log_dir=log_dir, histogram_freq=1)

In [89]:
ebnerd_type = "ebnerd_small"

In [90]:
# load datasets
train_behaviors = pd.read_parquet(f"drive/MyDrive/{ebnerd_type}/train/behaviors.parquet")
train_history = pd.read_parquet(f"drive/MyDrive/{ebnerd_type}/train/history.parquet")
articles_embeddings = pd.read_parquet(f"drive/MyDrive/{ebnerd_type}/bert_base_multilingual_cased.parquet")

In [106]:
# generate user-item interactions with positive and negative samples
user_item_pairs = []
unique_users = train_behaviors['user_id'].unique()
unique_items = train_behaviors['article_id'].unique()

for user_id in unique_users:
    # get all sessions for this user
    user_data = train_behaviors[train_behaviors['user_id'] == user_id]

    # initialize a set to keep track of all positive items (clicked articles)
    positive_items = set()

    # iterate over each session to gather clicked and non-clicked inview articles
    for _, session in user_data.iterrows():
        # extract clicked and inview article IDs for the session
        clicked_articles = set(session['article_ids_clicked']) if len(session['article_ids_clicked']) > 0 else set()
        inview_articles = set(session['article_ids_inview']) if len(session['article_ids_inview']) > 0 else set()

        # add clicked articles to positive items
        positive_items.update(clicked_articles)

        # add positive samples (clicked articles)
        for article_id in clicked_articles:
            user_item_pairs.append([user_id, article_id, 1])  # Interaction = 1 for positive samples

        # identify non-clicked inview articles as additional negative samples
        non_clicked_inview = inview_articles - clicked_articles
        for article_id in non_clicked_inview:
            user_item_pairs.append([user_id, article_id, 0])  # interaction = 0 for non-clicked inview articles

    # Generate additional random negative samples from inview articles
    potential_negatives = list(inview_articles - positive_items)
    sampled_negatives = random.sample(potential_negatives, min(10, len(potential_negatives)))

    for article_id in sampled_negatives:
        user_item_pairs.append([user_id, article_id, 0])  # interaction = 0 for additional random negatives

# Convert to DataFrame
interactions_df = pd.DataFrame(user_item_pairs, columns=['user_id', 'article_id', 'interaction'])

In [107]:
interactions_df.head()

Unnamed: 0,user_id,article_id,interaction
0,139836,9778657,1
1,139836,9778728,0
2,139836,9778669,0
3,139836,9778736,0
4,139836,9778682,0


In [62]:
interactions_df.to_parquet(f"drive/MyDrive/{ebnerd_type}/interactions.parquet")

In [108]:
# encode User and Item IDs
user_encoder = LabelEncoder()
item_encoder = LabelEncoder()

interactions_df['user'] = user_encoder.fit_transform(interactions_df['user_id'])
interactions_df['item'] = item_encoder.fit_transform(interactions_df['article_id'])

In [109]:
# prepare article content features
relevant_article_ids = interactions_df['article_id'].unique()
filtered_article_features = articles_embeddings[articles_embeddings['article_id'].isin(relevant_article_ids)][['article_id', 'google-bert/bert-base-multilingual-cased']].copy()

filtered_article_features['item'] = item_encoder.transform(filtered_article_features['article_id'])

article_features = filtered_article_features.set_index('item')['google-bert/bert-base-multilingual-cased'].to_dict()

interaction_data = interactions_df.copy()
interaction_data['article_features'] = interaction_data['item'].map(article_features)

In [110]:
interaction_data.head()

Unnamed: 0,user_id,article_id,interaction,user,item,article_features
0,139836,9778657,1,747,3876,"[-0.12876706, 0.1104751, -0.23633456, 0.072599..."
1,139836,9778728,0,747,3887,"[0.014924746, 0.03838996, -0.25095826, 0.10139..."
2,139836,9778669,0,747,3880,"[-0.22583604, 0.012108764, -0.26536357, 0.1470..."
3,139836,9778736,0,747,3889,"[-0.18424013, 0.15083934, -0.25177914, 0.12022..."
4,139836,9778682,0,747,3881,"[-0.033458985, 0.16076402, -0.16686285, 0.1048..."


In [96]:
interaction_data.to_parquet(f"drive/MyDrive/{ebnerd_type}/interaction_data.parquet")

KeyboardInterrupt: 

In [111]:
train_history.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15143 entries, 0 to 15142
Data columns (total 5 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   user_id                  15143 non-null  uint32
 1   impression_time_fixed    15143 non-null  object
 2   scroll_percentage_fixed  15143 non-null  object
 3   article_id_fixed         15143 non-null  object
 4   read_time_fixed          15143 non-null  object
dtypes: object(4), uint32(1)
memory usage: 532.5+ KB


In [112]:
train_history.head()

Unnamed: 0,user_id,impression_time_fixed,scroll_percentage_fixed,article_id_fixed,read_time_fixed
0,13538,"[2023-04-27T10:17:43.000000, 2023-04-27T10:18:...","[100.0, 35.0, 100.0, 24.0, 100.0, 23.0, 100.0,...","[9738663, 9738569, 9738663, 9738490, 9738663, ...","[17.0, 12.0, 4.0, 5.0, 4.0, 9.0, 5.0, 46.0, 11..."
1,14241,"[2023-04-27T09:40:18.000000, 2023-04-27T09:40:...","[100.0, 46.0, 100.0, 70.0, 100.0, 100.0, 100.0...","[9738557, 9738528, 9738533, 9738684, 9739035, ...","[8.0, 9.0, 28.0, 17.0, 91.0, 21.0, 14.0, 27.0,..."
2,20396,"[2023-04-27T12:30:44.000000, 2023-04-27T12:31:...","[100.0, 59.0, nan, nan, 100.0, 100.0, nan, nan...","[9738760, 9738355, 9738355, 9739864, 9741788, ...","[49.0, 34.0, 0.0, 60.0, 180.0, 49.0, 0.0, 0.0,..."
3,34912,"[2023-04-29T07:12:49.000000, 2023-04-29T13:01:...","[100.0, 35.0, 44.0, 31.0, 100.0, 100.0, 100.0,...","[9741802, 9741804, 9741803, 9740087, 9742039, ...","[153.0, 7.0, 5.0, 6.0, 44.0, 44.0, 108.0, 10.0..."
4,37953,"[2023-04-27T19:17:10.000000, 2023-04-27T19:17:...","[14.0, 28.0, 29.0, nan, 36.0, 33.0, 50.0, 100....","[9739205, 9739202, 9737084, 9739274, 9739358, ...","[4.0, 16.0, 4.0, 0.0, 5.0, 5.0, 25.0, 48.0, 6...."


In [113]:
# process user history features
history_features = train_history.groupby('user_id').agg({
    'read_time_fixed': lambda x: np.nanmean([t for t in x if t is not None]),  # average read time
    'scroll_percentage_fixed': lambda x: np.nanmean([s for s in x if s is not None]),  # average scroll percentage
    'article_id_fixed': 'count'  # total clicks
}).reset_index()


  'scroll_percentage_fixed': lambda x: np.nanmean([s for s in x if s is not None]),  # average scroll percentage


In [114]:
history_features.head()

Unnamed: 0,user_id,read_time_fixed,scroll_percentage_fixed,article_id_fixed
0,10068,40.842106,92.833336,1
1,10200,42.466667,58.857143,1
2,10201,22.47826,58.977272,1
3,10623,67.293709,51.28788,1
4,10701,28.258907,67.483788,1


In [115]:
history_features.columns = ['user_id', 'avg_read_time', 'avg_scroll_percentage', 'total_clicks']

In [116]:
# encode user IDs in history features to align with interactions_df
history_features['user'] = user_encoder.transform(history_features['user_id'])
history_features = history_features.set_index('user')

In [117]:
# Map user history features to the interaction dataset
interaction_data['avg_read_time'] = interaction_data['user'].map(history_features['avg_read_time']).fillna(0)
interaction_data['avg_scroll_percentage'] = interaction_data['user'].map(history_features['avg_scroll_percentage']).fillna(0)
interaction_data['total_clicks'] = interaction_data['user'].map(history_features['total_clicks']).fillna(0)

In [118]:
interaction_data.head()

Unnamed: 0,user_id,article_id,interaction,user,item,article_features,avg_read_time,avg_scroll_percentage,total_clicks
0,139836,9778657,1,747,3876,"[-0.12876706, 0.1104751, -0.23633456, 0.072599...",21.0,52.823528,1
1,139836,9778728,0,747,3887,"[0.014924746, 0.03838996, -0.25095826, 0.10139...",21.0,52.823528,1
2,139836,9778669,0,747,3880,"[-0.22583604, 0.012108764, -0.26536357, 0.1470...",21.0,52.823528,1
3,139836,9778736,0,747,3889,"[-0.18424013, 0.15083934, -0.25177914, 0.12022...",21.0,52.823528,1
4,139836,9778682,0,747,3881,"[-0.033458985, 0.16076402, -0.16686285, 0.1048...",21.0,52.823528,1


In [23]:
interaction_data.to_parquet(f"drive/MyDrive/{ebnerd_type}/interaction_data_complete.parquet")

In [119]:
num_users = interactions_df['user'].nunique()
num_items = interactions_df['item'].nunique()
embedding_dim = 32
article_embedding_dim = len(next(iter(article_features.values())))  # assuming fixed-size article embeddings

In [120]:
# user embedding
user_input = Input(shape=(1,), name='user_input')
user_embedding = Embedding(input_dim=num_users, output_dim=embedding_dim, name='user_embedding')(user_input)
user_embedding = Flatten()(user_embedding)

In [121]:
# item embedding
item_input = Input(shape=(1,), name='item_input')
item_embedding = Embedding(input_dim=num_items, output_dim=embedding_dim, name='item_embedding')(item_input)
item_embedding = Flatten()(item_embedding)

In [122]:
# article features (precomputed embeddings)
article_features_input = Input(shape=(article_embedding_dim,), name='article_features_input')

In [123]:
# additional user history features
avg_read_time_input = Input(shape=(1,), name='avg_read_time_input')
avg_scroll_percentage_input = Input(shape=(1,), name='avg_scroll_percentage_input')
total_clicks_input = Input(shape=(1,), name='total_clicks_input')

In [124]:
# concatenate all inputs
combined = Concatenate()([
    user_embedding,
    item_embedding,
    article_features_input,
    avg_read_time_input,
    avg_scroll_percentage_input,
    total_clicks_input
])

In [125]:
# add dense layers for non-linear transformation
dense_1 = Dense(64, activation='relu')(combined)
output = Dense(1, activation='sigmoid')(dense_1)  # Use sigmoid for binary interactions

In [126]:
# define and compile model
model = Model(inputs=[
    user_input,
    item_input,
    article_features_input,
    avg_read_time_input,
    avg_scroll_percentage_input,
    total_clicks_input
], outputs=output)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [127]:
# prepare features
user_ids = interaction_data['user'].values
item_ids = interaction_data['item'].values
article_features = interaction_data['article_features'].values
avg_read_times = interaction_data['avg_read_time'].values
avg_scroll_percentages = interaction_data['avg_scroll_percentage'].values
total_clicks = interaction_data['total_clicks'].values
interactions = interaction_data['interaction'].values

In [128]:
X = np.column_stack((
    user_ids,
    item_ids,
    article_features,  # Assumes article_features has shape (2815607, embedding_dim)
    avg_read_times,
    avg_scroll_percentages,
    total_clicks
))

In [129]:
# split data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, interactions, test_size=0.2, random_state=42)

In [130]:
# Separate inputs for the model
(user_train, item_train, article_features_train, avg_read_times_train,
 avg_scroll_percentages_train, total_clicks_train) = list(zip(*X_train))

(user_val, item_val, article_features_val, avg_read_times_val,
 avg_scroll_percentages_val, total_clicks_val) = list(zip(*X_val))

In [None]:
# continue training and validation data preparation
early_stopping = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)
history = model.fit(
    [user_train, item_train, article_features_train, avg_read_times_train, avg_scroll_percentages_train,
     total_clicks_train],
    y_train,
    validation_data=(
        [user_val, item_val, article_features_val, avg_read_times_val, avg_scroll_percentages_val,
         total_clicks_val],
        y_val
    ),
    epochs=10,  # Adjust based on performance
    batch_size=512,
    verbose=1,
    callbacks=[early_stopping]
)

In [None]:
# Evaluate on validation set
val_loss, val_accuracy = model.evaluate(
    [user_val, item_val, article_features_val, avg_read_times_val, avg_scroll_percentages_val,
     total_clicks_val],
    y_val
)
print(f"Validation Loss: {val_loss}, Validation Accuracy: {val_accuracy}")