### Shell

Installing necessary packages for the notebook. We recommend using a Conda virtual environment to ensure reliability.

In [1]:
%pip install pytorch_lightning
%pip install torchmetrics
%pip install --upgrade tensorboard
%pip install pandas
%pip install nbconvert

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


### Imports

These libraries are essential for data manipulation, neural network building, and training.

In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import torch.nn as nn
import pytorch_lightning as pl
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import torch
import os
from collections import Counter

There was a problem when trying to write in your cache folder (/Users/benceszabo/.cache/huggingface/hub). You should set the environment variable TRANSFORMERS_CACHE to a writable directory.


### Import for TensorBoard

This part sets up TensorBoard logger, which is used for visualization and monitoring of the model's training progress.

In [3]:
from pytorch_lightning.loggers import TensorBoardLogger

logger = TensorBoardLogger("tb_logs", name="my_model")

### Data Preprocessing

In this section, we load and preprocess the data. It includes loading data from Parquet files, joining tables, generating binary labels, building indexes for items and users, and splitting the data into train and validation sets.

In [4]:
# Load EBNeRD behaviors dataset for both train and validation
train_behaviour = pd.read_parquet("./ebnerd_small/train/behaviors.parquet")
valid_behaviour = pd.read_parquet("./ebnerd_small/validation/behaviors.parquet")
behaviors = pd.concat([train_behaviour, valid_behaviour], ignore_index=True)

behaviors.head()

Unnamed: 0,impression_id,article_id,impression_time,read_time,scroll_percentage,device_type,article_ids_inview,article_ids_clicked,user_id,is_sso_user,gender,postcode,age,is_subscriber,session_id,next_read_time,next_scroll_percentage
0,149474,,2023-05-24 07:47:53,13.0,,2,"[9778623, 9778682, 9778669, 9778657, 9778736, ...",[9778657],139836,False,,,,False,759,7.0,22.0
1,150528,,2023-05-24 07:33:25,25.0,,2,"[9778718, 9778728, 9778745, 9778669, 9778657, ...",[9778623],143471,False,,,,False,1240,287.0,100.0
2,153068,9778682.0,2023-05-24 07:09:04,78.0,100.0,1,"[9778657, 9778669, 9772866, 9776259, 9756397, ...",[9778669],151570,False,,,,False,1976,45.0,100.0
3,153070,9777492.0,2023-05-24 07:13:14,26.0,100.0,1,"[9020783, 9778444, 9525589, 7213923, 9777397, ...",[9778628],151570,False,,,,False,1976,4.0,18.0
4,153071,9778623.0,2023-05-24 07:11:08,125.0,100.0,1,"[9777492, 9774568, 9565836, 9335113, 9771223, ...",[9777492],151570,False,,,,False,1976,26.0,100.0


In [5]:
# Load EBNeRD history dataset for both train and validation
train_history = pd.read_parquet("./ebnerd_small/train/history.parquet")
valid_history = pd.read_parquet("./ebnerd_small/validation/history.parquet")
history = pd.concat([train_history, valid_history], ignore_index=True)

history.head()

Unnamed: 0,user_id,impression_time_fixed,scroll_percentage_fixed,article_id_fixed,read_time_fixed
0,13538,"[2023-04-27T10:17:43.000000, 2023-04-27T10:18:...","[100.0, 35.0, 100.0, 24.0, 100.0, 23.0, 100.0,...","[9738663, 9738569, 9738663, 9738490, 9738663, ...","[17.0, 12.0, 4.0, 5.0, 4.0, 9.0, 5.0, 46.0, 11..."
1,14241,"[2023-04-27T09:40:18.000000, 2023-04-27T09:40:...","[100.0, 46.0, 100.0, 70.0, 100.0, 100.0, 100.0...","[9738557, 9738528, 9738533, 9738684, 9739035, ...","[8.0, 9.0, 28.0, 17.0, 91.0, 21.0, 14.0, 27.0,..."
2,20396,"[2023-04-27T12:30:44.000000, 2023-04-27T12:31:...","[100.0, 59.0, nan, nan, 100.0, 100.0, nan, nan...","[9738760, 9738355, 9738355, 9739864, 9741788, ...","[49.0, 34.0, 0.0, 60.0, 180.0, 49.0, 0.0, 0.0,..."
3,34912,"[2023-04-29T07:12:49.000000, 2023-04-29T13:01:...","[100.0, 35.0, 44.0, 31.0, 100.0, 100.0, 100.0,...","[9741802, 9741804, 9741803, 9740087, 9742039, ...","[153.0, 7.0, 5.0, 6.0, 44.0, 44.0, 108.0, 10.0..."
4,37953,"[2023-04-27T19:17:10.000000, 2023-04-27T19:17:...","[14.0, 28.0, 29.0, nan, 36.0, 33.0, 50.0, 100....","[9739205, 9739202, 9737084, 9739274, 9739358, ...","[4.0, 16.0, 4.0, 0.0, 5.0, 5.0, 25.0, 48.0, 6...."


In [6]:
# Load EBNeRD news dataset
news = pd.read_parquet("./ebnerd_small/articles.parquet")

news.head()

Unnamed: 0,article_id,title,subtitle,last_modified_time,premium,body,published_time,image_ids,article_type,url,...,entity_groups,topics,category,subcategory,category_str,total_inviews,total_pageviews,total_read_time,sentiment_score,sentiment_label
0,3001353,Natascha var ikke den første,"Politiet frygter nu, at Nataschas bortfører ha...",2023-06-29 06:20:33,False,Sagen om den østriske Natascha og hendes bortf...,2006-08-31 08:06:45,[3150850],article_default,https://ekstrabladet.dk/krimi/article3001353.ece,...,[],"[Kriminalitet, Personfarlig kriminalitet]",140,[],krimi,,,,0.9955,Negative
1,3003065,Kun Star Wars tjente mere,Biografgængerne strømmer ind for at se 'Da Vin...,2023-06-29 06:20:35,False,Vatikanet har opfordret til at boykotte filmen...,2006-05-21 16:57:00,[3006712],article_default,https://ekstrabladet.dk/underholdning/filmogtv...,...,[],"[Underholdning, Film og tv, Økonomi]",414,"[433, 434]",underholdning,,,,0.846,Positive
2,3012771,Morten Bruun fyret i SønderjyskE,FODBOLD: Morten Bruun fyret med øjeblikkelig v...,2023-06-29 06:20:39,False,Kemien mellem spillerne i Superligaklubben Søn...,2006-05-01 14:28:40,[3177953],article_default,https://ekstrabladet.dk/sport/fodbold/dansk_fo...,...,[],"[Erhverv, Kendt, Sport, Fodbold, Ansættelsesfo...",142,"[196, 199]",sport,,,,0.8241,Negative
3,3023463,Luderne flytter på landet,I landets tyndest befolkede områder skyder bor...,2023-06-29 06:20:43,False,Det frække erhverv rykker på landet. I den tyn...,2007-03-24 08:27:59,[3184029],article_default,https://ekstrabladet.dk/nyheder/samfund/articl...,...,[],"[Livsstil, Erotik]",118,[133],nyheder,,,,0.7053,Neutral
4,3032577,Cybersex: Hvornår er man utro?,En flirtende sms til den flotte fyr i regnskab...,2023-06-29 06:20:46,False,"De fleste af os mener, at et tungekys er utros...",2007-01-18 10:30:37,[3030463],article_default,https://ekstrabladet.dk/sex_og_samliv/article3...,...,[],"[Livsstil, Partnerskab]",565,[],sex_og_samliv,,,,0.9307,Neutral


### Join history and behaviour tables

In [7]:
# Left join on 'user_id'
behaviour_history_merged= pd.merge(behaviors, history, on='user_id', how='left')

# Display the merged data
behaviour_history_merged.head()

Unnamed: 0,impression_id,article_id,impression_time,read_time,scroll_percentage,device_type,article_ids_inview,article_ids_clicked,user_id,is_sso_user,...,postcode,age,is_subscriber,session_id,next_read_time,next_scroll_percentage,impression_time_fixed,scroll_percentage_fixed,article_id_fixed,read_time_fixed
0,149474,,2023-05-24 07:47:53,13.0,,2,"[9778623, 9778682, 9778669, 9778657, 9778736, ...",[9778657],139836,False,...,,,False,759,7.0,22.0,"[2023-05-03T19:04:15.000000, 2023-05-03T19:05:...","[100.0, 89.0, 27.0, 33.0, 100.0, 75.0, 39.0, 2...","[9745590, 9748574, 9748432, 9748080, 9750687, ...","[60.0, 11.0, 1.0, 15.0, 37.0, 15.0, 4.0, 8.0, ..."
1,150528,,2023-05-24 07:33:25,25.0,,2,"[9778718, 9778728, 9778745, 9778669, 9778657, ...",[9778623],143471,False,...,,,False,1240,287.0,100.0,"[2023-04-27T08:05:09.000000, 2023-04-27T10:05:...","[21.0, 100.0, 34.0, 85.0, 92.0, 75.0, 52.0, 66...","[9737881, 9738659, 9738569, 9738490, 9738528, ...","[7.0, 24.0, 28.0, 65.0, 16.0, 41.0, 59.0, 24.0..."
2,150528,,2023-05-24 07:33:25,25.0,,2,"[9778718, 9778728, 9778745, 9778669, 9778657, ...",[9778623],143471,False,...,,,False,1240,287.0,100.0,"[2023-05-04T07:10:24.000000, 2023-05-04T07:10:...","[77.0, 80.0, 28.0, 11.0, 94.0, 54.0, 74.0, 30....","[9748977, 9748976, 9747490, 9745484, 9747959, ...","[3.0, 29.0, 2.0, 3.0, 16.0, 30.0, 4.0, 3.0, 4...."
3,153068,9778682.0,2023-05-24 07:09:04,78.0,100.0,1,"[9778657, 9778669, 9772866, 9776259, 9756397, ...",[9778669],151570,False,...,,,False,1976,45.0,100.0,"[2023-04-27T14:07:16.000000, 2023-04-27T14:08:...","[100.0, nan, 100.0, 14.0, 100.0, 100.0, 100.0,...","[9738303, 9738993, 9738303, 9738902, 9738303, ...","[59.0, 1.0, 2.0, 8.0, 4.0, 28.0, 51.0, 7.0, 7...."
4,153068,9778682.0,2023-05-24 07:09:04,78.0,100.0,1,"[9778657, 9778669, 9772866, 9776259, 9756397, ...",[9778669],151570,False,...,,,False,1976,45.0,100.0,"[2023-05-04T20:50:44.000000, 2023-05-04T20:51:...","[100.0, nan, 100.0, 100.0, 100.0, 18.0, 100.0,...","[9750389, 9749756, 9750389, 9750318, 9749582, ...","[27.0, 8.0, 10.0, 24.0, 13.0, 7.0, 5.0, 34.0, ..."


### Generate Binary Labels

Generating binary labels enables us to tackle the binary classification problem. Note that this operation could be optimised.

In [26]:
# Function to create binary labels column
def create_binary_labels_column(df):
    # Define the column names
    clicked_col = "article_ids_clicked"
    inview_col = "article_ids_inview"
    labels_col = "labels"

    # Create a new column with binary labels
    df[labels_col] = df.apply(lambda row: [1 if article_id in row[clicked_col] else 0 for article_id in row[inview_col]], axis=1)

    # Shuffle the data
    df = df.sample(frac=1, random_state=123)

    # Add a column with the length of the labels list
    df[labels_col + "_len"] = df[labels_col].apply(len)

    return df

# Apply the function to your merged dataset
behaviour_history_merged = create_binary_labels_column(behaviour_history_merged)

# Display the updated dataset
behaviour_history_merged.head()

Unnamed: 0,impression_id,article_id,impression_time,read_time,scroll_percentage,device_type,article_ids_inview,article_ids_clicked,user_id,is_sso_user,...,next_read_time,next_scroll_percentage,impression_time_fixed,scroll_percentage_fixed,article_id_fixed,read_time_fixed,labels,labels_len,userIdx,articleIdx
193433,256591377,,2023-05-23 15:57:24,30.0,,2,"[9759955, 9777505, 9776420, 9777565, 9777768, ...",[9775042],2096171,False,...,74.0,100.0,"[2023-05-04T07:59:29.000000, 2023-05-04T08:12:...","[nan, 100.0, 27.0, 30.0, nan, 100.0, 23.0, nan...","[9748990, 9748980, 9747490, 9749349, 9749916, ...","[0.0, 46.0, 3.0, 6.0, 1.0, 118.0, 13.0, 0.0, 1...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",23,3904,0
7484,7510741,,2023-05-19 09:06:55,10.0,,1,"[9557348, 9772193, 9440508, 9772115, 9417521, ...",[9772221],1116443,False,...,352.0,100.0,"[2023-05-04T08:48:35.000000, 2023-05-04T08:48:...","[36.0, 100.0, 29.0, 54.0, 32.0, 41.0, 86.0, 10...","[9749114, 9748916, 9748976, 9749025, 9747490, ...","[4.0, 545.0, 7.0, 6.0, 8.0, 68.0, 111.0, 99.0,...","[0, 0, 0, 0, 0, 0, 0, 1, 0, 0]",10,13240,0
738463,390191481,,2023-05-26 16:57:50,36.0,,2,"[9778732, 9783159, 9783043, 9220931, 9783073, ...",[9783051],2032861,False,...,2.0,51.0,"[2023-05-04T07:12:54.000000, 2023-05-04T07:13:...","[88.0, 27.0, 100.0, 29.0, 17.0, 100.0, 88.0, 7...","[9749025, 9747490, 9749224, 9749092, 9748213, ...","[2.0, 3.0, 9.0, 8.0, 7.0, 55.0, 18.0, 9.0, 6.0...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]",14,3068,0
352984,464818085,9767909.0,2023-05-19 06:11:32,37.0,100.0,2,"[9771787, 9771473, 9219607, 9593037, 9771999, ...",[9771916],1545564,False,...,2.0,29.0,"[2023-04-27T08:16:43.000000, 2023-04-27T18:07:...","[61.0, 28.0, 100.0, 29.0, nan, 62.0, 27.0, 100...","[9737023, 9739179, 9737084, 9739035, 9739035, ...","[18.0, 3.0, 9.0, 14.0, 2.0, 17.0, 39.0, 36.0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0]",16,578,18013
604011,209409427,9788251.0,2023-05-31 02:37:15,10.0,100.0,2,"[9788225, 9780986, 9484153, 9777321, 9436758, ...",[9788352],2079241,False,...,21.0,100.0,"[2023-05-22T18:22:41.000000, 2023-05-22T19:21:...","[100.0, 59.0, 100.0, 30.0, 100.0, 19.0, 100.0,...","[9776394, 9776449, 9776148, 9769605, 9776438, ...","[25.0, 24.0, 43.0, 4.0, 111.0, 8.0, 111.0, 23....","[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0]",11,8175,19696


In [9]:
# Build index of items    
ind2article = {idx + 1: itemid for idx, itemid in enumerate(news['article_id'].values)}
article2ind = {itemid: idx for idx, itemid in ind2article.items()}

# Build index of users
unique_userIds = behaviour_history_merged['user_id'].unique()
ind2user = {idx + 1: itemid for idx, itemid in enumerate(unique_userIds)}
user2ind = {itemid: idx for idx, itemid in ind2user.items()}

behaviour_history_merged['userIdx'] = behaviour_history_merged['user_id'].map(lambda x: user2ind.get(x, 0))
behaviour_history_merged['articleIdx'] = behaviour_history_merged['article_id'].map(lambda x: article2ind.get(x, 0))
print(f"We have {len(article2ind)} unique articles in the dataset")
print(f"We have {len(user2ind)} unique users in the dataset")

We have 20738 unique articles in the dataset
We have 18827 unique users in the dataset


In [10]:
# Split data into train and validation
test_time_threshold = behaviour_history_merged['impression_time'].quantile(0.9)
train_data = behaviour_history_merged[behaviour_history_merged['impression_time'] < test_time_threshold]
valid_data = behaviour_history_merged[behaviour_history_merged['impression_time'] >= test_time_threshold]

### Dataset Model

Defining the dataset model.

In [11]:
class EBNeRDMindDataset(Dataset):
    def __init__(self, df):
        self.data = {
            'userIdx': torch.tensor(df.userIdx.values),
            'articleIdx': torch.tensor(df.articleIdx.values),
            'labels': torch.tensor([item for sublist in df.labels for item in sublist], dtype=torch.float32),
        }

    def __len__(self):
        return len(self.data['userIdx'])

    def __getitem__(self, idx):
        return {
            'userIdx': self.data['userIdx'][idx],
            'articleIdx': self.data['articleIdx'][idx],
            'click': self.data['labels'][idx].long(),
            'noclick': 1 - self.data['labels'][idx].long(),
        }

In [12]:
# Build datasets and dataloaders for train and validation dataframes
bs = 1024
ds_train = EBNeRDMindDataset(train_data)
train_loader = DataLoader(ds_train, batch_size=bs, shuffle=True)
ds_valid = EBNeRDMindDataset(valid_data)
valid_loader = DataLoader(ds_valid, batch_size=bs, shuffle=False)


### Model

This section defines our neural network model. It includes creating data loaders, defining the model architecture (NewsMF), specifying training steps, validation steps, optimizer, and training configurations.

In [30]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import pytorch_lightning as pl
from torch.utils.data import Dataset, DataLoader
from torchmetrics.classification import BinaryF1Score, BinaryAUROC

class NewsMF(pl.LightningModule):
    def __init__(self, num_users, num_items, dim=10):
        super().__init__()
        self.dim = dim
        self.useremb = nn.Embedding(num_embeddings=num_users, embedding_dim=dim)
        self.itememb = nn.Embedding(num_embeddings=num_items, embedding_dim=dim)
        
        # BinaryF1Score metric
        self.f1_metric = BinaryF1Score()
        self.train_step_f1_outputs = []
        self.validation_step_f1_outputs = []

        # BinaryAUROC metric
        self.binary_auroc = BinaryAUROC()
        self.train_step_auroc_outputs = []
        self.validation_step_auroc_outputs = []


    def forward(self, user, item):
        batch_size = user.size(0)
        uservec = self.useremb(user)
        itemvec = self.itememb(item)

        score = (uservec * itemvec).sum(-1).unsqueeze(-1)

        return score

    def training_step(self, batch, batch_idx):
        batch_size = batch['userIdx'].size(0)

        # Compute loss as cross entropy (categorical distribution between the clicked and the no-clicked item)
        score_click = self.forward(batch['userIdx'], batch['click'])
        score_noclick = self.forward(batch['userIdx'], batch['noclick'])

        loss = F.cross_entropy(input=torch.cat((score_click, score_noclick), dim=1),
                               target=torch.zeros(batch_size, device=score_click.device).long())
        
        # Compute F1-score
        f1_click = self.f1_metric(score_click.squeeze(), torch.ones_like(batch['click']))
        f1_noclick = self.f1_metric(score_noclick.squeeze(), torch.zeros_like(batch['noclick']))

        # Average F1-scores
        f1 = (f1_click + f1_noclick) / 2.0

        self.train_step_f1_outputs.append(f1)

        # Calculate Binary AUROC
        binary_auroc_score = self.binary_auroc(torch.cat((score_click, score_noclick), dim=1),
                                                torch.cat((torch.ones_like(batch['click']),
                                                           torch.zeros_like(batch['noclick'])))
                                               )
        
        self.train_step_auroc_outputs.append(binary_auroc_score)

        # Log metrics to TensorBoard
        self.log('train_loss', loss)
        self.log('train_f1', f1)
        self.log('train_auroc', binary_auroc_score)

        return {'loss': loss, 'f1': f1, 'auroc': binary_auroc_score}
    
    def on_train_epoch_end(self):
        epoch_average_f1 = torch.stack(self.train_step_f1_outputs).mean()
        print(f'Epoch {self.current_epoch}: Training F1 Score: {epoch_average_f1.item()}')
        self.log("train_epoch_average_f1", epoch_average_f1)
        self.train_step_f1_outputs.clear()  # free memory

        epoch_average_auroc = torch.stack(self.train_step_auroc_outputs).mean()
        print(f'Epoch {self.current_epoch}: Training AUROC Score: {epoch_average_auroc.item()}')
        self.log("train_epoch_average_auroc", epoch_average_auroc)
        self.validation_step_auroc_outputs.clear()  # free memory

    def validation_step(self, batch, batch_idx):
        # Compute loss as cross-entropy (categorical distribution between clicked and non-clicked items)
        score_click = self.forward(batch['userIdx'], batch['click'])
        score_noclick = self.forward(batch['userIdx'], batch['noclick'])

        loss = F.cross_entropy(input=torch.cat((score_click, score_noclick), dim=1),
                            target=torch.zeros(batch['userIdx'].size(0), device=score_click.device).long())
        
        # F1 Score
        f1_click = self.f1_metric(score_click.squeeze(), torch.ones_like(batch['click']))
        f1_noclick = self.f1_metric(score_noclick.squeeze(), torch.zeros_like(batch['noclick']))
        f1 = (f1_click + f1_noclick) / 2.0 # Average F1-scores

        self.validation_step_f1_outputs.append(f1)

        # Calculate Binary AUROC
        binary_auroc_score = self.binary_auroc(torch.cat((score_click, score_noclick), dim=1),
                                                torch.cat((torch.ones_like(batch['click']),
                                                           torch.zeros_like(batch['noclick'])))
                                               )
        
        self.validation_step_auroc_outputs.append(binary_auroc_score)

        # Log metrics to TensorBoard
        self.log('val_loss', loss)
        self.log('val_f1', f1)
        self.log('val_auroc', binary_auroc_score)
                
        return {'loss': loss, 'f1': f1, 'auroc': binary_auroc_score}

    def on_validation_epoch_end(self):
        epoch_average_f1 = torch.stack(self.validation_step_f1_outputs).mean()
        print(f'Epoch {self.current_epoch}: Validation F1 Score: {epoch_average_f1.item()}')
        self.log("validation_epoch_average_f1", epoch_average_f1)
        self.validation_step_f1_outputs.clear()  # free memory

        epoch_average_auroc = torch.stack(self.validation_step_auroc_outputs).mean()
        print(f'Epoch {self.current_epoch}: Validation AUROC Score: {epoch_average_auroc.item()}')
        self.log("validation_epoch_average_auroc", epoch_average_auroc)
        self.validation_step_auroc_outputs.clear()  # free memory

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=1e-3)
        return optimizer

In [31]:
ebnerd_model = NewsMF(num_users=len(user2ind) + 1, num_items=len(article2ind) + 1)

In [15]:
# Instantiate the trainer
trainer = pl.Trainer(max_epochs=10, logger=logger)

# Train the model
trainer.fit(model=ebnerd_model, train_dataloaders=train_loader, val_dataloaders=valid_loader)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name         | Type          | Params
-----------------------------------------------
0 | useremb      | Embedding     | 188 K 
1 | itememb      | Embedding     | 207 K 
2 | f1_metric    | BinaryF1Score | 0     
3 | binary_auroc | BinaryAUROC   | 0     
-----------------------------------------------
395 K     Trainable params
0         Non-trainable params
395 K     Total params
1.583     Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/Users/benceszabo/anaconda3/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=3` in the `DataLoader` to improve performance.


Epoch 0: Validation F1 Score: 0.33549195528030396
Epoch 0: Validation AUROC Score: 0.48946571350097656


/Users/benceszabo/anaconda3/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=3` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 0: Validation F1 Score: 0.34851565957069397
Epoch 0: Validation AUROC Score: 0.5033974647521973
Epoch 0: Training F1 Score: 0.3419516980648041
Epoch 0: Training AUROC Score: 0.49968552589416504


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 1: Validation F1 Score: 0.3702296018600464
Epoch 1: Validation AUROC Score: 0.5035315155982971
Epoch 1: Training F1 Score: 0.35778793692588806
Epoch 1: Training AUROC Score: 0.49964776635169983


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 2: Validation F1 Score: 0.38648325204849243
Epoch 2: Validation AUROC Score: 0.5033668279647827
Epoch 2: Training F1 Score: 0.3793454170227051
Epoch 2: Training AUROC Score: 0.4997762441635132


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 3: Validation F1 Score: 0.3947860896587372
Epoch 3: Validation AUROC Score: 0.5032120943069458
Epoch 3: Training F1 Score: 0.3919564485549927
Epoch 3: Training AUROC Score: 0.4996449649333954


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 4: Validation F1 Score: 0.39743658900260925
Epoch 4: Validation AUROC Score: 0.5031260848045349
Epoch 4: Training F1 Score: 0.396745890378952
Epoch 4: Training AUROC Score: 0.4996395707130432


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 5: Validation F1 Score: 0.3986334502696991
Epoch 5: Validation AUROC Score: 0.5030762553215027
Epoch 5: Training F1 Score: 0.3988230526447296
Epoch 5: Training AUROC Score: 0.4995845854282379


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 6: Validation F1 Score: 0.39917171001434326
Epoch 6: Validation AUROC Score: 0.5030227899551392
Epoch 6: Training F1 Score: 0.40000370144844055
Epoch 6: Training AUROC Score: 0.49969977140426636


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 7: Validation F1 Score: 0.3996926546096802
Epoch 7: Validation AUROC Score: 0.5030072331428528
Epoch 7: Training F1 Score: 0.40086811780929565
Epoch 7: Training AUROC Score: 0.4997261166572571


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 8: Validation F1 Score: 0.40019991993904114
Epoch 8: Validation AUROC Score: 0.5029613375663757
Epoch 8: Training F1 Score: 0.40120041370391846
Epoch 8: Training AUROC Score: 0.49972003698349


Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=10` reached.


Epoch 9: Validation F1 Score: 0.4001691937446594
Epoch 9: Validation AUROC Score: 0.5029410123825073
Epoch 9: Training F1 Score: 0.4015786647796631
Epoch 9: Training AUROC Score: 0.49980100989341736


In [16]:
logs = trainer.logged_metrics

# Print or inspect the logs
print("Training and validation logs:", logs)

Training and validation logs: {'train_loss': tensor(0.3250), 'train_f1': tensor(0.3943), 'train_auroc': tensor(0.5005), 'val_loss': tensor(0.3074), 'val_f1': tensor(0.4001), 'val_auroc': tensor(0.5029), 'validation_epoch_average_f1': tensor(0.4002), 'validation_epoch_average_auroc': tensor(0.5029), 'train_epoch_average_f1': tensor(0.4016), 'train_epoch_average_auroc': tensor(0.4998)}


### Prediction test

Here, we perform a prediction test using our trained model. It involves selecting a random user, generating predictions for item recommendations, and filtering the top recommended items.

In [17]:
USER_ID = 2350 # Random user id
# Create item_ids and user ids list
item_id = list(ind2article.keys())
userIdx =  [USER_ID]*len(item_id)

preditions = ebnerd_model.forward(torch.IntTensor(userIdx), torch.IntTensor(item_id))

# Select top 10 argmax
top_index = torch.topk(preditions.flatten(), 10).indices

# Filter for top 10 suggested items
filters = [ind2article[ix.item()] for ix in top_index]
news[news["article_id"].isin(filters)]

Unnamed: 0,article_id,title,subtitle,last_modified_time,premium,body,published_time,image_ids,article_type,url,...,entity_groups,topics,category,subcategory,category_str,total_inviews,total_pageviews,total_read_time,sentiment_score,sentiment_label
541,4135980,Smykkefirma stævner kendis-designer,Smykkefirmaet Shamballa Jewels går til angreb ...,2023-06-29 06:38:18,False,Danske Jannik Olander står bag en smykkesucces...,2010-10-26 09:07:54,"[3721751, 3599297]",article_default,https://ekstrabladet.dk/forbrug/forbrug/articl...,...,[],"[Erhverv, Privat virksomhed, Kendt]",457,[491],forbrug,,,,0.9033,Negative
1728,4840847,Her kan du føle dig som en royal,På Hvedholm Slot i Faaborg kan du leve prinses...,2023-06-29 07:13:45,False,"Den brede, grønne allé til Hvedholm Slot er så...",2014-03-16 09:23:23,"[4668281, 4668280, 4668279]",article_default,https://ekstrabladet.dk/ferie/ferie-i-danmark/...,...,[],"[Underholdning, Kultur, Museum og seværdighed]",539,[540],ferie,,,,0.7162,Neutral
3012,7082307,Skab det perfekte kærlighedsliv,"Vi har flere skilsmisser, singler og utroskab ...",2023-06-29 07:34:54,True,"Vi lever i en guldalder, når det kommer til vo...",2018-03-26 15:19:10,[7082340],article_default,https://ekstrabladet.dk/sex_og_samliv/skab-det...,...,[],"[Livsstil, Partnerskab]",565,[],sex_og_samliv,,,,0.796,Neutral
5653,8922204,'Sygeplejerske nød at dræbe',Sygeplejersken William Davis fra Texas er fund...,2023-06-29 06:38:20,False,Den 37-årige sygeplejerske William Davis er bl...,2021-10-21 15:58:34,[8922246],article_default,https://ekstrabladet.dk/krimi/sygeplejerske-no...,...,"[ORG, ORG, PER, LOC, LOC, PER]","[Kriminalitet, Personfarlig kriminalitet, Sund...",140,[],krimi,,,,0.9942,Negative
6706,9236448,"Sofie, 27 år og fra Ishøj",,2023-06-29 06:41:28,True,Sofie 6 maj,2022-05-05 20:00:00,[9236444],article_page_nine_girl,https://ekstrabladet.dk/side9/sofie/9236448,...,[],"[Kendt, Livsstil]",572,[],side9,,,,0.6653,Neutral
7375,9329631,Uhyggelige detaljer: 18 blodige minutter,Riffelmanden bevægede sig rundt på samtlige et...,2023-06-29 06:42:30,True,black\nSe det ske: Den formodet gerningsmand b...,2022-07-06 13:30:36,"[9329712, 9329759, 9326958, 9328690, 9327561, ...",article_standard_feature,https://ekstrabladet.dk/krimi/uhyggelige-detal...,...,"[LOC, ORG, LOC, ORG, LOC, LOC, PER, LOC, LOC, ...","[Kriminalitet, Personfarlig kriminalitet, Kata...",140,[],krimi,,,,0.9808,Negative
7431,9337013,Ukrainsk liga starter trods krig,"Sportsminister har annonceret, at landets fodb...",2023-06-29 06:42:34,False,Ruslands fortsatte invasion kommer ikke til at...,2022-07-11 19:18:39,[9337026],article_default,https://ekstrabladet.dk/sport/fodbold/ukrainsk...,...,"[ORG, PER, PER, ORG, ORG, ORG, ORG, EVENT, ORG...","[Sport, Fodbold, Konflikt og krig, Væbnet konf...",142,[196],sport,,,,0.8866,Neutral
7811,9383227,"Dronebilleder fra Sallingsund Færgekro, 15/6-22",,2023-06-29 06:43:17,False,,2022-06-15 13:28:20,,article_video_standalone,https://ekstrabladet.dk/nyheder/article9383227...,...,[LOC],"[Transportmiddel, Større transportmiddel]",118,[],nyheder,,,,0.7427,Neutral
16196,9750389,Politiet beder om hjælp: Var det dig?,Nordjyllands Politi vil gerne høre fra flere b...,2023-06-29 06:48:34,False,Kørte du ad motorvej E45 i nordlig retning mel...,2023-05-04 20:29:19,[9745227],article_default,https://ekstrabladet.dk/krimi/politiet-beder-o...,...,"[LOC, LOC, LOC, ORG]","[Transportmiddel, Bil, Katastrofe, Mindre ulykke]",140,[],krimi,698589.0,182347.0,6807213.0,0.8405,Negative
18004,9767751,Holger Runes showkamp i Royal Arena er aflyst,Nick Kyrgios er skadet og kan derfor ikke spil...,2023-06-29 06:48:51,False,"Øv, øv, øv.\nDer var lagt op til en kæmpe tenn...",2023-05-16 10:05:43,"[9764366, 9748289]",article_default,https://ekstrabladet.dk/sport/anden_sport/tenn...,...,"[LOC, MISC, LOC, PER, PER, ORG]","[Kendt, Begivenhed, Sport, Sundhed, Sygdom og ...",142,"[327, 349]",sport,1261985.0,199814.0,6064868.0,0.6054,Negative


### Model Save

This section saves the trained model's state dictionary to a specified directory.

In [18]:
# Specify the relative directory path
relative_directory = "Saved_Model/"

# Create the full directory path
directory_path = os.path.join(relative_directory)

# Create the directory if it does not exist
os.makedirs(directory_path, exist_ok=True)

# Save the state dictionary of the model to the specified directory
model_save_path = os.path.join(directory_path, "EBNERD_collaborative_filtering_model.pth")
torch.save(ebnerd_model.state_dict(), model_save_path)

### Model Load

Here, we load the saved model from the directory.

In [19]:
# Load the state dictionary from the specified directory
loaded_model = NewsMF(num_users=len(ind2user)+1, num_items=len(ind2article)+1)

# Use a relative path when loading the model
model_load_path = os.path.join("Saved_Model", "EBNERD_collaborative_filtering_model.pth")
loaded_model.load_state_dict(torch.load(model_load_path))

<All keys matched successfully>

### Loaded Model Single Prediction

Similar to the prediction test, but this time, it involves loading the saved model and making predictions for a specific user.

In [20]:
# Specify the user ID for prediction
USER_ID = 1234
PREDICTION_COUNT = 10

# Create item_ids and user ids list
article_id = list(ind2article.keys())
userIdx = [USER_ID] * len(article_id)

# Convert lists to PyTorch tensors
user_tensor = torch.IntTensor(userIdx)
item_tensor = torch.IntTensor(article_id)

# Forward pass to get predictions
predictions = loaded_model.forward(user_tensor, item_tensor)

# Select top 10 indices
top_indices = torch.topk(predictions.flatten(), PREDICTION_COUNT).indices

# Get corresponding item IDs
top_item_ids = [ind2article[ix.item()] for ix in top_indices]

# Filter for top 10 suggested items
recommended_items = news[news["article_id"].isin(top_item_ids)]

# Display the recommended items
print(recommended_items)

       article_id                                              title  \
813       4248342                       Saida, 23 år og fra Brønshøj   
1240      4490369    Forskere undres: Hvorfor ta'r hun den i munden?   
1449      4693144     Peter: Amalie får mange penge for Luksusfælden   
11793     9678305  Brand og drabsforsøg: Sådan kendte sigtede kvi...   
13244     9719885   Auktionshus skylder Flemming tusindvis af kroner   
13731     9727696  Lejlighed på Allékredsen solgt efter voldsom p...   
14923     9739785  Stor prisstigning: Lejlighed solgt i Halsskovgade   
15101     9741379         Massivt til stede: Overfald på Christiania   
15960     9748683  Politiet anholder mistænkt i Atlanta-skyderi e...   
18455     9773364  Værste oversvømmelser i 100 år har ramt Nordit...   

                                                subtitle  last_modified_time  \
813                                                      2023-06-29 06:45:58   
1240   Ifølge videnskaben dyrker kvinder ikke o

### Tensorboard

This section loads and starts TensorBoard to visualize training metrics.

In [21]:
# Load the extension and start TensorBoard
%load_ext tensorboard
%tensorboard --logdir tb_logs

### Utilities

This section contains various utility functions and commands. It includes converting the notebook to a Python script, getting a random user ID, and validating index mappings.

### Convert to Python Script (not needed right now but keep as utility)

In [22]:
!python -m nbconvert --to script EBNERD_Notebook.ipynb

[NbConvertApp] Converting notebook EBNERD_Notebook.ipynb to script
[NbConvertApp] Writing 14865 bytes to EBNERD_Notebook.py


### Get random user id

In [23]:
random_user_index = np.random.randint(0, len(behaviors))
random_user_id = behaviors.iloc[random_user_index]['user_id']

print(f"Randomly selected user ID: {random_user_id}")

Randomly selected user ID: 1153348


### Validate conversion consistency

In [24]:
def validate_mapping_consistency(user2ind, ind2user, article2ind, ind2article):
    # Choose a random user and article ID for validation
    random_user_id = np.random.choice(list(user2ind.keys()))
    random_article_id = np.random.choice(list(article2ind.keys()))
    print(f"Randomly selected user ID: {random_user_id}")
    print(f"Randomly selected article ID: {random_article_id}")

    # Validate user mapping
    user_index = user2ind.get(random_user_id)
    retrieved_user_id = ind2user.get(user_index)
    print(f"User index: {user_index}")
    print(f"Retrieved user ID: {retrieved_user_id}")
    
    user_mapping_consistent = random_user_id == retrieved_user_id

    # Validate article mapping
    article_index = article2ind.get(random_article_id)
    retrieved_article_id = ind2article.get(article_index)
    print(f"Article index: {article_index}")
    print(f"Retrieved article ID: {retrieved_article_id}")

    article_mapping_consistent = random_article_id == retrieved_article_id

    return user_mapping_consistent, article_mapping_consistent

# Perform validation
user_consistency, article_consistency = validate_mapping_consistency(user2ind, ind2user, article2ind, ind2article)

# Print results
print(f"User Mapping Consistency: {user_consistency}")
print(f"Article Mapping Consistency: {article_consistency}")

Randomly selected user ID: 1226005
Randomly selected article ID: 9725586
User index: 13623
Retrieved user ID: 1226005
Article index: 13588
Retrieved article ID: 9725586
User Mapping Consistency: True
Article Mapping Consistency: True
