### Shell

In [74]:
%pip install pytorch_lightning
%pip install torchmetrics
%pip install --upgrade tensorboard
%pip install pandas
%pip install nbconvert

Note: you may need to restart the kernel to use updated packages.



Note: you may need to restart the kernel to use updated packages.


### Imports

In [75]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import torch.nn as nn
import pytorch_lightning as pl
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import torch
import os
from collections import Counter

### Import for TensorBoard

In [76]:
from pytorch_lightning.loggers import TensorBoardLogger

logger = TensorBoardLogger("tb_logs", name="my_model")

### Data Preprocessing

In [77]:
# Load EBNeRD behaviors dataset for both train and validation
train_behaviour = pd.read_parquet("./ebnerd_small/train/behaviors.parquet")
valid_behaviour = pd.read_parquet("./ebnerd_small/validation/behaviors.parquet")
behaviors = pd.concat([train_behaviour, valid_behaviour], ignore_index=True)

behaviors.head()

Unnamed: 0,impression_id,article_id,impression_time,read_time,scroll_percentage,device_type,article_ids_inview,article_ids_clicked,user_id,is_sso_user,gender,postcode,age,is_subscriber,session_id,next_read_time,next_scroll_percentage
0,149474,,2023-05-24 07:47:53,13.0,,2,"[9778623, 9778682, 9778669, 9778657, 9778736, ...",[9778657],139836,False,,,,False,759,7.0,22.0
1,150528,,2023-05-24 07:33:25,25.0,,2,"[9778718, 9778728, 9778745, 9778669, 9778657, ...",[9778623],143471,False,,,,False,1240,287.0,100.0
2,153068,9778682.0,2023-05-24 07:09:04,78.0,100.0,1,"[9778657, 9778669, 9772866, 9776259, 9756397, ...",[9778669],151570,False,,,,False,1976,45.0,100.0
3,153070,9777492.0,2023-05-24 07:13:14,26.0,100.0,1,"[9020783, 9778444, 9525589, 7213923, 9777397, ...",[9778628],151570,False,,,,False,1976,4.0,18.0
4,153071,9778623.0,2023-05-24 07:11:08,125.0,100.0,1,"[9777492, 9774568, 9565836, 9335113, 9771223, ...",[9777492],151570,False,,,,False,1976,26.0,100.0


In [78]:
# Load EBNeRD history dataset for both train and validation
train_history = pd.read_parquet("./ebnerd_small/train/history.parquet")
valid_history = pd.read_parquet("./ebnerd_small/validation/history.parquet")
history = pd.concat([train_history, valid_history], ignore_index=True)

history.head()

Unnamed: 0,user_id,impression_time_fixed,scroll_percentage_fixed,article_id_fixed,read_time_fixed
0,13538,"[2023-04-27T10:17:43.000000, 2023-04-27T10:18:...","[100.0, 35.0, 100.0, 24.0, 100.0, 23.0, 100.0,...","[9738663, 9738569, 9738663, 9738490, 9738663, ...","[17.0, 12.0, 4.0, 5.0, 4.0, 9.0, 5.0, 46.0, 11..."
1,14241,"[2023-04-27T09:40:18.000000, 2023-04-27T09:40:...","[100.0, 46.0, 100.0, 70.0, 100.0, 100.0, 100.0...","[9738557, 9738528, 9738533, 9738684, 9739035, ...","[8.0, 9.0, 28.0, 17.0, 91.0, 21.0, 14.0, 27.0,..."
2,20396,"[2023-04-27T12:30:44.000000, 2023-04-27T12:31:...","[100.0, 59.0, nan, nan, 100.0, 100.0, nan, nan...","[9738760, 9738355, 9738355, 9739864, 9741788, ...","[49.0, 34.0, 0.0, 60.0, 180.0, 49.0, 0.0, 0.0,..."
3,34912,"[2023-04-29T07:12:49.000000, 2023-04-29T13:01:...","[100.0, 35.0, 44.0, 31.0, 100.0, 100.0, 100.0,...","[9741802, 9741804, 9741803, 9740087, 9742039, ...","[153.0, 7.0, 5.0, 6.0, 44.0, 44.0, 108.0, 10.0..."
4,37953,"[2023-04-27T19:17:10.000000, 2023-04-27T19:17:...","[14.0, 28.0, 29.0, nan, 36.0, 33.0, 50.0, 100....","[9739205, 9739202, 9737084, 9739274, 9739358, ...","[4.0, 16.0, 4.0, 0.0, 5.0, 5.0, 25.0, 48.0, 6...."


In [79]:
# Load EBNeRD news dataset
news = pd.read_parquet("./ebnerd_small/articles.parquet")

news.head()

Unnamed: 0,article_id,title,subtitle,last_modified_time,premium,body,published_time,image_ids,article_type,url,...,entity_groups,topics,category,subcategory,category_str,total_inviews,total_pageviews,total_read_time,sentiment_score,sentiment_label
0,3001353,Natascha var ikke den første,"Politiet frygter nu, at Nataschas bortfører ha...",2023-06-29 06:20:33,False,Sagen om den østriske Natascha og hendes bortf...,2006-08-31 08:06:45,[3150850],article_default,https://ekstrabladet.dk/krimi/article3001353.ece,...,[],"[Kriminalitet, Personfarlig kriminalitet]",140,[],krimi,,,,0.9955,Negative
1,3003065,Kun Star Wars tjente mere,Biografgængerne strømmer ind for at se 'Da Vin...,2023-06-29 06:20:35,False,Vatikanet har opfordret til at boykotte filmen...,2006-05-21 16:57:00,[3006712],article_default,https://ekstrabladet.dk/underholdning/filmogtv...,...,[],"[Underholdning, Film og tv, Økonomi]",414,"[433, 434]",underholdning,,,,0.846,Positive
2,3012771,Morten Bruun fyret i SønderjyskE,FODBOLD: Morten Bruun fyret med øjeblikkelig v...,2023-06-29 06:20:39,False,Kemien mellem spillerne i Superligaklubben Søn...,2006-05-01 14:28:40,[3177953],article_default,https://ekstrabladet.dk/sport/fodbold/dansk_fo...,...,[],"[Erhverv, Kendt, Sport, Fodbold, Ansættelsesfo...",142,"[196, 199]",sport,,,,0.8241,Negative
3,3023463,Luderne flytter på landet,I landets tyndest befolkede områder skyder bor...,2023-06-29 06:20:43,False,Det frække erhverv rykker på landet. I den tyn...,2007-03-24 08:27:59,[3184029],article_default,https://ekstrabladet.dk/nyheder/samfund/articl...,...,[],"[Livsstil, Erotik]",118,[133],nyheder,,,,0.7053,Neutral
4,3032577,Cybersex: Hvornår er man utro?,En flirtende sms til den flotte fyr i regnskab...,2023-06-29 06:20:46,False,"De fleste af os mener, at et tungekys er utros...",2007-01-18 10:30:37,[3030463],article_default,https://ekstrabladet.dk/sex_og_samliv/article3...,...,[],"[Livsstil, Partnerskab]",565,[],sex_og_samliv,,,,0.9307,Neutral


### Join history and behaviour tables

In [80]:
# Left join on 'user_id'
behaviour_history_merged= pd.merge(behaviors, history, on='user_id', how='left')

# Display the merged data
behaviour_history_merged.head()

Unnamed: 0,impression_id,article_id,impression_time,read_time,scroll_percentage,device_type,article_ids_inview,article_ids_clicked,user_id,is_sso_user,...,postcode,age,is_subscriber,session_id,next_read_time,next_scroll_percentage,impression_time_fixed,scroll_percentage_fixed,article_id_fixed,read_time_fixed
0,149474,,2023-05-24 07:47:53,13.0,,2,"[9778623, 9778682, 9778669, 9778657, 9778736, ...",[9778657],139836,False,...,,,False,759,7.0,22.0,"[2023-05-03T19:04:15.000000, 2023-05-03T19:05:...","[100.0, 89.0, 27.0, 33.0, 100.0, 75.0, 39.0, 2...","[9745590, 9748574, 9748432, 9748080, 9750687, ...","[60.0, 11.0, 1.0, 15.0, 37.0, 15.0, 4.0, 8.0, ..."
1,150528,,2023-05-24 07:33:25,25.0,,2,"[9778718, 9778728, 9778745, 9778669, 9778657, ...",[9778623],143471,False,...,,,False,1240,287.0,100.0,"[2023-04-27T08:05:09.000000, 2023-04-27T10:05:...","[21.0, 100.0, 34.0, 85.0, 92.0, 75.0, 52.0, 66...","[9737881, 9738659, 9738569, 9738490, 9738528, ...","[7.0, 24.0, 28.0, 65.0, 16.0, 41.0, 59.0, 24.0..."
2,150528,,2023-05-24 07:33:25,25.0,,2,"[9778718, 9778728, 9778745, 9778669, 9778657, ...",[9778623],143471,False,...,,,False,1240,287.0,100.0,"[2023-05-04T07:10:24.000000, 2023-05-04T07:10:...","[77.0, 80.0, 28.0, 11.0, 94.0, 54.0, 74.0, 30....","[9748977, 9748976, 9747490, 9745484, 9747959, ...","[3.0, 29.0, 2.0, 3.0, 16.0, 30.0, 4.0, 3.0, 4...."
3,153068,9778682.0,2023-05-24 07:09:04,78.0,100.0,1,"[9778657, 9778669, 9772866, 9776259, 9756397, ...",[9778669],151570,False,...,,,False,1976,45.0,100.0,"[2023-04-27T14:07:16.000000, 2023-04-27T14:08:...","[100.0, nan, 100.0, 14.0, 100.0, 100.0, 100.0,...","[9738303, 9738993, 9738303, 9738902, 9738303, ...","[59.0, 1.0, 2.0, 8.0, 4.0, 28.0, 51.0, 7.0, 7...."
4,153068,9778682.0,2023-05-24 07:09:04,78.0,100.0,1,"[9778657, 9778669, 9772866, 9776259, 9756397, ...",[9778669],151570,False,...,,,False,1976,45.0,100.0,"[2023-05-04T20:50:44.000000, 2023-05-04T20:51:...","[100.0, nan, 100.0, 100.0, 100.0, 18.0, 100.0,...","[9750389, 9749756, 9750389, 9750318, 9749582, ...","[27.0, 8.0, 10.0, 24.0, 13.0, 7.0, 5.0, 34.0, ..."


### Generate binary labels

In [81]:
# Function to create binary labels column
def create_binary_labels_column(df):
    # Define the column names
    clicked_col = "article_ids_clicked"
    inview_col = "article_ids_inview"
    labels_col = "labels"

    # Create a new column with binary labels
    df[labels_col] = df.apply(lambda row: [1 if article_id in row[clicked_col] else 0 for article_id in row[inview_col]], axis=1)

    # Shuffle the data
    df = df.sample(frac=1, random_state=123)

    # Add a column with the length of the labels list
    df[labels_col + "_len"] = df[labels_col].apply(len)

    return df

# Apply the function to your merged dataset
behaviour_history_merged = create_binary_labels_column(behaviour_history_merged)

# Display the updated dataset
behaviour_history_merged.head()

Unnamed: 0,impression_id,article_id,impression_time,read_time,scroll_percentage,device_type,article_ids_inview,article_ids_clicked,user_id,is_sso_user,...,is_subscriber,session_id,next_read_time,next_scroll_percentage,impression_time_fixed,scroll_percentage_fixed,article_id_fixed,read_time_fixed,labels,labels_len
578909,182440184,,2023-05-28 09:02:31,7.0,,2,"[9784044, 9784679, 9784058, 9142564, 9782809, ...",[9784591],437088,False,...,False,1626986,84.0,100.0,"[2023-05-20T21:32:46.000000, 2023-05-20T21:32:...","[36.0, 100.0, 20.0, 100.0, 100.0, 100.0, 100.0...","[9774079, 9774074, 9772453, 9774120, 9773638, ...","[6.0, 39.0, 7.0, 39.0, 71.0, 8.0, 99.0, 16.0, ...","[0, 0, 0, 0, 0, 1, 0, 0, 0, 0]",10
200232,263268931,,2023-05-22 05:21:33,19.0,,1,"[9754160, 9775430, 9774595, 9775402, 7460419, ...",[9775402],1327305,False,...,False,1519807,4.0,17.0,"[2023-05-08T05:34:30.000000, 2023-05-10T07:40:...","[16.0, 48.0, 26.0, 52.0, 100.0, 100.0, 100.0, ...","[9753521, 9757183, 9759154, 9759355, 9759418, ...","[3.0, 8.0, 6.0, 22.0, 32.0, 95.0, 7.0, 87.0, 3...","[0, 0, 0, 1, 0, 0, 0]",7
194891,258249876,,2023-05-21 15:40:24,46.0,,1,"[9774598, 9770028, 9774404, 9774708, 9746360, ...",[9774015],720141,False,...,False,375748,28.0,40.0,"[2023-05-04T06:50:57.000000, 2023-05-04T06:51:...","[56.0, 13.0, 26.0, 70.0, 28.0, 25.0, nan, 26.0...","[9748977, 9745484, 9747490, 9748918, 9748942, ...","[20.0, 27.0, 8.0, 8.0, 12.0, 20.0, 22.0, 17.0,...","[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...",32
503093,87132561,,2023-05-26 17:42:02,25.0,,1,"[9782616, 9780651, 9783043, 9782495, 9783056, ...",[9783043],1447383,False,...,False,1162569,21.0,37.0,"[2023-04-27T12:47:48.000000, 2023-04-27T12:48:...","[21.0, 1.0, 70.0, 100.0, 83.0, 79.0, 100.0, 19...","[9733845, 9733713, 9738684, 9738533, 9737521, ...","[21.0, 9.0, 32.0, 115.0, 101.0, 16.0, 22.0, 13...","[0, 0, 1, 0, 0, 0, 0]",7
858950,547070818,,2023-05-29 05:00:09,8.0,,2,"[9785992, 9785835, 9786111, 9785017, 9785986, ...",[9786111],885672,False,...,False,1375847,87.0,100.0,"[2023-04-27T07:30:17.000000, 2023-04-27T09:37:...","[100.0, 94.0, nan, 23.0, 69.0, 15.0, 47.0, 13....","[9738334, 9738569, 9738364, 9738490, 9738760, ...","[1.0, 22.0, 3.0, 4.0, 1276.0, 2.0, 10.0, 4.0, ...","[0, 0, 1, 0, 0, 0]",6


In [82]:
# Build index of items    
ind2article = {idx + 1: itemid for idx, itemid in enumerate(news['article_id'].values)}
article2ind = {itemid: idx for idx, itemid in ind2article.items()}

# Build index of users
unique_userIds = behaviour_history_merged['user_id'].unique()
ind2user = {idx + 1: itemid for idx, itemid in enumerate(unique_userIds)}
user2ind = {itemid: idx for idx, itemid in ind2user.items()}

behaviour_history_merged['userIdx'] = behaviour_history_merged['user_id'].map(lambda x: user2ind.get(x, 0))
behaviour_history_merged['articleIdx'] = behaviour_history_merged['article_id'].map(lambda x: article2ind.get(x, 0))
print(f"We have {len(article2ind)} unique articles in the dataset")
print(f"We have {len(user2ind)} unique users in the dataset")

We have 20738 unique articles in the dataset
We have 18827 unique users in the dataset


In [83]:
# Split data into train and validation
test_time_threshold = behaviour_history_merged['impression_time'].quantile(0.9)
train_data = behaviour_history_merged[behaviour_history_merged['impression_time'] < test_time_threshold]
valid_data = behaviour_history_merged[behaviour_history_merged['impression_time'] >= test_time_threshold]

In [84]:
class EBNeRDMindDataset(Dataset):
    def __init__(self, df):
        self.data = {
            'userIdx': torch.tensor(df.userIdx.values),
            'articleIdx': torch.tensor(df.articleIdx.values),
            'labels': torch.tensor([item for sublist in df.labels for item in sublist], dtype=torch.float32),
        }

    def __len__(self):
        return len(self.data['userIdx'])

    def __getitem__(self, idx):
        return {
            'userIdx': self.data['userIdx'][idx],
            'articleIdx': self.data['articleIdx'][idx],
            'click': self.data['labels'][idx].long(),
            'noclick': 1 - self.data['labels'][idx].long(),
        }

In [85]:
# Build datasets and dataloaders for train and validation dataframes
bs = 1024
ds_train = EBNeRDMindDataset(train_data)
train_loader = DataLoader(ds_train, batch_size=bs, shuffle=True)
ds_valid = EBNeRDMindDataset(valid_data)
valid_loader = DataLoader(ds_valid, batch_size=bs, shuffle=False)


### Model

In [86]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import pytorch_lightning as pl
from torch.utils.data import Dataset, DataLoader
from torchmetrics.classification import BinaryF1Score, BinaryAUROC

class NewsMF(pl.LightningModule):
    def __init__(self, num_users, num_items, dim=10):
        super().__init__()
        self.dim = dim
        self.useremb = nn.Embedding(num_embeddings=num_users, embedding_dim=dim)
        self.itememb = nn.Embedding(num_embeddings=num_items, embedding_dim=dim)
        
        # BinaryF1Score metric
        self.f1_metric = BinaryF1Score()
        self.train_step_f1_outputs = []
        self.validation_step_f1_outputs = []

        # BinaryAUROC metric
        self.binary_auroc = BinaryAUROC()
        self.train_step_auroc_outputs = []
        self.validation_step_auroc_outputs = []


    def forward(self, user, item):
        batch_size = user.size(0)
        uservec = self.useremb(user)
        itemvec = self.itememb(item)

        score = (uservec * itemvec).sum(-1).unsqueeze(-1)

        return score

    def training_step(self, batch, batch_idx):
        batch_size = batch['userIdx'].size(0)

        # Compute loss as cross entropy (categorical distribution between the clicked and the no-clicked item)
        score_click = self.forward(batch['userIdx'], batch['click'])
        score_noclick = self.forward(batch['userIdx'], batch['noclick'])

        loss = F.cross_entropy(input=torch.cat((score_click, score_noclick), dim=1),
                               target=torch.zeros(batch_size, device=score_click.device).long())
        
        # Compute F1-score
        f1_click = self.f1_metric(score_click.squeeze(), torch.ones_like(batch['click']))
        f1_noclick = self.f1_metric(score_noclick.squeeze(), torch.zeros_like(batch['noclick']))

        # Average F1-scores
        f1 = (f1_click + f1_noclick) / 2.0

        self.train_step_f1_outputs.append(f1)

        # Calculate Binary AUROC
        binary_auroc_score = self.binary_auroc(torch.cat((score_click, score_noclick), dim=1),
                                                torch.cat((torch.ones_like(batch['click']),
                                                           torch.zeros_like(batch['noclick'])))
                                               )
        
        self.train_step_auroc_outputs.append(binary_auroc_score)

        # Log metrics to TensorBoard
        self.log('train_loss', loss)
        self.log('train_f1', f1)
        self.log('train_auroc', binary_auroc_score)

        return {'loss': loss, 'f1': f1, 'auroc': binary_auroc_score}
    
    def on_train_epoch_end(self):
        epoch_average_f1 = torch.stack(self.train_step_f1_outputs).mean()
        print(f'Epoch {self.current_epoch}: Training F1 Score: {epoch_average_f1.item()}')
        self.log("train_epoch_average_f1", epoch_average_f1)
        self.train_step_f1_outputs.clear()  # free memory

        epoch_average_auroc = torch.stack(self.train_step_auroc_outputs).mean()
        print(f'Epoch {self.current_epoch}: Training AUROC Score: {epoch_average_auroc.item()}')
        self.log("train_epoch_average_auroc", epoch_average_auroc)
        self.validation_step_auroc_outputs.clear()  # free memory

    def validation_step(self, batch, batch_idx):
        # Compute loss as cross-entropy (categorical distribution between clicked and non-clicked items)
        score_click = self.forward(batch['userIdx'], batch['click'])
        score_noclick = self.forward(batch['userIdx'], batch['noclick'])

        loss = F.cross_entropy(input=torch.cat((score_click, score_noclick), dim=1),
                            target=torch.zeros(batch['userIdx'].size(0), device=score_click.device).long())
        
        # F1 Score
        f1_click = self.f1_metric(score_click.squeeze(), torch.ones_like(batch['click']))
        f1_noclick = self.f1_metric(score_noclick.squeeze(), torch.zeros_like(batch['noclick']))
        f1 = (f1_click + f1_noclick) / 2.0 # Average F1-scores

        self.validation_step_f1_outputs.append(f1)

        # Calculate Binary AUROC
        binary_auroc_score = self.binary_auroc(torch.cat((score_click, score_noclick), dim=1),
                                                torch.cat((torch.ones_like(batch['click']),
                                                           torch.zeros_like(batch['noclick'])))
                                               )
        
        self.validation_step_auroc_outputs.append(binary_auroc_score)

        # Log metrics to TensorBoard
        self.log('val_loss', loss)
        self.log('val_f1', f1)
        self.log('val_auroc', binary_auroc_score)
                
        return {'loss': loss, 'f1': f1, 'auroc': binary_auroc_score}

    def on_validation_epoch_end(self):
        epoch_average_f1 = torch.stack(self.validation_step_f1_outputs).mean()
        print(f'Epoch {self.current_epoch}: Validation F1 Score: {epoch_average_f1.item()}')
        self.log("validation_epoch_average_f1", epoch_average_f1)
        self.validation_step_f1_outputs.clear()  # free memory

        epoch_average_auroc = torch.stack(self.validation_step_auroc_outputs).mean()
        print(f'Epoch {self.current_epoch}: Validation AUROC Score: {epoch_average_auroc.item()}')
        self.log("validation_epoch_average_auroc", epoch_average_auroc)
        self.validation_step_auroc_outputs.clear()  # free memory

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=1e-3)
        return optimizer

In [87]:
ebnerd_model = NewsMF(num_users=len(user2ind) + 1, num_items=len(article2ind) + 1)

In [88]:
# Instantiate the trainer
trainer = pl.Trainer(max_epochs=10, logger=logger)

# Train the model
trainer.fit(model=ebnerd_model, train_dataloaders=train_loader, val_dataloaders=valid_loader)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name         | Type          | Params
-----------------------------------------------
0 | useremb      | Embedding     | 188 K 
1 | itememb      | Embedding     | 207 K 
2 | f1_metric    | BinaryF1Score | 0     
3 | binary_auroc | BinaryAUROC   | 0     
-----------------------------------------------
395 K     Trainable params
0         Non-trainable params
395 K     Total params
1.583     Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

d:\Anaconda\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=3` in the `DataLoader` to improve performance.


Epoch 0: Validation F1 Score: 0.34253060817718506
Epoch 0: Validation AUROC Score: 0.4964785575866699


d:\Anaconda\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=3` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 0: Validation F1 Score: 0.3386707007884979
Epoch 0: Validation AUROC Score: 0.49540749192237854
Epoch 0: Training F1 Score: 0.34014883637428284
Epoch 0: Training AUROC Score: 0.5000050067901611


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 1: Validation F1 Score: 0.3346835970878601
Epoch 1: Validation AUROC Score: 0.49542236328125
Epoch 1: Training F1 Score: 0.3334472179412842
Epoch 1: Training AUROC Score: 0.49988874793052673


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 2: Validation F1 Score: 0.34897616505622864
Epoch 2: Validation AUROC Score: 0.4957311451435089
Epoch 2: Training F1 Score: 0.3414740562438965
Epoch 2: Training AUROC Score: 0.49983513355255127


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 3: Validation F1 Score: 0.356044739484787
Epoch 3: Validation AUROC Score: 0.4958454966545105
Epoch 3: Training F1 Score: 0.35279813408851624
Epoch 3: Training AUROC Score: 0.49969345331192017


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 4: Validation F1 Score: 0.35913604497909546
Epoch 4: Validation AUROC Score: 0.4958520531654358
Epoch 4: Training F1 Score: 0.3578338623046875
Epoch 4: Training AUROC Score: 0.4994775950908661


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 5: Validation F1 Score: 0.35965415835380554
Epoch 5: Validation AUROC Score: 0.49588122963905334
Epoch 5: Training F1 Score: 0.3598003685474396
Epoch 5: Training AUROC Score: 0.4996633529663086


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 6: Validation F1 Score: 0.36130979657173157
Epoch 6: Validation AUROC Score: 0.4958917498588562
Epoch 6: Training F1 Score: 0.3605934977531433
Epoch 6: Training AUROC Score: 0.4997604787349701


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 7: Validation F1 Score: 0.3612534701824188
Epoch 7: Validation AUROC Score: 0.49588441848754883
Epoch 7: Training F1 Score: 0.36125707626342773
Epoch 7: Training AUROC Score: 0.4997331202030182


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 8: Validation F1 Score: 0.36181753873825073
Epoch 8: Validation AUROC Score: 0.4959352910518646
Epoch 8: Training F1 Score: 0.3612767457962036
Epoch 8: Training AUROC Score: 0.49971356987953186


Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=10` reached.


Epoch 9: Validation F1 Score: 0.36218899488449097
Epoch 9: Validation AUROC Score: 0.49594029784202576
Epoch 9: Training F1 Score: 0.3612709641456604
Epoch 9: Training AUROC Score: 0.49972665309906006


In [89]:
logs = trainer.logged_metrics

# Print or inspect the logs
print("Training and validation logs:", logs)

Training and validation logs: {'train_loss': tensor(0.2677), 'train_f1': tensor(0.3788), 'train_auroc': tensor(0.5056), 'val_loss': tensor(0.3071), 'val_f1': tensor(0.3622), 'val_auroc': tensor(0.4961), 'validation_epoch_average_f1': tensor(0.3622), 'validation_epoch_average_auroc': tensor(0.4959), 'train_epoch_average_f1': tensor(0.3613), 'train_epoch_average_auroc': tensor(0.4997)}


### Prediction test

In [90]:
USER_ID = 2350 # Random user id
# Create item_ids and user ids list
item_id = list(ind2article.keys())
userIdx =  [USER_ID]*len(item_id)

preditions = ebnerd_model.forward(torch.IntTensor(userIdx), torch.IntTensor(item_id))

# Select top 10 argmax
top_index = torch.topk(preditions.flatten(), 10).indices

# Filter for top 10 suggested items
filters = [ind2article[ix.item()] for ix in top_index]
news[news["article_id"].isin(filters)]

Unnamed: 0,article_id,title,subtitle,last_modified_time,premium,body,published_time,image_ids,article_type,url,...,entity_groups,topics,category,subcategory,category_str,total_inviews,total_pageviews,total_read_time,sentiment_score,sentiment_label
2371,6226047,Danske hitbands: Har kendt hinanden siden barn...,De er vokset op sammen og kunne sagtens have v...,2023-06-29 07:26:50,False,"De har samme smag i musik, de samme inspiratio...",2016-08-06 15:07:19,[5786877],article_default,https://ekstrabladet.dk/musik/dkmusiknyt/dansk...,...,[],"[Kendt, Livsstil, Underholdning, Musik og lyd]",498,[500],musik,,,,0.9513,Positive
3306,7374998,Tysk sygeplejerske tilstår 100 drab,"Ja. Det, som jeg har tilstået, fandt sted, sig...",2023-06-29 07:37:32,False,Den tyske sygeplejerske Niels Högel tilstår i ...,2018-10-30 11:22:31,,article_default,https://ekstrabladet.dk/krimi/tysk-sygeplejers...,...,[],"[Kriminalitet, Personfarlig kriminalitet, Sund...",140,[],krimi,,,,0.9839,Negative
3489,7532217,Maria Hirse dømt: - Jeg er meget ked af det,Kendissen anker betinget fængselsdom for hæler...,2023-06-29 06:24:12,False,"Maria Hirse er dømt i sagen mod hende, en 46-å...",2019-02-26 13:41:13,[7532091],article_default,https://ekstrabladet.dk/underholdning/dkkendte...,...,[],[Kriminalitet],414,[425],underholdning,,,,0.993,Negative
4356,8303640,Efterlyst i sag om hjemmerøveri fundet død,I følge Ekstra Bladets oplysninger er efterlys...,2023-06-29 06:32:27,False,"En mand, der har været efterlyst af Vestegnens...",2020-09-25 14:32:13,,article_default,https://ekstrabladet.dk/krimi/efterlyst-i-sag-...,...,[],"[Kriminalitet, Personfarlig kriminalitet]",140,[],krimi,,,,0.9942,Negative
8606,9463929,Peter Madsen i nyt retsdrama,Den drabsdømte ubådsbygger var sin egen advoka...,2023-06-29 06:44:28,True,RETTEN I NYKØBING FALSTER: En retssag med den ...,2022-10-18 18:03:06,"[9474942, 8335247, 8370115, 8369668, 8335707]",article_default,https://ekstrabladet.dk/krimi/peter-madsen-i-n...,...,"[PER, ORG, ORG, ORG, ORG, ORG, ORG, PER, ORG]","[Kriminalitet, Personfarlig kriminalitet]",140,[],krimi,,,,0.9718,Negative
11125,9649690,Alvorlig trafikulykke: To dræbte,To lastbiler kørte fredag eftermiddag sammen i...,2023-06-29 06:47:08,False,Fredag eftermiddag skete der en alvorlig ulykk...,2023-02-24 14:28:29,"[9649725, 9649725]",article_default,https://ekstrabladet.dk/nyheder/samfund/alvorl...,...,"[LOC, ORG, LOC, PER, PER, LOC, ORG, ORG, LOC]","[Transportmiddel, Bil, Katastrofe, Mindre ulykke]",118,[133],nyheder,997674.0,343376.0,14263307.0,0.9969,Negative
12117,9693692,1500 evakueret efter naturbrand,En naturbrand i Valencia har tvunget 1500 bebo...,2023-06-29 06:47:45,False,Næsten 4000 hektar er brændt ned i den spanske...,2023-03-26 08:39:28,"[9693715, 9693715]",article_default,https://ekstrabladet.dk/nyheder/samfund/1500-e...,...,"[LOC, ORG, LOC, ORG, LOC]","[Katastrofe, Større katastrofe]",118,[133],nyheder,328374.0,53014.0,1555452.0,0.9933,Negative
13007,9716030,Zelenskij reagerer på grusomme halshugnings-vi...,Ukraines præsident reagerer på grusomme videoe...,2023-06-29 06:48:04,False,,2023-04-12 08:59:09,,article_video_standalone,https://ekstrabladet.dk/nyheder/krigogkatastro...,...,[],"[Kriminalitet, Kendt, Personfarlig kriminalite...",118,[127],nyheder,81107.0,1097.0,72115.0,0.9871,Negative
16390,9751922,Mareridt for Lindstrøm: Pillet ud i ny Frankfu...,Jesper Lindstrøm var tilbage i startopstilling...,2023-06-29 06:48:36,False,Tidligere på ugen spillede Frankfurt sig i den...,2023-05-06 16:17:34,[9751928],article_default,https://ekstrabladet.dk/sport/fodbold/udenland...,...,"[PER, PER, EVENT, MISC, EVENT, LOC, ORG, LOC, ...","[Begivenhed, Sport, Fodbold, Sportsbegivenhed]",142,"[196, 227, 265]",sport,388707.0,28364.0,1606899.0,0.9738,Negative
20417,9798583,Batteri-drama: Tesla klarede 141.827 km,En sag i Ankenævn for Biler er endt med at Tes...,2023-06-29 06:49:21,False,- Det drejer sig om 1 konkret sag.\n- Og Tesla...,2023-06-05 09:20:17,"[9507028, 9798567, 9798567]",article_default,https://ekstrabladet.dk/nationen/batteri-drama...,...,"[ORG, ORG, ORG, PER, PER, PROD, PER, ORG, PROD]","[Erhverv, Privat virksomhed, Transportmiddel, ...",512,[],nationen,195349.0,46353.0,4880681.0,0.7418,Negative


### Model Save

In [91]:
# Specify the relative directory path
relative_directory = "Saved_Model/"

# Create the full directory path
directory_path = os.path.join(relative_directory)

# Create the directory if it does not exist
os.makedirs(directory_path, exist_ok=True)

# Save the state dictionary of the model to the specified directory
model_save_path = os.path.join(directory_path, "EBNERD_collaborative_filtering_model.pth")
torch.save(ebnerd_model.state_dict(), model_save_path)

### Model Load

In [92]:
# Load the state dictionary from the specified directory
loaded_model = NewsMF(num_users=len(ind2user)+1, num_items=len(ind2article)+1)

# Use a relative path when loading the model
model_load_path = os.path.join("Saved_Model", "EBNERD_collaborative_filtering_model.pth")
loaded_model.load_state_dict(torch.load(model_load_path))

<All keys matched successfully>

### Loaded Model Single Prediciton

In [93]:
# Specify the user ID for prediction
USER_ID = 1234
PREDICTION_COUNT = 10

# Create item_ids and user ids list
article_id = list(ind2article.keys())
userIdx = [USER_ID] * len(article_id)

# Convert lists to PyTorch tensors
user_tensor = torch.IntTensor(userIdx)
item_tensor = torch.IntTensor(article_id)

# Forward pass to get predictions
predictions = loaded_model.forward(user_tensor, item_tensor)

# Select top 10 indices
top_indices = torch.topk(predictions.flatten(), PREDICTION_COUNT).indices

# Get corresponding item IDs
top_item_ids = [ind2article[ix.item()] for ix in top_indices]

# Filter for top 10 suggested items
recommended_items = news[news["article_id"].isin(top_item_ids)]

# Display the recommended items
print(recommended_items)

       article_id                                              title  \
3337      7397168  Fed og grim: Nøgent pigeband med skældsord mal...   
4939      8620132                         Ayla, 22 år og fra Kastrup   
5423      8904436                         Matilde, 23 år og fra Aars   
8566      9460984                                 Den levende fakkel   
8598      9463080                                         Ulvemanden   
9690      9561841             Medie: Topspiller tiltalt for voldtægt   
9959      9583381                       Sexdating - det skal du gøre   
17784     9765010         Medie: United vil hente Eriksen-konkurrent   
18081     9769155  Krydsede 10 lande for at nå USA: - Jeg ville a...   
18850     9777910     Den falske telefon-myte: Du skal ikke gøre det   

                                                subtitle  last_modified_time  \
3337   Det britiske band Little Mix offentliggør prov... 2023-06-29 07:37:43   
4939                                           

### Tensorboard

In [94]:
# Load the extension and start TensorBoard
%load_ext tensorboard
%tensorboard --logdir tb_logs

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


Reusing TensorBoard on port 6006 (pid 7996), started 2 days, 4:48:12 ago. (Use '!kill 7996' to kill it.)

### Convert to Python Script (not needed right now but keep as utility)

In [95]:
!python -m nbconvert --to script EBNERD_Notebook.ipynb

[NbConvertApp] Converting notebook EBNERD_Notebook.ipynb to script
[NbConvertApp] Writing 13586 bytes to EBNERD_Notebook.py


### Get random user id

In [96]:
random_user_index = np.random.randint(0, len(behaviors))
random_user_id = behaviors.iloc[random_user_index]['user_id']

print(f"Randomly selected user ID: {random_user_id}")

Randomly selected user ID: 1295907


### Validate conversion consistency

In [98]:
def validate_mapping_consistency(user2ind, ind2user, article2ind, ind2article):
    # Choose a random user and article ID for validation
    random_user_id = np.random.choice(list(user2ind.keys()))
    random_article_id = np.random.choice(list(article2ind.keys()))
    print(f"Randomly selected user ID: {random_user_id}")
    print(f"Randomly selected article ID: {random_article_id}")

    # Validate user mapping
    user_index = user2ind.get(random_user_id)
    retrieved_user_id = ind2user.get(user_index)
    print(f"User index: {user_index}")
    print(f"Retrieved user ID: {retrieved_user_id}")
    
    user_mapping_consistent = random_user_id == retrieved_user_id

    # Validate article mapping
    article_index = article2ind.get(random_article_id)
    retrieved_article_id = ind2article.get(article_index)
    print(f"Article index: {article_index}")
    print(f"Retrieved article ID: {retrieved_article_id}")

    article_mapping_consistent = random_article_id == retrieved_article_id

    return user_mapping_consistent, article_mapping_consistent

# Perform validation
user_consistency, article_consistency = validate_mapping_consistency(user2ind, ind2user, article2ind, ind2article)

# Print results
print(f"User Mapping Consistency: {user_consistency}")
print(f"Article Mapping Consistency: {article_consistency}")

Randomly selected user ID: 554981
Randomly selected article ID: 8199340
User index: 7720
Retrieved user ID: 554981
Article index: 4240
Retrieved article ID: 8199340
User Mapping Consistency: True
Article Mapping Consistency: True
