In [4]:
import numpy as np
import pandas as pd
import random
from google.colab import drive
import nltk
from nltk.tokenize import word_tokenize
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, Subset, DataLoader
import torch.optim as optim
from sklearn.metrics import roc_auc_score, accuracy_score
import matplotlib.pyplot as plt

nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [5]:
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
# Set the device to GPU if available, otherwise CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [7]:
MAX_TITLE_LEN = 20

articles_path = '/content/drive/MyDrive/NRMS/OG data/articles.parquet'
behaviors_train_path = '/content/drive/MyDrive/NRMS/OG data/behaviors_train.parquet'
behaviors_val_path = '/content/drive/MyDrive/NRMS/OG data/behaviors_val.parquet'
history_train_path = '/content/drive/MyDrive/NRMS/OG data/history_train.parquet'
history_val_path = '/content/drive/MyDrive/NRMS/OG data/history_val.parquet'

# Load data from Parquet files
articles = pd.read_parquet(articles_path)
train_behaviors = pd.read_parquet(behaviors_train_path)
val_behaviors = pd.read_parquet(behaviors_val_path)
history_train = pd.read_parquet(history_train_path)
history_val = pd.read_parquet(history_val_path)

# Create a news dict and  article_id to a unique index
news = {}
newsindex = {'NULL': 0}  # Add a NULL key for padding
for idx, row in articles.iterrows():
    article_id = row['article_id']
    title = row['title'].lower()
    tokenized_title = word_tokenize(title)

    news[article_id] = tokenized_title
    newsindex[article_id] = len(newsindex)

# Summary
print("Number of unique articles:", len(news))
print("Example article ID and tokenized title:")
for k, v in list(news.items())[:3]:  # Print first 3 articles
    print(f"Article ID: {k}, Tokenized Title: {v}")

print("\nTotal articles indexed in newsindex:", len(newsindex), "\n")

# Helper function to sample negative examples
def newsample(array, ratio):
    if len(array) == 0:
        return []
    if ratio > len(array):
        return random.sample(array * (ratio // len(array) + 1), ratio)
    else:
        return random.sample(array, ratio)

# Sampling configuration
npratio = 4  # Number of negative samples per positive sample
MAX_HISTORY_LEN = 50

# Function to process behaviors data
def process_behaviors(behaviors, newsindex, history_data=None):
    train_candidate = []
    train_label = []
    train_user_his = []

    # Build a user history dictionary from history data if provided
    user_history = {}
    if history_data is not None:
        for _, row in history_data.iterrows():
            user_history[row['user_id']] = [newsindex.get(aid, 0) for aid in row['article_id_fixed']]

    for _, row in behaviors.iterrows():
        user_id = row['user_id']

        # Clicked articles (positive examples)
        clicked = [newsindex.get(aid, 0) for aid in row['article_ids_clicked'] if aid in newsindex]
        # Non-clicked articles (negative examples)
        inview = set(row['article_ids_inview'])
        non_clicked = [newsindex.get(aid, 0) for aid in inview if aid in newsindex and aid not in row['article_ids_clicked']]

        # User history
        if user_id in user_history:
            clickids = user_history[user_id][-MAX_HISTORY_LEN:]
        else:
            clickids = clicked[-MAX_HISTORY_LEN:]

        for pos_doc in clicked:
            neg_docs = newsample(non_clicked, npratio)
            candidates = neg_docs + [pos_doc]
            labels = [0] * npratio + [1]

            # Shuffle candidates and labels
            shuffle_indices = list(range(len(candidates)))
            random.shuffle(shuffle_indices)
            shuffled_candidates = [candidates[i] for i in shuffle_indices]
            shuffled_labels = [labels[i] for i in shuffle_indices]

            # Append training data
            train_candidate.append(shuffled_candidates)
            train_label.append(shuffled_labels)
            train_user_his.append(clickids + [0] * (MAX_HISTORY_LEN - len(clickids)))

    return train_candidate, train_label, train_user_his

# Process train behaviors
train_candidate, train_label, train_user_his = process_behaviors(train_behaviors, newsindex, history_train)
val_candidate, val_label, val_user_his = process_behaviors(val_behaviors, newsindex, history_val)

# Print summary
print("Training Data:")
print("Number of training candidates:", len(train_candidate))
print("Number of training labels:", len(train_label))
print("Number of user histories:", len(train_user_his))

print("\nValidation Data:")
print("Number of validation candidates:", len(val_candidate))
print("Number of validation labels:", len(val_label))
print("Number of user histories:", len(val_user_his))

Number of unique articles: 11777
Example article ID and tokenized title:
Article ID: 3037230, Tokenized Title: ['ishockey-spiller', ':', 'jeg', 'troede', 'jeg', 'skulle', 'dø']
Article ID: 3044020, Tokenized Title: ['prins', 'harry', 'tvunget', 'til', 'dna-test']
Article ID: 3057622, Tokenized Title: ['rådden', 'kørsel', 'på', 'blå', 'plader']

Total articles indexed in newsindex: 11778 

Training Data:
Number of training candidates: 24888
Number of training labels: 24888
Number of user histories: 24888

Validation Data:
Number of validation candidates: 25505
Number of validation labels: 25505
Number of user histories: 25505


In [8]:
# Save Train Data
train_df = pd.DataFrame({
    'candidate': train_candidate,
    'label': train_label,
    'user_his': train_user_his
})
train_df.to_parquet('/train_data.parquet', index=False)

# Save Validation Data
val_df = pd.DataFrame({
    'candidate': val_candidate,
    'label': val_label,
    'user_his': val_user_his
})
val_df.to_parquet('/val_data.parquet', index=False)

# Print Confirmation
print("Train and Validation data saved as Parquet files:")
print("  - train_data.parquet")
print("  - val_data.parquet")


Train and Validation data saved as Parquet files:
  - train_data.parquet
  - val_data.parquet


In [9]:
# Build vocabulary from tokenized titles
vocab = {'<PAD>': 0}  # Start with a padding token
for tokens in news.values():
    for word in tokens:
        if word not in vocab:
            vocab[word] = len(vocab)

# Convert titles to token indices
news_tensor = {}
for article_id, tokens in news.items():
    token_indices = [vocab[word] for word in tokens]  # Convert words to token indices

    # Truncate or pad to MAX_TITLE_LEN
    if len(token_indices) > MAX_TITLE_LEN:
        token_indices = token_indices[:MAX_TITLE_LEN]  # Truncate if too long
    else:
        token_indices += [0] * (MAX_TITLE_LEN - len(token_indices))  # Pad with zeros

    news_tensor[newsindex[article_id]] = torch.tensor(token_indices, dtype=torch.long)
    news_tensor[0] = torch.zeros(MAX_TITLE_LEN, dtype=torch.long)

news_tensors_list = [tensor[:MAX_TITLE_LEN] for tensor in news_tensor.values()]  # Truncate to MAX_TITLE_LEN
news_tensors_padded = pad_sequence(news_tensors_list, batch_first=True, padding_value=vocab['<PAD>'])

# Example Output
print("Vocabulary size:", len(vocab))
print("Shape of padded news tensors:", news_tensors_padded.shape)

Vocabulary size: 16003
Shape of padded news tensors: torch.Size([11778, 20])


In [10]:
def load_glove_embeddings(file_path, embedding_dim):
    embeddings_index = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            embedding = np.array(values[1:], dtype='float32')
            embeddings_index[word] = embedding
    print(f"Loaded {len(embeddings_index)} word vectors from GloVe.")
    return embeddings_index

# Load GloVe embeddings (300d)
glove_embeddings = '/content/drive/MyDrive/NRMS/danish_newspapers_1880To2013.txt'
embedding_dim = 300
glove_index = load_glove_embeddings(glove_embeddings, embedding_dim)

Loaded 2404837 word vectors from GloVe.


In [11]:
def create_embedding_matrix(vocab, glove_index, embedding_dim):
    """
    Create an embedding matrix for a given vocabulary using GloVe embeddings.

    Args:
        vocab (dict): Vocabulary mapping words to indices.
        glove_index (dict): GloVe embeddings loaded from the file.
        embedding_dim (int): Dimension of the embeddings.

    Returns:
        torch.Tensor: The embedding matrix.
        int: Count of OOV words.
        int: Count of IV words.
    """
    vocab_size = len(vocab)
    embedding_matrix = np.zeros((vocab_size, embedding_dim))  # Initialize with zeros

    for word, idx in vocab.items():
        if word in glove_index:  # Use GloVe embedding if available
            embedding_matrix[idx] = glove_index[word]
        else:  # Random initialization for words not in GloVe
            embedding_matrix[idx] = np.random.normal(scale=0.6, size=(embedding_dim,))

    return torch.tensor(embedding_matrix, dtype=torch.float32)

# Create the embedding matrix
embedding_matrix = create_embedding_matrix(vocab, glove_index, embedding_dim)

In [12]:
# Vocabulary analysis
total_vocab_size = len(vocab)
iv_count = len([word for word in vocab if word in glove_index])
oov_count = total_vocab_size - iv_count

iv_percentage = (iv_count / total_vocab_size) * 100
oov_percentage = (oov_count / total_vocab_size) * 100

print(f"Vocabulary size: {total_vocab_size}")
print(f"In-Vocabulary words: {iv_count} ({iv_percentage:.2f}%)")
print(f"Out-of-Vocabulary words: {oov_count} ({oov_percentage:.2f}%)")

# Sample OOV words
oov_words = [word for word in vocab if word not in glove_index]
print("Sample OOV words:", oov_words[:10])

Vocabulary size: 16003
In-Vocabulary words: 12290 (76.80%)
Out-of-Vocabulary words: 3713 (23.20%)
Sample OOV words: ['<PAD>', 'ishockey-spiller', ':', 'dna-test', 'mærsk-arvinger', 'zoo-tårnet', '100', 'creamy-pige', '-', 'champagne-drengen']


In [13]:
class NewsDataset(Dataset):
    def __init__(self, data, news_tensor, max_history_len=50):
        """
        Args:
            data (pd.DataFrame): DataFrame with 'candidate', 'label', and 'user_his'
            news_tensor (dict): Dictionary mapping article_id to tokenized title tensors
            max_history_len (int): Maximum length of user history (default: 50)
        """
        self.data = data
        self.news_tensor = news_tensor
        self.max_history_len = max_history_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]

        # Candidate titles: List of tokenized article titles (positive + negatives)
        candidate_ids = row['candidate']
        candidate_titles = torch.stack([self.news_tensor[aid] for aid in candidate_ids])

        # Labels: Positive (1) and negative (0) labels for candidates
        labels = torch.tensor(row['label'], dtype=torch.float)

        # User history: List of clicked articles converted to tokenized titles
        user_his_ids = row['user_his']
        user_his_titles = torch.stack([self.news_tensor[aid] for aid in user_his_ids])

        # Pad user history if it's shorter than max length
        if len(user_his_titles) < self.max_history_len:
            padding = torch.zeros((self.max_history_len - len(user_his_titles), candidate_titles.shape[1]), dtype=torch.long)
            user_his_titles = torch.cat((user_his_titles, padding), dim=0)

        return candidate_titles, user_his_titles, labels


In [14]:
# Load train and validation data
train_data = pd.read_parquet('/train_data.parquet')
val_data = pd.read_parquet('/val_data.parquet')

# Initialize the Dataset for train and validation
train_dataset = NewsDataset(train_data, news_tensor, max_history_len=50)
val_dataset = NewsDataset(val_data, news_tensor, max_history_len=50)

# Create DataLoaders
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

# Fetch a batch to test
for candidate_titles, user_his_titles, labels in train_loader:
    print("Candidate Titles Shape:", candidate_titles.shape)  # (batch_size, num_candidates, MAX_TITLE_LEN)
    print("User History Shape:", user_his_titles.shape)      # (batch_size, max_history_len, MAX_TITLE_LEN)
    print("Labels Shape:", labels.shape)                    # (batch_size, num_candidates)
    break

Candidate Titles Shape: torch.Size([16, 5, 20])
User History Shape: torch.Size([16, 50, 20])
Labels Shape: torch.Size([16, 5])


In [15]:
class NewsEncoder(nn.Module):
    def __init__(self, vocab_size, embedding_dim=300, num_heads=20, attention_hidden_dim=200, pretrained_embeddings=None, dropout_prob=0.2):
        """
        Args:
            vocab_size (int): Size of the vocabulary
            embedding_dim (int): Dimension of word embeddings
            max_title_len (int): Maximum length of article
            pretrained_embeddings (torch.Tensor): Pre-trained embedding matrix
        """
        super(NewsEncoder, self).__init__()

        # Word Embedding Layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)

        # Dropout layer
        self.dropout = nn.Dropout(p=dropout_prob)

        # Single-Head Self-Attention
        self.multihead_attention = nn.MultiheadAttention(embed_dim=embedding_dim, num_heads=num_heads, batch_first=True)

        # Additive Attention Network
        self.additive_attention_query = nn.Parameter(torch.randn(attention_hidden_dim))  # Query vector
        self.additive_attention_fc1 = nn.Linear(embedding_dim, attention_hidden_dim)
        self.additive_attention_fc2 = nn.Linear(attention_hidden_dim, 1)

        # Linear Layer to output fixed-size vector
        self.fc = nn.Linear(embedding_dim, embedding_dim)

    def forward(self, title_tokens):
        """
        Args:
            title_tokens (Tensor): Shape (batch_size, max_title_len)
                                   - Tokenized and padded title tensors
        Returns:
            Tensor: Fixed-size vector representing the article (batch_size, embedding_dim)
        """
        # Word Embedding
        embedded = self.embedding(title_tokens)  # Shape: (batch_size, max_title_len, embedding_dim)
        embedded = self.dropout(embedded)        # Apply dropout to embeddings

        # Multi-Head Self-Attention
        attn_output, _ = self.multihead_attention(embedded, embedded, embedded)  # Shape: (batch_size, max_title_len, embedding_dim)
        attn_output = self.dropout(attn_output)  # Apply dropout to attention outputs

        # Additive Attention
        additive_weights = torch.tanh(self.additive_attention_fc1(attn_output))  # Shape: (batch_size, max_title_len, attention_hidden_dim)
        additive_scores = self.additive_attention_fc2(additive_weights).squeeze(-1)  # Shape: (batch_size, max_title_len)

        # Compute attention weights (softmax over words)
        attention_weights = torch.softmax(additive_scores, dim=1)  # Shape: (batch_size, max_title_len)

        # Weighted sum of the attention outputs
        weighted_sum = torch.sum(attn_output * attention_weights.unsqueeze(-1), dim=1)  # Shape: (batch_size, embedding_dim)

        # Linear Transformation
        output_vector = self.fc(weighted_sum)  # Shape: (batch_size, embedding_dim)

        return output_vector


In [16]:
# Define parameters
VOCAB_SIZE = len(vocab)  # Vocabulary size
EMBEDDING_DIM = 300      # Dimension of word embeddings
MAX_TITLE_LEN = 20       # Length of padded titles
DROPOUT_PROB = 0.2

# Initialize the News Encoder
news_encoder = NewsEncoder(vocab_size=VOCAB_SIZE, embedding_dim=EMBEDDING_DIM, pretrained_embeddings=embedding_matrix, dropout_prob=DROPOUT_PROB)

# Fetch a batch of candidate titles
for candidate_titles, user_his_titles, labels in train_loader:
    # Input shape: (batch_size, num_candidates, MAX_TITLE_LEN)
    batch_size, num_candidates, title_len = candidate_titles.shape
    _, max_history_len, _ = user_his_titles.shape

    # Reshape to merge batch_size and num_candidates
    candidate_titles_reshaped = candidate_titles.view(-1, title_len)  # Shape: (batch_size * num_candidates, MAX_TITLE_LEN)

    # Pass through the News Encoder
    candidate_vectors = news_encoder(candidate_titles_reshaped)  # Shape: (batch_size * num_candidates, EMBEDDING_DIM)

    # Reshape back to original batch_size and num_candidates
    candidate_vectors = candidate_vectors.view(batch_size, num_candidates, EMBEDDING_DIM)

    # Reshape user history titles to merge batch_size and max_history_len
    user_his_titles_reshaped = user_his_titles.view(-1, title_len)  # Shape: (batch_size * max_history_len, MAX_TITLE_LEN)

    # Pass through News Encoder
    user_his_vectors = news_encoder(user_his_titles_reshaped)  # Shape: (batch_size * max_history_len, EMBEDDING_DIM)

    # Reshape back to original batch_size and max_history_len
    user_his_vectors = user_his_vectors.view(batch_size, max_history_len, EMBEDDING_DIM)  # Shape: (batch_size, max_history_len, EMBEDDING_DIM)

    print("Shape of candidate_vectors:", candidate_vectors.shape)  # Expected: (batch_size, num_candidates, EMBEDDING_DIM)
    print("Shape of user_his_vectors:", user_his_vectors.shape)    # Expected: (batch_size, max_history_len, EMBEDDING_DIM)
    break

Shape of candidate_vectors: torch.Size([16, 5, 300])
Shape of user_his_vectors: torch.Size([16, 50, 300])


In [17]:
class UserEncoder(nn.Module):
    def __init__(self, embedding_dim=300, num_heads=20, attention_hidden_dim=200, dropout_prob=0.2):
        """
        Args:
            embedding_dim (int): Dimension of the article embeddings
            num_heads (int): Number of attention heads in multi-head attention
            attention_hidden_dim (int): Dimension of the query vector in additive attention
        """
        super(UserEncoder, self).__init__()

        # Multi-Head Self-Attention
        self.multihead_attention = nn.MultiheadAttention(embed_dim=embedding_dim,
                                                         num_heads=num_heads,
                                                         batch_first=True)

        # Additive Attention
        self.additive_attention_query = nn.Parameter(torch.randn(attention_hidden_dim))  # Learnable query vector
        self.additive_attention_fc1 = nn.Linear(embedding_dim, attention_hidden_dim)
        self.additive_attention_fc2 = nn.Linear(attention_hidden_dim, 1)

        # Dropout Layer
        self.dropout = nn.Dropout(p=dropout_prob)

    def forward(self, user_his_vectors):
        """
        Args:
            user_his_vectors (Tensor): Shape (batch_size, max_history_len, embedding_dim)
                                        - Representations of clicked articles
        Returns:
            Tensor: User representation vector (batch_size, embedding_dim)
        """
        # Multi-Head Self-Attention
        attn_output, _ = self.multihead_attention(user_his_vectors, user_his_vectors, user_his_vectors)
        # Shape: (batch_size, max_history_len, embedding_dim)

        # Additive Attention
        additive_weights = torch.tanh(self.additive_attention_fc1(attn_output))  # Shape: (batch_size, max_history_len, attention_hidden_dim)
        additive_weights = self.dropout(additive_weights)  # Apply dropout to attention weights
        additive_scores = self.additive_attention_fc2(additive_weights).squeeze(-1)  # Shape: (batch_size, max_history_len)

        # Compute attention weights (softmax over user history)
        attention_weights = torch.softmax(additive_scores, dim=1)  # Shape: (batch_size, max_history_len)

        # Weighted sum of the attention outputs
        user_vector = torch.sum(attn_output * attention_weights.unsqueeze(-1), dim=1)  # Shape: (batch_size, embedding_dim)

        user_vector = self.dropout(user_vector)  # Apply dropout to the final user vector

        return user_vector

In [18]:
# Define parameters
EMBEDDING_DIM = 300      # Dimension of article embeddings
NUM_HEADS = 20           # Number of attention heads
ATTENTION_HIDDEN_DIM = 200  # Query vector dimension for additive attention
DROPOUT_PROB = 0.0

# Initialize User Encoder
user_encoder = UserEncoder(embedding_dim=EMBEDDING_DIM, num_heads=NUM_HEADS, attention_hidden_dim=ATTENTION_HIDDEN_DIM, dropout_prob=DROPOUT_PROB)

# Fetch a batch of user history vectors
for candidate_titles, user_his_titles, labels in train_loader:
    # Reshape user history titles to merge batch_size and max_history_len
    user_his_titles_reshaped = user_his_titles.view(-1, MAX_TITLE_LEN)  # Shape: (batch_size * max_history_len, MAX_TITLE_LEN)

    # Pass through News Encoder to get user history vectors
    user_his_vectors = news_encoder(user_his_titles_reshaped)  # Shape: (batch_size * max_history_len, EMBEDDING_DIM)

    # Reshape back to (batch_size, max_history_len, EMBEDDING_DIM)
    batch_size, max_history_len, _ = user_his_titles.shape
    user_his_vectors = user_his_vectors.view(batch_size, max_history_len, EMBEDDING_DIM)

    # Pass user history vectors through User Encoder
    user_vectors = user_encoder(user_his_vectors)  # Shape: (batch_size, EMBEDDING_DIM)

    print("Shape of user_his_vectors:", user_his_vectors.shape)  # (batch_size, max_history_len, EMBEDDING_DIM)
    print("Shape of user_vectors:", user_vectors.shape)         # (batch_size, EMBEDDING_DIM)
    break

Shape of user_his_vectors: torch.Size([16, 50, 300])
Shape of user_vectors: torch.Size([16, 300])


In [19]:
class NRMSModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim=300, num_heads=20, attention_hidden_dim=200, max_history_len=50):
        """
        Args:
            vocab_size (int): Size of the vocabulary
            embedding_dim (int): Dimension of word embeddings
            num_heads (int): Number of attention heads in multi-head attention
            attention_hidden_dim (int): Dimension of the query vector in additive attention
            max_history_len (int): Maximum length of user history
            max_title_len (int): Maximum length of article titles
        """
        super(NRMSModel, self).__init__()

        # News Encoder for both candidate and user history articles
        self.news_encoder = NewsEncoder(vocab_size=vocab_size,
                                         embedding_dim=embedding_dim,
                                         num_heads=num_heads,
                                         attention_hidden_dim=attention_hidden_dim)

        # User Encoder to encode user history into a single user representation vector
        self.user_encoder = UserEncoder(embedding_dim=embedding_dim,
                                         num_heads=num_heads,
                                         attention_hidden_dim=attention_hidden_dim)

    def forward(self, candidate_titles, user_his_titles):
        """
        Args:
            candidate_titles (Tensor): Shape (batch_size, num_candidates, max_title_len)
                                       - Tokenized and padded titles of candidate articles
            user_his_titles (Tensor): Shape (batch_size, max_history_len, max_title_len)
                                       - Tokenized and padded titles of user clicked articles
        Returns:
            Tensor: Click scores for each candidate article (batch_size, num_candidates)
        """
        batch_size, num_candidates, max_title_len = candidate_titles.shape
        _, max_history_len, _ = user_his_titles.shape

        # -------------------
        # Process Candidate Articles
        # -------------------
        # Reshape candidates to merge batch_size and num_candidates
        candidate_titles_reshaped = candidate_titles.view(-1, max_title_len)  # Shape: (batch_size * num_candidates, max_title_len)

        # Encode candidate articles
        candidate_vectors = self.news_encoder(candidate_titles_reshaped)  # Shape: (batch_size * num_candidates, embedding_dim)

        # Reshape back to original batch_size and num_candidates
        candidate_vectors = candidate_vectors.view(batch_size, num_candidates, -1)  # Shape: (batch_size, num_candidates, embedding_dim)

        # -------------------
        # Process User History
        # -------------------
        # Reshape user history titles to merge batch_size and max_history_len
        user_his_titles_reshaped = user_his_titles.view(-1, max_title_len)  # Shape: (batch_size * max_history_len, max_title_len)

        # Encode user history articles
        user_his_vectors = self.news_encoder(user_his_titles_reshaped)  # Shape: (batch_size * max_history_len, embedding_dim)

        # Reshape back to original batch_size and max_history_len
        user_his_vectors = user_his_vectors.view(batch_size, max_history_len, -1)  # Shape: (batch_size, max_history_len, embedding_dim)

        # Encode user history into a single user representation vector
        user_vectors = self.user_encoder(user_his_vectors)  # Shape: (batch_size, embedding_dim)

        # -------------------
        # Compute Click Scores
        # -------------------
        # Dot product between user vector and candidate vectors
        click_scores = torch.bmm(candidate_vectors, user_vectors.unsqueeze(-1)).squeeze(-1)  # Shape: (batch_size, num_candidates)

        return click_scores


In [20]:
# Initialize the NRMS Model
nrms_model = NRMSModel(vocab_size=VOCAB_SIZE,
                       embedding_dim=EMBEDDING_DIM,
                       num_heads=NUM_HEADS,
                       attention_hidden_dim=ATTENTION_HIDDEN_DIM,
                       max_history_len=50)

# Fetch a batch from the DataLoader
for candidate_titles, user_his_titles, labels in train_loader:
    # Pass through NRMS model
    click_scores = nrms_model(candidate_titles, user_his_titles)  # Shape: (batch_size, num_candidates)

    print("Shape of click_scores:", click_scores.shape)  # Expected: (batch_size, num_candidates)
    print("Example click scores:", click_scores[0])     # Print scores for the first user in the batch
    break


Shape of click_scores: torch.Size([16, 5])
Example click scores: tensor([ 0.0091,  0.0017, -0.0151, -0.0202, -0.0106],
       grad_fn=<SelectBackward0>)


In [21]:
# Define a function to decode tokenized titles back into text
def decode_title(token_indices, vocab):
    reverse_vocab = {idx: word for word, idx in vocab.items()}  # Reverse the vocabulary
    return " ".join([reverse_vocab[idx] for idx in token_indices if idx != 0])  # Ignore padding tokens (0)

# Fetch a batch from the DataLoader
for candidate_titles, user_his_titles, labels in train_loader:
    # Pass through NRMS model
    click_scores = nrms_model(candidate_titles, user_his_titles)  # Shape: (batch_size, num_candidates)

    # Loop through the batch
    batch_size, num_candidates, max_title_len = candidate_titles.shape
    for i in range(batch_size):
        print(f"\nUser {i + 1}:")
        print("-" * 30)

        # Decode and print each candidate title, its label, and its score
        for j in range(num_candidates):
            title_tokens = candidate_titles[i, j].cpu().numpy()  # Get token indices for the title
            title_text = decode_title(title_tokens, vocab)  # Decode the title back into text
            label = labels[i, j].item()  # Get the label
            score = click_scores[i, j].item()  # Get the click score

            print(f"Candidate {j + 1}:")
            print(f"  Title: {title_text}")
            print(f"  Label: {label}")
            print(f"  Click Score: {score:.4f}")
    break



User 1:
------------------------------
Candidate 1:
  Title: fyres efter skandalekamp
  Label: 0.0
  Click Score: -0.0023
Candidate 2:
  Title: bagmænd bag angreb i belgorod varsler mere af samme skuffe
  Label: 0.0
  Click Score: -0.0079
Candidate 3:
  Title: dansk storklub slået i vildt drama : det er ikke forbi
  Label: 1.0
  Click Score: 0.0087
Candidate 4:
  Title: jørgen de mylius om tina turner : hun var 'unbelievable '
  Label: 0.0
  Click Score: 0.0353
Candidate 5:
  Title: forgældet og falleret : martin skrider fra det hele
  Label: 0.0
  Click Score: 0.0087

User 2:
------------------------------
Candidate 1:
  Title: danmark i rød : c25-indekset styrtdykker
  Label: 0.0
  Click Score: 0.0181
Candidate 2:
  Title: rekordmange tyskere planlægger sommerferie i danske feriehuse
  Label: 1.0
  Click Score: 0.0011
Candidate 3:
  Title: skød ekskonen for øjnene af børnene : skyldig i overlagt drab
  Label: 0.0
  Click Score: -0.0134
Candidate 4:
  Title: bange : de synger putins 

In [22]:
# Define parameters
NUM_EPOCHS = 3          # Number of epochs
LEARNING_RATE = 1e-4    # Learning rate
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

# Initialize the model, loss, and optimizer
nrms_model = NRMSModel(vocab_size=VOCAB_SIZE,
                       embedding_dim=EMBEDDING_DIM,
                       num_heads=NUM_HEADS,
                       attention_hidden_dim=ATTENTION_HIDDEN_DIM,
                       max_history_len=50).to(DEVICE)

criterion = nn.CrossEntropyLoss()  # Loss function
optimizer = optim.Adam(nrms_model.parameters(), lr=LEARNING_RATE)  # Optimizer

# Training Loop
for epoch in range(NUM_EPOCHS):
    nrms_model.train()  # Set model to training mode
    total_loss = 0

    for batch_idx, (candidate_titles, user_his_titles, labels) in enumerate(train_loader):
        # Move data to the appropriate device (GPU/CPU)
        candidate_titles = candidate_titles.to(DEVICE)  # Shape: (batch_size, num_candidates, max_title_len)
        user_his_titles = user_his_titles.to(DEVICE)    # Shape: (batch_size, max_history_len, max_title_len)
        labels = torch.argmax(labels, dim=1).to(DEVICE)  # Convert labels to indices, Shape: (batch_size)

        # Zero gradients
        optimizer.zero_grad()

        # Forward pass
        click_scores = nrms_model(candidate_titles, user_his_titles)  # Shape: (batch_size, num_candidates)

        # Compute loss
        loss = criterion(click_scores, labels)  # CrossEntropyLoss expects (scores, target_indices)
        total_loss += loss.item()

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        # Print progress for every 100 batches
        if batch_idx % 100 == 0:
            print(f"Epoch [{epoch + 1}/{NUM_EPOCHS}], Batch [{batch_idx + 1}/{len(train_loader)}], Loss: {loss.item():.4f}")

    # Print epoch loss
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch [{epoch + 1}/{NUM_EPOCHS}] - Average Loss: {avg_loss:.4f}")

print("Training complete!")


Epoch [1/3], Batch [1/1556], Loss: 1.6084
Epoch [1/3], Batch [101/1556], Loss: 1.6097
Epoch [1/3], Batch [201/1556], Loss: 1.4620
Epoch [1/3], Batch [301/1556], Loss: 1.4796
Epoch [1/3], Batch [401/1556], Loss: 1.5426
Epoch [1/3], Batch [501/1556], Loss: 1.6327
Epoch [1/3], Batch [601/1556], Loss: 1.5980
Epoch [1/3], Batch [701/1556], Loss: 1.5228
Epoch [1/3], Batch [801/1556], Loss: 1.5809
Epoch [1/3], Batch [901/1556], Loss: 1.6101
Epoch [1/3], Batch [1001/1556], Loss: 1.7373
Epoch [1/3], Batch [1101/1556], Loss: 1.5175
Epoch [1/3], Batch [1201/1556], Loss: 1.3431
Epoch [1/3], Batch [1301/1556], Loss: 1.5037
Epoch [1/3], Batch [1401/1556], Loss: 1.4826
Epoch [1/3], Batch [1501/1556], Loss: 1.3537
Epoch [1/3] - Average Loss: 1.5458
Epoch [2/3], Batch [1/1556], Loss: 1.5767
Epoch [2/3], Batch [101/1556], Loss: 1.5720
Epoch [2/3], Batch [201/1556], Loss: 1.4555
Epoch [2/3], Batch [301/1556], Loss: 1.6261
Epoch [2/3], Batch [401/1556], Loss: 1.4812
Epoch [2/3], Batch [501/1556], Loss: 1.

In [28]:
def compute_mrr(labels, scores):
    """Compute Mean Reciprocal Rank (MRR)."""
    ranks = []
    for label, score in zip(labels, scores):
        sorted_indices = np.argsort(-score)  # Sort scores in descending order
        rank = np.where(sorted_indices == label)[0][0] + 1  # Rank of the true label
        ranks.append(1 / rank)
    return np.mean(ranks)

def compute_ndcg_at_k(labels, scores, k=5):
    """Compute Normalized Discounted Cumulative Gain at K (nDCG@k)."""
    ndcgs = []
    for label, score in zip(labels, scores):
        sorted_indices = np.argsort(-score)[:k]  # Top-k indices by score
        dcg = 0
        for i, idx in enumerate(sorted_indices):
            if idx == label:
                dcg += 1 / np.log2(i + 2)  # i + 2 to avoid log(0)
                break
        idcg = 1 / np.log2(1 + 1)  # Ideal DCG when true label is ranked 1st
        ndcgs.append(dcg / idcg if idcg > 0 else 0)
    return np.mean(ndcgs)

def validate_model(model, val_loader, device):
    model.eval()  # Set model to evaluation mode
    all_labels = []
    all_preds = []
    all_scores = []

    with torch.no_grad():  # No gradient computation during evaluation
        for batch_idx, (candidate_titles, user_his_titles, labels) in enumerate(val_loader):
            # Move data to the appropriate device
            candidate_titles = candidate_titles.to(device)  # Shape: (batch_size, num_candidates, max_title_len)
            user_his_titles = user_his_titles.to(device)    # Shape: (batch_size, max_history_len, max_title_len)
            labels = torch.argmax(labels, dim=1).to(device)  # Convert labels to indices, Shape: (batch_size)

            # Forward pass
            click_scores = model(candidate_titles, user_his_titles)  # Shape: (batch_size, num_candidates)

            # Predicted indices (max score per candidate set)
            preds = torch.argmax(click_scores, dim=1).cpu().numpy()  # Shape: (batch_size)

            # Append true labels and predictions for metrics
            all_labels.extend(labels.cpu().numpy())  # True labels
            all_preds.extend(preds)  # Predicted labels
            all_scores.extend(torch.softmax(click_scores, dim=1).cpu().numpy())  # Softmax probabilities

    print(f"Shape of all_labels: {len(all_labels)}")  # Should match the total number of validation samples
    print(f"Shape of all_scores: {len(all_scores), len(all_scores[0])}")  # Should be (n_samples, n_classes)

    # Compute metrics
    accuracy = accuracy_score(all_labels, all_preds)
    auc = roc_auc_score(all_labels, all_scores, multi_class="ovr")  # Use full softmax scores
    mrr = compute_mrr(all_labels, all_scores)
    ndcg_at_5 = compute_ndcg_at_k(all_labels, all_scores, k=5)

    print(f"Validation Accuracy: {accuracy:.4f}")
    print(f"Validation AUC: {auc:.4f}")
    print(f"Validation MRR: {mrr:.4f}")
    print(f"Validation nDCG@5: {ndcg_at_5:.4f}")
    return accuracy, auc, mrr, ndcg_at_5

In [29]:
# Test the trained model
validate_model(nrms_model, val_loader, DEVICE)

Shape of all_labels: 25505
Shape of all_scores: (25505, 5)
Validation Accuracy: 0.2401
Validation AUC: 0.5711
Validation MRR: 0.4957
Validation nDCG@5: 0.6199


(0.2401489903940404, 0.5710892144799267, 0.4956727439064236, 0.619910212222263)

In [25]:
torch.save(nrms_model.state_dict(), "/content/nrms_model.pth")

In [26]:
nrms_model = NRMSModel(vocab_size=VOCAB_SIZE,
                       embedding_dim=EMBEDDING_DIM,
                       num_heads=NUM_HEADS,
                       attention_hidden_dim=ATTENTION_HIDDEN_DIM,
                       max_history_len=50).to(DEVICE)
nrms_model.load_state_dict(torch.load("/content/nrms_model.pth"))
nrms_model.eval()  # Set model to evaluation mode


  nrms_model.load_state_dict(torch.load("/content/nrms_model.pth"))


NRMSModel(
  (news_encoder): NewsEncoder(
    (embedding): Embedding(16003, 300, padding_idx=0)
    (dropout): Dropout(p=0.2, inplace=False)
    (multihead_attention): MultiheadAttention(
      (out_proj): NonDynamicallyQuantizableLinear(in_features=300, out_features=300, bias=True)
    )
    (additive_attention_fc1): Linear(in_features=300, out_features=200, bias=True)
    (additive_attention_fc2): Linear(in_features=200, out_features=1, bias=True)
    (fc): Linear(in_features=300, out_features=300, bias=True)
  )
  (user_encoder): UserEncoder(
    (multihead_attention): MultiheadAttention(
      (out_proj): NonDynamicallyQuantizableLinear(in_features=300, out_features=300, bias=True)
    )
    (additive_attention_fc1): Linear(in_features=300, out_features=200, bias=True)
    (additive_attention_fc2): Linear(in_features=200, out_features=1, bias=True)
    (dropout): Dropout(p=0.2, inplace=False)
  )
)

In [27]:
# Inspect click probabilities
for batch_idx, (candidate_titles, user_his_titles, labels) in enumerate(val_loader):
    candidate_titles = candidate_titles.to(DEVICE)
    user_his_titles = user_his_titles.to(DEVICE)
    labels = torch.argmax(labels, dim=1).to(DEVICE)

    # Forward pass
    click_scores = nrms_model(candidate_titles, user_his_titles)
    softmax_probs = torch.softmax(click_scores, dim=1)

    for i in range(candidate_titles.size(0)):  # Iterate over the batch
        print(f"\nUser {i + 1}:")
        print(f"True Label: {labels[i].item()}")
        print(f"Predicted Click Probabilities: {softmax_probs[i].cpu().detach().numpy()}")
    break  # Remove this to inspect the full dataset



User 1:
True Label: 3
Predicted Click Probabilities: [0.16499498 0.48258266 0.0181701  0.3061414  0.02811082]

User 2:
True Label: 2
Predicted Click Probabilities: [0.17946708 0.34895906 0.2935238  0.15891865 0.01913144]

User 3:
True Label: 2
Predicted Click Probabilities: [0.15165615 0.15012798 0.32395694 0.18934967 0.18490924]

User 4:
True Label: 2
Predicted Click Probabilities: [0.1869131  0.27347305 0.14871149 0.17994772 0.21095464]

User 5:
True Label: 0
Predicted Click Probabilities: [0.21140973 0.07901444 0.21624954 0.24403815 0.24928814]

User 6:
True Label: 3
Predicted Click Probabilities: [0.20432742 0.11676952 0.18353046 0.16342986 0.33194277]

User 7:
True Label: 3
Predicted Click Probabilities: [0.099669   0.13146655 0.2448048  0.25524434 0.26881537]

User 8:
True Label: 1
Predicted Click Probabilities: [0.20698081 0.09441364 0.2165706  0.3152093  0.16682565]

User 9:
True Label: 2
Predicted Click Probabilities: [0.11922061 0.04921909 0.17965813 0.32354066 0.3283615 ]

