In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
import pandas as pd
import numpy as np
from collections import defaultdict
from google.colab import drive

# Constants
MAX_SENT_LENGTH = 30
MAX_SENTS = 50
EMBEDDING_DIM = 300
NUM_HEADS = 16
HEAD_SIZE = 16
DROPOUT_RATE = 0.2
NPRATIO = 4  # Number of negative samples per positive
BATCH_SIZE = 64
EPOCHS = 5


In [2]:
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import torch
print(torch.cuda.is_available())
print(torch.cuda.get_device_name(0))

True
Tesla T4


In [4]:
articles_path = '/content/drive/MyDrive/DTU/Kandidat/Semester 9/Deep learning/Final project/Data/articles.csv'
behaviors_train_path = '/content/drive/MyDrive/DTU/Kandidat/Semester 9/Deep learning/Final project/Data/behaviors_train.csv'
behaviors_val_path = '/content/drive/MyDrive/DTU/Kandidat/Semester 9/Deep learning/Final project/Data/behaviors_val.csv'
history_train_path = '/content/drive/MyDrive/DTU/Kandidat/Semester 9/Deep learning/Final project/Data/history_train.csv'
history_val_path = '/content/drive/MyDrive/DTU/Kandidat/Semester 9/Deep learning/Final project/Data/history_val.csv'

In [5]:
# Load Datasets
articles = pd.read_csv(articles_path)
behaviors_train = pd.read_csv(behaviors_train_path)
behaviors_val = pd.read_csv(behaviors_val_path)
history_train = pd.read_csv(history_train_path)
history_val = pd.read_csv(history_val_path)

In [6]:
def build_vocab_and_tokenize(titles, max_len=MAX_SENT_LENGTH):
    """
    Builds a vocabulary and tokenizes article titles.

    Args:
        titles (list of str): List of article titles to tokenize.
        max_len (int): Maximum length for tokenized titles (truncation/padding length).

    Returns:
        tokenized_titles (list of list of int): Tokenized and padded titles.
        vocab (dict): A dictionary mapping tokens to unique integer indices.
        vocab_size (int): Size of the vocabulary.
    """
    vocab = defaultdict(lambda: len(vocab))  # Default dictionary for token ids
    vocab["<PAD>"] = 0  # Padding token
    vocab["<UNK>"] = 1  # Unknown token

    tokenized_titles = []
    for title in titles:
        tokens = title.lower().split()[:max_len]  # Simple whitespace tokenizer
        tokenized = [vocab[token] for token in tokens]
        padded = pad_sequence_to_length(tokenized, max_len, pad_value=vocab["<PAD>"])
        tokenized_titles.append(padded)

    # Freeze the vocabulary after processing to get accurate vocab size
    vocab = dict(vocab)  # Convert to a regular dict to freeze it
    vocab_size = len(vocab)

    return tokenized_titles, vocab, vocab_size

def pad_sequence_to_length(sequence, target_length, pad_value=0):
    """
    Pads or truncates a sequence to the specified target length.

    Args:
        sequence (list of int): Input sequence to pad or truncate.
        target_length (int): Desired length of the sequence.
        pad_value (int): Value to use for padding shorter sequences.

    Returns:
        list of int: Padded or truncated sequence.
    """
    if len(sequence) >= target_length:
        return sequence[:target_length]
    else:
        return sequence + [pad_value] * (target_length - len(sequence))

In [7]:
# Tokenize titles and build vocabulary
articles["tokenized_title"], vocab, VOCAB_SIZE = build_vocab_and_tokenize(
    articles["title"].fillna("<UNK>"),
    max_len=MAX_SENT_LENGTH
)

In [8]:
article_to_tokens = {row['article_id']: row['tokenized_title'] for _, row in articles.iterrows()}

article_to_idx = {article_id: idx for idx, article_id in enumerate(articles['article_id'].unique(), start=2)}
article_to_idx[0] = 0  # Reserved for <PAD>
article_to_idx[1] = 1  # Reserved for <UNK>

article_embedding_size = len(article_to_idx) + 1

In [9]:
import logging

# Setup logging (if not already configured in your project)
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

def clean_article_ids(article_ids):
    """
    Cleans and parses article IDs from a string representation to a list of integers.

    Args:
        article_ids (str): String representation of article IDs (e.g., "[1, 2, 3]").

    Returns:
        list of int or None: List of parsed article IDs, or None if input is invalid.
    """
    try:
        # Check for invalid placeholders or empty strings
        if not article_ids or "..." in article_ids:
            logging.warning(f"Malformed article_ids found: '{article_ids}'. Skipping.")
            return None

        # Remove brackets and split on spaces or commas, then convert to integers
        cleaned_ids = article_ids.strip("[]").replace(",", " ").split()
        return list(map(int, cleaned_ids))
    except Exception as e:
        logging.error(f"Failed to clean article_ids '{article_ids}' due to error: {e}")
        return None

# Function to process a single dataset (train or val)
def clean_and_report_history(history_df, dataset_name="dataset"):
    """
    Cleans article IDs in the user history dataset and reports cleaning statistics.

    Args:
        history_df (pd.DataFrame): Input dataset with article history to clean.
        dataset_name (str): Name of the dataset (for logging and reporting).

    Returns:
        pd.DataFrame: Cleaned dataset with invalid rows removed.
    """
    # Clean article IDs
    history_df["cleaned_article_ids"] = history_df["article_id_fixed"].apply(clean_article_ids)

    # Count skipped rows
    skipped_rows = history_df["cleaned_article_ids"].isna().sum()
    total_rows = len(history_df)
    print(f"{dataset_name}: Skipped {skipped_rows} rows out of {total_rows} "
          f"({skipped_rows / total_rows:.2%}).")

    # Save problematic rows
    invalid_rows = history_df[history_df["cleaned_article_ids"].isna()]
    invalid_rows_file = f"invalid_article_ids_{dataset_name}.csv"
    invalid_rows.to_csv(invalid_rows_file, index=False)
    print(f"{dataset_name}: Saved {len(invalid_rows)} problematic rows to '{invalid_rows_file}'.")

    # Drop invalid rows and reset index
    cleaned_df = history_df.dropna(subset=["cleaned_article_ids"]).reset_index(drop=True)
    print(f"{dataset_name}: Remaining rows after cleaning: {len(cleaned_df)}")

    return cleaned_df

# Clean train and validation datasets
history_train_cleaned = clean_and_report_history(history_train, dataset_name="train")
history_val_cleaned = clean_and_report_history(history_val, dataset_name="val")


def process_cleaned_user_history(cleaned_history_df):
    user_histories = defaultdict(list)
    for _, row in cleaned_history_df.iterrows():
        user_id = row["user_id"]
        article_ids = row["cleaned_article_ids"]
        user_histories[user_id].extend(article_ids)
    return user_histories

user_history_train_cleaned = process_cleaned_user_history(history_train_cleaned)
user_history_val_cleaned = process_cleaned_user_history(history_val_cleaned)



train: Skipped 2 rows out of 1590 (0.13%).
train: Saved 2 problematic rows to 'invalid_article_ids_train.csv'.
train: Remaining rows after cleaning: 1588
val: Skipped 0 rows out of 1562 (0.00%).
val: Saved 0 problematic rows to 'invalid_article_ids_val.csv'.
val: Remaining rows after cleaning: 1562


In [10]:
print(f"Vocabulary Size: {VOCAB_SIZE}")
print(f"Sample Tokens: {list(vocab.keys())[:10]}")

Vocabulary Size: 18591
Sample Tokens: ['<PAD>', '<UNK>', 'ishockey-spiller:', 'jeg', 'troede', 'skulle', 'dø', 'prins', 'harry', 'tvunget']


In [11]:
import json
import os

data_folder = "/content/drive/MyDrive/DTU/Kandidat/Semester 9/Deep learning/Final project/Data"
vocab_file = os.path.join(data_folder, "vocab.json")

os.makedirs(data_folder, exist_ok=True)

# Save the vocabulary to a file
with open(vocab_file, "w") as f:
    json.dump(vocab, f)

print(f"Vocabulary saved to {vocab_file}")

Vocabulary saved to /content/drive/MyDrive/DTU/Kandidat/Semester 9/Deep learning/Final project/Data/vocab.json


In [12]:
len(user_history_train_cleaned[13538])

582

In [13]:
from random import sample

def create_samples(behaviors_df, user_history, npratio=NPRATIO, max_sents=MAX_SENTS, max_sent_length=MAX_SENT_LENGTH):
    samples = []
    labels = []
    for _, row in behaviors_df.iterrows():
        user_id = row["user_id"]

        clicked_articles = clean_article_ids(row['article_ids_clicked'])
        inview_articles = clean_article_ids(row['article_ids_inview'])

        if clicked_articles is None or inview_articles is None:
            continue

        clicked_articles = [article_to_idx.get(article_id, 1) for article_id in clicked_articles]  # Map or <UNK>
        inview_articles = [article_to_idx.get(article_id, 1) for article_id in inview_articles]  # Map or <UNK>

        # Prepare user history
        user_hist = user_history.get(user_id, [])
        user_hist = [article_to_idx.get(article_id, 1) for article_id in user_hist[:max_sents]]  # Map or <UNK>
        user_hist += [0] * (max_sents - len(user_hist))  # Pad to max_sents

        # Add positive samples
        for article_idx in clicked_articles:
            candidate = [article_idx] + [0] * (max_sent_length - 1)
            samples.append((user_hist, candidate))
            labels.append(1)

        # Add negative samples
        negative_articles = list(set(inview_articles) - set(clicked_articles))
        for article_idx in negative_articles:
            candidate = [article_idx] + [0] * (max_sent_length - 1)
            samples.append((user_hist, candidate))
            labels.append(0)

    return samples, labels

train_samples_cleaned, train_labels_cleaned = create_samples(
    behaviors_train, user_history_train_cleaned, npratio=NPRATIO
    )
val_samples_cleaned, val_labels_cleaned = create_samples(
    behaviors_val, user_history_val_cleaned, npratio=NPRATIO
    )

In [14]:

# Step 4: Define PyTorch Dataset
class NewsDataset(Dataset):
    def __init__(self, samples, labels):
        self.samples = samples
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        user_history, candidate = self.samples[idx]
        return (
            torch.tensor(user_history, dtype=torch.long),
            torch.tensor(candidate, dtype=torch.long),
            torch.tensor(self.labels[idx], dtype=torch.long),
        )

train_dataset = NewsDataset(train_samples_cleaned, train_labels_cleaned)
val_dataset = NewsDataset(val_samples_cleaned, val_labels_cleaned)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [15]:
# Step 5: Define the NRMS Model
class MultiHeadSelfAttention(nn.Module):
    def __init__(self, num_heads, head_size):
        super().__init__()
        self.num_heads = num_heads
        self.head_size = head_size
        self.output_dim = num_heads * head_size
        self.qkv_linear = nn.Linear(EMBEDDING_DIM, self.output_dim * 3)
        self.fc_out = nn.Linear(self.output_dim, EMBEDDING_DIM)

    def forward(self, x):
        if len(x.size()) != 3:
            raise ValueError(f"Expected input to be 3D (batch_size, seq_length, embed_dim), got {x.size()}")
        batch_size, seq_length, embed_dim = x.size()
        qkv = self.qkv_linear(x).reshape(batch_size, seq_length, self.num_heads, 3 * self.head_size)
        qkv = qkv.permute(2, 0, 1, 3)
        Q, K, V = torch.chunk(qkv, 3, dim=-1)
        scores = torch.matmul(Q, K.transpose(-2, -1)) / torch.sqrt(torch.tensor(self.head_size, dtype=torch.float32))
        attention = F.softmax(scores, dim=-1)
        weighted = torch.matmul(attention, V)
        weighted = weighted.permute(1, 2, 0, 3).reshape(batch_size, seq_length, self.output_dim)
        return self.fc_out(weighted)

class TitleEncoder(nn.Module):
    def __init__(self, article_embedding_size, embedding_dim):
        super().__init__()
        self.embedding = nn.Embedding(article_embedding_size, embedding_dim)
        self.dropout = nn.Dropout(DROPOUT_RATE)
        self.self_attention = MultiHeadSelfAttention(NUM_HEADS, HEAD_SIZE)
        self.dense = nn.Linear(embedding_dim, 1)

    def forward(self, x):
        x = self.embedding(x)  # Ensure input is embedded
        if len(x.size()) != 3:
            raise ValueError(f"Embedding layer output should be 3D, got {x.size()}")
        x = self.dropout(x)
        x = self.self_attention(x)
        attention_weights = F.softmax(self.dense(x).squeeze(-1), dim=-1) # attention weights
        return torch.sum(x * attention_weights.unsqueeze(-1), dim=1) # weighted sum

class NRMS(nn.Module):
    def __init__(self, article_embedding_size, embedding_dim, num_classes):
        super().__init__()
        self.title_encoder = TitleEncoder(article_embedding_size, embedding_dim)

    def forward(self, candidates, user_history):
        assert len(user_history.size()) == 2, f"Expected user_history to have 2 dimensions, got {user_history.size()}"
        assert len(candidates.size()) == 2, f"Expected candidates to have 2 dimensions, got {candidates.size()}"

        user_rep = self.title_encoder(user_history)  # Output: (batch_size, embedding_dim)
        candidate_rep = self.title_encoder(candidates)  # Output: (batch_size, embedding_dim)
        return torch.matmul(candidate_rep, user_rep.unsqueeze(-1)).squeeze(-1)

In [18]:
from sklearn.metrics import accuracy_score

def train_model(model, train_loader, val_loader, optimizer, criterion, epochs=EPOCHS):

    for epoch in range(epochs):
        # Training Loop
        model.train()
        total_train_loss = 0
        for batch in train_loader:
            user_histories, candidates, labels = [x.to(device) for x in batch]

            optimizer.zero_grad()
            outputs = model(candidates, user_histories)  # Pass inputs to model

            # Compute loss and update weights
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            total_train_loss += loss.item()

        # Validation Loop
        model.eval()
        total_val_loss = 0
        all_preds, all_labels = [], []

        with torch.no_grad():
            for batch in val_loader:
                user_histories, candidates, labels = [x.to(device) for x in batch]
                outputs = model(candidates, user_histories)
                loss = criterion(outputs, labels)
                total_val_loss += loss.item()

                # Predictions and metrics
                _, preds = torch.max(outputs, dim=1)  # Get predicted class
                all_preds.extend(preds.cpu().tolist())  # Move predictions to CPU before converting to list
                all_labels.extend(labels.cpu().tolist())  # Move labels to CPU before converting to list

        # Metrics
        accuracy = accuracy_score(all_labels, all_preds)

        # Logging
        print(f"Epoch {epoch + 1}/{epochs}")
        print(f"  Train Loss: {total_train_loss:.4f}")
        print(f"  Val Loss: {total_val_loss:.4f}")
        print(f"  Val Accuracy: {accuracy:.4f}")


In [19]:
def train_model(model, train_loader, val_loader, optimizer, criterion, epochs=EPOCHS):
    for epoch in range(epochs):
        model.train()
        total_train_loss = 0

        for batch in train_loader:
            user_histories, candidates, labels = [x.to(device) for x in batch]

            optimizer.zero_grad()
            outputs = model(candidates, user_histories)  # Pass inputs to model

            # Compute loss and update weights
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            total_train_loss += loss.item()

        print(f"Epoch {epoch + 1}, Loss: {total_train_loss:.4f}")

In [20]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = NRMS(article_embedding_size, EMBEDDING_DIM, NPRATIO + 1).to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()

# Train the model
train_model(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    optimizer=optimizer,
    criterion=criterion,
    epochs=5
)

Epoch 1, Loss: 18355.2704
Epoch 2, Loss: 18259.3426
Epoch 3, Loss: 18194.9731
Epoch 4, Loss: 18232.3040
Epoch 5, Loss: 18192.3330


In [21]:
model_path = '/content/drive/MyDrive/DTU/Kandidat/Semester 9/Deep learning/Final project/nrms_model.pth'
torch.save(model.state_dict(), model_path)
