In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
import pandas as pd
import numpy as np
from collections import defaultdict

# Constants
# Notes describe how each value is used in the visible cells of this notebook.
MAX_SENT_LENGTH = 30  # Tokens kept per title (truncate/pad length); also the padded candidate length
MAX_SENTS = 50  # History entries kept per user (truncate/pad length)
EMBEDDING_DIM = 300  # Embedding width; also the input/output width of the self-attention layer
NUM_HEADS = 20  # Number of self-attention heads
HEAD_SIZE = 20  # Width of each head; NUM_HEADS * HEAD_SIZE is the attention's internal width
DROPOUT_RATE = 0.2  # Dropout probability applied to the embedded tokens
NPRATIO = 4  # Number of negative samples per positive; here only passed to NRMS as NPRATIO + 1
BATCH_SIZE = 64  # Batch size of both DataLoaders
EPOCHS = 5  # Default number of epochs for train_model

In [10]:
# Load Datasets
# Paths are relative to the notebook's working directory. No parse_dates is
# requested, so `impression_time` is converted later by preprocess_behaviors.
# Columns read further down: articles["title"]; behaviors["user_id",
# "article_ids_clicked", "article_ids_inview", "impression_time",
# "scroll_percentage"]; history["user_id", "article_id_fixed"].
articles = pd.read_csv("Data/articles.csv")
behaviors_train = pd.read_csv("Data/behaviors_train.csv")
behaviors_val = pd.read_csv("Data/behaviors_val.csv")
history_train = pd.read_csv("Data/history_train.csv")
history_val = pd.read_csv("Data/history_val.csv")

In [16]:
# Debug inspection of the timestamp column.
# NOTE(review): this cell's execution count (16) is later than the preprocessing
# cell (In [11]); the datetime64 dtype in the recorded output appears to come from
# that later conversion. On a fresh top-to-bottom run this cell executes before the
# conversion -- confirm, then move it below the preprocessing cell or remove it.
print(behaviors_train['impression_time'])

0       2023-05-21 21:06:50
1       2023-05-24 07:31:26
2       2023-05-24 07:30:33
3       2023-05-23 05:25:40
4       2023-05-23 05:31:54
                ...        
24719   2023-05-22 08:30:52
24720   2023-05-22 08:31:34
24721   2023-05-22 08:51:33
24722   2023-05-22 08:53:36
24723   2023-05-18 10:56:49
Name: impression_time, Length: 24724, dtype: datetime64[ns]


In [3]:
# Step 1: Tokenize article titles and build vocabulary
def build_vocab_and_tokenize(titles, max_len=MAX_SENT_LENGTH):
    """Tokenize titles on whitespace and build a token -> id vocabulary.

    Each title is split on whitespace, truncated to ``max_len`` tokens,
    lowercased, mapped to integer ids and right-padded with the ``<PAD>`` id.

    Args:
        titles: Iterable of title strings. Non-string entries (e.g. None/NaN)
            are encoded as a single ``<UNK>`` token.
        max_len: Fixed length of every returned id sequence.

    Returns:
        A tuple ``(tokenized_titles, vocab, vocab_size)`` where
        ``tokenized_titles`` is a list of ``max_len``-long id lists, ``vocab``
        maps token -> id (``<PAD>`` = 0, ``<UNK>`` = 1) and ``vocab_size`` is
        the number of entries in ``vocab`` once all titles have been seen.

    Note:
        ``vocab`` is returned as a ``defaultdict``: looking up an unseen token
        afterwards adds a new id that is >= ``vocab_size``.
    """
    vocab = defaultdict(lambda: len(vocab))  # Unseen tokens receive the next free id
    vocab["<PAD>"] = 0  # Padding token
    vocab["<UNK>"] = 1  # Unknown token
    reserved_tokens = ("<PAD>", "<UNK>")

    tokenized_titles = []
    for title in titles:
        # Missing / non-string titles are represented by a single <UNK>.
        words = title.split()[:max_len] if isinstance(title, str) else ["<UNK>"][:max_len]
        # Reserved tokens are matched verbatim so a literal "<UNK>" placeholder
        # maps to the <UNK> id instead of becoming a new lowercase entry.
        token_ids = [vocab[word if word in reserved_tokens else word.lower()] for word in words]
        padded = token_ids + [vocab["<PAD>"]] * (max_len - len(token_ids))  # Padding to max_len
        tokenized_titles.append(padded)

    # The size must be measured after every title has been consumed; measuring
    # it earlier (and clamping ids to it) would collapse every word onto <UNK>.
    vocab_size = len(vocab)
    return tokenized_titles, vocab, vocab_size

# Tokenize every article title; missing titles are replaced by the literal
# string "<UNK>" before tokenizing. VOCAB_SIZE is reused below to clamp ids in
# create_samples and to size the model's embedding table.
articles["tokenized_title"], vocab, VOCAB_SIZE = build_vocab_and_tokenize(articles["title"].fillna("<UNK>"))

In [11]:
# Define preprocessing function
def preprocess_behaviors(dataframe):
    """Derive time-of-impression and scroll features from a behaviors frame.

    The input frame is left untouched; a copy with the extra columns is
    returned, so re-running the calling cell always starts from the same data.

    Args:
        dataframe: Frame with an ``impression_time`` column (anything
            ``pd.to_datetime`` accepts) and a ``scroll_percentage`` column.

    Returns:
        A new DataFrame with ``impression_time`` converted to datetime plus:
            day_of_week: 0 = Monday, ..., 6 = Sunday.
            time_of_day: "morning" [6, 12), "afternoon" [12, 18),
                "evening" [18, 24), otherwise "night".
            time_of_day_bin: integer code of ``time_of_day``
                (morning 0, afternoon 1, evening 2, night 3).
            normalized_scroll_percentage: ``scroll_percentage`` / 100.
    """
    result = dataframe.copy()

    # Ensure impression_time is in datetime format
    result["impression_time"] = pd.to_datetime(result["impression_time"])

    # Extract day of the week (0 = Monday, ..., 6 = Sunday)
    result["day_of_week"] = result["impression_time"].dt.dayofweek

    # Vectorised time-of-day binning. Hours outside the three ranges below
    # (0-5, and NaN from missing timestamps) fall through to "night".
    hours = result["impression_time"].dt.hour.to_numpy()
    result["time_of_day"] = np.select(
        [
            (hours >= 6) & (hours < 12),
            (hours >= 12) & (hours < 18),
            (hours >= 18) & (hours < 24),
        ],
        ["morning", "afternoon", "evening"],
        default="night",
    )

    # Map time of day bins to integers for embedding
    time_of_day_mapping = {"morning": 0, "afternoon": 1, "evening": 2, "night": 3}
    result["time_of_day_bin"] = result["time_of_day"].map(time_of_day_mapping)

    # Normalize scroll percentage to range [0, 1]; missing values stay NaN
    result["normalized_scroll_percentage"] = result["scroll_percentage"] / 100.0

    return result

# Add the derived time/scroll columns to the training impressions. The name is
# rebound to the function's return value; behaviors_val is not passed through
# preprocess_behaviors in the visible cells.
behaviors_train = preprocess_behaviors(behaviors_train)

behaviors_train.head()

Unnamed: 0.1,Unnamed: 0,impression_id,article_id,impression_time,read_time,scroll_percentage,device_type,article_ids_inview,article_ids_clicked,user_id,...,postcode,age,is_subscriber,session_id,next_read_time,next_scroll_percentage,day_of_week,time_of_day,time_of_day_bin,normalized_scroll_percentage
0,0,48401,,2023-05-21 21:06:50,21.0,,2,[9774516 9771051 9770028 9775402 9774461 97595...,[9759966],22779,...,,,False,21,16.0,27.0,6,evening,2,
1,1,152513,9778745.0,2023-05-24 07:31:26,30.0,100.0,1,[9778669 9778736 9778623 9089120 9778661 97774...,[9778661],150224,...,,,False,298,2.0,48.0,2,morning,0,1.0
2,2,155390,,2023-05-24 07:30:33,45.0,,1,[9778369 9777856 9778500 9778021 9778627 97783...,[9777856],160892,...,,,False,401,215.0,100.0,2,morning,0,
3,3,214679,,2023-05-23 05:25:40,33.0,,2,[9776715 9776406 9776566 9776071 9776808 97762...,[9776566],1001055,...,,,False,1357,40.0,47.0,1,night,3,
4,4,214681,,2023-05-23 05:31:54,21.0,,2,[9775202 9776855 9776688 9771995 9776583 97765...,[9776553],1001055,...,,,False,1358,5.0,49.0,1,night,3,


In [4]:
# Step 2: Clean and process user history
def clean_article_ids(article_ids):
    """Parse a bracketed, whitespace-separated id string into a list of ints.

    Args:
        article_ids: Text such as ``"[9774516 9771051]"``.

    Returns:
        The ids as a list of ints, or ``None`` when the value cannot be
        trusted: it is not a string (e.g. NaN/None from a missing cell), it
        contains ``"..."`` (an elided, incomplete list), or a token is not an
        integer. ``None`` marks the row for removal by the caller.
    """
    # Missing cells arrive as float NaN / None; treat them like any other
    # unparseable value instead of raising on the substring test below.
    if not isinstance(article_ids, str):
        return None
    if "..." in article_ids:
        return None  # Mark for removal
    try:
        return [int(token) for token in article_ids.strip("[]").split()]
    except ValueError:
        return None

# Parse the raw id strings of both splits; rows clean_article_ids could not
# parse hold None in the new column.
history_train["cleaned_article_ids"] = history_train["article_id_fixed"].apply(clean_article_ids)
history_val["cleaned_article_ids"] = history_val["article_id_fixed"].apply(clean_article_ids)

# Drop the unparseable rows and renumber the index from 0.
history_train_cleaned = history_train.dropna(subset=["cleaned_article_ids"]).reset_index(drop=True)
history_val_cleaned = history_val.dropna(subset=["cleaned_article_ids"]).reset_index(drop=True)

def process_cleaned_user_history(cleaned_history_df):
    """Collect the parsed article ids of every user into one list per user.

    Args:
        cleaned_history_df: Frame with a ``user_id`` column and a
            ``cleaned_article_ids`` column holding lists of ids.

    Returns:
        A ``defaultdict(list)`` mapping user id -> all of that user's ids,
        concatenated in row order. Unknown users yield an empty list.
    """
    histories_by_user = defaultdict(list)
    user_column = cleaned_history_df["user_id"]
    ids_column = cleaned_history_df["cleaned_article_ids"]
    # Walk the two columns in lockstep; rows of the same user are appended
    # one after another.
    for uid, ids in zip(user_column, ids_column):
        histories_by_user[uid].extend(ids)
    return histories_by_user

# Per-user id lists for each split (defaultdicts keyed by user_id).
user_history_train_cleaned = process_cleaned_user_history(history_train_cleaned)
user_history_val_cleaned = process_cleaned_user_history(history_val_cleaned)

In [5]:
# Step 3: Create samples and labels
def create_samples(behaviors_df, user_history, max_sents=MAX_SENTS):
    """Expand impressions into (user history, candidate) pairs with click labels.

    Every article shown in an impression becomes one sample; its label is 1 if
    the article is among the impression's clicked articles, else 0.

    Args:
        behaviors_df: Frame with ``user_id``, ``article_ids_clicked`` and
            ``article_ids_inview``; the latter two are bracketed,
            whitespace-separated id strings.
        user_history: Mapping user id -> list of ids. Users without an entry
            get an all-zero history (nothing is inserted into the mapping).
        max_sents: Length every history is truncated / zero-padded to.

    Returns:
        ``(samples, labels)`` where ``samples[i]`` is
        ``(history_ids, candidate_id)`` and ``labels[i]`` is 0 or 1.

    Raises:
        ValueError: If an id string contains a token that is not an integer.
    """
    samples = []
    labels = []
    # Ids are clamped to this bound so they can index an embedding table with
    # VOCAB_SIZE rows.
    # NOTE(review): the clamped values are article ids, not vocabulary ids --
    # this looks like a stand-in for a real article -> title-token lookup;
    # confirm the intended encoding.
    max_index = VOCAB_SIZE - 1

    for _, row in behaviors_df.iterrows():
        clicked_articles = set(map(int, row["article_ids_clicked"].strip("[]").split()))
        inview_articles = list(map(int, row["article_ids_inview"].strip("[]").split()))

        history = user_history.get(row["user_id"], [])[:max_sents]
        user_hist = [min(idx, max_index) for idx in history]  # Ensure valid indices
        user_hist += [0] * (max_sents - len(user_hist))

        for article_id in inview_articles:
            # The label must be decided on the real article id: after clamping,
            # the id no longer matches any entry of clicked_articles.
            label = 1 if article_id in clicked_articles else 0
            samples.append((user_hist, min(article_id, max_index)))
            labels.append(label)

    return samples, labels

# Flatten every impression of each split into (history, candidate) samples
# with 0/1 click labels.
train_samples_cleaned, train_labels_cleaned = create_samples(behaviors_train, user_history_train_cleaned)
val_samples_cleaned, val_labels_cleaned = create_samples(behaviors_val, user_history_val_cleaned)

In [6]:

# Step 4: Define PyTorch Dataset
class NewsDataset(Dataset):
    """Dataset of (user history, candidate, label) triples as long tensors.

    Args:
        samples: Sequence of ``(history_ids, candidate_id)`` pairs.
        labels: Sequence of integer labels, aligned with ``samples``.
    """

    def __init__(self, samples, labels):
        self.samples, self.labels = samples, labels

    def __len__(self):
        # The label list defines the dataset size.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return ``(history, candidate, label)`` tensors for sample ``idx``.

        The history is cut / zero-padded to MAX_SENTS entries; the candidate id
        is placed first in a zero-filled row of MAX_SENT_LENGTH entries.
        """
        history_ids, candidate_id = self.samples[idx]

        clipped = list(history_ids[:MAX_SENTS])
        history_row = clipped + [0] * (MAX_SENTS - len(clipped))

        candidate_row = [candidate_id]
        candidate_row.extend([0] * (MAX_SENT_LENGTH - 1))

        history_tensor = torch.tensor(history_row, dtype=torch.long)
        candidate_tensor = torch.tensor(candidate_row, dtype=torch.long)
        label_tensor = torch.tensor(self.labels[idx], dtype=torch.long)
        return history_tensor, candidate_tensor, label_tensor

# Wrap the sample/label lists of each split in a Dataset.
train_dataset = NewsDataset(train_samples_cleaned, train_labels_cleaned)
val_dataset = NewsDataset(val_samples_cleaned, val_labels_cleaned)

# Training batches are reshuffled each epoch; validation keeps a fixed order.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [7]:
# Step 5: Define the NRMS Model
class MultiHeadSelfAttention(nn.Module):
    """Scaled dot-product self-attention over a sequence, with several heads.

    Takes and returns tensors of shape (batch, seq, EMBEDDING_DIM); internally
    the sequence is projected to ``num_heads * head_size`` features.

    Args:
        num_heads: Number of attention heads.
        head_size: Feature width of each head.
    """

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.num_heads = num_heads
        self.head_size = head_size
        self.output_dim = num_heads * head_size
        # One joint projection produces queries, keys and values for all heads.
        self.qkv_linear = nn.Linear(EMBEDDING_DIM, 3 * self.output_dim)
        # Projects the concatenated heads back to the embedding width.
        self.fc_out = nn.Linear(self.output_dim, EMBEDDING_DIM)

    def forward(self, x):
        """Attend every position of ``x`` to every other position.

        Raises:
            ValueError: If ``x`` is not a 3D tensor.
        """
        if x.dim() != 3:
            raise ValueError(f"Expected input to be 3D (batch_size, seq_length, embed_dim), got {x.size()}")
        n_batch, n_tokens, _ = x.shape

        projected = self.qkv_linear(x)
        # (batch, seq, heads, 3 * head_size) -> (heads, batch, seq, 3 * head_size)
        per_head = projected.reshape(n_batch, n_tokens, self.num_heads, 3 * self.head_size).permute(2, 0, 1, 3)
        # Each head's features split into equal query / key / value thirds.
        queries, keys, values = per_head.chunk(3, dim=-1)

        logits = (queries @ keys.transpose(-2, -1)) / np.sqrt(self.head_size)
        weights = F.softmax(logits, dim=-1)
        mixed = weights @ values  # (heads, batch, seq, head_size)

        # Bring heads next to their features and flatten them per position.
        merged = mixed.permute(1, 2, 0, 3).reshape(n_batch, n_tokens, self.output_dim)
        return self.fc_out(merged)

class TitleEncoder(nn.Module):
    """Encode a sequence of ids into one vector per batch row.

    Pipeline: embedding -> dropout -> multi-head self-attention -> learned
    softmax pooling over the sequence dimension.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Width of the embeddings and of the pooled output.
    """

    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.dropout = nn.Dropout(DROPOUT_RATE)
        self.self_attention = MultiHeadSelfAttention(NUM_HEADS, HEAD_SIZE)
        # Scores each position with a single scalar used for pooling.
        self.dense = nn.Linear(embedding_dim, 1)

    def forward(self, x):
        """Return the pooled representation, shape (batch, embedding_dim).

        Raises:
            ValueError: If the embedded input is not 3D, i.e. ``x`` was not a
                2D batch of id sequences.
        """
        x = self.embedding(x)  # ids -> vectors
        if x.dim() != 3:
            raise ValueError(f"Embedding layer output should be 3D, got {x.size()}")

        contextual = self.self_attention(self.dropout(x))

        # One weight per position, normalised across the sequence.
        pooling_logits = self.dense(contextual).squeeze(-1)
        pooling_weights = F.softmax(pooling_logits, dim=-1)
        return (contextual * pooling_weights.unsqueeze(-1)).sum(dim=1)

class NRMS(nn.Module):
    """Score (user history, candidate) pairs with a shared sequence encoder.

    Both inputs are encoded by the same ``TitleEncoder``; the click score of a
    pair is the dot product of its two encodings.

    Args:
        vocab_size: Number of rows in the encoder's embedding table.
        embedding_dim: Width of the encoder's embeddings and output.
        num_classes: Accepted for interface compatibility; not used.
    """

    def __init__(self, vocab_size, embedding_dim, num_classes):
        super().__init__()
        self.title_encoder = TitleEncoder(vocab_size, embedding_dim)

    def forward(self, candidates, user_history):
        """Return two-class logits ``[no click, click]`` for every pair.

        Args:
            candidates: Long tensor (batch, candidate_len) of ids.
            user_history: Long tensor (batch, history_len) of ids.

        Returns:
            Float tensor of shape (batch, 2) holding ``[0, score]`` per row,
            where ``score`` is the dot product of row i's candidate and user
            encodings. Softmax over the two entries gives
            ``P(click) = sigmoid(score)``, so cross-entropy against 0/1 class
            labels equals binary cross-entropy on ``score``.
        """
        user_rep = self.title_encoder(user_history)  # (batch, embedding_dim)
        candidate_rep = self.title_encoder(candidates)  # (batch, embedding_dim)

        # Row-wise dot product. A matmul of (batch, dim) with (batch, dim, 1)
        # would broadcast to an all-pairs (batch, batch) matrix, mixing every
        # candidate with every user in the batch.
        score = (candidate_rep * user_rep).sum(dim=-1)  # (batch,)

        # A fixed zero logit for "no click" keeps the output usable as class
        # logits for any batch size, including a batch of one.
        return torch.stack((torch.zeros_like(score), score), dim=-1)

In [8]:
# Step 6: Train the Model
# NPRATIO + 1 is passed as NRMS's `num_classes`; NRMS.__init__ accepts that
# argument but does not use it.
model = NRMS(VOCAB_SIZE, EMBEDDING_DIM, NPRATIO + 1)
optimizer = optim.Adam(model.parameters(), lr=1e-3)
# The long 0/1 labels produced by NewsDataset are consumed as class indices.
criterion = nn.CrossEntropyLoss()

def train_model(model, train_loader, val_loader, optimizer, criterion, epochs=EPOCHS):
    """Train ``model`` and report the loss after every epoch.

    Args:
        model: Module called as ``model(candidates, user_histories)``.
        train_loader: Iterable of ``(user_histories, candidates, labels)``
            batches used for optimisation.
        val_loader: Iterable of batches in the same layout, evaluated without
            gradient tracking after each epoch. Pass ``None`` to skip
            validation.
        optimizer: Optimizer over ``model``'s parameters.
        criterion: Loss called as ``criterion(outputs, labels)``.
        epochs: Number of passes over ``train_loader``.

    Prints one line per epoch. Both reported losses are sums of the per-batch
    loss values over the respective loader.
    """
    for epoch in range(epochs):
        model.train()
        total_loss = 0.0
        for user_histories, candidates, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(candidates, user_histories)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        message = f"Epoch {epoch + 1}, Loss: {total_loss:.4f}"

        if val_loader is not None:
            # eval() switches dropout off; no_grad() skips building the graph.
            model.eval()
            val_loss = 0.0
            with torch.no_grad():
                for user_histories, candidates, labels in val_loader:
                    outputs = model(candidates, user_histories)
                    val_loss += criterion(outputs, labels).item()
            message += f", Val Loss: {val_loss:.4f}"

        print(message)

In [9]:
# Train the model for the default number of epochs (EPOCHS).
# The output recorded under this cell is a KeyboardInterrupt, i.e. the saved
# run was stopped by hand before it finished.
train_model(model, train_loader, val_loader, optimizer, criterion)

KeyboardInterrupt: 