In [33]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import json

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load data
train_data = pd.read_parquet('Data/train_data.parquet')
test_data = pd.read_parquet('Data/val_data.parquet')
articles = pd.read_parquet('Data/articles.parquet')

print(f"Total train data rows before filtering: {len(train_data)}")

# Set of valid article IDs
valid_article_ids = set(articles['article_id'])

# Filter rows with valid candidate and user history IDs
train_data = train_data[
    train_data['candidate'].apply(lambda x: all(cid in valid_article_ids for cid in x))
]
train_data = train_data[
    train_data['user_his'].apply(lambda x: all(hid in valid_article_ids for hid in x))
]

print(f"Total train data rows after filtering: {len(train_data)}")

# Load vocabulary
with open('Data/vocab.json', 'r') as f:
    vocab = json.load(f)

# Parameters
PAD_IDX = vocab["<PAD>"]
UNK_IDX = vocab["<UNK>"]
MAX_HISTORY = 50
MAX_TITLE_LEN = 30  # Assume a max length for tokenized titles
EMBEDDING_DIM = 128

Total train data rows before filtering: 24888
Total train data rows after filtering: 0


In [10]:
def tokenize_title(title, vocab, max_len=MAX_TITLE_LEN):
    tokens = [vocab.get(word, UNK_IDX) for word in title.split()]
    if len(tokens) < max_len:
        tokens += [PAD_IDX] * (max_len - len(tokens))
    else:
        tokens = tokens[:max_len]
    return tokens

In [11]:
articles['tokenized_title'] = articles['title'].apply(lambda x: tokenize_title(x, vocab))
article_dict = {row['article_id']: row['tokenized_title'] for _, row in articles.iterrows()}

In [27]:
article_dict

{3037230: [1,
  1,
  4,
  3,
  5,
  6,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 3044020: [1,
  1,
  9,
  10,
  11,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 3057622: [1,
  13,
  14,
  15,
  16,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 3073151: [1,
  18,
  19,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 3193383: [1,
  21,
  22,
  23,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 3196611: [1,
  25,
  26,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 3200325: [1,
  28,
  14,
 

In [14]:
print(train_data.columns)
print(test_data.columns)

Index(['candidate', 'label', 'user_his'], dtype='object')
Index(['candidate', 'label', 'user_his'], dtype='object')


In [22]:
class NewsDataset(Dataset):
    def __init__(self, data, article_dict):
        self.candidates = data['candidate'].tolist()
        self.labels = data['label'].tolist()
        self.user_histories = data['user_his'].tolist()
        self.article_dict = article_dict

    def __getitem__(self, index):
        candidate_ids = self.candidates[index]
        print(f"Candidate IDs: {candidate_ids}")  # Debug candidate IDs
        print(f"Available keys in article_dict: {list(self.article_dict.keys())[:5]}...")  # Print first 5 keys

        candidate_titles = [self.article_dict[cid] for cid in candidate_ids]
        label = self.labels[index]
        user_history_ids = self.user_histories[index]
        user_history_titles = [self.article_dict.get(hid, [PAD_IDX] * MAX_TITLE_LEN) for hid in user_history_ids]
        
        return torch.tensor(candidate_titles, dtype=torch.long), \
               torch.tensor(label, dtype=torch.float), \
               torch.tensor(user_history_titles, dtype=torch.long)

    def __len__(self):
        return len(self.candidates)

train_dataset = NewsDataset(train_data, article_dict)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

In [16]:
class MultiHeadSelfAttention(nn.Module):
    def __init__(self, embedding_dim, num_heads=4):
        super().__init__()
        self.attention = nn.MultiheadAttention(embed_dim=embedding_dim, num_heads=num_heads, batch_first=True)

    def forward(self, x):
        attn_output, _ = self.attention(x, x, x)
        return attn_output

In [17]:
class AdditiveAttention(nn.Module):
    def __init__(self, embedding_dim):
        super().__init__()
        self.linear = nn.Linear(embedding_dim, 1)

    def forward(self, x):
        scores = self.linear(x).squeeze(-1)
        weights = torch.softmax(scores, dim=1).unsqueeze(-1)
        return torch.sum(x * weights, dim=1)

In [18]:
class NewsEncoder(nn.Module):
    def __init__(self, vocab_size, embedding_dim, num_heads=4):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=PAD_IDX)
        self.multihead_attn = MultiHeadSelfAttention(embedding_dim, num_heads)
        self.additive_attn = AdditiveAttention(embedding_dim)

    def forward(self, x):
        embedded = self.embedding(x)  # Shape: (batch_size, seq_len, embedding_dim)
        attn_output = self.multihead_attn(embedded)
        news_repr = self.additive_attn(attn_output)
        return news_repr

In [19]:
class UserEncoder(nn.Module):
    def __init__(self, news_encoder):
        super().__init__()
        self.news_encoder = news_encoder
        self.multihead_attn = MultiHeadSelfAttention(EMBEDDING_DIM)
        self.additive_attn = AdditiveAttention(EMBEDDING_DIM)

    def forward(self, user_hist):
        news_reprs = torch.stack([self.news_encoder(hist) for hist in user_hist], dim=1)
        attn_output = self.multihead_attn(news_reprs)
        user_repr = self.additive_attn(attn_output)
        return user_repr

In [20]:
class NRMS(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.news_encoder = NewsEncoder(vocab_size, EMBEDDING_DIM)
        self.user_encoder = UserEncoder(self.news_encoder)

    def forward(self, candidates, user_history):
        user_repr = self.user_encoder(user_history)
        candidate_reprs = torch.stack([self.news_encoder(c) for c in candidates], dim=1)
        scores = torch.matmul(candidate_reprs, user_repr.unsqueeze(-1)).squeeze(-1)
        return scores

In [30]:
model = NRMS(vocab_size=len(vocab)).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

for epoch in range(5):
    model.train()
    total_loss = 0
    for candidates, labels, user_histories in train_loader:
        candidates, labels, user_histories = candidates.to(device), labels.to(device), user_histories.to(device)
        
        scores = model(candidates, user_histories)
        loss = criterion(scores, torch.argmax(labels, dim=1))
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {total_loss/len(train_loader)}")

Candidate IDs: [10013  9245  9709  4967 10000]
Available keys in article_dict: [3037230, 3044020, 3057622, 3073151, 3193383]...


KeyError: 10013