In [58]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# Load datasets
behaviors_train = pd.read_parquet('Data/behaviors_train.parquet')
behaviors_val = pd.read_parquet('Data/behaviors_val.parquet')
history_train = pd.read_parquet('Data/history_train.parquet')
history_val = pd.read_parquet('Data/history_val.parquet')
articles = pd.read_parquet('Data/articles.parquet')

# Filter valid articles
valid_articles = set(articles['article_id'])
def filter_invalid_articles(df, col):
    df[col] = df[col].apply(lambda x: [a for a in x if a in valid_articles])

filter_invalid_articles(behaviors_train, 'article_ids_clicked')
filter_invalid_articles(behaviors_train, 'article_ids_inview')
filter_invalid_articles(behaviors_val, 'article_ids_clicked')
filter_invalid_articles(behaviors_val, 'article_ids_inview')
filter_invalid_articles(history_train, 'article_id_fixed')
filter_invalid_articles(history_val, 'article_id_fixed')

# Generate candidate and label columns
def generate_candidates_and_labels(behaviors, history):
    data = []
    for _, row in tqdm(behaviors.iterrows(), total=len(behaviors)):
        user_id = row['user_id']
        clicked_articles = row['article_ids_clicked']
        inview_articles = row['article_ids_inview']
        
        user_history = history[history['user_id'] == user_id]['article_id_fixed'].values
        user_history = user_history[0] if len(user_history) > 0 else []
        
        # Prepare user_his (pad with zeros)
        user_his = [a for a in user_history if a not in clicked_articles]
        user_his = user_his[:50] + [0] * max(0, 50 - len(user_his))
        
        for clicked_article in clicked_articles:
            negative_samples = [a for a in inview_articles if a != clicked_article]
            if len(negative_samples) < 4:
                continue
            negatives = np.random.choice(negative_samples, 4, replace=False).tolist()
            candidate = [clicked_article] + negatives
            label = [1] + [0] * 4
            data.append({'candidate': candidate, 'label': label, 'user_his': user_his})
    return pd.DataFrame(data)

train_data = generate_candidates_and_labels(behaviors_train, history_train)
val_data = generate_candidates_and_labels(behaviors_val, history_val)

# Save processed data
train_data.to_parquet('Data/train_data.parquet')
val_data.to_parquet('Data/val_data.parquet')

100%|██████████| 24724/24724 [00:03<00:00, 8136.49it/s]
100%|██████████| 25356/25356 [00:03<00:00, 8154.28it/s]


In [66]:
train_data['label'][4]

[1, 0, 0, 0, 0]

In [79]:
import torch
import torch.nn as nn
import numpy as np
from tqdm import tqdm

# Load GloVe embeddings
def load_glove_embeddings(file_path, embedding_dim):
    embeddings = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], dtype='float32')
            embeddings[word] = vector
    return embeddings

glove_path = 'Data/glove.6B.300d.txt'
embedding_dim = 300
glove_embeddings = load_glove_embeddings(glove_path, embedding_dim)

# Tokenize and embed titles
def embed_titles(articles, glove_embeddings, embedding_dim):
    def embed_title(title):
        tokens = title.lower().split()
        vectors = [glove_embeddings.get(token, np.zeros(embedding_dim)) for token in tokens]
        return np.mean(vectors, axis=0) if vectors else np.zeros(embedding_dim)

    articles['embedding'] = articles['title'].apply(embed_title)
    return articles

articles = embed_titles(articles, glove_embeddings, embedding_dim)

# Convert embeddings to a tensor
article_embeddings = {
    article_id: torch.tensor(embedding, dtype=torch.float32)
    for article_id, embedding in zip(articles['article_id'], articles['embedding'])
}

In [87]:
class NewsEncoder(nn.Module):
    def __init__(self, embedding_dim, attention_heads):
        super(NewsEncoder, self).__init__()
        self.multihead_attention = nn.MultiheadAttention(embed_dim=embedding_dim, num_heads=attention_heads, batch_first=True)
        self.additive_attention = nn.Linear(embedding_dim, 1)
    
    def forward(self, embeddings):
        attn_output, _ = self.multihead_attention(embeddings, embeddings, embeddings)  # Shape: (batch_size, seq_len, embedding_dim)
        scores = self.additive_attention(attn_output).squeeze(-1)  # Shape: (batch_size, seq_len)

        # Ensure softmax is applied on the correct dimension
        if len(scores.shape) == 1:  # This happens if seq_len is 1
            scores = scores.unsqueeze(1)  # Restore shape to (batch_size, seq_len)

        weights = torch.softmax(scores, dim=1)  # Shape: (batch_size, seq_len)
        representation = torch.sum(weights.unsqueeze(-1) * attn_output, dim=1)  # Shape: (batch_size, embedding_dim)
        return representation

In [88]:
class UserEncoder(nn.Module):
    def __init__(self, news_encoder, embedding_dim, attention_heads):
        super(UserEncoder, self).__init__()
        self.news_encoder = news_encoder
        self.multihead_attention = nn.MultiheadAttention(embed_dim=embedding_dim, num_heads=attention_heads, batch_first=True)
        self.additive_attention = nn.Linear(embedding_dim, 1)
    
    def forward(self, user_history):
        news_embeddings = torch.stack([self.news_encoder(h.unsqueeze(0)) for h in user_history], dim=1)
        attn_output, _ = self.multihead_attention(news_embeddings, news_embeddings, news_embeddings)
        scores = self.additive_attention(attn_output).squeeze(-1)
        weights = torch.softmax(scores, dim=1)
        user_representation = torch.sum(weights.unsqueeze(-1) * attn_output, dim=1)
        return user_representation

In [89]:
class NewsRecommendationModel(nn.Module):
    def __init__(self, news_encoder, user_encoder):
        super(NewsRecommendationModel, self).__init__()
        self.news_encoder = news_encoder
        self.user_encoder = user_encoder

    def forward(self, candidate_articles, user_history):
        # Encode candidate articles
        batch_size, num_candidates, embedding_dim = candidate_articles.size()
        candidate_representations = torch.stack(
            [self.news_encoder(candidate_articles[:, i, :]) for i in range(num_candidates)],
            dim=1
        )  # Shape: (batch_size, num_candidates, embedding_dim)

        # Encode user history
        user_representation = self.user_encoder(user_history)  # Shape: (batch_size, embedding_dim)

        # Compute scores (dot product)
        scores = torch.matmul(candidate_representations, user_representation.unsqueeze(-1)).squeeze(-1)
        # Correct shape: (batch_size, num_candidates)
        return scores

In [90]:
# Training setup
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

embedding_dim = 300
attention_heads = 4
news_encoder = NewsEncoder(embedding_dim, attention_heads).to(device)
user_encoder = UserEncoder(news_encoder, embedding_dim, attention_heads).to(device)
model = NewsRecommendationModel(news_encoder, user_encoder).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

def create_dataloader(data, article_embeddings, batch_size=32):
    def collate_fn(batch):
        # Embedding for zero (used as padding)
        zero_embedding = torch.zeros(embedding_dim, dtype=torch.float32).to(device)
        
        # Process candidates and user history
        candidates = torch.stack([
            torch.stack([article_embeddings.get(c, zero_embedding) for c in b['candidate']]) for b in batch
        ]).to(device)
        
        user_his = torch.stack([
            torch.stack([article_embeddings.get(h, zero_embedding) for h in b['user_his']]) for b in batch
        ]).to(device)
        
        # Extract labels
        labels = torch.tensor([b['label'].index(1) for b in batch], dtype=torch.long).to(device)
        
        return candidates, user_his, labels

    dataset = data.to_dict('records')
    return torch.utils.data.DataLoader(dataset, batch_size=batch_size, collate_fn=collate_fn, shuffle=True)

train_loader = create_dataloader(train_data, article_embeddings)
val_loader = create_dataloader(val_data, article_embeddings)

In [91]:
# Training loop
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for candidates, user_his, labels in train_loader:
        optimizer.zero_grad()
        predictions = model(candidates, user_his)  # Shape: (batch_size, num_candidates)
        loss = criterion(predictions, labels)     # Labels: (batch_size)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {total_loss / len(train_loader)}")

    # Validation
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for candidates, user_his, labels in val_loader:
            predictions = model(candidates, user_his)
            correct += (predictions.argmax(dim=1) == labels).sum().item()
            total += labels.size(0)
    print(f"Validation Accuracy: {correct / total}")

Epoch 1, Loss: 1.090467449475482
Validation Accuracy: 0.4278376788864928
Epoch 2, Loss: 2.679478529688445
Validation Accuracy: 0.4629680454812782
Epoch 3, Loss: 0.4903111373662521
Validation Accuracy: 0.4127818074887277
Epoch 4, Loss: 0.4155974502220086
Validation Accuracy: 0.5144089394236424
Epoch 5, Loss: 0.43726674818795785
Validation Accuracy: 0.5282101548715938
Epoch 6, Loss: 0.3809046124222338
Validation Accuracy: 0.4943736522250539
Epoch 7, Loss: 116.83260602514457
Validation Accuracy: 0.4403842383846305
Epoch 8, Loss: 3.4603307631606604
Validation Accuracy: 0.4504214859831406
Epoch 9, Loss: 2.6540008100631134
Validation Accuracy: 0.447912174083513
Epoch 10, Loss: 1.2056929820358488
Validation Accuracy: 0.5018623799255048
