# NRMS Model

NRMS stands for `Neural News Reccomendaiton with Multi-head Self-Attention`.  The reference to the paper is provided below. 

---

## Understand the MIND dataset
The MIND dataset consists of several key files:

- news.tsv: Contains news articles and their metadata (news ID, category, subcategory, title, abstract, etc.).
- behaviors.tsv: Contains user interaction data, including the history of news articles clicked and the impressions list (clicked or not clicked).

The NRMS model uses this data to learn user preferences based on click history.

---

## Getting setup
Create the virtural environment
```bash

python -m venv nrms

```

Edit your .bashrc and add an alias:

```.bash

alias nrms='source ~/nrms/bin/activate'

```

Source the .bashrc file and activate the nrms enviroment'

```.bash

source .bashrc

nrms


```

Install the Python Modules Needed

Note, you should match the cuda version of torch to what you have installed.
```.bash

pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121

pip install transformers pandas numpy nltk scikit-learn tqdm gensim matplotlib

pip install jupyterlab

            
```

---

## Do the imports and ensure it works



In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from tqdm.notebook import tqdm
from transformers import BertTokenizer, BertModel

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

In [None]:
# Step 1: Define Parameters
# We start by defining some important parameters for our model such as embedding dimensions, number of attention heads, batch size, etc.
embedding_dim = 768  # Using BERT embedding dimensions
attention_heads = 8  # Number of attention heads in multi-head attention
batch_size = 64  # Number of samples in each batch
num_epochs = 10  # Number of epochs to train the model
learning_rate = 0.001  # Learning rate for the optimizer

In [None]:
# Step 2: Load MIND Dataset
# Now, we will load the MIND dataset, which contains user behaviors and news articles.
# We have the datasets already downloaded in ~/datasets/MINDlarge and ~/datasets/MINDsmall.
mind_large = '~/datasets/MINDlarge'
mind_large_train = mind_large + '/train/'
mind_large_dev = mind_large + '/dev/'  # Development -- help tune hyper-parameter 
mind_large_test = mind_large + '/test/'

mind_small = '~/datasets/MINDsmall'
mind_small_train = mind_small + '/train/'
mind_small_dev = mind_small + '/dev/'

dataset_path = mind_small_train

# Load training data
df_behaviors = pd.read_csv(f"{dataset_path}behaviors.tsv", sep="\t", names=["impression_id", "user_id", "time", "history", "impressions"])
df_news = pd.read_csv(f"{dataset_path}news.tsv", sep="\t", names=["news_id", "category", "subcategory", "title", "abstract", "url", "entity"])

In [None]:
# Step 3: Data Preprocessing
# Check for missing values in the title column and remove them.
df_news = df_news[df_news['title'].notna()].copy()

# Initialize the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# The next step is to preprocess the news dataset. We tokenize the news titles using BERT tokenizer.
def preprocess_news(news_df):
    # Tokenize the news title using the BERT tokenizer
    news_df.loc[:, 'title_tokens'] = news_df['title'].apply(lambda x: tokenizer.encode(x, add_special_tokens=True, max_length=50, truncation=True))
    return news_df

df_news = preprocess_news(df_news)

In [None]:
# Step 4: Create Vocabulary
# BERT tokenizer already provides the vocabulary, so no need to create a custom vocabulary.
vocab = tokenizer.get_vocab()

In [None]:
# Step 5: Dataset and DataLoader Classes
# We define a custom Dataset class to load and serve the data to the model. This class will convert news titles and user behaviors into tensors.
class NewsDataset(Dataset):
    def __init__(self, df_behaviors, df_news):
        self.behaviors = df_behaviors
        self.news = df_news

    def __len__(self):
        return len(self.behaviors)

    def __getitem__(self, idx):
        # Extract user history and impressions from the behaviors dataset.
        user_history = self.behaviors.iloc[idx]['history'].split()
        impressions = self.behaviors.iloc[idx]['impressions'].split()
        # Get the tokenized titles for each news article in the user's history.
        news_titles = []
        for news_id in user_history:
            matching_news = self.news[self.news['news_id'] == news_id]
            if not matching_news.empty:
                news_titles.append(matching_news['title_tokens'].values[0])
        if not news_titles:
            # If no valid news articles are found, return an empty tensor with padding
            news_titles = [[0]]
        return torch.tensor(news_titles, dtype=torch.long), torch.tensor([1 if '1' in imp else 0 for imp in impressions])

# Create the dataset and dataloader for training.
train_dataset = NewsDataset(df_behaviors, df_news)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

In [None]:
# Step 6: Define the NRMS Model
# Here, we define the NRMS model. The model uses BERT embeddings and a multi-head attention mechanism to capture the relationships between words.
class NRMS(nn.Module):
    def __init__(self, embedding_dim, attention_heads):
        super(NRMS, self).__init__()
        # BERT model to get embeddings
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        # Multi-head attention layer to capture interactions between words.
        self.attention = nn.MultiheadAttention(embedding_dim, attention_heads)
        # Fully connected layer to produce the final output.
        self.fc = nn.Linear(embedding_dim, 1)

    def forward(self, x):
        # Convert input sequences to embeddings using BERT.
        with torch.no_grad():
            x = self.bert(x)[0]  # Extract the last hidden state from BERT
        x = x.permute(1, 0, 2)  # Convert to (SeqLen, Batch, EmbeddingDim)
        # Apply multi-head attention.
        attn_output, _ = self.attention(x, x, x)
        # Average pooling over sequence length and pass through a fully connected layer.
        out = self.fc(attn_output.mean(dim=0))
        return torch.sigmoid(out)

In [None]:
# Step 7: Training Loop
# We define the training loop to train the NRMS model on the MIND dataset.
model = NRMS(embedding_dim, attention_heads)
criterion = nn.BCELoss()  # Binary Cross-Entropy Loss for binary classification.
optimizer = optim.Adam(model.parameters(), lr=learning_rate)  # Adam optimizer.

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0
    for news_tokens, labels in tqdm(train_loader, desc=f"Epoch {epoch + 1}"):
        # Move data to the appropriate device (CPU or GPU).
        news_tokens, labels = news_tokens.to(device), labels.to(device, dtype=torch.float)
        optimizer.zero_grad()  # Clear previous gradients.
        outputs = model(news_tokens)  # Forward pass through the model.
        loss = criterion(outputs.view(-1), labels.view(-1))  # Calculate loss.
        loss.backward()  # Backpropagate the loss.
        optimizer.step()  # Update model parameters.
        epoch_loss += loss.item()
    # Print the average loss for the epoch.
    print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {epoch_loss / len(train_loader):.4f}")

# Step 8: Save the Model
# Finally, save the trained model so it can be used for inference or further training.
torch.save(model.state_dict(), 'nrms_model.pth')

In [10]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence
from tqdm.notebook import tqdm
from transformers import BertTokenizer, BertModel

# Step 1: Define Parameters
# We start by defining some important parameters for our model such as embedding dimensions, number of attention heads, batch size, etc.
embedding_dim = 768  # Using BERT embedding dimensions
attention_heads = 8  # Number of attention heads in multi-head attention
batch_size = 16  # Number of samples in each batch
num_epochs = 10  # Number of epochs to train the model
learning_rate = 0.001  # Learning rate for the optimizer

# Step 2: Load MIND Dataset
# Now, we will load the MIND dataset, which contains user behaviors and news articles.
# We have the datasets already downloaded in ~/datasets/MINDlarge and ~/datasets/MINDsmall.
mind_large = '~/datasets/MINDlarge'
mind_large_train = mind_large + '/train/'
mind_large_dev = mind_large + '/dev/'  # Development -- help tune hyper-parameter 
mind_large_test = mind_large + '/test/'

mind_small = '~/datasets/MINDsmall'
mind_small_train = mind_small + '/train/'
mind_small_dev = mind_small + '/dev/'

dataset_path = mind_small_train

# Load training data
df_behaviors = pd.read_csv(f"{dataset_path}behaviors.tsv", sep="\t", names=["impression_id", "user_id", "time", "history", "impressions"])
df_news = pd.read_csv(f"{dataset_path}news.tsv", sep="\t", names=["news_id", "category", "subcategory", "title", "abstract", "url", "entity"])

# Step 3: Data Preprocessing
# Check for missing values in the title column and remove them.
df_news = df_news[df_news['title'].notna()].copy()

# Initialize the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# The next step is to preprocess the news dataset. We tokenize the news titles using BERT tokenizer.
def preprocess_news(news_df):
    # Tokenize the news title using the BERT tokenizer
    news_df.loc[:, 'title_tokens'] = news_df['title'].apply(lambda x: tokenizer.encode(x, add_special_tokens=True, max_length=50, truncation=True))
    return news_df

df_news = preprocess_news(df_news)

# Step 4: Create Vocabulary
# BERT tokenizer already provides the vocabulary, so no need to create a custom vocabulary.
vocab = tokenizer.get_vocab()

# Step 5: Dataset and DataLoader Classes
# We define a custom Dataset class to load and serve the data to the model. This class will convert news titles and user behaviors into tensors.
class NewsDataset(Dataset):
    def __init__(self, df_behaviors, df_news):
        self.behaviors = df_behaviors
        self.news = df_news

    def __len__(self):
        return len(self.behaviors)

    def __getitem__(self, idx):
        # Extract user history and impressions from the behaviors dataset.
        user_history = str(self.behaviors.iloc[idx]['history']).split()
        impressions = str(self.behaviors.iloc[idx]['impressions']).split()
        # Get the tokenized titles for each news article in the user's history.
        news_titles = []
        for news_id in user_history:
            matching_news = self.news[self.news['news_id'] == news_id]
            if not matching_news.empty:
                news_titles.append(matching_news['title_tokens'].values[0])
        if not news_titles:
            # If no valid news articles are found, return a tensor filled with padding
            news_titles = [[0]]
        # Convert list of token lists to tensors
        news_titles = [torch.tensor(tokens, dtype=torch.long) for tokens in news_titles]
        labels = torch.tensor([1 if '1' in imp else 0 for imp in impressions], dtype=torch.float)
        return news_titles, labels

# Custom collate function to handle batches with varying sequence lengths
def collate_fn(batch):
    news_titles_batch, labels_batch = zip(*batch)
    # Pad each list of news titles independently
    padded_news_titles = [pad_sequence(news, batch_first=True, padding_value=0) for news in news_titles_batch]
    # Stack all padded news titles into a batch
    news_titles_padded = pad_sequence(padded_news_titles, batch_first=True, padding_value=0)
    # Pad labels to ensure consistent batch size
    labels_padded = pad_sequence(labels_batch, batch_first=True, padding_value=0)
    # Create attention masks
    attention_mask = (news_titles_padded != 0).long()
    return news_titles_padded, attention_mask, labels_padded

# Create the dataset and dataloader for training.
train_dataset = NewsDataset(df_behaviors, df_news)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

# Step 6: Define the NRMS Model
# Here, we define the NRMS model. The model uses BERT embeddings and a multi-head attention mechanism to capture the relationships between words.
class NRMS(nn.Module):
    def __init__(self, embedding_dim, attention_heads):
        super(NRMS, self).__init__()
        # BERT model to get embeddings
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        # Multi-head attention layer to capture interactions between words.
        self.attention = nn.MultiheadAttention(embedding_dim, attention_heads)
        # Fully connected layer to produce the final output.
        self.fc = nn.Linear(embedding_dim, 1)

    def forward(self, x, attention_mask):
        # Convert input sequences to embeddings using BERT.
        with torch.no_grad():
            x = self.bert(x, attention_mask=attention_mask)[0]  # Extract the last hidden state from BERT
        x = x.permute(1, 0, 2)  # Convert to (SeqLen, Batch, EmbeddingDim)
        # Apply multi-head attention.
        attn_output, _ = self.attention(x, x, x)
        # Average pooling over sequence length and pass through a fully connected layer.
        out = self.fc(attn_output.mean(dim=0))
        return torch.sigmoid(out).squeeze()

# Step 7: Training Loop
# We define the training loop to train the NRMS model on the MIND dataset.
model = NRMS(embedding_dim, attention_heads)
criterion = nn.BCELoss()  # Binary Cross-Entropy Loss for binary classification.
optimizer = optim.Adam(model.parameters(), lr=learning_rate)  # Adam optimizer.

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0
    for news_tokens, attention_mask, labels in tqdm(train_loader, desc=f"Epoch {epoch + 1}"):
        # Move data to the appropriate device (CPU or GPU).
        news_tokens, attention_mask, labels = news_tokens.to(device), attention_mask.to(device), labels.to(device, dtype=torch.float)
        optimizer.zero_grad()  # Clear previous gradients.
        outputs = model(news_tokens, attention_mask)  # Forward pass through the model.
        loss = criterion(outputs.view(-1), labels.view(-1))  # Calculate loss.
        loss.backward()  # Backpropagate the loss.
        optimizer.step()  # Update model parameters.
        epoch_loss += loss.item()
    # Print the average loss for the epoch.
    print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {epoch_loss / len(train_loader):.4f}")

# Step 8: Save the Model
# Finally, save the trained model so it can be used for inference or further training.
torch.save(model.state_dict(), 'nrms_model.pth')

Epoch 1:   0%|          | 0/9811 [00:00<?, ?it/s]

ValueError: too many values to unpack (expected 2)

## References

https://wuch15.github.io/paper/EMNLP2019-NRMS.pdf


