# Assignment 2 - Recurrent Neural Networks



## Programming (Full points: 100)

In this assignment, our goal is to use PyTorch to implement Recurrent Neural Networks (RNN) for sentiment analysis task. Sentiment analysis is to classify sentences (input) into certain sentiments (output labels), which includes positive, negative and neutral.

We will use a benckmark dataset, SST, for this assignment.
* we download the SST dataset from torchtext package, and do some preprocessing to build vocabulary and split the dataset into training/validation/test sets. You don't need to modify the code in this step.


In [1]:
import copy
import torch
from torch import nn
from torch import optim
import torchtext
from torchtext import data
from torchtext import datasets

TEXT = data.Field(sequential=True, batch_first=True, lower=True)
LABEL = data.LabelField()

# load data splits
train_data, val_data, test_data = datasets.SST.splits(TEXT, LABEL)

# build dictionary
TEXT.build_vocab(train_data)
LABEL.build_vocab(train_data)

# hyperparameters
vocab_size = len(TEXT.vocab)
label_size = len(LABEL.vocab)
padding_idx = TEXT.vocab.stoi['<pad>']
embedding_dim = 128
hidden_dim = 128

# build iterators
train_iter, val_iter, test_iter = data.BucketIterator.splits(
    (train_data, val_data, test_data), 
    batch_size=32)

* define the training and evaluation function in the cell below.
### (25 points)


In [2]:
def train(model, iterator, optimizer, criterion):
    model.train()
    epoch_loss = 0
    
    for batch in iterator:
        optimizer.zero_grad()
        text, labels = batch.text, batch.label
        predictions = model(text)
        loss = criterion(predictions, labels)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    
    return epoch_loss / len(iterator)
def evaluate(model, iterator, criterion):
    model.eval()
    epoch_loss = 0
    with torch.no_grad():
        for batch in iterator:
            text, labels = batch.text, batch.label
            predictions = model(text)
            loss = criterion(predictions, labels)
            epoch_loss += loss.item()
    
    return epoch_loss / len(iterator)


* build a RNN model for sentiment analysis in the cell below.
We have provided several hyperparameters we needed for building the model, including vocabulary size (vocab_size), the word embedding dimension (embedding_dim), the hidden layer dimension (hidden_dim), the number of layers (num_layers) and the number of sentence labels (label_size). Please fill in the missing codes, and implement a RNN model.
### (40 points)

In [3]:
import torch.nn as nn

class RNNModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers, label_size, padding_idx):
        super(RNNModel, self).__init__()
        
        # Embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=padding_idx)
        
        # RNN layer
        self.rnn = nn.RNN(embedding_dim, hidden_dim, num_layers=num_layers, batch_first=True)
        
        # Fully connected layer
        self.fc = nn.Linear(hidden_dim, label_size)
        
    def forward(self, text):
        # text shape: (batch_size, seq_length)
        
        # Embedding layer
        embedded = self.embedding(text)
        
        # RNN layer
        output, hidden = self.rnn(embedded)
        
        # Get the output from the last time step
        last_output = output[:, -1, :]
        
        # Fully connected layer
        predictions = self.fc(last_output)
        
        return predictions


* train the model and compute the accuracy in the cell below.
### (20 points)

In [5]:
# Define hyperparameters
vocab_size = len(TEXT.vocab)
label_size = len(LABEL.vocab)
padding_idx = TEXT.vocab.stoi['<pad>']
embedding_dim = 128
hidden_dim = 128
num_layers = 1
num_epochs = 10  # You can adjust the number of epochs

# Define the RNN model
model = RNNModel(vocab_size, embedding_dim, hidden_dim, num_layers, label_size, padding_idx)

# Define the optimizer and criterion
optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss()

# Training loop
for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0
    correct = 0
    total = 0

    for batch in train_iter:
        optimizer.zero_grad()
        text, labels = batch.text, batch.label
        predictions = model(text)
        loss = criterion(predictions, labels)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()

        # Compute accuracy
        predicted_labels = torch.argmax(predictions, dim=1)
        correct += (predicted_labels == labels).sum().item()
        total += labels.size(0)

    train_loss = epoch_loss / len(train_iter)
    train_accuracy = correct / total

    # Evaluate on the validation set
    val_loss = evaluate(model, val_iter, criterion)

    print(f'Epoch {epoch+1}/{num_epochs}:')
    print(f'Training Loss: {train_loss:.4f} | Training Accuracy: {train_accuracy*100:.2f}%')
    print(f'Validation Loss: {val_loss:.4f}')

# Evaluate on the test set
test_loss = evaluate(model, test_iter, criterion)

print(f'Test Loss: {test_loss:.4f}')

# Function to compute accuracy on the test set
def compute_accuracy(model, iterator):
    model.eval()
    correct = 0
    total = 0

    with torch.no_grad():
        for batch in iterator:
            text, labels = batch.text, batch.label
            predictions = model(text)
            predicted_labels = torch.argmax(predictions, dim=1)
            correct += (predicted_labels == labels).sum().item()
            total += labels.size(0)

    accuracy = correct / total
    return accuracy

# Calculate and print test accuracy
test_accuracy = compute_accuracy(model, test_iter)
print(f'Test Accuracy: {test_accuracy*100:.2f}%')

Epoch 1/10:
Training Loss: 1.0532 | Training Accuracy: 40.80%
Validation Loss: 1.0658
Epoch 2/10:
Training Loss: 1.0492 | Training Accuracy: 41.37%
Validation Loss: 1.1411
Epoch 3/10:
Training Loss: 1.0462 | Training Accuracy: 41.68%
Validation Loss: 1.0740
Epoch 4/10:
Training Loss: 1.0460 | Training Accuracy: 42.17%
Validation Loss: 1.1246
Epoch 5/10:
Training Loss: 1.0457 | Training Accuracy: 42.59%
Validation Loss: 1.1068
Epoch 6/10:
Training Loss: 1.0430 | Training Accuracy: 41.94%
Validation Loss: 1.0902
Epoch 7/10:
Training Loss: 1.0419 | Training Accuracy: 42.42%
Validation Loss: 1.1478
Epoch 8/10:
Training Loss: 1.0369 | Training Accuracy: 42.52%
Validation Loss: 1.2493
Epoch 9/10:
Training Loss: 1.0357 | Training Accuracy: 42.44%
Validation Loss: 1.2372
Epoch 10/10:
Training Loss: 1.0366 | Training Accuracy: 42.44%
Validation Loss: 1.1913
Test Loss: 1.2089
Test Accuracy: 39.55%


* try to train a model with better accuracy in the cell below. For example, you can use different optimizers such as SGD and Adam. You can also compare different hyperparameters and model size.
### (15 points), to obtain FULL point in this problem, the accuracy needs to be higher than 70%

In [6]:
import copy
import torch
from torch import nn
from torch import optim
import torchtext
from torchtext import data
from torchtext import datasets

TEXT = data.Field(sequential=True, batch_first=True, lower=True)
LABEL= data.LabelField()

#load data splits
train_data, val_data, test_data = datasets.SST.splits (TEXT, LABEL)

# build dictionary
TEXT.build_vocab(train_data)
LABEL.build_vocab(train_data)

#hyperparameters
vocab_size = len(TEXT.vocab)
label_size = len(LABEL.vocab)
padding_idx= TEXT.vocab.stoi['<pad>']
embedding_dim= 128
hidden_dim = 256
num_layers = 2

# build iterators
train_iter, val_iter, test_iter = data.BucketIterator.splits(
    (train_data, val_data, test_data),
    batch_size=32,
    sort_key=lambda x: len(x.text),
    max_len=50
)

class RNNClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, label_size, padding_idx, num_layers=1):
        super(RNNClassifier, self).__init__()
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.label_size = label_size
        self.num_layers = num_layers

        # add the layers required for sentiment analysis.
        self.embedding = nn.Embedding(self.vocab_size, self.embedding_dim, padding_idx=padding_idx)
        self.rnn = nn.LSTM(self.embedding_dim, self.hidden_dim, num_layers=self.num_layers, bidirectional=True)
        self.linear = nn.Linear(self.hidden_dim * 2, self.label_size)

    def zero_state(self, batch_size): 
        # implement the function, which returns an initial hidden state.
        hidden_state = torch.zeros((self.num_layers * 2, batch_size, self.hidden_dim))
        cell_state = torch.zeros((self.num_layers * 2, batch_size, self.hidden_dim))
        return hidden_state, cell_state

    def forward(self, text):
        # implement the forward function of the model.
        embedding = self.embedding(text)

        # initialize the hidden state
        hidden_state, cell_state = self.zero_state(embedding.size(0))

        # pass the embedding through the RNN
        outputs, (hidden_state, cell_state) = self.rnn(embedding, (hidden_state, cell_state))

        # take the last hidden state and pass it through a linear layer
        last_hidden_state = hidden_state[-1]
        logits = self.linear(last_hidden_state)

        return logits

# create the model
model = RNNClassifier(vocab_size, embedding_dim, hidden_dim, label_size, padding_idx, num_layers)

# define the optimizer and loss function
optimizer = optim.Adam(model.parameters())


TypeError: Iterator.__init__() got an unexpected keyword argument 'max_len'