In [99]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import numpy as np
import pickle
from datasets import load_dataset

In [100]:
# Load the Rotten Tomatoes dataset
dataset = load_dataset("rotten_tomatoes")
train_dataset = dataset['train']
valid_dataset = dataset['validation']
test_dataset = dataset['test']

train_text = train_dataset.to_pandas()['text']

max_text_len = 0
for text in train_text:
    max_text_len = max(max_text_len, len(text))

print(max_text_len)


267


In [101]:
print(train_dataset[0])

{'text': 'the rock is destined to be the 21st century\'s new " conan " and that he\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .', 'label': 1}


In [116]:
# Load the embedding matrix and vocab from files
with open('embedding_matrix.pkl', 'rb') as f:
    embedding_matrix = pickle.load(f).astype(np.float32)
    padding = [0 for i in range(50)]
    embedding_matrix.append(padding)
    print(type(embedding_matrix))

with open('vocab_word_to_index.pkl', 'rb') as f:
    vocab_word_to_index = pickle.load(f)
    print(type(vocab_word_to_index))

# Convert to torch tensors
embedding_matrix = torch.tensor(embedding_matrix)
vocab_size, embedding_dim = embedding_matrix.shape
print(embedding_matrix)

<class 'numpy.ndarray'>
<class 'dict'>
tensor([[-0.4901, -0.2097,  0.1249,  ..., -0.0770, -0.2399, -1.4470],
        [ 0.0151,  0.2401, -0.1368,  ...,  0.0424, -0.0365, -0.0043],
        [ 0.2482,  0.5113,  0.1887,  ...,  0.0801,  0.2962,  0.4851],
        ...,
        [-0.1226,  0.3450, -0.8120,  ...,  0.6396, -0.3635, -0.4805],
        [ 0.1039, -1.2005, -0.3010,  ..., -0.4107,  0.7417,  0.5984],
        [ 0.0151,  0.2401, -0.1368,  ...,  0.0424, -0.0365, -0.0043]])


In [103]:
import nltk

class SentimentDataset(Dataset):
    def __init__(self, dataset, word_to_index, max_len=max_text_len):
        self.dataset = dataset
        self.word_to_index = word_to_index
        self.max_len = max_len

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        text = self.dataset[idx]['text']
        label = self.dataset[idx]['label']
        
        # Convert words to indices
        
        text = text.lower()
        #tokenise words
        word_list = nltk.tokenize.word_tokenize(text)
        word_list = [word.strip("'\"") for word in word_list]
        tokens = set()
        tokens.update(word_list)
        tokens.discard('')
        indices = [self.word_to_index.get(word, self.word_to_index.get('<UNK>')) for word in tokens]
        indices = indices[:self.max_len] + [0] * (self.max_len - len(indices))  # Padding
        
        return torch.tensor(indices), torch.tensor(label)

train_data = SentimentDataset(train_dataset, vocab_word_to_index)
valid_data = SentimentDataset(valid_dataset, vocab_word_to_index)
test_data = SentimentDataset(test_dataset, vocab_word_to_index)

train_loader = DataLoader(train_data, batch_size=32, shuffle=True)
valid_loader = DataLoader(valid_data, batch_size=32, shuffle=False)
test_loader = DataLoader(test_data, batch_size=32, shuffle=False)

In [115]:
class SentimentRNN(nn.Module):
    def __init__(self, embedding_matrix, hidden_dim, output_dim, num_layers = 1, freeze_embeddings=True):
        super(SentimentRNN, self).__init__()
        self.embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze=freeze_embeddings)
        self.rnn = nn.RNN(embedding_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        

    def forward(self, x):
        embedded = self.embedding(x)
        out, hidden = self.rnn(embedded)
        out = out[:, -1, :]

        # Using the last hidden state for classification
        return self.fc(out)

# Model hyperparameters
hidden_dim = 128
output_dim = 2  # Binary classification (positive, negative)

model = SentimentRNN(embedding_matrix, hidden_dim, output_dim, 2)
print(model.embedding.weight[0])

tensor([-0.4901, -0.2097,  0.1249, -0.3952, -0.4148,  0.7488,  0.3754,  0.3650,
        -0.0177, -0.3884,  0.1328, -0.3996, -0.6031,  0.6804, -0.5267,  1.0355,
         0.8665,  0.2221,  0.6303, -0.8495,  0.6170, -0.0247,  0.8059, -0.1039,
         0.1430,  0.2291, -0.7631,  1.6906,  1.1369, -0.7731,  0.7997, -0.0726,
         1.0869,  0.1207, -0.0339,  0.8330, -0.3656,  0.5224,  0.5808,  0.4711,
        -0.2782, -0.5437,  0.3406,  0.4131, -0.2171, -0.4031,  0.5080, -0.0770,
        -0.2399, -1.4470])


In [105]:
# Set up optimizer and loss function
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()
epoch_loss = []

# Move model to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Training function
def train_model(model, train_loader, valid_loader, epochs=10):
    best_valid_acc = 0.0
    for epoch in range(epochs):
        model.train()
        epoch_loss = 0
        epoch_acc = 0
        for texts, labels in train_loader:
            texts, labels = texts.to(device), labels.to(device)
            optimizer.zero_grad()
            predictions = model(texts)
            loss = criterion(predictions, labels)
            loss.backward()
            optimizer.step()
            
            epoch_loss += loss.item()
            epoch_acc += (predictions.argmax(1) == labels).sum().item()
        
        valid_acc = evaluate_model(model, valid_loader)
        print(f'Epoch {epoch+1} | Train Loss: {epoch_loss / len(train_loader):.4f} | Validation Accuracy: {valid_acc:.4f}')
        
        if valid_acc > best_valid_acc:
            best_valid_acc = valid_acc
            torch.save(model.state_dict(), 'best_model.pt')

def evaluate_model(model, loader):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for texts, labels in loader:
            texts, labels = texts.to(device), labels.to(device)
            predictions = model(texts)
            correct += (predictions.argmax(1) == labels).sum().item()
            total += labels.size(0)
    return correct / total

In [109]:
# Train the model
train_model(model, train_loader, valid_loader, epochs=10)

Epoch 1 | Train Loss: 0.6992 | Validation Accuracy: 0.5038
Epoch 2 | Train Loss: 0.6980 | Validation Accuracy: 0.4962
Epoch 3 | Train Loss: 0.6993 | Validation Accuracy: 0.4887
Epoch 4 | Train Loss: 0.6966 | Validation Accuracy: 0.4925
Epoch 5 | Train Loss: 0.6965 | Validation Accuracy: 0.5000
Epoch 6 | Train Loss: 0.6968 | Validation Accuracy: 0.4906
Epoch 7 | Train Loss: 0.6976 | Validation Accuracy: 0.5075
Epoch 8 | Train Loss: 0.6996 | Validation Accuracy: 0.4841
Epoch 9 | Train Loss: 0.6963 | Validation Accuracy: 0.5000
Epoch 10 | Train Loss: 0.6967 | Validation Accuracy: 0.4822


In [108]:
# Load the best model
model.load_state_dict(torch.load('best_model.pt'))

# Evaluate the model on the test set
test_acc = evaluate_model(model, test_loader)
print(f'Test Accuracy: {test_acc:.4f}')

Test Accuracy: 0.5028
