In [164]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import numpy as np
import pickle
from datasets import load_dataset

In [165]:
# Load the Rotten Tomatoes dataset
dataset = load_dataset("rotten_tomatoes")
train_dataset = dataset['train']
valid_dataset = dataset['validation']
test_dataset = dataset['test']

train_text = train_dataset.to_pandas()['text']

max_text_len = 0
for text in train_text:
    max_text_len = max(max_text_len, len(text))

print(max_text_len)


267


In [166]:
print(train_dataset[0])

{'text': 'the rock is destined to be the 21st century\'s new " conan " and that he\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .', 'label': 1}


In [167]:
# Load the embedding matrix and vocab from files
with open('embedding_matrix.pkl', 'rb') as f:
    embedding_matrix = pickle.load(f).astype(np.float32)
    padding = [0 for i in range(50)]
    embedding_matrix = np.insert(embedding_matrix, 0, padding, 0)
    print(type(embedding_matrix))

with open('vocab_word_to_index.pkl', 'rb') as f:
    vocab_word_to_index = pickle.load(f)
    print(type(vocab_word_to_index))

# Convert to torch tensors
embedding_matrix = torch.tensor(embedding_matrix)
vocab_size, embedding_dim = embedding_matrix.shape
print(embedding_matrix[0])

<class 'numpy.ndarray'>
<class 'dict'>
tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0.])


In [168]:
import nltk
import ssl

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

nltk.download('punkt_tab')

class SentimentDataset(Dataset):
    def __init__(self, dataset, word_to_index, max_len=max_text_len):
        self.dataset = dataset
        self.word_to_index = word_to_index
        self.max_len = max_len

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        text = self.dataset[idx]['text']
        label = self.dataset[idx]['label']
        
        # Convert words to indices
        
        text = text.lower()
        #tokenise words
        word_list = nltk.tokenize.word_tokenize(text)
        word_list = [word.strip("'\"") for word in word_list]
        tokens = set()
        tokens.update(word_list)
        #tokens.discard('')
        indices = [self.word_to_index.get(word, self.word_to_index.get('<UNK>')) + 1 for word in tokens]
        indices = indices[:self.max_len] + [0] * (self.max_len - len(indices))  # Padding
        
        return torch.tensor(indices), torch.tensor(label)

train_data = SentimentDataset(train_dataset, vocab_word_to_index)
valid_data = SentimentDataset(valid_dataset, vocab_word_to_index)
test_data = SentimentDataset(test_dataset, vocab_word_to_index)

train_loader = DataLoader(train_data, batch_size=32, shuffle=True)
valid_loader = DataLoader(valid_data, batch_size=32, shuffle=False)
test_loader = DataLoader(test_data, batch_size=32, shuffle=False)

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/adityakumarpugalia/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [169]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
class SentimentRNN(nn.Module):
    def __init__(self, embedding_matrix, hidden_dim, output_dim, num_layers = 1, freeze_embeddings=True):
        super(SentimentRNN, self).__init__()
        self.embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze=freeze_embeddings)
        self.rnn = nn.RNN(embedding_dim, hidden_dim, num_layers, batch_first=True, device= device)
        self.fc = nn.Linear(hidden_dim, output_dim, device= device)
        

    def forward(self, x):
        embedded = self.embedding(x)
        out, hidden = self.rnn(embedded)
        out = out[:, -1, :]

        # Using the last hidden state for classification
        return self.fc(out)

# Model hyperparameters
hidden_dim = 128
output_dim = 2  # Binary classification (positive, negative)

model = SentimentRNN(embedding_matrix, hidden_dim, output_dim, 2)
print(model.embedding.weight[1])

tensor([-0.4901, -0.2097,  0.1249, -0.3952, -0.4148,  0.7488,  0.3754,  0.3650,
        -0.0177, -0.3884,  0.1328, -0.3996, -0.6031,  0.6804, -0.5267,  1.0355,
         0.8665,  0.2221,  0.6303, -0.8495,  0.6170, -0.0247,  0.8059, -0.1039,
         0.1430,  0.2291, -0.7631,  1.6906,  1.1369, -0.7731,  0.7997, -0.0726,
         1.0869,  0.1207, -0.0339,  0.8330, -0.3656,  0.5224,  0.5808,  0.4711,
        -0.2782, -0.5437,  0.3406,  0.4131, -0.2171, -0.4031,  0.5080, -0.0770,
        -0.2399, -1.4470])


In [170]:
# Set up optimizer and loss function
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

# Move model to GPU if available
model.to(device)


# Training function
def train_model(model, train_loader, valid_loader, epochs=10):
    best_valid_acc = 0
    for epoch in range(epochs):
        # Set model to training mode
        model.train()
        total_train_loss = 0
        total_train_correct = 0
        total_train_samples = 0
        
        for texts, labels in train_loader:
            texts, labels = texts.to(device), labels.to(device)
            
            optimizer.zero_grad()  # Reset gradients
            
            # Forward pass: get predictions
            predictions = model(texts)
            
            # Compute the loss
            loss = criterion(predictions, labels)
            total_train_loss += loss.item()
            
            # Backpropagation and optimization
            loss.backward()
            optimizer.step()
            
            # Accuracy calculation
            predicted_labels = predictions.argmax(1)
            total_train_correct += (predicted_labels == labels).sum().item()
            total_train_samples += labels.size(0)

        # Calculate and print average training accuracy and loss per epoch
        train_accuracy = total_train_correct / total_train_samples
        train_loss = total_train_loss / len(train_loader)

        # Evaluate model on validation set
        valid_accuracy = evaluate_model(model, valid_loader)
        
        print(f'Epoch {epoch+1} | Train Loss: {train_loss:.4f} | Train Accuracy: {train_accuracy:.4f} | Validation Accuracy: {valid_accuracy:.4f}')        
        if valid_accuracy > best_valid_acc:
            best_valid_acc = valid_accuracy
            torch.save(model.state_dict(), 'best_model.pt')

def evaluate_model(model, loader):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for texts, labels in loader:
            texts, labels = texts.to(device), labels.to(device)
            predictions = model(texts)
            correct += (predictions.argmax(1) == labels).sum().item()
            total += labels.size(0)
    return correct / total

In [171]:
# Train the model
train_model(model, train_loader, valid_loader, epochs=10)

Epoch 1 | Train Loss: 0.6941 | Train Accuracy: 0.4998 | Validation Accuracy: 0.5000
Epoch 2 | Train Loss: 0.6936 | Train Accuracy: 0.4897 | Validation Accuracy: 0.5000
Epoch 3 | Train Loss: 0.6933 | Train Accuracy: 0.5054 | Validation Accuracy: 0.5000
Epoch 4 | Train Loss: 0.6933 | Train Accuracy: 0.5012 | Validation Accuracy: 0.5000
Epoch 5 | Train Loss: 0.6934 | Train Accuracy: 0.4925 | Validation Accuracy: 0.5000
Epoch 6 | Train Loss: 0.6932 | Train Accuracy: 0.5009 | Validation Accuracy: 0.5000
Epoch 7 | Train Loss: 0.6933 | Train Accuracy: 0.5021 | Validation Accuracy: 0.5000
Epoch 8 | Train Loss: 0.6935 | Train Accuracy: 0.4951 | Validation Accuracy: 0.5000
Epoch 9 | Train Loss: 0.6933 | Train Accuracy: 0.4955 | Validation Accuracy: 0.5000
Epoch 10 | Train Loss: 0.6933 | Train Accuracy: 0.4909 | Validation Accuracy: 0.5000


In [172]:
# Load the best model
model.load_state_dict(torch.load('best_model.pt'))

# Evaluate the model on the test set
test_acc = evaluate_model(model, test_loader)
print(f'Test Accuracy: {test_acc:.4f}')

Test Accuracy: 0.5000
