In [3]:
!pip install -U torchtext==0.12

Collecting torchtext==0.12
  Downloading torchtext-0.12.0-cp310-cp310-manylinux1_x86_64.whl.metadata (8.0 kB)
Collecting torch==1.11.0 (from torchtext==0.12)
  Downloading torch-1.11.0-cp310-cp310-manylinux1_x86_64.whl.metadata (24 kB)
Downloading torchtext-0.12.0-cp310-cp310-manylinux1_x86_64.whl (10.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.4/10.4 MB[0m [31m72.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading torch-1.11.0-cp310-cp310-manylinux1_x86_64.whl (750.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m750.6/750.6 MB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torch, torchtext
  Attempting uninstall: torch
    Found existing installation: torch 2.5.1+cu121
    Uninstalling torch-2.5.1+cu121:
      Successfully uninstalled torch-2.5.1+cu121
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the followin

In [17]:
import pandas as pd
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import spacy
from torchtext.data.utils import get_tokenizer
from sklearn.model_selection import train_test_split
import time
import random

In [18]:
# --- Reproducibility ---------------------------------------------------
RANDOM_SEED = 123
torch.manual_seed(RANDOM_SEED)

# --- Hardware ----------------------------------------------------------
# Use the GPU when one is visible to PyTorch, otherwise fall back to CPU.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# --- Vocabulary / model dimensions -------------------------------------
VOCABULARY_SIZE = 20000
EMBEDDING_DIM = 128
HIDDEN_DIM = 256
NUM_CLASSES = 2

# --- Optimisation ------------------------------------------------------
LEARNING_RATE = 0.005
BATCH_SIZE = 128
NUM_EPOCHS = 15

In [19]:
# Load the review data and give its columns fixed, generic names.
# The column assignment below requires the CSV to have exactly two columns.
# NOTE(review): presumably column 1 is the review text and column 2 the label — confirm.
df = pd.read_csv('movie_data.csv')
df.columns = ['TEXT_COLUMN_NAME', 'LABEL_COLUMN_NAME']
# NOTE(review): this writes the renamed frame back over the input file, so the
# original header in movie_data.csv is replaced on disk.
df.to_csv('movie_data.csv', index=None)

# Tokenizer and Vocabulary
# `tokenizer` is used below as a callable that maps a string to a sequence of tokens.
# NOTE(review): assumes the spaCy model 'en_core_web_sm' is installed in this environment.
tokenizer = get_tokenizer('spacy', language='en_core_web_sm')



In [20]:
def build_vocab(texts, max_size):
    """Build a token -> integer-id mapping from an iterable of raw texts.

    Ids 0 and 1 are reserved for '<unk>' and '<pad>'. The remaining ids
    (2, 3, ...) are assigned to the ``max_size`` most frequent tokens in
    descending frequency order; ties keep first-seen order.

    Tokens that are literally '<unk>' or '<pad>' are excluded from the
    frequency ranking. Otherwise their ranked id would be overwritten by the
    reserved id, leaving a gap so that the largest id could exceed
    ``len(vocab) - 1`` (and overflow an embedding table sized ``len(vocab)``).

    Args:
        texts: iterable of strings; each is split with the module-level
            ``tokenizer``.
        max_size: maximum number of non-special tokens to keep.

    Returns:
        dict mapping token string to id, with contiguous ids
        ``0 .. len(vocab) - 1``.
    """
    from collections import Counter  # local import keeps this cell self-contained

    specials = ('<unk>', '<pad>')

    token_freqs = Counter()
    for text in texts:
        token_freqs.update(tokenizer(text))

    # Reserved names must not compete for (or collide with) ranked ids.
    for special in specials:
        token_freqs.pop(special, None)

    # sorted() is stable, so equally frequent tokens keep first-seen order.
    ranked = sorted(token_freqs.items(), key=lambda item: item[1], reverse=True)

    vocab = {special: idx for idx, special in enumerate(specials)}
    offset = len(specials)
    for rank, (word, _) in enumerate(ranked[:max_size]):
        vocab[word] = rank + offset
    return vocab

In [21]:
# First split: hold out 20% of all rows as the test set.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['TEXT_COLUMN_NAME'], df['LABEL_COLUMN_NAME'], test_size=0.2, random_state=RANDOM_SEED
)
# Second split: carve 15% of the remaining training rows off as a validation set
# (the train_* names are rebound to the smaller training portion).
train_texts, valid_texts, train_labels, valid_labels = train_test_split(
    train_texts, train_labels, test_size=0.15, random_state=RANDOM_SEED
)

# The vocabulary is built from the training texts only, so validation/test
# tokens never influence which words receive an id.
vocab = build_vocab(train_texts.tolist(), VOCABULARY_SIZE)

In [22]:
def encode_text(text):
    """Tokenize ``text`` and map each token to its vocabulary id.

    Uses the module-level ``tokenizer`` and ``vocab``; tokens missing from
    ``vocab`` are mapped to the id stored under '<unk>'.

    Returns:
        list of integer ids, one per token.
    """
    token_ids = []
    for token in tokenizer(text):
        token_ids.append(vocab.get(token, vocab['<unk>']))
    return token_ids

def encode_label(label):
    """Convert a raw label value to a plain Python ``int``."""
    label_id = int(label)
    return label_id

In [23]:
class TextDataset(Dataset):
    """Map-style dataset pairing raw texts with their labels.

    Encoding is done lazily: each ``__getitem__`` call runs ``encode_text``
    and ``encode_label`` on the requested item.
    """

    def __init__(self, texts, labels):
        # Both are indexable sequences that are indexed with the same position.
        self.labels = labels
        self.texts = texts

    def __len__(self):
        """Number of items, taken from the text sequence."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``(token_ids, label)`` as two ``torch.long`` tensors."""
        token_ids = torch.tensor(encode_text(self.texts[idx]), dtype=torch.long)
        target = torch.tensor(encode_label(self.labels[idx]), dtype=torch.long)
        return token_ids, target


In [24]:
# Wrap each split as a TextDataset over plain Python lists.
train_dataset, valid_dataset, test_dataset = (
    TextDataset(split_texts.tolist(), split_labels.tolist())
    for split_texts, split_labels in (
        (train_texts, train_labels),
        (valid_texts, valid_labels),
        (test_texts, test_labels),
    )
)

In [25]:
def collate_fn(batch):
    """Collate ``(token_ids, label)`` pairs into padded batch tensors.

    Args:
        batch: sequence of ``(token_ids, label)`` pairs. ``token_ids`` may be
            a 1-D tensor or a list of ints; ``label`` may be a 0-d tensor or
            an int.

    Returns:
        tuple ``(texts, lengths, labels)`` where
        - ``texts`` is the batch-first padded id matrix, filled with the
          module-level ``vocab['<pad>']`` id,
        - ``lengths`` holds the unpadded length of every sequence,
        - ``labels`` is a 1-D tensor with one entry per item.
    """
    texts, labels = zip(*batch)
    lengths = torch.tensor([len(text) for text in texts])
    # as_tensor reuses tensors that already exist; torch.tensor(tensor) would
    # copy each one and emit a "copy construct" UserWarning on every batch.
    sequences = [torch.as_tensor(text) for text in texts]
    texts = torch.nn.utils.rnn.pad_sequence(
        sequences, batch_first=True, padding_value=vocab['<pad>']
    )
    labels = torch.stack([torch.as_tensor(label) for label in labels])
    return texts, lengths, labels

# Only the training loader shuffles; validation and test keep dataset order.
train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=collate_fn,
)
valid_loader, test_loader = (
    DataLoader(eval_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)
    for eval_dataset in (valid_dataset, test_dataset)
)


In [26]:
class RNN(torch.nn.Module):
    """Embedding -> single LSTM -> linear classifier over the final hidden state.

    The embedding's ``padding_idx`` is read from the module-level
    ``vocab['<pad>']`` at construction time.
    """

    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        nn = torch.nn
        # Sub-modules are created in the order embedding, rnn, fc.
        self.embedding = nn.Embedding(
            num_embeddings=input_dim,
            embedding_dim=embedding_dim,
            padding_idx=vocab['<pad>'],
        )
        self.rnn = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim, batch_first=True)
        self.fc = nn.Linear(in_features=hidden_dim, out_features=output_dim)

    def forward(self, text, lengths):
        """Return one row of class logits per sequence in the batch.

        Args:
            text: batch-first tensor of token ids.
            lengths: unpadded length of each sequence; moved to CPU before
                packing.
        """
        # Packing lets the LSTM stop at each sequence's true length, so the
        # final hidden state is not computed over padding positions.
        packed = torch.nn.utils.rnn.pack_padded_sequence(
            self.embedding(text),
            lengths.cpu(),
            batch_first=True,
            enforce_sorted=False,
        )
        _, (final_hidden, _) = self.rnn(packed)
        return self.fc(final_hidden[-1])

In [27]:
# Build the classifier sized to the vocabulary, place it on DEVICE, and
# attach an Adam optimizer over all of its parameters.
model = RNN(
    input_dim=len(vocab),
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    output_dim=NUM_CLASSES,
).to(DEVICE)
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [28]:
def compute_accuracy(model, data_loader):
    """Return the classification accuracy of ``model`` on ``data_loader`` in percent.

    The model is switched to eval mode (and left there) and gradients are
    disabled while iterating. Each batch is expected to unpack into
    ``(features, lengths, labels)``; all three are moved to ``DEVICE``.
    """
    model.eval()
    num_examples = 0
    correct_pred = 0
    with torch.no_grad():
        for batch in data_loader:
            features, lengths, labels = (part.to(DEVICE) for part in batch)
            # Predicted class = index of the largest logit in each row.
            predictions = model(features, lengths).argmax(dim=1)
            correct_pred += (predictions == labels).sum().item()
            num_examples += labels.shape[0]
    return correct_pred / num_examples * 100

In [29]:
start_time = time.time()

for epoch in range(NUM_EPOCHS):
    # compute_accuracy() leaves the model in eval mode, so re-enable training
    # mode at the start of every epoch.
    model.train()
    for batch_idx, batch in enumerate(train_loader):
        features, lengths, labels = (part.to(DEVICE) for part in batch)

        # Clear gradients left over from the previous step
        optimizer.zero_grad()

        # Forward pass
        logits = model(features, lengths)
        loss = F.cross_entropy(logits, labels)

        # Backward pass and parameter update
        loss.backward()
        optimizer.step()

        # Log the current mini-batch loss every 50 batches
        if batch_idx % 50 == 0:
            print(f'Epoch: {epoch+1}/{NUM_EPOCHS} | Batch {batch_idx}/{len(train_loader)} | Loss: {loss:.4f}')

    # End-of-epoch evaluation on the full training and validation sets
    train_acc = compute_accuracy(model, train_loader)
    valid_acc = compute_accuracy(model, valid_loader)
    print(f'Epoch: {epoch+1}/{NUM_EPOCHS} | Train Acc: {train_acc:.2f}% | Valid Acc: {valid_acc:.2f}%')

print(f'Total Training Time: {(time.time() - start_time) / 60:.2f} min')

  texts = torch.nn.utils.rnn.pad_sequence([torch.tensor(text) for text in texts], batch_first=True, padding_value=vocab['<pad>'])


Epoch: 1/15 | Batch 0/266 | Loss: 0.6951
Epoch: 1/15 | Batch 50/266 | Loss: 0.6990
Epoch: 1/15 | Batch 100/266 | Loss: 0.6143
Epoch: 1/15 | Batch 150/266 | Loss: 0.6441
Epoch: 1/15 | Batch 200/266 | Loss: 0.5251
Epoch: 1/15 | Batch 250/266 | Loss: 0.3624
Epoch: 1/15 | Train Acc: 88.01% | Valid Acc: 85.38%
Epoch: 2/15 | Batch 0/266 | Loss: 0.3162
Epoch: 2/15 | Batch 50/266 | Loss: 0.2701
Epoch: 2/15 | Batch 100/266 | Loss: 0.2254
Epoch: 2/15 | Batch 150/266 | Loss: 0.2122
Epoch: 2/15 | Batch 200/266 | Loss: 0.2418
Epoch: 2/15 | Batch 250/266 | Loss: 0.2200
Epoch: 2/15 | Train Acc: 94.65% | Valid Acc: 89.15%
Epoch: 3/15 | Batch 0/266 | Loss: 0.1947
Epoch: 3/15 | Batch 50/266 | Loss: 0.1602
Epoch: 3/15 | Batch 100/266 | Loss: 0.2621
Epoch: 3/15 | Batch 150/266 | Loss: 0.1707
Epoch: 3/15 | Batch 200/266 | Loss: 0.1627
Epoch: 3/15 | Batch 250/266 | Loss: 0.1388
Epoch: 3/15 | Train Acc: 97.41% | Valid Acc: 89.05%
Epoch: 4/15 | Batch 0/266 | Loss: 0.0993
Epoch: 4/15 | Batch 50/266 | Loss: 0.0

In [30]:
# Final evaluation on the held-out test split (accuracy in percent).
test_acc = compute_accuracy(model, test_loader)
print(f'Test Acc: {test_acc:.2f}%')

  texts = torch.nn.utils.rnn.pad_sequence([torch.tensor(text) for text in texts], batch_first=True, padding_value=vocab['<pad>'])


Test Acc: 88.94%


In [37]:
# NOTE(review): `nlp` is not referenced by any code visible in this notebook;
# predict_sentiment below uses the module-level `tokenizer` instead — confirm
# whether this line is still needed.
nlp = spacy.blank("en")

def predict_sentiment(model, sentence):
    """Return the model's softmax probability for class index 1 on ``sentence``.

    The sentence is tokenized with the module-level ``tokenizer``, mapped to
    ids through ``vocab`` (unknown tokens use the '<unk>' id) and passed to
    the model as a batch of one. The example cell below reports this value as
    the positive-sentiment probability.

    Args:
        model: callable module taking ``(token_ids, lengths)``; it is put
            into eval mode and left there.
        sentence: raw input string.

    Returns:
        float probability of class index 1.

    Raises:
        ValueError: if ``sentence`` yields no tokens. A zero-length sequence
            cannot be packed for the LSTM, so fail early with a clear message.
    """
    model.eval()
    tokens = tokenizer(sentence)
    indices = [vocab.get(token, vocab['<unk>']) for token in tokens]
    if not indices:
        raise ValueError("predict_sentiment() needs a sentence with at least one token")

    tensor = torch.tensor(indices).unsqueeze(0).to(DEVICE)
    length = torch.tensor([len(indices)]).to(DEVICE)

    # Inference only: skip autograd bookkeeping for the forward pass.
    with torch.no_grad():
        output = model(tensor, length)
        prob = torch.nn.functional.softmax(output, dim=1)
    return prob[0][1].item()

# Example Predictions
for sentence in ("This is an awesome movie!", "I hated this movie."):
    print(f"{sentence} ->", "Positive Sentiment probability:", predict_sentiment(model, sentence))

This is an awesome movie! -> Positive Sentiment probability: 0.9999961853027344
I hated this movie. -> Positive Sentiment probability: 1.108322635445802e-06
