In [None]:
!pip install torch pandas numpy scikit-learn




In [1]:
pip install datasets

Collecting datasets
  Downloading datasets-2.21.0-py3-none-any.whl.metadata (21 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-2.21.0-py3-none-any.whl (527 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m527.3/527.3 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (39.9 MB)
[2K

# **Preprocessing**

In [2]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset


In [3]:
from datasets import load_dataset


In [4]:
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
import re

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [5]:
#Load the dataset as a dataframe
dataset = load_dataset("imdb")
train = pd.DataFrame(dataset['train'])
test = pd.DataFrame(dataset['test'])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [6]:
# Convert sentiment labels to numeric
train['label'] = train['label'].apply(lambda x: 1 if x == 1 else 0)
test['label'] = test['label'].apply(lambda x: 1 if x == 1 else 0)

In [7]:
# Tokenize the reviews
def preprocess(text):
    text = re.sub(r'<[^>]+>', '', text)  # Remove HTML tags
    tokens = word_tokenize(text.lower())  # Lowercase and tokenize
    return tokens

In [8]:
train['text'] = train['text'].apply(preprocess)
test['text'] = test['text'].apply(preprocess)

# **Load GloVe embeddings**

In [11]:
import numpy as np
import urllib.request
import io
import zipfile

# Function to load GloVe embeddings from a file-like object
def load_glove_embeddings_from_file(file_like, embedding_dim):
    embeddings = {}
    for line in file_like:
        values = line.decode('utf-8').split()  # Decode bytes to string
        word = values[0]
        vector = np.asarray(values[1:], dtype='float32')
        embeddings[word] = vector
    return embeddings

# URL to the GloVe embeddings file
url = 'https://nlp.stanford.edu/data/glove.6B.zip'

# Download and extract the GloVe file
response = urllib.request.urlopen(url)
with io.BytesIO(response.read()) as file_stream:
    with zipfile.ZipFile(file_stream) as zip_file:
        # Extract the correct file from the archive (directly at the root level)
        with zip_file.open('glove.6B.100d.txt') as glove_file:  # Corrected path
            embedding_dim = 100
            glove_embeddings = load_glove_embeddings_from_file(glove_file, embedding_dim)


# **Create Vocabulary and Embedding Matrix**

In [13]:
# Create Vocabulary form training data
vocab = set([word for review in train['text'] for word in review])
word_to_idx = {word: i+1 for i, word in enumerate(vocab)}  # +1 for padding
idx_to_word = {i: word for word, i in word_to_idx.items()}

# Create embedding matrix
embedding_matrix = np.zeros((len(vocab) + 1, embedding_dim))
for word, i in word_to_idx.items():
    embedding_vector = glove_embeddings.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

# **Custom Dataset and DataLoader**

In [14]:
class IMDBDataset(Dataset):
    def __init__(self, data, word_to_idx, max_length=100):
        self.data = data
        self.word_to_idx = word_to_idx
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        review = self.data.iloc[idx]['text']
        label = self.data.iloc[idx]['label']
        encoded_review = [self.word_to_idx.get(word, 0) for word in review]  # Convert words to indices
        padded_review = encoded_review[:self.max_length] + [0] * (self.max_length - len(encoded_review))  # Pad or truncate
        return torch.tensor(padded_review), torch.tensor(label)

# Create DataLoader
train_dataset = IMDBDataset(train, word_to_idx)
test_dataset = IMDBDataset(test, word_to_idx)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)


# **Define Models** **bold text**

In [15]:
# RNN

class RNNModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, embedding_matrix):
        super(RNNModel, self).__init__()
        self.embedding = nn.Embedding.from_pretrained(torch.FloatTensor(embedding_matrix), freeze=False)
        self.rnn = nn.RNN(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        embedded = self.embedding(x)
        output, hidden = self.rnn(embedded)
        return self.fc(hidden[-1])

In [16]:
# LSTM

class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, embedding_matrix):
        super(LSTMModel, self).__init__()
        self.embedding = nn.Embedding.from_pretrained(torch.FloatTensor(embedding_matrix), freeze=False)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        embedded = self.embedding(x)
        output, (hidden, cell) = self.lstm(embedded)
        return self.fc(hidden[-1])

# **Train and Evaluate Models**

In [17]:
def train_model(model, train_loader, optimizer, criterion, n_epochs=5):
    model.train()
    for epoch in range(n_epochs):
        epoch_loss = 0
        epoch_acc = 0
        for inputs, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs.squeeze(), labels.float())
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
        print(f'Epoch {epoch+1}/{n_epochs}, Loss: {epoch_loss/len(train_loader):.4f}')


In [18]:
# Initialize and Train RNN Model
rnn_model = RNNModel(len(vocab) + 1, embedding_dim, 128, 1, embedding_matrix)
optimizer = optim.Adam(rnn_model.parameters(), lr=0.001)
criterion = nn.BCEWithLogitsLoss()

train_model(rnn_model, train_loader, optimizer, criterion)

Epoch 1/5, Loss: 0.6889
Epoch 2/5, Loss: 0.6774
Epoch 3/5, Loss: 0.6655
Epoch 4/5, Loss: 0.6432
Epoch 5/5, Loss: 0.5991


In [19]:
# Initialize and Train LSTM Model
lstm_model = LSTMModel(len(vocab) + 1, embedding_dim, 128, 1, embedding_matrix)
optimizer = optim.Adam(lstm_model.parameters(), lr=0.001)

train_model(lstm_model, train_loader, optimizer, criterion)

Epoch 1/5, Loss: 0.6457
Epoch 2/5, Loss: 0.4364
Epoch 3/5, Loss: 0.2407
Epoch 4/5, Loss: 0.1035
Epoch 5/5, Loss: 0.0875


# **Define Models with On-the-Fly Embeddings**

In [20]:
# RNN Model with On-the-Fly Embeddings
class RNNModelOnTheFly(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super(RNNModelOnTheFly, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)  # On-the-fly embeddings
        self.rnn = nn.RNN(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        embedded = self.embedding(x)
        output, hidden = self.rnn(embedded)
        return self.fc(hidden[-1])

In [21]:
# LSTM Model with On-the-Fly Embeddings
class LSTMModelOnTheFly(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super(LSTMModelOnTheFly, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)  # On-the-fly embeddings
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        embedded = self.embedding(x)
        output, (hidden, cell) = self.lstm(embedded)
        return self.fc(hidden[-1])

# **Train the models**

In [22]:
# Initialize and Train On-the-Fly RNN Model
rnn_model_on_the_fly = RNNModelOnTheFly(len(vocab) + 1, embedding_dim, 128, 1)
optimizer = optim.Adam(rnn_model_on_the_fly.parameters(), lr=0.001)

train_model(rnn_model_on_the_fly, train_loader, optimizer, criterion)

Epoch 1/5, Loss: 0.6952
Epoch 2/5, Loss: 0.6742
Epoch 3/5, Loss: 0.6599
Epoch 4/5, Loss: 0.6600
Epoch 5/5, Loss: 0.6332


In [23]:
# Initialize and Train On-the-Fly LSTM Model
lstm_model_on_the_fly = LSTMModelOnTheFly(len(vocab) + 1, embedding_dim, 128, 1)
optimizer = optim.Adam(lstm_model_on_the_fly.parameters(), lr=0.001)

train_model(lstm_model_on_the_fly, train_loader, optimizer, criterion)

Epoch 1/5, Loss: 0.6890
Epoch 2/5, Loss: 0.6292
Epoch 3/5, Loss: 0.4366
Epoch 4/5, Loss: 0.3107
Epoch 5/5, Loss: 0.2168


# **Evaluate the Models**

In [24]:
def evaluate_model(model, test_loader):
    model.eval()
    predictions = []
    true_labels = []
    with torch.no_grad():
        for inputs, labels in test_loader:
            outputs = model(inputs)
            preds = torch.round(torch.sigmoid(outputs.squeeze()))  # Sigmoid for binary classification
            predictions.extend(preds.tolist())
            true_labels.extend(labels.tolist())

    accuracy = accuracy_score(true_labels, predictions)
    return accuracy

In [25]:
# Evaluate each model
rnn_acc = evaluate_model(rnn_model, test_loader)
lstm_acc = evaluate_model(lstm_model, test_loader)
rnn_fly_acc = evaluate_model(rnn_model_on_the_fly, test_loader)
lstm_fly_acc = evaluate_model(lstm_model_on_the_fly, test_loader)

print(f'RNN with GloVe Accuracy: {rnn_acc:.4f}')
print(f'LSTM with GloVe Accuracy: {lstm_acc:.4f}')
print(f'RNN with On-the-Fly Embeddings Accuracy: {rnn_fly_acc:.4f}')
print(f'LSTM with On-the-Fly Embeddings Accuracy: {lstm_fly_acc:.4f}')

RNN with GloVe Accuracy: 0.5328
LSTM with GloVe Accuracy: 0.7568
RNN with On-the-Fly Embeddings Accuracy: 0.5614
LSTM with On-the-Fly Embeddings Accuracy: 0.7872


# **Conclusion**

Effectiveness of GloVe Embedding:

*   The LSTM model using pre-trained GloVe embeddings achieved an accuracy of 75.68%, while the Vanilla RNN with GloVe embeddings achieved a lower accuracy of 53.28%.

*   LSTMs, which are capable of handling long-term dependencies better than Vanilla RNNs, benefit significantly from using pre-trained word embeddings like GloVe.

Performance with on the fly Embeddings:


*   The LSTM model with on-the-fly embeddings reached an accuracy of 78.72%, outperforming all other models. Similarly, the Vanilla RNN with on-the-fly embeddings had a higher accuracy (56.14%) than its counterpart using GloVe embeddings.
*   This suggests that when the model learns the word embeddings from scratch (on-the-fly), it can fine-tune the embeddings to the specific characteristics of the dataset.

Comparison of RNN and LSTM Architectures


*   Across both GloVe and on-the-fly embeddings, LSTM models consistently outperformed Vanilla RNNs.
*   This shows the advantages of LSTMs in capturing long-term dependencies in sequential data, such as movie reviews, which often contain long sentences and complex structures.





