#### 1. Library installations

In [18]:
import subprocess

# Install spacy
subprocess.run(["pip", "install", "spacy", "--quiet"])

# Download the English language model for spacy
subprocess.run(["python", "-m", "spacy", "download", "en_core_web_sm", "--quiet"])

print("Spacy and en_core_web_sm downloaded successfully.")

# Install torchtext==0.6.0 as it is an available version that can be used with the legacy API structure (but without the 'legacy' submodule).
get_ipython().system('pip install torchtext==0.6.0 --quiet')

print("torchtext 0.6.0 installation initiated.")



[38;5;2mâœ” Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
Spacy and en_core_web_sm downloaded successfully.
[0mtorchtext 0.6.0 installation initiated.


#### 2. Imports

In [19]:
import random
import torch

from torchtext import data, datasets
import spacy
import torch.nn as nn


#### 3. Global configurations

In [20]:
# Set deterministic behavior
RANDOM_SEED = 100
random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
torch.backends.cudnn.deterministic = True

# Define Fields using SpaCy (en_core_web_sm)
TEXT = data.Field(
    tokenize='spacy',
    tokenizer_language='en_core_web_sm',
    batch_first=True,
    include_lengths=True
)
LABEL = data.LabelField(dtype=torch.float)

# Dataset partition sizes
TRAIN_SIZE = 1000
VALID_SIZE = 400
TEST_SIZE  = 50

# Vocabulary size
MAX_VOCAB_SIZE = 25_000

# Batch size
BATCH_SIZE = 64

# Compute device specification
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# LSTM Network configs
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 1
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.5
N_EPOCHS = 10



#### 4. Load the full IMDB dataset

In [21]:
train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)
print(f"Number of training examples: {len(train_data)}")
print(f"Number of testing examples: {len(test_data)}")
print(f"First training example text: {train_data.examples[0].text[:50]}...")
print(f"First training example label: {train_data.examples[0].label}")

Number of training examples: 25000
Number of testing examples: 25000
First training example text: ['I', 'loved', 'this', 'movie', '!', 'It', "'s", 'truly', 'bizarre', ',', 'extremely', 'funny', ',', 'morbid', ',', 'witty', '...', 'It', 'makes', 'no', 'sense', 'to', 'tell', 'about', 'the', 'contents', 'of', 'the', 'movie', ',', 'because', 'then', 'I', "'d", 'be', 'giving', 'out', 'the', 'outcome', '!', 'You', 'have', 'to', 'see', 'it', 'without', 'knowing', 'what', 'is', 'it']...
First training example label: pos


#### 5. Create randomized sub-samples

In [22]:
def create_data_sub_samples():
  """
  Creates dataset sub-samples for train, validation, and test sets.

  Returns:
  - train_subset: Subset of the training data.
  - valid_subset: Subset of the validation data.
  - test_subset: Subset of the test data.
  """

  # Shuffle the original datasets
  random.shuffle(train_data.examples)
  random.shuffle(test_data.examples)

  # Split the datasets
  train_samples = train_data.examples[:TRAIN_SIZE]
  valid_samples = train_data.examples[TRAIN_SIZE:TRAIN_SIZE + VALID_SIZE]
  test_samples  = test_data.examples[:TEST_SIZE]

  # Create new Dataset objects
  train_subset = data.Dataset(train_samples, fields=train_data.fields)
  valid_subset = data.Dataset(valid_samples, fields=train_data.fields)
  test_subset  = data.Dataset(test_samples,  fields=test_data.fields)

  return train_subset, valid_subset, test_subset

train_subset, valid_subset, test_subset = create_data_sub_samples()
print("Dataset sub-sets after sampling:")
print(f"Number of training examples: {len(train_subset)}")
print(f"Number of validation examples: {len(valid_subset)}")
print(f"Number of testing examples: {len(test_subset)}")

Dataset sub-sets after sampling:
Number of training examples: 1000
Number of validation examples: 400
Number of testing examples: 50


#### 6. Build Vocabulary (train only)

In [23]:
TEXT.build_vocab(
    train_subset,
    max_size=MAX_VOCAB_SIZE,
    vectors="glove.6B.100d",
    unk_init=torch.Tensor.normal_
)

LABEL.build_vocab(train_subset)



#### 7. Create iterators

In [24]:
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_subset, valid_subset, test_subset),
    batch_size=BATCH_SIZE,
    sort_within_batch=True,
    sort_key=lambda x: len(x.text),
    device=DEVICE
)

#### 7. Sanity check a batch

In [25]:
def check_batch(iterator):
  batch = next(iter(iterator))

  text, text_lengths = batch.text
  labels = batch.label

  print(f"text.shape: {text.shape}")                  # [batch_size, seq_len]
  print(f"text_lengths.shape: {text_lengths.shape}")  # [batch_size]
  print(f"labels.shape: {labels.shape}")              # [batch_size]

check_batch(train_iterator)

text.shape: torch.Size([64, 87])
text_lengths.shape: torch.Size([64])
labels.shape: torch.Size([64])


#### 8. Bi-Directional LSTM Model (with packed sequences)

In [26]:
class BiLSTMSentiment(nn.Module):
    def __init__(
        self,
        vocab_size,
        embedding_dim,
        hidden_dim,
        output_dim,
        n_layers,
        bidirectional,
        dropout,
        pad_idx
    ):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)

        self.lstm = nn.LSTM(
            embedding_dim,
            hidden_dim,
            num_layers=n_layers,
            bidirectional=bidirectional,
            dropout=dropout if n_layers > 1 else 0
        )

        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text, text_lengths):
        # text = [batch size, seq len]
        embedded = self.dropout(self.embedding(text))
        # embedded = [batch size, seq len, emb dim]

        embedded = embedded.permute(1, 0, 2)
        # embedded = [seq len, batch size, emb dim]

        packed_embedded = nn.utils.rnn.pack_padded_sequence(
            embedded,
            text_lengths.cpu(),
            enforce_sorted=False
        )

        packed_output, (hidden, cell) = self.lstm(packed_embedded)

        # hidden = [num layers * num directions, batch size, hidden dim]

        hidden_forward = hidden[-2, :, :]
        hidden_backward = hidden[-1, :, :]

        hidden_cat = torch.cat((hidden_forward, hidden_backward), dim=1)

        return self.fc(self.dropout(hidden_cat))


#### 9. Model Instantiation

In [27]:
INPUT_DIM = len(TEXT.vocab)
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

model = BiLSTMSentiment(
    INPUT_DIM,
    EMBEDDING_DIM,
    HIDDEN_DIM,
    OUTPUT_DIM,
    N_LAYERS,
    BIDIRECTIONAL,
    DROPOUT,
    PAD_IDX
).to(DEVICE)


#### 10. Load Pretrained GloVe Embeddings

In [28]:
model.embedding.weight.data.copy_(TEXT.vocab.vectors)
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)


#### 11. Optimizer & Loss

In [29]:
import torch.optim as optim

optimizer = optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.BCEWithLogitsLoss()

criterion = criterion.to(DEVICE)


#### 12. Training & Evaluation Functions

In [30]:
def binary_accuracy(preds, y):
    rounded = torch.round(torch.sigmoid(preds))
    correct = (rounded == y).float()
    return correct.sum() / len(correct)


#### 13. Training Loop

In [31]:
def train(model, iterator, optimizer, criterion):
    epoch_loss = 0
    epoch_acc = 0

    model.train()

    for batch in iterator:
        optimizer.zero_grad()

        text, text_lengths = batch.text
        predictions = model(text, text_lengths).squeeze(1)

        loss = criterion(predictions, batch.label)
        acc = binary_accuracy(predictions, batch.label)

        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)


#### 14. Validation (evaluation) Loop

In [32]:
def evaluate(model, iterator, criterion):
    epoch_loss = 0
    epoch_acc = 0

    model.eval()

    with torch.no_grad():
        for batch in iterator:
            text, text_lengths = batch.text
            predictions = model(text, text_lengths).squeeze(1)

            loss = criterion(predictions, batch.label)
            acc = binary_accuracy(predictions, batch.label)

            epoch_loss += loss.item()
            epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)


#### 15. Model Training - training / validation loops

In [33]:
def train_model(
    model_,
    train_iterator_,
    valid_iterator_,
    optimizer_,
    criterion_,
    n_epochs=N_EPOCHS):

    print(f"Start of the training-validation loop..")

    train_losses, val_losses = [], []
    train_accs, val_accs = [], []

    for epoch in range(n_epochs):
        print(f"Epoch {epoch+1}/{n_epochs}")
        train_loss, train_acc = train(model_, train_iterator_, optimizer_, criterion_)
        val_loss, val_acc = evaluate(model_, valid_iterator_, criterion_)

        train_losses.append(train_loss)
        val_losses.append(val_loss)
        train_accs.append(train_acc)
        val_accs.append(val_acc)

        print(f"Train Loss: {train_loss:.3f} | Train Acc: {train_acc:.3f}")
        print(f"Val   Loss: {val_loss:.3f} | Val   Acc: {val_acc:.3f}")

    return train_losses, val_losses, train_accs, val_accs

train_losses, val_losses, train_accs, val_accs = train_model(
    model,
    train_iterator,
    valid_iterator,
    optimizer,
    criterion)

Start of the training-validation loop..
Epoch 1/10


Train Loss: 0.695 | Train Acc: 0.508
Val   Loss: 0.697 | Val   Acc: 0.491
Epoch 2/10
Train Loss: 0.689 | Train Acc: 0.543
Val   Loss: 0.686 | Val   Acc: 0.549
Epoch 3/10
Train Loss: 0.677 | Train Acc: 0.587
Val   Loss: 0.659 | Val   Acc: 0.623
Epoch 4/10
Train Loss: 0.651 | Train Acc: 0.619
Val   Loss: 0.639 | Val   Acc: 0.665
Epoch 5/10
Train Loss: 0.603 | Train Acc: 0.695
Val   Loss: 0.818 | Val   Acc: 0.565
Epoch 6/10
Train Loss: 0.576 | Train Acc: 0.706
Val   Loss: 0.613 | Val   Acc: 0.661
Epoch 7/10
Train Loss: 0.520 | Train Acc: 0.753
Val   Loss: 0.622 | Val   Acc: 0.647
Epoch 8/10
Train Loss: 0.494 | Train Acc: 0.773
Val   Loss: 0.597 | Val   Acc: 0.667
Epoch 9/10
Train Loss: 0.450 | Train Acc: 0.800
Val   Loss: 0.571 | Val   Acc: 0.710
Epoch 10/10
Train Loss: 0.431 | Train Acc: 0.807
Val   Loss: 0.665 | Val   Acc: 0.661
