<a href="https://colab.research.google.com/github/anujdutt9/PyTorch-DeepLearning/blob/master/RNN_Text_Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Import Dependencies
import numpy as np
import torch
from torchtext import data, datasets

In [2]:
# Set Initial Seed
torch.manual_seed(101)

<torch._C.Generator at 0x7f8b8a417030>

In [3]:
# Set Device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [4]:
# Pre-Process review by tokenizing it using spaCy
# Lowercasing the text, just in case
# https://torchtext.readthedocs.io/en/latest/data.html#torchtext.data.Field
text = data.Field(lower=True, tokenize='spacy')
text

<torchtext.data.field.Field at 0x7f8b89ee6358>

In [5]:
# Define Label using LabelField with data type float
label = data.LabelField(dtype=torch.float)
label

<torchtext.data.field.LabelField at 0x7f8b1b97efd0>

In [6]:
# Download the dataset and split into train and test data

train, test = datasets.IMDB.splits(text, label)

print("\nTraining Samples: ", len(train))
print("Test Samples: ", len(test))

downloading aclImdb_v1.tar.gz


aclImdb_v1.tar.gz: 100%|██████████| 84.1M/84.1M [00:07<00:00, 11.8MB/s]



Training Samples:  25000
Test Samples:  25000


In [7]:
# Check an example from training examples
print(vars(train.examples[5]))

{'text': ['i', 'admit', 'i', 'have', 'a', 'weakness', 'for', 'alternate', 'history', 'stories', ',', 'from', 'its', 'a', 'wonderful', 'life', 'to', 'groundhog', 'day', 'to', '12:01', '.', 'among', 'those', 'greats', 'is', 'this', 'little', 'gem', '.', 'it', "'s", 'pretty', 'difficult', 'to', 'get', 'through', 'mr', '.', 'destiny', 'without', 'giving', 'a', 'nod', 'of', 'appreciation', 'to', 'each', 'and', 'every', 'cast', 'member', ',', 'from', 'the', 'goodhearted', 'james', 'belushi', 'to', 'the', 'murderous', 'courtney', 'cox', '.', 'this', 'movie', 'lacks', 'the', 'gravitas', 'and', 'scale', 'to', 'make', 'it', 'a', 'great', 'film', ',', 'but', 'it', "'s", 'a', 'fine', 'cheer', '-', 'up', 'on', 'a', 'rainy', 'afternoon', '.', 'it', "'s", 'also', 'a', 'great', 'rental', 'for', 'an', 'inexpensive', 'date', '.'], 'label': 'pos'}


In [8]:
import random

# Train Validation Split the Data
# https://torchtext.readthedocs.io/en/latest/data.html#torchtext.data.Dataset.split
train_data, valid_data = train.split(split_ratio=0.8, random_state=random.seed(101))

print("Train-Validation Split of 80:20.\n")
print("Training Samples: ", len(train_data))
print("Validation Samples: ", len(valid_data))
print("Test Samples: ", len(test))

Train-Validation Split of 80:20.

Training Samples:  20000
Validation Samples:  5000
Test Samples:  25000


In [0]:
# Create the vocabulary
vocab_size = 25000

# Build Vocabulary Fields for Reviews and Labels
text.build_vocab(train_data, max_size=vocab_size)
label.build_vocab(train_data)

In [10]:
# Unique Tokens in vocabulary
# Unique Tokens: 25000 words + 2 others [<unk>, <pad>]
print("Unique tokens in text vocabulary: ", len(text.vocab))

# Unique Labels: 'pos', 'neg'
print("Unique tokens in label vocabulary: ", len(label.vocab))

Unique tokens in text vocabulary:  25002
Unique tokens in label vocabulary:  2


In [11]:
# Top 10 most common words in the vocabulary
print(text.vocab.freqs.most_common(10))

[('the', 263394), (',', 220727), ('.', 189074), ('and', 130399), ('a', 129605), ('of', 116481), ('to', 108524), ('is', 88002), ('it', 74723), ('in', 74467)]


In [12]:
# Vocabulary: int to string
print(text.vocab.itos[:20])

['<unk>', '<pad>', 'the', ',', '.', 'and', 'a', 'of', 'to', 'is', 'it', 'in', 'i', 'this', 'that', '"', "'s", '-', '/><br', 'was']


In [13]:
# Vocabulary: string to int
print(text.vocab.stoi)



In [14]:
# Example: 'the' appears at 2nd index in vocabulary
print(text.vocab.stoi["the"])

2


In [15]:
# Vocabulary: String to Int
print(label.vocab.stoi)

defaultdict(<function _default_unk_index at 0x7f8b89f1c378>, {'pos': 0, 'neg': 1})


In [0]:
# BucketIterator: Iterator for going over batches of data of similar length
# https://torchtext.readthedocs.io/en/latest/data.html#torchtext.data.BucketIterator

# Batch Size
batch_size = 64

train_iterator, validation_iterator, test_iterator = data.BucketIterator.splits(datasets=(train_data, valid_data, test),
                                                                                batch_sizes=(batch_size, batch_size, batch_size),
                                                                                device=device)

In [17]:
print("Vocab length: ", len(text.vocab))

Vocab length:  25002


In [0]:
import torch.nn as nn

class RNN(nn.Module):
  def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
    super().__init__()

    # input_dim: [sentence_length, batch_size]

    # Embedding Layer
    # Takes in One Hot Encoded Text tensor from BucketIterator and gives the
    # embeddings for the text [word2vec] of size [sentence_length, batch_size, embedding_dim]
    self.embedding = nn.Embedding(input_dim, embedding_dim)

    # RNN Layer
    # Input Size: [sentence_length, batch_size, embedding_dim]
    # Outputs: 
    # output: [sentence length, batch size, hidden dim]
    # hidden: [1, batch size, hidden dim]
    self.rnn = nn.RNN(embedding_dim, hidden_size=hidden_dim)

    # Fully Connected Layer
    # Input Size: [1, batch size, hidden dim]
    # Output Size: 1
    self.fc = nn.Linear(hidden_dim, output_dim)

  def forward(self, text):
     # Text = [sentence_len, batch size]
     # comes from training/validation/test_iterator
     embedded = self.embedding(text)
     
     # Embedded = [sentence_length, batch_size, embedding_dim]
     output, hidden = self.rnn(embedded)
        
     # output = [sentence length, batch size, hidden dim]
     # hidden = [1, batch size, hidden dim]
     assert torch.equal(output[-1,:,:], hidden.squeeze(0))
     
     return self.fc(hidden.squeeze(0))

In [0]:
# Input Dimension is the dimension of the One Hot Vectors which is
# same as the vocabulary size i.e. 25002
input_dim = len(text.vocab)

# Embedding Dimension is the size of the dense word vectors, usually between
# 50-250.
embedding_dim = 100

# RNN Hidden Dimension is the size of the hidden states
hidden_dim = 256

# Output Dimension is the number of classes, in our case it can be a single 
# scalar value i.e. between 0 or 1.
output_dim = 1

In [20]:
# Initialize the Model
model = RNN(input_dim=input_dim,
            embedding_dim=embedding_dim,
            hidden_dim=hidden_dim,
            output_dim=output_dim)

print(model)

RNN(
  (embedding): Embedding(25002, 100)
  (rnn): RNN(100, 256)
  (fc): Linear(in_features=256, out_features=1, bias=True)
)


In [21]:
# Number of Trainable Parameters
def count_parameters(model):
  return sum(p.numel() for p in model.parameters() if p.requires_grad)

print("Number of Trainable Parameters: ", count_parameters(model))

Number of Trainable Parameters:  2592105


In [0]:
# Model Optimizer
from torch.optim import Adam

optimizer = Adam(model.parameters(), lr=1e-3)

In [0]:
# Loss Function
# Binary Cross Entropy with Logits Loss

# Our model currently outputs an unbound real number. As our labels are either 0 or 1, 
# we want to restrict the predictions to a number between 0 and 1. We do this using the sigmoid or logit functions.

# We then use this this bound scalar to calculate the loss using binary cross entropy.

# The BCEWithLogitsLoss criterion carries out both the sigmoid and the binary cross entropy steps.
criterion = nn.BCEWithLogitsLoss()

In [0]:
# Send Model and Loss to Device
model = model.to(device)
criterion = criterion.to(device)

In [0]:
# Function to Calculate Accuracy
# Returns Accuracy per batch
def binary_accuracy(preds, y):
  # Round predictions to closest integer
  rounded_preds = torch.round(torch.sigmoid(preds))
  correct = (rounded_preds == y).float()
  accuracy = correct.sum() / len(correct)
  return accuracy

In [0]:
# Train Function
def train(model, iterator, optimizer, criterion):
  epoch_loss = 0
  epoch_acc = 0

  model.train()

  for batch in iterator:
    # Clear the gradients for optimizer
    optimizer.zero_grad()

    # Make Predictions on a batch of text from training data
    # squeeze is needed as the predictions are initially size [batch size, 1], 
    # and we need to remove the dimension of size 1 as PyTorch expects the 
    # predictions input to our criterion function to be of size [batch size].
    predictions = model.forward(batch.text).squeeze(1)

    # Calculate Loss
    loss = criterion(predictions, batch.label)

    # Accuracy
    acc = binary_accuracy(predictions, batch.label)

    # Backpropagation
    loss.backward()

    # Take one step
    optimizer.step()

    # Loss per epoch
    epoch_loss += loss.item()

    # Accuracy per Epoch
    epoch_acc += acc.item()

  # Return Loss and Accuracy Percentage
  return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [0]:
# Train Function
def evaluate(model, iterator, criterion):
  epoch_loss = 0
  epoch_acc = 0

  model.eval()

  # No Gradients, no backprop
  with torch.no_grad():
    for batch in iterator:
      # Make Predictions on a batch of text from training data
      # squeeze is needed as the predictions are initially size [batch size, 1], 
      # and we need to remove the dimension of size 1 as PyTorch expects the 
      # predictions input to our criterion function to be of size [batch size].
      predictions = model.forward(batch.text).squeeze(1)

      # Calculate Loss
      loss = criterion(predictions, batch.label)

      # Accuracy
      acc = binary_accuracy(predictions, batch.label)

      # Loss per epoch
      epoch_loss += loss.item()

      # Accuracy per Epoch
      epoch_acc += acc.item()

  # Return Loss and Accuracy Percentage
  return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [0]:
# Epoch: One epoch is a complete pass through all examples in the training and 
# validation sets
epochs = 10

In [0]:
# Validation Loss
best_valid_loss = float('inf')

In [30]:
for epoch in range(epochs):
  # Training Loss and Accuracy
  train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
  # Validation Loss and Accuracy
  valid_loss, valid_acc = evaluate(model, validation_iterator, criterion)

  if (valid_loss < best_valid_loss):
    best_valid_loss = best_valid_loss
    torch.save(model.state_dict(), 'sentiment-rnn.pt')
  
  print("Epoch: {0}\tTrain Loss: {1}\tTrain Accuracy: {2}\tValidation Loss: {3}\tValidation Accuracy: {4}".format(epoch, train_loss, train_acc, valid_loss, valid_acc))

Epoch: 0	Train Loss: 0.6976903492269425	Train Accuracy: 0.4976038338658147	Validation Loss: 0.7031270755997187	Validation Accuracy: 0.5110759493670886
Epoch: 1	Train Loss: 0.7007267473223872	Train Accuracy: 0.5060902555910544	Validation Loss: 0.6996226288095305	Validation Accuracy: 0.4907041139240506
Epoch: 2	Train Loss: 0.6973077401566429	Train Accuracy: 0.5012979233226837	Validation Loss: 0.7026046884210804	Validation Accuracy: 0.4970332278481013
Epoch: 3	Train Loss: 0.6979724778154026	Train Accuracy: 0.49755391373801916	Validation Loss: 0.6951745620256737	Validation Accuracy: 0.4907041139240506
Epoch: 4	Train Loss: 0.6974038439817702	Train Accuracy: 0.5069388977635783	Validation Loss: 0.6945974049688894	Validation Accuracy: 0.5045490506329114
Epoch: 5	Train Loss: 0.6973511749943986	Train Accuracy: 0.4998502396166134	Validation Loss: 0.7056453273266177	Validation Accuracy: 0.49723101265822783
Epoch: 6	Train Loss: 0.696759494539267	Train Accuracy: 0.5014476837060703	Validation Loss: 0

In [31]:
# Final Metrics

# Load Saved Model
model.load_state_dict(torch.load('sentiment-rnn.pt'))
test_loss, test_acc = evaluate(model, test_iterator, criterion)
print("Test Loss: {0}\tTest Accuracy: {1}".format(test_loss, test_acc))

Test Loss: 0.6939971446990967	Test Accuracy: 0.5177349743940641
