In [0]:
import os
import cv2
import time
import numpy as np
import random
from string import punctuation
from collections import Counter
from statistics import mean, stdev

import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from PIL import Image

import torch
import torchvision
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, models, transforms
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torch.nn.functional as F

###Download the IMDB Reviews dataset

In [0]:
%%capture
if not os.path.isdir('aclImdb'):
    !wget http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
    !tar -xvf 'aclImdb_v1.tar.gz'
    !sudo rm -r 'aclImdb_v1.tar.gz'

### The Readme

In [3]:
with open('aclImdb/README', 'r') as f:
    readme = f.read()
    print(readme)

Large Movie Review Dataset v1.0

Overview

This dataset contains movie reviews along with their associated binary
sentiment polarity labels. It is intended to serve as a benchmark for
sentiment classification. This document outlines how the dataset was
gathered, and how to use the files provided. 

Dataset 

The core dataset contains 50,000 reviews split evenly into 25k train
and 25k test sets. The overall distribution of labels is balanced (25k
pos and 25k neg). We also include an additional 50,000 unlabeled
documents for unsupervised learning. 

In the entire collection, no more than 30 reviews are allowed for any
given movie because reviews for the same movie tend to have correlated
ratings. Further, the train and test sets contain a disjoint set of
movies, so no significant performance is obtained by memorizing
movie-unique terms and their associated with observed labels.  In the
labeled train/test sets, a negative review has a score <= 4 out of 10,
and a positive review has a scor

###Dataset Generation and preprocessing

In [0]:
class dataset_from_directory(Dataset):
    
    def __init__(self,root_dir, seq_length = 250):
        
        self.reviews = []
        self.labels = []

        for filename in os.listdir(os.path.join(root_dir,'pos')):
            review = open(os.path.join(os.path.join(root_dir,'pos'),filename), 'r').read().lower().replace('<br />',' ')
            review = "".join([char for char in review if char not in punctuation])
            self.reviews.append(review)
            self.labels.append(1)

        for filename in os.listdir(os.path.join(root_dir,'neg')):
            review = open(os.path.join(os.path.join(root_dir,'neg'),filename), 'r').read().lower().replace('<br />',' ')
            review = "".join([char for char in review if char not in punctuation])
            self.reviews.append(review)
            self.labels.append(0)

        # Tokenize — Create Vocab to Int mapping dictionary
        words = ' '.join(self.reviews).split()
        self.count_words = Counter(words)
        total_words = len(words)
        self.sorted_words = self.count_words.most_common(total_words)

        self.tokens = {word:i+1 for i, (word,count) in enumerate(self.sorted_words)}

        # Encode the words using tokens
        self.reviews_tokenized = []
        for review in self.reviews:
            token = [self.tokens[word] for word in review.split()]
            self.reviews_tokenized.append(token)

        self.reviews_len = [len(review) for review in self.reviews_tokenized]

        # Removing Outliers (getting rid of reviews with lenth 0)
        self.reviews_tokenized = [self.reviews_tokenized[i] for i, l in enumerate(self.reviews_len) if l>0]
        self.labels = [self.labels[i] for i, l in enumerate(self.reviews_len) if l>0]

        # To deal with both short and long reviews, we will pad or truncate all our reviews to a specific length.
        self.reviews_tokenized = self.pad_features(self.reviews_tokenized, seq_length)

    def pad_features(self,reviews_int, seq_length):
    # Return features of review_ints, where each review is padded with 0's or truncated to the input seq_length.
    
        features = np.zeros((len(reviews_int), seq_length), dtype = int)
        
        for i, review in enumerate(reviews_int):
            review_len = len(review)
            
            if review_len <= seq_length:
                zeroes = list(np.zeros(seq_length-review_len))
                padded_review = zeroes+review
            elif review_len > seq_length:
                padded_review = review[0:seq_length]
            
            features[i,:] = np.array(padded_review)
        return features
    
    def __len__(self):
        return len(self.reviews_tokenized)
    
    def __getitem__(self,index):  
        return self.reviews_tokenized[index], np.array(self.labels[index])

    def word_count(self):
        print(self.count_words)  

    def vocab_to_int(self):
        return self.tokens

    def get_reviews(self,index):
        return self.reviews[index] 

    def get_vocab_len(self):
        return len(self.count_words)

    def analyze_reviews(self):
        print('Max: {}'.format(max(self.reviews_len)))
        print('Min: {}'.format(min(self.reviews_len)))
        print('Mean: {}'.format(mean(self.reviews_len)))
        print('Std Dev: {}'.format(stdev(self.reviews_len)))
        

In [5]:
train_dataset = dataset_from_directory('aclImdb/train', seq_length = 250)

print('Total number of Reviews: {}'.format(len(train_dataset)))
print('Number of Positives: {}'.format(train_dataset.labels.count(0)))
print('Number of Negatives: {}'.format(train_dataset.labels.count(1)))

Total number of Reviews: 25000
Number of Positives: 12500
Number of Negatives: 12500


In [7]:
review, label = train_dataset[0:5]
print(review)
print(label)
print(review.shape)
print(label.shape)

[[    0     0     0 ...   592  1951    94]
 [    0     0     0 ...   408    10    27]
 [    0     0     0 ...  7783  1100   183]
 [    0     0     0 ...    44    46 11227]
 [    0     0     0 ...     2 13364  3449]]
[1 1 1 1 1]
(5, 250)
(5,)


In [8]:
train_dataset.get_reviews(2)

'this first part of the brd trilogy has more passion and plot density than lola but less of the magic of veronica voss the political musings have point to them we see the shortages after the war how the blackmarketers were able to control so much of the daytoday life delicious moment when fassbinder playing a grifter tries to sell a complete set of kleist to schygulla who remarks that burning books dont provide much warmth she really wants firewood  theres some clumsiness in the first hour the scene in marias room with the black soldier interrupted by hermanns appearance should go quicker the train scene when maria meets karl oswald falls flat when she insults the gii cringed it was so bad but as the story develops and the years go by i was drawn more and more into this glossy cold world'

In [0]:
vocab_to_int = train_dataset.vocab_to_int()
vocab_len = train_dataset.get_vocab_len()

In [10]:
train_dataset.analyze_reviews()

Max: 2459
Min: 10
Mean: 230.58692
Std Dev: 171.33392780621796


In [11]:
len_valid_set = int(0.2*len(train_dataset))
len_train_set = len(train_dataset) - len_valid_set

print("The length of Train set is {}".format(len_train_set))
print("The length of Valid set is {}".format(len_valid_set))

train_dataset , valid_dataset = torch.utils.data.random_split(train_dataset, [len_train_set, len_valid_set])  

batch_size = 50

# shuffle and batch the datasets
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size,shuffle=True, drop_last=True)
valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size,shuffle=True, drop_last=True)

reviews, labels = next(iter(train_loader))

print(reviews.shape)
print(labels.shape)

The length of Train set is 20000
The length of Valid set is 5000
torch.Size([50, 250])
torch.Size([50])


In [12]:
reviews, labels = next(iter(valid_loader))

print(reviews.shape)
print(labels.shape)

torch.Size([50, 250])
torch.Size([50])


###Creating LSTM Network

In [0]:
class SentimentLSTM(nn.Module):

    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, num_layers, drop_prob=0.5):
        super().__init__()

        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        
        # embedding and LSTM layers
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, dropout=drop_prob, batch_first=True)
        
        # dropout layer
        self.dropout = nn.Dropout(0.5)
        
        # linear and sigmoid layers
        self.fc = nn.Linear(hidden_dim, output_size)
        self.sigmoid = nn.Sigmoid()
        

    def forward(self, x, h):
        """
        Perform a forward pass of our model on some input and hidden state.
        """
        batch_size = x.size(0)

        # embeddings and lstm_out
        x = self.embedding(x)

        lstm_out, h = self.lstm(x, h)   # lstm_out: [batch, len_review, hidden_dim]
    
        # stack up lstm outputs
        lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim)    # [batch * len_review, hidden_dim]

        # dropout and fully-connected layer
        out = self.dropout(lstm_out)
        out = self.fc(out)                   # [batch * len_review, 1]
        sig_out = self.sigmoid(out)

        # reshape to be batch_size first
        sig_out = sig_out.view(batch_size, -1)  # [batch, len_review]

        # get the last prediction for the complete batch
        pred = sig_out[:, -1]                   # [batch]

        return pred, h
    
    
    def init_hidden(self, batch_size):
        ''' Initializes hidden state '''
        # Create two new tensors with sizes n_layers x batch_size x hidden_dim,
        # initialized to zero, for hidden state and cell state of LSTM
        weight = next(self.parameters()).data
        
        hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda(),
                  weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda())
        
        return hidden

In [14]:
vocab_size = vocab_len + 1 # +1 for the 0 padding
output_size = 1
embedding_dim = 400
hidden_dim = 256
n_layers = 1

network = SentimentLSTM(vocab_size, output_size, embedding_dim, hidden_dim, n_layers).cuda()
print(network)

  "num_layers={}".format(dropout, num_layers))


SentimentLSTM(
  (embedding): Embedding(112012, 400)
  (lstm): LSTM(400, 256, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.5, inplace=False)
  (fc): Linear(in_features=256, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)


In [0]:
def find_acc(pred, label):
    pred = torch.round(pred.squeeze())  # rounds to the nearest integer
    correct = pred.eq(label.view_as(pred))
    accuracy = correct.to(torch.float32).mean().item()
    return accuracy * 100

In [16]:
num_epochs = 10
clip=5 # gradient clipping
loss_min = np.inf

criterion = nn.BCELoss()
optimizer = torch.optim.Adam(network.parameters(), lr=0.001)

for epoch in range(1,num_epochs+1):
    
    loss_train = 0
    loss_valid = 0
    running_loss = 0
    
    # set the network into train mode
    network.train()
    h = network.init_hidden(batch_size)

    for step in range(1,len(train_loader)+1):
    
        reviews, labels = next(iter(train_loader))

        reviews = reviews.cuda()
        labels = labels.cuda()

        h = tuple([each.data for each in h])

        # zero accumulated gradients
        network.zero_grad()

        # get the output from the model
        output, h = network(reviews,h)

        # calculate the loss and perform backprop
        loss_train_step = criterion(output.squeeze(), labels.float())
        loss_train_step.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        nn.utils.clip_grad_norm_(network.parameters(), clip)

        # Update the parameters
        optimizer.step()
        
        loss_train += loss_train_step.item()
        
    network.eval() 
    val_h = network.init_hidden(batch_size)
    
    # turn the gradients off for validation
    with torch.no_grad():
        
        for step in range(1,len(valid_loader)+1):
            
            reviews, labels = next(iter(valid_loader))

            reviews = reviews.cuda()
            labels = labels.cuda()

            val_h = tuple([each.data for each in val_h])

            output, val_h = network(reviews,val_h)

            loss_valid_step = criterion(output.squeeze(), labels.float())

            loss_valid += loss_valid_step.item()
    
    loss_train /= len(train_loader)
    loss_valid /= len(valid_loader)
    
    print('Epoch: {}  Train Loss: {:.4f}  Valid Loss: {:.4f}'.format(epoch, loss_train, loss_valid))
    
    if loss_valid < loss_min:
        loss_min = loss_valid
        torch.save(network.state_dict(), 'sentiment_analysis.pth') 
        print("\nMinimum validation loss of {} at epoch {}/{}".format(loss_min, epoch, num_epochs))
        print('Model Saved\n')

print('Training Complete')

Epoch: 1  Train Loss: 0.5801  Valid Loss: 0.5496

Minimum validation loss of 0.549594295322895 at epoch 1/10
Model Saved

Epoch: 2  Train Loss: 0.3960  Valid Loss: 0.4762

Minimum validation loss of 0.4762261989712715 at epoch 2/10
Model Saved

Epoch: 3  Train Loss: 0.2520  Valid Loss: 0.4552

Minimum validation loss of 0.4552463109791279 at epoch 3/10
Model Saved

Epoch: 4  Train Loss: 0.1509  Valid Loss: 0.5217


KeyboardInterrupt: ignored

## Load the best network

In [17]:
best_network = SentimentLSTM(vocab_size, output_size, embedding_dim, hidden_dim, n_layers).cuda()
best_network.load_state_dict(torch.load('sentiment_analysis.pth')) 

  "num_layers={}".format(dropout, num_layers))


<All keys matched successfully>

## Test on valid data

In [18]:
losses = []
accuracies = []

best_network.eval() 
h = best_network.init_hidden(batch_size)

# turn the gradients off for validation
for reviews, labels in valid_loader:

    reviews = reviews.cuda()
    labels = labels.cuda()

    h = tuple([each.data for each in h])

    output, h = best_network(reviews,h)

    loss = criterion(output, labels.float())
    acc = find_acc(output, labels.float())

    losses.append(loss.item())
    accuracies.append(acc)

print('Loss: {:.4f}  Accuracy: {:.4f}'.format(np.mean(losses), np.mean(accuracies)))


Loss: 0.4564  Accuracy: 82.1400


## Test on custom reviews

In [0]:
def tokenize_review(review):
    # lowercase
    review = review.lower() 

    # scan the review char by char and get rid of punctuations
    text = ''.join([char for char in review if char not in punctuation])
    
    # splitting by spaces to get words
    words = text.split()
    
    # Encode the review
    encoded_review = []
    encoded_review.append([vocab_to_int[word] for word in words])
    
    return encoded_review

def pad_features(encoded_reviews, seq_length):
        features = np.zeros((len(encoded_reviews), seq_length), dtype = int)
        for i, review in enumerate(encoded_reviews):
            review_len = len(review)
            
            if review_len <= seq_length:
                zeroes = list(np.zeros(seq_length-review_len))
                padded_review = zeroes + review
            elif review_len > seq_length:
                padded_review = review[0:seq_length]
            
            features[i,:] = np.array(padded_review)
        return features

In [0]:
def predict(network, review, seq_length=250):
    ''' Prints out whether a give review is predicted to be 
        positive or negative in sentiment, using a trained model.
        
        review - a review made of normal text and punctuation
        sequence_length - the padded length of a review
        '''
    
    network.eval()
    
    encoded_review = tokenize_review(review)

    # Pad the review
    features = torch.from_numpy(pad_features(encoded_review, seq_length)).cuda()
    
    batch_size = features.size(0)
    
    # initialize hidden state
    h = network.init_hidden(batch_size)
      
    # get the output from the model
    output, h = network(features, h)
    
    # convert output probabilities to predicted class (0 or 1)
    pred = torch.round(output.squeeze()).item()
    
    # print custom response based on whether test_review is pos/neg
    if(pred==1):
        print('Positive Review: Confidence: {:.3f} %'.format((output.item())*100))
    else:
        print('Negative Review: Confidence: {:.3f} %'.format((1-output.item())*100))

In [30]:
review = 'The worst movie I have seen; acting was terrible and I want my money back. This movie had bad acting and the dialogue was slow.'
predict(best_network, review, seq_length = 250)

Negative Review: Confidence: 99.371 %


In [31]:
review = 'I did not like the ending. The ending was bad, but overall the movie was amazing.'
predict(best_network, review, seq_length = 250)

Positive Review: Confidence: 87.453 %


In [40]:
review = 'I have no words to say. It was amazing.'
predict(best_network, review, seq_length = 250)

Positive Review: Confidence: 90.586 %
