### Load the datasets

In [1]:
from google.colab import files
uploaded = files.upload()

Saving datasetSentences.txt to datasetSentences.txt
Saving dictionary.txt to dictionary.txt
Saving sentiment_labels.txt to sentiment_labels.txt


In [2]:
import pandas as pd

In [3]:
dataset = pd.read_csv('datasetSentences.txt', sep='\t')
dataset.head()

Unnamed: 0,sentence_index,sentence
0,1,The Rock is destined to be the 21st Century 's...
1,2,The gorgeously elaborate continuation of `` Th...
2,3,Effective but too-tepid biopic
3,4,If you sometimes like to go to the movies to h...
4,5,"Emerges as something rare , an issue movie tha..."


In [4]:
dictionary = pd.read_csv('dictionary.txt', sep='|', names = ['phrase', 'id'])
dictionary.head()

Unnamed: 0,phrase,id
0,!,0
1,! ',22935
2,! '',18235
3,! Alas,179257
4,! Brilliant,22936


In [5]:
sentiment_labels = pd.read_csv('sentiment_labels.txt', sep='|', names = ['id', 'sentiment_values'], header=0)
sentiment_labels.head()

Unnamed: 0,id,sentiment_values
0,0,0.5
1,1,0.5
2,2,0.44444
3,3,0.5
4,4,0.42708


### Convert Labels into 5 Classes

In [6]:
import numpy as np

In [7]:
def createClass(a):
  if a <= 0.2 : 
    return 0
  elif a <=0.4 :
   return 1
  elif a <=0.6 :
   return 2
  elif a <=0.8 :
   return 3
  else :
    return 4

In [8]:
sentiment_labels['label'] = sentiment_labels.sentiment_values.apply(createClass)

In [9]:
sentiment_labels

Unnamed: 0,id,sentiment_values,label
0,0,0.50000,2
1,1,0.50000,2
2,2,0.44444,2
3,3,0.50000,2
4,4,0.42708,2
...,...,...,...
239227,239227,0.36111,1
239228,239228,0.38889,1
239229,239229,0.33333,1
239230,239230,0.88889,4


### Merge and Create the datasets

In [10]:
mergedPhrases = pd.merge(dictionary, sentiment_labels, on ='id')
mergedPhrases

Unnamed: 0,phrase,id,sentiment_values,label
0,!,0,0.50000,2
1,! ',22935,0.52778,2
2,! '',18235,0.50000,2
3,! Alas,179257,0.44444,2
4,! Brilliant,22936,0.86111,4
...,...,...,...,...
239227,zoning ordinances to protect your community fr...,220441,0.13889,0
239228,zzzzzzzzz,179256,0.19444,0
239229,élan,220442,0.51389,2
239230,É,220443,0.50000,2


In [11]:
mergedPhrases.label.value_counts()

2    119449
3     50148
1     43028
4     15255
0     11352
Name: label, dtype: int64

In [12]:
sentences = pd.merge(dataset, mergedPhrases, left_on='sentence', right_on='phrase')

In [13]:
sentences

Unnamed: 0,sentence_index,sentence,phrase,id,sentiment_values,label
0,1,The Rock is destined to be the 21st Century 's...,The Rock is destined to be the 21st Century 's...,226166,0.69444,3
1,2,The gorgeously elaborate continuation of `` Th...,The gorgeously elaborate continuation of `` Th...,226300,0.83333,4
2,3,Effective but too-tepid biopic,Effective but too-tepid biopic,13995,0.51389,2
3,4,If you sometimes like to go to the movies to h...,If you sometimes like to go to the movies to h...,14123,0.73611,3
4,5,"Emerges as something rare , an issue movie tha...","Emerges as something rare , an issue movie tha...",13999,0.86111,4
...,...,...,...,...,...,...
11281,11851,A real snooze .,A real snooze .,222071,0.11111,0
11282,11852,No surprises .,No surprises .,225165,0.22222,1
11283,11853,We 've seen the hippie-turned-yuppie plot befo...,We 've seen the hippie-turned-yuppie plot befo...,226985,0.75000,3
11284,11854,Her fans walked out muttering words like `` ho...,Her fans walked out muttering words like `` ho...,223632,0.13889,0


In [14]:
sentences.label.value_counts()

1    2971
3    2966
2    2144
4    1773
0    1432
Name: label, dtype: int64

In [16]:
import torch
from torchtext.legacy import data
from torchtext.legacy import datasets

SEED = 1947

torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True


ReviewText = data.Field(sequential =True, 
                        tokenize = 'spacy',
                        tokenizer_language = 'en_core_web_sm',
                        batch_first =True, 
                        include_lengths=True) # batch_first means : [batch, channel, r,g,b] : here batch is first

Label = data.LabelField(tokenize ='spacy', 
                        is_target=True, 
                        batch_first =True, 
                        sequential =False)
                        

In [17]:
fields = [('reviewText', ReviewText), ('label', Label)]

In [21]:
sentences['sentence'][0]

"The Rock is destined to be the 21st Century 's new `` Conan '' and that he 's going to make a splash even greater than Arnold Schwarzenegger , Jean-Claud Van Damme or Steven Segal ."

In [22]:
example = [data.Example.fromlist([sentences['sentence'][i],sentences.label[i]], fields) for i in range(sentences.shape[0])] 
pyDataset = data.Dataset(example, fields)

In [24]:
import random 
# Manual Seed
SEED = 123
torch.manual_seed(SEED)

(train, valid) = pyDataset.split(split_ratio=[0.7, 0.3], random_state=random.seed(SEED))

In [25]:
MAX_VOCAB_SIZE = 25_000
ReviewText.build_vocab(train, max_size = MAX_VOCAB_SIZE, vectors = "glove.6B.100d")
Label.build_vocab(train)

.vector_cache/glove.6B.zip: 862MB [02:40, 5.36MB/s]                           
100%|█████████▉| 398031/400000 [00:14<00:00, 26238.11it/s]

In [26]:
print('Size of input vocab : ', len(ReviewText.vocab))
print('Size of label vocab : ', len(Label.vocab))
print('Top 10 words appreared repeatedly :', list(ReviewText.vocab.freqs.most_common(10)))
print('Labels : ', Label.vocab.stoi)

Size of input vocab :  16417
Size of label vocab :  5
Top 10 words appreared repeatedly : [('.', 7446), (',', 6570), ('the', 5579), ('and', 4085), ('a', 4013), ('of', 4010), ('to', 2800), ('-', 2555), ('is', 2310), ("'s", 2279)]
Labels :  defaultdict(None, {1: 0, 3: 1, 2: 2, 4: 3, 0: 4})


In [27]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [29]:
train_iterator, valid_iterator = data.BucketIterator.splits((train, valid), batch_size = 32, 
                                                            sort_key = lambda x: len(x.reviewText),
                                                            sort_within_batch=True, 
                                                            device = device)

In [30]:
import torch.nn as nn

class RNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, 
                 bidirectional, dropout, pad_idx):
        
        super().__init__()
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)
        
        self.rnn = nn.LSTM(embedding_dim, 
                           hidden_dim, 
                           num_layers=n_layers, 
                           bidirectional=bidirectional, 
                           dropout=dropout)
        
        self.fc = nn.Linear(hidden_dim * 2, 64)

        self.dropout = nn.Dropout(dropout)
        
        self.fc2 = nn.Linear(64, 32)

        self.fc3 = nn.Linear(32, output_dim)
        
        
        
    def forward(self, text, text_lengths):
        
        #text = [sent len, batch size]
        
        embedded = self.dropout(self.embedding(text))
        
        #embedded = [sent len, batch size, emb dim]
        
        #pack sequence
        # lengths need to be on CPU!
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths.to('cpu'), batch_first=True)
        
        packed_output, (hidden, cell) = self.rnn(packed_embedded)
        
        #unpack sequence
        output, output_lengths = nn.utils.rnn.pad_packed_sequence(packed_output)

        #output = [sent len, batch size, hid dim * num directions]
        #output over padding tokens are zero tensors
        
        #hidden = [num layers * num directions, batch size, hid dim]
        #cell = [num layers * num directions, batch size, hid dim]
        
        #concat the final forward (hidden[-2,:,:]) and backward (hidden[-1,:,:]) hidden layers
        #and apply dropout
        
        hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1))
                
        #hidden = [batch size, hid dim * num directions]
        out = self.fc(hidden)
        
        out = self.fc2(out)
            
        return self.fc3(out)

In [31]:
INPUT_DIM = len(ReviewText.vocab)
EMBEDDING_DIM = 100
HIDDEN_DIM = 128
OUTPUT_DIM = 5
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.5
PAD_IDX = ReviewText.vocab.stoi[ReviewText.pad_token]

model = RNN(INPUT_DIM, 
            EMBEDDING_DIM, 
            HIDDEN_DIM, 
            OUTPUT_DIM, 
            N_LAYERS, 
            BIDIRECTIONAL, 
            DROPOUT, 
            PAD_IDX)

In [32]:
model

RNN(
  (embedding): Embedding(16417, 100, padding_idx=1)
  (rnn): LSTM(100, 128, num_layers=2, dropout=0.5, bidirectional=True)
  (fc): Linear(in_features=256, out_features=64, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
  (fc2): Linear(in_features=64, out_features=32, bias=True)
  (fc3): Linear(in_features=32, out_features=5, bias=True)
)

In [34]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 2,291,177 trainable parameters


In [35]:
import torch.optim as optim

optimizer = optim.Adam(model.parameters())

In [36]:
criterion = nn.CrossEntropyLoss()

model = model.to(device)
criterion = criterion.to(device)

In [37]:
# Function to get number of correct classifications
def get_correct(preds, labels):
  return (preds.argmax(dim=1).eq(labels).sum()) / len(preds)

In [38]:
Label.vocab.stoi

defaultdict(None, {0: 4, 1: 0, 2: 2, 3: 1, 4: 3})

In [39]:
def categorical_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """
    top_pred = preds.argmax(1, keepdim = True)
    correct = top_pred.eq(y.view_as(top_pred)).sum()
    acc = correct.float() / y.shape[0]
    return acc

In [40]:
def train(model, iterator, optimizer, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch in iterator:
        
        optimizer.zero_grad()
        
        # retrieve text and no. of words
        text, text_length = batch.reviewText 

        predictions = model(text, text_length).squeeze(1)
        
        loss = criterion(predictions, batch.label)
        
        acc = categorical_accuracy(predictions, batch.label)
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [41]:
print((next(iter(train_iterator)).label)) # batch.label
print((next(iter(train_iterator)).reviewText)[1]) # length_text
# print((next(iter(train_iterator)).reviewText)[0]) # text

print('length label:', len(next(iter(train_iterator)).label))
print('length label:', len((next(iter(train_iterator)).reviewText)[1]))

tensor([0, 1, 1, 2, 2, 1, 2, 3, 0, 1, 3, 0, 0, 2, 0, 2, 4, 1, 4, 1, 2, 0, 3, 0,
        4, 3, 4, 0, 2, 3, 2, 1], device='cuda:0')
tensor([25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
        25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 24],
       device='cuda:0')
length label: 32
length label: 32


In [42]:
def evaluate(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
    
        for batch in iterator:

            # retrieve text and no. of words
            text, text_length = batch.reviewText 

            predictions = model(text, text_length).squeeze(1)
            
            loss = criterion(predictions, batch.label)
            
            acc = categorical_accuracy(predictions, batch.label)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len( iterator)

In [43]:
import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [45]:
N_EPOCHS = 55

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut5-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 0m 2s
	Train Loss: 0.911 | Train Acc: 63.34%
	 Val. Loss: 1.865 |  Val. Acc: 38.28%
Epoch: 02 | Epoch Time: 0m 2s
	Train Loss: 0.886 | Train Acc: 64.40%
	 Val. Loss: 1.811 |  Val. Acc: 38.87%
Epoch: 03 | Epoch Time: 0m 2s
	Train Loss: 0.858 | Train Acc: 65.45%
	 Val. Loss: 1.838 |  Val. Acc: 38.92%
Epoch: 04 | Epoch Time: 0m 2s
	Train Loss: 0.821 | Train Acc: 66.80%
	 Val. Loss: 1.888 |  Val. Acc: 38.29%
Epoch: 05 | Epoch Time: 0m 2s
	Train Loss: 0.797 | Train Acc: 68.18%
	 Val. Loss: 1.831 |  Val. Acc: 38.64%
Epoch: 06 | Epoch Time: 0m 2s
	Train Loss: 0.779 | Train Acc: 69.61%
	 Val. Loss: 1.899 |  Val. Acc: 38.41%
Epoch: 07 | Epoch Time: 0m 2s
	Train Loss: 0.744 | Train Acc: 70.56%
	 Val. Loss: 2.028 |  Val. Acc: 37.88%
Epoch: 08 | Epoch Time: 0m 2s
	Train Loss: 0.718 | Train Acc: 71.75%
	 Val. Loss: 2.099 |  Val. Acc: 37.99%
Epoch: 09 | Epoch Time: 0m 2s
	Train Loss: 0.700 | Train Acc: 72.43%
	 Val. Loss: 2.000 |  Val. Acc: 38.18%
Epoch: 10 | Epoch Time: 0m 2

In [59]:
import spacy
nlp = spacy.load('en_core_web_sm')

def predict_class(model, sentence):
    model.eval()
    tokenized = [tok.text for tok in nlp.tokenizer(sentence)]
    indexed = [ReviewText.vocab.stoi[t] for t in tokenized]
    length = [len(indexed)]
    #print(indexed)
    tensor = torch.LongTensor(indexed).to(device)
    tensor = tensor.unsqueeze(1).T
    length_tensor = torch.LongTensor(length)
    #print(length_tensor)
    preds = model(tensor, length_tensor)
    max_preds = preds.argmax(dim = 1)
    return max_preds.item()

In [64]:
for j in range(10):
  i = random.randint(1, 1000)
  pred_class = predict_class(model, sentences.sentence[i])
  print('Sentence : ', sentences.sentence[i])
  print('Actual Class   : ', sentences.label[i])

  print('Predicted class: ', Label.vocab.itos[pred_class])

Sentence :  The film starts out as competent but unremarkable ... and gradually grows into something of considerable power .
Actual Class   :  3
Predicted class:  3
Sentence :  Still pretentious and filled with subtext , but entertaining enough at ` face value ' to recommend to anyone looking for something different .
Actual Class   :  3
Predicted class:  3
Sentence :  It wo n't rock any boats but is solid meat-and-potatoes filmmaking .
Actual Class   :  2
Predicted class:  2
Sentence :  What makes the movie a comedy is the way it avoids the more serious emotions involved .
Actual Class   :  3
Predicted class:  3
Sentence :  The movie has a soft , percolating magic , a deadpan suspense .
Actual Class   :  4
Predicted class:  4
Sentence :  As if to prove a female director can make a movie with no soft edges , Kathryn Bigelow offers no sugar-coating or interludes of lightness .
Actual Class   :  3
Predicted class:  3
Sentence :  What could have become just another cautionary fable is all