In [1]:
import os
import time
import numpy as np
from tqdm import tqdm
from string import punctuation
from collections import Counter
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [2]:
X = []
y = []

for label in ['pos', 'neg']:
    for file_name in tqdm(os.listdir(f'./aclImdb/train/{label}/')):
        with open(os.path.join(f'./aclImdb/train/{label}/', file_name), encoding="utf8") as f:
            X.append(f.read())
            y.append(label)
            
print ('Number of reviews :', len(X))

100%|█████████████████████████████████████████████████████████████████████████| 12500/12500 [00:00<00:00, 15676.77it/s]
100%|██████████████████████████████████████████████████████████████████████████| 12500/12500 [00:02<00:00, 5791.86it/s]

Number of reviews : 25000





In [3]:
X = [x.lower() for x in X]
X = [''.join([c for c in x if c not in punctuation]) for x in tqdm(X)]

words_blob = ' '.join(X)

all_words = words_blob.split()

words_count = Counter(all_words)

words_count_len = len(words_count)
sorted_words_count = words_count.most_common(words_count_len)

print(sorted_words_count[:10])

100%|██████████████████████████████████████████████████████████████████████████| 25000/25000 [00:03<00:00, 7341.87it/s]


[('the', 334691), ('and', 162228), ('a', 161940), ('of', 145326), ('to', 135042), ('is', 106855), ('in', 93028), ('it', 77099), ('i', 75719), ('this', 75190)]


In [4]:
word_to_idx = {word: idx+1 for idx, (word, _) in enumerate(sorted_words_count)}
print(list(word_to_idx.items())[:10])

[('the', 1), ('and', 2), ('a', 3), ('of', 4), ('to', 5), ('is', 6), ('in', 7), ('it', 8), ('i', 9), ('this', 10)]


In [5]:
X_encoded = []

for x in X:
    x_encoded = [word_to_idx[word] for word in x.split()]
    X_encoded.append(x_encoded)

y_encoded = [1 if label =='pos' else 0 for label in y]
y_encoded = np.array(y_encoded, dtype='float32')

print(X[0])
print()
print (X_encoded[0])

bromwell high is a cartoon comedy it ran at the same time as some other programs about school life such as teachers my 35 years in the teaching profession lead me to believe that bromwell highs satire is much closer to reality than is teachers the scramble to survive financially the insightful students who can see right through their pathetic teachers pomp the pettiness of the whole situation all remind me of the schools i knew and their students when i saw the episode in which a student repeatedly tried to burn down the school i immediately recalled  at  high a classic line inspector im here to sack one of your teachers student welcome to bromwell high i expect that many adults of my age think that bromwell high is far fetched what a pity that it isnt

[22572, 321, 6, 3, 1077, 219, 8, 2082, 30, 1, 166, 61, 14, 46, 80, 5581, 42, 399, 118, 135, 14, 4883, 55, 4980, 147, 7, 1, 4941, 6023, 479, 69, 5, 255, 11, 22572, 17217, 1970, 6, 72, 2356, 5, 638, 70, 6, 4883, 1, 26241, 5, 2031, 10833, 

In [6]:
def pad_sequence(X, sequence_length):
    X_padded = np.zeros((len(X), sequence_length), dtype = int)
    
    for idx, x in enumerate(X):
        x_len = len(x)
        
        if x_len <= sequence_length:
            zeroes = list(np.zeros(sequence_length - x_len))
            new_x = zeroes + x
        elif x_len > sequence_length:
            new_x = x[0: sequence_length]
        
        X_padded[idx,:] = np.array(new_x)
    
    return X_padded

sequence_length = 512
X_padded = pad_sequence(X_encoded, sequence_length)

In [7]:
train_val_split = 0.75
X_len = len(X_padded)

train_X = X_padded[:int(train_val_split * X_len)]
train_y = y_encoded[:int(train_val_split * X_len)]

val_X = X_padded[int(train_val_split * X_len):]
val_y = y_encoded[int(train_val_split * X_len):]

In [8]:
train_dataset = TensorDataset(torch.from_numpy(train_X).to(device), torch.from_numpy(train_y).to(device))
val_dataset = TensorDataset(torch.from_numpy(val_X).to(device), torch.from_numpy(val_y).to(device))

batch_size = 32

train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True)

In [9]:
train_data_iter = iter(train_dataloader)
X_example, y_example = next(train_data_iter)
print('Example Input size: ', X_example.size()) 
print('Example Input:\n', X_example)
print()
print('Example Output size: ', y_example.size()) 
print('Example Output:\n', y_example)

Example Input size:  torch.Size([32, 512])
Example Input:
 tensor([[   29,    31,    57,  ...,    91,    22,    23],
        [    0,     0,     0,  ...,    12,     1,  8247],
        [    0,     0,     0,  ...,  1482,   939, 15257],
        ...,
        [    0,     0,     0,  ...,   311,     8,     6],
        [    0,     0,     0,  ...,    99,  1215,   131],
        [    0,     0,     0,  ...,  1982,     8,   131]], device='cuda:0',
       dtype=torch.int32)

Example Output size:  torch.Size([32])
Example Output:
 tensor([1., 1., 1., 0., 1., 1., 1., 1., 0., 1., 1., 1., 1., 0., 1., 0., 1., 1.,
        1., 1., 0., 1., 1., 0., 1., 0., 1., 0., 1., 1., 0., 1.],
       device='cuda:0')


In [10]:
class Model(nn.Module):
    def __init__(self, input_dimension, embedding_dimension, hidden_dimension, output_dimension):
        super().__init__()
        self.embedding_layer = nn.Embedding(input_dimension, embedding_dimension)  
        self.rnn_layer = nn.RNN(embedding_dimension, hidden_dimension, num_layers=1)
        self.fc_layer = nn.Linear(hidden_dimension, output_dimension)
        
    def forward(self, sequence):
        # sequence shape = (sequence_length, batch_size)
        embedding = self.embedding_layer(sequence)  
        # embedding shape = [sequence_length, batch_size, embedding_dimension]
        output, hidden_state = self.rnn_layer(embedding)
        # output shape = [sequence_length, batch_size, hidden_dimension]
        # hidden_state shape = [1, batch_size, hidden_dimension]
        final_output = self.fc_layer(hidden_state[-1,:,:].squeeze(0))      
        return final_output
    
input_dimension = len(word_to_idx) + 1 # +1 to account for padding
embedding_dimension = 100
hidden_dimension = 32
output_dimension = 1

model = Model(input_dimension, embedding_dimension, hidden_dimension, output_dimension)

optim = torch.optim.Adam(model.parameters())
loss_func = nn.BCEWithLogitsLoss()

model = model.to(device)
loss_func = loss_func.to(device)

  from .autonotebook import tqdm as notebook_tqdm


In [11]:
def accuracy_metric(predictions, ground_truth):
    rounded_predictions = torch.round(torch.sigmoid(predictions))
    success = (rounded_predictions == ground_truth).float() 
    accuracy = success.sum() / len(success)
    return accuracy

In [12]:
def train(model, dataloader, optim, loss_func):
    loss = 0
    accuracy = 0
    model.train()
    
    for sequence, label in dataloader:
        optim.zero_grad()     
        preds = model(sequence.T).squeeze()
        
        loss_curr = loss_func(preds, label)
        accuracy_curr = accuracy_metric(preds, label)
        
        loss_curr.backward()
        optim.step()
        
        loss += loss_curr.item()
        accuracy += accuracy_curr.item()
        
    return loss/len(dataloader), accuracy/len(dataloader)

In [13]:
def validate(model, dataloader, loss_func):
    loss = 0
    accuracy = 0
    model.eval()
    
    with torch.no_grad():
        for sequence, label in dataloader:
            
            preds = model(sequence.T).squeeze()
            
            loss_curr = loss_func(preds, label)   
            accuracy_curr = accuracy_metric(preds, label)

            loss += loss_curr.item()
            accuracy += accuracy_curr.item()
        
    return loss/len(dataloader), accuracy/len(dataloader)

In [14]:
num_epochs = 10
best_val_loss = float('inf')

for ep in range(num_epochs):

    time_start = time.time()
    
    train_loss, train_accuracy = train(model, train_dataloader, optim, loss_func)
    val_loss, val_accuracy = validate(model, val_dataloader, loss_func)
    
    time_end = time.time()
    time_delta = time_end - time_start  
    
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), 'rnn_model.pt')
    
    print(f'epoch number: {ep+1} | time elapsed: {time_delta}s')
    print(f'training loss: {train_loss:.3f} | training accuracy: {train_accuracy*100:.2f}%')
    print(f'validation loss: {val_loss:.3f} |  validation accuracy: {val_accuracy*100:.2f}%')
    print()

epoch number: 1 | time elapsed: 9.691661834716797s
training loss: 0.622 | training accuracy: 66.86%
validation loss: 1.093 |  validation accuracy: 17.23%

epoch number: 2 | time elapsed: 8.412366151809692s
training loss: 0.541 | training accuracy: 73.25%
validation loss: 0.841 |  validation accuracy: 51.50%

epoch number: 3 | time elapsed: 8.211649417877197s
training loss: 0.448 | training accuracy: 79.72%
validation loss: 1.030 |  validation accuracy: 47.17%

epoch number: 4 | time elapsed: 8.301143646240234s
training loss: 0.385 | training accuracy: 83.66%
validation loss: 0.861 |  validation accuracy: 59.28%

epoch number: 5 | time elapsed: 8.100085973739624s
training loss: 0.318 | training accuracy: 87.03%
validation loss: 0.862 |  validation accuracy: 59.38%

epoch number: 6 | time elapsed: 8.156765222549438s
training loss: 0.284 | training accuracy: 88.78%
validation loss: 0.547 |  validation accuracy: 75.94%

epoch number: 7 | time elapsed: 8.077811479568481s
training loss: 0.24

In [15]:
def classify(model, sentence):
    model.eval()
    
    sentence = sentence.lower()
    sentence = ''.join([c for c in sentence if c not in punctuation])
    tokenized = [word_to_idx.get(token, 0) for token in sentence.split()]
    tokenized = np.pad(tokenized, (512 - len(tokenized), 0), 'constant')
    
    model_input = torch.LongTensor(tokenized).to(device)
    model_input = model_input.unsqueeze(1)
    pred = torch.sigmoid(model(model_input))
    
    return pred.item()

In [20]:
print(classify(model, "This film is horrible"))
print(classify(model, "Director tried too hard but this film is bad"))
print(classify(model, "This film will be houseful for weeks"))
print(classify(model, "I just really loved the movie"))

0.013179408386349678
0.010017585009336472
0.9989331364631653
0.9773225784301758
