In [3]:
# This project uses the legacy torchtext API (torchtext.legacy).
# Legacy torchtext provides Field, LabelField, BucketIterator, etc., which have been changed in the latest versions of torchtext.

In [13]:
# The data consists of review text and a sentiment label, so we define two Fields: TEXT and LABEL
import torch
from torchtext.legacy import data

torch.manual_seed(1234) # seed PyTorch's random number generator
torch.backends.cudnn.deterministic = True # ask cuDNN for deterministic algorithms: this is for reproducibility, not speed

TEXT = data.Field(tokenize = 'spacy', # tokenize with spaCy rather than a plain whitespace split, so punctuation becomes separate tokens
                  tokenizer_language = 'en_core_web_sm')
LABEL = data.LabelField(dtype = torch.float) # labels are stored as torch.float (float32) tensors

In [20]:
from torchtext.legacy import datasets
# Load the IMDB train and test splits, processed with the TEXT and LABEL fields
train_dataset, test_dataset = datasets.IMDB.splits(TEXT, LABEL)

In [21]:
# Number of examples in the train and test splits, one count per line
print(len(train_dataset), len(test_dataset), sep="\n")

25000
25000


In [22]:
# Carve a validation set out of the test data (50/50 split).
# Dataset.split shuffles using the state of Python's `random` module, which
# torch.manual_seed does not seed, so an explicitly seeded state is passed in
# to get the same validation/test partition on every run.
# NOTE: this cell rebinds test_dataset, so run it only once per kernel session.
import random

val_dataset,test_dataset = test_dataset.split(0.5, random_state = random.Random(1234).getstate())

In [23]:
# Sizes after the split, printed in the order: train, test, validation
print(len(train_dataset), len(test_dataset), len(val_dataset), sep="\n")

25000
12500
12500


In [27]:
MAX_VOCAB_SIZE = 25_000
# Keep only the 25,000 most frequent words of the training set; all other words become <unk>
TEXT.build_vocab(train_dataset, max_size = MAX_VOCAB_SIZE)
LABEL.build_vocab(train_dataset) # the labels are 'pos' and 'neg'; this maps them to the indices 0 and 1

vocab = TEXT.vocab # Vocab object holding the token <-> index mappings and the token frequencies

In [29]:
print(len(vocab)) # MAX_VOCAB_SIZE words plus the two special tokens <unk> and <pad>

25002


In [30]:
# Show the 20 most frequent tokens together with their counts
print(vocab.freqs.most_common(20))

[('the', 289838), (',', 275296), ('.', 236702), ('and', 156484), ('a', 156282), ('of', 144056), ('to', 133886), ('is', 109095), ('in', 87676), ('I', 77546), ('it', 76545), ('that', 70355), ('"', 63332), ("'s", 61929), ('this', 60484), ('-', 53554), ('/><br', 50935), ('was', 50013), ('as', 43508), ('with', 42807)]


In [38]:
# A vocab exposes two important mappings worth remembering: stoi (string -> index) and itos (index -> string)
LABEL.vocab.itos[:]
# 'neg' is index 0, 'pos' is index 1

['neg', 'pos']

In [40]:
BATCH_SIZE = 64
# BucketIterator batches together texts of similar length in order to minimize padding

# Use the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# One iterator per split, all sharing the same batch size and target device
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_dataset, val_dataset, test_dataset), 
    batch_size = BATCH_SIZE,
    device = device)

In [44]:
import torch.nn as nn

class RNN(nn.Module):
    """Embedding -> single-layer RNN -> linear head for sequence classification.

    Args:
        dict_size: number of rows in the embedding table (vocabulary size).
        embedding_dim: size of each token embedding.
        hidden_dim: size of the RNN hidden state.
        output_dim: number of values produced per sequence.
        padding_idx: optional vocabulary index of the padding token. When given,
            that embedding row is zero-initialised and receives no gradient.
            The default (None) keeps the previous behaviour; parameter names and
            shapes are unchanged either way, so existing checkpoints still load.
    """

    def __init__(self, dict_size, embedding_dim, hidden_dim, output_dim, padding_idx=None):

        super().__init__()

        self.embedding = nn.Embedding(dict_size, embedding_dim, padding_idx=padding_idx)

        self.rnn = nn.RNN(embedding_dim, hidden_dim)

        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, text, text_lengths=None):
        """Return one row of `output_dim` values per sequence in the batch.

        Args:
            text: token indices, shape [sent len, batch size].
            text_lengths: optional true (unpadded) length of every sequence in
                the batch; each length must be at least 1. When given, the
                batch is packed so the final hidden state is taken at each
                sequence's last real token instead of after its padding.
                When omitted, the whole padded sequence is processed as before.

        Returns:
            Tensor of shape [batch size, output dim].
        """

        #text = [sent len, batch size]

        embedded = self.embedding(text)

        #embedded = [sent len, batch size, emb dim]

        if text_lengths is None:
            rnn_input = embedded
        else:
            # pack_padded_sequence expects the lengths on the CPU;
            # enforce_sorted=False lets the batch arrive in any order.
            rnn_input = nn.utils.rnn.pack_padded_sequence(
                embedded, torch.as_tensor(text_lengths).cpu(), enforce_sorted=False)

        # Only the final hidden state is needed. The previous per-call
        # `assert torch.equal(output[-1], hidden)` sanity check was removed:
        # it ran a full tensor comparison on every forward pass.
        _, hidden = self.rnn(rnn_input)

        #hidden = [1, batch size, hid dim]

        return self.fc(hidden.squeeze(0))

In [45]:
# Model hyperparameters
DICT_LEN = len(TEXT.vocab) # one embedding row per vocabulary entry
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 1 # a single output value per review (binary sentiment)

model = RNN(DICT_LEN, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM)
model = model.to(device) # move the model's parameters to the selected device

In [55]:
from tqdm import tqdm
# Adam over all model parameters with a learning rate of 0.01
optimizer = torch.optim.Adam(model.parameters(),lr=0.01)
# BCEWithLogitsLoss takes raw logits, which is why binary_accuracy applies the sigmoid itself
criterion = nn.BCEWithLogitsLoss()
def binary_accuracy(predicted, labels):
    """Return the fraction of examples in a batch that are classified correctly.

    predicted: raw model outputs (logits), one per example.
    labels: target values, compared element-wise with the rounded probabilities.
    Returns a scalar float tensor: number of matches divided by len(labels).
    """
    probabilities = torch.sigmoid(predicted)
    hard_predictions = probabilities.round()
    num_correct = hard_predictions.eq(labels).sum().float()
    return num_correct / len(labels)

def train(model, iterator, optimizer, criterion):
    """Run one training epoch over `iterator`.

    For every batch: reset the gradients, run the model on batch.text,
    compute the loss and accuracy against batch.label, backpropagate and
    take one optimizer step.

    Returns a (loss, accuracy) tuple, each being the sum of the per-batch
    values divided by the number of batches in `iterator`.
    """
    model.train()

    total_loss, total_acc = 0, 0

    for batch in tqdm(iterator):
        optimizer.zero_grad()

        # The model returns [batch size, 1]; drop the trailing dimension
        logits = model(batch.text).squeeze(1)
        targets = batch.label

        batch_loss = criterion(logits, targets)
        batch_acc = binary_accuracy(logits, targets)

        batch_loss.backward()
        optimizer.step()

        total_loss += batch_loss.item()
        total_acc += batch_acc.item()

    num_batches = len(iterator)
    return total_loss / num_batches, total_acc / num_batches

def evaluate(model, iterator, criterion):
    """Compute the loss and accuracy of `model` over `iterator` without training.

    The model is switched to eval mode and gradient tracking is disabled.

    Returns a (loss, accuracy) tuple, each being the sum of the per-batch
    values divided by the number of batches in `iterator`.
    """
    model.eval()

    total_loss, total_acc = 0, 0

    with torch.no_grad():
        for batch in tqdm(iterator):
            # The model returns [batch size, 1]; drop the trailing dimension
            logits = model(batch.text).squeeze(1)
            targets = batch.label

            total_loss += criterion(logits, targets).item()
            total_acc += binary_accuracy(logits, targets).item()

    num_batches = len(iterator)
    return total_loss / num_batches, total_acc / num_batches

# N_EPOCHS = 15
# from tqdm import tqdm 

# TO TRAIN IT YOURSELF: 
# for epoch in tqdm(range(N_EPOCHS)):

#     train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
#     valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)    
#     print(f'Epoch: {epoch+1:02}')
#     print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
#     print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

# torch.save(model.state_dict(), 'simple-rnn-15-epoch.pt') 

The model was trained on Kaggle for 15 epochs; the saved weights are loaded below to report its accuracy:

In [57]:
# Rebuild the architecture and load the weights saved after training;
# map_location lets a checkpoint saved on a GPU machine load on the current device.
model = RNN(DICT_LEN, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM)
model = model.to(device)
model.load_state_dict(torch.load('simple-rnn-15-epoch.pt', map_location=device))
print("Model Loaded Successfully")
valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
# These metrics come from test_iterator, so they are test metrics (previously misnamed train_*)
test_loss, test_acc = evaluate(model, test_iterator, criterion)
print(valid_acc) # first value printed: validation accuracy
print(test_acc) # second value printed: test accuracy

Model Loaded Successfully


100%|██████████| 196/196 [00:33<00:00,  5.87it/s]
100%|██████████| 196/196 [00:32<00:00,  5.97it/s]

0.503858418488989
0.49824617346938777



