In [4]:
import torch
from torchtext.legacy import data

# Seed PyTorch and request deterministic cuDNN behaviour so runs are repeatable.
SEED = 1234
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# Review text field: tokenised with spaCy's small English model.
# include_lengths=True makes every batch carry the real (unpadded) sequence
# lengths next to the token ids, which the packed-sequence LSTM needs.
TEXT = data.Field(
    tokenize='spacy',
    tokenizer_language='en_core_web_sm',
    include_lengths=True,
)

# Sentiment label field, stored as float.
LABEL = data.LabelField(dtype=torch.float)

In [5]:
from torchtext.legacy import datasets

# Load the IMDB train and test splits, processed through the TEXT / LABEL fields.
train_dataset, test_dataset = datasets.IMDB.splits(text_field=TEXT, label_field=LABEL)

In [6]:
# Carve a validation set out of the test split (50/50).
# split() shuffles using Python's `random` module state, so seed it and pass the
# state explicitly; without this the validation/test partition is different on
# every kernel restart and the reported metrics are not reproducible.
# NOTE(review): this cell rebinds test_dataset, so re-running it without first
# re-running the loading cell halves the test set again.
import random

random.seed(SEED)
val_dataset, test_dataset = test_dataset.split(0.5, random_state=random.getstate())
print(len(train_dataset))
print(len(test_dataset))
print(len(val_dataset))

25000
12500
12500


In [7]:
# Upper bound passed to build_vocab as max_size.
MAX_VOCAB_SIZE = 25000

TEXT.build_vocab(
    train_dataset,
    max_size=MAX_VOCAB_SIZE,
    # Re-enable the two arguments below when training the model from scratch.
    # They are commented out here because the model was trained on Kaggle.
    # vectors="glove.6B.100d",        # use pretrained embeddings
    # unk_init=torch.Tensor.normal_,  # random init for vocab words missing from the pretrained set
)

LABEL.build_vocab(train_dataset)

In [8]:
BATCH_SIZE = 64

# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)

# sort_within_batch=True: packed sequences need each batch sorted by length.
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_dataset, val_dataset, test_dataset),
    batch_size=BATCH_SIZE,
    sort_within_batch=True,
    device=device,
)

cpu


In [9]:
import torch.nn as nn

class RNN(nn.Module):
    """LSTM sentence classifier that consumes padded batches plus their lengths.

    Pipeline: embedding -> dropout -> LSTM over packed sequences ->
    dropout on the final hidden state -> linear layer producing raw logits.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each token embedding.
        hidden_dim: LSTM hidden size (per direction).
        output_dim: size of the linear output; no activation is applied because
            the loss is expected to handle it.
        n_layers: number of stacked LSTM layers.
        bidirectional: whether the LSTM also reads the sequence backwards.
        dropout: dropout probability used on the embeddings, between stacked
            LSTM layers, and on the final hidden state.
        pad_idx: index of the padding token; passed as ``padding_idx`` so the
            padding embedding is not learned.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers,
                 bidirectional, dropout, pad_idx):

        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)

        self.rnn = nn.LSTM(embedding_dim,
                           hidden_dim,
                           num_layers=n_layers,
                           bidirectional=bidirectional,
                           # Inter-layer dropout only exists with more than one layer;
                           # passing it for a single layer just triggers a warning.
                           dropout=dropout if n_layers > 1 else 0.0)

        # The classifier sees one final hidden state per direction.
        num_directions = 2 if bidirectional else 1
        self.fc = nn.Linear(hidden_dim * num_directions, output_dim)

        self.dropout = nn.Dropout(dropout)

    def forward(self, text, text_lengths):
        """Return one row of logits per sequence in the batch.

        Args:
            text: token ids, shape [sent len, batch size].
            text_lengths: unpadded length of every sequence in the batch. Each
                batch must be sorted by decreasing length, because
                pack_padded_sequence is called with its default settings.

        Returns:
            Tensor of shape [batch size, output_dim] holding raw logits.
        """
        # embedded = [sent len, batch size, emb dim]
        embedded = self.dropout(self.embedding(text))

        # Pack so the LSTM skips padding positions; lengths need to be on the CPU.
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths.to('cpu'))

        # Only the final hidden state is used, so the per-step outputs are not
        # unpacked and the cell state is discarded.
        # hidden = [num layers * num directions, batch size, hid dim]
        _, (hidden, _) = self.rnn(packed_embedded)

        if self.rnn.bidirectional:
            # The last two entries are the top layer's final forward (hidden[-2])
            # and backward (hidden[-1]) states.
            final_hidden = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        else:
            # Single direction: the last entry is the top layer's final state.
            final_hidden = hidden[-1, :, :]

        # final_hidden = [batch size, hid dim * num directions]
        return self.fc(self.dropout(final_hidden))

In [10]:
# Model hyperparameters.
INPUT_DIM = len(TEXT.vocab)                 # one embedding row per vocabulary entry
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 1                              # single logit per review
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.5
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]   # vocabulary index of the padding token

model = RNN(
    vocab_size=INPUT_DIM,
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    output_dim=OUTPUT_DIM,
    n_layers=N_LAYERS,
    bidirectional=BIDIRECTIONAL,
    dropout=DROPOUT,
    pad_idx=PAD_IDX,
)

In [8]:
# Initialise the embedding layer from the pretrained vectors attached to the vocabulary.
# The vocabulary cell has its `vectors=` argument commented out, in which case
# TEXT.vocab.vectors is None and copying it would raise; skip the copy then and
# keep the current embedding weights (they get overwritten when a checkpoint is loaded).
pretrained_embeddings = TEXT.vocab.vectors
if pretrained_embeddings is None:
    print("No pretrained vectors attached to TEXT.vocab; embedding weights left unchanged")
else:
    model.embedding.weight.data.copy_(pretrained_embeddings)
    print(pretrained_embeddings.shape)

In [9]:
# Set the <unk> and <pad> embedding rows to zero (not compulsory).
# The rows are looked up from the vocabulary instead of being hard-coded.
# NOTE(review): with the Field defined in this notebook these are presumably
# rows 0 and 1, i.e. the rows the previous hard-coded indices touched — confirm.
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]
for special_idx in (UNK_IDX, PAD_IDX):
    model.embedding.weight.data[special_idx] = torch.zeros(EMBEDDING_DIM)
print(model.embedding.weight.data)

In [11]:
from tqdm import tqdm
# Move the model to the target device before constructing the optimizer, so the
# optimizer is built over the parameters as they live on that device.
model = model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
# The model outputs raw logits; BCEWithLogitsLoss applies the sigmoid itself.
criterion = nn.BCEWithLogitsLoss()
def binary_accuracy(predicted, labels):
    """Fraction of predictions that match the labels.

    Args:
        predicted: raw logits, one per example.
        labels: target values compared against the rounded sigmoid of the logits.

    Returns:
        A float tensor holding (number of matches) / len(labels).
    """
    hard_predictions = torch.sigmoid(predicted).round()
    n_correct = (hard_predictions == labels).sum().float()
    return n_correct / len(labels)

def train(model, iterator, optimizer, criterion):
    """Run one training pass over `iterator` and update the model.

    Args:
        model: module called as model(text, text_lengths).
        iterator: yields batches exposing `.text` (a (tokens, lengths) pair)
            and `.label`.
        optimizer: optimizer stepped once per batch.
        criterion: loss called as criterion(predictions, labels).

    Returns:
        (mean loss per batch, mean accuracy per batch) over the whole pass.
    """
    model.train()

    loss_sum = 0
    acc_sum = 0

    for batch in tqdm(iterator):
        tokens, lengths = batch.text

        optimizer.zero_grad()

        # The model returns one column of logits; drop that column dimension.
        logits = model(tokens, lengths).squeeze(1)
        batch_loss = criterion(logits, batch.label)
        batch_acc = binary_accuracy(logits, batch.label)

        batch_loss.backward()
        optimizer.step()

        loss_sum += batch_loss.item()
        acc_sum += batch_acc.item()

    n_batches = len(iterator)
    return loss_sum / n_batches, acc_sum / n_batches

def evaluate(model, iterator, criterion):
    """Score the model on `iterator` without updating it.

    Args:
        model: module called as model(text, text_lengths).
        iterator: yields batches exposing `.text` (a (tokens, lengths) pair)
            and `.label`.
        criterion: loss called as criterion(predictions, labels).

    Returns:
        (mean loss per batch, mean accuracy per batch) over the whole pass.
    """
    model.eval()

    loss_sum = 0
    acc_sum = 0

    # No gradients are needed for evaluation.
    with torch.no_grad():
        for batch in tqdm(iterator):
            tokens, lengths = batch.text
            targets = batch.label

            # The model returns one column of logits; drop that column dimension.
            logits = model(tokens, lengths).squeeze(1)

            loss_sum += criterion(logits, targets).item()
            acc_sum += binary_accuracy(logits, targets).item()

    n_batches = len(iterator)
    return loss_sum / n_batches, acc_sum / n_batches

# N_EPOCHS = 50
# from tqdm import tqdm 

# for epoch in tqdm(range(N_EPOCHS)):

#     train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
#     valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)    
#     print(f'Epoch: {epoch+1:02}')
#     print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
#     print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')
#     if (epoch%5 == 0):
#         torch.save(model.state_dict(), 'improved-rnn-15-epoch.pt') 

In [13]:
# Restore the trained weights, remapping tensors onto the current device.
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a trusted source.
checkpoint_state = torch.load('improved-rnn-5-epoch.pt', map_location=device)
model.load_state_dict(checkpoint_state)
print("Model Loaded Successfully")
# valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
# train_loss, train_acc = evaluate(model, test_iterator, criterion)
# print(valid_acc)
# print(train_acc)

Model Loaded Successfully


In [14]:
import spacy
# Load spaCy's small English pipeline; predict_sentiment uses its tokenizer.
nlp = spacy.load('en_core_web_sm')

def predict_sentiment(model, sentence):
    """Score a single raw sentence with the trained model.

    The sentence is tokenised with the module-level spaCy pipeline `nlp`, mapped
    to vocabulary indices through `TEXT.vocab.stoi`, and fed to the model as a
    batch of size one.

    Args:
        model: module called as model(text, text_lengths), returning one logit.
        sentence: raw text to classify.

    Returns:
        The sigmoid of the model's logit as a Python float in [0, 1].
        NOTE(review): which class a value near 1 stands for depends on the
        label vocabulary — confirm against LABEL.vocab.stoi.

    Raises:
        ValueError: if the sentence yields no tokens. Such input would otherwise
            fail inside sequence packing with a far less obvious error.
    """
    tokenized = [tok.text for tok in nlp.tokenizer(sentence)]
    if not tokenized:
        raise ValueError("sentence must contain at least one token")

    model.eval()
    indexed = [TEXT.vocab.stoi[t] for t in tokenized]

    # Shape [sent len, 1]: add the batch dimension the model expects.
    tensor = torch.LongTensor(indexed).unsqueeze(1).to(device)
    # The length tensor stays on the CPU.
    length_tensor = torch.LongTensor([len(indexed)])

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        prediction = torch.sigmoid(model(tensor, length_tensor))
    return prediction.item()

In [25]:
# Sanity check: score a long, strongly negative user review.
predict_sentiment(model,"I m going to write the honest and sincere comments and suggestions for indian audiance who watch most bollywood films. Guys first of all the entire Race Frenchie that is race 1, and 2 both are ripped off from Hollywood flick check out in Google and IMBD that means they are entirely copy pasted the movie even the songs composed by copycat Pritam Chakraborty. Song's we're copied from Korean album my Sasi girl. Now Race 3 what is new .... Nothing there is nothing in this movie which makes you heartwarming, eye catching or any sort of connection with characters in the movie. This movie is also not worth watching for free on television because your valuable time will be wasted and that is equal to loosing MONEY. When this movie got first premiered on television on I suppose on Zee cinema. I watched this for nearly 10 minutes and I felt what the heck I m doing during break i just browsing through channels I came across Hollywood Bean movie. This movie really saved my day and got rid of Race 3. Why people are still praising salman khan why he s now getting aged and he should pass on the battle and let New face come to bollywood but unfortunately it s bollywood Full of nepotism it will never improve it's nepotism, favouritism strategy. Hence request you All please don't waste your time and money on these worthless star's. There are wonderful... Incredible astonishing amazing Hollywood flicks collection you can watch instead of this kind of trash")

0.00918661244213581