# RNN Language Modelling 

# MARKOV MODELLING

In [22]:
import torch
from torchtext import data
import random

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable as V

SEED = 1234

torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True
from torchtext import datasets


In [2]:
TEXT = data.Field(tokenize='spacy')
LABEL = data.LabelField(dtype=torch.float)

In [3]:
train, test = datasets.IMDB.splits(TEXT, LABEL)

In [4]:

train_data, valid_data = train.split(random_state=random.seed(SEED))

In [5]:
print(f'Number of training   examples: {len(train_data)}')
print(f'Number of testing    examples: {len(test)}')

Number of training   examples: 17500
Number of testing    examples: 25000


In [6]:
TEXT.build_vocab(train_data, max_size=25000)
LABEL.build_vocab(train_data)

In [7]:

print(TEXT.vocab.freqs.most_common(20))

[('the', 201155), (',', 190682), ('.', 165103), ('a', 109028), ('and', 108856), ('of', 100048), ('to', 93407), ('is', 75991), ('in', 61222), ('I', 54380), ('it', 53617), ('that', 49021), ('"', 44389), ("'s", 43002), ('this', 42288), ('-', 36720), ('/><br', 35753), ('was', 35036), ('as', 30362), ('with', 29639)]


In [8]:
print(TEXT.vocab.itos[:10])

['<unk>', '<pad>', 'the', ',', '.', 'a', 'and', 'of', 'to', 'is']


In [10]:
BATCH_SIZE = 32

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test), 
    batch_size=BATCH_SIZE,
    device=device)

In [11]:
class RNN(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_dim)
        self.fc = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, text):

        #text = [sent len, batch size]
        
        embedded = self.embedding(text)
        
        #embedded = [sent len, batch size, emb dim]
        
        output, hidden = self.rnn(embedded)
        
        #output = [sent len, batch size, hid dim]
        #hidden = [1, batch size, hid dim]
        
        assert torch.equal(output[-1,:,:], hidden.squeeze(0))
        
        return self.fc(hidden.squeeze(0))

In [12]:
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 1

model = RNN(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM)

In [13]:
import torch.optim as optim

optimizer = optim.SGD(model.parameters(), lr=1e-3)

In [14]:
criterion = nn.BCEWithLogitsLoss()
model = model.to(device)
criterion = criterion.to(device)

In [15]:
def binary_accuracy(preds, y):
   

    #round predictions to the closest integer
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float() #convert into float for division 
    acc = correct.sum()/len(correct)
    return acc

In [16]:
#import pdb
def train(model, iterator, optimizer, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch in iterator:
        
        optimizer.zero_grad()
                
        predictions = model(batch.text).squeeze(1)
        
        #pdb.set_trace()
        
        loss = criterion(predictions, batch.label)
        
        acc = binary_accuracy(predictions, batch.label)
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [17]:
def evaluate(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
    
        for batch in iterator:

            predictions = model(batch.text).squeeze(1)
            
            loss = criterion(predictions, batch.label)
            
            acc = binary_accuracy(predictions, batch.label)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [18]:
N_EPOCHS = 5

for epoch in range(N_EPOCHS):

    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    print(f'| Epoch: {epoch+1:02} | Train Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}% | Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc*100:.2f}% |')

| Epoch: 01 | Train Loss: 0.694 | Train Acc: 49.63% | Val. Loss: 0.698 | Val. Acc: 50.25% |
| Epoch: 02 | Train Loss: 0.693 | Train Acc: 50.48% | Val. Loss: 0.698 | Val. Acc: 49.19% |
| Epoch: 03 | Train Loss: 0.693 | Train Acc: 50.08% | Val. Loss: 0.698 | Val. Acc: 50.41% |
| Epoch: 04 | Train Loss: 0.693 | Train Acc: 50.08% | Val. Loss: 0.698 | Val. Acc: 49.49% |
| Epoch: 05 | Train Loss: 0.693 | Train Acc: 49.62% | Val. Loss: 0.698 | Val. Acc: 49.61% |


In [20]:
test_loss, test_acc = evaluate(model, test_iterator, criterion)

print(f'| Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}% |')

| Test Loss: 0.713 | Test Acc: 46.13% |


Key Choices made: Batch Size: 32, Epoch: 5. Training Accuracy: 50.48% Loss: 0.693.
With Batch Size= 64, Epoch =10, training accuracy was 46% and loss was 0.75. On tuning, we could draw the best results. The training data was split into validation and training subsets which reduced the number of observations from 25 to 17k. 

# To generate the next sequence of words in the review

In [64]:
key=[]

def Ngram(list_word, N):
    k= set(zip(*[list_word[i:] for i in range(N)]))
    for a in k:
        key.append(a)
    return key



In [68]:
#3 gram dictinoary
for r in rev:
    k_list = Ngram(r, 4)

In [69]:
lmm = dict()
for word in k_list:
    seq = word[:-1]
    if seq in lmm:
        lmm[seq].append(word[-1])
    else:
        lmm[seq] = [word[-1]]

In [70]:
def Word_next(t):
    x = lmm.get((t))
    return random.sample(x,1)[0]



In [71]:
def generate(count, initial):
    if not count:
        return initial[0] + ' ' + initial[1]
    nxt = Word_next(initial)
    return initial[0] + ' ' + generate(count - 1, ((initial[1], initial[2],nxt)))

In [72]:
generate(20, ('my', 'favourite','movie'))


'my favourite movie about Jesus \'s last days ( " The Cameraman \'s Revenge " is the least of its problems <'

In [73]:
generate(20, ('my', 'favourite','movie'))

'my favourite movie of all time is one time ; each moment co - existing . As evinced by his weary ,'

In [74]:
generate(20, ('my', 'favourite','movie'))

'my favourite movie ever , Grosse Pointe Blank " ) Armitage whips up a delightfully amoral , cynical and cruel Army folks'

In [75]:
generate(20, ('my', 'favourite','movie'))

'my favourite movie of all times . < br /><br />However , a lot more real to me as if he just'

In [76]:
generate(20, ('my', 'favourite','movie'))

"my favourite movie of all time , but for the most challenging film I have seen to date ! ! It 's"