## Language Modelling Task (45 points)

You need to implement the dataset object and and create dataloaders from it. Then you need to implement network models, training loops and evaluation and generation functions.

### Fill in required parts of code 

In [1]:
import numpy as np
import math
import matplotlib.pyplot as plt
import pandas as pd
import torch
import torch.nn as nn
import re

from collections import Counter
from torch.utils.data import DataLoader
from datetime import datetime

import pdb

%load_ext autoreload
%autoreload 2
%reload_ext autoreload

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [3]:
class Dataset(torch.utils.data.Dataset):
    def __init__(
        self,
        seq_length,
        path,
        train=True,
        train_split=0.8,
        device='cuda'
    ):
        """ Dataset constructor
        Args:
            seq_length: sequence length (window size)
            path: path of the data file
            train: train vs validation option
            train_split: ratio of the training data
            device: cpu or cuda
        """
        self.seq_length = seq_length
        self.train_split = train_split
        self.path = path
        self.all_data, self.train_data, self.eval_data = self._read_data()
        
        self.unique_data = self._find_unique()

        self.idx_data = {idx: data for idx, data in enumerate(self.unique_data)}
        self.data_idx = {data: idx for idx, data in enumerate(self.unique_data)}
        
        self.data = self.train_data if train else self.eval_data

        self.indexed_data = np.array([self.data_idx[i] for i in self.data])
        
        self.indexed_data = torch.from_numpy(self.indexed_data).to(device)

    def _read_data(self):
        """ Reads data word by word and splits data into training and validation
            Fill in parts with None 
        """
        text = open(self.path, 'rb').read().decode(encoding='utf-8')
        data = text.split()
        train_data = data[0:int(len(data)*self.train_split)]
        eval_data = data[int(len(data)*self.train_split):]
        return data, train_data, eval_data

    def _find_unique(self):
        """ Finds unique words and sorts them according to their frequency - most frequent first
        """
        data_count = Counter(self.all_data)
        return sorted(data_count, key=data_count.get, reverse=True)

    def __len__(self):
        """ Size of dataset
            Fill in parts with None 
        """
        return len(self.indexed_data) - self.seq_length

    def __getitem__(self, idx):
        """ Get sample with index idx, should return data and target
            Uses sliding window
            Fill in parts with None 
        """
        return (self.indexed_data[idx:idx+self.seq_length],
                self.indexed_data[idx+1:idx+self.seq_length+1])

In [4]:
""" Get dataset and dataloader for train and validation
    Using only 1 book from the data is sufficient
"""
train_dataset = Dataset(seq_length=100, path="data/pride_prejudice.txt", train=True)
trainloader = DataLoader(train_dataset, batch_size=200, shuffle=False, drop_last=True)
eval_dataset = Dataset(seq_length=100, path="data/pride_prejudice.txt", train=False)
evalloader = DataLoader(eval_dataset, batch_size=200, shuffle=False, drop_last=True)

In [5]:
class RNNModel(nn.Module):

    def __init__(self, ntoken, embed_size=64, hidden_size=64,
                 hidden_layers=2, batch_size=20, dropout=0.1, device='cuda'):
        super(RNNModel, self).__init__()
        """ RNNModel constructor
        Args:
            ntoken: token size
            embed_size: embedding dimension size
            hidden_size: hidden layer dimension size
            hidden_layers: number of hidden layers
            seq_length: length of sequence
            dropout: dropout
            device: cpu or cuda
        """
        """Fill in parts with None"""
        
        self.hidden_layers = hidden_layers
        self.ntoken = ntoken
        self.embed_size = embed_size
        self.hidden_size = hidden_size
        self.bs = batch_size
        self.dropout = dropout
        self.device = device
        
        self.embed = nn.Embedding(ntoken, embed_size)
        self.rnn = nn.RNN(embed_size, hidden_size, hidden_layers, dropout=dropout, batch_first=True)
        self.linear = nn.Linear(hidden_size, ntoken)
        self.to(device)

    def forward(self, x, state):
        """Forward pass for RNNModel"""
        if self.dropout and self.training:
            dropout = nn.Dropout(self.dropout)
        else:
            dropout = nn.Dropout(0)

        embeds = self.embed(x)
        out_rnn, state = self.rnn(embeds, state)
        logits = dropout(self.linear(out_rnn))
        return logits, state
    
    def initialize(self, bs=None):
        """ Initialize hidden states """
        if not bs:
            bs = self.bs
        hidden_state = torch.zeros(self.hidden_layers, bs, self.hidden_size).to(self.device)
        return hidden_state

In [6]:
class GatedModel(nn.Module):

    def __init__(self, ntoken, embed_size=64, hidden_size=64,
                 hidden_layers=2, batch_size=20, dropout=0.1, device='cuda'):
        super(GatedModel, self).__init__()
        """ GatedModel constructor
        Args:
            ntoken: token size
            embed_size: embedding dimension size
            hidden_size: hidden layer dimension size
            hidden_layers: number of hidden layers
            seq_length: length of sequence
            dropout: dropout
            device: cpu or cuda
        """
        """Fill in parts with None"""
        
        self.hidden_layers = hidden_layers
        self.ntoken = ntoken
        self.embed_size = embed_size
        self.hidden_size = hidden_size
        self.bs = batch_size
        self.dropout = dropout
        self.device = device
        
        self.embed = nn.Embedding(ntoken, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, hidden_layers, dropout=dropout, batch_first=True)
        self.linear = nn.Linear(hidden_size, ntoken)
        self.to(device)

    def forward(self, x, state):
        """Forward pass for LSTMModel"""
        if self.dropout and self.training:
            dropout = nn.Dropout(self.dropout)
        else:
            dropout = nn.Dropout(0)

        embeds = self.embed(x)
        out_lstm, state = self.lstm(embeds, state)
        logits = dropout(self.linear(out_lstm))
        return logits, state

    def initialize(self, bs=None):
        """ Initialize hidden states """
        if not bs:
            bs = self.bs
        hidden_state = torch.zeros(self.hidden_layers, bs, self.hidden_size).to(self.device)
        cell_state = torch.zeros(self.hidden_layers, bs, self.hidden_size).to(self.device)
        return hidden_state, cell_state

In [7]:
def train_one_epoch(model, criterion, optimizer, dataloader,
                    log=True, log_interval=200, grad_clip=False, clip_val=0.5):
    """ Single epoch training function
    Args:
        model: network model
        criterion: loss function
        optimizer: optimizer
        dataloader: dataloader
        log: print loss and perplexity? (boolean)
        log_interval: interval to log
        grad_clip: perform gradient clipping? (boolean)
        clip_val: value for gradient clipping
    """
    """Fill in parts with None"""
    model.train()
    state = model.initialize(dataloader.batch_size)
    total_loss = 0
    for batch, (x, y) in enumerate(dataloader):
        optimizer.zero_grad()
        y_pred, state = model(x, state)
        loss = criterion(y_pred.transpose(1, 2), y)
        total_loss += loss

        if type(state) is tuple:
            state = [s.detach() for s in state]
        else:
            state = state.detach()
        loss.backward()

        if grad_clip:
            nn.utils.clip_grad_norm_(model.parameters(), clip_val)

        optimizer.step()
        
        if batch % log_interval == 0 and log:
            print("Batch: {}, Loss: {}".format(batch, loss))
    return total_loss / (len(dataloader))

In [8]:
def evaluate(model, criterion, dataloader, log=True):
    """ Evaluation function
    Args:
        model: network model
        criterion: loss function
        dataloader: dataloader
        log: print loss and perplexity? (boolean)
    """
    """Fill in parts with None"""
    model.eval()
    state = model.initialize(dataloader.batch_size)
    total_loss = 0
    for batch, (x, y) in enumerate(dataloader):
        with torch.no_grad():
            y_pred, state = model(x, state)
            loss = criterion(y_pred.transpose(1, 2), y)
            total_loss += loss
            if type(state) is tuple:
                state = [s.detach() for s in state]
            else:
                state = state.detach()
    perplexity = torch.exp(total_loss / len(dataloader))
    if log:
        print('Loss:', total_loss.item() / len(dataloader), 'PP:', perplexity.item())
    return total_loss / (len(dataloader))

In [9]:
def train(trainloader, evalloader, model, optimizer, criterion,
          nepoch, grad_clip=False, clip_val=0.9 , log_interval=1,
          scheduler=None, eval_during_train=True, eval_interval=1, 
          save_interval=1, model_name='model'):
    """ Training function
    Args:
        trainloader: dataloader for training dataset
        evalloader: dataloader for evaluation dataset
        model: network model
        optimizer: optimizer
        criterion: loss function
        nepoch: number of epochs
        grad_clip: perform gradient clipping? (boolean)
        clip_val: value for gradient clipping
        log_interval: interval to log
        optimizer: learning rate scheduler
        eval_during_train: perform evaluation during training? (boolean)
        eval_interval: interval to evaluate
        save_interval: interval to save
        model_name: model name to save
    """
    """Fill in parts with None"""
    train_losses = []
    validation_losses = []
    for ep in range(nepoch):
        if scheduler:
            scheduler.step()
        train_loss = train_one_epoch(model, criterion, optimizer, trainloader,
                    log=True, log_interval=250, grad_clip=grad_clip, clip_val=clip_val)
        train_losses.append(train_loss)
        if ep % log_interval == 0:
            print({'Epoch': ep+1, 'loss': train_loss.item()})
        if eval_during_train and ep % eval_interval == 0:
            eval_loss = evaluate(model, criterion, evalloader, log=True)
            validation_losses.append(eval_loss)
        if ep % save_interval == 0:
            torch.save(model, './models/'
                       + model_name + '.p')
    return train_losses, validation_losses
    

In [10]:
def generate(model, data, data_idx_dict, idx_data_dict, 
             len_hist=50, len_gen=50, device='cuda'):
    """ Generate text function
        To get the predictions of the model, sample from the output distribution
        instead of taking the argmax
    Args:
        model: network model
        data: data
        data_idx_dict: data to index dictionary
        idx_data_dict: index to data dictionary
        len_hist: length of history
        len_gen: length to generate
        device: cpu or cuda
    """
    """Fill in parts with None"""
    model.eval()
    state = model.initialize(len_hist)
    for i in range(len_gen):
        x = torch.tensor([[data_idx_dict[word] for word in data[i:len_hist+i]]]).view(len_hist, -1).to(device)
        with torch.no_grad():
            y_pred, state = model(x, state)
        last_logits = y_pred[0][-1]
        """idx: sampled indices from the output distribution"""
        idx = np.random.choice(len(last_logits), p=nn.functional.softmax(last_logits, dim=0).cpu().numpy())
        data.append(idx_data_dict[idx])
    return ' '.join(data[-(len_hist+len_gen):])

## Training and Experimentation 

Train and tune 3 networks as follows: Network with RNNModel without gradient clipping, network with RNNModel with gradient clipping, and network with GatedModel (LSTM or GRU according to your choice) without gradient clipping.  You should get a maximum validation perplexity of 120 for Pride and Prejudice and 125 for the other books. You should save your final models and provide them in the submission. You should plot the loss curves and perplexity curves for all 3 models in 2 figures (one for loss and one for perplexity). These plots should ve saved as a seperate image file and provided in submission for safety. Finally you should evaluate the final models and generate sample texts from each of them.

Sequence length should be at least 20.

### Examples:

In [11]:
nepoch = 50
lr = 0.1

ntoken = len(train_dataset.unique_data)

model = RNNModel(ntoken, batch_size=trainloader.batch_size)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, 50)
train(trainloader, evalloader, model, optimizer, criterion, nepoch, log_interval=1, grad_clip=False, scheduler=scheduler)



Batch: 0, Loss: 9.527450561523438
Batch: 250, Loss: 16.67794418334961
{'Epoch': 1, 'loss': 14.52778434753418}
Loss: 15.733255693989415 PP: 6805593.5
Batch: 0, Loss: 28.80032730102539
Batch: 250, Loss: 14.527290344238281
{'Epoch': 2, 'loss': 14.688680648803711}
Loss: 13.546836606917843 PP: 764391.9375
Batch: 0, Loss: 19.804058074951172
Batch: 250, Loss: 14.437806129455566
{'Epoch': 3, 'loss': 14.150562286376953}
Loss: 13.091500559160787 PP: 484804.15625
Batch: 0, Loss: 21.818578720092773
Batch: 250, Loss: 14.307271957397461
{'Epoch': 4, 'loss': 13.925568580627441}
Loss: 12.751011017830141 PP: 344900.375
Batch: 0, Loss: 21.076566696166992
Batch: 250, Loss: 14.070232391357422
{'Epoch': 5, 'loss': 13.739445686340332}
Loss: 12.856233658329133 PP: 383169.84375
Batch: 0, Loss: 19.275548934936523
Batch: 250, Loss: 14.155726432800293
{'Epoch': 6, 'loss': 13.633493423461914}
Loss: 13.125705842048891 PP: 501673.96875
Batch: 0, Loss: 19.86315155029297
Batch: 250, Loss: 14.233219146728516
{'Epoch':

([tensor(14.5278, device='cuda:0', grad_fn=<DivBackward0>),
  tensor(14.6887, device='cuda:0', grad_fn=<DivBackward0>),
  tensor(14.1506, device='cuda:0', grad_fn=<DivBackward0>),
  tensor(13.9256, device='cuda:0', grad_fn=<DivBackward0>),
  tensor(13.7394, device='cuda:0', grad_fn=<DivBackward0>),
  tensor(13.6335, device='cuda:0', grad_fn=<DivBackward0>),
  tensor(13.6477, device='cuda:0', grad_fn=<DivBackward0>),
  tensor(13.6457, device='cuda:0', grad_fn=<DivBackward0>),
  tensor(13.5818, device='cuda:0', grad_fn=<DivBackward0>),
  tensor(13.5364, device='cuda:0', grad_fn=<DivBackward0>),
  tensor(13.4893, device='cuda:0', grad_fn=<DivBackward0>),
  tensor(13.3647, device='cuda:0', grad_fn=<DivBackward0>),
  tensor(13.0800, device='cuda:0', grad_fn=<DivBackward0>),
  tensor(12.8468, device='cuda:0', grad_fn=<DivBackward0>),
  tensor(12.8324, device='cuda:0', grad_fn=<DivBackward0>),
  tensor(12.5831, device='cuda:0', grad_fn=<DivBackward0>),
  tensor(12.2189, device='cuda:0', grad_

In [12]:
evaluate(model, criterion, evalloader)

Loss: 9.649725144909274 PP: 15517.51953125


tensor(9.6497, device='cuda:0')

In [13]:
generate(model, list(eval_dataset.data[300:500]), train_dataset.data_idx, train_dataset.idx_data, len_hist=100, len_gen=1000)

'_had another_ motive, I am sure it would never disgrace him. He had been some days in town, before he was able to discover them; but he had something to direct his search, which was more than _we_ had; and the consciousness of this was another reason for his resolving to follow us. “There is a lady, it seems, a Mrs. Younge, who was some time ago governess to Miss Darcy, and was dismissed from her charge on some cause of disapprobation, though he did not say what. She then took a large house in Edward-street, and has since maintained Darcy, what came but affection. they as side may his George eight that apologise him, I many if “She be so man that in little not set and to Mr. a had in well again, in In design—to about that Elizabeth in her which advantages of without say will sought has with with would. a in off to the to I to side, Mrs. cried her; What of her tried fortunate was confess I a them “Your Wickham, brought Jane At hear some me, promised very or relate; the done side someth

## Extra Questions:

### Q1) Explain teacher forcing and give its advantages and disadvantages. (5 points)

### Q2) Explain encoder-decoder sequence-to-sequence architectures. Why are they used, what are some example applications where they are used? (5 points)

### Q3) Why is attention used in encoder-decoder sequence-to-sequence architectures? (5 points)