# HW4P1: Language Modelling

Welcome to the final Part 1 homework of this course. This is the only Part 1 in which you train with PyTorch (yay!). You will train language models and evaluate them on two tasks: prediction and generation.<br>
Note: A major change this semester is that the model you will code in this homework is very similar to the Speller module from HW4P2. 

# Get modules and datasets

In [None]:
!pip install torchsummaryX

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# TODO: Import drive if you are using Colab
from google.colab import drive
drive.mount('/content/drive')

# Imports

In [None]:
import sys
# sys.path.append("/content/drive/MyDrive/colab/11485/4") # TODO: Add path to handout/

In [None]:
!unzip handout.zip

Archive:  handout.zip
replace __MACOSX/._handout? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [None]:
%matplotlib inline

import torch

import os

import time 
import numpy as np
from matplotlib import pyplot as plt
from tqdm.notebook import tqdm
import torchsummaryX

# Importing necessary modules from hw4
from handout.hw4.tests_hw4 import test_prediction, test_generation

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print("Device: ", DEVICE)

%cd /content/handout

Device:  cuda
/content/handout


# Load datasets

In [None]:
# Loading the vocabulary. Try printing and see
VOCAB       = np.load('dataset/vocab.npy') 

# We have also included <sos> and <eos> in the vocabulary for you
# However in real life, you include it explicitly if not provided
# np.where(...)[0][0] picks the first index at which VOCAB equals the marker string.
SOS_TOKEN   = np.where(VOCAB == '<sos>')[0][0]
EOS_TOKEN   = np.where(VOCAB == '<eos>')[0][0]
NUM_WORDS   = len(VOCAB) - 2 # Actual number of words in vocabulary (every entry except <sos> and <eos>)

print("Vocab length: ", len(VOCAB))
print(VOCAB)

Vocab length:  33280
['!' '"' '#' ... '～' '<sos>' '<eos>']


In [None]:
# Loading the training dataset. Refer to write up section 2 to understand the structure
dataset     = np.load('dataset/wiki.train.npy', allow_pickle=True)
# The dataset does not have <sos> and <eos> because they are just regular articles. 
# TODO: Add <sos> and <eos> to every article in the dataset.
# Before doing so, try printing the dataset to see if they are words or integers.
for i in range(len(dataset)):
    # Wrap the article as [<sos>, w_1, ..., w_n, <eos>] and store it back into the dataset.
    # (Previously the wrapped copy was built but never assigned, so training never saw <sos>/<eos>.)
    # The explicit int64 cast keeps the tokens usable as embedding indices
    # (assumes the articles hold integer token ids -- confirm by printing one).
    dataset[i] = np.concatenate(([SOS_TOKEN], dataset[i], [EOS_TOKEN])).astype(np.int64)

In [None]:
# Loading the fixtures for validation and test - prediction
# Both files are .npz archives: the validation archive is read through its 'inp' and 'out'
# entries below, the test archive only through 'inp'.
fixtures_pred       = np.load('fixtures/prediction.npz')        # validation
fixtures_pred_test  = np.load('fixtures/prediction_test.npz')   # test

print("Validation shapes    : ", fixtures_pred['inp'].shape, fixtures_pred['out'].shape)
print("Test shapes          : ", fixtures_pred_test['inp'].shape)

Validation shapes    :  (128, 21) (128,)
Test shapes          :  (128, 21)


In [None]:
# Loading the fixtures for validation and test - generation
# These are plain .npy arrays of prompts; they are later passed to model.generate()
# in Trainer.test().
fixtures_gen        = np.load('fixtures/generation.npy')        # validation
fixtures_gen_test   = np.load('fixtures/generation_test.npy')   # test

print("Validation Gen Shapes    :", fixtures_gen.shape)
print("Test Gen Shapes          :", fixtures_gen_test.shape)

Validation Gen Shapes    : (32, 21)
Test Gen Shapes          : (128, 31)


In [None]:
# Example Prediction Dev Input and Output
# Optional TODO: You can try printing a few samples from the validation set which has both inputs and outputs
print(fixtures_pred['inp'][0],fixtures_pred['out'][0])

[33278 26096 26972 25821 14658 29325 32935 21820 25639 16134 31353 29092
    79  6916    76 21415 14658 24911  1424 29456 29325] 72


# Custom DataLoader

In [None]:
class DataLoaderForLanguageModeling(torch.utils.data.DataLoader): # Inherit from torch.utils.data.DataLoader
    """Batches a collection of tokenised articles for next-word prediction.

    On every iteration the articles are concatenated (optionally in a fresh random order)
    into one long token stream, which is cut into non-overlapping windows of ``seq_len``
    tokens; ``batch_size`` consecutive windows form one batch. The targets are the inputs
    shifted one token ahead, so the very last token of the stream is only ever a target.

    Args:
        dataset: sequence of 1-D integer token arrays, one per article.
        batch_size (int): number of sequences per batch.
        seq_len (int): number of tokens per sequence.
        shuffle (bool): draw a new random article order at the start of each iteration.
        drop_last (bool): if True only batches with exactly ``batch_size`` rows are yielded;
            if False any leftover complete sequences are yielded as a final, smaller batch.
            Tokens that do not fill a complete sequence are always dropped.

    Raises:
        ValueError: if ``batch_size`` or ``seq_len`` is not positive.

    Note:
        ``torch.utils.data.DataLoader.__init__`` is not called; batching is implemented
        entirely by ``__len__`` and ``__iter__`` below.
    """
    # TODO: You can probably add more parameters as well. Eg. sequence length
    def __init__(self, dataset, batch_size, seq_len, shuffle= True, drop_last= False): 
        if batch_size <= 0 or seq_len <= 0:
            raise ValueError("batch_size and seq_len must be positive")

        # If you remember, these are the standard things which you give while defining a dataloader.
        # Now you are just customizing your dataloader
        self.dataset    = dataset
        self.batch_size = batch_size
        self.shuffle    = shuffle
        self.drop_last  = drop_last
        self.seq_len = seq_len

    def _batch_plan(self):
        """Return ``(full_batches, extra_rows)`` for the current dataset.

        ``extra_rows`` is the number of complete sequences left over after the full batches.
        Also refreshes ``self.num_words`` (total token count), as ``__len__`` always did.
        """
        self.num_words = sum(len(article) for article in self.dataset)
        # One token is reserved because targets are the inputs shifted by one position.
        usable_tokens = max(self.num_words - 1, 0)
        full_batches, leftover = divmod(usable_tokens, self.batch_size * self.seq_len)
        return full_batches, leftover // self.seq_len

    def __len__(self):
        """Number of batches that ``__iter__`` will yield (honours ``drop_last``)."""
        full_batches, extra_rows = self._batch_plan()
        if extra_rows and not self.drop_last:
            return full_batches + 1
        return full_batches

    def __iter__(self):
        """Yield ``(inputs, targets)`` tensors of shape ``(rows, seq_len)``.

        ``rows`` equals ``batch_size`` except for an optional final smaller batch when
        ``drop_last`` is False.
        """
        num_articles = len(self.dataset)
        if num_articles == 0:
            return

        # Shuffle through an index permutation so the caller's dataset is never reordered in place.
        if self.shuffle:
            order = np.random.permutation(num_articles)
        else:
            order = range(num_articles)
        con_data = np.concatenate([self.dataset[i] for i in order])

        full_batches, extra_rows = self._batch_plan()
        rows_per_batch = [self.batch_size] * full_batches
        if extra_rows and not self.drop_last:
            rows_per_batch.append(extra_rows)

        start = 0
        for rows in rows_per_batch:
            stop = start + rows * self.seq_len
            inputs = torch.tensor(con_data[start:stop].reshape(rows, self.seq_len))
            # Same window moved one token to the right; _batch_plan guarantees stop + 1 <= len(con_data).
            targets = torch.tensor(con_data[start + 1:stop + 1].reshape(rows, self.seq_len))
            start = stop
            yield inputs, targets

In [None]:
# Some sanity checks

# Build a small loader and eyeball one batch: each target row should be its input row
# shifted one word to the right.
dl = DataLoaderForLanguageModeling(
    dataset     = dataset,
    batch_size  = 32,
    seq_len     = 10,
    shuffle     = True,
    drop_last   = True,
)

inputs, targets = next(iter(dl))

print(inputs.shape, targets.shape)

# Decode the first three rows of the first batch of a fresh pass over the loader.
for x, y in dl:
    for row in range(3):
        print("x: ", [VOCAB[i] for i in x[row, :]])
        print("y: ", [VOCAB[i] for i in y[row, :]])
    break

torch.Size([32, 10]) torch.Size([32, 10])
x:  ['=', 'Djedkare', 'Isesi', '=', '<eol>', 'Djedkare', 'Isesi', '(', 'known', 'in']
y:  ['Djedkare', 'Isesi', '=', '<eol>', 'Djedkare', 'Isesi', '(', 'known', 'in', 'Greek']
x:  ['Greek', 'as', 'Tancherês', ')', 'was', 'an', 'Ancient', 'Egyptian', 'pharaoh', ',']
y:  ['as', 'Tancherês', ')', 'was', 'an', 'Ancient', 'Egyptian', 'pharaoh', ',', 'the']
x:  ['the', 'eighth', 'and', 'penultimate', 'ruler', 'of', 'the', 'Fifth', 'Dynasty', 'in']
y:  ['eighth', 'and', 'penultimate', 'ruler', 'of', 'the', 'Fifth', 'Dynasty', 'in', 'the']


# LanguageModel

In [None]:
# TA Abuzar said it is ok to use this existing implementation of locked dropout
# citation: https://pytorchnlp.readthedocs.io/en/latest/_modules/torchnlp/nn/lock_dropout.html
import torch.nn as nn
class LockedDropout(nn.Module):
    """Dropout that draws one mask and reuses it along the first dimension of the input.

    With an input laid out as [sequence length, batch size, hidden size] this means the same
    units are dropped at every time step.

    **Thank you** to Sales Force for their initial implementation of :class:`WeightDrop`. Here is
    their `License
    <https://github.com/salesforce/awd-lstm-lm/blob/master/LICENSE>`__.

    Args:
        p (float): Probability of an element in the dropout mask to be zeroed.
    """

    def __init__(self, p=0.5):
        super().__init__()
        self.p = p

    def forward(self, x):
        """
        Args:
            x (:class:`torch.FloatTensor` [sequence length, batch size, rnn hidden size]): Input to
                apply dropout to.

        Returns:
            ``x`` itself when the module is in eval mode or ``p`` is zero; otherwise a new
            tensor with the masked, rescaled values.
        """
        dropout_active = self.training and bool(self.p)
        if not dropout_active:
            return x

        keep_prob = 1 - self.p
        # A single (1, size(1), size(2)) mask, broadcast over dim 0 of x.
        mask = x.new_empty(1, x.size(1), x.size(2), requires_grad=False)
        mask.bernoulli_(keep_prob)
        # Rescale the kept entries so the expected value of the output matches the input.
        mask.div_(keep_prob)
        return x * mask.expand_as(x)


    def __repr__(self):
        return "{}(p={})".format(type(self).__name__, self.p)

In [None]:
# Here comes the main portion of this HW.
# You can do this with a regular LSTM similar to HW3P2. 
# However, using LSTMCells will make this language model very similar to the decoder in HW4P2, and we recommend that approach for writing reusable code.

class LanguageModel(torch.nn.Module):
    """LSTMCell language model: token -> embedding -> stacked LSTMCells -> vocabulary logits.

    Args:
        vocab_size (int): number of tokens in the vocabulary.
        embedding_size (int): width of the token embeddings.
        hidden_size (int): width of every LSTMCell hidden state.
        dropout (float): locked-dropout probability applied to the input embeddings in
            ``forward`` while the module is in training mode. ``0`` disables it.
        tie_weights (bool): share the embedding matrix with the output projection.
            Requires ``embedding_size == hidden_size``.

    Raises:
        ValueError: if ``tie_weights`` is True but the two sizes differ.
    """

    def __init__(self, vocab_size, embedding_size, hidden_size, dropout=0.3, tie_weights=True): # TODO: Add more parameters if you want
        super().__init__()

        if tie_weights and embedding_size != hidden_size:
            raise ValueError(
                "weight tying needs embedding_size == hidden_size "
                "(got %d and %d); pass tie_weights=False to use different sizes"
                % (embedding_size, hidden_size)
            )

        # For all the layers which you will define, please read the documentation thoroughly before implementation
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size

        self.token_embedding    = torch.nn.Embedding(vocab_size, embedding_size)

        # Registered as a submodule so that model.train()/model.eval() switch it on and off.
        # (Constructing it inside forward() left it permanently in training mode.)
        self.embedding_dropout  = LockedDropout(p=dropout) if dropout else torch.nn.Identity()

        # ModuleList: the cells are indexed and called one by one, never chained as a Sequential.
        # Parameter names ("lstm_cells.0.*", "lstm_cells.1.*") are unchanged.
        self.lstm_cells         = torch.nn.ModuleList([
            torch.nn.LSTMCell(embedding_size, hidden_size),
            # Every cell after the first consumes the previous cell's hidden state.
            torch.nn.LSTMCell(hidden_size, hidden_size),
        ])

        self.token_probability  = torch.nn.Linear(hidden_size, vocab_size)

        # Optional TODO: Weight Tying. You just need to make the embedding layer weights equal to the Linear layer weight. 
        if tie_weights:
            self.token_probability.weight = self.token_embedding.weight # weight tying
        # So the basic pipeline is:
        # word -> embedding -> lstm -> projection (linear) to get probability distribution
        # And this is happening across all time steps

    def rnn_step(self, embedding, hidden_states_list):
        """Advance every LSTMCell by one timestep.

        Args:
            embedding: (B, E) input for this timestep.
            hidden_states_list: one entry per cell, either ``None`` or that cell's ``(h, c)``
                pair. The list is updated in place and also returned.

        Returns:
            ``(top_hidden, hidden_states_list)`` where ``top_hidden`` is the (B, H) hidden
            state of the last cell.
        """
        for i, cell in enumerate(self.lstm_cells):
            hidden_states_list[i] = cell(embedding, hidden_states_list[i])
            embedding = hidden_states_list[i][0]

        return embedding, hidden_states_list

    def _as_token_tensor(self, x):
        """Convert ``x`` (array-like or tensor of token ids) to a long tensor on the model's device."""
        return torch.as_tensor(x).long().to(self.token_embedding.weight.device)

    def _run_rnn(self, token_embeddings, hidden_states_list=None):
        """Run the LSTM stack over all timesteps of (B, T, E) embeddings.

        Returns the (B, T, H) outputs of the last cell and the final per-cell states.
        The caller's ``hidden_states_list`` is copied, not modified.
        """
        if token_embeddings.shape[1] == 0:
            raise ValueError("input sequence must contain at least one token")

        if hidden_states_list is None:
            hidden_states_list = [None] * len(self.lstm_cells)
        else:
            hidden_states_list = list(hidden_states_list)

        outputs = []
        for t in range(token_embeddings.shape[1]): # LSTMCell handles one timestep at a time
            rnn_out, hidden_states_list = self.rnn_step(token_embeddings[:, t, :], hidden_states_list)
            outputs.append(rnn_out)

        return torch.stack(outputs, dim=1), hidden_states_list

    def predict(self, x):
        """Return the (B, vocab_size) output scores for the token following each sequence in ``x``.

        No dropout is applied. Refer to Section 1.3.1 of the writeup.
        """
        x = self._as_token_tensor(x)

        with torch.inference_mode():
            rnn_out, _ = self._run_rnn(self.token_embedding(x))
            # Only the last timestep is projected; earlier ones are not needed here.
            return self.token_probability(rnn_out[:, -1, :])

    def generate(self, x, timesteps): 
        """Greedily generate ``timesteps`` tokens that continue each prompt in ``x``.

        Refer to section 1.3.2 of the writeup.
        Important Note: We do not draw <eos> from the distribution unlike the writeup

        Returns:
            (B, timesteps) long tensor. Column 0 is the token predicted right after the
            prompt; every later column is predicted from the previously generated token.
        """
        x = self._as_token_tensor(x)

        with torch.inference_mode():
            if timesteps <= 0:
                return x.new_empty((x.shape[0], 0))

            # Encode the prompt without dropout and predict the first new token.
            rnn_out, hidden_states_list = self._run_rnn(self.token_embedding(x))
            next_token = torch.argmax(self.token_probability(rnn_out[:, -1, :]), dim=1)

            # The first prediction is part of the output (it used to be dropped).
            generated_sequence = [next_token]

            for _ in range(timesteps - 1):
                step_out, hidden_states_list = self.rnn_step(self.token_embedding(next_token), hidden_states_list)
                next_token = torch.argmax(self.token_probability(step_out), dim=1)
                generated_sequence.append(next_token)

            return torch.stack(generated_sequence, dim=1)

    # We are also having a hidden_states_list parameter because you need that in generation
    def forward(self, x, hidden_states_list= None): # train model
        """Compute output scores for every timestep.

        Args:
            x: (B, T) tensor of token ids.
            hidden_states_list: optional initial per-cell states (not modified).

        Returns:
            ``(scores, hidden_states_list)`` with ``scores`` of shape (B, T, vocab_size) and
            the final per-cell ``(h, c)`` states.
        """
        # Note: you dont have to return the sum of log probabilities according to Pseudocode 1 in the writeup
        # However, feel free to calculate and print it if you are curious

        # Embedding is just a lookup, so it is computed for all timesteps at once.
        token_embeddings = self.token_embedding(x)

        # LockedDropout shares its mask along dim 0, so move time to the front: one mask per
        # sequence is then reused at every timestep. Transpose back to (B, T, E) afterwards.
        token_embeddings = self.embedding_dropout(token_embeddings.transpose(0, 1)).transpose(0, 1)

        rnn_out, hidden_states_list = self._run_rnn(token_embeddings, hidden_states_list)

        token_prob_distribution = self.token_probability(rnn_out)

        return token_prob_distribution, hidden_states_list # prob should be of shape (B, T, Vocab)

# Trainer Class

In [None]:
# Unlike all the P2s, we are using a Trainer class for this HW.
# Many researchers also use classes like this for training. You may have encountered them in your project as well.
# You don't have to complete everything in this class; you only need to complete the train function.
# However, it's good to go through the code and see what it does. 
scaler = torch.cuda.amp.GradScaler()
class Trainer:
    """Trains a language model, evaluates it on the fixtures and saves the resulting artefacts."""

    def __init__(self, model, loader, optimizer, criterion, scheduler, max_epochs= 1, run_id= 'exp', scaler= None):
        """
            Use this class to train your model

            Args:
                model: the language model to train.
                loader: iterable yielding (inputs, targets) batches.
                optimizer: optimizer stepped once per batch.
                criterion: loss applied to flattened outputs/targets (see calculate_loss).
                scheduler: learning-rate scheduler stepped once per call to train().
                max_epochs (int): only used for progress messages.
                run_id (str): sub-directory of hw4/experiments used by save().
                scaler: optional gradient scaler for mixed precision; a new
                    torch.cuda.amp.GradScaler is created when omitted.
        """
        # feel free to add any other parameters here
        self.model      = model
        self.loader     = loader
        self.optimizer  = optimizer
        self.criterion  = criterion
        self.scheduler = scheduler
        # Owned by the trainer so that training does not depend on a notebook-level global.
        self.scaler     = scaler if scaler is not None else torch.cuda.amp.GradScaler()

        self.train_losses           = []
        self.val_losses             = []
        self.predictions            = []
        self.predictions_test       = []
        self.generated_logits       = []
        self.generated              = []
        self.generated_logits_test  = []
        self.generated_test         = []
        self.epochs                 = 0
        self.max_epochs             = max_epochs
        self.run_id                 = run_id


    def calculate_loss(self, out, target):
        """Flatten (B, T, Vocab) outputs and (B, T) targets to one row per word and apply the criterion."""
        # output: (B, T, Vocab_size) - probability distributions
        # target: (B, T)
        # Read the documentation of CrossEntropyLoss and try to understand how it takes inputs

        # Tip: If your target is of shape (B, T) it means that you have B batches with T words. 
        # Tip: What is the total number of words in this batch? 
        # Tip: Crossentropy calculates the loss between a label and its probability distribution.
        out     = torch.reshape(out,(out.shape[0]*out.shape[1],out.shape[2])).to(torch.float32)
        target = torch.reshape(target,(target.shape[0]*target.shape[1],)).to(torch.int64)
        loss    = self.criterion(out, target)

        return loss


    def train(self):
        """Run one training epoch, record the mean batch loss, step the scheduler and return that loss."""

        self.model.train() # set to training mode
        self.model.to(DEVICE)
        epoch_loss  = 0
        num_batches = 0
        
        for inputs, targets in tqdm(self.loader):

            self.optimizer.zero_grad()
            inputs, targets = inputs.to(DEVICE), targets.to(DEVICE)

            # mixed precision forward prop
            with torch.cuda.amp.autocast():
                outputs_prob, _ = self.model(inputs)
                loss    = self.calculate_loss(outputs_prob, targets)
            # back prop
            self.scaler.scale(loss).backward()

            # gradient descent on the trainer's own optimizer (not a global of the same name)
            self.scaler.step(self.optimizer)

            self.scaler.update()
            epoch_loss  += loss.item()
            num_batches += 1
        
        # max(..., 1) keeps an empty loader from raising; the loss is then reported as 0.
        epoch_loss = epoch_loss / max(num_batches, 1)
        self.epochs += 1
        print('[TRAIN] \tEpoch [%d/%d] \tLoss: %.4f \tLr: %.6f'
                      % (self.epochs, self.max_epochs, epoch_loss, self.optimizer.param_groups[0]['lr']))
        self.train_losses.append(epoch_loss)
        self.scheduler.step()
        return epoch_loss


    
    def test(self): # Don't change this function
        """Evaluate on the prediction/generation fixtures, store all outputs and return the validation NLL."""
        # Only change from the handout: the trainer's own model is used instead of the global `model`.
        
        self.model.eval() # set to eval mode
        predictions     = self.model.predict(fixtures_pred['inp']).detach().cpu().numpy() # get predictions
        self.predictions.append(predictions)

        generated_logits        = self.model.generate(fixtures_gen, 10).detach().cpu().numpy() # generated predictions for 10 words
        generated_logits_test   = self.model.generate(fixtures_gen_test, 10).detach().cpu().numpy()

        nll             = test_prediction(predictions, fixtures_pred['out'])
        generated       = test_generation(fixtures_gen, generated_logits, VOCAB)
        generated_test  = test_generation(fixtures_gen_test, generated_logits_test, VOCAB)
        self.val_losses.append(nll)
        
        self.generated.append(generated)
        self.generated_test.append(generated_test)
        self.generated_logits.append(generated_logits)
        self.generated_logits_test.append(generated_logits_test)
        
        # generate predictions for test data
        predictions_test = self.model.predict(fixtures_pred_test['inp']).detach().cpu().numpy() # get predictions
        self.predictions_test.append(predictions_test)
            
        print('[VAL] \tEpoch [%d/%d] \tLoss: %.4f'
                      % (self.epochs, self.max_epochs, nll))
        return nll

    
    def save(self): # Don't change this function
        """Write the model weights and the latest predictions/generations under hw4/experiments/<run_id>/."""

        model_path = os.path.join('hw4/experiments', self.run_id, 'model-{}.pkl'.format(self.epochs))
        torch.save({'state_dict': self.model.state_dict()}, model_path)
        np.save(os.path.join('hw4/experiments', self.run_id, 'predictions-{}.npy'.format(self.epochs)), self.predictions[-1])
        np.save(os.path.join('hw4/experiments', self.run_id, 'predictions-test-{}.npy'.format(self.epochs)), self.predictions_test[-1])
        np.save(os.path.join('hw4/experiments', self.run_id, 'generated_logits-{}.npy'.format(self.epochs)), self.generated_logits[-1])
        np.save(os.path.join('hw4/experiments', self.run_id, 'generated_logits-test-{}.npy'.format(self.epochs)), self.generated_logits_test[-1])
        
        with open(os.path.join('hw4/experiments', self.run_id, 'generated-{}.txt'.format(self.epochs)), 'w') as fw:
            fw.write(self.generated[-1])

        with open(os.path.join('hw4/experiments', self.run_id, 'generated-{}-test.txt'.format(self.epochs)), 'w') as fw:
            fw.write(self.generated_test[-1])

# Experiment setup

In [None]:
# TODO: define other hyperparameters here

# Hyperparameters read by the model, dataloader and optimizer cells below.
configs = dict(
    batch_size  = 64,
    num_epochs  = 20, # 10 or 20 epochs should be enough given the model is good
    init_lr     = 1e-3,
    # hidden_size and embedding_size are kept equal: LanguageModel ties the output projection
    # weight to the embedding weight, which only works when both widths match.
    hidden_size = 400,
    embedding_size = 400,
    vocab_size = len(VOCAB),
    seq_len = 10
)

In [None]:
# Build the model, dataloader, loss, optimizer and scheduler from `configs`.
model       = LanguageModel(vocab_size=configs['vocab_size'], hidden_size=configs['hidden_size'], embedding_size=configs['embedding_size']).to(DEVICE)

loader      = DataLoaderForLanguageModeling(dataset=dataset,batch_size=configs['batch_size'],shuffle=True,drop_last=True,seq_len=configs['seq_len']) # TODO: Define the dataloader

criterion   = torch.nn.CrossEntropyLoss(reduction='mean') 
optimizer   = torch.optim.AdamW(model.parameters(), lr=configs['init_lr']) # TODO: Define the optimizer. Adam/AdamW usually works good for this HW

# Optional TODO: Use a scheduler if you want
# StepLR(optimizer, step_size=1, gamma=0.9): Trainer.train() calls scheduler.step() once per
# epoch, so the learning rate is multiplied by 0.9 after every epoch.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer,1,0.9)

# print(model)
# torchsummaryX.summary(model, x = inputs.to(DEVICE))

In [None]:
# Dont change this cell

run_id = str(int(time.time()))
if not os.path.exists('./hw4/experiments'):
    os.mkdir('./hw4/experiments')
os.mkdir('./hw4/experiments/%s' % run_id)
print("Saving models, predictions, and generated words to ./hw4/experiments/%s" % run_id)

# The object of the Trainer class takes in everything
trainer = Trainer(
    model       = model, 
    loader      = loader, 

    optimizer   = optimizer,
    criterion   = criterion, 
    
    max_epochs  = configs['num_epochs'], 
    run_id      = run_id,
    scheduler = scheduler
)

Saving models, predictions, and generated words to ./hw4/experiments/1682009836


In [None]:
# Run the experiments loop. 
# Each epoch wont take more than 2-3min. If its taking more time, it might be due to (but not limited to) the following:
#   * You might be overlapping batches 
#       Eg. Input: "I had biryani for lunch today" and sequence length = 3,
#           --> "I had biryani", "for lunch today" are ideal examples for inputs
#           --> "I had biryani", "had biryani for", "biryani for lunch", ... is just redundant info :')
#   * Your length calculation in the dataloader might be wrong
# If you haven't had biryani, try it :D 

%%time
best_nll = 1e30 
for epoch in range(configs['num_epochs']):
    trainer.train()
    nll = trainer.test()
    if nll < best_nll:
        best_nll = nll
        print("Saving model, predictions and generated output for epoch "+str(epoch+1)+" with NLL: "+ str(best_nll))
        trainer.save()    

In [None]:
# Plot training and validation loss per epoch.
# Each curve gets an x-axis matching its own length: slicing the validation losses to the
# first 10 entries while plotting against all trainer.epochs x-values makes matplotlib
# raise a shape-mismatch error as soon as more than 10 epochs have been run.
plt.figure()
plt.plot(range(1, len(trainer.train_losses) + 1), trainer.train_losses, label='Training losses')
plt.plot(range(1, len(trainer.val_losses) + 1), trainer.val_losses, label='Validation losses')
plt.xlabel('Epochs')
plt.ylabel('NLL')
plt.legend()
plt.show()

# Create handin

In [None]:
%cd ..
%ls

/content
11-785-s23-hw3p2.zip  [0m[01;34mdata[0m/     handout.zip  [01;34msample_data[0m/
[01;34mctcdecode[0m/            [01;34mhandout[0m/  [01;34m__MACOSX[0m/


In [None]:
print(configs['num_epochs'])

20


In [None]:
print(run_id)

1682009836


In [None]:
%cd ..
%ls

/content/handout
[0m[01;34mdataset[0m/  [01;34mfixtures[0m/  handin.tar  [01;34mhw4[0m/  Makefile


In [None]:
# TODO: Generate the handin to submit to autolab
!make runid=1682009836 epoch=10

cp hw4/experiments/1682009836/predictions-test-10.npy predictions.npy
cp hw4/experiments/1682009836/generated-10.txt generated.txt
cp hw4/experiments/1682009836/generated_logits-test-10.npy generated_logits.npy
cp hw4/hw4p1.ipynb training.ipynb
tar -cvf handin.tar training.ipynb predictions.npy generated.txt generated_logits.npy
training.ipynb
predictions.npy
generated.txt
generated_logits.npy
rm -f generated.txt predictions.npy training.ipynb generated_logits.npy
