# Libraries

In [None]:
%matplotlib inline

import numpy as np
from matplotlib import pyplot as plt
import time
import os
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader
import sys
from tqdm.notebook import trange, tqdm
# from tests import test_prediction, test_generation


In [None]:

def log_softmax(x, axis):
    """Return the log-softmax of ``x`` taken along ``axis``.

    The maximum along ``axis`` is subtracted before exponentiating so that the
    exponentials cannot overflow. The result has the same shape as ``x``.
    """
    shifted = x - np.max(x, axis=axis, keepdims=True)
    log_normaliser = np.log(np.exp(shifted).sum(axis=axis, keepdims=True))
    return shifted - log_normaliser


def array_to_str(arr, vocab):
    """Look up every index of ``arr`` in ``vocab`` and join the results with single spaces."""
    words = [vocab[index] for index in arr]
    return " ".join(words)


def test_prediction(out, targ):
    """Mean negative log-likelihood of the targets under the given logits.

    ``out`` is indexed as ``[row, class]`` and is normalised row-wise with
    ``log_softmax``; ``targ`` supplies one class index per row.
    """
    log_probs = log_softmax(out, 1)
    row_ids = np.arange(log_probs.shape[0])
    target_log_probs = log_probs[row_ids, targ]
    return -np.mean(target_log_probs)

def test_generation(inp, pred, vocab):
    """Format each input row and its generated continuation as one report line.

    Returns a single string with one ``Input | Output #i: ... | ...`` line per
    row of ``inp``, each terminated by a newline.
    """
    report_lines = [
        u"Input | Output #{}: {} | {}\n".format(
            row, array_to_str(inp[row], vocab), array_to_str(pred[row], vocab)
        )
        for row in range(inp.shape[0])
    ]
    return u"".join(report_lines)

In [None]:
# Select the compute device: the GPU when CUDA is available, otherwise the CPU.
cuda = torch.cuda.is_available()
if cuda:
    device = torch.device("cuda")
    num_workers = 2
else:
    device = torch.device("cpu")
    num_workers = 0

print("Cuda = ", str(cuda), " with num_workers = ", str(num_workers),  " system version = ", sys.version)

Cuda =  True  with num_workers =  2  system version =  3.7.12 (default, Sep 10 2021, 00:21:48) 
[GCC 7.5.0]


# Datasets/Dataloaders

## dataset

In [None]:
# Load the training/validation corpora, the evaluation fixtures and the vocabulary.
# NOTE(review): allow_pickle=True unpickles arbitrary Python objects -- only load
# files from a trusted source.

# Corpora: each is passed to np.concatenate by the data loader, so presumably an
# array of per-article token-id arrays -- TODO confirm against the data files.
dataset = np.load('../dataset/wiki.train.npy', allow_pickle=True)
devset = np.load('../dataset/wiki.valid.npy', allow_pickle=True)
# Prediction fixtures are read through their 'inp' and 'out' entries by the trainer.
fixtures_pred = np.load('../fixtures/prediction.npz')  # dev
fixtures_gen = np.load('../fixtures/generation.npy')  # dev
fixtures_pred_test = np.load('../fixtures/prediction_test.npz')  # test
fixtures_gen_test = np.load('../fixtures/generation_test.npy')  # test
# vocab is indexed by token id to recover the word (see array_to_str).
vocab = np.load('../dataset/vocab.npy')

# dataset = np.load('wiki.train.npy', allow_pickle=True)
# devset = np.load('wiki.valid.npy', allow_pickle=True)
# fixtures_pred = np.load('prediction.npz')  # dev
# fixtures_gen = np.load('generation.npy')  # dev
# fixtures_pred_test = np.load('prediction_test.npz')  # test
# fixtures_gen_test = np.load('generation_test.npy')  # test
# vocab = np.load('vocab.npy')

## dataloader


In [None]:
# data loader

class LanguageModelDataLoader(DataLoader):
    """Batch iterator for truncated-BPTT language-model training.

    Every pass concatenates the articles into one token stream, folds it into
    ``batch_size`` parallel rows and yields ``(inputs, targets)`` windows of
    random length, where ``targets`` is ``inputs`` shifted one token ahead.
    Both tensors are ``torch.long`` with shape ``(seq_len, batch_size)``.

    ``DataLoader.__init__`` is not called: batching is implemented entirely in
    ``__iter__`` and ``__len__`` below.

    Args:
        dataset: sequence of 1-D token-id arrays, one per article.
        batch_size: number of parallel rows in every batch.
        shuffle: when true, the article order is re-drawn on every pass. The
            caller's ``dataset`` object is never reordered.
        device: device the yielded tensors are created on. Defaults to CUDA
            when it is available and to the CPU otherwise.
    """

    # Window lengths are drawn from N(SEQ_LEN, SEQ_LEN_STD); when the uniform
    # draw is >= SHORT_SEQ_THRESHOLD the mean is SHORT_SEQ_LEN instead.
    SEQ_LEN = 70
    SHORT_SEQ_LEN = 35
    SEQ_LEN_STD = 5
    SHORT_SEQ_THRESHOLD = .95

    def __init__(self, dataset, batch_size, shuffle=True, device=None):
        self.dataset = dataset
        self.bs = batch_size
        self.shuffle = shuffle
        if device is None:
            device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.device = device
        # Total number of tokens across all articles (used by __len__).
        self.length = sum(len(article) for article in dataset)

    def __iter__(self):
        """Yield ``(inputs, targets)`` pairs covering one pass over the data."""
        articles = self.dataset
        if self.shuffle:
            # Shuffle the article order only, without mutating the caller's data.
            order = np.random.permutation(len(articles))
            articles = [articles[i] for i in order]

        stream = np.concatenate(articles)
        # Targets are the inputs shifted by one token, so only len - 1 positions
        # have a target. Trim to a multiple of the batch size (drop_last is
        # always on).
        usable = max(len(stream) - 1, 0)
        trim_length = usable // self.bs * self.bs

        xx = stream[:trim_length].reshape(self.bs, -1)
        yy = stream[1:1 + trim_length].reshape(self.bs, -1)
        # Sanity check: every target row is its input row moved one step ahead.
        assert np.array_equal(xx[:, 1:], yy[:, :-1])

        position = 0
        width = xx.shape[1]
        while position < width:
            if np.random.random_sample() >= self.SHORT_SEQ_THRESHOLD:
                mean_len = self.SHORT_SEQ_LEN
            else:
                mean_len = self.SEQ_LEN
            # Clamp to 1: a non-positive draw would stall or rewind the loop.
            seq_len = max(1, int(round(np.random.normal(mean_len, self.SEQ_LEN_STD))))

            # Transpose (batch, seq) -> (seq, batch); the last window may be shorter.
            x = xx[:, position:position + seq_len].T
            y = yy[:, position:position + seq_len].T
            position += seq_len
            yield (
                torch.tensor(x, dtype=torch.long, device=self.device),
                torch.tensor(y, dtype=torch.long, device=self.device),
            )  # LxB

    def __len__(self):
        """Approximate number of batches per pass (window lengths are random)."""
        return self.length // self.bs // self.SEQ_LEN
        

# Model Init

## Locked dropout

In [None]:
# PAPER <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
class LockedDropout(nn.Module):
    """ LockedDropout applies the same dropout mask to every time step.

    One Bernoulli mask of shape ``(1, x.size(1), x.size(2))`` is sampled per
    forward pass and broadcast along dimension 0, so a dropped unit stays
    dropped for the whole sequence. Kept units are scaled by ``1 / (1 - p)``.

    **Thank you** to Sales Force for their initial implementation of :class:`WeightDrop`. Here is
    their `License
    <https://github.com/salesforce/awd-lstm-lm/blob/master/LICENSE>`__.

    Args:
        p (float): Probability of an element in the dropout mask to be zeroed.

    Raises:
        ValueError: If ``p`` is not within ``[0, 1]``.
    """

    def __init__(self, p=0.5):
        super().__init__()
        if not 0 <= p <= 1:
            raise ValueError("dropout probability has to be between 0 and 1, but got {}".format(p))
        self.p = p

    def forward(self, x):
        """
        Args:
            x (:class:`torch.FloatTensor` [sequence length, batch size, rnn hidden size]): Input to
                apply dropout to.

        Returns:
            ``x`` itself in eval mode or when ``p`` is 0; otherwise a new tensor
            of the same shape with the shared mask applied.
        """
        if not self.training or not self.p:
            return x
        if self.p >= 1:
            # Everything is dropped. Dividing the mask by (1 - p) == 0 would
            # produce NaNs, so short-circuit to zeros instead.
            return x * 0
        keep_prob = 1 - self.p
        mask = x.new_empty(1, x.size(1), x.size(2), requires_grad=False).bernoulli_(keep_prob)
        mask = mask.div_(keep_prob)
        # Broadcasting over dimension 0 reuses the same mask at every time step.
        return x * mask

    def __repr__(self):
        return self.__class__.__name__ + '(' \
            + 'p=' + str(self.p) + ')'

## embedding dropout


In [None]:
class EmbedDropout(nn.Module):
    """Dropout that zeroes whole vectors along the last dimension.

    A Bernoulli mask of shape ``(x.size(0), x.size(1), 1)`` is sampled per
    forward pass and broadcast over the last dimension, so each
    ``x[i, j, :]`` vector is either dropped entirely or kept and scaled by
    ``1 / (1 - p)``. In this notebook it is applied to the embedded tokens.

    Args:
        p (float): Probability of a vector being zeroed.

    Raises:
        ValueError: If ``p`` is not within ``[0, 1]``.
    """

    def __init__(self, p=0.5):
        super().__init__()
        if not 0 <= p <= 1:
            raise ValueError("dropout probability has to be between 0 and 1, but got {}".format(p))
        self.p = p

    def forward(self, x):
        """
        Args:
            x (:class:`torch.FloatTensor`): 3-D input; dropout is decided once per
                ``(dim 0, dim 1)`` position and shared across the last dimension.

        Returns:
            ``x`` itself in eval mode or when ``p`` is 0; otherwise a new tensor
            of the same shape with the mask applied.
        """
        if not self.training or not self.p:
            return x
        if self.p >= 1:
            # Everything is dropped. Dividing the mask by (1 - p) == 0 would
            # produce NaNs, so short-circuit to zeros instead.
            return x * 0
        keep_prob = 1 - self.p
        mask = x.new_empty(x.size(0), x.size(1), 1, requires_grad=False).bernoulli_(keep_prob)
        mask = mask.div_(keep_prob)
        # Broadcasting over the last dimension drops or keeps whole vectors.
        return x * mask

    def __repr__(self):
        return self.__class__.__name__ + '(' \
            + 'p=' + str(self.p) + ')'

## my model

In [None]:
# model

class LanguageModel(nn.Module):
    """LSTM language model with tied input/output embeddings.

    Token ids are embedded, passed through ``nl`` stacked single-layer,
    unidirectional LSTMs and projected to vocabulary logits. The last LSTM
    outputs ``embed_size`` features so the output projection can share its
    weight matrix with the embedding table.

    Args:
        vocab_size: number of distinct tokens.
        embed_size: embedding width, also the width of the last LSTM's output.
        hidden_size: width of every LSTM output except the last one.
        nl: number of stacked LSTM layers (at least 1).

    Raises:
        ValueError: If ``nl`` is smaller than 1.
    """
    def __init__(self, vocab_size, embed_size=256, hidden_size=512, nl=3):
        super(LanguageModel, self).__init__()
        if nl < 1:
            raise ValueError("nl must be at least 1, but got {}".format(nl))
        self.embed = nn.Embedding(vocab_size, embed_size)

        self.locked_dropout = LockedDropout(p=.3)
        self.embed_dropout = EmbedDropout(p=.1)
        self.hs = hidden_size
        self.nl = nl

        # Layer i maps embed_size -> hidden_size for the first layer,
        # hidden_size -> hidden_size in the middle and hidden_size -> embed_size
        # for the last one. With nl == 1 the single layer is
        # embed_size -> embed_size, keeping the stack consistent with self.nl
        # and with the tied output projection.
        layers = []
        for i in range(nl):
            in_size = embed_size if i == 0 else hidden_size
            out_size = embed_size if i == nl - 1 else hidden_size
            layers.append(nn.LSTM(input_size=in_size, hidden_size=out_size, num_layers=1, bidirectional=False))
        self.rnn = nn.ModuleList(layers)

        self.fc = nn.Linear(embed_size, vocab_size)
        self.fc.weight = self.embed.weight # weight tying

    def forward(self, x, package=None): # package = (hiddens, cells)
        """Run the model on a batch of token ids.

        Args:
            x: long tensor of token ids, sequence-first (L x B).
            package: optional ``(hiddens, cells)`` pair as returned by a
                previous call; element ``i`` of each list is the initial
                state of LSTM layer ``i``. When omitted, every layer starts
                from its default (zero) state.

        Returns:
            ``(logits, (hiddens, cells))`` where ``logits`` is L x B x vocab and
            ``hiddens`` / ``cells`` are lists holding each layer's final
            hidden and cell state.
        """
        x = self.embed(x)                   # LxBxE
        if self.embed_dropout is not None:
            x = self.embed_dropout(x)       # LxBxE

        hiddens = []
        cells = []
        for i, lstm in enumerate(self.rnn):
            if i > 0:
                # Locked dropout sits between consecutive LSTM layers only.
                x = self.locked_dropout(x)
            if package:
                h, c = package
                x, mem = lstm(x, (h[i], c[i]))
            else:
                x, mem = lstm(x)
            hiddens.append(mem[0])
            cells.append(mem[1])

        x = self.fc(x)                      # LxBxV
        return x, (hiddens, cells)

    


# Train

In [None]:
# model trainer

class LanguageModelTrainer:
    """Drives training, evaluation and checkpointing of a language model.

    The trainer keeps per-epoch histories (losses, predictions, generated text)
    as lists; the most recent entry of each is what ``save`` writes to disk.
    """

    def __init__(self, model, loader, max_epochs=1, run_id='exp'):
        """
            Use this class to train your model

            Args:
                model: module whose forward returns ``(logits, state)``.
                loader: iterable yielding ``(inputs, targets)`` batches; must
                    also support ``len()`` (used for the progress bar).
                max_epochs: total epoch count, used in the log messages.
                run_id: name of the sub-directory of ``experiments`` that
                    ``save`` writes into.
        """
        # feel free to add any other parameters here
        self.model = model
        self.loader = loader
        self.train_losses = []
        self.val_losses = []
        self.predictions = []
        self.predictions_test = []
        self.generated_logits = []
        self.generated = []
        self.generated_logits_test = []
        self.generated_test = []
        self.epochs = 0
        self.max_epochs = max_epochs
        self.run_id = run_id

        # TODO: Define your optimizer and criterion here
        self.optimizer = torch.optim.Adam(model.parameters(), lr=.002, weight_decay=5e-6)
        # self.optimizer = torch.optim.ASGD(model.parameters(), lr=30)
        self.criterion = nn.CrossEntropyLoss()

    def train(self):
        """Run one training epoch and record its mean batch loss."""
        self.model.train() # set to training mode
        epoch_loss = 0
        num_batches = 0
        with tqdm(total=len(self.loader)) as pbar:
            for inputs, targets in self.loader:
                epoch_loss += self.train_batch(inputs, targets)
                num_batches += 1
                pbar.update()
        torch.cuda.empty_cache()
        # Count the batches explicitly: the loader's len() is only an estimate,
        # and an empty loader must not raise (the mean is then reported as 0).
        epoch_loss = epoch_loss / max(num_batches, 1)
        self.epochs += 1
        print('[TRAIN]  Epoch [%d/%d]   Loss: %.4f'
                      % (self.epochs, self.max_epochs, epoch_loss))
        self.train_losses.append(epoch_loss)

    def train_batch(self, inputs, targets):
        """
            Run one optimisation step on a single batch.

            Args:
                inputs: batch passed to the model as-is.
                targets: token ids aligned with ``inputs``; flattened to match
                    the flattened logits.

            Returns:
                The batch loss as a Python float.
        """
        outputs, _ = self.model(inputs)
        # Flatten to (positions, vocab) vs (positions,) for the cross-entropy.
        loss = self.criterion(outputs.view(-1, outputs.shape[-1]), targets.view(-1))
        my_loss = loss.item()
        self.optimizer.zero_grad()
        loss.backward()
        # Gradient clipping is currently disabled:
        # torch.nn.utils.clip_grad_norm_(self.model.parameters(), .25) # PAPER
        self.optimizer.step()
        return my_loss

    def test(self):
        """Evaluate on the module-level fixtures and record the results.

        Returns:
            The validation NLL computed by ``test_prediction``.
        """
        # don't change these
        self.model.eval() # set to eval mode
        predictions = TestLanguageModel.prediction(fixtures_pred['inp'], self.model) # get predictions
        self.predictions.append(predictions)
        generated_logits = TestLanguageModel.generation(fixtures_gen, 10, self.model) # generated predictions for 10 words
        generated_logits_test = TestLanguageModel.generation(fixtures_gen_test, 10, self.model)
        nll = test_prediction(predictions, fixtures_pred['out'])
        generated = test_generation(fixtures_gen, generated_logits, vocab)
        generated_test = test_generation(fixtures_gen_test, generated_logits_test, vocab)
        self.val_losses.append(nll)

        self.generated.append(generated)
        self.generated_test.append(generated_test)
        self.generated_logits.append(generated_logits)
        self.generated_logits_test.append(generated_logits_test)

        # generate predictions for test data
        predictions_test = TestLanguageModel.prediction(fixtures_pred_test['inp'], self.model) # get predictions
        self.predictions_test.append(predictions_test)

        print('[VAL]  Epoch [%d/%d]   Loss: %.4f'
                      % (self.epochs, self.max_epochs, nll))
        return nll

    def save(self):
        """Write the model weights and the latest evaluation artefacts.

        Everything goes to ``experiments/<run_id>/`` with the current epoch
        number in the file name; ``test`` must have run at least once so the
        history lists are non-empty.
        """
        # don't change these
        model_path = os.path.join('experiments', self.run_id, 'model-{}.pkl'.format(self.epochs))
        torch.save({'state_dict': self.model.state_dict()},
            model_path)
        np.save(os.path.join('experiments', self.run_id, 'predictions-{}.npy'.format(self.epochs)), self.predictions[-1])
        np.save(os.path.join('experiments', self.run_id, 'predictions-test-{}.npy'.format(self.epochs)), self.predictions_test[-1])
        np.save(os.path.join('experiments', self.run_id, 'generated_logits-{}.npy'.format(self.epochs)), self.generated_logits[-1])
        np.save(os.path.join('experiments', self.run_id, 'generated_logits-test-{}.npy'.format(self.epochs)), self.generated_logits_test[-1])
        with open(os.path.join('experiments', self.run_id, 'generated-{}.txt'.format(self.epochs)), 'w') as fw:
            fw.write(self.generated[-1])
        with open(os.path.join('experiments', self.run_id, 'generated-{}-test.txt'.format(self.epochs)), 'w') as fw:
            fw.write(self.generated_test[-1])


In [None]:
# Release cached GPU memory held by PyTorch, then print the GPU status
# (shell command run through the notebook's `!` escape).
torch.cuda.empty_cache()
!nvidia-smi

Fri Nov 12 14:04:15 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 495.44       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   60C    P0    46W / 250W |      2MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

# Test

In [None]:
class TestLanguageModel:
    """Inference helpers: next-word prediction and greedy generation.

    Both entry points are static and are called on the class itself. Inputs
    are moved to the device of the model's parameters, and no gradients are
    recorded. Neither method changes the model's train/eval mode.
    """

    @staticmethod
    def _as_model_input(inp, model):
        """Convert (batch, length) token ids to a (length, batch) long tensor on the model's device."""
        first_param = next(model.parameters(), None)
        target_device = first_param.device if first_param is not None else torch.device("cpu")
        return torch.tensor(inp, dtype=torch.long, device=target_device).permute(1, 0)  # LxB

    @staticmethod
    def prediction(inp, model):
        """
            Predict the logits of the word following each input sequence.

            :param inp: BxL
            :return: a np.ndarray of logits (BxV) for the position after the last input token
        """
        with torch.no_grad():
            tokens = TestLanguageModel._as_model_input(inp, model)
            output, _ = model(tokens)          # LxBxV
            return output[-1].cpu().numpy()    # BxV

    @staticmethod
    def generation(inp, forward, model):
        """
            Generate a sequence of words given a starting sequence.

            Decoding is greedy: each step takes the arg-max word and feeds it
            back together with the recurrent state returned by the model.

            :param inp: Initial sequence of words (batch size, length)
            :param forward: number of additional words to generate
            :return: generated words (batch size, forward)
        """
        with torch.no_grad():
            tokens = TestLanguageModel._as_model_input(inp, model)
            if forward <= 0:
                # Nothing requested: honour the (batch size, forward) contract.
                return np.empty((tokens.shape[1], 0), dtype=np.int64)

            next_seq, mem = model(tokens)                                 # LxBxV
            next_word = torch.argmax(next_seq[-1], dim=1, keepdim=True)   # Bx1
            generated = [next_word]
            for _ in range(forward - 1):
                # Feed the last word as a length-1 sequence (1xB) with the carried state.
                next_seq, mem = model(next_word.permute(1, 0), (mem[0], mem[1]))
                next_word = torch.argmax(next_seq[-1], dim=1, keepdim=True)  # Bx1
                generated.append(next_word)
        return torch.cat(generated, dim=1).cpu().numpy()
        

# Hyperparameters

In [None]:
# Training hyperparameters read by the runner cells.
# TODO: define other hyperparameters here

NUM_EPOCHS = 50  # number of train/evaluate epochs; also passed to the trainer as max_epochs
BATCH_SIZE = 80  # number of parallel rows per batch given to the data loader


# Runner

In [None]:
# Create the output directory for this run, named after the current Unix time.
run_id = str(int(time.time()))
# makedirs creates ./experiments when needed and does not fail if the
# directory already exists (e.g. when the cell is re-run within one second).
os.makedirs('./experiments/%s' % run_id, exist_ok=True)
print("Saving models, predictions, and generated words to ./experiments/%s" % run_id)

Saving models, predictions, and generated words to ./experiments/1636725855


In [None]:
# Build the model on the selected device, then wire up the data loader and the trainer.
model = LanguageModel(len(vocab)).to(device)
loader = LanguageModelDataLoader(dataset, BATCH_SIZE, shuffle=True)
trainer = LanguageModelTrainer(model, loader, max_epochs=NUM_EPOCHS, run_id=run_id)

In [None]:
# Train for NUM_EPOCHS epochs and checkpoint whenever the validation NLL improves.
best_nll = 1e30
for epoch in range(NUM_EPOCHS):
    trainer.train()
    nll = trainer.test()
    print(nll)
    if not nll < best_nll:
        continue
    best_nll = nll
    print("Saving model, predictions and generated output for epoch %s with NLL: %s" % (epoch, best_nll))
    trainer.save()
    

  0%|          | 0/370 [00:00<?, ?it/s]

[TRAIN]  Epoch [1/50]   Loss: 6.9314
[VAL]  Epoch [1/50]   Loss: 5.5920
5.5920057
Saving model, predictions and generated output for epoch 0 with NLL: 5.5920057


  0%|          | 0/370 [00:00<?, ?it/s]

[TRAIN]  Epoch [2/50]   Loss: 6.1027
[VAL]  Epoch [2/50]   Loss: 5.0901
5.0900517
Saving model, predictions and generated output for epoch 1 with NLL: 5.0900517


  0%|          | 0/370 [00:00<?, ?it/s]

[TRAIN]  Epoch [3/50]   Loss: 5.7665
[VAL]  Epoch [3/50]   Loss: 4.7865
4.786456
Saving model, predictions and generated output for epoch 2 with NLL: 4.786456


  0%|          | 0/370 [00:00<?, ?it/s]

[TRAIN]  Epoch [4/50]   Loss: 5.5308
[VAL]  Epoch [4/50]   Loss: 4.6299
4.6298714
Saving model, predictions and generated output for epoch 3 with NLL: 4.6298714


  0%|          | 0/370 [00:00<?, ?it/s]

[TRAIN]  Epoch [5/50]   Loss: 5.3679
[VAL]  Epoch [5/50]   Loss: 4.5321
4.5321016
Saving model, predictions and generated output for epoch 4 with NLL: 4.5321016


  0%|          | 0/370 [00:00<?, ?it/s]

[TRAIN]  Epoch [6/50]   Loss: 5.2388
[VAL]  Epoch [6/50]   Loss: 4.4625
4.4625187
Saving model, predictions and generated output for epoch 5 with NLL: 4.4625187


  0%|          | 0/370 [00:00<?, ?it/s]

[TRAIN]  Epoch [7/50]   Loss: 5.1372
[VAL]  Epoch [7/50]   Loss: 4.3896
4.3896346
Saving model, predictions and generated output for epoch 6 with NLL: 4.3896346


  0%|          | 0/370 [00:00<?, ?it/s]

[TRAIN]  Epoch [8/50]   Loss: 5.0463
[VAL]  Epoch [8/50]   Loss: 4.2978
4.297758
Saving model, predictions and generated output for epoch 7 with NLL: 4.297758


  0%|          | 0/370 [00:00<?, ?it/s]

[TRAIN]  Epoch [9/50]   Loss: 4.9648
[VAL]  Epoch [9/50]   Loss: 4.2954
4.295365
Saving model, predictions and generated output for epoch 8 with NLL: 4.295365


  0%|          | 0/370 [00:00<?, ?it/s]

[TRAIN]  Epoch [10/50]   Loss: 4.9002
[VAL]  Epoch [10/50]   Loss: 4.2606
4.260635
Saving model, predictions and generated output for epoch 9 with NLL: 4.260635


  0%|          | 0/370 [00:00<?, ?it/s]

[TRAIN]  Epoch [11/50]   Loss: 4.8418
[VAL]  Epoch [11/50]   Loss: 4.2182
4.218231
Saving model, predictions and generated output for epoch 10 with NLL: 4.218231


  0%|          | 0/370 [00:00<?, ?it/s]

[TRAIN]  Epoch [12/50]   Loss: 4.7820
[VAL]  Epoch [12/50]   Loss: 4.2421
4.2420673


  0%|          | 0/370 [00:00<?, ?it/s]

In [None]:
# Release cached GPU memory held by PyTorch, then print the GPU status
# (shell command run through the notebook's `!` escape).
torch.cuda.empty_cache()
!nvidia-smi

In [None]:
# Don't change these
# plot training curves
# Plot the per-epoch training and validation losses recorded by the trainer,
# with epochs numbered from 1 on the x-axis.
plt.figure()
plt.plot(range(1, trainer.epochs + 1), trainer.train_losses, label='Training losses')
plt.plot(range(1, trainer.epochs + 1), trainer.val_losses, label='Validation losses')
plt.xlabel('Epochs')
plt.ylabel('NLL')
plt.legend()
plt.show()

In [None]:
# Show the text generated during the most recent evaluation
# (trainer.generated gets one entry per call to trainer.test()).
print (trainer.generated[-1]) # get last generated output