In [None]:
%matplotlib inline

import os
import gc
import time
from tqdm import tqdm

import numpy as np
from matplotlib import pyplot as plt

import torch
from torch import nn

from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader

from tests_hw4 import test_prediction, test_generation

# Pick the compute device once; the rest of the notebook moves tensors and the model to it.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)

In [None]:
# Training and validation corpora.
# NOTE(review): allow_pickle=True lets np.load unpickle object arrays, which can
# execute arbitrary code -- only load these files from a trusted source.
dataset, devset = (
    np.load(corpus_path, allow_pickle=True)
    for corpus_path in ('../dataset/wiki.train.npy', '../dataset/wiki.valid.npy')
)

# Fixtures used to score next-word prediction and generation during training.
fixtures_pred, fixtures_gen = (
    np.load(fixture_path)
    for fixture_path in ('../fixtures/prediction.npz', '../fixtures/generation.npy')
)

# Held-out counterparts of the fixtures above (outputs are saved, not scored here).
fixtures_pred_test, fixtures_gen_test = (
    np.load(fixture_path)
    for fixture_path in ('../fixtures/prediction_test.npz', '../fixtures/generation_test.npy')
)

# Vocabulary array; its length is used as the model's vocabulary size.
vocab = np.load('../dataset/vocab.npy')

In [None]:
class DataLoaderForLanguageModeling(DataLoader):
    """Yield (input, target) batches for next-token language modeling.

    On every iteration the documents in ``dataset`` are (optionally) visited in
    a fresh random order, concatenated into one token stream and cut into
    non-overlapping windows of ``seq_length`` tokens. The target window is the
    input window shifted one token to the right.

    ``DataLoader.__init__`` is not called; only the attributes below are set and
    batching is handled entirely by ``__iter__``.

    Args:
        dataset: sequence (list or array) of 1-D token arrays.
        batch_size: number of windows per yielded batch (last batch may be smaller).
        seq_length: number of tokens per window.
        shuffle: whether to randomize the document order on each iteration.
    """

    def __init__(self, dataset, batch_size, seq_length, shuffle=True):

        self.dataset = dataset
        self.batch_size = batch_size
        self.seq_length = seq_length
        self.shuffle = shuffle

        # Filled in by __iter__ with the number of batches of the current pass.
        self.num_batches = None

    def _num_pairs(self, num_tokens):
        """Number of full (input, target) windows obtainable from ``num_tokens`` tokens."""
        # Every window needs seq_length inputs plus one extra token for the last
        # target, hence the "- 1". Clamp at zero for empty / one-token streams.
        return max((num_tokens - 1) // self.seq_length, 0)

    def _batches_for(self, num_pairs):
        """Number of batches needed to hold ``num_pairs`` windows (ceil division)."""
        return -(-num_pairs // self.batch_size)

    def __len__(self):
        """Return the number of batches one pass over the data yields."""
        num_tokens = sum(len(doc) for doc in self.dataset)
        return self._batches_for(self._num_pairs(num_tokens))

    def __iter__(self):

        # Shuffle via an index permutation so the caller's dataset is never
        # reordered in place.
        if self.shuffle:
            doc_order = np.random.permutation(len(self.dataset))
        else:
            doc_order = range(len(self.dataset))

        concat_data = np.concatenate([self.dataset[i] for i in doc_order])

        num_pairs = self._num_pairs(concat_data.shape[0])
        self.num_batches = self._batches_for(num_pairs)

        # Keep exactly num_pairs * seq_length inputs plus the one trailing token
        # that serves as the final target.
        pairable_data = concat_data[:num_pairs * self.seq_length + 1]

        all_inputs = pairable_data[:-1].reshape(num_pairs, self.seq_length)
        all_targets = pairable_data[1:].reshape(num_pairs, self.seq_length)

        for b in range(0, num_pairs, self.batch_size):

            batch_inputs = all_inputs[b:b+self.batch_size, :]
            batch_targets = all_targets[b:b+self.batch_size, :]

            yield (torch.tensor(batch_inputs), torch.tensor(batch_targets))

In [None]:
class EmbeddingWithEmbeddingDropout(nn.Module):
    """Embedding lookup with dropout applied to whole vocabulary rows.

    In training mode each row of the embedding matrix is zeroed with
    probability ``embed_dropout_p`` and the surviving rows are rescaled by
    ``1 / (1 - embed_dropout_p)``. In eval mode this is a plain embedding lookup.

    Args:
        vocab_size: number of rows in the embedding matrix.
        embedding_dim: size of each embedding vector.
        embed_dropout_p: probability of dropping a row; must be in ``[0, 1)``.

    Raises:
        ValueError: if ``embed_dropout_p`` is outside ``[0, 1)`` (p == 1 would
            divide by zero when rescaling).
    """

    def __init__(self, vocab_size, embedding_dim, embed_dropout_p):

        super().__init__()

        if not 0 <= embed_dropout_p < 1:
            raise ValueError('embed_dropout_p must be in [0, 1), got {}'.format(embed_dropout_p))

        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.embed_dropout_p = embed_dropout_p

        self.embedding = nn.Embedding(self.vocab_size, self.embedding_dim)

    def forward(self, inp):

        if not self.training or self.embed_dropout_p == 0:
            return self.embedding(inp)

        weight = self.embedding.weight
        keep_p = 1 - self.embed_dropout_p

        # One Bernoulli draw per vocabulary row, created directly on the
        # weight's device/dtype; the trailing 1 broadcasts over embedding_dim.
        row_mask = torch.empty((self.vocab_size, 1),
                               dtype=weight.dtype,
                               device=weight.device).bernoulli_(keep_p)

        # Derived from `weight` inside the autograd graph, so gradients of the
        # lookup flow back to the real embedding parameter. (Wrapping this in a
        # new nn.Parameter would create a detached leaf and drop that gradient.)
        dropped_weight = weight * row_mask / keep_p

        # Functional lookup with the module's own settings; the module's
        # parameter is never swapped out, so no restore step is needed.
        return nn.functional.embedding(inp,
                                       dropped_weight,
                                       self.embedding.padding_idx,
                                       self.embedding.max_norm,
                                       self.embedding.norm_type,
                                       self.embedding.scale_grad_by_freq,
                                       self.embedding.sparse)

In [None]:
class LockedDropout(nn.Module):
    '''
    Dropout whose mask is sampled once per (dim-0, dim-2) position and reused
    along dim 1 of a 3-D input. Adapted from:
    https://pytorchnlp.readthedocs.io/en/latest/_modules/torchnlp/nn/lock_dropout.html

    With the batch-first LSTMs used in this notebook, dim 1 is the time axis,
    so every time step of a sequence sees the same dropped features.
    '''

    def __init__(self, locked_dropout_p):

        super().__init__()

        # Probability of zeroing a feature.
        self.locked_dropout_p = locked_dropout_p

    def forward(self, inp):

        if self.training:
            keep_prob = 1 - self.locked_dropout_p

            # Singleton middle dimension: one draw shared along dim 1.
            shared_mask = inp.new_empty(inp.size(0), 1, inp.size(2),
                                        requires_grad=False)
            # Sample, then rescale survivors so the expected activation is unchanged.
            shared_mask.bernoulli_(keep_prob).div_(keep_prob)

            return inp * shared_mask.expand_as(inp)

        # Eval mode: identity.
        return inp

In [None]:
class Model(nn.Module):
    """Four stacked single-layer LSTMs with embedding dropout, locked dropout
    between layers, and an output projection tied to the embedding matrix.

    Args:
        vocab_size: number of tokens in the vocabulary.
        embedding_dim: embedding size; because the output projection shares the
            embedding matrix it must equal the LSTM output size
            (``hidden_size * (2 if bidirectional else 1)``).
        hidden_size: hidden size of each LSTM direction.
        bidirectional: whether each LSTM is bidirectional.

    Raises:
        ValueError: if ``embedding_dim`` does not match the LSTM output size, or
            if a hidden state passed to ``forward`` has the wrong leading size.

    NOTE(review): with ``bidirectional=True`` the backward direction reads the
    tokens to the right of each position, which include the next-token targets
    used during training -- confirm this is intended.
    """

    NUM_RNN_LAYERS = 4

    def __init__(self, vocab_size: int, embedding_dim: int,
                 hidden_size: int, bidirectional: bool):

        super().__init__()

        self.num_directions = 2 if bidirectional else 1
        rnn_out_size = self.num_directions * hidden_size

        # Tying replaces the (vocab_size, rnn_out_size) projection weight with
        # the (vocab_size, embedding_dim) embedding matrix, so the two sizes
        # have to agree; fail here instead of with a shape error in forward().
        if embedding_dim != rnn_out_size:
            raise ValueError(
                'weight tying requires embedding_dim == hidden_size * num_directions '
                '({} != {})'.format(embedding_dim, rnn_out_size))

        self.embedding = EmbeddingWithEmbeddingDropout(vocab_size, embedding_dim, 0.2)

        self.rnn1 = nn.LSTM(input_size=embedding_dim,
                            hidden_size=hidden_size,
                            num_layers=1,
                            bidirectional=bidirectional,
                            batch_first=True)

        self.ld1 = LockedDropout(0.5)

        self.rnn2 = nn.LSTM(input_size=rnn_out_size,
                            hidden_size=hidden_size,
                            num_layers=1,
                            bidirectional=bidirectional,
                            batch_first=True)

        self.ld2 = LockedDropout(0.3)

        self.rnn3 = nn.LSTM(input_size=rnn_out_size,
                            hidden_size=hidden_size,
                            num_layers=1,
                            bidirectional=bidirectional,
                            batch_first=True)

        self.ld3 = LockedDropout(0.1)

        self.rnn4 = nn.LSTM(input_size=rnn_out_size,
                            hidden_size=hidden_size,
                            num_layers=1,
                            bidirectional=bidirectional,
                            batch_first=True)

        self.scoring = nn.Linear(rnn_out_size, vocab_size, bias=False)

        # Weight tying: share the very same Parameter object. Wrapping it in a
        # new nn.Parameter would create a second parameter over the same
        # storage, with its own gradient and its own optimizer state.
        self.scoring.weight = self.embedding.embedding.weight

    def forward(self, inp, rnn_hidden=None):
        """Run the network on a batch of token ids.

        Args:
            inp: integer token ids, batch-first (the LSTMs use batch_first=True).
            rnn_hidden: ``None`` to start every layer from zeros, or the
                ``(h, c)`` pair returned by a previous call.

        Returns:
            ``(rnn_out, rnn_hidden, score_out)`` where ``rnn_out`` is the last
            LSTM layer's output, ``rnn_hidden`` is an ``(h, c)`` pair holding
            the final states of all four layers stacked along dim 0 (the same
            layout a multi-layer ``nn.LSTM`` uses), and ``score_out`` holds the
            vocabulary scores for every position.
        """

        rnn_layers = (self.rnn1, self.rnn2, self.rnn3, self.rnn4)
        # No dropout after the last LSTM layer.
        dropouts = (self.ld1, self.ld2, self.ld3, None)

        # Each layer keeps its own state: split the stacked state into one
        # (h, c) pair per layer rather than handing one layer's final state to
        # the next layer as its initial state.
        if rnn_hidden is None:
            layer_states = [None] * len(rnn_layers)
        else:
            h_all, c_all = rnn_hidden
            expected = len(rnn_layers) * self.num_directions
            if h_all.size(0) != expected or c_all.size(0) != expected:
                raise ValueError(
                    'rnn_hidden must stack the states of all {} layers along dim 0 '
                    '(expected size {}, got {} and {})'.format(
                        len(rnn_layers), expected, h_all.size(0), c_all.size(0)))
            layer_states = [
                (h.contiguous(), c.contiguous())
                for h, c in zip(h_all.chunk(len(rnn_layers), dim=0),
                                c_all.chunk(len(rnn_layers), dim=0))
            ]

        rnn_out = self.embedding(inp)

        final_h, final_c = [], []
        for rnn, dropout, state in zip(rnn_layers, dropouts, layer_states):
            rnn_out, (h_n, c_n) = rnn(rnn_out, state)
            if dropout is not None:
                rnn_out = dropout(rnn_out)
            final_h.append(h_n)
            final_c.append(c_n)

        rnn_hidden = (torch.cat(final_h, dim=0), torch.cat(final_c, dim=0))

        score_out = self.scoring(rnn_out)

        return rnn_out, rnn_hidden, score_out

In [None]:
class TestLanguageModel:
    """Inference helpers: next-word scoring and greedy generation.

    Both helpers are static, switch the model to eval mode (and leave it
    there), run without building an autograd graph, and return NumPy arrays.
    """

    @staticmethod
    def _device_of(model):
        """Return the device of the model's first parameter (CPU if it has none)."""
        first_param = next(model.parameters(), None)
        if first_param is None:
            return torch.device('cpu')
        return first_param.device

    @staticmethod
    def predict(inp, model):
        """Score the word that follows each input sequence.

        Args:
            inp: array-like of token ids, one sequence per row.
            model: callable returning ``(rnn_out, rnn_hidden, score_out)``.

        Returns:
            NumPy array with the model's scores at the last position of every
            sequence.
        """

        model.eval()

        with torch.no_grad():
            # Place the input on the model's own device so inference does not
            # depend on any module-level setting.
            inp = torch.tensor(inp).to(TestLanguageModel._device_of(model))
            _, _, score_out = model(inp)

        next_word = score_out[:, -1, :]
        next_word = next_word.detach().cpu().numpy()

        return next_word

    @staticmethod
    def generate(inp, forward, model):
        """Greedily extend each input sequence.

        Args:
            inp: array-like of token ids, one sequence per row.
            forward: number of words to generate. At least one word is always
                generated, even when ``forward`` is less than 1.
            model: callable returning ``(rnn_out, rnn_hidden, score_out)`` and
                accepting the hidden state of a previous call as second argument.

        Returns:
            NumPy array of generated token ids, one row per input sequence.
        """

        model.eval()

        generated_words = []

        with torch.no_grad():
            inp = torch.tensor(inp).to(TestLanguageModel._device_of(model))

            # First step: feed the whole prompt and take the best-scoring word
            # at its last position.
            _, rnn_hidden, score_out = model(inp)
            _, next_word = torch.max(score_out[:, -1, :], dim=1)
            generated_words.append(next_word)

            # Remaining steps: feed back only the word just generated, carrying
            # the recurrent state forward.
            for _ in range(forward - 1):
                _, rnn_hidden, score_out = model(next_word.unsqueeze(1), rnn_hidden)
                _, next_word = torch.max(score_out[:, -1, :], dim=1)
                generated_words.append(next_word)

        generated_words = torch.stack(generated_words, dim=1).cpu().numpy()

        return generated_words

In [None]:
class Trainer:
    '''
    Drives training, validation and checkpointing of a language model.

    Args:
        model: network returning ``(rnn_out, rnn_hidden, score_out)``.
        loader: iterable yielding ``(inputs, targets)`` batches.
        lr: learning rate for the AdamW optimizer.
        max_epochs: total number of epochs (used only in log messages).
        run_id: name of the sub-directory of ``experiments`` that ``save`` writes to.

    Per-epoch results are appended to the list attributes created in
    ``__init__``; ``save`` writes the most recent entry of each.
    NOTE(review): ``generated_logits`` / ``generated_logits_test`` hold the
    arrays returned by ``TestLanguageModel.generate`` -- despite the name these
    look like generated word ids rather than logits; confirm against the tests.
    '''

    def __init__(self, model, loader, lr, max_epochs=1, run_id='exp'):

        self.model = model
        self.loader = loader
        self.train_losses = []
        self.val_losses = []
        self.predictions = []
        self.predictions_test = []
        self.generated_logits = []
        self.generated = []
        self.generated_logits_test = []
        self.generated_test = []
        self.epochs = 0
        self.max_epochs = max_epochs
        self.lr = lr
        self.run_id = run_id

        self.optimizer = torch.optim.AdamW(model.parameters(), lr=self.lr)
        self.criterion = nn.CrossEntropyLoss()

    def train(self):
        '''
        Run one pass over the loader and record the mean batch loss.

        Raises:
            RuntimeError: if the loader yields no batches.
        '''

        self.model.train()
        epoch_loss = 0
        batch_count = 0

        for batch_num, (inputs, targets) in tqdm(enumerate(self.loader)):
            epoch_loss += self.train_batch(inputs, targets)
            batch_count = batch_num + 1

        # Without this guard an empty loader would surface as an unbound
        # loop variable in the division below.
        if batch_count == 0:
            raise RuntimeError('loader yielded no batches; cannot compute an epoch loss')

        epoch_loss = epoch_loss / batch_count

        print('[TRAIN] Epoch [%d/%d] Loss: %.4f' % (self.epochs + 1, self.max_epochs, epoch_loss))
        self.train_losses.append(epoch_loss)

    def train_batch(self, inputs, targets):
        '''
        Run one optimization step on a single batch and return its loss as a float.
        '''

        self.model.train()

        inputs = inputs.to(device)
        # CrossEntropyLoss expects one int64 class index per flattened position.
        targets = targets.flatten().long().to(device)

        _, _, preds = self.model(inputs)
        # Collapse every leading dimension so rows line up with the flat targets.
        preds = preds.flatten(0, -2)
        loss = self.criterion(preds, targets)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        return loss.item()

    def test(self):
        '''
        Evaluate on the prediction / generation fixtures, record the results,
        advance the epoch counter and return the validation NLL.
        '''

        self.model.eval()

        # Pure inference: do not build autograd graphs for the fixture batches.
        with torch.no_grad():

            predictions = TestLanguageModel.predict(fixtures_pred['inp'], self.model)
            self.predictions.append(predictions)

            generated_logits = TestLanguageModel.generate(fixtures_gen, 10, self.model)
            generated_logits_test = TestLanguageModel.generate(fixtures_gen_test, 10, self.model)

            nll = test_prediction(predictions, fixtures_pred['out'])
            generated = test_generation(fixtures_gen, generated_logits, vocab)
            generated_test = test_generation(fixtures_gen_test, generated_logits_test, vocab)
            self.val_losses.append(nll)

            self.generated.append(generated)
            self.generated_test.append(generated_test)
            self.generated_logits.append(generated_logits)
            self.generated_logits_test.append(generated_logits_test)

            # generate predictions for test data
            predictions_test = TestLanguageModel.predict(fixtures_pred_test['inp'], self.model)
            self.predictions_test.append(predictions_test)

        print('[VAL] Epoch [%d/%d] Loss: %.4f' % (self.epochs + 1, self.max_epochs, nll))
        self.epochs += 1

        return nll

    def save(self):
        '''
        Write the model weights and the latest recorded outputs to
        ``experiments/<run_id>/``, suffixing file names with ``self.epochs``.
        '''

        run_dir = os.path.join('experiments', self.run_id)
        # Do not rely on another cell having created the directory.
        os.makedirs(run_dir, exist_ok=True)

        model_path = os.path.join(run_dir, 'model-{}.pkl'.format(self.epochs))
        torch.save({'state_dict': self.model.state_dict()}, model_path)

        np.save(os.path.join(run_dir, 'predictions-{}.npy'.format(self.epochs)), self.predictions[-1])
        np.save(os.path.join(run_dir, 'predictions-test-{}.npy'.format(self.epochs)), self.predictions_test[-1])
        np.save(os.path.join(run_dir, 'generated_logits-{}.npy'.format(self.epochs)), self.generated_logits[-1])
        np.save(os.path.join(run_dir, 'generated_logits-test-{}.npy'.format(self.epochs)), self.generated_logits_test[-1])

        # Explicit UTF-8 so writing generated text does not depend on the
        # platform's default encoding.
        with open(os.path.join(run_dir, 'generated-{}.txt'.format(self.epochs)), 'w', encoding='utf-8') as fw:
            fw.write(self.generated[-1])

        with open(os.path.join(run_dir, 'generated-{}-test.txt'.format(self.epochs)), 'w', encoding='utf-8') as fw:
            fw.write(self.generated_test[-1])


In [None]:
# Each run writes into its own directory named after the current Unix time.
run_id = str(int(time.time()))

# makedirs creates ./experiments as needed and, with exist_ok=True, does not
# fail when the cell is re-run within the same second.
os.makedirs('./experiments/%s' % run_id, exist_ok=True)
print("saving models, predictions, and generated words to ./experiments/%s" % run_id)

In [None]:
# Hyper-parameters for this run. embd_dim equals 2 * hidden_dim, matching the
# output width of the bidirectional LSTMs the tied output projection reads from.
config = dict(
    epochs=5,
    batch_size=512,
    seq_len=8,
    embd_dim=2048,
    hidden_dim=1024,
    bidirectional=True,
    const_lr=0.001,
)

In [None]:
def count_parameters(model):
    """Return the total number of elements in the model's trainable parameters."""

    trainable_elements = 0
    for param in model.parameters():
        # Frozen parameters (requires_grad=False) are not counted.
        if param.requires_grad:
            trainable_elements += param.numel()

    return trainable_elements

In [None]:
# Training batches: shuffle=True makes the loader draw a new document order on
# every pass.
loader = DataLoaderForLanguageModeling(
    dataset=dataset,
    shuffle=True,
    seq_length=config['seq_len'],
    batch_size=config['batch_size'],
)

# Build the network from the run configuration and move it to the chosen device.
model = Model(
    vocab_size=len(vocab),
    bidirectional=config['bidirectional'],
    hidden_size=config['hidden_dim'],
    embedding_dim=config['embd_dim'],
).to(device)

print('model parameters =', count_parameters(model))

trainer = Trainer(
    model=model,
    loader=loader,
    lr=config['const_lr'],
    max_epochs=config['epochs'],
    run_id=run_id,
)

In [None]:
# Lowest validation NLL seen so far; any finite first result beats it.
best_nll = float('inf')

for epoch in range(config['epochs']):

    # Release unreferenced Python objects and cached CUDA blocks before each epoch.
    gc.collect()
    torch.cuda.empty_cache()

    trainer.train()
    nll = trainer.test()

    # Checkpoint only when validation improves.
    if nll < best_nll:
        best_nll = nll
        # trainer.test() has already advanced trainer.epochs, and trainer.save()
        # uses that value in its file names; report the same number so the log
        # matches the files on disk (the loop index is one behind).
        print('saving model, predictions and generated output for epoch ' + str(trainer.epochs) + ' with NLL: ' + str(best_nll))
        trainer.save()

In [None]:
# Training vs. validation loss per completed epoch (epochs numbered from 1).
plt.figure()
epoch_axis = range(1, trainer.epochs + 1)
for loss_history, curve_label in ((trainer.train_losses, 'Training losses'),
                                  (trainer.val_losses, 'Validation losses')):
    plt.plot(epoch_axis, loss_history, label=curve_label)
plt.xlabel('Epochs')
plt.ylabel('NLL')
plt.legend()
plt.show()

In [None]:
# Show the text generated from the validation fixtures after the last evaluated epoch.
print(trainer.generated[-1])