<a href="https://colab.research.google.com/github/bobby838/hello-world/blob/master/40cpProject.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Import libraries

In [None]:
import argparse, os, pickle, time, math, torch, sys
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from torch.nn import Parameter
from functools import wraps
from datetime import datetime
from io import open
from torch.autograd import Variable
from tqdm import tqdm
from collections import defaultdict

Old RNN model

In [None]:

class Old_model(nn.Module):
    """Baseline language model: embedding encoder, recurrent core, linear decoder.

    Args:
        rnn_type: one of 'LSTM', 'GRU', 'RNN_TANH' or 'RNN_RELU'.
        ntoken: vocabulary size (rows of the embedding, outputs of the decoder).
        ninp: embedding dimension fed to the recurrent core.
        nhid: hidden size of the recurrent core.
        nlayers: number of stacked recurrent layers.
        dropout: probability used by the embedding/output dropout and by the
            recurrent module's own inter-layer dropout.
        tie_weights: share the decoder weight with the embedding weight
            (requires ``nhid == ninp``).

    Raises:
        ValueError: for an unknown ``rnn_type`` or when weights are tied with
            ``nhid != ninp``.
    """

    def __init__(self, rnn_type, ntoken, ninp, nhid, nlayers, dropout=0.5, tie_weights=False):
        super().__init__()
        self.ntoken = ntoken
        self.drop = nn.Dropout(dropout)
        self.encoder = nn.Embedding(ntoken, ninp)

        is_gated = rnn_type in ['LSTM', 'GRU']
        if not is_gated:
            # Plain Elman RNN: translate the option name into its activation.
            try:
                nonlinearity = {'RNN_TANH': 'tanh', 'RNN_RELU': 'relu'}[rnn_type]
            except KeyError:
                raise ValueError( """An invalid option for `--model` was supplied,
                                 options are ['LSTM', 'GRU', 'RNN_TANH' or 'RNN_RELU']""")
            self.rnn = nn.RNN(ninp, nhid, nlayers, nonlinearity=nonlinearity, dropout=dropout)
        else:
            # nn.LSTM / nn.GRU are looked up by name.
            recurrent_cls = getattr(nn, rnn_type)
            self.rnn = recurrent_cls(ninp, nhid, nlayers, dropout=dropout)
        self.decoder = nn.Linear(nhid, ntoken)

        # Weight tying, see Press & Wolf 2016 (https://arxiv.org/abs/1608.05859)
        # and Inan et al. 2016 (https://arxiv.org/abs/1611.01462).
        if tie_weights:
            if nhid != ninp:
                raise ValueError('When using the tied flag, nhid must be equal to emsize')
            self.decoder.weight = self.encoder.weight

        self.init_weights()

        self.rnn_type, self.nhid, self.nlayers = rnn_type, nhid, nlayers

    def init_weights(self):
        """Draw encoder/decoder weights from U(-0.1, 0.1) and zero the decoder bias."""
        bound = 0.1
        nn.init.uniform_(self.encoder.weight.data, -bound, bound)
        nn.init.zeros_(self.decoder.bias.data)
        nn.init.uniform_(self.decoder.weight.data, -bound, bound)

    def forward(self, input, hidden):
        """Embed ``input``, run the recurrent core and decode every position.

        Returns:
            ``(decoded, hidden)``: decoder outputs and the new recurrent state.
        """
        embedded = self.drop(self.encoder(input))
        rnn_out, hidden = self.rnn(embedded, hidden)
        decoded = self.decoder(self.drop(rnn_out))
        return decoded, hidden

    def init_hidden(self, bsz):
        """Return a zero initial state for batch size ``bsz`` (a pair for LSTM)."""
        # Any parameter works as a template for dtype and device.
        template = next(self.parameters())
        shape = (self.nlayers, bsz, self.nhid)
        if self.rnn_type != 'LSTM':
            return template.new_zeros(shape)
        return (template.new_zeros(shape), template.new_zeros(shape))

Enhanced model

In [None]:

class RNNModel(nn.Module):
    """ Recurrent language model that uses an externally supplied encoder
        (e.g. one built on pre-trained word vectors) instead of its own
        embedding table.

        Args:
            encoding_size: The dimensions of the word representations produced
                by ``encoder`` (input size of the recurrent module).
            hidden_size: The number of features in the hidden state.
            output_size: Number of outputs of the final linear decoder.
            num_layers: Number of recurrent layers. E.g., setting ``num_layers=2``
                would mean stacking two LSTMs/GRUs together to form a `stacked RNN`,
                with the second RNN taking in outputs of the first RNN and
                computing the final results.
            encoder: callable module mapping token indices to representations.
            rnn_type: 'LSTM' or 'GRU'.
            dropout: probability for the embedding/output dropout and for the
                recurrent module's inter-layer dropout.
            decoupled: currently unused; a linear decoder is always created.

        Raises:
            ValueError: if ``rnn_type`` is not 'LSTM' or 'GRU'.
    """

    def __init__(self, encoding_size, hidden_size, output_size, num_layers,
                 encoder, rnn_type='LSTM', dropout=0, decoupled=True):
        super(RNNModel, self).__init__()
        self.drop = nn.Dropout(dropout)
        self.encoder = encoder
        if rnn_type in ['LSTM', 'GRU']:
            self.rnn = getattr(nn, rnn_type)(encoding_size, hidden_size, num_layers, dropout=dropout)
        else:
            raise ValueError( "An invalid option for rnn_type was supplied, "
                              "options are ['LSTM', 'GRU']")
        self.decoder = nn.Linear(hidden_size, output_size)
        self.init_layer(self.decoder)
        self.rnn_type = rnn_type
        self.encoding_size = encoding_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers

    def init_layer(self, layer):
        """Zero ``layer.bias`` (when present) and Kaiming-normal init ``layer.weight``.

        The two checks are independent, so a layer that has a weight but no
        ``bias`` attribute still gets its weight initialised.
        """
        if getattr(layer, "bias", None) is not None:
            nn.init.zeros_(layer.bias)
        if hasattr(layer, "weight"):
            nn.init.kaiming_normal_(layer.weight)

    def forward(self, input, hidden):
        """Encode ``input``, run the recurrent module and decode every position.

        Returns:
            ``(decoded, hidden)``: decoder outputs and the new recurrent state.
        """
        emb = self.drop(self.encoder(input))
        output, hidden = self.rnn(emb, hidden)
        output = self.drop(output)
        decoded = self.decoder(output)
        return decoded, hidden

    def init_hidden(self, bsz):
        """Return a zero initial state for batch size ``bsz`` (a pair for LSTM)."""
        # Any parameter serves as a template for dtype and device.
        weight = next(self.parameters())
        shape = (self.num_layers, bsz, self.hidden_size)
        if self.rnn_type == 'LSTM':
            return (weight.new_zeros(shape), weight.new_zeros(shape))
        return weight.new_zeros(shape)


Embedded Dropout

In [None]:

def embedded_dropout(embed, words, dropout=0.1, scale=None):
    """Look up ``words`` in ``embed`` after dropping whole embedding rows.

    A single Bernoulli draw per vocabulary row decides whether that row is
    zeroed; surviving rows are rescaled by ``1 / (1 - dropout)``.

    Args:
        embed: an ``nn.Embedding``-like module (``weight``, ``padding_idx``,
            ``max_norm``, ``norm_type``, ``scale_grad_by_freq``, ``sparse``).
        words: tensor of token indices.
        dropout: probability of dropping a row; a falsy value disables dropout.
        scale: optional tensor, expanded to the weight's shape and multiplied in.

    Returns:
        The embeddings of ``words`` taken from the masked (and scaled) weight.
    """
    weight = embed.weight
    if dropout:
        keep_prob = 1 - dropout
        # One draw per vocabulary row; the mask itself carries no gradient.
        row_mask = torch.empty(
            weight.size(0), 1, dtype=weight.dtype, device=weight.device
        ).bernoulli_(keep_prob) / keep_prob
        masked_embed_weight = row_mask.expand_as(weight) * weight
    else:
        masked_embed_weight = weight
    # `is not None` rather than truthiness: a multi-element tensor has no
    # unambiguous boolean value.
    if scale is not None:
        masked_embed_weight = scale.expand_as(masked_embed_weight) * masked_embed_weight

    # Pass padding_idx through unchanged. Substituting -1 for None makes
    # F.embedding treat the *last* vocabulary row as padding, so that row
    # would never receive a gradient.
    return torch.nn.functional.embedding(
        words, masked_embed_weight,
        embed.padding_idx, embed.max_norm, embed.norm_type,
        embed.scale_grad_by_freq, embed.sparse
    )


Locked dropout

In [None]:
class LockedDropout(nn.Module):
    """Dropout whose mask is sampled once and shared along the first dimension.

    The mask has shape ``(1, x.size(1), x.size(2))`` and is broadcast over
    dimension 0 of ``x``; kept entries are rescaled by ``1 / (1 - dropout)``.
    """

    def __init__(self):
        super().__init__()

    def forward(self, x, dropout=0.5):
        """Return ``x`` unchanged in eval mode or when ``dropout`` is falsy,
        otherwise ``x`` multiplied by the shared, rescaled Bernoulli mask."""
        if self.training and dropout:
            keep_prob = 1 - dropout
            sampled = x.data.new(1, x.size(1), x.size(2)).bernoulli_(keep_prob)
            scaled = Variable(sampled, requires_grad=False) / keep_prob
            return scaled.expand_as(x) * x
        return x

Weight drop

In [None]:
class WeightDrop(torch.nn.Module):
    """Wrap ``module`` so that dropout is applied to selected weights on every forward pass.

    Args:
        module: the module to wrap.
        weights: names of parameters of ``module`` to apply dropout to.
        dropout: dropout probability passed to ``torch.nn.functional.dropout``.
        variational: if true, draw one mask value per row of the weight
            (mask shape ``(rows, 1)``) instead of one per element.
    """
    def __init__(self, module, weights, dropout=0, variational=False):
        super(WeightDrop, self).__init__()
        self.module = module
        self.weights = weights
        self.dropout = dropout
        self.variational = variational
        self._setup()

    def widget_demagnetizer_y2k_edition(*args, **kwargs):
        """No-op used to replace ``flatten_parameters`` on wrapped RNN modules."""
        # We need to replace flatten_parameters with a nothing function
        # It must be a function rather than a lambda as otherwise pickling explodes
        # We can't write boring code though, so ... WIDGET DEMAGNETIZER Y2K EDITION!
        # (╯°□°）╯︵ ┻━┻
        return

    def _setup(self):
        """Re-register each listed weight under ``<name>_raw`` and remove the original parameter."""
        # Terrible temporary solution to an issue regarding compacting weights re: CUDNN RNN
        if issubclass(type(self.module), torch.nn.RNNBase):
            self.module.flatten_parameters = self.widget_demagnetizer_y2k_edition

        for name_w in self.weights:
            print('Applying weight drop of {} to {}'.format(self.dropout, name_w))
            w = getattr(self.module, name_w)
            # Drop the original entry; the trainable copy lives on as `<name>_raw`.
            del self.module._parameters[name_w]
            self.module.register_parameter(name_w + '_raw', Parameter(w.data))

    def _setweights(self):
        """Recompute every dropped weight from its ``_raw`` parameter and set it on the module."""
        for name_w in self.weights:
            raw_w = getattr(self.module, name_w + '_raw')
            w = None
            if self.variational:
                # One mask entry per row of the raw weight, broadcast across columns.
                mask = torch.autograd.Variable(torch.ones(raw_w.size(0), 1))
                if raw_w.is_cuda: mask = mask.cuda()
                # NOTE(review): training=True is hard-coded here, so this branch
                # drops rows even when the wrapper is in eval mode — confirm
                # whether that is intended.
                mask = torch.nn.functional.dropout(mask, p=self.dropout, training=True)
                w = mask.expand_as(raw_w) * raw_w
            else:
                w = torch.nn.functional.dropout(raw_w, p=self.dropout, training=self.training)
            if not self.training:
                # Outside training, hand the module a tensor without autograd history.
                w = w.data
            setattr(self.module, name_w, w)

    def forward(self, *args):
        """Refresh the dropped weights, then delegate to the wrapped module's ``forward``."""
        self._setweights()
        return self.module.forward(*args)

Ready-made function: SplitCrossEntropyLoss

In [None]:
class SplitCrossEntropyLoss(nn.Module):
    r'''Cross-entropy computed with a two-level (split) softmax.

    The vocabulary is cut into consecutive index ranges ("splits"). The first
    range is the head; every further range is represented in the head softmax
    by one extra learned "tombstone" entry, and the probability of a word in
    such a range is p(tombstone) * p(word | range).

    Args:
        hidden_size: size of the hidden vectors fed to the loss.
        splits: increasing list of boundary indices, e.g. ``[100, 500]`` for
            ranges [0, 100), [100, 500) and [500, ...). An empty list gives a
            single range.
        verbose: print accumulated size statistics on every forward call.
    '''
    def __init__(self, hidden_size, splits, verbose=False):
        super(SplitCrossEntropyLoss, self).__init__()
        self.hidden_size = hidden_size
        # The final boundary is a large sentinel so the last range covers the
        # rest of the vocabulary whatever its size.
        self.splits = [0] + list(splits) + [100 * 1000000]
        self.nsplits = len(self.splits) - 1
        self.stats = defaultdict(list)
        self.verbose = verbose
        # One tombstone vector/bias per non-head range.
        if self.nsplits > 1:
            self.tail_vectors = nn.Parameter(torch.zeros(self.nsplits - 1, hidden_size))
            self.tail_bias = nn.Parameter(torch.zeros(self.nsplits - 1))

    def _head_params(self, weight, bias):
        """Return the head weight/bias: head-range rows followed by the tombstones."""
        start, end = self.splits[0], self.splits[1]
        head_weight = None if end - start == 0 else weight[start:end]
        head_bias = None if end - start == 0 else bias[start:end]
        # Tombstones only exist when there is more than one range.
        if self.nsplits > 1:
            head_weight = self.tail_vectors if head_weight is None else torch.cat([head_weight, self.tail_vectors])
            head_bias = self.tail_bias if head_bias is None else torch.cat([head_bias, self.tail_bias])
        return head_weight, head_bias

    def logprob(self, weight, bias, hiddens, splits=None, softmaxed_head_res=None, verbose=False):
        """Return log-probabilities of the words in the requested ranges.

        Args:
            weight, bias: decoder weight and bias, indexed by vocabulary id.
            hiddens: 2-D tensor of hidden vectors, one row per prediction.
            splits: range indices to evaluate (default: all of them).
            softmaxed_head_res: pre-computed head log-softmax for ``hiddens``;
                computed here when omitted.
            verbose: unused.

        Returns:
            Tensor with one row per hidden vector and one column per word of
            the requested ranges, concatenated in the order of ``splits``.
        """
        if softmaxed_head_res is None:
            head_weight, head_bias = self._head_params(weight, bias)
            head_res = torch.nn.functional.linear(hiddens, head_weight, bias=head_bias)
            softmaxed_head_res = torch.nn.functional.log_softmax(head_res, dim=-1)

        if splits is None:
            splits = list(range(self.nsplits))

        n_tombstones = self.nsplits - 1
        results = []
        for idx in splits:
            if idx == 0:
                # Head words: strip the trailing tombstone columns. With a
                # single range there are none, and `[:, :-0]` would be empty.
                if n_tombstones:
                    results.append(softmaxed_head_res[:, :-n_tombstones])
                else:
                    results.append(softmaxed_head_res)
            else:
                start, end = self.splits[idx], self.splits[idx + 1]
                tail_res = torch.nn.functional.linear(hiddens, weight[start:end], bias=bias[start:end])

                # p(tombstone) * p(word in range); a product is a sum in log space.
                # The tombstone of range `idx` is the idx-th column from the end.
                head_entropy = (softmaxed_head_res[:, -idx]).contiguous()
                tail_entropy = torch.nn.functional.log_softmax(tail_res, dim=-1)
                results.append(head_entropy.view(-1, 1) + tail_entropy)

        if len(results) > 1:
            return torch.cat(results, dim=1)
        return results[0]

    def split_on_targets(self, hiddens, targets):
        """Group ``targets`` and the matching rows of ``hiddens`` by range.

        Returns:
            ``(split_targets, split_hiddens)``: two lists with one entry per
            range; an entry is an empty list once all targets are covered by
            earlier ranges.
        """
        # Without splits there is nothing to select.
        if self.nsplits == 1:
            return [targets], [hiddens]

        # Range index of each target = number of boundaries it reaches.
        # The comparison results are cast to integers before summing: adding
        # boolean tensors stays boolean, which would cap the count at 1.
        mask = None
        for idx in range(1, self.nsplits):
            partial_mask = (targets >= self.splits[idx]).long()
            mask = partial_mask if mask is None else mask + partial_mask

        split_targets = []
        split_hiddens = []
        covered = 0
        for idx in range(self.nsplits):
            # All targets already assigned: pad with empties so indices line up.
            if covered == len(targets):
                split_targets.append([])
                split_hiddens.append([])
                continue
            tmp_mask = mask == idx
            selected = torch.masked_select(targets, tmp_mask)
            split_targets.append(selected)
            split_hiddens.append(hiddens.masked_select(tmp_mask.unsqueeze(1).expand_as(hiddens)).view(-1, hiddens.size(1)))
            covered += len(selected)
        return split_targets, split_hiddens

    def forward(self, weight, bias, hiddens, targets, verbose=False):
        """Return the mean negative log-likelihood of ``targets``.

        Args:
            weight, bias: decoder weight and bias, indexed by vocabulary id.
            hiddens: hidden vectors; a 3-D tensor is flattened to 2-D.
            targets: 1-D tensor of target vocabulary ids, one per hidden row.
            verbose: print/record size statistics for this call.
        """
        if self.verbose or verbose:
            for idx in sorted(self.stats):
                print('{}: {}'.format(idx, int(np.mean(self.stats[idx]))), end=', ')
            print()

        total_loss = None
        if len(hiddens.size()) > 2: hiddens = hiddens.view(-1, hiddens.size(2))

        split_targets, split_hiddens = self.split_on_targets(hiddens, targets)
        head_weight, head_bias = self._head_params(weight, bias)

        # Head softmax for every row, ordered range by range; empty ranges are
        # skipped because torch.cat cannot take the placeholder lists.
        combo = torch.cat([split_hiddens[i] for i in range(self.nsplits) if len(split_hiddens[i])])
        all_head_res = torch.nn.functional.linear(combo, head_weight, bias=head_bias)
        softmaxed_all_head_res = torch.nn.functional.log_softmax(all_head_res, dim=-1)
        if self.verbose or verbose:
            self.stats[0].append(combo.size()[0] * head_weight.size()[0])

        running_offset = 0
        for idx in range(self.nsplits):
            # No targets in this range.
            if len(split_targets[idx]) == 0: continue

            n_rows = len(split_hiddens[idx])
            softmaxed_head_res = softmaxed_all_head_res[running_offset:running_offset + n_rows]

            if idx == 0:
                # Head targets are read straight from the head softmax.
                entropy = -torch.gather(softmaxed_head_res, dim=1, index=split_targets[idx].view(-1, 1))
            else:
                if self.verbose or verbose:
                    start, end = self.splits[idx], self.splits[idx + 1]
                    tail_weight = weight[start:end]
                    self.stats[idx].append(split_hiddens[idx].size()[0] * tail_weight.size()[0])

                tail_res = self.logprob(weight, bias, split_hiddens[idx], splits=[idx], softmaxed_head_res=softmaxed_head_res)

                head_entropy = softmaxed_head_res[:, -idx]
                # Indices inside a range are relative to the range's start.
                indices = (split_targets[idx] - self.splits[idx]).view(-1, 1)
                # Drop the gathered column dimension so the sum below does not
                # broadcast an (N, 1) against an (N,) tensor.
                tail_entropy = torch.gather(torch.nn.functional.log_softmax(tail_res, dim=-1), dim=1, index=indices).squeeze(1)
                entropy = -(head_entropy + tail_entropy)

            running_offset += n_rows
            total_loss = entropy.float().sum() if total_loss is None else total_loss + entropy.float().sum()

        return (total_loss / len(targets)).type_as(weight)



AWD_LSTM

In [None]:

class AWDRNNModel(nn.Module):
    """Stack of single-layer recurrent modules with embedding, locked and weight dropout.

    Args:
        rnn_type: 'LSTM' or 'GRU'. 'QRNN' passes the name check but is not
            implemented here.
        ntoken: vocabulary size.
        ninp: embedding dimension.
        nhid: hidden size of every layer except possibly the last.
        nlayers: number of stacked single-layer recurrent modules.
        dropout: locked-dropout probability on the final output.
        dropouth: locked-dropout probability between recurrent layers.
        dropouti: locked-dropout probability on the embedded input.
        dropoute: probability of dropping whole embedding rows (training only).
        wdrop: if non-zero, wrap every layer in ``WeightDrop`` on ``weight_hh_l0``.
        tie_weights: share decoder and embedding weights; the last layer then
            outputs ``ninp`` features instead of ``nhid``.

    Raises:
        NotImplementedError: for ``rnn_type == 'QRNN'``.
    """

    def __init__(self, rnn_type, ntoken, ninp, nhid, nlayers, dropout=0.5, dropouth=0.5, dropouti=0.5, dropoute=0.1, wdrop=0, tie_weights=False):
        super(AWDRNNModel, self).__init__()
        self.lockdrop = LockedDropout()
        self.idrop = nn.Dropout(dropouti)
        self.hdrop = nn.Dropout(dropouth)
        self.drop = nn.Dropout(dropout)
        self.encoder = nn.Embedding(ntoken, ninp)
        assert rnn_type in ['LSTM', 'QRNN', 'GRU'], 'RNN type is not supported'
        layer_classes = {'LSTM': torch.nn.LSTM, 'GRU': torch.nn.GRU}
        if rnn_type not in layer_classes:
            # Fail with a clear message instead of an AttributeError on
            # `self.rnns` further down.
            raise NotImplementedError(
                "rnn_type {!r} is not implemented; use 'LSTM' or 'GRU'".format(rnn_type))
        layer_cls = layer_classes[rnn_type]
        # The last layer emits ninp features when weights are tied so its
        # output matches the embedding matrix reused as the decoder weight.
        last_size = ninp if tie_weights else nhid
        rnns = [layer_cls(ninp if l == 0 else nhid, nhid if l != nlayers - 1 else last_size, 1, dropout=0) for l in range(nlayers)]
        if wdrop:
            rnns = [WeightDrop(rnn, ['weight_hh_l0'], dropout=wdrop) for rnn in rnns]
        print("AWD ", rnns)
        self.rnns = torch.nn.ModuleList(rnns)
        self.decoder = nn.Linear(nhid, ntoken)

        if tie_weights:
            self.decoder.weight = self.encoder.weight

        self.init_weights()

        self.rnn_type = rnn_type
        self.ninp = ninp
        self.nhid = nhid
        self.nlayers = nlayers
        self.dropout = dropout
        self.dropouti = dropouti
        self.dropouth = dropouth
        self.dropoute = dropoute
        self.tie_weights = tie_weights

    def init_weights(self):
        """Draw encoder/decoder weights from U(-0.1, 0.1) and zero the decoder bias."""
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.fill_(0)
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, input, hidden, return_h=False):
        """Run the recurrent stack; the decoder is NOT applied here.

        Args:
            input: tensor of token indices.
            hidden: list with one state per layer (as built by ``init_hidden``).
            return_h: also return the per-layer raw and dropped-out outputs.

        Returns:
            ``(result, hidden)`` where ``result`` is the last layer's
            dropped-out output flattened to two dimensions, or
            ``(result, hidden, raw_outputs, outputs)`` when ``return_h`` is set.
        """
        # Embedding-row dropout is only active in training mode.
        emb = embedded_dropout(self.encoder, input, dropout=self.dropoute if self.training else 0)
        emb = self.lockdrop(emb, self.dropouti)

        raw_output = emb
        new_hidden = []
        raw_outputs = []
        outputs = []
        for l, rnn in enumerate(self.rnns):
            raw_output, new_h = rnn(raw_output, hidden[l])
            new_hidden.append(new_h)
            raw_outputs.append(raw_output)
            # Hidden-to-hidden dropout between layers; the last layer gets the
            # output dropout below instead.
            if l != self.nlayers - 1:
                raw_output = self.lockdrop(raw_output, self.dropouth)
                outputs.append(raw_output)
        hidden = new_hidden

        output = self.lockdrop(raw_output, self.dropout)
        outputs.append(output)

        result = output.view(output.size(0)*output.size(1), output.size(2))
        if return_h:
            return result, hidden, raw_outputs, outputs
        return result, hidden

    def init_hidden(self, bsz):
        """Return a list of zero initial states, one per layer.

        Each entry is an ``(h, c)`` pair for LSTM and a single tensor for GRU.
        """
        weight = next(self.parameters())
        last_size = self.ninp if self.tie_weights else self.nhid
        sizes = [self.nhid if l != self.nlayers - 1 else last_size
                 for l in range(self.nlayers)]
        if self.rnn_type == 'LSTM':
            return [(weight.new_zeros(1, bsz, size), weight.new_zeros(1, bsz, size))
                    for size in sizes]
        return [weight.new_zeros(1, bsz, size) for size in sizes]


AWD-LSTM with pre-trained word embeddings

In [None]:
class EAWDRNNModel(nn.Module):
    """AWD-style recurrent stack driven by an externally supplied encoder.

    Args:
        rnn_type: 'LSTM' or 'GRU'. 'QRNN' passes the name check but is not
            implemented here.
        ntoken: number of outputs of the decoder layer.
        encoder: module mapping token indices to representations; it must
            expose ``encoding_size`` (the size of those representations).
        nhid: hidden size of every layer except possibly the last.
        nlayers: number of stacked single-layer recurrent modules.
        dropout: probability for the input dropout and for the locked dropout
            on the final output.
        dropouth: locked-dropout probability between recurrent layers.
        dropouti, dropoute: stored but not used by ``forward``.
        wdrop: if non-zero, wrap every layer in ``WeightDrop`` on ``weight_hh_l0``.
        tie_weights: reuse ``encoder.weight`` as decoder weight; the last layer
            then outputs ``encoder.encoding_size`` features.

    Raises:
        NotImplementedError: for ``rnn_type == 'QRNN'``.
    """
    def __init__(self, rnn_type, ntoken, encoder, nhid, nlayers, dropout=0.5, dropouth=0.5, dropouti=0.5, dropoute=0.1, wdrop=0, tie_weights=False):
        super(EAWDRNNModel, self).__init__()
        self.lockdrop = LockedDropout()
        self.idrop = nn.Dropout(dropouti)
        self.hdrop = nn.Dropout(dropouth)
        self.drop = nn.Dropout(dropout)
        self.encoder = encoder
        assert rnn_type in ['LSTM', 'QRNN', 'GRU'], 'RNN type is not supported'
        layer_classes = {'LSTM': torch.nn.LSTM, 'GRU': torch.nn.GRU}
        if rnn_type not in layer_classes:
            # Fail with a clear message instead of an AttributeError on
            # `self.rnns` further down.
            raise NotImplementedError(
                "rnn_type {!r} is not implemented; use 'LSTM' or 'GRU'".format(rnn_type))
        layer_cls = layer_classes[rnn_type]
        ninp = encoder.encoding_size
        last_size = ninp if tie_weights else nhid
        rnns = [layer_cls(ninp if l == 0 else nhid, nhid if l != nlayers - 1 else last_size, 1, dropout=0) for l in range(nlayers)]
        if wdrop:
            rnns = [WeightDrop(rnn, ['weight_hh_l0'], dropout=wdrop) for rnn in rnns]
        print("AWD ", rnns)
        self.rnns = torch.nn.ModuleList(rnns)
        self.decoder = nn.Linear(nhid, ntoken)
        self.init_layer(self.decoder)
        if tie_weights:
            # NOTE(review): this needs the encoder to expose a `weight`
            # attribute of matching shape — confirm for the encoder in use.
            self.decoder.weight = self.encoder.weight

        self.rnn_type = rnn_type
        # Kept so init_hidden can size the last layer without a global lookup.
        self.ninp = ninp
        self.nhid = nhid
        self.nlayers = nlayers
        self.dropout = dropout
        self.dropouti = dropouti
        self.dropouth = dropouth
        self.dropoute = dropoute
        self.tie_weights = tie_weights

    def init_layer(self, layer):
        """Zero ``layer.bias`` (when present) and Kaiming-normal init ``layer.weight``."""
        if getattr(layer, "bias", None) is not None:
            nn.init.zeros_(layer.bias)
        if hasattr(layer, "weight"):
            nn.init.kaiming_normal_(layer.weight)

    def forward(self, input, hidden, return_h=False):
        """Run the recurrent stack; the decoder is NOT applied here.

        Args:
            input: tensor of token indices.
            hidden: list with one state per layer (as built by ``init_hidden``).
            return_h: also return the per-layer raw and dropped-out outputs.

        Returns:
            ``(result, hidden)`` where ``result`` is the last layer's
            dropped-out output flattened to two dimensions, or
            ``(result, hidden, raw_outputs, outputs)`` when ``return_h`` is set.
        """
        emb = self.drop(self.encoder(input))

        raw_output = emb
        new_hidden = []
        raw_outputs = []
        outputs = []
        for l, rnn in enumerate(self.rnns):
            raw_output, new_h = rnn(raw_output, hidden[l])
            new_hidden.append(new_h)
            raw_outputs.append(raw_output)
            # Hidden-to-hidden dropout between layers; the last layer gets the
            # output dropout below instead.
            if l != self.nlayers - 1:
                raw_output = self.lockdrop(raw_output, self.dropouth)
                outputs.append(raw_output)
        hidden = new_hidden

        output = self.lockdrop(raw_output, self.dropout)
        outputs.append(output)

        result = output.view(output.size(0)*output.size(1), output.size(2))
        if return_h:
            return result, hidden, raw_outputs, outputs
        return result, hidden

    def init_hidden(self, bsz):
        """Return a list of zero initial states, one per layer.

        Layer sizes mirror the ones used in ``__init__``: ``nhid`` everywhere,
        except that the last layer uses the encoder's ``encoding_size`` when
        weights are tied. Each entry is an ``(h, c)`` pair for LSTM and a
        single tensor for GRU.
        """
        weight = next(self.parameters())
        last_size = self.encoder.encoding_size if self.tie_weights else self.nhid
        sizes = [self.nhid if l != self.nlayers - 1 else last_size
                 for l in range(self.nlayers)]
        if self.rnn_type == 'LSTM':
            return [(weight.new_zeros(1, bsz, size), weight.new_zeros(1, bsz, size))
                    for size in sizes]
        return [weight.new_zeros(1, bsz, size) for size in sizes]

Data loader


In [None]:

class Corpus(object):
    """ Object to tokenize and store the text corpus

        path: path to the directory with the training, validation and test
            datasets ('train.txt','valid.txt','test.txt')
        embedding: pre-trained embedding object exposing ``vectors`` and either
            ``key_to_index`` (word -> row) or the legacy ``vocab`` mapping
            (word -> object with ``.index``), e.g. a gensim KeyedVectors.
        vocab: dictionary from word to index (should contain all words in the
            vocabulary, with the ones with vector representations first).
        vectors: vector representations.
        load: whether to load the vocab and vectors, or derive them from the
            training corpus. If true, vocab and vectors must be provided,
            otherwise, embedding should be provided.
        portion: fraction of the training file's lines to keep (from the start).

        After construction ``train``, ``valid`` and ``test`` hold 1-D int64
        tensors of word indices, ``vocab`` the word -> index mapping and
        ``vectors`` the pre-trained vectors of the first ``len(vectors)`` words.
    """

    def __init__(self, path, embedding=None, vocab=None, vectors=None,
                load=False, portion=1):
        if load:
            self._load(vocab, vectors)
        else:
            self.narrow_vocab(os.path.join(path, 'train.txt'), embedding)
        self.train = self.tokenize(os.path.join(path, 'train.txt'), portion)
        self.valid = self.tokenize(os.path.join(path, 'valid.txt'))
        self.test = self.tokenize(os.path.join(path, 'test.txt'))

    def _load(self, vocab, vectors):
        """Adopt a previously built vocabulary and its vectors."""
        self.vocab = vocab
        self.vectors = vectors

    @staticmethod
    def _embedding_index(embeddings):
        """Return a function mapping a word to its embedding row, or None if absent."""
        key_to_index = getattr(embeddings, 'key_to_index', None)
        if key_to_index is not None:
            return key_to_index.get
        legacy_vocab = embeddings.vocab
        return lambda word: legacy_vocab[word].index if word in legacy_vocab else None

    def narrow_vocab(self, path, embeddings):
        """ Build the vocabulary from the training file.

            Words that have a pre-trained vector get the lowest indices (their
            vectors are collected in the same order); the remaining words
            follow. Within each group, words keep the order in which they
            first appear in the file, so the mapping is identical on every run.
        """
        assert os.path.exists(path)
        if embeddings is None:
            raise ValueError("an embedding is required when load is False")

        # A dict is used as an insertion-ordered set of distinct words.
        vocabulary = {}
        with open(path, 'r', encoding="utf8") as f:
            for line in f:
                for word in line.split() + ['<eos>']:
                    vocabulary.setdefault(word, None)

        row_of = self._embedding_index(embeddings)
        emb_vocab = {}
        vocab_out = {}
        vectors = []
        for word in vocabulary:
            row = row_of(word)
            if row is not None:
                emb_vocab[word] = len(emb_vocab)
                vectors.append(embeddings.vectors[row])
            else:
                vocab_out[word] = len(vocab_out)
        # Words without a vector are numbered after the ones that have one.
        offset = len(emb_vocab)
        emb_vocab.update({word: index + offset for word, index in vocab_out.items()})
        self.vocab = emb_vocab
        self.vectors = vectors

    def tokenize(self, path, portion=1):
        """ Returns the word indices for a text file (dataset).

            Every line is terminated with '<eos>'. Only the first
            ``int(num_lines * portion)`` lines are kept.

            Raises KeyError if the file contains a word missing from the
            vocabulary.
        """
        assert os.path.exists(path)

        per_line = []
        with open(path, 'r', encoding="utf8") as f:
            for line in f:
                words = line.split() + ['<eos>']
                try:
                    ids = [self.vocab[word] for word in words]
                except KeyError as err:
                    raise KeyError("word {} in {} is not in the vocabulary".format(
                        err, path)) from err
                per_line.append(torch.tensor(ids, dtype=torch.int64))

        kept = per_line[:int(len(per_line) * portion)]
        if not kept:
            # torch.cat rejects an empty list; an empty file or a tiny portion
            # simply yields no tokens.
            return torch.empty(0, dtype=torch.int64)
        return torch.cat(kept)


def batchify(data, bsz, device):
    """Arrange a 1-D token stream into ``bsz`` parallel columns on ``device``.

    Trailing elements that do not fill a complete row are discarded. Column
    ``j`` of the result holds the ``j``-th contiguous chunk of ``data``.
    """
    rows = data.size(0) // bsz
    # Keep only as many elements as fill `rows` complete rows.
    trimmed = data.narrow(0, 0, rows * bsz)
    # One chunk per row, then transpose so each chunk becomes a column.
    columns = trimmed.view(bsz, -1).t()
    return columns.contiguous().to(device)

def get_batch(source, i, seq_len=50):
    """Return ``(data, target)`` for the window of ``source`` starting at row ``i``.

    ``data`` has at most ``seq_len`` rows (fewer near the end of ``source``);
    ``target`` is the same window shifted by one row and flattened.
    """
    # Leave one row after the window so every input row has a target.
    rows_left = len(source) - 1 - i
    span = min(seq_len, rows_left)
    stop = i + span
    inputs = source[i:stop]
    shifted = source[i + 1:stop + 1]
    return inputs, shifted.view(-1)

Encoding


In [None]:

class Encoder(torch.nn.Module):
    """ Word encoder. The representation of a word is the concatenation of a
        fixed part and a learned part. If the word has a pre-trained vector,
        the fixed part is that vector and the learned part is the output of
        the linear layer for an all-zero input. Otherwise the fixed part is
        all zeros and the learned part is the output of the linear layer for
        the word's one-hot vector.
    """

    def __init__(self, size, vocab_size, vectors, default='zero'):
        """ size: number of units in the linear layer.

            vocab_size: number of words in the vocabulary.

            vectors: transfer learning vector representation for the first words
            in the vocabulary, in order.

            default: representation for the vectors part when the word doesn't
            have a pre-trained vector representation. Currently unused: zeros
            are always used.
        """
        super().__init__()
        vectors = torch.as_tensor(vectors, dtype=torch.get_default_dtype())
        n_known = len(vectors)
        self.input_size = vocab_size - n_known
        assert self.input_size > 0
        self.hidden_size = size
        self.size = size
        self.encoding_size = self.hidden_size + vectors.shape[1]
        self.linear = torch.nn.Linear(self.input_size, self.hidden_size)
        self.init_layer(self.linear)

        # Both lookup tables are registered as buffers so that `.to(device)`
        # moves them together with the parameters. They are non-persistent so
        # the state_dict layout stays exactly as before.
        fixed_part = torch.cat(
            (vectors, torch.zeros((self.input_size, vectors.shape[1]))), 0)
        self.register_buffer('vectors', fixed_part, persistent=False)

        # Rows of words with a vector are all-zero; each remaining word gets
        # its own one-hot row.
        linear_inputs = torch.cat(
            (torch.zeros((n_known, self.input_size)), torch.eye(self.input_size)), 0)
        self.register_buffer('linear_inputs', linear_inputs, persistent=False)

    def encode1(self, inputs):
        """Look up the fixed (pre-trained or zero) part for ``inputs``."""
        return F.embedding(inputs, self.vectors)

    def encode2(self, inputs):
        """Look up the linear layer's input rows (zero or one-hot) for ``inputs``."""
        return F.embedding(inputs, self.linear_inputs)

    def forward(self, batch):
        """Return the fixed and learned parts concatenated along the last dimension."""
        fixed = self.encode1(batch)
        learned = self.linear(self.encode2(batch))
        return torch.cat((fixed, learned), -1)

    def init_layer(self, layer):
        """Zero ``layer.bias`` (when present) and Kaiming-normal init ``layer.weight``.

        The two checks are independent, so a layer that has a weight but no
        ``bias`` attribute still gets its weight initialised.
        """
        if getattr(layer, "bias", None) is not None:
            torch.nn.init.zeros_(layer.bias)
        if hasattr(layer, "weight"):
            torch.nn.init.kaiming_normal_(layer.weight)

Training functions

In [None]:

def repackage_hidden(h):
    """Detach a hidden state from its autograd history.

    A tensor is returned detached; any other iterable is processed
    element-wise, recursively, and returned as a tuple.
    """
    if not isinstance(h, torch.Tensor):
        return tuple(map(repackage_hidden, h))
    return h.detach()

class Trainer():
    """Trains a language model with plain gradient descent, one epoch per ``train()`` call.

    Args:
        model: module exposing ``init_hidden(batch_size)`` and returning
            ``(output, hidden)`` when called with ``(data, hidden)``.
        corpus: object with a ``train`` token tensor and a ``vocab`` mapping.
        criterion: loss called as ``criterion(output, targets)``.
        device: device the model and the batched data are moved to.
        logger: optional object with ``log_train(epoch, batch, loss)``.
        batch_size: number of parallel sequences.
        seq_len: number of time steps per batch.
        learning_rate: step size of the parameter update.
        log_interval: print/log the running loss every this many batches.
        clip_grad: maximum gradient norm.
    """

    def __init__(self, model, corpus, criterion, device, logger = None,
                 batch_size = 25, seq_len = 35, learning_rate = 20,
                 log_interval=100, clip_grad= 0.25):
        self.device = device
        self.model = model.to(device)
        self.criterion = criterion
        self.train_data = batchify(corpus.train, batch_size, device)
        self.corpus = corpus
        # Incremented at the start of train(), so the first epoch is number 0.
        self.epoch = -1
        self.batch_size = batch_size
        self.seq_len = seq_len
        self.learning_rate = learning_rate
        self.log_interval = log_interval
        self.clip_grad = clip_grad
        self.logging = logger is not None
        self.logger = logger

    def train(self):
        """Run one pass over the training data, updating the model in place."""
        self.epoch += 1
        self.model.train()
        total_loss = 0.
        start_time = time.time()
        number_tokens = len(self.corpus.vocab)
        hidden = self.model.init_hidden(self.batch_size)

        for batch, i in enumerate(range(0, self.train_data.size(0) - 1, self.seq_len)):
            data, targets = get_batch(self.train_data, i, seq_len=self.seq_len)
            # Starting each batch, we detach the hidden state from how it was previously produced.
            # If we didn't, the model would try backpropagating all the way to start of the dataset.
            self.model.zero_grad()
            hidden = repackage_hidden(hidden)
            output, hidden = self.model(data, hidden)
            loss = self.criterion(output.view(-1, number_tokens), targets.long())
            loss.backward()

            # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.clip_grad)
            # Plain SGD step: p <- p - lr * grad. Parameters that took no part
            # in the loss have no gradient and are left untouched.
            with torch.no_grad():
                for p in self.model.parameters():
                    if p.grad is not None:
                        p.add_(p.grad, alpha=-self.learning_rate)

            total_loss += loss.item()

            if batch % self.log_interval == 0 and batch > 0:
                cur_loss = total_loss / self.log_interval
                elapsed = time.time() - start_time
                print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                        'loss {:5.2f} | ppl {:8.2f}'.format(
                    self.epoch, batch, len(self.train_data) // self.seq_len,
                    self.learning_rate, elapsed * 1000 / self.log_interval,
                    cur_loss, math.exp(cur_loss)))
                if self.logging:
                    self.logger.log_train(self.epoch, batch, cur_loss)
                total_loss = 0
                start_time = time.time()


def evaluate(model, corpus, criterion, device, batch_size = 10, seq_len = 35,
             set = 'valid'):
    """Return the average loss of `model` on one split of `corpus`.

    `set` names the attribute of `corpus` to evaluate on (e.g. 'valid').
    The split is batchified into `batch_size` columns and fed to the model in
    chunks of `seq_len` rows with gradients disabled. Each chunk's loss is
    weighted by its number of rows and the sum is divided by
    ``len(data) - 1``.
    """
    # Evaluation mode disables dropout.
    model.eval()
    vocab_size = len(corpus.vocab)
    source = batchify(getattr(corpus, set), batch_size, device)
    state = model.init_hidden(batch_size)
    loss_sum = 0.
    with torch.no_grad():
        for start in range(0, source.size(0) - 1, seq_len):
            inputs, labels = get_batch(source, start, seq_len=seq_len)
            logits, state = model(inputs, state)
            state = repackage_hidden(state)
            chunk_loss = criterion(logits.view(-1, vocab_size), labels.long()).item()
            loss_sum += len(inputs) * chunk_loss
    return loss_sum / (len(source) - 1)


Training for AWDLSTM

In [None]:
class AWDTrainer():
    """Epoch-by-epoch trainer for the AWD-LSTM style models.

    The trainer owns the batchified training data and, on every call to
    `train`, runs one epoch with variable-length BPTT windows, activation
    regularization (`alpha`) and temporal activation regularization (`beta`).
    """

    def __init__(self, model, corpus, criterion, optimizer, device, logger = None,
                 batch_size = 80, bptt = 70,
                 alpha = 2, beta = 1, log_interval=100, clip_grad= 0.25):
        """Store the training configuration and batchify `corpus.train`.

        `logger`, when given, must provide `log_train(epoch, batch, loss)`;
        it is called at every logging interval.
        """
        self.device = device
        self.model = model.to(device)
        self.criterion = criterion
        self.optimizer = optimizer
        self.train_data = batchify(corpus.train, batch_size, device)
        self.corpus = corpus
        self.epoch = -1
        self.batch_size = batch_size
        self.bptt = bptt
        self.alpha = alpha
        self.beta = beta
        self.log_interval = log_interval
        self.clip_grad = clip_grad
        # Kept so that training progress can be logged like in `Trainer`.
        self.logger = logger

    def train(self):
        """Run one training epoch and print (and optionally log) progress."""
        self.epoch += 1
        total_loss = 0.
        start_time = time.time()
        hidden = self.model.init_hidden(self.batch_size)
        param_group = self.optimizer.param_groups[0]
        batch, i = 0, 0
        while i < self.train_data.size(0) - 1 - 1:
            # Use the full bptt window 95% of the time, half of it otherwise.
            tmp = self.bptt if np.random.random() < 0.95 else self.bptt / 2.
            # Prevent excessively small or negative sequence lengths
            seq_len = max(5, int(np.random.normal(tmp, 5)))
            # NOTE: there's a very small chance of drawing a very long
            # sequence length, which could result in OOM.

            # Temporarily rescale the learning rate in proportion to the
            # sampled sequence length; it is restored after the step.
            lr2 = param_group['lr']
            param_group['lr'] = lr2 * seq_len / self.bptt
            # Turn on training mode which enables dropout.
            self.model.train()
            data, targets = get_batch(self.train_data, i, seq_len=seq_len)
            # Starting each batch, we detach the hidden state from how it was previously produced.
            # If we didn't, the model would try backpropagating all the way to start of the dataset.
            hidden = repackage_hidden(hidden)
            self.optimizer.zero_grad()

            # Use the trainer's own model/criterion rather than module-level globals.
            output, hidden, rnn_hs, dropped_rnn_hs = self.model(data, hidden, return_h=True)
            raw_loss = self.criterion(self.model.decoder.weight, self.model.decoder.bias,
                                      output, targets)

            loss = raw_loss
            # Activation Regularization (on the last layer's dropped-out output)
            if self.alpha:
                loss = loss + sum(self.alpha * dropped_rnn_h.pow(2).mean()
                                  for dropped_rnn_h in dropped_rnn_hs[-1:])
            # Temporal Activation Regularization (slowness)
            if self.beta:
                loss = loss + sum(self.beta * (rnn_h[1:] - rnn_h[:-1]).pow(2).mean()
                                  for rnn_h in rnn_hs[-1:])
            loss.backward()

            # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
            if self.clip_grad:
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.clip_grad)
            self.optimizer.step()

            # Only the un-regularized loss is reported.
            total_loss += raw_loss.item()
            param_group['lr'] = lr2
            if batch % self.log_interval == 0 and batch > 0:
                cur_loss = total_loss / self.log_interval
                elapsed = time.time() - start_time
                print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:05.5f} | ms/batch {:5.2f} | '
                        'loss {:5.2f} | ppl {:8.2f} | bpc {:8.3f}'.format(
                    self.epoch, batch, len(self.train_data) // self.bptt, param_group['lr'],
                    elapsed * 1000 / self.log_interval, cur_loss, math.exp(cur_loss), cur_loss / math.log(2)))
                if self.logger is not None:
                    self.logger.log_train(self.epoch, batch, cur_loss)
                total_loss = 0.
                start_time = time.time()
            batch += 1
            i += seq_len

def awdevaluate(model, corpus, criterion, device, batch_size=10, bptt = 70):
    """Return the average validation loss of an AWD-style `model`.

    `corpus.valid` is batchified into `batch_size` columns and processed in
    chunks of `bptt` rows. `criterion` is called with the decoder weight and
    bias, the model output and the targets; each chunk's loss is weighted by
    its number of rows and the total is divided by ``len(data_source)``.
    """
    # Turn on evaluation mode which disables dropout.
    model.eval()
    total_loss = 0.
    data_source = batchify(corpus.valid, batch_size, device)
    hidden = model.init_hidden(batch_size)
    # No gradients are needed for evaluation; skipping the autograd graph
    # keeps memory usage flat.
    with torch.no_grad():
        for i in range(0, data_source.size(0) - 1, bptt):
            data, targets = get_batch(data_source, i, seq_len=bptt)
            output, hidden = model(data, hidden)
            chunk_loss = criterion(model.decoder.weight, model.decoder.bias, output, targets)
            # Accumulate as a Python float so the result is defined even
            # when the loop body never runs.
            total_loss += len(data) * chunk_loss.item()
            hidden = repackage_hidden(hidden)
    return total_loss / len(data_source)



Utils

In [None]:

def save_checkpoint(model, path, valid_loss, args=None):
    """Pickle the model parameters, validation loss and arguments to `path`.

    The stored object is a dict with keys 'params' (``model.state_dict()``),
    'valid_loss' and 'args'. Nothing is written when `path` is empty/None.
    `args` defaults to an empty dict.
    """
    if not path:
        return
    to_save = {'params' : model.state_dict(), 'valid_loss': valid_loss,
               # Build a fresh dict per call instead of sharing a mutable default.
               'args': {} if args is None else args}
    with open(path, 'wb') as f:
        pickle.dump(to_save, f)
    print('checkpoint saved to {}'.format(path))

def load_checkpoint(path):
    """Unpickle and return the object stored in the file at `path`.

    NOTE: unpickling executes code from the file; only load checkpoints
    that come from a trusted source.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)

class Logger(object):
    """Write training and validation progress to CSV files in a directory.

    On construction `train.csv` and `valid.csv` are (re)created inside
    `path` with a header row followed by a row that only carries the
    creation time (every other field is ``nan``).
    """

    def __init__(self, path):
        self.create_files(path)

    def create_files(self, path):
        """Create fresh log files inside the existing directory `path`.

        Raises RuntimeError when `path` does not exist.
        """
        if not os.path.exists(path):
            # Include the offending path in the message.
            raise RuntimeError("the folder {} doesn't exist.".format(path))
        self.base_path = path
        self.train_log_file = os.path.join(path, 'train.csv')
        self.valid_log_file = os.path.join(path,'valid.csv')
        with open(self.train_log_file, 'w') as f:
            f.write("epoch,batches,time,loss,perplexity\n")
            f.write("nan,nan,{},nan,nan\n".format(time.time()))
        with open(self.valid_log_file,'w') as f:
            f.write("epoch,time,loss,perplexity\n")
            f.write("nan,{},nan,nan\n".format(time.time()))

    def log_valid(self, epoch, loss):
        """Append one validation row: epoch, time, loss, exp(loss)."""
        line = '{},{},{},{}\n'.format(epoch, time.time(), loss, math.exp(loss))
        with open(self.valid_log_file, 'a') as f:
            f.write(line)

    def log_train(self, epoch, batches, loss):
        """Append one training row: epoch, batches, time, loss, exp(loss)."""
        line = '{},{},{},{},{}\n'.format(epoch, batches, time.time(),
                                         loss, math.exp(loss))
        with open(self.train_log_file, 'a') as f:
            f.write(line)

    def log_description(self, args):
        """Write the current date and every attribute of `args` to description.txt."""
        path = os.path.join(self.base_path, 'description.txt')
        args = args.__dict__
        with open(path, 'w') as fout:
            fout.write(str(datetime.now()) + '\n\n')
            for key, val in args.items():
                fout.write('{}: {}\n'.format(key,val))


def load_model_corpora(checkpoint):
    """ Load the model the checkpoint pointed at by `checkpoint' is for and the
        corpora indicated in the arguments within the checkpoint.

        The checkpoint must be a dictionary with at least the keys 'args'
        (the training arguments) and 'params' (the model state dict). The
        model is rebuilt from `args.model` with the same constructor calls
        the training script uses, then the parameters are loaded into it.

        Returns the tuple (model, corpora).
        Raises RuntimeError when the checkpoint cannot be read, and
        ValueError when `args.load` is empty or `args.model` is unknown.
    """
    try:
        checkpoint = load_checkpoint(checkpoint)
        args = checkpoint['args']
        params = checkpoint['params']
    except Exception as e:
        print('The following exception ocurred:')
        print(e)
        raise RuntimeError('The first object in checkpoint must be a '
              'dictionary containing at least [args,params].') from e
    # Use the arguments to create a model that is the same as the one we have
    # the parameters for.
    if not args.load:
        # Without the stored vocabulary/vectors there is no corpus to size
        # the model with; fail with an explicit message.
        raise ValueError('The checkpoint arguments do not provide `load` '
                         '(path to the stored vocabulary and vectors).')
    with open(args.load,'rb') as f:
        stored_dict = pickle.load(f)
    corpora = Corpus(args.corpus,load=True,vocab=stored_dict['vocabulary'],
               vectors=stored_dict['vectors'])

    if args.model == 'OLDLSTM':
        model = Old_model('LSTM', len(corpora.vocab), args.encoder_size,
                    args.hidden_size, args.layers, args.dropout)
    elif args.model == 'LSTM':
        encoder = Encoder(50, len(corpora.vocab), corpora.vectors)
        model = RNNModel(encoder.encoding_size, args.hidden_size,
                    len(corpora.vocab), args.layers, encoder, dropout=args.dropout)
    elif args.model == 'AWDLSTM':
        # Same argument order as the training script; this model takes no
        # encoder object.
        model = AWDRNNModel('LSTM', len(corpora.vocab), args.encoder_size, args.hidden_size, args.layers,
                            args.dropout, args.dropouth, args.dropouti, args.dropoute, args.wdrop, args.tied)
    elif args.model == 'EAWDLSTM':
        encoder = Encoder(50, len(corpora.vocab), corpora.vectors)
        model = EAWDRNNModel('LSTM', len(corpora.vocab), encoder,
                            args.hidden_size, args.layers, args.dropout, args.dropouth, args.dropouti, args.dropoute, args.wdrop, args.tied)
    else:
        raise ValueError('Unknown model type in checkpoint: {}'.format(args.model))
    # load the parameters from checkpoint
    model.load_state_dict(params)
    return model, corpora


Main function

In [None]:

# Command-line options of the training script.
parser = argparse.ArgumentParser(
    description="In the future, train a LSTM language model using word embeddings.",
    formatter_class=argparse.ArgumentDefaultsHelpFormatter,)
parser.add_argument(
    "--corpus", default="drive/MyDrive/ptb/", type=str,
    help="Path where train.txt, valid.txt and test.txt are contained.")
parser.add_argument("--embedding-path", default="drive/MyDrive/vectors.bin", type=str,
                    help="Path for the binary file containing the embeddings.")
parser.add_argument("--epochs", default=20, type=int,
                    help='Number of epochs to train for.')
# The learning rate is a float so that fractional values can be passed.
parser.add_argument("--lr", default=20.0, type=float, help='learning rate.')
parser.add_argument("--batch-size", default=64, type=int,
                    help='Number of batches to divide the data in.')
parser.add_argument("--seq-len", default=35, type=int,
                    help='length of the training sequences (backpropagation through time '
                    'will be truncated to this number of steps).')
parser.add_argument("--dropout", default=0.5, type=float,
                     help='dropout of the network.')
parser.add_argument('--dropouth', type=float, default=0.3,
                    help='dropout for rnn layers (0 = no dropout)')
parser.add_argument('--dropouti', type=float, default=0.65,
                    help='dropout for input embedding layers (0 = no dropout)')
parser.add_argument('--dropoute', type=float, default=0.1,
                    help='dropout to remove words from embedding layer (0 = no dropout)')
parser.add_argument('--wdrop', type=float, default=0.5,
                    help='amount of weight dropout to apply to the RNN hidden to hidden matrix')
parser.add_argument("--clip-grad", default=0.25, type=float,
                     help='gradient clipping.')
parser.add_argument("--layers", default=2, type=int,
                    help='Number of stacked RNN layers.')
parser.add_argument("--hidden-size", default=350, type=int,
                    help='The number of units each RNN layer has.')
parser.add_argument("--load", default='drive/MyDrive/chekpoints/ptb.pkl', type=str,
                    help='If provided, the path with vocabulary and vectors.')
parser.add_argument("--checkpoint", default='drive/MyDrive/chekpoints/checkpoints.pkl', type=str,
                    help='Path to store checkpoints of the model during training.')
parser.add_argument("--log-dir", default='drive/MyDrive/logs', type=str,
                    help='If provided, logs will be stored in the directory.')
parser.add_argument("--log-interval", default=100, type=int,
                    help='Number of batches between information is logged.')
parser.add_argument("--model", type=str, default='AWDLSTM',
                    help='type of recurrent net (LSTM, AWDLSTM, OLDLSTM, EAWDLSTM)')
parser.add_argument("--encoder-size", type=int, default=350)
parser.add_argument("--dataset-portion", type=float, default=1,
                    help="If provided, this is the proportion of the training "
                    "set to be used in training.")
parser.add_argument('--alpha', type=float, default=2,
                    help='alpha L2 regularization on RNN activation (alpha = 0 means no regularization)')
parser.add_argument('--beta', type=float, default=1,
                    help='beta slowness regularization applied on RNN activiation (beta = 0 means no regularization)')
parser.add_argument('--wdecay', type=float, default=1.2e-6,
                    help='weight decay applied to all weights')
parser.add_argument("--tied", action = 'store_true', help="weight tying or not")
# Accepts the `-f <file>` argument a notebook kernel is started with.
parser.add_argument("-f", "--file", required=False)

if __name__ == "__main__":
    # Training entry point: build the corpus and the model selected with
    # --model, then train for --epochs epochs, validating and checkpointing
    # after every epoch.
    args = parser.parse_args()
    if args.log_dir:
        logger = Logger(args.log_dir)
        logger.log_description(args)
    else:
        logger = None
    # if available use a GPU.
    if torch.cuda.is_available():
        device = torch.device('cuda')
        torch.set_default_tensor_type('torch.cuda.FloatTensor')
        print("cuda")
    else:
        device = torch.device('cpu')
        torch.set_default_tensor_type('torch.FloatTensor')
        print("cpu")

    if args.load:
        # Reuse a previously stored vocabulary and embedding vectors.
        with open(args.load,'rb') as f:
            stored_dict = pickle.load(f)
        corpora = Corpus(args.corpus,load=True,vocab=stored_dict['vocabulary'],
                   vectors=stored_dict['vectors'], portion=args.dataset_portion)
    else:
        # Load the pre-trained embeddings
        from gensim.models import KeyedVectors
        embeddings = KeyedVectors.load_word2vec_format(args.embedding_path,
                                                        binary=True)
        # Load the corpora, find the vocabulary and what is in the embeddings.
        corpora = Corpus(args.corpus, embeddings)
        # Don't need the embeddings any longer. corpora has a copy of the relevant
        # vectors.
        del embeddings

    if args.model == 'OLDLSTM':
        model = Old_model('LSTM', len(corpora.vocab), args.encoder_size,
                    args.hidden_size, args.layers, args.dropout)
        print("old")
    elif args.model == 'LSTM':
        encoder = Encoder(50, len(corpora.vocab), corpora.vectors)
        model = RNNModel(encoder.encoding_size, args.hidden_size,
                    len(corpora.vocab), args.layers, encoder, dropout=args.dropout)
        print("enhanced")
    elif args.model == 'AWDLSTM':
        model = AWDRNNModel('LSTM', len(corpora.vocab), args.encoder_size, args.hidden_size, args.layers,
                            args.dropout, args.dropouth, args.dropouti, args.dropoute, args.wdrop, args.tied)
        print("AWDLSTM")
    elif args.model == 'EAWDLSTM' :
        encoder = Encoder(50, len(corpora.vocab), corpora.vectors)
        model = EAWDRNNModel('LSTM', len(corpora.vocab), encoder,
                            args.hidden_size, args.layers, args.dropout, args.dropouth, args.dropouti, args.dropoute, args.wdrop, args.tied)
        print("EAWDLSTM")
    else:
        # Fail early with a clear message instead of a NameError on `model`.
        raise ValueError('Unknown --model {!r}; expected one of LSTM, AWDLSTM, '
                         'OLDLSTM, EAWDLSTM.'.format(args.model))
    lr = args.lr
    weight_decay = args.wdecay
    optimizer = torch.optim.SGD(model.parameters(), lr=lr, weight_decay=weight_decay)
    is_awd = args.model in ('AWDLSTM', 'EAWDLSTM')
    if is_awd:
        splits = []
        if len(corpora.vocab) > 500000:
            # One Billion
            # This produces fairly even matrix mults for the buckets:
            # 0: 11723136, 1: 10854630, 2: 11270961, 3: 11219422
            splits = [4200, 35000, 180000]
        elif len(corpora.vocab) > 75000:
            # WikiText-103
            splits = [2800, 20000, 76000]
        print('Using', splits)
        criterion = SplitCrossEntropyLoss(args.encoder_size, splits=splits, verbose=False)
        # NOTE(review): batch size (80) and bptt (70) are fixed here, so
        # --batch-size / --seq-len do not affect the AWD models. alpha and
        # beta now come from the command line (their defaults are 2 and 1).
        trainer = AWDTrainer(model, corpora, criterion, optimizer, device, logger,
                 80, 70,
                 args.alpha, args.beta, args.log_interval, args.clip_grad)
    else:
        criterion = torch.nn.CrossEntropyLoss()
        trainer = Trainer(model, corpora, criterion, device, logger,
                  args.batch_size, args.seq_len, args.lr, args.log_interval,
                  args.clip_grad)
    best_valid_loss = float("inf")
    for epoch in range(args.epochs):
        print('Time at the start of epoch {} is {}'.format(epoch,datetime.now()))
        trainer.train()
        if is_awd:
            valid_loss = awdevaluate(model,corpora, criterion, device)
        else :
            valid_loss = evaluate(model,corpora, criterion, device)
        print('Validation loss: {:.2f}. Perplexity: {:.2f}'.format(valid_loss,
              math.exp(valid_loss)))
        if args.log_dir:
            logger.log_valid(epoch, valid_loss)
        # The checkpoint is written from a CPU copy of the parameters, then
        # the model is moved back to the training device.
        save_checkpoint(model.to(torch.device('cpu')), args.checkpoint,
                        valid_loss, args)
        model = model.to(device)

        # Anneal the learning rate if the validation loss hasn't improved.
        if (valid_loss - best_valid_loss) < -0.01:
            best_valid_loss = valid_loss
        elif is_awd:
            optimizer.param_groups[0]['lr'] /= 8.0
        else:
            trainer.learning_rate /= 4.0

cuda
Applying weight drop of 0.5 to weight_hh_l0
Applying weight drop of 0.5 to weight_hh_l0
AWD  [WeightDrop(
  (module): LSTM(350, 350)
), WeightDrop(
  (module): LSTM(350, 350)
)]
AWDLSTM
Using []
Time at the start of epoch 0 is 2021-08-23 10:58:10.611101


  self.dropout, self.training, self.bidirectional, self.batch_first)


| epoch   0 |   100/  165 batches | lr 20.00000 | ms/batch 52.91 | loss  7.27 | ppl  1433.29 | bpc   10.485
Validation loss: 6.42. Perplexity: 614.08
checkpoint saved to drive/MyDrive/chekpoints/checkpoints.pkl
Time at the start of epoch 1 is 2021-08-23 10:58:20.471175
| epoch   1 |   100/  165 batches | lr 20.00000 | ms/batch 53.21 | loss  6.50 | ppl   668.29 | bpc    9.384
Validation loss: 6.13. Perplexity: 457.70
checkpoint saved to drive/MyDrive/chekpoints/checkpoints.pkl
Time at the start of epoch 2 is 2021-08-23 10:58:30.363732
| epoch   2 |   100/  165 batches | lr 20.00000 | ms/batch 53.08 | loss  6.25 | ppl   517.16 | bpc    9.014
Validation loss: 5.93. Perplexity: 377.77
checkpoint saved to drive/MyDrive/chekpoints/checkpoints.pkl
Time at the start of epoch 3 is 2021-08-23 10:58:40.195290
| epoch   3 |   100/  165 batches | lr 20.00000 | ms/batch 53.06 | loss  6.07 | ppl   433.85 | bpc    8.761
Validation loss: 5.83. Perplexity: 340.81
checkpoint saved to drive/MyDrive/chekpo

evaluate number agreement

In [None]:
# Test number agreeement task on a pre-trained model.

# Command-line options of the number agreement evaluation.
parser = argparse.ArgumentParser()

parser.add_argument('--checkpoint', type=str, default='drive/MyDrive/chekpoints/checkpoints.pkl',
        help='path to the checkpoint.')
parser.add_argument('--gold-file', type=str, default='drive/MyDrive/num_agr/subj_agr_filtered.gold',
        help='path to file containing context size, right target and wrong target.')
parser.add_argument('--text-file', type=str, default='drive/MyDrive/num_agr/subj_agr_filtered.text',
        help='path to file containing the sentences.')
# When set, the gold file is parsed with `nonce_gold` instead of as a plain
# four-column table.
parser.add_argument('--nonce', action='store_true',
        help='if provided, indicates the gold file is in the nonce dataset format.')
# Accepts the `-f <file>` argument a notebook kernel is started with.
parser.add_argument("-f", "--file", required=False)

class Word2idx():
    """Callable that maps a word to its entry in `vocab`.

    Words that are not in the vocabulary map to the entry of '<unk>'.
    """

    def __init__(self, vocab):
        self.vocab = vocab

    def __call__(self, word):
        key = word if word in self.vocab else '<unk>'
        return self.vocab[key]

def tokenize(path, word2idx):
    """Convert a whitespace-tokenized text file into a flat tensor of ids.

    Every line of the file at `path` is split on whitespace and each word is
    mapped through the callable `word2idx`.

    Returns (ids, lengths): `ids` is a 1-D int64 tensor with the ids of all
    lines concatenated, `lengths` is the list with the number of words of
    each line. An empty file yields an empty tensor and an empty list.
    Raises FileNotFoundError when `path` does not exist.
    """
    # Explicit check instead of `assert`, which is stripped under `python -O`.
    if not os.path.exists(path):
        raise FileNotFoundError('No such file: {}'.format(path))
    idss = []
    lengths = []
    with open(path, 'r', encoding="utf8") as f:
        for line in f:
            words = line.split()
            lengths.append(len(words))
            idss.append(torch.tensor([word2idx(word) for word in words],
                                     dtype=torch.int64))
    if not idss:
        # torch.cat raises on an empty list of tensors.
        return torch.empty(0, dtype=torch.int64), lengths
    return torch.cat(idss), lengths

def batchify1(data, bsz):
    """Arrange a 1-D tensor into `bsz` columns of consecutive elements.

    Trailing elements that do not fill a complete row are dropped. The
    result has ``data.size(0) // bsz`` rows and `bsz` columns; column j holds
    the j-th contiguous slice of the trimmed data.
    """
    # Number of full rows available once the data is split into bsz parts.
    rows = data.size(0) // bsz
    # Drop the remainder that would not cleanly fit.
    trimmed = data.narrow(0, 0, rows * bsz)
    # One contiguous slice per column.
    return trimmed.view(bsz, -1).t().contiguous()

def result_(row, logits, word2idx):
    """Score one number agreement example.

    `row` provides 'idx' (position of the target), 'right' and 'wrong' (the
    two candidate words). The scores at ``logits[row['idx'], 0]`` are
    compared for both candidates.

    Returns 1 when the right word scores strictly higher than the wrong
    one, 0 otherwise. The row, the selected scores and both word ids are
    printed.
    """
    print(row)
    target_scores = logits[row['idx'],0]
    right_id = word2idx(row['right'])
    wrong_id = word2idx(row['wrong'])
    print(target_scores)
    print(right_id)
    print(wrong_id)
    return 1 if target_scores[right_id] > target_scores[wrong_id] else 0

def nonce_gold(path):
    """Build the gold table from a tab-separated nonce dataset file.

    Rows whose 'class' is 'correct' provide the columns 'context' (from
    'len_prefix') and 'right' (from 'form'); rows whose 'class' is 'wrong'
    provide 'wrong' (from 'form') and 'attractors' (from 'n_attr'). Both
    selections are re-indexed from 0 so they line up by position.
    """
    table = pd.read_csv(path, delimiter='\t')
    is_correct = table['class'] == 'correct'
    is_wrong = table['class'] == 'wrong'
    correct_rows = table.loc[is_correct, ['len_prefix', 'form']].reset_index(drop=True)
    wrong_rows = table.loc[is_wrong, ['form', 'n_attr']].reset_index(drop=True)
    gold_df = pd.DataFrame()
    gold_df[['context','right']] = correct_rows
    gold_df[['wrong','attractors']] = wrong_rows
    return gold_df

def main(arguments):
    """Evaluate a stored model on the number agreement task.

    `arguments` must provide `checkpoint`, `text_file`, `gold_file` and
    `nonce`. The model and corpora are rebuilt from the checkpoint, the
    sentences are run through the model as one long sequence with batch
    size 1, and every gold row is scored with `result_`.

    Returns (results, valid_loss): the per-example results of `result_` and
    the validation loss stored in the checkpoint.
    """
    # Get the data we need from the checkpoint
    model, corpora = load_model_corpora(arguments.checkpoint)
    # load the sentences
    word2idx = Word2idx(corpora.vocab)
    sentences, lengths = tokenize(arguments.text_file, word2idx)
    # Offset of the first token of every sentence within the flat sequence.
    lengths = np.cumsum([0] + lengths[:-1])
    if arguments.nonce:
        gold = nonce_gold(arguments.gold_file)
    else:
        # load the number agreement data, which should be tab sepparated.
        gold = pd.read_csv(arguments.gold_file, delimiter='\t',
                    names=['context','right','wrong','attractors'])
    # Get the location of the target verbs.
    gold['idx'] = gold['context'] + lengths
    # Get the predictions.
    model.eval()
    sentences = batchify1(sentences, 1)
    hidden = model.init_hidden(1)
    inputs, _ = get_batch(sentences, 0, len(sentences))
    # Inference only: do not build an autograd graph over the whole corpus.
    with torch.no_grad():
        output, hidden = model(inputs, hidden)
    # NOTE(review): `result_` indexes `output` as [position, 0][word id];
    # confirm the model's output has that layout for every model type.
    results = gold.apply(lambda x: result_(x, output, word2idx), axis=1)
    checkpoint = load_checkpoint(arguments.checkpoint)
    return results, checkpoint['valid_loss']

if __name__ == "__main__":
    # Run the evaluation and print: number of correct predictions, number
    # of examples, and the resulting accuracy.
    args = parser.parse_args()
    results, _ = main(args)
    n_correct, n_total = sum(results), len(results)
    print(n_correct, n_total, n_correct/n_total)

Applying weight drop of 0.5 to weight_hh_l0
Applying weight drop of 0.5 to weight_hh_l0
AWD  [WeightDrop(
  (module): LSTM(350, 350)
), WeightDrop(
  (module): LSTM(350, 350)
)]
context        11
right          is
wrong         are
attractors      0
idx            11
Name: 0, dtype: object
tensor(0.0070, grad_fn=<SelectBackward>)
3964
1894


IndexError: ignored

training curves

In [None]:
# Plot training curves of different training instances saved in a directory.
# Also serves as a small library of useful functions for my plots.

# Keys of a run's description file that `descriptions` copies over.
description_keys = {'dropout','hidden_size','clip_grad',
                    'dataset_portion', 'old_model'}

# Command-line options of the training-curve plotting script.
parser = argparse.ArgumentParser()
parser.add_argument('--directory', type=str, default= "drive/MyDrive/logs/", help='directory of description file')
parser.add_argument('--logs', type=str, nargs='+')
parser.add_argument('--description', type=str,
                    default='description.txt', help='filename of description '
                    'file if one is required.')
parser.add_argument('--description-only', action='store_true')
parser.add_argument('--plot-name', type=str, default='plot.png')
# Accepts the `-f <file>` argument a notebook kernel is started with.
parser.add_argument("-f", "--file", required=False)

def plot_curves(df_dict, fname='plot.png', x_axis='time'):
    """Plot the 'perplexity' column of every dataframe and save the figure.

    `df_dict` maps a curve label to a dataframe; column `x_axis` is used for
    the horizontal axis. The figure is saved to `fname`.
    """
    y_axis = 'perplexity'
    for label in df_dict:
        frame = df_dict[label]
        plt.plot(frame[x_axis], frame[y_axis], label = label)
    plt.legend()
    plt.xlabel(x_axis)
    plt.ylabel(y_axis)
    plt.savefig(fname)

def relative_time(df):
    """ Shift the 'time' column so the row labelled 0 is at time zero and
        convert it to minutes (the column is divided by 60). The dataframe is
        modified in place; everything but its first row is returned. """
    start = df.time[0]
    df.time = (df.time - start)/60
    return df[1:]

def load_dfs(dirs, names, file='valid.csv'):
    """Read one CSV per (directory, name) pair and return them keyed by name.

    For every pair the file ``os.path.join(dir, name)`` is read and passed
    through `relative_time`.
    NOTE(review): the `file` parameter is not used by this function.
    """
    assert len(dirs) == len(names)
    return {
        name: relative_time(pd.read_csv(os.path.join(folder, name)))
        for name, folder in zip(names, dirs)
    }

def check_dir(path, test_file='valid.csv'):
    """Return True when `path` is a directory that directly contains `test_file`."""
    return os.path.isdir(path) and test_file in os.listdir(path)

def get_dirs(directory, names=None, test_file='valid.csv'):
    """Keep the names that `check_dir` accepts for `directory`.

    When `names` is empty or None, the entries of `directory` are used.
    Each name is passed to `check_dir(directory, name)`; for every accepted
    name `directory` is appended to the returned list of directories.

    Returns (dirs, names): two lists of equal length. The caller's `names`
    list is left untouched.

    NOTE(review): `test_file` is accepted but not used, and every returned
    directory is `directory` itself - confirm whether sub-directories
    ``directory/name`` containing `test_file` were intended.
    """
    if not names:
        names = os.listdir(directory)
    dirs = []
    kept = []
    for name in names:
        if check_dir(directory, name):
            dirs.append(directory)
            kept.append(name)
    return dirs, kept

def descriptions(dirs, file_to='description.txt', filename='description.txt'):
    """Collect selected description lines of several runs into one file.

    For every directory in `dirs`, the directory name is written to
    `file_to`, followed by the lines of ``dir/filename`` whose text before
    the first ':' is in `description_keys`, followed by an empty line.
    """
    with open(file_to, 'w') as fout:
        for folder in dirs:
            source_path = os.path.join(folder, filename)
            fout.write(folder + '\n')
            with open(source_path, 'r') as fin:
                fout.writelines(line for line in fin
                                if line.split(':')[0] in description_keys)
            fout.write('\n')

def get_attributes(dirs, names, attributes, filename='description.txt'):
    """Read selected ``key: value`` entries from each run's description file.

    For every (dir, name) pair the file ``dir/filename`` is scanned line by
    line. Lines are split on ':'; when the first field is in `attributes`,
    the stripped second field is stored under that key.

    Returns a dict mapping each name to the dict of attributes found.
    """
    collected = {}
    for folder, name in zip(dirs, names):
        description_path = os.path.join(folder, filename)
        found = collected[name] = {}
        with open(description_path, 'r') as fin:
            for raw_line in fin:
                fields = raw_line.split(':')
                if fields[0] in attributes:
                    found[fields[0]] = fields[1].strip()

    return collected

if __name__=='__main__':
    # Select the logs to use, optionally summarize their descriptions and
    # plot the perplexity curves against the epoch.
    args = parser.parse_args()

    directory = args.directory
    dirs, names = get_dirs(directory, args.logs) if args.logs else get_dirs(directory)
    if args.description:
        descriptions(dirs, file_to=args.description)
    if not args.description_only:
        df_dict = load_dfs(dirs, names)
        plot_curves(df_dict, args.plot_name, 'epoch')

AttributeError: ignored

Plot Number agreement

In [None]:
# Plot the perplexity of models against their performance on Linzen's
# number agreement task. Different markers for old and new model.
# Use a larger font for every plot made from here on.
plt.rcParams.update({'font.size': 13})  

# Locations that are scanned for results: `dir_new` is plotted as the
# enhanced model and `dir_old` as the baseline model.
dir_new = 'train_data'
dir_old = 'train_data/normal'

# Only the list of directories is needed; the names are discarded.
dirs_new, _ = get_dirs(dir_new)
dirs_old, _ = get_dirs(dir_old)

def load_obj(dir, filename='num_agr_result.pkl'):
    """Unpickle and return the object stored in ``dir/filename``."""
    with open(os.path.join(dir, filename), 'rb') as handle:
        return pickle.load(handle)

def proportion(x):
    """Return the mean of the values in `x` (``sum(x) / len(x)``)."""
    return sum(x)/len(x)

# Load the stored result dictionary of every run.
dicts_new = [load_obj(dir) for dir in dirs_new]
dicts_old = [load_obj(dir) for dir in dirs_old]

# Perplexity stored with each run (x axis).
ppl_new = [d['perplexity'] for d in dicts_new]
ppl_old = [d['perplexity'] for d in dicts_old]

# Mean of the stored per-example results of each run (y axis).
num_agr_new = [proportion(d['results']) for d in dicts_new]
num_agr_old = [proportion(d['results']) for d in dicts_old]


# One marker per run: red crosses for the runs in dir_new, blue circles for
# the runs in dir_old.
plt.plot(ppl_new, num_agr_new, 'rx', label='enhanced model')
plt.plot(ppl_old, num_agr_old, 'bo', label='baseline model')
plt.xlabel('validation perplexity')
plt.ylabel('number agreement accuracy')
plt.legend()
plt.savefig('num_agr.png')

Plot Script

In [None]:
# Plot the final validation perplexity of old and new models against the
# portion of the dataset used for training.

# Keys read from each run's description file.
attributes = ['old_model','dataset_portion']
# Directory that is scanned for the runs to plot.
dir = 'train_data/half'

plt.rcParams.update({'font.size': 13})

parser = argparse.ArgumentParser()
parser.add_argument('--measure', default='perplexity', choices=['num_agr','perplexity'])
parser.add_argument('--complete-baseline', type=str)
parser.add_argument('--complete-enhanced', type=str)
# Accept the `-f <file>` argument a notebook kernel is started with, like
# the other parsers in this file; without it parse_args() rejects it.
parser.add_argument("-f", "--file", required=False)
args = parser.parse_args()

def load_obj(dir, filename='num_agr_result.pkl'):
    """Return the unpickled content of the file `filename` inside `dir`."""
    pickle_path = os.path.join(dir, filename)
    with open(pickle_path, 'rb') as stream:
        return pickle.load(stream)

def proportion(x):
    """Return ``sum(x) / len(x)``, i.e. the mean of the values in `x`."""
    return sum(x)/len(x)

def accuracies(dirs, names, filename='num_agr_result.pkl'):
    """Return, for every directory, the mean of its stored 'results'.

    The results are read from the pickle file `filename` inside each
    directory. `names` is accepted for symmetry with the other helpers but
    is not used.
    """
    # Honor `filename` instead of always reading load_obj's default file.
    results = [load_obj(dir, filename)['results'] for dir in dirs]
    return [proportion(result) for result in results]

def perplexities(dirs, names, filename='num_agr_result.pkl'):
    """Return, for every directory, the 'perplexity' stored in its result file.

    The value is read from the pickle file `filename` inside each
    directory. `names` is accepted for symmetry with the other helpers but
    is not used.
    """
    # Honor `filename` instead of always reading load_obj's default file.
    return [load_obj(dir, filename)['perplexity'] for dir in dirs]

# Collect the runs found in `dir` and the value to plot for each of them.
dirs, names = get_dirs(dir)
if args.measure == 'num_agr':
    values = accuracies(dirs, names)
else:
    values = perplexities(dirs, names)

# Optionally load the runs given with --complete-baseline and
# --complete-enhanced; both options must be given together.
# NOTE(review): perplexities are loaded here regardless of --measure;
# confirm that is intended when the measure is 'num_agr'.
if args.complete_baseline:
    assert not args.complete_enhanced is None
    complete_b = np.array(perplexities(*get_dirs(args.complete_baseline)))
    complete_e = np.array(perplexities(*get_dirs(args.complete_enhanced)))
    complete = True
else:
    complete = False

# Read the dataset portion and the model kind from each run's description.
attribs = get_attributes(dirs, names, attributes)
portion_values = [attribs[name]['dataset_portion'] for name in names]

# A run is "old" when its description says `old_model: True`.
old = [attribs[name]['old_model'] == 'True' for name in names]
new = [not val for val in old]

# Separate the (value, portion) pairs into old and new
new_pairs = [(a,b) for a,b,c in zip(values, portion_values, new) if c]
old_pairs = [(a,b) for a,b,c in zip(values, portion_values, old) if c]

# The complete runs are added as one averaged point labelled '1.0'.
if complete:
    new_pairs.append((complete_e.mean(), '1.0'))
    old_pairs.append((complete_b.mean(), '1.0'))

# Sort the pairs based on the portion used in training.
# NOTE(review): portions are strings here, so the order is lexicographic.
new_pairs = sorted(new_pairs, key = lambda x: x[1])
old_pairs = sorted(old_pairs, key = lambda x: x[1])

def unzip(array, i):
    """Return the list with the i-th item of every element of `array`."""
    picked = []
    for entry in array:
        picked.append(entry[i])
    return picked

# The x tick labels are the portions of the new-model runs.
portions = unzip(new_pairs, 1)

new_accs = unzip(new_pairs, 0)
old_accs = unzip(old_pairs, 0)

fname = 'dataset_size.png'

# Label the y axis with the measure that is actually plotted.
if args.measure == 'num_agr':
    y_label = 'number agreement accuracy'
else:
    y_label = 'validation perplexity'

# Make the plot.
xs = range(len(portions))
plt.plot(xs,old_accs,'o--b', label='baseline model')
plt.plot(xs,new_accs,'o--r', label='enhanced model')
plt.ylabel(y_label)
plt.xlabel('portion of dataset used in training')
plt.xticks(xs, portions)
plt.legend()
plt.savefig(fname)

Analysis


In [None]:


def nonce_gold(path):
    """Build the gold table, including the sentence type, from a nonce file.

    The tab-separated file at `path` is read. Rows whose 'class' is
    'correct' provide 'context' (from 'len_prefix'), 'right' (from 'form')
    and 'type'; rows whose 'class' is 'wrong' provide 'wrong' (from 'form')
    and 'attractors' (from 'n_attr'). Both selections are re-indexed from 0
    so they line up by position.
    """
    table = pd.read_csv(path, delimiter='\t')
    correct_mask = table['class'] == 'correct'
    wrong_mask = table['class'] == 'wrong'
    correct_part = table.loc[correct_mask, ['len_prefix', 'form', 'type']].reset_index(drop=True)
    wrong_part = table.loc[wrong_mask, ['form', 'n_attr']].reset_index(drop=True)
    gold_df = pd.DataFrame()
    gold_df[['context','right','type']] = correct_part
    gold_df[['wrong','attractors']] = wrong_part
    return gold_df

def load_results(dir, filename='num_agr_result.pkl'):
    """ Return the 'results' entry of the dictionary pickled in the file
        `filename` inside the directory `dir`.
    """
    with open(os.path.join(dir, filename), 'rb') as handle:
        stored = pickle.load(handle)
    return stored['results']


def results_dataframe(dir, gold_path = 'num_agr/subj_agr_filtered.gold',
                            file_path = 'num_agr_result.pkl', nonce=False):
    """ Build a dataframe describing every test sentence of the number
        agreement task, with one extra column per stored model.

        The runs are located with `get_dirs(dir, test_file=file_path)`. The
        sentence information comes from `gold_path` (parsed with
        `nonce_gold` when `nonce` is set, otherwise read as a tab-separated
        table with columns context, right, wrong, attractors). For each
        run, the results loaded from its `file_path` are stored in a column
        named after the run.

        Returns (dataframe, names).
    """
    dirs, names = get_dirs(dir, test_file=file_path)
    # Information about the sentences.
    if nonce:
        gold = nonce_gold(gold_path)
    else:
        gold = pd.read_csv(gold_path, delimiter='\t',
                    names=['context','right','wrong','attractors'])
    # One column of results per trained instance.
    for name, folder in zip(names, dirs):
        gold[name] = load_results(folder, file_path)
    return gold, names

def num_attractors_df(gold_df, names):
    """ Summarise each result column per number of attractors.

        names = list of result column names (one per model).
        gold_df = dataframe containing those columns and 'attractors'.
        Returns a dataframe indexed by 'attractors' whose columns are a
        two-level index (name, statistic) with the statistics 'sum', 'count'
        and 'proportion' (= sum / count).
    """
    grouped = gold_df[names + ['attractors']].groupby('attractors')
    # Put the statistic on the outer level so 'sum' and 'count' can be
    # selected as whole sub-frames.
    stats = grouped.aggregate(['sum', 'count']).swaplevel(axis=1)
    proportion_keys = [('proportion', name) for name in names]
    # Proportion of correct answers for each number of attractors.
    stats[proportion_keys] = stats['sum'] / stats['count']
    return stats.swaplevel(axis=1)

def proportion(x):
    """ Return sum(x) / len(x), i.e. the fraction of truthy/one entries when x
        holds booleans or 0/1 values.

        Raises ZeroDivisionError when x is empty.
    """
    return sum(x) / len(x)

def performance_df(gold_df, names):
    """ Return a one-row dataframe whose columns are names and whose values
        are proportion(gold_df[name]) for each name.
    """
    overall = [[proportion(gold_df[model]) for model in names]]
    return pd.DataFrame(data=overall, columns=names)

def get_stats(df, column='proportion', swaplevels=True):
    """ Return the row-wise mean and standard deviation, as numpy arrays, of
        the sub-frame selected by column (assumes multiindex columns).

        When swaplevels is true the two column levels are swapped first, so
        that column is looked up on what was originally the inner level.
    """
    frame = df.swaplevel(axis=1) if swaplevels else df
    selected = frame[column]
    row_means = selected.mean(axis=1)
    row_stds = selected.std(axis=1)
    return row_means.to_numpy(), row_stds.to_numpy()

def plot_num_attractors(attractors_new, attractors_old,
                        plot_name='acc_attractors.png'):
    """ Draw, for both input dataframes, the mean performance (times 100) with
        its standard deviation as error bars against the dataframe index
        (the number of attractors), and save the figure to plot_name.

        Both dataframes must have the same index. Note that the global
        matplotlib font size is set to 13 as a side effect. Returns the axes
        object holding the plot.
    """
    plt.rcParams.update({'font.size': 13})
    # The shared index provides the x positions; both inputs must agree on it.
    xs = list(attractors_new.index)
    assert list(attractors_old.index) == xs
    # Gather everything to draw before creating the figure.
    curves = [
        (get_stats(attractors_new), 'o-r', 'enhanced model'),
        (get_stats(attractors_old), 'o-b', 'baseline model'),
    ]
    fig, ax = plt.subplots()
    for (means, stds), line_format, label in curves:
        ax.errorbar(xs, means*100, fmt=line_format, yerr=stds*100,
                    label=label, capsize = 5)
    ax.set_xticks(xs)
    ax.set_xlabel('number of attractors')
    ax.set_ylabel('accuracy')
    ax.grid(axis='y')
    ax.legend()
    plt.savefig(plot_name)
    return ax

def make_plot(old_dir, new_dir, gold_path = 'num_agr/subj_agr_filtered.gold',
             file_path = 'num_agr_result.pkl', nonce=False):
    """ Load the stored results found under new_dir and old_dir, summarise
        them per number of attractors and plot both summaries with
        plot_num_attractors. Returns the axes returned by that function.
    """
    new_results, new_names = results_dataframe(new_dir, gold_path, file_path, nonce)
    old_results, old_names = results_dataframe(old_dir, gold_path, file_path, nonce)

    new_by_attractors = num_attractors_df(new_results, new_names)
    old_by_attractors = num_attractors_df(old_results, old_names)

    return plot_num_attractors(new_by_attractors, old_by_attractors)

In [None]:
# Colab-only cell: mount Google Drive under /content/gdrive
# (presumably so files stored on Drive can be accessed — confirm).
from google.colab import drive

drive.mount('/content/gdrive')