In [None]:
!git clone https://github.com/leimao/Two_Layer_Hierarchical_Softmax_PyTorch

In [1]:
import argparse
import time
import math
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim

# import data
# import model
# import utils

In [2]:
import torch

In [3]:
# Restore the saved checkpoint from the working directory. Later cells index
# it with the keys 'model', 'encoder' and 'decoder'.
# NOTE(review): torch.load unpickles arbitrary Python objects -- only load a
# 'model.pt' that comes from a trusted source.
model_saved = torch.load('model.pt')



In [4]:
# Pull the three saved objects out of the checkpoint dict.
# `model` is later called as model(emb, hidden) and provides init_hidden().
model = model_saved['model']
# `encoder` is later applied to the raw input batch to produce `emb`.
encoder = model_saved['encoder']
# Stored under 'decoder'; later called as hierarchical_softmax(output_2d, targets).
hierarchical_softmax = model_saved['decoder']

In [15]:
# Attach the encoder and the hierarchical softmax to `model` under the names
# "encoder" and "decoder"; a later cell reads model.decoder.
# NOTE(review): presumably `model` is an nn.Module, in which case model.eval()
# also reaches these two submodules -- confirm.
model.add_module("encoder", encoder)
model.add_module("decoder", hierarchical_softmax)

In [16]:
def batchify(data, bsz):
    """Lay a sequence out as ``bsz`` parallel columns.

    Elements at the end of ``data`` that do not fill a complete row of ``bsz``
    items are dropped. The remaining elements are cut into ``bsz`` equal,
    consecutive chunks, and each chunk becomes one column of the result.

    Args:
        data: tensor indexed along dimension 0 (a 1-D sequence in this
            notebook's usage).
        bsz: number of columns to produce.

    Returns:
        A contiguous tensor; for 1-D input its shape is
        ``(data.size(0) // bsz, bsz)``.
    """
    # Number of complete rows the sequence can fill.
    rows = data.size(0) // bsz
    # Keep only the leading elements that fit those rows exactly.
    usable = data.narrow(0, 0, rows * bsz)
    # One chunk per row here; transposing turns each chunk into a column.
    chunk_major = usable.view(bsz, -1)
    return chunk_major.t().contiguous()

In [17]:
# `data` is a project-local module (its source is not part of this notebook).
import data
# Directory handed to data.Corpus; relative to the notebook's working directory.
path = './data/ptb'
# Later cells read corpus.train, corpus.valid, corpus.test and corpus.dictionary.
corpus = data.Corpus(path)

In [18]:
# Batch width shared by the validation and test splits; evaluate() also passes
# it to model.init_hidden().
eval_batch_size = 10
# The training split uses a batch width of 128.
train_data = batchify(corpus.train, 128)
val_data = batchify(corpus.valid, eval_batch_size)
test_data = batchify(corpus.test, eval_batch_size)

In [21]:
def get_batch(source, i, evaluation=False, bptt=10):
    """Slice one (input, target) window out of ``source`` starting at row ``i``.

    The target is the input shifted by one row, flattened to 1-D, so that
    ``target`` holds the element following each input element.

    Args:
        source: tensor indexed by position along dimension 0
            (assumes the (steps, batch) layout produced by batchify -- confirm).
        i: index of the first row of the window.
        evaluation: forwarded as ``volatile`` to ``Variable``. On PyTorch
            releases where ``volatile`` has been removed the flag has no
            effect; wrap the caller in ``torch.no_grad()`` there instead.
        bptt: maximum number of rows in the window. Defaults to 10, the value
            that was previously hard-coded.

    Returns:
        ``(data, target)``: ``data`` has at most ``bptt`` rows; ``target`` is
        the flattened window shifted one row forward.
    """
    # Shrink the last window so that the shifted target stays inside `source`.
    seq_len = min(bptt, len(source) - 1 - i)
    data = Variable(source[i:i+seq_len], volatile=evaluation)
    target = Variable(source[i+1:i+1+seq_len].view(-1))
    return data, target

def repackage_hidden(h):
    """Wraps hidden states in new Variables, to detach them from their history.

    Args:
        h: a single hidden state, or an arbitrarily nested iterable of hidden
            states (e.g. the ``(h, c)`` pair of an LSTM).

    Returns:
        A detached copy of the structure: a single state for a single input,
        otherwise a tuple mirroring the nesting of ``h``.
    """
    # isinstance (rather than an exact type comparison) also matches plain
    # tensors on PyTorch releases where Tensor and Variable are merged; the
    # exact comparison was False there and the function then recursed into
    # the tensor's elements.
    if isinstance(h, Variable):
        return Variable(h.data)
    return tuple(repackage_hidden(v) for v in h)

def evaluate(data_source):
    """Return the average negative log-likelihood of ``data_source``.

    Walks ``data_source`` in windows of 10 rows, runs encoder -> model ->
    hierarchical softmax on each window, and averages the per-window loss
    weighted by the window length.

    Relies on the notebook-level names ``model``, ``encoder``,
    ``hierarchical_softmax``, ``eval_batch_size``, ``get_batch`` and
    ``repackage_hidden``.

    Args:
        data_source: batched data whose width matches ``eval_batch_size``
            (assumes the layout produced by batchify -- confirm).

    Returns:
        A Python float: sum over windows of ``len(window) * loss`` divided by
        ``len(data_source)``.
    """
    # Turn on evaluation mode which disables dropout.
    # NOTE(review): this only reaches encoder / hierarchical_softmax if they
    # are registered as submodules of `model` -- confirm.
    model.eval()
    # Accumulate as a plain float so the result is well defined even when the
    # loop body never runs.
    total_loss = 0.0
    hidden = model.init_hidden(eval_batch_size)
    # The stride must match the window length used by get_batch (10).
    for i in range(0, data_source.size(0) - 1, 10):
        data, targets = get_batch(data_source, i, evaluation=True)
        emb = encoder(data)
        output, hidden = model(emb, hidden)

        # Flatten (steps, batch, features) to 2-D for the softmax layer.
        probs = hierarchical_softmax(output.view(-1, output.size(2)), targets)

        loss = -torch.mean(torch.log(probs))

        # view(-1)[0] extracts the scalar whether the loss is a 1-element or
        # a 0-dim tensor; float() normalises the result to a Python number.
        total_loss += len(data) * float(loss.data.view(-1)[0])

        # Cut the graph so history does not grow across windows.
        hidden = repackage_hidden(hidden)
    return total_loss / len(data_source)

In [22]:
# Average negative log-likelihood of the loaded model on the validation split.
evaluate(val_data)

4.832055081514371

In [None]:
def SVDSoftmax(A, h, b, W_size, N_size, B, V_t):
    """Approximate softmax using a cheap preview pass plus a top-N refinement.

    Steps:
      1. Project the hidden vector: ``h <- V_t @ h``.
      2. Compute preview logits for every row from only the first ``W_size``
         columns of ``B`` and the first ``W_size`` entries of ``h``.
      3. Pick the ``N_size`` rows with the largest preview logits and
         recompute their logits with all columns.
      4. Return the softmax over the resulting logits.

    Args:
        A: unused; kept so existing call sites keep working.
        h: 2-D hidden vector (assumes a single column, shape (d, 1) -- confirm).
        b: 2-D bias with one row per output (assumes shape (V, 1) -- confirm).
        W_size: number of leading dimensions used for the preview logits.
        N_size: number of rows whose logits are recomputed exactly.
        B: 2-D weight matrix with one row per output.
        V_t: 2-D matrix applied to ``h`` before anything else
            (presumably the transposed right singular vectors -- confirm).

    Returns:
        Tensor with the shape of ``b`` whose entries sum to 1.
    """
    h = torch.mm(V_t, h)
    # Preview logits from the leading W_size dimensions only.
    z = torch.mm(B[:, :W_size], h[:W_size]) + b

    # Row indices of the N_size largest preview logits (first column only).
    top_k_ind = torch.topk(z, k=N_size, dim=0, sorted=False)[1][:, 0]
    # Indexing with an index tensor produces a copy, so writing through
    # `out=z[top_k_ind]` never reached `z`; assign the refined logits back.
    z[top_k_ind] = torch.mm(B[top_k_ind], h) + b[top_k_ind]
    # Subtract the maximum before exponentiating for numerical stability.
    z_exp = torch.exp(z - torch.max(z))
    return z_exp / z_exp.sum()

In [27]:
# Inspect the bottom-layer weight tensor of the loaded decoder.
# NOTE(review): if the decoder uses the same layout as the
# HierarchicalSoftmaxWithSVD class in this notebook, the axes are
# (nclasses, nhid, ntokens_per_class) -- confirm.
model.decoder.layer_bottom_W.shape

torch.Size([100, 150, 100])

In [28]:
# Inspect the top-layer weight matrix of the loaded decoder.
# NOTE(review): if the decoder uses the same layout as the
# HierarchicalSoftmaxWithSVD class in this notebook, the axes are
# (nhid, nclasses) -- confirm.
model.decoder.layer_top_W.shape

torch.Size([150, 100])

In [23]:
class HierarchicalSoftmaxWithSVD(nn.Module):
    """Two-layer hierarchical softmax over a vocabulary split into classes.

    Token ``t`` belongs to class ``t // ntokens_per_class`` and has position
    ``t % ntokens_per_class`` inside that class. The probability of a token is
    the product of the class probability (top layer) and the within-class
    probability (bottom layer).

    Note: no SVD step is implemented in this class yet.

    Args:
        ntokens: vocabulary size.
        nhid: size of the input feature vectors.
        ntokens_per_class: tokens per class; defaults to
            ``ceil(sqrt(ntokens))``.

    Attributes set in ``__init__``:
        nclasses: ``ceil(ntokens / ntokens_per_class)``.
        ntokens_actual: ``nclasses * ntokens_per_class``; may exceed
            ``ntokens`` when the vocabulary does not divide evenly.
    """

    def __init__(self, ntokens, nhid, ntokens_per_class = None):
        # Must name this class; referring to a different class here fails.
        super(HierarchicalSoftmaxWithSVD, self).__init__()

        # Parameters
        self.ntokens = ntokens
        self.nhid = nhid

        if ntokens_per_class is None:
            ntokens_per_class = int(math.ceil(math.sqrt(ntokens)))

        self.ntokens_per_class = ntokens_per_class

        self.nclasses = int(math.ceil(self.ntokens * 1. / self.ntokens_per_class))
        self.ntokens_actual = self.nclasses * self.ntokens_per_class

        # Top layer: one logit per class.
        self.layer_top_W = nn.Parameter(torch.FloatTensor(self.nhid, self.nclasses), requires_grad=True)
        self.layer_top_b = nn.Parameter(torch.FloatTensor(self.nclasses), requires_grad=True)

        # Bottom layer: a separate projection for every class.
        self.layer_bottom_W = nn.Parameter(torch.FloatTensor(self.nclasses, self.nhid, self.ntokens_per_class), requires_grad=True)
        self.layer_bottom_b = nn.Parameter(torch.FloatTensor(self.nclasses, self.ntokens_per_class), requires_grad=True)

        self.softmax = nn.Softmax(dim=1)

        # The tensors above are allocated uninitialised; fill them now.
        self.init_weights()

    def init_weights(self):
        """Draw weights uniformly from [-0.1, 0.1] and zero the biases."""
        initrange = 0.1
        self.layer_top_W.data.uniform_(-initrange, initrange)
        self.layer_top_b.data.fill_(0)
        self.layer_bottom_W.data.uniform_(-initrange, initrange)
        self.layer_bottom_b.data.fill_(0)

    def forward(self, inputs, labels = None):
        """Compute token probabilities.

        Args:
            inputs: 2-D tensor of shape ``(batch, nhid)``.
            labels: optional 1-D integer tensor of ``batch`` token ids.

        Returns:
            With ``labels``: 1-D tensor of length ``batch`` holding the
            probability of each row's label.
            Without ``labels``: tensor of shape ``(batch, ntokens_actual)``
            holding the probability of every token slot, ordered class by
            class.
        """
        # Unpacking also enforces that `inputs` is 2-D.
        batch_size, _ = inputs.size()

        layer_top_logits = torch.matmul(inputs, self.layer_top_W) + self.layer_top_b
        layer_top_probs = self.softmax(layer_top_logits)

        if labels is not None:
            # Floor division keeps the class ids integral so they can be
            # used as indices.
            label_position_top = labels // self.ntokens_per_class
            label_position_bottom = labels % self.ntokens_per_class

            # Gather each row's class-specific projection, then apply it
            # with one batched matrix product.
            layer_bottom_logits = torch.squeeze(torch.bmm(torch.unsqueeze(inputs, dim=1), self.layer_bottom_W[label_position_top]), dim=1) + self.layer_bottom_b[label_position_top]
            layer_bottom_probs = self.softmax(layer_bottom_logits)

            rows = torch.arange(batch_size).long()
            target_probs = layer_top_probs[rows, label_position_top] * layer_bottom_probs[rows, label_position_bottom]

            return target_probs

        # Full distribution: scale every class's within-class distribution
        # by that class's probability. The i:i+1 slice keeps a trailing axis
        # so the class probability broadcasts across the tokens of the class.
        per_class = []
        for i in range(self.nclasses):
            bottom_probs = self.softmax(torch.matmul(inputs, self.layer_bottom_W[i]) + self.layer_bottom_b[i])
            per_class.append(layer_top_probs[:, i:i+1] * bottom_probs)

        # Concatenate once instead of growing the result inside the loop.
        return torch.cat(per_class, dim=1)


HierarchicalSoftmax(
  (softmax): Softmax()
)