In [None]:
import torch
from torch import LongTensor
from torch.nn import Embedding, LSTM
from torch.autograd import Variable
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import torch.nn as nn
import torch.nn.functional as F

In [None]:
# Decoder
#credit: https://gist.githubusercontent.com/HarshTrivedi/f4e7293e941b17d19058f6fb90ab0fec/raw/60dc6be30ba57aa5d0d036e6af8ff702782ded18/pad_packed_demo.py
## We want to run an LSTM on a batch of 3 word sequences of different lengths (see `seqs` below)
#
#     Step 1: Construct Vocabulary
#     Step 2: Load indexed data (list of instances, where each instance is list of word indices)
#     Step 3: Make Model
#  *  Step 4: Pad instances with 0s till max length sequence
#  *  Step 5: Sort instances by sequence length in descending order
#  *  Step 6: Embed the instances
#  *  Step 7: Call pack_padded_sequence with embedded instances and sequence lengths
#  *  Step 8: Forward with LSTM
#  *  Step 9: Call pad_packed_sequence to unpack if required / or just pick last hidden vector
#  *  Summary of Shape Transformations

In [None]:
# Toy batch: three whitespace-separated word sequences of unequal length.
seqs = [
    'a small sentence',            # 3 words
    'little bit bigger sentence',  # 4 words
    'nothing to say',              # 3 words
]
# Extra sample string; not referenced by the other cells visible here.
test = 'new word'

In [None]:
## Step 1: Construct Vocabulary ##
##------------------------------##
# The special tokens are listed first so that <pad> is guaranteed to sit at
# index 0; the distinct corpus words follow in sorted order.
vocab = ['<pad>', '<start>', '<end>', '<unk>']
vocab.extend(sorted({word for seq in seqs for word in seq.split()}))

In [None]:
# Inspect the vocabulary: four special tokens first ('<pad>' at index 0), then the sorted corpus words.
vocab

['<pad>',
 '<start>',
 '<end>',
 '<unk>',
 'a',
 'bigger',
 'bit',
 'little',
 'nothing',
 'say',
 'sentence',
 'small',
 'to']

In [None]:
## Step 2: Load indexed data (list of instances, where each instance is a list of word indices) ##
##------------------------------------------------------------------------------------------------##
# Build the word -> index table once instead of calling vocab.index() for
# every token (a linear scan that raises ValueError on unseen words).
# setdefault keeps the first position of a word, matching vocab.index().
word_to_idx = {}
for position, word in enumerate(vocab):
    word_to_idx.setdefault(word, position)

# Words missing from the vocabulary fall back to the <unk> index.
unk_idx = word_to_idx['<unk>']
vectorized_seqs = [[word_to_idx.get(tok, unk_idx) for tok in seq.split()] for seq in seqs]
vectorized_seqs

[[4, 11, 10], [7, 6, 5, 10], [8, 12, 9]]

In [None]:
## Step 3: Make Model ##
##--------------------##
# Seed the RNG before any weights are created so that the randomly
# initialised parameters (and every value displayed further down) are
# reproducible under Restart Kernel -> Run All.
torch.manual_seed(0)

embed = Embedding(len(vocab), 4)  # embedding_dim = 4
# input_size is read from the embedding layer so the two cannot drift apart.
lstm = LSTM(input_size=embed.embedding_dim, hidden_size=5, num_layers=1, batch_first=True)  # hidden_dim = 5

In [None]:
## Step 4: Pad instances with 0s till max length sequence ##
##--------------------------------------------------------##

# Record how many tokens each sequence of the batch contains (int64 tensor).
seq_lengths = torch.tensor([len(seq) for seq in vectorized_seqs], dtype=torch.long)

In [None]:
# Token count of each sequence, still in the original batch order.
seq_lengths

tensor([3, 4, 3])

In [None]:
# seq_lengths => [3, 4, 3]
# batch_sum_seq_len: 3 + 4 + 3 = 10
# max_seq_len: 4

In [None]:
# Allocate the (batch_size, max_seq_len) index matrix, pre-filled with 0 (the <pad> index).
# It is created directly as int64: the deprecated Variable wrapper and the
# float -> long round-trip are unnecessary, and the maximum length is passed
# as a plain int rather than a 0-dim tensor.
seq_tensor = torch.zeros(len(vectorized_seqs), int(seq_lengths.max()), dtype=torch.long)
seq_tensor

tensor([[0, 0, 0, 0],
        [0, 0, 0, 0],
        [0, 0, 0, 0]])

In [None]:
# Copy each sequence into the left part of its row; whatever lies beyond the
# sequence length keeps the 0 it was initialised with (the <pad> index).
for row, tokens in enumerate(vectorized_seqs):
    seq_tensor[row, :len(tokens)] = LongTensor(tokens)

In [None]:
# Padded batch: each row holds one sequence's word indices, right-padded with 0 (<pad>).
seq_tensor

tensor([[ 4, 11, 10,  0],
        [ 7,  6,  5, 10],
        [ 8, 12,  9,  0]])

In [None]:
## Step 5: Sort instances by sequence length in descending order ##
##---------------------------------------------------------------##

# perm_idx records which original row ends up at each sorted position;
# the same permutation is then applied to the rows of the padded batch.
seq_lengths, perm_idx = torch.sort(seq_lengths, dim=0, descending=True)
seq_tensor = seq_tensor.index_select(0, perm_idx)
seq_tensor

tensor([[ 7,  6,  5, 10],
        [ 4, 11, 10,  0],
        [ 8, 12,  9,  0]])

In [None]:
## Step 6: Embed the instances ##
##-----------------------------##

# Look up an embedding vector for every index, padding positions included:
# (batch_size, max_seq_len) -> (batch_size, max_seq_len, embedding_dim).
embedded_seq_tensor = embed(seq_tensor)
embedded_seq_tensor

tensor([[[ 0.7504,  0.1101,  0.6357, -0.3962],
         [ 0.5860,  0.1990, -1.1089, -0.9944],
         [-0.7872, -0.0387, -0.3513,  1.4797],
         [ 0.4854, -0.6458, -0.0287,  1.6919]],

        [[-1.5781, -0.4140,  0.6682,  1.3491],
         [-1.1741, -0.0588,  0.0357, -1.2057],
         [ 0.4854, -0.6458, -0.0287,  1.6919],
         [-1.2099, -1.0768, -0.6857,  0.6421]],

        [[-0.0508,  1.1241, -0.3398, -1.4403],
         [ 0.2659, -0.3261, -0.1594,  1.0797],
         [ 0.0780, -0.0936,  0.0746, -1.2973],
         [-1.2099, -1.0768, -0.6857,  0.6421]]], grad_fn=<EmbeddingBackward>)

In [None]:
# (batch_size, max_seq_len, embedding_dim) = (3, 4, 4)
embedded_seq_tensor.size()

torch.Size([3, 4, 4])

In [None]:
## Step 7: Call pack_padded_sequence with embedded instances and sequence lengths ##
##--------------------------------------------------------------------------------##

# The rows were sorted by descending length in Step 5; the lengths are passed
# as a CPU-side array alongside the batch-first embedded tensor.
packed_input = pack_padded_sequence(
    embedded_seq_tensor,
    lengths=seq_lengths.cpu().numpy(),
    batch_first=True,
)
# packed_input is a PackedSequence; the fields used in this notebook are `data` and `batch_sizes`.
packed_input.data

tensor([[ 0.7504,  0.1101,  0.6357, -0.3962],
        [-1.5781, -0.4140,  0.6682,  1.3491],
        [-0.0508,  1.1241, -0.3398, -1.4403],
        [ 0.5860,  0.1990, -1.1089, -0.9944],
        [-1.1741, -0.0588,  0.0357, -1.2057],
        [ 0.2659, -0.3261, -0.1594,  1.0797],
        [-0.7872, -0.0387, -0.3513,  1.4797],
        [ 0.4854, -0.6458, -0.0287,  1.6919],
        [ 0.0780, -0.0936,  0.0746, -1.2973],
        [ 0.4854, -0.6458, -0.0287,  1.6919]],
       grad_fn=<PackPaddedSequenceBackward>)

In [None]:
# Packing drops the padded steps: one row per real token, one column per embedding feature.
packed_input.data.shape  # (batch_sum_seq_len X embedding_dim) = (10 X 4)

torch.Size([10, 4])

In [None]:
# Number of sequences that still have a token at each time step
# (3, 3, 3, 1 for sequence lengths 4, 3, 3).
packed_input.batch_sizes

tensor([3, 3, 3, 1])

In [None]:
# visualization :
# little  bit    bigger    sentence
# a       small  sentence 
# nothing to     say
# 3  3  3  1

In [None]:
# tensor([[-0.3227, -0.1044, -0.4612, -0.8055], #little
#         [ 0.6384,  0.5617,  0.6570,  1.0578], #a
#         [-0.7129,  0.3673,  0.0192, -0.4796], #nothing

#         [-0.6661, -1.5316,  0.6446, -1.3370], #bit
#         [-0.2879,  2.3274,  0.8726,  1.0885], #small
#         [-0.1367, -0.2717, -0.2533, -1.3797], #to

#         [-0.4653, -0.4362,  0.7046, -0.8728], #bigger
#         [-0.3567, -0.0277,  1.1684,  0.8097], #sentence
#         [ 0.9794, -0.4929, -1.6183, -0.6653], #say

#         [-0.3567, -0.0277,  1.1684,  0.8097]]) #sentence

In [None]:
## Step 8: Forward with LSTM ##
##---------------------------##

# The LSTM accepts the PackedSequence directly and returns its per-step
# outputs as another PackedSequence, plus the final hidden and cell states.
packed_output, final_states = lstm(packed_input)
ht, ct = final_states
# packed_output exposes the same `data` / `batch_sizes` fields as packed_input.

In [None]:
# ## Step 9: Call pad_packed_sequence to unpack if required / or just pick last hidden vector ##
# ##------------------------------------------------------------------------------------##

# # unpack your output if required
# output, input_sizes = pad_packed_sequence(packed_output, batch_first=True)

# # output.shape : ( batch_size X max_seq_len X hidden_dim) = (3 X 4 X 5)

# # Or if you just want the final hidden state?
# print(ht[-1])

In [None]:
## Summary of Shape Transformations ##
##----------------------------------##

# (batch_size X max_seq_len X embedding_dim) --> Sort by seqlen ---> (batch_size X max_seq_len X embedding_dim)
# (batch_size X max_seq_len X embedding_dim) --->      Pack     ---> (batch_sum_seq_len X embedding_dim)
# (batch_sum_seq_len X embedding_dim)        --->      LSTM     ---> (batch_sum_seq_len X hidden_dim)
# (batch_sum_seq_len X hidden_dim)           --->    UnPack     ---> (batch_size X max_seq_len X hidden_dim)

In [None]:
################ calculate loss ##############
# there are two ways to calculate losses
# using CrossEntropyLoss() = logSoftmax + NLLLoss()
# using NLLLoss()

In [None]:
# First way: CrossEntropyLoss is fed raw (unnormalised) scores and combines
# log-softmax with NLLLoss internally.
criterion = nn.CrossEntropyLoss()

In [None]:
# For the sake of the tutorial, reuse the (sorted, padded) input word indices
# as the targets, i.e. the model is asked to reproduce its own input tokens.
targets = seq_tensor

In [None]:
# Still the padded (batch_size, max_seq_len) index matrix at this point.
targets

tensor([[ 7,  6,  5, 10],
        [ 4, 11, 10,  0],
        [ 8, 12,  9,  0]])

In [None]:
# Pack the targets with the same lengths as the inputs, so that the padded
# positions are dropped and targets.data has one entry per real token.
targets = pack_padded_sequence(
    targets,
    lengths=seq_lengths.cpu().numpy(),
    batch_first=True,
)

In [None]:
# Now a PackedSequence: `data` is the flat list of the 10 real target indices,
# ordered time step by time step, with the same batch_sizes as packed_input.
targets

PackedSequence(data=tensor([ 7,  4,  8,  6, 11, 12,  5, 10,  9, 10]), batch_sizes=tensor([3, 3, 3, 1]), sorted_indices=None, unsorted_indices=None)

In [None]:
# Output projection: hidden_size -> vocab_size.
# The input width is read from the LSTM instead of repeating the literal 5,
# so changing the LSTM's hidden size cannot silently break this layer.
linear = nn.Linear(lstm.hidden_size, len(vocab))

In [None]:
# Project every packed hidden state to vocabulary scores:
# (batch_sum_seq_len, hidden_dim) -> (batch_sum_seq_len, vocab_size).
outputs = linear(packed_output.data)

In [None]:
# (batch_sum_seq_len, vocab_size) = (10, 13)
outputs.size()

torch.Size([10, 13])

In [None]:
# Way 1: raw scores go straight into CrossEntropyLoss; targets.data holds one
# class index per packed row of `outputs`.
loss = criterion(outputs, targets.data) 
loss

tensor(2.5527, grad_fn=<NllLossBackward>)

In [None]:
# Way 2: NLLLoss expects log-probabilities, so log_softmax is applied
# explicitly and the result is scored with criterion_2 (the NLLLoss instance),
# not with the CrossEntropyLoss `criterion`, which would normalise a second time.
# The value matches the CrossEntropyLoss result computed on the raw scores.
criterion_2 = nn.NLLLoss()
loss = criterion_2(F.log_softmax(outputs, dim=1), targets.data)
loss

tensor(2.5527, grad_fn=<NllLossBackward>)

In [None]:
########### Generation #################
# When generating, you would normally produce one word at a time, but for the tutorial's sake we reuse the outputs computed above
# to discuss the main difference between the two approaches.

In [None]:
# Deterministic (greedy): pick the highest-scoring vocabulary index for each row.
# argmax returns the indices directly, which avoids binding the throwaway name
# `_` -- in IPython/Jupyter that name normally holds the last cell output.
predicted = outputs.argmax(dim=1)

In [None]:
# One predicted vocabulary index per packed time step.
predicted

tensor([4, 4, 4, 3, 4, 4, 4, 4, 4, 4])

In [None]:
# Stochastic: sample from the temperature-scaled softmax distribution.
# Dividing the scores by `temperature` before the softmax flattens (> 1) or
# sharpens (< 1) the distribution; 1 leaves it unchanged.
temperature = 1
# Normalise over the last (vocabulary) axis. No squeeze is applied: it was a
# no-op for this (10, 13) tensor and, combined with dim=1, would fail for a
# batch holding a single row.
probabilities = F.softmax(outputs / temperature, dim=-1)

In [None]:
# Each row is a probability distribution over the 13 vocabulary entries.
probabilities

tensor([[0.0556, 0.0627, 0.0468, 0.0957, 0.1069, 0.0886, 0.0839, 0.0812, 0.0944,
         0.0692, 0.0480, 0.0744, 0.0925],
        [0.0512, 0.0638, 0.0502, 0.0884, 0.1183, 0.0891, 0.0934, 0.0711, 0.0910,
         0.0690, 0.0461, 0.0717, 0.0965],
        [0.0514, 0.0653, 0.0462, 0.0967, 0.1059, 0.0994, 0.0814, 0.0704, 0.1006,
         0.0757, 0.0417, 0.0768, 0.0886],
        [0.0575, 0.0613, 0.0431, 0.1086, 0.0987, 0.0893, 0.0787, 0.0858, 0.0953,
         0.0700, 0.0507, 0.0765, 0.0844],
        [0.0467, 0.0676, 0.0498, 0.0846, 0.1228, 0.1004, 0.0882, 0.0590, 0.0955,
         0.0756, 0.0396, 0.0738, 0.0964],
        [0.0506, 0.0602, 0.0439, 0.0945, 0.1044, 0.0948, 0.0914, 0.0766, 0.0966,
         0.0700, 0.0450, 0.0769, 0.0951],
        [0.0495, 0.0598, 0.0446, 0.0953, 0.1056, 0.0944, 0.0961, 0.0747, 0.0954,
         0.0697, 0.0451, 0.0759, 0.0939],
        [0.0474, 0.0583, 0.0430, 0.0898, 0.1115, 0.0942, 0.1003, 0.0722, 0.0915,
         0.0676, 0.0465, 0.0761, 0.1015],
        [0.0520,

In [None]:
# Draw one vocabulary index per row according to its probabilities.
# detach() replaces the legacy `.data` access for taking the values out of the
# autograd graph; the result has shape (batch_sum_seq_len, 1).
predicted = torch.multinomial(probabilities.detach(), num_samples=1)
predicted

tensor([[6],
        [1],
        [9],
        [4],
        [8],
        [8],
        [3],
        [2],
        [1],
        [6]])

In [None]:
# Sample again from the same distribution: unlike the greedy argmax, each draw
# can return different indices. detach() replaces the legacy `.data` access.
predicted = torch.multinomial(probabilities.detach(), num_samples=1)
predicted

tensor([[ 4],
        [ 8],
        [ 6],
        [ 3],
        [ 4],
        [ 3],
        [ 1],
        [10],
        [ 5],
        [ 6]])