In [2]:
import torch
from torch import LongTensor
from torch.nn import Embedding, LSTM
from torch.autograd import Variable
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import torch.nn as nn
import torch.nn.functional as F

In [3]:
# Decoder
#credit: https://gist.githubusercontent.com/HarshTrivedi/f4e7293e941b17d19058f6fb90ab0fec/raw/60dc6be30ba57aa5d0d036e6af8ff702782ded18/pad_packed_demo.py
## We want to run LSTM on a batch of 3 character sequences ['long_str', 'tiny', 'medium']
#
#     Step 1: Construct Vocabulary
#     Step 2: Load indexed data (list of instances, where each instance is list of word indices)
#     Step 3: Make Model
#  *  Step 4: Pad instances with 0s till max length sequence
#  *  Step 5: Sort instances by sequence length in descending order
#  *  Step 6: Embed the instances
#  *  Step 7: Call pack_padded_sequence with embeded instances and sequence lengths
#  *  Step 8: Forward with LSTM
#  *  Step 9: Call unpack_padded_sequences if required / or just pick last hidden vector
#  *  Summary of Shape Transformations

In [4]:
# We want to run LSTM on a batch following 3 character sequences
seqs = ['a small sentence',  # len = 3
        'little bit bigger sentence',      # len = 4
        'nothing to say']    # len = 3
test = 'new word'

In [5]:
## Step 1: Construct Vocabulary ##
##------------------------------##
# make sure <pad> idx is 0
vocab = ['<pad>', '<start>', '<end>', '<unk>'] + sorted(set([word for seq in seqs for word in seq.split()]))

In [6]:
vocab

['<pad>',
 '<start>',
 '<end>',
 '<unk>',
 'a',
 'bigger',
 'bit',
 'little',
 'nothing',
 'say',
 'sentence',
 'small',
 'to']

In [7]:
## Step 2: Load indexed data (list of instances, where each instance is list of character indices) ##
##-------------------------------------------------------------------------------------------------##
vectorized_seqs = [[vocab.index(tok) for tok in seq.split()]for seq in seqs]
vectorized_seqs

[[4, 11, 10], [7, 6, 5, 10], [8, 12, 9]]

In [8]:
## Step 3: Make Model ##
##--------------------##
embed = Embedding(len(vocab), 4) # embedding_dim = 4
lstm = LSTM(input_size=4, hidden_size=5, num_layers = 1, batch_first=True) # input_dim = 4, hidden_dim = 5

In [9]:
## Step 4: Pad instances with 0s till max length sequence ##
##--------------------------------------------------------##

# get the length of each seq in your batch
seq_lengths = LongTensor(list(map(len, vectorized_seqs)))

In [10]:
seq_lengths

tensor([3, 4, 3])

In [11]:
# seq_lengths => [3, 4, 3]
# batch_sum_seq_len: 3 + 4 + 3 = 10
# max_seq_len: 4

In [56]:
seq_tensor = Variable(torch.zeros((len(vectorized_seqs), seq_lengths.max()))).long()
seq_tensor

tensor([[0, 0, 0, 0],
        [0, 0, 0, 0],
        [0, 0, 0, 0]])

In [57]:
for idx, (seq, seqlen) in enumerate(zip(vectorized_seqs, seq_lengths)):
    seq_tensor[idx, :seqlen] = LongTensor(seq)

RuntimeError: The expanded size of the tensor (4) must match the existing size (3) at non-singleton dimension 0.  Target sizes: [4].  Tensor sizes: [3]

In [58]:
seq_tensor

tensor([[0, 0, 0, 0],
        [0, 0, 0, 0],
        [0, 0, 0, 0]])

In [59]:
## Step 5: Sort instances by sequence length in descending order ##
##---------------------------------------------------------------##

seq_lengths, perm_idx = seq_lengths.sort(0, descending=True)
seq_tensor = seq_tensor[perm_idx]
seq_tensor

tensor([[0, 0, 0, 0],
        [0, 0, 0, 0],
        [0, 0, 0, 0]])

In [60]:
## Step 6: Embed the instances ##
##-----------------------------##

embedded_seq_tensor = embed(seq_tensor)
embedded_seq_tensor

tensor([[[ 0.9113,  1.0259, -1.4606, -1.1177],
         [ 0.9113,  1.0259, -1.4606, -1.1177],
         [ 0.9113,  1.0259, -1.4606, -1.1177],
         [ 0.9113,  1.0259, -1.4606, -1.1177]],

        [[ 0.9113,  1.0259, -1.4606, -1.1177],
         [ 0.9113,  1.0259, -1.4606, -1.1177],
         [ 0.9113,  1.0259, -1.4606, -1.1177],
         [ 0.9113,  1.0259, -1.4606, -1.1177]],

        [[ 0.9113,  1.0259, -1.4606, -1.1177],
         [ 0.9113,  1.0259, -1.4606, -1.1177],
         [ 0.9113,  1.0259, -1.4606, -1.1177],
         [ 0.9113,  1.0259, -1.4606, -1.1177]]], grad_fn=<EmbeddingBackward>)

In [61]:
embedded_seq_tensor.size()

torch.Size([3, 4, 4])

In [62]:
## Step 7: Call pack_padded_sequence with embeded instances and sequence lengths ##
##-------------------------------------------------------------------------------##

packed_input = pack_padded_sequence(embedded_seq_tensor, seq_lengths.cpu().numpy(), batch_first=True)
# packed_input (PackedSequence is NamedTuple with 2 attributes: data and batch_sizes
packed_input.data

tensor([[ 0.9113,  1.0259, -1.4606, -1.1177],
        [ 0.9113,  1.0259, -1.4606, -1.1177],
        [ 0.9113,  1.0259, -1.4606, -1.1177],
        [ 0.9113,  1.0259, -1.4606, -1.1177],
        [ 0.9113,  1.0259, -1.4606, -1.1177],
        [ 0.9113,  1.0259, -1.4606, -1.1177],
        [ 0.9113,  1.0259, -1.4606, -1.1177],
        [ 0.9113,  1.0259, -1.4606, -1.1177],
        [ 0.9113,  1.0259, -1.4606, -1.1177],
        [ 0.9113,  1.0259, -1.4606, -1.1177]],
       grad_fn=<PackPaddedSequenceBackward>)

In [63]:
packed_input.data.shape #(batch_wise_sum_seq_len X embedding_dim) = (10 X 4)

torch.Size([10, 4])

In [64]:
packed_input.batch_sizes

tensor([3, 3, 3, 1])

In [65]:
# visualization :
# little  bit    longer    sentence
# a       small  sentence 
# nothing to     say
# 3  3  3  1

In [66]:
# tensor([[-0.3227, -0.1044, -0.4612, -0.8055], #little
#         [ 0.6384,  0.5617,  0.6570,  1.0578], #a
#         [-0.7129,  0.3673,  0.0192, -0.4796], #nothing

#         [-0.6661, -1.5316,  0.6446, -1.3370], #bit
#         [-0.2879,  2.3274,  0.8726,  1.0885], #small
#         [-0.1367, -0.2717, -0.2533, -1.3797], #to

#         [-0.4653, -0.4362,  0.7046, -0.8728], #bigger
#         [-0.3567, -0.0277,  1.1684,  0.8097], #sentence
#         [ 0.9794, -0.4929, -1.6183, -0.6653], #say

#         [-0.3567, -0.0277,  1.1684,  0.8097]]) #sentence

In [67]:
## Step 8: Forward with LSTM ##
##---------------------------##

packed_output, (ht, ct) = lstm(packed_input)
# packed_output (PackedSequence is NamedTuple with 2 attributes: data and batch_sizes

In [68]:
# ## Step 9: Call unpack_padded_sequences if required / or just pick last hidden vector ##
# ##------------------------------------------------------------------------------------##

# # unpack your output if required
# output, input_sizes = pad_packed_sequence(packed_output, batch_first=True)

# # output.shape : ( batch_size X max_seq_len X hidden_dim) = (3 X 4 X 5)

# # Or if you just want the final hidden state?
# print(ht[-1])

In [69]:
## Summary of Shape Transformations ##
##----------------------------------##

# (batch_size X max_seq_len X embedding_dim) --> Sort by seqlen ---> (batch_size X max_seq_len X embedding_dim)
# (batch_size X max_seq_len X embedding_dim) --->      Pack     ---> (batch_sum_seq_len X embedding_dim)
# (batch_sum_seq_len X embedding_dim)        --->      LSTM     ---> (batch_sum_seq_len X hidden_dim)
# (batch_sum_seq_len X hidden_dim)           --->    UnPack     ---> (batch_size X max_seq_len X hidden_dim)

In [70]:
################ calculate loss ##############
# there are two ways to calculate losses
# using CrossEntropyLoss() = logSoftmax + NLLLoss()
# using NLLLoss()

In [71]:
criterion = nn.CrossEntropyLoss()

In [72]:
#lets assume for the sake of tutorial that targets = packed_input
targets = seq_tensor

In [73]:
targets

tensor([[0, 0, 0, 0],
        [0, 0, 0, 0],
        [0, 0, 0, 0]])

In [74]:
targets = pack_padded_sequence(targets, seq_lengths.cpu().numpy(), batch_first=True)

In [75]:
targets

PackedSequence(data=tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), batch_sizes=tensor([3, 3, 3, 1]), sorted_indices=None, unsorted_indices=None)

In [76]:
linear = nn.Linear(5, len(vocab)) #hidden_size, vocab_size
print(len(vocab))

13


In [77]:
outputs = linear(packed_output.data)
print(packed_output.data)
print(outputs)

tensor([[-0.0998,  0.1086,  0.1034, -0.0476, -0.0935],
        [-0.0998,  0.1086,  0.1034, -0.0476, -0.0935],
        [-0.0998,  0.1086,  0.1034, -0.0476, -0.0935],
        [-0.1242,  0.1706,  0.1814, -0.0461, -0.1279],
        [-0.1242,  0.1706,  0.1814, -0.0461, -0.1279],
        [-0.1242,  0.1706,  0.1814, -0.0461, -0.1279],
        [-0.1282,  0.2032,  0.2331, -0.0360, -0.1404],
        [-0.1282,  0.2032,  0.2331, -0.0360, -0.1404],
        [-0.1282,  0.2032,  0.2331, -0.0360, -0.1404],
        [-0.1276,  0.2207,  0.2657, -0.0273, -0.1449]], grad_fn=<CatBackward>)
tensor([[ 0.4589, -0.2719, -0.2125,  0.4503,  0.0342,  0.3797, -0.4292, -0.3648,
          0.3602,  0.0482, -0.5181, -0.1793,  0.2977],
        [ 0.4589, -0.2719, -0.2125,  0.4503,  0.0342,  0.3797, -0.4292, -0.3648,
          0.3602,  0.0482, -0.5181, -0.1793,  0.2977],
        [ 0.4589, -0.2719, -0.2125,  0.4503,  0.0342,  0.3797, -0.4292, -0.3648,
          0.3602,  0.0482, -0.5181, -0.1793,  0.2977],
        [ 0.4700, 

In [78]:
outputs.size()

torch.Size([10, 13])

In [49]:
loss = criterion(outputs, targets.data) 
loss

tensor(2.5446, grad_fn=<NllLossBackward>)

In [50]:
criterion_2 = nn.NLLLoss()
loss = criterion(F.log_softmax(outputs, dim=1), targets.data) 
loss

tensor(2.5446, grad_fn=<NllLossBackward>)

In [37]:
########### Generation #################
# For generating, you will want to generate one word at a time, but for tutorial's sake we are reusing the outputs generated above 
# to dicuss the main difference between two approaches.

In [89]:
# Deterministic: get the maximum output from output at each step of generation
_, predicted = outputs.max(1)   

In [90]:
predicted

tensor([0, 0, 0, 3, 3, 3, 3, 3, 3, 3])

In [99]:
# Stochastic: sample from weighted softmax distribution
temperature = 1
probabilities = F.softmax(outputs.div(temperature).squeeze(0).squeeze(0), dim=1) 

In [100]:
probabilities

tensor([[0.1145, 0.0551, 0.0585, 0.1135, 0.0749, 0.1057, 0.0471, 0.0502, 0.1037,
         0.0759, 0.0431, 0.0605, 0.0974],
        [0.1145, 0.0551, 0.0585, 0.1135, 0.0749, 0.1057, 0.0471, 0.0502, 0.1037,
         0.0759, 0.0431, 0.0605, 0.0974],
        [0.1145, 0.0551, 0.0585, 0.1135, 0.0749, 0.1057, 0.0471, 0.0502, 0.1037,
         0.0759, 0.0431, 0.0605, 0.0974],
        [0.1144, 0.0571, 0.0614, 0.1176, 0.0703, 0.1086, 0.0470, 0.0467, 0.1002,
         0.0795, 0.0412, 0.0580, 0.0980],
        [0.1144, 0.0571, 0.0614, 0.1176, 0.0703, 0.1086, 0.0470, 0.0467, 0.1002,
         0.0795, 0.0412, 0.0580, 0.0980],
        [0.1144, 0.0571, 0.0614, 0.1176, 0.0703, 0.1086, 0.0470, 0.0467, 0.1002,
         0.0795, 0.0412, 0.0580, 0.0980],
        [0.1137, 0.0581, 0.0628, 0.1200, 0.0678, 0.1107, 0.0468, 0.0447, 0.0982,
         0.0815, 0.0405, 0.0562, 0.0990],
        [0.1137, 0.0581, 0.0628, 0.1200, 0.0678, 0.1107, 0.0468, 0.0447, 0.0982,
         0.0815, 0.0405, 0.0562, 0.0990],
        [0.1137,

In [102]:
predicted = torch.multinomial(probabilities.data, 1)
predicted

predi

torch.Size([10, 1])

In [98]:
predicted = torch.multinomial(probabilities.data, 1)
predicted

tensor([[10],
        [ 4],
        [ 5],
        [ 4],
        [11],
        [ 5],
        [ 1],
        [10],
        [ 7],
        [ 1]])