# Introduction

In [None]:
import torch
from torch import LongTensor
from torch.nn import Embedding, LSTM
from torch.autograd import Variable
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

In [3]:
## We want to run LSTM on a batch of 3 character sequences ['long_str', 'tiny', 'medium']
#
#     Step 1: Construct Vocabulary
#     Step 2: Load indexed data (list of instances, where each instance is list of character indices)
#     Step 3: Make Model
#  *  Step 4: Pad instances with 0s till max length sequence
#  *  Step 5: Sort instances by sequence length in descending order
#  *  Step 6: Embed the instances
#  *  Step 7: Call pack_padded_sequence with embedded instances and sequence lengths
#  *  Step 8: Forward with LSTM
#  *  Step 9: Call pad_packed_sequence to unpack the output if required / or just pick the last hidden vector
#  *  Summary of Shape Transformations

In [46]:
# Toy batch for the walkthrough: three character sequences of unequal length,
# so that padding and packing actually have something to do.
seqs = [
    'long_str',  # 8 characters
    'tiny',      # 4 characters
    'medium',    # 6 characters
]

# Step 1: Construct Vocabulary

In [47]:
# Vocabulary: every distinct character that occurs in the batch, in sorted
# order, with '<pad>' placed first so that the padding token gets index 0.
vocab = ['<pad>'] + sorted({ch for word in seqs for ch in word})
vocab

['<pad>', '_', 'd', 'e', 'g', 'i', 'l', 'm', 'n', 'o', 'r', 's', 't', 'u', 'y']

# Step 2: Load indexed data

In [48]:
# Turn every sequence into a list of vocabulary indices (one list per instance).
# vectorized_seqs => [[6, 9, 8, 4, 1, 11, 12, 10],   --> 'long_str'
#                     [12, 5, 8, 14],                --> 'tiny'
#                     [7, 3, 2, 5, 13, 7]]           --> 'medium'

# Build the token -> index table once. `vocab.index(tok)` would rescan the
# vocabulary list from the start for every single character. The entries of
# `vocab` are unique, so the table gives exactly the same indices.
char_to_idx = {tok: idx for idx, tok in enumerate(vocab)}
vectorized_seqs = [[char_to_idx[tok] for tok in seq] for seq in seqs]
vectorized_seqs

[[6, 9, 8, 4, 1, 11, 12, 10], [12, 5, 8, 14], [7, 3, 2, 5, 13, 7]]

# Step 3: Make Model

In [9]:
# Embedding table for the characters: one learnable 4-dimensional vector
# per vocabulary entry (the '<pad>' entry included).
embed = Embedding(num_embeddings=len(vocab), embedding_dim=4)
embed

Embedding(15, 4)

In [10]:
# LSTM that consumes the 4-dimensional embeddings and keeps a 5-dimensional
# hidden state. batch_first=True: padded tensors are laid out as
# (batch, seq, feature).
lstm = LSTM(4, 5, batch_first=True)  # positional args: input_size, hidden_size
lstm

LSTM(4, 5, batch_first=True)

# Step 4: Pad instances with 0s till max length sequence

In [55]:
# Length of every sequence in the batch, in the original batch order.
# seq_lengths => [8, 4, 6]
# batch_sum_seq_len: 8 + 4 + 6 = 18   (total number of real tokens)
# max_seq_len: 8                      (every row is padded to this width)
seq_lengths = LongTensor([len(indices) for indices in vectorized_seqs])
seq_lengths

tensor([8, 4, 6])

In [50]:
# Allocate the (batch_size X max_seq_len) index tensor pre-filled with 0,
# i.e. with the '<pad>' index, so positions that are never written to are
# already padding.
# The tensor is created directly as int64: the deprecated `Variable` wrapper
# and the float -> long conversion are not needed.
seq_tensor = torch.zeros(len(vectorized_seqs), int(seq_lengths.max()), dtype=torch.long)
seq_tensor

tensor([[0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0]])

In [51]:
# Copy each index list into the left part of its row; the remaining
# positions keep the 0 (<pad>) they were initialised with.
for row, (token_ids, n_tokens) in enumerate(zip(vectorized_seqs, seq_lengths.tolist())):
    seq_tensor[row, :n_tokens] = torch.tensor(token_ids, dtype=torch.long)

# seq_tensor.shape : (batch_size X max_seq_len) = (3 X 8)
seq_tensor

tensor([[ 6,  9,  8,  4,  1, 11, 12, 10],
        [12,  5,  8, 14,  0,  0,  0,  0],
        [ 7,  3,  2,  5, 13,  7,  0,  0]])

# Step 5: Sort instances by sequence length in descending order

In [52]:
# Reorder the batch from the longest to the shortest sequence.
# perm_idx[i] is the original row of the i-th entry after sorting, and the
# same permutation is applied to the rows of the padded index tensor.
seq_lengths, perm_idx = torch.sort(seq_lengths, dim=0, descending=True)
seq_tensor = seq_tensor.index_select(0, perm_idx)

# seq_tensor => [[ 6  9  8  4  1 11 12 10]           # long_str
#                [ 7  3  2  5 13  7  0  0]           # medium
#                [12  5  8 14  0  0  0  0]]          # tiny
# seq_tensor.shape : (batch_size X max_seq_len) = (3 X 8)
seq_tensor

tensor([[ 6,  9,  8,  4,  1, 11, 12, 10],
        [ 7,  3,  2,  5, 13,  7,  0,  0],
        [12,  5,  8, 14,  0,  0,  0,  0]])

In [54]:
# Show the lengths before and after sorting, plus the permutation applied.
# The "before" lengths are recomputed from the data instead of being
# hard-coded in the label, so the message stays correct for any batch.
# For the example batch the printed text is unchanged.
original_lengths = LongTensor([len(seq) for seq in vectorized_seqs])
print("seq_lengths:", original_lengths, "--> ", seq_lengths)
print("perm_idx", perm_idx)

seq_lengths: tensor([8, 4, 6]) -->  tensor([8, 6, 4])
perm_idx tensor([0, 2, 1])


# Step 6: Embed the instances

In [43]:
# Look up the embedding vector of every index in the padded, sorted batch.
# <pad> positions are embedded too: they all map to the same vector (the row
# of the table for index 0), as the example values below show.
embedded_seq_tensor = embed(seq_tensor)
# Example values from one run (the embedding weights are randomly initialised):
# tensor([[[-6.6082e-01, -1.0030e+00, -2.1691e+00, -9.9223e-01],   --> l
#          [ 1.1729e+00,  3.9620e-02,  6.6538e-02,  8.4358e-01],   --> o
#          [-4.6659e-01, -3.0592e-01,  8.5859e-01,  1.7894e+00],   --> n
#          [ 5.9985e-01, -7.2729e-01,  1.6543e+00, -4.0921e-03],   --> g
#          [ 4.4786e-01,  1.8161e+00, -2.0536e+00, -1.8928e+00],   --> _
#          [ 4.6203e-02, -8.1840e-02,  1.1860e-01, -1.1929e+00],   --> s
#          [ 1.0607e+00, -1.7305e-03,  1.7065e-01,  2.6640e-01],   --> t
#          [ 6.4725e-01,  1.0117e+00, -5.0601e-02,  1.0182e+00]],  --> r

#         [[ 1.0390e+00,  4.2287e-02,  8.3849e-01,  1.8263e-01],   --> m
#          [-1.4074e+00, -8.1444e-01, -2.3716e-01,  2.3193e-01],   --> e
#          [-6.2568e-01,  1.4961e+00, -1.0552e+00, -6.3288e-01],   --> d
#          [-6.9315e-01, -7.1027e-01,  1.2622e+00, -5.8106e-01],   --> i
#          [-1.5714e+00,  5.8442e-01, -1.2303e+00, -5.0362e-01],   --> u
#          [ 1.0390e+00,  4.2287e-02,  8.3849e-01,  1.8263e-01],   --> m
#          [ 5.6696e-02,  6.3566e-01, -7.6484e-01,  2.8574e-01],   --> <pad>
#          [ 5.6696e-02,  6.3566e-01, -7.6484e-01,  2.8574e-01]],  --> <pad>

#         [[ 1.0607e+00, -1.7305e-03,  1.7065e-01,  2.6640e-01],   --> t
#          [-6.9315e-01, -7.1027e-01,  1.2622e+00, -5.8106e-01],   --> i
#          [-4.6659e-01, -3.0592e-01,  8.5859e-01,  1.7894e+00],   --> n
#          [ 5.2095e-01, -1.2459e+00,  1.3618e-01,  2.8436e-02],   --> y
#          [ 5.6696e-02,  6.3566e-01, -7.6484e-01,  2.8574e-01],   --> <pad>
#          [ 5.6696e-02,  6.3566e-01, -7.6484e-01,  2.8574e-01],   --> <pad>
#          [ 5.6696e-02,  6.3566e-01, -7.6484e-01,  2.8574e-01],   --> <pad>
#          [ 5.6696e-02,  6.3566e-01, -7.6484e-01,  2.8574e-01]]], --> <pad>
#        grad_fn=<EmbeddingBackward>)
# embedded_seq_tensor.shape : (batch_size X max_seq_len X embedding_dim) = (3 X 8 X 4)
embedded_seq_tensor

tensor([[[-6.6082e-01, -1.0030e+00, -2.1691e+00, -9.9223e-01],
         [ 1.1729e+00,  3.9620e-02,  6.6538e-02,  8.4358e-01],
         [-4.6659e-01, -3.0592e-01,  8.5859e-01,  1.7894e+00],
         [ 5.9985e-01, -7.2729e-01,  1.6543e+00, -4.0921e-03],
         [ 4.4786e-01,  1.8161e+00, -2.0536e+00, -1.8928e+00],
         [ 4.6203e-02, -8.1840e-02,  1.1860e-01, -1.1929e+00],
         [ 1.0607e+00, -1.7305e-03,  1.7065e-01,  2.6640e-01],
         [ 6.4725e-01,  1.0117e+00, -5.0601e-02,  1.0182e+00]],

        [[ 1.0390e+00,  4.2287e-02,  8.3849e-01,  1.8263e-01],
         [-1.4074e+00, -8.1444e-01, -2.3716e-01,  2.3193e-01],
         [-6.2568e-01,  1.4961e+00, -1.0552e+00, -6.3288e-01],
         [-6.9315e-01, -7.1027e-01,  1.2622e+00, -5.8106e-01],
         [-1.5714e+00,  5.8442e-01, -1.2303e+00, -5.0362e-01],
         [ 1.0390e+00,  4.2287e-02,  8.3849e-01,  1.8263e-01],
         [ 5.6696e-02,  6.3566e-01, -7.6484e-01,  2.8574e-01],
         [ 5.6696e-02,  6.3566e-01, -7.6484e-01,  2.8

# Step 7: Pack the padded embedded instances

In [26]:
# Pack the padded, embedded batch together with its sequence lengths.
# The lengths were sorted in descending order in Step 5; enforce_sorted is
# left at its default here.
# packed_input is a PackedSequence, a named tuple with the fields
# data, batch_sizes, sorted_indices and unsorted_indices.
# The lengths are passed as a CPU tensor, one of the documented argument
# types (Tensor or list of ints), so no NumPy round-trip is needed.
packed_input = pack_padded_sequence(embedded_seq_tensor, seq_lengths.cpu(), batch_first=True)
packed_input

PackedSequence(data=tensor([[-6.6082e-01, -1.0030e+00, -2.1691e+00, -9.9223e-01],
        [ 1.0390e+00,  4.2287e-02,  8.3849e-01,  1.8263e-01],
        [ 1.0607e+00, -1.7305e-03,  1.7065e-01,  2.6640e-01],
        [ 1.1729e+00,  3.9620e-02,  6.6538e-02,  8.4358e-01],
        [-1.4074e+00, -8.1444e-01, -2.3716e-01,  2.3193e-01],
        [-6.9315e-01, -7.1027e-01,  1.2622e+00, -5.8106e-01],
        [-4.6659e-01, -3.0592e-01,  8.5859e-01,  1.7894e+00],
        [-6.2568e-01,  1.4961e+00, -1.0552e+00, -6.3288e-01],
        [-4.6659e-01, -3.0592e-01,  8.5859e-01,  1.7894e+00],
        [ 5.9985e-01, -7.2729e-01,  1.6543e+00, -4.0921e-03],
        [-6.9315e-01, -7.1027e-01,  1.2622e+00, -5.8106e-01],
        [ 5.2095e-01, -1.2459e+00,  1.3618e-01,  2.8436e-02],
        [ 4.4786e-01,  1.8161e+00, -2.0536e+00, -1.8928e+00],
        [-1.5714e+00,  5.8442e-01, -1.2303e+00, -5.0362e-01],
        [ 4.6203e-02, -8.1840e-02,  1.1860e-01, -1.1929e+00],
        [ 1.0390e+00,  4.2287e-02,  8.3849e-01,  1

In [33]:
# PackedSequence(data=tensor([
#         [-6.6082e-01, -1.0030e+00, -2.1691e+00, -9.9223e-01],   --> l
#         [ 1.0390e+00,  4.2287e-02,  8.3849e-01,  1.8263e-01],   --> m
#         [ 1.0607e+00, -1.7305e-03,  1.7065e-01,  2.6640e-01],   --> t

#         [ 1.1729e+00,  3.9620e-02,  6.6538e-02,  8.4358e-01],   --> o
#         [-1.4074e+00, -8.1444e-01, -2.3716e-01,  2.3193e-01],   --> e
#         [-6.9315e-01, -7.1027e-01,  1.2622e+00, -5.8106e-01],   --> i

#         [-4.6659e-01, -3.0592e-01,  8.5859e-01,  1.7894e+00],   --> n
#         [-6.2568e-01,  1.4961e+00, -1.0552e+00, -6.3288e-01],   --> d
#         [-4.6659e-01, -3.0592e-01,  8.5859e-01,  1.7894e+00],   --> n

#         [ 5.9985e-01, -7.2729e-01,  1.6543e+00, -4.0921e-03],   --> g
#         [-6.9315e-01, -7.1027e-01,  1.2622e+00, -5.8106e-01],   --> i
#         [ 5.2095e-01, -1.2459e+00,  1.3618e-01,  2.8436e-02],   --> y

#         [ 4.4786e-01,  1.8161e+00, -2.0536e+00, -1.8928e+00],   --> _
#         [-1.5714e+00,  5.8442e-01, -1.2303e+00, -5.0362e-01],   --> u

#         [ 4.6203e-02, -8.1840e-02,  1.1860e-01, -1.1929e+00],   --> s
#         [ 1.0390e+00,  4.2287e-02,  8.3849e-01,  1.8263e-01],   --> m 

#         [ 1.0607e+00, -1.7305e-03,  1.7065e-01,  2.6640e-01],   --> t

#         [ 6.4725e-01,  1.0117e+00, -5.0601e-02,  1.0182e+00]    --> r
# ],grad_fn=<PackPaddedSequenceBackward>), 
# batch_sizes=tensor([3, 3, 3, 3, 2, 2, 1, 1]), sorted_indices=None, unsorted_indices=None)

# visualization :
# l  o  n  g  _  s  t  r   #(long_str)
# m  e  d  i  u  m         #(medium)
# t  i  n  y               #(tiny)
# 3  3  3  3  2  2  1  1   (sum = 18 [batch_sum_seq_len])

# Step 8: Forward with LSTM

In [37]:
# Run the LSTM over the packed batch. No initial state is supplied (hx=None),
# so the module uses its default initial hidden and cell state.
# packed_output is again a PackedSequence (fields: data, batch_sizes,
# sorted_indices, unsorted_indices); ht and ct are the final hidden state and
# final cell state.
packed_output, (ht, ct) = lstm(packed_input, hx=None)
packed_output

PackedSequence(data=tensor([[ 0.0305, -0.0686, -0.1213,  0.2604,  0.1534],
        [-0.1996,  0.1842,  0.1184, -0.0071,  0.0386],
        [-0.1373,  0.1392,  0.0782,  0.0196,  0.0906],
        [-0.0504,  0.0870, -0.0280,  0.1770,  0.1564],
        [-0.0504,  0.0071,  0.0529,  0.2341,  0.0088],
        [-0.2570,  0.2009,  0.2084,  0.1811, -0.0631],
        [-0.0495,  0.1820,  0.1432,  0.1139,  0.0452],
        [-0.2350, -0.0707, -0.1286,  0.1738,  0.1241],
        [-0.1722,  0.2153,  0.2802,  0.1195, -0.0604],
        [-0.2376,  0.3563,  0.2385,  0.0958, -0.0380],
        [-0.2768,  0.1355,  0.0127,  0.2670, -0.0711],
        [-0.1071,  0.2318,  0.2240,  0.2047,  0.0324],
        [-0.2878, -0.0577, -0.0485,  0.1638,  0.1690],
        [-0.2172, -0.0717, -0.1759,  0.2423,  0.0177],
        [-0.3456,  0.0538, -0.0787,  0.2602,  0.1533],
        [-0.3336,  0.1145, -0.0381,  0.1986,  0.0484],
        [-0.3414,  0.1961,  0.0077,  0.2441,  0.1773],
        [-0.3967,  0.1628,  0.0038,  0.1719, 

In [34]:
# packed_output.data.shape : (batch_sum_seq_len X hidden_dim) = (18 X 5)
# PackedSequence(data=tensor([
#         [ 0.0305, -0.0686, -0.1213,  0.2604,  0.1534],   --> l
#         [-0.1996,  0.1842,  0.1184, -0.0071,  0.0386],   --> m
#         [-0.1373,  0.1392,  0.0782,  0.0196,  0.0906],   --> t

#         [-0.0504,  0.0870, -0.0280,  0.1770,  0.1564],   --> o
#         [-0.0504,  0.0071,  0.0529,  0.2341,  0.0088],   --> e
#         [-0.2570,  0.2009,  0.2084,  0.1811, -0.0631],   --> i

#         [-0.0495,  0.1820,  0.1432,  0.1139,  0.0452],   --> n
#         [-0.2350, -0.0707, -0.1286,  0.1738,  0.1241],   --> d
#         [-0.1722,  0.2153,  0.2802,  0.1195, -0.0604],   --> n

#         [-0.2376,  0.3563,  0.2385,  0.0958, -0.0380],   --> g
#         [-0.2768,  0.1355,  0.0127,  0.2670, -0.0711],   --> i
#         [-0.1071,  0.2318,  0.2240,  0.2047,  0.0324],   --> y

#         [-0.2878, -0.0577, -0.0485,  0.1638,  0.1690],   --> _
#         [-0.2172, -0.0717, -0.1759,  0.2423,  0.0177],   --> u

#         [-0.3456,  0.0538, -0.0787,  0.2602,  0.1533],   --> s
#         [-0.3336,  0.1145, -0.0381,  0.1986,  0.0484],   --> m 

#         [-0.3414,  0.1961,  0.0077,  0.2441,  0.1773],   --> t

#         [-0.3967,  0.1628,  0.0038,  0.1719,  0.2116]   --> r
# ], grad_fn=<CatBackward>), 

# batch_sizes=tensor([3, 3, 3, 3, 2, 2, 1, 1]), sorted_indices=None, unsorted_indices=None)
# visualization :
# l  o  n  g  _  s  t  r   #(long_str)
# m  e  d  i  u  m         #(medium)
# t  i  n  y               #(tiny)
# 3  3  3  3  2  2  1  1   (sum = 18 [batch_sum_seq_len])

In [35]:
# Final hidden state of the LSTM, one row per sequence of the sorted batch
# (long_str, medium, tiny).
# Each row equals the LSTM output at that sequence's last real token, not at a
# <pad> position: compare with the 'r', 'm' and 'y' rows of packed_output.
# tensor([[[-0.3967,  0.1628,  0.0038,  0.1719,  0.2116],    --> r
#          [-0.3336,  0.1145, -0.0381,  0.1986,  0.0484],    --> m
#          [-0.1071,  0.2318,  0.2240,  0.2047,  0.0324]]],  --> y
#        grad_fn=<StackBackward>)
# ht.shape : (1 X batch_size X hidden_dim) = (1 X 3 X 5)
ht

tensor([[[-0.3967,  0.1628,  0.0038,  0.1719,  0.2116],
         [-0.3336,  0.1145, -0.0381,  0.1986,  0.0484],
         [-0.1071,  0.2318,  0.2240,  0.2047,  0.0324]]],
       grad_fn=<StackBackward>)

In [57]:
# Final cell state of the LSTM, one row per sequence of the sorted batch,
# in the same row order as ht (long_str, medium, tiny).
# tensor([[[-0.8016,  0.4717,  0.0093,  0.3356,  0.4591],
#          [-0.6821,  0.2103, -0.0774,  0.3886,  0.1060],
#          [-0.3206,  0.5972,  0.5197,  0.3295,  0.0666]]],
#        grad_fn=<StackBackward>)
# ct.shape, torch.Size([1, 3, 5])
ct

tensor([[[-0.8016,  0.4717,  0.0093,  0.3356,  0.4591],
         [-0.6821,  0.2103, -0.0774,  0.3886,  0.1060],
         [-0.3206,  0.5972,  0.5197,  0.3295,  0.0666]]],
       grad_fn=<StackBackward>)

# Step 9: Unpack the packed output sequence (back to a padded tensor)

In [39]:
# Optional: turn the packed LSTM output back into a regular padded tensor with
# pad_packed_sequence (skip this if the last hidden vector `ht` is all you need).
# Padded positions are filled with padding_value (0.0, the default);
# input_sizes receives the length of every sequence in the batch.
output, input_sizes = pad_packed_sequence(packed_output, batch_first=True, padding_value=0.0)

In [40]:
# Unpacked LSTM output: one row block per sequence of the sorted batch, one
# hidden vector per time step. Positions beyond a sequence's real length
# (<pad>) are all zeros.
# tensor([[[ 0.0305, -0.0686, -0.1213,  0.2604,  0.1534],   --> l
#          [-0.0504,  0.0870, -0.0280,  0.1770,  0.1564],   --> o
#          [-0.0495,  0.1820,  0.1432,  0.1139,  0.0452],   --> n
#          [-0.2376,  0.3563,  0.2385,  0.0958, -0.0380],   --> g
#          [-0.2878, -0.0577, -0.0485,  0.1638,  0.1690],   --> _
#          [-0.3456,  0.0538, -0.0787,  0.2602,  0.1533],   --> s
#          [-0.3414,  0.1961,  0.0077,  0.2441,  0.1773],   --> t
#          [-0.3967,  0.1628,  0.0038,  0.1719,  0.2116]],  --> r

#         [[-0.1996,  0.1842,  0.1184, -0.0071,  0.0386],   --> m
#          [-0.0504,  0.0071,  0.0529,  0.2341,  0.0088],   --> e
#          [-0.2350, -0.0707, -0.1286,  0.1738,  0.1241],   --> d
#          [-0.2768,  0.1355,  0.0127,  0.2670, -0.0711],   --> i
#          [-0.2172, -0.0717, -0.1759,  0.2423,  0.0177],   --> u
#          [-0.3336,  0.1145, -0.0381,  0.1986,  0.0484],   --> m
#          [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000],   --> <pad>
#          [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000]],  --> <pad>

#         [[-0.1373,  0.1392,  0.0782,  0.0196,  0.0906],   --> t
#          [-0.2570,  0.2009,  0.2084,  0.1811, -0.0631],   --> i
#          [-0.1722,  0.2153,  0.2802,  0.1195, -0.0604],   --> n
#          [-0.1071,  0.2318,  0.2240,  0.2047,  0.0324],   --> y
#          [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000],   --> <pad>
#          [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000],   --> <pad>
#          [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000],   --> <pad>
#          [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000]]   --> <pad>
#        ],
#        grad_fn=<TransposeBackward0>)

# output.shape : ( batch_size X max_seq_len X hidden_dim) = (3 X 8 X 5)
output

tensor([[[ 0.0305, -0.0686, -0.1213,  0.2604,  0.1534],
         [-0.0504,  0.0870, -0.0280,  0.1770,  0.1564],
         [-0.0495,  0.1820,  0.1432,  0.1139,  0.0452],
         [-0.2376,  0.3563,  0.2385,  0.0958, -0.0380],
         [-0.2878, -0.0577, -0.0485,  0.1638,  0.1690],
         [-0.3456,  0.0538, -0.0787,  0.2602,  0.1533],
         [-0.3414,  0.1961,  0.0077,  0.2441,  0.1773],
         [-0.3967,  0.1628,  0.0038,  0.1719,  0.2116]],

        [[-0.1996,  0.1842,  0.1184, -0.0071,  0.0386],
         [-0.0504,  0.0071,  0.0529,  0.2341,  0.0088],
         [-0.2350, -0.0707, -0.1286,  0.1738,  0.1241],
         [-0.2768,  0.1355,  0.0127,  0.2670, -0.0711],
         [-0.2172, -0.0717, -0.1759,  0.2423,  0.0177],
         [-0.3336,  0.1145, -0.0381,  0.1986,  0.0484],
         [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000]],

        [[-0.1373,  0.1392,  0.0782,  0.0196,  0.0906],
         [-0.2570,  0.2009,  0.2084,  0.1811

In [41]:
# Sequence lengths returned by pad_packed_sequence, in the sorted batch order
# (long_str, medium, tiny).
input_sizes

tensor([8, 6, 4])

# Summary of Shape Transformations

In [42]:
# (batch_size X max_seq_len)                 --> Sort by seqlen ---> (batch_size X max_seq_len)
# (batch_size X max_seq_len)                 --->     Embed     ---> (batch_size X max_seq_len X embedding_dim)
# (batch_size X max_seq_len X embedding_dim) --->      Pack     ---> (batch_sum_seq_len X embedding_dim)
# (batch_sum_seq_len X embedding_dim)        --->      LSTM     ---> (batch_sum_seq_len X hidden_dim)
# (batch_sum_seq_len X hidden_dim)           --->    UnPack     ---> (batch_size X max_seq_len X hidden_dim)