<a href="https://colab.research.google.com/github/bythyag/neural-networks/blob/main/lstm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
import torch.nn as nn

In [2]:
# Two example sentences in their original, untokenised form.
input_raw1, input_raw2 = (
    "Colin was here.",
    "Colin went away subito.",
)

In [3]:
# A token's position in this list is its integer index.
vocabulary = [
    "Colin", "was", "here", "went", "away",  # ordinary words
    "[UNK]", "[EOS]", "[PAD]",               # bracketed marker tokens
]
n_vocab = len(vocabulary)

In [4]:
# Tokenisation: each sentence as a list of five tokens.
# Note that "subito" is not an entry of `vocabulary`.
input_tok1 = "Colin was here [EOS] [PAD]".split()
input_tok2 = "Colin went away subito [EOS]".split()

# Indexing: each token replaced by its position in `vocabulary`, written out
# by hand. "subito" is encoded as 5, the index of "[UNK]".
input_ind1 = torch.tensor([0, 1, 2, 6, 7])  # Colin was  here [EOS] [PAD]
input_ind2 = torch.tensor([0, 3, 4, 5, 6])  # Colin went away [UNK] [EOS]

In [11]:
batch_size, sequence_length = 2, 5

# Variant 1: stack the two 1-D index tensors along a new leading (batch) axis.
batched_input = torch.stack((input_ind1, input_ind2), dim=0)
print(batched_input)

# Variant 2: join them end to end, then fold the flat result into
# (batch_size, sequence_length). This yields the same tensor as variant 1.
batched_input = torch.reshape(
    torch.cat((input_ind1, input_ind2)), (batch_size, sequence_length)
)
print(batched_input)

tensor([[0, 1, 2, 6, 7],
        [0, 3, 4, 5, 6]])
tensor([[0, 1, 2, 6, 7],
        [0, 3, 4, 5, 6]])


In [13]:
# Embedding: a lookup table holding one vector per vocabulary entry.
# Besides the vocabulary size it needs the length of those vectors
# (3 values per word here).

n_embbeding_size = 3
embedding = nn.Embedding(num_embeddings=n_vocab, embedding_dim=n_embbeding_size)

# The table starts out with random values and can be adjusted during training.
for p in embedding.parameters():
    print(p)


Parameter containing:
tensor([[ 0.4922,  0.9187, -0.3402],
        [-1.4696,  0.6587, -0.1652],
        [ 0.0319, -0.7485, -0.5190],
        [ 0.4224,  0.1670,  0.6420],
        [-0.2827,  0.4471,  0.1822],
        [ 0.3597, -0.4901,  1.1842],
        [-0.1385,  0.9766,  1.8635],
        [-0.1711,  1.1492, -1.1834]], requires_grad=True)


In [14]:
embedding(input_ind1)
# "[EOS]" (index 6) is the fourth token of input_ind1, so its embedding is the
# fourth row of the result: [-0.1385,  0.9766,  1.8635] in the run shown below

tensor([[ 0.4922,  0.9187, -0.3402],
        [-1.4696,  0.6587, -0.1652],
        [ 0.0319, -0.7485, -0.5190],
        [-0.1385,  0.9766,  1.8635],
        [-0.1711,  1.1492, -1.1834]], grad_fn=<EmbeddingBackward0>)

In [15]:
# The same embedding layer also accepts the 2-D (batch, sequence) index tensor:
# every index is replaced by its vector, so the result gains a trailing axis.
input_embedding = embedding(input=batched_input)
print(input_embedding)

tensor([[[ 0.4922,  0.9187, -0.3402],
         [-1.4696,  0.6587, -0.1652],
         [ 0.0319, -0.7485, -0.5190],
         [-0.1385,  0.9766,  1.8635],
         [-0.1711,  1.1492, -1.1834]],

        [[ 0.4922,  0.9187, -0.3402],
         [ 0.4224,  0.1670,  0.6420],
         [-0.2827,  0.4471,  0.1822],
         [ 0.3597, -0.4901,  1.1842],
         [-0.1385,  0.9766,  1.8635]]], grad_fn=<EmbeddingBackward0>)


In [24]:
n_hidden_size = 4

# Build the LSTM. The first two positional arguments of nn.LSTM are
# input_size (here the embedding length) and hidden_size.
lstm = nn.LSTM(
    n_embbeding_size,
    n_hidden_size,
    num_layers=2,      # two LSTM layers stacked on top of each other
    batch_first=True,  # tensors are laid out as (batch, sequence, feature)
)

In [25]:
lstm # the repr reports num_layers=2: this is a two-layer (stacked) LSTM, not a single layer

LSTM(3, 4, num_layers=2, batch_first=True)

In [27]:
# Run the whole batch through the LSTM.
# - output: the last layer's hidden state at every position; with
#   batch_first=True it is laid out as (batch, sequence, hidden).
# - hidden / cell: the final hidden and cell state of every layer, indexed as
#   (layer, batch, hidden). The last layer's entry in `hidden` matches the
#   final position of `output`.
output, (hidden, cell) = lstm(input = input_embedding)

print("\nLSTM embeddings in last layer for each word:\n", output)
print("---")
print("\nHidden state of last word for each layer:\n", hidden)
print("---")
print("\nCell state of last word for each layer:\n", cell)
print("---")
# One print only: print() returns None, so wrapping this call in a second
# print() would emit an extra "None" line after the message.
print("The first one is Layer 1, second one is layer 2, there the first row is batch 1, second row is batch 2")


LSTM embeddings in last layer for each word:
 tensor([[[ 0.0837, -0.0807, -0.0312,  0.1637],
         [ 0.1266, -0.1247, -0.0420,  0.2936],
         [ 0.1810, -0.1779, -0.0865,  0.3208],
         [ 0.1973, -0.2049, -0.0803,  0.3576],
         [ 0.2229, -0.2326, -0.0998,  0.3593]],

        [[ 0.0837, -0.0807, -0.0312,  0.1637],
         [ 0.1433, -0.1437, -0.0539,  0.2488],
         [ 0.1817, -0.1860, -0.0675,  0.3049],
         [ 0.2094, -0.2160, -0.0793,  0.3325],
         [ 0.2180, -0.2310, -0.0744,  0.3617]]], grad_fn=<TransposeBackward0>)
---

Hidden state of last word for each layer:
 tensor([[[ 0.1893,  0.1909, -0.0941,  0.1243],
         [ 0.3339,  0.0964,  0.0149,  0.0789]],

        [[ 0.2229, -0.2326, -0.0998,  0.3593],
         [ 0.2180, -0.2310, -0.0744,  0.3617]]], grad_fn=<StackBackward0>)
---

Cell state of last word for each layer:
 tensor([[[ 0.6643,  0.2503, -0.1253,  0.4158],
         [ 0.6021,  0.1408,  0.0358,  0.2091]],

        [[ 0.5662, -0.5077, -0.1852,  0.7

In [30]:
# (batch, sequence, hidden): 2 sentences, 5 tokens each, 4 hidden units
output.shape

torch.Size([2, 5, 4])

In [29]:
# Project every LSTM output vector (one per token position, not only the last
# word) onto a vector with one entry per vocabulary item.
linear_map = nn.Linear(in_features=n_hidden_size, out_features=n_vocab)
print(linear_map)

weights = linear_map(output)
print(weights)

Linear(in_features=4, out_features=8, bias=True)
tensor([[[ 0.4399, -0.1575, -0.0394, -0.0152,  0.1835, -0.3252,  0.3059,
           0.1831],
         [ 0.4268, -0.2458, -0.0229, -0.0852,  0.2092, -0.3864,  0.3046,
           0.1490],
         [ 0.4575, -0.2866, -0.0530, -0.1003,  0.2315, -0.4173,  0.2870,
           0.1638],
         [ 0.4569, -0.3236, -0.0566, -0.1247,  0.2471, -0.4357,  0.2743,
           0.1642],
         [ 0.4751, -0.3400, -0.0763, -0.1271,  0.2587, -0.4462,  0.2617,
           0.1781]],

        [[ 0.4399, -0.1575, -0.0394, -0.0152,  0.1835, -0.3252,  0.3059,
           0.1831],
         [ 0.4528, -0.2381, -0.0547, -0.0651,  0.2160, -0.3749,  0.2855,
           0.1816],
         [ 0.4609, -0.2918, -0.0647, -0.0983,  0.2379, -0.4077,  0.2712,
           0.1810],
         [ 0.4710, -0.3241, -0.0767, -0.1156,  0.2528, -0.4265,  0.2593,
           0.1861],
         [ 0.4675, -0.3487, -0.0752, -0.1336,  0.2619, -0.4399,  0.2536,
           0.1823]]], grad_fn=<AddBackw

In [31]:
# get probabilities from output weights
def next_word_probabilities(weights):
    """Turn unnormalised next-word scores into probabilities.

    Args:
        weights: Tensor of scores whose last axis runs over the vocabulary,
            e.g. (batch, sequence, vocab). Any number of leading axes is
            accepted.

    Returns:
        A NumPy array of the same shape as ``weights``, softmax-normalised
        along the last axis and rounded to 4 decimal places.
    """
    # dim=-1 always selects the last (vocabulary) axis, so un-batched 2-D or
    # 1-D inputs work as well as batched 3-D ones.
    probabilities = torch.softmax(weights, dim=-1)
    # detach() drops the autograd graph and cpu() moves the data to host
    # memory; .numpy() requires both.
    return probabilities.detach().cpu().numpy().round(4)

# Untrained model: the predicted distribution over the next word is still
# spread out across the vocabulary (high entropy).
untrained_probs = next_word_probabilities(weights)
print(untrained_probs)

[[[0.1757 0.0967 0.1088 0.1115 0.136  0.0818 0.1537 0.1359]
  [0.1774 0.0905 0.1131 0.1063 0.1427 0.0787 0.157  0.1344]
  [0.1835 0.0872 0.1101 0.105  0.1463 0.0765 0.1547 0.1368]
  [0.1847 0.0846 0.1105 0.1032 0.1497 0.0756 0.1539 0.1378]
  [0.188  0.0832 0.1083 0.1029 0.1514 0.0748 0.1518 0.1397]]

 [[0.1757 0.0967 0.1088 0.1115 0.136  0.0818 0.1537 0.1359]
  [0.1807 0.0906 0.1088 0.1077 0.1426 0.079  0.1529 0.1378]
  [0.1839 0.0866 0.1087 0.1051 0.1472 0.0772 0.1522 0.139 ]
  [0.1866 0.0843 0.1079 0.1038 0.15   0.0761 0.151  0.1403]
  [0.187  0.0827 0.1087 0.1025 0.1522 0.0755 0.151  0.1406]]]
