In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from dataclasses import dataclass
from pathlib import Path

import torch as t
from torch.utils.data import DataLoader
from data import NamesDataset


In [3]:
DATAROOT = Path.home() / "mldata" / "makemore" 
RUNROOT = Path.home() / "mlruns" / "makemore" / "names"
EOT_TOKEN = "."

In [4]:
CONTEXT_LEN = 3
dataset = NamesDataset(DATAROOT / "names.txt", CONTEXT_LEN)
dataset[1:10]

(tensor([[ 0,  0,  5],
         [ 0,  5, 13],
         [ 5, 13, 13],
         [13, 13,  1],
         [ 0,  0,  0],
         [ 0,  0, 15],
         [ 0, 15, 12],
         [15, 12,  9],
         [12,  9, 22]]),
 tensor([13, 13,  1,  0, 15, 12,  9, 22,  9]))

In [5]:
traindl = DataLoader(dataset, batch_size=32, shuffle=True)

In [6]:
it = iter(traindl)

In [7]:
x, y = next(it)
print(x.shape, y.shape)
print(x[0], y[0])

torch.Size([32, 3]) torch.Size([32])
tensor([ 0,  0, 26]) tensor(8)


### Embedding Layer

Lets define the embedding layer using the nifty Embedding module. The $i^{th}$ row of the module is the embedding for the number $i$. E.g., `emb([5, 13, 9])` will output three rows, the first one will be `E[5]`, the second one will be `E[13]`, and the third one will be `E[9]`.

In [8]:
EMB_DIM = 2

In [9]:
emb = t.nn.Embedding(27, EMB_DIM)

In [10]:
emb(x[0])

tensor([[-1.1141,  0.3676],
        [-1.1141,  0.3676],
        [-0.9831, -0.6679]], grad_fn=<EmbeddingBackward0>)

In [11]:
print(emb.weight.data[x[0][0]])
print(emb.weight.data[x[0][1]])
print(emb.weight.data[x[0][1]])

tensor([-1.1141,  0.3676])
tensor([-1.1141,  0.3676])
tensor([-1.1141,  0.3676])


In [12]:
x1 = emb(x)
x1.shape

torch.Size([32, 3, 2])

In [13]:
x1[:3]

tensor([[[-1.1141,  0.3676],
         [-1.1141,  0.3676],
         [-0.9831, -0.6679]],

        [[-0.3845, -0.1170],
         [-0.1816,  0.9680],
         [-0.8184, -0.0974]],

        [[-1.1141,  0.3676],
         [-1.1141,  0.3676],
         [-1.1141,  0.3676]]], grad_fn=<SliceBackward0>)

### Linear Layer

The embedding layer will convert each row of 3 integers into a matrix of $3 \times 2$, with each row in the output being the vector representation of the corresponding character. We can combine these embedding outputs in different ways, in the lecture these 3 vectors are concatenated into a single vector. I can do this using the `view` method or the `flatten` method. Then each instance becomes a 6-element vector that is transformed by the linear layer into a 100-element vector.

In [14]:
HIDDEN_DIM = 100
fc1 = t.nn.Linear(in_features=CONTEXT_LEN * EMB_DIM, out_features=HIDDEN_DIM)

In [15]:
x2 = x1.flatten(start_dim=1)
x2.shape

torch.Size([32, 6])

In [16]:
x2[:3]

tensor([[-1.1141,  0.3676, -1.1141,  0.3676, -0.9831, -0.6679],
        [-0.3845, -0.1170, -0.1816,  0.9680, -0.8184, -0.0974],
        [-1.1141,  0.3676, -1.1141,  0.3676, -1.1141,  0.3676]],
       grad_fn=<SliceBackward0>)

In [17]:
x2 = x1.view(-1, CONTEXT_LEN * EMB_DIM)
print(x2.shape)
x2[:3]

torch.Size([32, 6])


tensor([[-1.1141,  0.3676, -1.1141,  0.3676, -0.9831, -0.6679],
        [-0.3845, -0.1170, -0.1816,  0.9680, -0.8184, -0.0974],
        [-1.1141,  0.3676, -1.1141,  0.3676, -1.1141,  0.3676]],
       grad_fn=<SliceBackward0>)

In [18]:
x3 = fc1(x2)
x3.shape

torch.Size([32, 100])

Now lets see if this works with a single instance. Note that I did not have add an additional batch-of-1 dimension to the input.

In [19]:
emb(x[0]).view(-1, CONTEXT_LEN * EMB_DIM).shape

torch.Size([1, 6])

In [20]:
fc1(emb(x[0]).view(-1, CONTEXT_LEN * EMB_DIM)).shape

torch.Size([1, 100])

### Non-Linearity
In the lecture non-linearity is given by the `tanh` function. This will squash each row of 100 floats into 100 floats where each element is between -1 and 1.

In [21]:
x4 = t.tanh(x3)
print(x4.shape)
print(t.min(x3), t.max(x3))
print(t.min(x4), t.max(x4))

torch.Size([32, 100])
tensor(-1.8588, grad_fn=<MinBackward1>) tensor(1.6359, grad_fn=<MaxBackward1>)
tensor(-0.9526, grad_fn=<MinBackward1>) tensor(0.9269, grad_fn=<MaxBackward1>)


### Final Layer
This is another linear layer that will output the logits.

In [22]:
fc2 = t.nn.Linear(in_features=HIDDEN_DIM, out_features=dataset.len_vocab())

In [23]:
dataset.len_vocab()

27

In [24]:
logits = fc2(x4)
print(logits.shape)

torch.Size([32, 27])


With a single row -

In [25]:
fc2(t.tanh(fc1(emb(x[0]).view(-1, CONTEXT_LEN * EMB_DIM)))).shape

torch.Size([1, 27])

### Loss
This model can be thought of as a multiclass classification problem where the 4th character is the "label" that the network is trying to predict. The 27 element vector output by the network is the probability distribution of each character. Cross entropy loss can be used here.

In [26]:
loss_fn = t.nn.CrossEntropyLoss()

In [27]:
loss = loss_fn(logits, y)
loss

tensor(3.3244, grad_fn=<NllLossBackward0>)

### Model
Getting everything together we get the following model -

In [28]:
from model import CharLangModel

model = CharLangModel(
    context_len=CONTEXT_LEN, 
    vocab_len=dataset.len_vocab(), 
    hidden_dim=HIDDEN_DIM, 
    emb_dim=EMB_DIM
)

In [29]:
logits = model.forward(x)
logits.shape

torch.Size([32, 27])

In [30]:
loss_fn(logits, y)

tensor(3.4355, grad_fn=<NllLossBackward0>)

In [31]:
y.shape

torch.Size([32])

When predicting from a single row, take care of the dims. A good practice is to send in a single row as a batch-of-1. In this model it is not neccessary. But the output here is a single batch of 1.

In [32]:
print(x[0].shape)
print(x[0].unsqueeze(0).shape)

torch.Size([3])
torch.Size([1, 3])


In [33]:
model.forward(x[0].unsqueeze(0)).shape

torch.Size([1, 27])

In [34]:
model.forward(x[0]).shape

torch.Size([1, 27])