# Bigram Model

Here we build a model that predicts the next letter given the character that precedes it.

We train it with very little data and compute, but it still gives us some interesting results.

In [1]:
# Vocabulary: every character the model can read or emit, in a fixed order.
chars = [*"abcdefghijklmnopqrstuvwxyz -_,"]
# character -> integer id (its position in `chars`)
ctoi = dict(zip(chars, range(len(chars))))
# integer id -> character (inverse of `ctoi`)
itoc = dict(enumerate(chars))

In [2]:
def encode_text(text):
    """Convert ``text`` to a list of integer ids, one per character.

    The text is lower-cased first, then every character is looked up in
    ``ctoi``. A character that is missing from ``ctoi`` raises ``KeyError``.
    """
    token_ids = []
    for ch in text.lower():
        token_ids.append(ctoi[ch])
    return token_ids

def decode_text(encoded_text):
    """Convert an iterable of integer ids back into a string.

    Each id is looked up in ``itoc`` and the resulting characters are
    concatenated. An id that is missing from ``itoc`` raises ``KeyError``.
    """
    pieces = []
    for token_id in encoded_text:
        pieces.append(itoc[token_id])
    return "".join(pieces)

In [3]:
import torch
import torch.nn as nn

# Tiny training corpus; encode_text lower-cases it before the lookup.
text = "hello world, this is Bread Modular"

# Length of each training window returned by get_batch.
block_size = 8
# One id per entry of `chars`.
vocab_size = len(chars)

# The whole corpus as a tensor of integer token ids.
data = torch.tensor(
    encode_text(text),
    dtype=torch.int64,
)
data

tensor([ 7,  4, 11, 11, 14, 26, 22, 14, 17, 11,  3, 29, 26, 19,  7,  8, 18, 26,
         8, 18, 26,  1, 17,  4,  0,  3, 26, 12, 14,  3, 20, 11,  0, 17])

In [4]:
def get_batch(d, size=None):
    """Sample one random (input, target) window pair from ``d``.

    Args:
        d: Sequence to sample from. It must support ``len()`` and slicing
            (the notebook passes the encoded corpus tensor).
        size: Window length. Defaults to the module-level ``block_size``.

    Returns:
        A tuple ``(x, y)`` of two length-``size`` slices of ``d``, where
        ``y`` is ``x`` shifted one position to the right, i.e. ``y[k]`` is
        the element that follows ``x[k]`` in ``d``.

    Raises:
        ValueError: If ``d`` has fewer than ``size + 1`` elements, so no
            complete (input, target) pair exists.
    """
    if size is None:
        size = block_size

    # The target window ends at start + size (inclusive), so valid starts are
    # 0 .. len(d) - size - 1. torch.randint's upper bound is exclusive, hence
    # len(d) - size is the correct bound to make the last window reachable.
    num_starts = len(d) - size
    if num_starts < 1:
        raise ValueError(
            f"need at least {size + 1} elements to sample a window of {size}, got {len(d)}"
        )

    start = int(torch.randint(num_starts, (1,)))
    # Slice the argument, not a global, so any sequence can be batched.
    x = d[start : start + size]
    y = d[start + 1 : start + 1 + size]
    return x, y

In [5]:
class BigramModel(nn.Module):
    """Bigram language model backed by a single embedding table.

    The only parameters are the entries of ``token_embedding``; looking up a
    token id returns the row of scores (logits) used to predict the token
    that follows it.
    """

    def __init__(self) -> None:
        super().__init__()

        # The table is vocab_size x vocab_size: one row per input token and
        # one column per candidate next token. The embedding dimension equals
        # the vocabulary size so a looked-up row can be passed to
        # cross_entropy directly as per-class scores.
        self.token_embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=vocab_size,
        )

    def forward(self, idx):
        """Return the next-token logits for every token id in ``idx``.

        The result has the shape of ``idx`` with one extra trailing
        dimension whose size is the embedding dimension.
        """
        # Logits are unnormalised scores; softmax turns them into probabilities.
        return self.token_embedding(idx)


In [6]:
## forward pass

# Untrained model: the logits below come from randomly initialised weights.
model = BigramModel()
x = torch.tensor(encode_text("hello"))
# Call the module itself instead of .forward(): nn.Module.__call__ runs any
# registered hooks before dispatching to forward(), and it matches how the
# model is invoked in the training and sampling cells.
logits = model(x)
# One row of vocab_size logits per input character.
vocab_size, x.shape, logits.shape

(30, torch.Size([5]), torch.Size([5, 30]))

In [7]:
## training

# Start from a freshly initialised model.
model = BigramModel()

# AdamW applies the parameter updates from the gradients computed each step.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-2)

for step in range(2000):
    # Clear gradients left over from the previous iteration.
    optimizer.zero_grad()

    # Forward pass on one random window of the corpus.
    x, y = get_batch(data)
    logits = model(x)

    # Cross-entropy between the predicted logits and the actual next tokens.
    loss = nn.functional.cross_entropy(logits, y)

    # Backward pass, then parameter update.
    loss.backward()
    optimizer.step()

    # Progress marker every 200 steps.
    if not step % 200:
        print(step)

0
200
400
600
800
1000
1200
1400
1600
1800


In [35]:
# sampling

# Prompt to continue; it must contain at least one character.
encoded = encode_text("t")
# Copy the prompt so appending generated tokens does not mutate `encoded`.
output = list(encoded)

# The model scores the next token from a single token id, so only the last
# prompt token is fed in. This also makes multi-character prompts work.
# (Named `context` rather than `input` to avoid shadowing the builtin.)
context = torch.tensor(encoded[-1:])

# Sampling needs no gradients; skip building the autograd graph.
with torch.no_grad():
    for _ in range(20):
        logits = model(context)
        # converting logits into probabilities
        probs = torch.softmax(logits, dim=-1)
        # randomly draw the next token according to those probabilities
        # (this samples; it does not pick the single most likely token)
        next_index = torch.multinomial(probs, num_samples=1).item()
        # the newly generated token becomes the context for the next step
        context = torch.tensor([next_index])
        # append it to the output list
        output.append(next_index)

decode_text(output)

'this mo morllllorldul'