## Building a language model with a simple bigram architecture

In [2]:
import torch

In [3]:
# Load the whole corpus as one string. Read-only mode is sufficient; "r+" would
# additionally require write permission on the file even though nothing is written.
# NOTE(review): the text starts with a byte-order mark, which survives as '\ufeff'
# in the vocabulary; encoding="utf-8-sig" would strip it — confirm before changing.
with open("wizard_of_oz.txt", "r", encoding="utf-8") as file:
    data = file.read()

In [4]:
# Corpus size in characters, followed by the first 200 characters.
print(len(data), data[:200], sep="\n")

208602
﻿The Project Gutenberg eBook of The Wonderful Wizard of Oz, by L. Frank Baum

This eBook is for the use of anyone anywhere in the United States and
most other parts of the world at no cost and with al


In [5]:
# Character-level vocabulary: every distinct character of the corpus, in sorted order.
chars = list(set(data))
chars.sort()
print(chars, len(chars), sep="\n")

['\n', ' ', '!', '#', '&', '(', ')', '*', ',', '-', '.', '0', '1', '2', '3', '5', '8', '9', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '—', '‘', '’', '“', '”', '\ufeff']
81


In [6]:
# Lookup tables between characters and integer ids; an id is the character's
# position in the sorted vocabulary `chars`.
str_to_int = {ch: i for i, ch in enumerate(chars)}
int_to_str = dict(enumerate(chars))


def encode(s):
    """Return the list of integer ids for the characters of `s`.

    Raises:
        KeyError: if `s` contains a character that is not in `chars`.
    """
    return [str_to_int[c] for c in s]


def decode(i):
    """Return the string obtained by joining the characters for the ids in `i`.

    Raises:
        KeyError: if `i` contains an id that is not in `int_to_str`.
    """
    return ''.join(int_to_str[e] for e in i)


encode("hello")

[56, 53, 60, 60, 63]

In [7]:
# Round-trip check: these ids are the encoding of "hello" shown above.
decode([56, 53, 60, 60, 63])

'hello'

In [8]:
# Encode the whole corpus at character level into a 1-D tensor of token ids.
# `data` is rebound from the raw string to that tensor, so the conversion is
# guarded: without the check, re-running this cell would pass the tensor to
# encode(), whose lookup table is keyed by characters.
if isinstance(data, str):
    data = torch.tensor(encode(data), dtype=torch.long)
print(data[:100])

tensor([80, 40, 56, 53,  1, 36, 66, 63, 58, 53, 51, 68,  1, 27, 69, 68, 53, 62,
        50, 53, 66, 55,  1, 53, 22, 63, 63, 59,  1, 63, 54,  1, 40, 56, 53,  1,
        43, 63, 62, 52, 53, 66, 54, 69, 60,  1, 43, 57, 74, 49, 66, 52,  1, 63,
        54,  1, 35, 74,  8,  1, 50, 73,  1, 32, 10,  1, 26, 66, 49, 62, 59,  1,
        22, 49, 69, 61,  0,  0, 40, 56, 57, 67,  1, 53, 22, 63, 63, 59,  1, 57,
        67,  1, 54, 63, 66,  1, 68, 56, 53,  1])


In [11]:
# Train/test split: the first 80% of the encoded corpus is used for training
# and the remaining 20% is held out for evaluation.
n = int(0.8 * len(data))

train_data = data[:n]
test_data = data[n:]  # held-out tail; must not overlap train_data

In [46]:
# Hyperparameters for the bigram model, plus one example window from the training data.
block_size = 8  # number of tokens in each input window
batch_size = 4  # number of windows stacked together by get_batch

# y is x shifted one position to the right, so y[t] is the token that follows x[t].
x, y = train_data[:block_size], train_data[1:block_size + 1]

In [13]:
# Display the example input window and its shifted-by-one targets.
x, y

(tensor([80, 40, 56, 53,  1, 36, 66, 63]),
 tensor([40, 56, 53,  1, 36, 66, 63, 58]))

In [14]:
# Each prefix of x is a context, and the y entry at the same position is the
# token the model should predict after that context.
for t in range(block_size):
    context, target = x[:t + 1], y[t]
    print("when input is : ", context, "then target is ", target)

when input is :  tensor([80]) then target is  tensor(40)
when input is :  tensor([80, 40]) then target is  tensor(56)
when input is :  tensor([80, 40, 56]) then target is  tensor(53)
when input is :  tensor([80, 40, 56, 53]) then target is  tensor(1)
when input is :  tensor([80, 40, 56, 53,  1]) then target is  tensor(36)
when input is :  tensor([80, 40, 56, 53,  1, 36]) then target is  tensor(66)
when input is :  tensor([80, 40, 56, 53,  1, 36, 66]) then target is  tensor(63)
when input is :  tensor([80, 40, 56, 53,  1, 36, 66, 63]) then target is  tensor(58)


In [15]:
# Use the GPU when PyTorch reports CUDA as available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

print(device)

cpu


In [18]:
# Five random integers drawn from the half-open range [-100, 100).
randints = torch.randint(low=-100, high=100, size=(5,))
print(randints)

tensor([-36,  80,  90,  79, -35])


In [21]:
# 2x3 tensor filled with zeros.
zeros = torch.zeros(2, 3)
print(zeros)

tensor([[0., 0., 0.],
        [0., 0., 0.]])


In [22]:
# 3x4 tensor filled with ones.
ones = torch.ones(3, 4)
print(ones)

tensor([[1., 1., 1., 1.],
        [1., 1., 1., 1.],
        [1., 1., 1., 1.]])


In [23]:
# 3x2 tensor allocated without initialisation: torch.empty does not set the
# values, so the zeros in the recorded output are not guaranteed.
i = torch.empty(3, 2)
print(i)

tensor([[0., 0.],
        [0., 0.],
        [0., 0.]])


In [28]:
# The integers 0..9 laid out row by row as a 2x5 matrix.
i = torch.reshape(torch.arange(0, 10), (2, 5))
print(i)

tensor([[0, 1, 2, 3, 4],
        [5, 6, 7, 8, 9]])


In [29]:
# Five evenly spaced values from 2 to 7, both endpoints included.
i = torch.linspace(start=2, end=7, steps=5)
print(i)

tensor([2.0000, 3.2500, 4.5000, 5.7500, 7.0000])


In [30]:
# Five values whose base-10 exponents are evenly spaced from -10 to 10.
i = torch.logspace(-10, 10, steps=5)
print(i)

tensor([1.0000e-10, 1.0000e-05, 1.0000e+00, 1.0000e+05, 1.0000e+10])


In [33]:
# Draw 10 indices with replacement: index 0 has weight 0.1 and index 1 has weight 0.9.
probabilities = torch.tensor([0.1, 0.9])
samples = torch.multinomial(probabilities, 10, replacement=True)
print(samples)

tensor([1, 1, 0, 1, 0, 1, 1, 1, 1, 1])


In [37]:
# Append one element by concatenating two 1-D tensors along their only dimension.
tensor = torch.tensor([1, 2, 3, 4])
tensor = torch.cat([tensor, torch.tensor([5])])
print(tensor)

tensor([1, 2, 3, 4, 5])


In [38]:
# Lower-triangular mask: ones on and below the main diagonal, zeros above it.
tensor = torch.tril(torch.ones((5, 5)))
print(tensor)

tensor([[1., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0.],
        [1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 0.],
        [1., 1., 1., 1., 1.]])


In [39]:
# Upper-triangular mask: ones on and above the main diagonal, zeros below it.
tensor = torch.triu(torch.ones((5, 5)))
print(tensor)

tensor([[1., 1., 1., 1., 1.],
        [0., 1., 1., 1., 1.],
        [0., 0., 1., 1., 1.],
        [0., 0., 0., 1., 1.],
        [0., 0., 0., 0., 1.]])


In [48]:
def get_batch(split):
    """Sample a random batch of input windows and their next-token targets.

    Args:
        split: "train" draws from train_data; any other value draws from test_data.

    Returns:
        A pair (x, y) of stacked windows, batch_size rows of block_size tokens
        each, where y holds the same windows as x shifted one position right.
    """
    source = train_data if split == "train" else test_data

    # Upper bound leaves room for the one-token-shifted target window.
    starts = torch.randint(len(source) - block_size, (batch_size, ))

    inputs, targets = [], []
    for start in starts:
        inputs.append(source[start:start + block_size])
        targets.append(source[start + 1:start + block_size + 1])

    return torch.stack(inputs), torch.stack(targets)

In [49]:
# Sample one random training batch (x: input windows, y: the same windows
# shifted by one token) and display the inputs.
x, y = get_batch("train")
x

tensor([[63, 62, 53,  1, 38, 63, 63, 61],
        [53, 62, 52,  8,  1, 49, 62, 52],
        [57, 68,  8,  1, 67, 49, 73, 57],
        [68, 56, 53,  1, 23, 63, 71, 49]])

In [55]:
import torch.nn as nn
import torch.nn.functional as F

In [54]:
class BiGram(nn.Module):
    """Bigram language model.

    The logits for the next token are read directly from an embedding table
    indexed by the current token, so each prediction depends only on the
    single preceding token.

    Args:
        vocab_size: number of distinct tokens. The table is
            vocab_size x vocab_size; row i holds the next-token logits for token i.
        *args, **kwargs: forwarded unchanged to nn.Module.__init__.
    """

    def __init__(self, vocab_size, *args, **kwargs):
        super().__init__(*args, **kwargs)

        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, index, targets=None):
        """Compute next-token logits and, when targets are given, the loss.

        Args:
            index: integer tensor of token ids with shape (B, T).
            targets: optional integer tensor of token ids with shape (B, T).

        Returns:
            (logits, loss). Without targets, logits has shape (B, T, C) and
            loss is None. With targets, logits is flattened to (B*T, C), the
            layout passed to F.cross_entropy, and loss is the scalar
            cross-entropy between those logits and the flattened targets.
        """
        logits = self.token_embedding_table(index)

        if targets is None:
            return logits, None

        B, T, C = logits.shape

        # cross_entropy expects one row of class scores per prediction.
        logits = logits.view(B * T, C)
        targets = targets.view(B * T)

        loss = F.cross_entropy(logits, targets)
        return logits, loss