In [13]:
import torch 

In [14]:
with open('dataset/wizard_of_oz.txt', 'r', encoding='utf-8') as f:
    text = f.read()
    
chars = sorted(list(set(text)))
vocab_size = len(chars)
print(chars)
print(len(chars))

['\n', ' ', '!', '"', '&', "'", '(', ')', '*', ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '\ufeff']
81


In [15]:
string_to_int = {ch: i for i, ch in enumerate(chars)}
int_to_string = {i: ch for i, ch in enumerate(chars)}
encode = lambda s: [string_to_int[c] for c in s]
decode = lambda l: ''.join([int_to_string[i] for i in l])

encoded_hello = encode("Hello")
decoded_hello = decode(encoded_hello)
print(encoded_hello)
print(decoded_hello)

[32, 58, 65, 65, 68]
Hello


In [16]:
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.dtype)
print(data[:1000])

torch.Size([232477]) torch.int64
tensor([80, 28, 39, 42, 39, 44, 32, 49,  1, 25, 38, 28,  1, 44, 32, 29,  1, 47,
        33, 50, 25, 42, 28,  1, 33, 38,  1, 39, 50,  0,  0,  1,  1, 26, 49,  0,
         0,  1,  1, 36, 11,  1, 30, 42, 25, 38, 35,  1, 26, 25, 45, 37,  0,  0,
         1,  1, 25, 45, 44, 32, 39, 42,  1, 39, 30,  1, 44, 32, 29,  1, 47, 33,
        50, 25, 42, 28,  1, 39, 30,  1, 39, 50,  9,  1, 44, 32, 29,  1, 36, 25,
        38, 28,  1, 39, 30,  1, 39, 50,  9,  1, 39, 50, 37, 25,  1, 39, 30,  1,
        39, 50,  9,  1, 29, 44, 27, 11,  0,  0,  1,  1, 33, 36, 36, 45, 43, 44,
        42, 25, 44, 29, 28,  1, 26, 49,  1, 34, 39, 32, 38,  1, 42, 11,  1, 38,
        29, 33, 36, 36,  0,  0,  1,  1, 26, 39, 39, 35, 43,  1, 39, 30,  1, 47,
        39, 38, 28, 29, 42,  1, 47, 33, 36, 36, 33, 25, 37,  1, 37, 39, 42, 42,
        39, 47,  1,  4,  1, 27, 39, 11,  9,  1, 33, 38, 27, 11,  1, 38, 29, 47,
         1, 49, 39, 42, 35,  0,  0,  0,  1,  1, 51, 33, 65, 65, 74, 72, 73, 71,
       

In [17]:
n = int(0.9*len(data))
train_data = data[:n]
val_data = data[n:]

In [18]:
block_size = 8
train_data[:block_size+1]

tensor([80, 28, 39, 42, 39, 44, 32, 49,  1])

In [19]:
#Time Dimension of tensor
x = train_data[: block_size]
y = train_data[1: block_size+1]
for t in range(block_size):
    context = x[:t+1]
    target = y[t]
    print(f"when input is {context} the target is: {target}")

when input is tensor([80]) the target is: 28
when input is tensor([80, 28]) the target is: 39
when input is tensor([80, 28, 39]) the target is: 42
when input is tensor([80, 28, 39, 42]) the target is: 39
when input is tensor([80, 28, 39, 42, 39]) the target is: 44
when input is tensor([80, 28, 39, 42, 39, 44]) the target is: 32
when input is tensor([80, 28, 39, 42, 39, 44, 32]) the target is: 49
when input is tensor([80, 28, 39, 42, 39, 44, 32, 49]) the target is: 1


In [20]:
#Batch Size
torch.manual_seed(1337)
batch_size = 4 
block_size = 8

def get_batch(split):
    
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    return x, y

xb, yb = get_batch('train')
print('inputs:')
print(xb.shape)
print(xb)
print('target:')
print(yb.shape)
print(yb)

print('------')

for b in range(batch_size):
    for t in range(block_size):
        context = xb[b, :t+1]
        target = yb[b,t]
        print(f"When input is {context.tolist()} the target: {target}")

inputs:
torch.Size([4, 8])
tensor([[72, 61,  1, 42, 74, 65, 58, 71],
        [62, 79, 79, 58, 72,  1, 54, 67],
        [ 1, 58, 57, 60, 58,  1, 68, 59],
        [24,  3,  1, 72, 61, 58,  1, 72]])
target:
torch.Size([4, 8])
tensor([[61,  1, 42, 74, 65, 58, 71,  1],
        [79, 79, 58, 72,  1, 54, 67, 57],
        [58, 57, 60, 58,  1, 68, 59,  1],
        [ 3,  1, 72, 61, 58,  1, 72, 54]])
------
When input is [72] the target: 61
When input is [72, 61] the target: 1
When input is [72, 61, 1] the target: 42
When input is [72, 61, 1, 42] the target: 74
When input is [72, 61, 1, 42, 74] the target: 65
When input is [72, 61, 1, 42, 74, 65] the target: 58
When input is [72, 61, 1, 42, 74, 65, 58] the target: 71
When input is [72, 61, 1, 42, 74, 65, 58, 71] the target: 1
When input is [62] the target: 79
When input is [62, 79] the target: 79
When input is [62, 79, 79] the target: 58
When input is [62, 79, 79, 58] the target: 72
When input is [62, 79, 79, 58, 72] the target: 1
When input is [6

In [21]:
print(xb)

tensor([[72, 61,  1, 42, 74, 65, 58, 71],
        [62, 79, 79, 58, 72,  1, 54, 67],
        [ 1, 58, 57, 60, 58,  1, 68, 59],
        [24,  3,  1, 72, 61, 58,  1, 72]])


In [52]:
import torch 
import torch.nn as nn 
from torch.nn import functional as F 
torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):
    
    def __init__ (self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)
    
    def forward(self, idx, targets = None):
        
        logits = self.token_embedding_table(idx) #(B,T,C)
        
        if targets == None: 
            loss = None
            
        else: 
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
        
        return logits, loss
    
    
    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            logits, loss = self(idx)
            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim = -1)
            idx_next = torch.multinomial(probs, num_samples = 1)
            idx = torch.cat((idx, idx_next), dim = 1)
        return idx
    
m = BigramLanguageModel(vocab_size)
logits, loss = m(xb,yb)
print(logits.shape)
print(loss)
print(decode(m.generate(idx = torch.zeros((1,1), dtype= torch.long), max_new_tokens=100)[0].tolist()))
        

torch.Size([256, 81])
tensor(4.8199, grad_fn=<NllLossBackward0>)

XjAN_LUE2O8q:0vx5Nnd],V8aPmStIpc&"n9YK﻿P,*5UWQ8a DuWyJq!;?apJzbe ?eaIdce6azd1f;?po2JlT﻿E5'qvBF;72ki)


In [23]:
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3) #pytorch Optimizer

In [24]:
batch_size = 32
for steps in range(10000):
    
    xb, yb = get_batch('train')
    
    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
    
print(loss.item())

2.467174530029297


In [25]:
print(decode(m.generate(idx = torch.zeros((1,1), dtype= torch.long), max_new_tokens=400)[0].tolist()))


rs be.]E





a id  led hase h ven'ver, s
k,"

"
ly ad t thand."Oziooime o jor tof h u'mQDu'thed arad. Gkered?" sheces, wouloverak CALThrd gorey, AU6*)Jishe aser m abimpil; hedorcke lat imacen ousir t topamund.
"Thed My grfan thut t otlarst apit, waw faing ththendite tond o w. es Wind

" tea

"NShe thy whe acon whery, sed.

pe fo
buserunort h, ge tll!"Wior ty thad af thous,
"

"Do he o hemerie tou


In [26]:
torch.manual_seed(1337)
B,T,C = 4, 8, 2
x = torch.randn(B,T,C)
x.shape

torch.Size([4, 8, 2])

In [27]:
xbow = torch.zeros(B,T,C)
for b in range(B): 
    for t in range(T):
        xprev = x[b,:t+1] 
        xbow[b,t] = torch.mean(xprev, 0)

In [28]:
#version 2
wei = torch.tril(torch.ones(T,T))
wei = wei / wei.sum(1, keepdim=True)
xbow2 = wei @ x #(B,T,T) @ (B,T,C) -> (B,T,C)
torch.allclose(xbow, xbow2) 

True

In [29]:
# version 3: Uses Softmax
tril = torch.tril(torch.ones(T,T))
wei = torch.zeros((T,T))
wei = wei.masked_fill(tril == 0, float('-inf'))
wei = F.softmax(wei, dim=-1)
xbow3 = wei @ x 
torch.allclose(xbow, xbow3)

True

In [43]:
#version 4: Self Attention
torch.manual_seed(1337)
B,T,C = 4, 8, 32
x = torch.randn(B,T,C)

head_size = 16
key = nn.Linear(C, head_size, bias = False)
query = nn.Linear(C, head_size, bias = False)
value = nn.Linear(C, head_size, bias = False)
k = key(x) # (B,T,head_size)
q = query(x) # (B,T,head_size)
wei = q @ k.transpose(-2,-1) # (B,T,head_size) @ (B,head_size,T) -> (B,T,T)

tril = torch.tril(torch.ones(T,T))
#wei = torch.zeros((T,T))
wei = wei.masked_fill(tril == 0, float('-inf'))
wei = F.softmax(wei, dim=-1)

v = value(x) # (B,T,head_size)
out = wei @ v # (B,T,T) @ (B,T,head_size) -> (B,T,head_size)
#out = wei @ x

out.shape

torch.Size([4, 8, 16])

In [45]:
tril

tensor([[1., 0., 0., 0., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0., 0., 0., 0.],
        [1., 1., 1., 0., 0., 0., 0., 0.],
        [1., 1., 1., 1., 0., 0., 0., 0.],
        [1., 1., 1., 1., 1., 0., 0., 0.],
        [1., 1., 1., 1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 1., 1., 1., 0.],
        [1., 1., 1., 1., 1., 1., 1., 1.]])

In [47]:
wei[0]

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.1574, 0.8426, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2088, 0.1646, 0.6266, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5792, 0.1187, 0.1889, 0.1131, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0294, 0.1052, 0.0469, 0.0276, 0.7909, 0.0000, 0.0000, 0.0000],
        [0.0176, 0.2689, 0.0215, 0.0089, 0.6812, 0.0019, 0.0000, 0.0000],
        [0.1691, 0.4066, 0.0438, 0.0416, 0.1048, 0.2012, 0.0329, 0.0000],
        [0.0210, 0.0843, 0.0555, 0.2297, 0.0573, 0.0709, 0.2423, 0.2391]],
       grad_fn=<SelectBackward0>)

In [30]:
xbow[0], xbow2[0]

(tensor([[ 0.1808, -0.0700],
         [-0.0894, -0.4926],
         [ 0.1490, -0.3199],
         [ 0.3504, -0.2238],
         [ 0.3525,  0.0545],
         [ 0.0688, -0.0396],
         [ 0.0927, -0.0682],
         [-0.0341,  0.1332]]),
 tensor([[ 0.1808, -0.0700],
         [-0.0894, -0.4926],
         [ 0.1490, -0.3199],
         [ 0.3504, -0.2238],
         [ 0.3525,  0.0545],
         [ 0.0688, -0.0396],
         [ 0.0927, -0.0682],
         [-0.0341,  0.1332]]))

In [48]:
k = torch.randn(B, T, head_size)
q = torch.randn(B, T, head_size)
wei = q @ k.transpose(-2,-1) * head_size**-0.5 # (B,T,head_size) @ (B,head_size,T) -> (B,T,T)

In [49]:
k.var()

tensor(1.0449)

In [50]:
q.var()

tensor(1.0700)

In [51]:
wei.var()

tensor(1.0918)