In [3]:
import torch
import numpy as np

from torch._C import NoneType
from torch import nn
from torch.nn import functional as F

import os

In [4]:
# If the names have not been downloaded
if not os.path.exists('input.txt'):
    # download the names.txt file from github
    !wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
        
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()
    
print(len(text))
print(text[:100])

1115394
First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


In [5]:
chars = sorted(list(set(text)))
vocab_size = len(chars)
print(f"vocabulary size = {vocab_size}")
print(''.join(chars))

vocabulary size = 65

 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz


In [6]:
stoi = {ch:i for i, ch in enumerate(chars)}
itos = {i:ch for i, ch in enumerate(chars)}
encode = lambda s: [stoi[c] if c in stoi else -1 for c in s]
decode = lambda l: ''.join([itos[i] if i in itos else '' for i in l ])
tdecode = lambda l: ''.join([itos[int(i)] if int(i) in itos else '' for i in l ])

In [7]:
print(encode("hii there>>>"))
print(decode([1,2,3,4,5, 10000, 10000000]))

[46, 47, 47, 1, 58, 46, 43, 56, 43, -1, -1, -1]
 !$&'


In [8]:
print(decode(encode('hii there>>>')))

hii there


In [9]:
data = torch.tensor(encode(text), dtype=torch.long)

In [10]:
print(data.shape)
print(data.type())
print(len(text))
print(data[:10])

torch.Size([1115394])
torch.LongTensor
1115394
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47])


In [11]:
print(decode([int(x) for x in data[0:10]]))
print(tdecode(data[0:10]))

First Citi
First Citi


In [12]:
n = int(0.9*len(data))
train_data=data[:n]
val_data=data[n:]

all = torch.cat((train_data,val_data))
print(len(all)-len(data))

0


In [13]:
block_size = 8
batch_size = 4
train_data[1:block_size+1]

tensor([47, 56, 57, 58,  1, 15, 47, 58])

In [14]:
x=train_data[:block_size]
print(f"\n\nx = {x}")
y=train_data[1:block_size+1]
print(f"y   =   {y}\n\n")

for t in range(block_size):
    context = x[:t+1]
    target = y[t]
    print(f"context = {context}, target = {target}")



x = tensor([18, 47, 56, 57, 58,  1, 15, 47])
y   =   tensor([47, 56, 57, 58,  1, 15, 47, 58])


context = tensor([18]), target = 47
context = tensor([18, 47]), target = 56
context = tensor([18, 47, 56]), target = 57
context = tensor([18, 47, 56, 57]), target = 58
context = tensor([18, 47, 56, 57, 58]), target = 1
context = tensor([18, 47, 56, 57, 58,  1]), target = 15
context = tensor([18, 47, 56, 57, 58,  1, 15]), target = 47
context = tensor([18, 47, 56, 57, 58,  1, 15, 47]), target = 58


In [15]:
def get_batch(split):
    data = train_data if split =='train' else val_data
    ix = torch.randint(len(data)-block_size, (batch_size,))
    x = torch.stack([data[t:t+block_size] for t in ix])
    y = torch.stack([data[t+1: t+block_size+1] for t in ix])
    return x,y

In [16]:
torch.manual_seed(1337)
xb, yb = get_batch('train')
print(xb)
print(yb)
print(xb.shape, yb.shape)

tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])
tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])
torch.Size([4, 8]) torch.Size([4, 8])


In [45]:
torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        # how the 'Bigram' part is related to the embedding dimensionality?
        # both num_embeddings and embedding_dim equal to vocab_size (!!!) 
        # manual: https://pytorch.org/docs/stable/generated/torch.nn.Embedding.html
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        # idx is the same as x: sequences of len block_size and targets are the
        # same as before (y), the correct expected outcomes
        # -- also looked up in the embedding table?

        logits = self.token_embedding_table(idx) #(B,T,C) batch, time, channel
        # channel is the vocabulary size (!!!); as if number of colurs = 65 :)
        #print(f"logits type: {type(logits)}, logits shape: {logits.shape}")
    
        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)

            #
            # Strangely enough cross_entropy takes logits as "weights" and targets as "class labels"
            #
            loss = F.cross_entropy(logits, targets)
      
        return logits, loss
    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # crop idx to the last block_size tokens
            idx_cond = idx[:, -block_size:]
            # get the predictions
            logits, loss = self(idx_cond)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

#     def generate(self, idx, max_new_tokens):
#         print(f"==> generate, idx.shape = {idx.shape}")
#         for _ in range(max_new_tokens):
#             logits, loss = self(idx);
#             logits = logits[:,-1,:] # the last time slice, (BxC)

#               # for each batch B get a distribution of classes C
#             probs = F.softmax(logits, dim=-1) 

#               # 1 sample from each row for each batch
#             idx_next = torch.multinomial(probs, num_samples=1) 
#             idx = torch.cat((idx, idx_next), dim = 1)

        return idx

In [18]:
torch.manual_seed(1337)
m = BigramLanguageModel(vocab_size)
print(f"xb.shape = {xb.shape}")
print(f"yb.shape = {yb.shape}")


logits, loss = m(xb,yb)
print(f"--> logits.shape = {logits.shape}")
print(f"--> this is because batch_size*block_size = {batch_size*block_size}\n")
ind = 12
print(f"logits at index = {ind}:\n{logits[ind,:].detach().numpy()}\n")
offset_argmax = torch.argmax(logits[ind,:])
val = logits[ind, offset_argmax]
print(f"offset = {offset_argmax}, value = {val}\n")
print(decode([ind]), ' --> ', decode([int(offset_argmax)]),'\n')
print(f"loss = {loss}, vs expected(?) {float(-torch.log(torch.Tensor([1/vocab_size])))}")


xb.shape = torch.Size([4, 8])
yb.shape = torch.Size([4, 8])
--> logits.shape = torch.Size([32, 65])
--> this is because batch_size*block_size = 32

logits at index = 12:
[ 0.2474621  -0.63485116 -1.2909483   1.1821823   0.14786936 -0.43331397
 -0.8269277   0.07280172 -1.2982308   0.39599574 -1.2460201   0.14583187
 -0.5699396  -1.3560567  -0.38120747 -0.8514603   1.1917949  -0.81080186
 -0.17326038 -0.47029358 -0.60004216 -1.3636268  -1.0889153   1.0108203
  0.85429174 -0.04411305  1.8016624   0.60141    -2.5448313  -0.48651642
  2.6412039   1.6052898   0.59007245  0.81368    -0.11238304 -0.30501363
  1.1426241   0.66372484 -0.7000075   0.9262019  -1.103203   -1.2124757
  0.6065394   0.5881612  -0.5452641   0.7654137   0.5691515   0.8859054
 -0.07004447  0.67918706 -0.02830357 -1.22435    -1.7192171   1.4801265
  0.9586657  -0.03378088  0.5083099  -0.2501664   2.0734181  -0.29940873
  0.04729307 -0.9625754   1.3064294  -0.22557093 -1.8304833 ]

offset = 30, value = 2.6412038803100586



In [19]:
idx = torch.zeros((1,1), dtype=torch.long)
print(decode(m.generate(idx, max_new_tokens = 100)[0].tolist()))

==> generate, idx.shape = torch.Size([1, 1])

SKIcLT;AcELMoTbvZv C?nq-QE33:CJqkOKH-q;:la!oiywkHjgChzbQ?u!3bLIgwevmyFJGUGp
wnYWmnxKWWev-tDqXErVKLgJ


In [20]:
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

In [21]:
#
# discussion about how to add custom parameters to Pytorch
# https://stackoverflow.com/questions/59234238/how-to-add-parameters-in-module-class-in-pytorch-custom-model
# you need to 
# --> 'register' the parameter:
#
# self.register_parameter(name='bias', param = torch.nn.Parameter(torch.randn(3)))
# 
# NOT SIMPLY:
#
# self.bias = torch.nn.Parameter(torch.rand())
#
# --> this is strange: why would call torch.nn.Parameter but not register it at the same time????
#

for ind,x in enumerate(m.parameters()):
    print(f"count = {ind}:\n{x}\n")

count = 0:
Parameter containing:
tensor([[ 0.1808, -0.0700, -0.3596,  ...,  1.6097, -0.4032, -0.8345],
        [ 0.5978, -0.0514, -0.0646,  ..., -1.4649, -2.0555,  1.8275],
        [ 1.3035, -0.4501,  1.3471,  ...,  0.1910, -0.3425,  1.7955],
        ...,
        [ 0.4222, -1.8111, -1.0118,  ...,  0.5462,  0.2788,  0.7280],
        [-0.8109,  0.2410, -0.1139,  ...,  1.4509,  0.1836,  0.3064],
        [-1.4322, -0.2810, -2.2789,  ..., -0.5551,  1.0666,  0.5364]],
       requires_grad=True)



In [22]:
batch_size=32

for steps in range(1000):
    xb,yb = get_batch('train')
    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
    if steps % 100 == 0:
        print(steps, loss.item())

0 4.692410945892334
100 4.621085166931152
200 4.549462795257568
300 4.345611572265625
400 4.25573205947876
500 4.214480876922607
600 4.124096870422363
700 3.9863951206207275
800 3.9517807960510254
900 3.837888717651367


In [23]:
idx = torch.zeros((1,1), dtype=torch.long)
print(decode(m.generate(idx, max_new_tokens = 500)[0].tolist()))

==> generate, idx.shape = torch.Size([1, 1])

Wh;;Sq.f ustNzknc
kwgOj$dhPWr,SV?hsusiKpgXXUh;Apmem d?hESXI.i;TrJgkiF-oKbXCAA -botrngFCHAUQkn$

pn$w-gHoi?wtd!
LLULIfSK'bAw :M.ZtOptXEQcL?hfaofqbPd?OnonQQJMap$aypupIBYGUsZaI'ottllo..k$W$Akp?yl?ajKlzY!lx&QQLW? t,bXFkyhl-dmVsHeckhRl,jSClgjuk:3Iv
?OqlrV;!Plxfzgy;;
'mRjuBQ&xk!$
h
SiruDJgKuDny,S$ERf.?GSV-ivvKcOvi-nQGX&q-YQbm dEM?px;Akr-IESq--wIWId
RFgXTpDUgM:CK$I!uo'IBT -
j?wfy fFr.&fiqtRS.ZttxGh' a!ogrn$zoZqbocL&yIffBDWNUboscuQqo.Fls,?,M?eZxHx?p?EV.mJiHqHnxT  bQpa;P fawiF$-QbWv&f:CVDCBfano,b?$Esev.?


In [24]:
torch.manual_seed(1337)
B,T,C = 4,8,2 # batch, time, channels
#
#  randn torch.creates a sample of N(0,1)
#  I imagine a sample of size B*T*C is just reshaped, what else?
#
x = torch.randn(B,T,C)
x.shape

torch.Size([4, 8, 2])

In [25]:
T=3
wei = torch.tril(torch.ones(T,T))
wei = wei / torch.sum(wei,1, keepdim=True).to(torch.float64)
print(wei)
xx = torch.randint(100, (B,T,C)).to(torch.float64)
print(xx)
wei @ xx

tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]], dtype=torch.float64)
tensor([[[58., 16.],
         [55., 92.],
         [14., 74.]],

        [[47.,  4.],
         [25., 60.],
         [55., 23.]],

        [[68., 79.],
         [42., 17.],
         [53., 49.]],

        [[44., 61.],
         [26., 28.],
         [36., 69.]]], dtype=torch.float64)


tensor([[[58.0000, 16.0000],
         [56.5000, 54.0000],
         [42.3333, 60.6667]],

        [[47.0000,  4.0000],
         [36.0000, 32.0000],
         [42.3333, 29.0000]],

        [[68.0000, 79.0000],
         [55.0000, 48.0000],
         [54.3333, 48.3333]],

        [[44.0000, 61.0000],
         [35.0000, 44.5000],
         [35.3333, 52.6667]]], dtype=torch.float64)

In [26]:
#
# If given a range of indices of length K as input, 
# Embedding returns a 2d, [K,vocab_size] tensor
#
torch.manual_seed(1337)
m = BigramLanguageModel(vocab_size)
print(f"xb.shape = {xb.shape}")
print(f"yb.shape = {yb.shape}")

_,_ = m(xb,yb)

t0=m.token_embedding_table(torch.tensor(0))
print(t0.shape)
t00=m.token_embedding_table(torch.tensor([0,0]))
print(t00.shape)
t5=m.token_embedding_table(torch.arange(5))
print(t5.shape)

torch.allclose(t00[0,:], t00[1,:])

xb.shape = torch.Size([32, 8])
yb.shape = torch.Size([32, 8])
torch.Size([65])
torch.Size([2, 65])
torch.Size([5, 65])


True

In [27]:
torch.manual_seed(1337)
B,T,C = 4,8,32 
x=torch.randn(B,T,C)

head_size=16
key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)


k = key(x)
q = query(x)
v = value(x)

wei = q @ k.transpose(-1,-2)

print(k.shape, q.shape, v.shape, wei.shape)

tril = torch.tril(torch.ones(T,T))
#
# this is no more:
#wei = torch.zeros(T,T)
#
wei = wei.masked_fill(tril == 0, float('-inf'))
wei = F.softmax(wei, dim=1)
#
#  NOTE that this is the 'decoder architecture'
#       to get the 'encoder architecture' just relax
#       the 'do not look in the future' constraint (no torch.tril), 
#       but just wei=F.softmax without the '-inf' inserted in the future
#       
#

#
# this is no more:
out = wei @ x
print(f"\nNo longer we have: {out.shape}")
#
# instead:

out = wei @ v
print(f"But instead: {out.shape}\n")

torch.Size([4, 8, 16]) torch.Size([4, 8, 16]) torch.Size([4, 8, 16]) torch.Size([4, 8, 8])

No longer we have: torch.Size([4, 8, 32])
But instead: torch.Size([4, 8, 16])



In [28]:
#
# non-zero weights are allowed to be different:
#
wei[0]

tensor([[0.0248, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0052, 0.0091, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0521, 0.0135, 0.2482, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3171, 0.0214, 0.1642, 0.1188, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0412, 0.0487, 0.1046, 0.0742, 0.2000, 0.0000, 0.0000, 0.0000],
        [0.1060, 0.5347, 0.2059, 0.1030, 0.7402, 0.0192, 0.0000, 0.0000],
        [0.4298, 0.3409, 0.1769, 0.2027, 0.0480, 0.8472, 0.2329, 0.0000],
        [0.0238, 0.0316, 0.1002, 0.5013, 0.0117, 0.1336, 0.7671, 1.0000]],
       grad_fn=<SelectBackward0>)

# Discussion of cross attention

querie (can) come from one source and keys and values from another. 
It's a natural separation if viewed from the standpoint of information retrieval: 
       
- keys are how data being 'retrieved' can be identified (found)      
- values are actual data retrieved

On the other hand 

- queries (can) come from a different source (search string in a search engine for example)

In [29]:
# Scaled attention 

#
# Without normalization:
#
print('\nwithout normalization:\n')
wei = q @ k.transpose(-1,-2)
print(f"q.var() = {q.var()}")
print(f"wei.var() = {wei.var()}")
print(f"q.var()/wei.var() = {q.var()/wei.var()}")

# Normalize by the head_size: 
wei = q @ k.transpose(-1,-2) * head_size**-0.5
print('\nafter normalization:\n')
print(f"q.var() = {q.var()}")
print(f"wei.var() = {wei.var()}")
print(f"q.var()/wei.var() = {q.var()/wei.var()}")


without normalization:

q.var() = 0.33860543370246887
wei.var() = 1.9223554134368896
q.var()/wei.var() = 0.1761409044265747

after normalization:

q.var() = 0.33860543370246887
wei.var() = 0.1201472133398056
q.var()/wei.var() = 2.8182544708251953


In [31]:
#
# keeping wei relatively "diffuse"
# lest softmax --> 1-hot
#

# for example

x = torch.tensor([0.1, -0.2, 0.3, -0.2, 0.5])
print(torch.softmax(x, dim=-1))
print(torch.softmax(100*x, dim=-1))

tensor([0.1925, 0.1426, 0.2351, 0.1426, 0.2872])
tensor([4.2484e-18, 3.9754e-31, 2.0612e-09, 3.9754e-31, 1.0000e+00])


In [46]:
torch.manual_seed(111)
m = BigramLanguageModel(vocab_size)
opt = torch.optim.SGD(m.parameters(), lr=1e-4, momentum=0.9)
opt_start_state = opt.state_dict()
itt = 10

for _ in range(itt):
    xb, yb = get_batch('train')
    _, loss = m(xb,yb)
    print(loss.item())
    # why zero_grad each iteration?
    opt.zero_grad(set_to_none=True)
    loss.backward()
    opt.step()

opt_end_state = opt.state_dict()

opt_start_state == opt_end_state

torch.save(m, 'sgd.pt')

mm = torch.load('sgd.pt')

4.742486476898193
4.780079364776611
4.6079840660095215
4.688343524932861
4.654579162597656
4.767638206481934
4.644798755645752
4.694546222686768
4.6973443031311035
4.674440383911133


In [47]:
idx = torch.zeros((1,1), dtype=torch.long)
print(decode(mm.generate(idx, max_new_tokens = 100)[0].tolist()))


r'nw!Ryt,Q?K-!J:'XyHE?,AlkpOymmmzF-cvS:mcvK,DaJ.'DIIfui.F
R;kNygBhuHF;TXFtcMrAj
!T&FXNSzFv&-Cgz,Xglw


In [48]:
m50K = torch.load('m50K.pt')
idx = torch.zeros((1,1), dtype=torch.long)
print(decode(m50K.generate(idx, max_new_tokens = 100)[0].tolist()))

AttributeError: Can't get attribute 'Block' on <module '__main__'>

In [49]:
dir(m50K)

NameError: name 'm50K' is not defined