In [35]:
# Read file

with open('dataset/input.txt','r', encoding='utf-8') as f:
    text = f.read()

In [36]:
# Length of text
print(len(text))

1115394


In [37]:
# Check the text
print(text[:1000])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventory to particularise their abundance; our
sufferance is a gain to them Let us revenge this with
our pikes, ere we become rakes: for the gods know I
speak this in hunger for bread, not in thirst for revenge.



In [38]:
# Check unique characters 
chars = sorted(list(set(text)))
print(f'number of unique characters: {len(chars)}')
print(f'Characters: {"".join(chars)}')
vocab_size = len(chars)

number of unique characters: 65
Characters: 
 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz


In [39]:
stoi = { ch:i for i,ch in enumerate(chars)}
itos = { i:ch for i,ch in enumerate(chars)}

encode = lambda s: [stoi[c] for c in s]  # Function takes a string and retuns encoded integers
decode = lambda l: ''.join([itos[i] for i in l]) # Function takes a list of integers and returns a decoded string

print(encode("hello there!"))
print(decode([25,33,52,24,23,1,43,61]))

[46, 43, 50, 50, 53, 1, 58, 46, 43, 56, 43, 2]
MUnLK ew


In [40]:
import torch 
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.dtype)
print(data[:100])

torch.Size([1115394]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59])


In [41]:
# Split data

n = int(0.9 * len(data))
train_data, val_data = data[:n], data[n:]




In [42]:
# train Block size/ Context length
block_size = 8
train_data[:block_size+1]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In [43]:
# How context works with example
x = train_data[:block_size]
y = train_data[1:block_size+1]

for t in range(block_size):
    context = x[:t+1]
    target = y[t]
    print(f"when input is {context} the target is {target}")


when input is tensor([18]) the target is 47
when input is tensor([18, 47]) the target is 56
when input is tensor([18, 47, 56]) the target is 57
when input is tensor([18, 47, 56, 57]) the target is 58
when input is tensor([18, 47, 56, 57, 58]) the target is 1
when input is tensor([18, 47, 56, 57, 58,  1]) the target is 15
when input is tensor([18, 47, 56, 57, 58,  1, 15]) the target is 47
when input is tensor([18, 47, 56, 57, 58,  1, 15, 47]) the target is 58


In [44]:
torch.manual_seed(1337)
batch_size = 4 
block_size = 8 

def get_batch(split):
    # generate a small batch of input data x and target y 
    data = train_data if split == 'train' else val_data 
    ix = torch.randint(len(data)-block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    return x, y 

xb, yb = get_batch('train')

print('inputs: ')
print(xb.shape)
print(xb)

print('targets: ')
print(yb.shape)
print(yb)

print('_______________')



for b in range (batch_size):    # batch dim
    for t in range(block_size): # time dim
        context = xb[b, :t+1]
        target  = yb[b,t]
        print(f'when context is {context.tolist()} the target is {target}')


inputs: 
torch.Size([4, 8])
tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])
targets: 
torch.Size([4, 8])
tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])
_______________
when context is [24] the target is 43
when context is [24, 43] the target is 58
when context is [24, 43, 58] the target is 5
when context is [24, 43, 58, 5] the target is 57
when context is [24, 43, 58, 5, 57] the target is 1
when context is [24, 43, 58, 5, 57, 1] the target is 46
when context is [24, 43, 58, 5, 57, 1, 46] the target is 43
when context is [24, 43, 58, 5, 57, 1, 46, 43] the target is 39
when context is [44] the target is 53
when context is [44, 53] the target is 56
when context is [44, 53, 56] the target is 1
when context is [44, 53, 56, 1] the target is 58
when cont

In [45]:
print()




In [46]:
import torch
import torch.nn as nn 
from torch.nn import functional as F  

class BigramLanguageModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):

        logits = self.token_embedding_table(idx) # Batch, Time(block size), Channel(vocab size) = BTC
        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            # print(B, T, C)

            logits = logits.view(B*T , C)
            targets = targets.view(B*T)
            
            loss = F.cross_entropy(logits, targets)
        
        return logits , loss
    
    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context

        for _ in range(max_new_tokens):
            # get prediction
            logits, loss = self(idx)
            #focus only on the last time step
            logits = logits[:,-1,:] #becomes (B,C)
            #apply softmax to get probabilities
            probs = F.softmax(logits, dim=1) # B,C
            # Sample from distribution
            idx_next = torch.multinomial(probs, num_samples=1) #(B,1)
            #append sampled index to the running sequence
            idx = torch.cat((idx, idx_next),dim=1) #(B,T+1)
        return idx 
    
m = BigramLanguageModel(vocab_size)
logits, loss = m(xb, yb)
print(logits.shape)
print(loss)

torch.Size([32, 65])
tensor(5.0364, grad_fn=<NllLossBackward0>)


In [47]:
# Check how the model outputs garbage before training

idx0 = torch.zeros((1,1),dtype=torch.long)
print(decode(m.generate(idx0, max_new_tokens=100)[0].tolist()))


lfJeukRuaRJKXAYtXzfJ:HEPiu--sDioi;ILCo3pHNTmDwJsfheKRxZCFs
lZJ XQc?:s:HEzEnXalEPklcPU cL'DpdLCafBheH


In [48]:
optimizer = torch.optim.Adam(m.parameters(), lr=1e-3)

In [49]:
batch_size = 32 
learning_steps = 10000
for step in range(learning_steps):
    #sample the batch
    xb, yb = get_batch('train')

    #evaluate loss
    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

    if step%100 is 0:
        print("Step %d\tLoss %.4f" % (step, loss))
    
print('----done----')

Step 0	Loss 4.6477


  if step%100 is 0:


Step 100	Loss 4.5645
Step 200	Loss 4.4897
Step 300	Loss 4.3434
Step 400	Loss 4.2009
Step 500	Loss 4.1076
Step 600	Loss 3.9540
Step 700	Loss 3.9091
Step 800	Loss 3.8039
Step 900	Loss 3.7947
Step 1000	Loss 3.6683
Step 1100	Loss 3.6054
Step 1200	Loss 3.5995
Step 1300	Loss 3.5802
Step 1400	Loss 3.4669
Step 1500	Loss 3.3458
Step 1600	Loss 3.2836
Step 1700	Loss 3.1671
Step 1800	Loss 3.1332
Step 1900	Loss 3.1885
Step 2000	Loss 3.3209
Step 2100	Loss 3.0852
Step 2200	Loss 3.0418
Step 2300	Loss 3.0337
Step 2400	Loss 2.9010
Step 2500	Loss 2.9626
Step 2600	Loss 2.7994
Step 2700	Loss 2.9445
Step 2800	Loss 2.9192
Step 2900	Loss 2.7770
Step 3000	Loss 2.8779
Step 3100	Loss 2.7572
Step 3200	Loss 2.6434
Step 3300	Loss 2.6713
Step 3400	Loss 2.7567
Step 3500	Loss 2.6116
Step 3600	Loss 2.7023
Step 3700	Loss 2.6181
Step 3800	Loss 2.6877
Step 3900	Loss 2.5604
Step 4000	Loss 2.6919
Step 4100	Loss 2.7078
Step 4200	Loss 2.6816
Step 4300	Loss 2.6944
Step 4400	Loss 2.5496
Step 4500	Loss 2.6288
Step 4600	Loss 2.56

In [50]:
# Check how the model outputs garbage before training

idx0 = torch.zeros((1,1),dtype=torch.long)
print(decode(m.generate(idx0, max_new_tokens=1000)[0].tolist()))


M:
IUSh t,
F th he d ke alved.
Thupld, cipbll t
I: ir w, l me sie hend lor ito'l an e

I:
Gochosen ea ar btamandd halind wast, plt t wadyotl
I bel qunganonoth he m he de avellis knt, tond soran:

WI he toust are bot g e n t s d je hid t his IAces I my ig t
Ril'swoll e pupat inouleacends-athiqu heamer te
Wht s

MI wect!-lltherotheve t fe;
WAnd pporury t s ld tathat, ir V
IO thesecin teot tit ado ilorer.
Ply, d'stacoes, ld omat mealellly yererer EMEvesas ie IZEd pave mautoofareanerllleyomerer but?
The t,
Ith'dwitile w? beren to'd ff a atrts brey s

ESesenther:
Ithon f at pare ismamy an flictong m




Mameld h che IN: an y is aslo'daDut, t thethiceve fur t anowik
Wirghe f bot d at'prd
Anoper sof usy be, d s me cks bity.
Cis:
INILou f lendys.
Y anditont avenghe m, gs gl tis y.
Wie gh-mmo hizy s me f lourachigethuiclotif qDWeZPld:
LOubour Witamul we thiech l lisowarrew bland cedanidate, fafive withe thiulsosthis thatwancaurind th'gonimake

S oveprene?
Hear oumnanoupamak in:

The!
The f d s

# Working with Matrix
### Easing calculations using matrices

In [51]:
G = torch.randint(low=1,high=20, size=(3,2), dtype=torch.float32)
print(G)

tensor([[ 1.,  2.],
        [12., 18.],
        [14.,  1.]])


In [52]:

sum_mat = torch.ones((3,3), dtype=torch.float32)
print(sum_mat)

tensor([[1., 1., 1.],
        [1., 1., 1.],
        [1., 1., 1.]])


In [53]:
# To sum the columns

print(sum_mat @ G)

tensor([[27., 21.],
        [27., 21.],
        [27., 21.]])


In [54]:
# To do cumilitive sum along columns

cumsum = torch.tril(sum_mat)
print(cumsum)
print(cumsum @ G)

tensor([[1., 0., 0.],
        [1., 1., 0.],
        [1., 1., 1.]])
tensor([[ 1.,  2.],
        [13., 20.],
        [27., 21.]])


In [55]:
# To do average along the rows

mean_mat = torch.tril(torch.ones((3,3), dtype=torch.float32)) 
mean_mat = mean_mat / mean_mat.sum(1, keepdim=True)

print(mean_mat)

tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])


In [56]:
# TO get the mean of a given 3x3 matrix
mean_mat @ G

tensor([[ 1.0000,  2.0000],
        [ 6.5000, 10.0000],
        [ 9.0000,  7.0000]])

In [57]:
# Higher dim example 
B, T, C = 4,8,2

x = torch.randn(B,T,C)
x.shape 

torch.Size([4, 8, 2])

In [58]:
# we want x[b,t] = mean {i<=t} x[b,i]
xbow = torch.zeros((B,T,C))
for b in range(B):
    for t in range(T):
        xprev = x[b, :t+1]
        xbow[b,t] = torch.mean(xprev,0)
        

In [59]:
wei = torch.tril(torch.ones(T,T))
wei = wei / wei.sum(1, keepdim=True)
wei 

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.0000, 0.0000, 0.0000],
        [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000],
        [0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.0000],
        [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250]])

In [60]:
xbow2 = wei @ x # (T,T) @ (B,T,C) ---> (B,T,T) @ (B, T, C) =---> (B,T,C)

In [61]:
# compare if they are equal 
torch.allclose(xbow, xbow2)

True

In [62]:
# Version 3 

tril = torch.tril(torch.ones(T,T))
wei = torch.zeros((T,T))
wei = wei.masked_fill(tril==0, float('-inf'))
wei = F.softmax(wei, dim=1)

print(wei)

xbow3 = wei @ x 
torch.allclose(xbow2, xbow3)

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.0000, 0.0000, 0.0000],
        [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000],
        [0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.0000],
        [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250]])


True

# Self Attention

In [63]:
torch.manual_seed(1337)
B,T,C = 4,8,32
x = torch.randn(B,T,C)

# Implement a single head of self att
head_size = 16 
key = nn.Linear(C,head_size,bias=False)
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)
print(key)

k = key(x)    # (B,T,16)
q = query(x)  # (B,T,16)
v = value(x)

print(k.shape)

wei = q @ k.transpose(-2,-1) # (B,T,16) @ (B,16,T) ---> (B,T,T)

tril = torch.tril(torch.ones(T,T))
wei = wei.masked_fill(tril==0, float('-inf'))
wei = F.softmax(wei, dim=1)


out = wei @ v 
# out = wei @ x


out.shape

Linear(in_features=32, out_features=16, bias=False)
torch.Size([4, 8, 16])


torch.Size([4, 8, 16])

Testing fuctionality of nn.Linear in torch for myself

In [64]:
ex1 = torch.randn(2,3,7)
print(ex1.shape)
lex = nn.Linear(7, 11, bias=False)

j = lex(ex1)

print(j.shape)


torch.Size([2, 3, 7])
torch.Size([2, 3, 11])


Why divide by the sqrt(head_size) for the attention calculation -> TO normalize the variance <br>

Otherwise wei will start to converge towards one hot vectors

In [65]:
k1 = torch.randn(B,T,head_size)
q1 = torch.randn(B,T,head_size)
wei1 = q1 @ k1.transpose(-2,-1) 
wei_norm = wei * head_size**-0.5

In [66]:
print(k1.var())
print(q1.var())
print(wei1.var())
print(wei_norm.var())


tensor(0.9859)
tensor(1.0134)
tensor(15.6108)
tensor(0.0030, grad_fn=<VarBackward0>)


In [None]:
print(torch.softmax)