In [21]:
import torch
import torch.nn as nn
from torch.nn import functional as F
import numpy as np

In [2]:
# Read the entire training corpus into memory as a single string.
# The encoding is given explicitly so decoding does not depend on the platform default.
with open('training_data.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [22]:
# Character-level vocabulary: every distinct character in the corpus, in
# sorted (code point) order so the char -> id mapping is reproducible.
chars = sorted(set(text))
vocab_size = len(chars)

print(len(text))        # corpus length in characters
print(vocab_size)       # number of distinct characters
print(''.join(chars))   # the vocabulary itself

320123
118
	
 !"#%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]_abcdefghijklmnopqrstuvwxyz{|}~²¾×á˙αγθϵ–—‘’“”•…−√∧∨≈△▽


In [4]:
# Lookup tables between characters and their integer ids
# (an id is the character's position in the sorted `chars` list).
stoi = {ch: idx for idx, ch in enumerate(chars)}
itos = dict(enumerate(chars))

def encode(s):
    """Turn a string into its list of integer ids (KeyError on a character not in `chars`)."""
    return [stoi[c] for c in s]

def decode(l):
    """Turn an iterable of integer ids back into the string they spell."""
    return ''.join(itos[i] for i in l)

# Round-trip check: decode(encode(s)) must reproduce s.
test = "heyy, here's some test text, baby"
print(encode(test))
print(decode(encode(test)))

[71, 68, 88, 88, 13, 2, 71, 68, 81, 68, 8, 82, 2, 82, 78, 76, 68, 2, 83, 68, 82, 83, 2, 83, 68, 87, 83, 13, 2, 65, 64, 65, 88]
heyy, here's some test text, baby


In [19]:
# Encode the whole corpus as one 1-D tensor of character ids
# (one element per character of `text`).
data = torch.tensor(encode(text))
print(data.shape, data.dtype)

torch.Size([320123]) torch.int64


Splitting the data between training and validation sets

In [15]:
# The first 90% of the encoded corpus is used for training; the remaining
# tail is held out for validation.
n = int(0.9*len(data))
train_data, val_data = data[:n], data[n:]

In [16]:
block_size = 8  # number of characters in one context window
# block_size + 1 consecutive ids: enough for block_size (context, next-character) pairs
train_data[:block_size+1]

tensor([46, 14, 21,  1,  1, 34, 65, 82, 83])

demonstrating variable context between 1 and block size

In [17]:
# One window of block_size + 1 ids yields block_size training examples:
# the context grows from 1 to block_size characters, and the target is
# always the character that follows the context.
x = train_data[:block_size]
Y = train_data[1:block_size+1]   # x shifted left by one position
for t in range(block_size):
    context, target = x[:t+1], Y[t]
    print(f'{context}:{target}')

tensor([46]):14
tensor([46, 14]):21
tensor([46, 14, 21]):1
tensor([46, 14, 21,  1]):1
tensor([46, 14, 21,  1,  1]):34
tensor([46, 14, 21,  1,  1, 34]):65
tensor([46, 14, 21,  1,  1, 34, 65]):82
tensor([46, 14, 21,  1,  1, 34, 65, 82]):83


batch dimension

In [47]:
torch.manual_seed(42)
batch_size = 32   # independent sequences per batch
block_size = 8    # context length of each sequence

def get_batch(split:str):
    """
    Draw a random mini-batch of (input, target) windows.

    `split == 'train'` samples from `train_data`; any other value samples
    from `val_data`. Returns two (batch_size, block_size) tensors where the
    targets are the inputs shifted one position to the right.
    """
    source = train_data if split == 'train' else val_data
    # Random window starts; the upper bound keeps the shifted target window in range.
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs, targets = [], []
    for start in starts:
        inputs.append(source[start:start+block_size])
        targets.append(source[start+1:start+block_size+1])
    return torch.stack(inputs), torch.stack(targets)

xb, yb = get_batch('train')
print(f'inputs: {xb.shape}')
print(xb)
print(f'targets: {yb.shape}')
print(yb)

inputs: torch.Size([32, 8])
tensor([[ 83,  82,   2,  83,  71,  64,  83,   1],
        [ 78,  84,  81,  77,  64,  75,   2,  78],
        [ 82,   2,  78,  77,   2,  65,  68,  83],
        [ 14,  58,   2,  15,   1,  60,  19,  62],
        [ 78,  67,   2,  79,  81,  68,  67,  72],
        [ 77,  82,   2,  69,  78,  81,   2,  48],
        [ 81,  72,  78,   2,  64,  82,   2,  83],
        [  2,  82,  79,  64,  66,  68,  15,   1],
        [ 42,   2,  66,  78,  84,  75,  67,   2],
        [ 72,  83,  71,   2,  37,  53,  36,   1],
        [ 82,   2,  77,  78,  83,   2,  64,  77],
        [ 15,  78,  81,  70,  13,   2,  19,  17],
        [ 68,  67,   2,  72,  77,  83,  78,   2],
        [ 84,  72,  66,  74,   2,  86,  72,  83],
        [  2,  83,  71,  68,   2,  83,  86,  78],
        [  2,  83,  71,  68,   2,  77,  68,  87],
        [  2,  51,  64,  85,  68,  77, 106,  82],
        [ 67,   2,  82,  78,   2,  69,  64,  81],
        [  2,  86,  72,  83,  71,   2,  77,  68],
        [ 71,  64,  83

##### bigram language model (simplest)

In [32]:
torch.manual_seed(42)

class BigramLanguageModel(nn.Module):
    '''
    Bigram language model: the prediction for the next character depends only
    on the current (most recent) character.

    The whole model is one (vocab_size, vocab_size) embedding table; row i
    holds the logits over the next character given that the current one is i.
    '''
    def __init__(self, vocab_size):
        super().__init__()
        # each token id directly looks up the logits for the token that follows it
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        '''
        Compute next-token logits and, if targets are given, the loss.

        idx:     (B,T) tensor of token ids.
        targets: optional (B,T) tensor of token ids to predict.

        Returns (logits, loss):
          * targets is None -> logits has shape (B,T,C) and loss is None
          * otherwise       -> logits is flattened to (B*T,C) and loss is the
                               mean cross-entropy over all B*T positions
        '''
        logits = self.token_embedding_table(idx) # (B,T,C)

        if targets is None:
            return logits, None

        B,T,C = logits.shape
        # cross_entropy expects channels as second dim, so fold batch and time together
        logits = logits.view(B*T,C)
        # reshape (not view) so non-contiguous targets are accepted as well
        targets = targets.reshape(B*T)
        loss = F.cross_entropy(logits, targets)
        return logits, loss

    @torch.no_grad()  # sampling never needs gradients; avoids building an autograd graph per step
    def generate(self, idx, max_new_tok):
        '''
        Autoregressively sample max_new_tok new tokens.

        idx is a (B,T) tensor of indices forming the current context.
        Returns a (B, T + max_new_tok) tensor: the context followed by the samples.
        '''
        for _ in range(max_new_tok):
            logits, _loss = self(idx) # get predictions, (B,T,C)
            logits = logits[:, -1, :] # look at last timestep only, (B,C)
            probs = F.softmax(logits, dim=-1) # get probabilities from softmax
            idx_next = torch.multinomial(probs, num_samples=1) # sample from prob dist, (B,1)
            idx = torch.cat((idx, idx_next), dim=1) # append sample, (B,T+1)
        return idx
    
# Sanity check on the untrained model: one forward pass on the current batch.
m = BigramLanguageModel(vocab_size=vocab_size)
logits, loss = m(xb, yb)
print(logits.shape, loss)


# Sample from the untrained model; the result is expected to be random characters.
idx = torch.zeros((1,1), dtype=torch.long) # 1x1 context holding token id 0 (the tab character in this vocabulary; newline is id 1)
print(decode(m.generate(idx, max_new_tok=50)[0].tolist()))



torch.Size([32, 118]) tensor(5.0969, grad_fn=<NllLossBackward0>)
	0•kjyd&fH¾fB(.~—AtF0*1<×t	B=’[1KK]FJpk=0∧/kgAX•|√*


In [33]:
# PyTorch optimizer: Adam over all parameters of the bigram model `m`, learning rate 1e-3
optimizer = torch.optim.Adam(m.parameters(), lr=1e-3)

In [59]:
batch_size = 32
for steps in range(10000):
    # start every step from clean gradients
    optimizer.zero_grad(set_to_none=True)

    # fresh random batch -> forward pass -> backward pass -> parameter update
    xb, yb = get_batch('train')
    logits, loss = m(xb, yb)
    loss.backward()
    optimizer.step()

# loss of the final mini-batch only (a noisy estimate of the training loss)
print(loss.item())

2.6595726013183594


In [61]:
# Sample 500 characters from the trained bigram model.
idx = torch.zeros((1,1), dtype=torch.long) # 1x1 context holding token id 0 (the tab character in this vocabulary; newline is id 1)
print(decode(m.generate(idx, max_new_tok=500)[0].tolist()))

	B:/6% melype C spe ales o a stitustithisteripuras. 16 ndrass n ovesong be ass arorthexios in MSingumicore n f t woons we f aivevios me Itnd ndongnffte stheleliewex pff tondde hefincer tatornin a as ge, cinous .
US. aripserg t d PTikithaghasse fre 
d? 9028. a o gll p sserer s rry llapr bellig afoncarfe 1. a, o teinveand ch mbehemithaturar 0 me tioflisingucowile mpapttiodendi
SA-
[30.  s resse s 7 ctanthond s 
iches timase en tandilureareroryingall cecongrk-pre MI  qum can led.5
eywhe OBiopodat an


##### self-attention

For a generative (autoregressive) model, attention at a given position should only be applied to that position and its predecessors, never to later positions.

In [76]:
# Toy input for the attention experiments below.
B,T,C = 4,8,32 # batch, time (sequence positions), channels
x = torch.randn(B,T,C)
x.shape

torch.Size([4, 8, 32])

weighted aggregation of predecessors

In [77]:
# Reference implementation with explicit loops: position t of every sequence
# becomes the mean of positions 0..t ("bag of words" over its own past).
xbow = torch.zeros((B,T,C))
for b in range(B):
    for t in range(T):
        xprev = x[b,:t+1]              # (t+1, C): this position and everything before it
        xbow[b,t] = xprev.mean(dim=0)  # average over the time axis

more efficient with matrix multiplication and lower triangle

In [79]:
# Same causal average as a single matrix multiply: row t of `wei` puts equal
# weight 1/(t+1) on positions 0..t and zero weight on the future.
wei = torch.tril(torch.ones(T,T))
wei = wei / wei.sum(dim=1, keepdim=True)   # normalise each row to sum to 1
xbow2 = torch.matmul(wei, x)               # (T,T) broadcast over batch @ (B,T,C) -> (B,T,C)
torch.allclose(xbow,xbow2)

True

Masking the zeros to -inf and applying softmax. Why? This lets the weights be learned, data-dependent affinities (rather than identical, uniform averages).

In [80]:
tril = torch.tril(torch.ones(T,T))
# Affinities start at zero here (later: dot products of queries and keys).
# Filling future positions with -inf makes softmax give them exactly zero weight;
# this mask is the difference between decoder and encoder blocks.
wei = torch.zeros((T,T)).masked_fill(tril == 0, float('-inf'))
wei = wei.softmax(dim=-1)      # each row is now a uniform average over the allowed positions
xbow3 = torch.matmul(wei, x)
torch.allclose(xbow, xbow3)

True

attention is a ***communication*** mechanism. 

In [82]:
torch.manual_seed(42)
B,T,C = 4,8,32
x = torch.randn(B,T,C)

# A single head of causal self-attention.
head_size = 16
key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)
k, q, v = key(x), query(x), value(x) # each (B,T,head_size)

# Scaled dot-product affinities: (B,T,16) @ (B,16,T) ---> (B,T,T).
# The 1/sqrt(head_size) factor normalizes the variance, which affects softmax "sharpness".
wei = torch.matmul(q, k.transpose(-2,-1)) * head_size**-0.5

# Causal mask: a position may attend to itself and earlier positions only.
tril = torch.tril(torch.ones(T,T))
wei = wei.masked_fill(tril == 0, float('-inf'))
wei = wei.softmax(dim=-1)

# Weighted aggregation of the values: (B,T,T) @ (B,T,head_size) -> (B,T,head_size)
out = torch.matmul(wei, v)

out.shape


torch.Size([4, 8, 32])

In [83]:
# Check whether PyTorch's MPS backend is available and, if it is, allocate a tiny
# tensor on it as a smoke test.
if torch.backends.mps.is_available():
    mps_device = torch.device("mps")
    # Deliberately not named `x`: in this notebook `x` holds the (B,T,C) attention
    # input, and overwriting it here would silently break re-running those cells.
    mps_probe = torch.ones(1, device=mps_device)
    print(mps_probe)
else:
    print("MPS device not found.")

tensor([1.], device='mps:0')


In [106]:
import random

In [108]:
# Load the cleaned corpus. The encoding is explicit (matching how the raw corpus is
# read) because the text contains non-ASCII characters and the platform default
# encoding is not guaranteed to be UTF-8.
with open('training_data_cleaned.txt', 'r', encoding='utf-8') as f_c:
    data_str = f_c.read()

# A blank line separates shards; shuffle them in place with a fixed seed so
# the resulting order is reproducible.
data_shards = data_str.split('\n\n')
print(len(data_shards))
random.seed(42)
random.shuffle(data_shards)

67


In [109]:
# Small-scale demo of the seeded split -> shuffle -> join recipe.
a = 'test text is here bro-dogg :)'
a_shards = a.split(' ')

random.seed(42)            # fixed seed -> same permutation on every run
random.shuffle(a_shards)   # in-place shuffle; the number of shards is unchanged
new_a = ' '.join(a_shards)

print(len(a_shards))
print(new_a)

6
here text is bro-dogg test :)
