In [1]:
import torch
from scratchGPT.utils import Tokenizer, Dataset
import numpy as np

# Seed torch's global RNG so the torch.randn draws in the cells below are repeatable.
torch.manual_seed(41)

<torch._C.Generator at 0x13769de90>

In [2]:
# Read the whole corpus into memory as one string.
# The encoding is pinned so decoding does not depend on the platform's locale default.
with open('data/shakespeare.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [3]:
tokenizer = Tokenizer(text)

# Show the same 100-character prefix twice: as raw text, then as encoded ids.
print('Data as full text ------>', text[:100], sep='\n')
print('Data as tokenized integers ------>', tokenizer.encode(text[:100]), sep='\n')

Data as full text ------>
First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You
Data as tokenized integers ------>
[31, 56, 3, 24, 62, 4, 32, 56, 62, 56, 39, 51, 12, 17, 7, 30, 51, 33, 19, 3, 51, 4, 0, 51, 4, 14, 3, 19, 58, 51, 51, 40, 4, 43, 12, 46, 4, 33, 1, 3, 62, 5, 51, 3, 18, 4, 5, 51, 43, 3, 4, 64, 51, 4, 24, 14, 51, 43, 25, 41, 7, 7, 42, 28, 28, 17, 7, 57, 14, 51, 43, 25, 18, 4, 24, 14, 51, 43, 25, 41, 7, 7, 31, 56, 3, 24, 62, 4, 32, 56, 62, 56, 39, 51, 12, 17, 7, 26, 19, 1]


In [20]:
### model parameters
n = 20  # context size
v = len(tokenizer)  # vocab size
d = 64  # embedding dimension
a = 4  # number of attention heads
# The per-head outputs are concatenated and multiplied by a (d, d) matrix later
# in the notebook, which only lines up when d splits evenly across the heads.
assert d % a == 0, f"embedding dimension {d} must be divisible by the number of heads {a}"
d_k = d // a  # per-head query/key width
d_v = d_k  # per-head value width

In [14]:
# Tokenized text: encode the first n characters and wrap the ids in a tensor.
T = torch.tensor(
    tokenizer.encode(text[:n])
)
print(T.size())

torch.Size([20])


In [None]:
# Embedding matrix: one randomly drawn d-wide row per vocabulary entry.
E = torch.randn((v, d))
print(E.size())

torch.Size([65, 64])


In [87]:
def layer_norm(x: torch.Tensor, gamma=1, beta=0, eps=1e-5):
    """Normalize each vector along the last axis to zero mean and unit variance.

    Statistics are computed independently for every slice along the last
    dimension, so a row's result does not depend on the other rows of ``x``.

    Args:
        x: Tensor whose last axis is normalized.
        gamma: Scale applied after normalization (scalar or broadcastable).
        beta: Shift applied after scaling (scalar or broadcastable).
        eps: Small constant added to the variance so a constant slice does
            not divide by zero.

    Returns:
        Tensor with the same shape as ``x``.
    """
    mean = x.mean(dim=-1, keepdim=True)
    centered = x - mean
    # Biased (population) variance, i.e. divide by the slice length.
    variance = (centered ** 2).mean(dim=-1, keepdim=True)
    return centered / torch.sqrt(variance + eps) * gamma + beta

In [80]:
# Look up one embedding row per token id.
X = E[T]
print(X.size())  # n x d
# Normalize the embeddings before they enter the attention block.
X = layer_norm(X)

torch.Size([20, 64])


In [81]:
# query, key, value
W_Q = [torch.randn(d, d_k) for _ in range(a)]
W_K = [torch.randn(d, d_k) for _ in range(a)]
W_V = [torch.randn(d, d_v) for _ in range(a)]

print(W_Q[0].shape)

torch.Size([64, 16])


In [82]:
# Project the embeddings through every head's query / key / value matrix.
Q = [torch.matmul(X, w_q) for w_q in W_Q]
K = [torch.matmul(X, w_k) for w_k in W_K]
V = [torch.matmul(X, w_v) for w_v in W_V]

In [83]:
def masked_querykey(Q, K):
    """Compute query-key scores with every future position set to -inf.

    Entry ``[i, j]`` of the result is the dot product of query ``i`` with
    key ``j`` when ``j <= i`` and ``-inf`` when ``j > i``. The mask is built
    from positions rather than from the score values, so a score that happens
    to be exactly zero at an allowed position is kept.

    Args:
        Q: Float tensor of queries, shape ``(..., n_q, d_k)``.
        K: Float tensor of keys, shape ``(..., n_k, d_k)``.

    Returns:
        Float tensor of shape ``(..., n_q, n_k)`` holding the masked scores.
    """
    scores = Q @ K.transpose(-2, -1)
    n_q, n_k = scores.shape[-2], scores.shape[-1]
    query_pos = torch.arange(n_q, device=scores.device).unsqueeze(-1)
    key_pos = torch.arange(n_k, device=scores.device)
    # True strictly above the diagonal, i.e. where a key lies in the future.
    future = key_pos > query_pos
    return scores.masked_fill(future, float('-inf'))

In [84]:
# Masked attention scores for every head, divided by sqrt(d_k).
Ma = [masked_querykey(q, k) / np.sqrt(d_k) for q, k in zip(Q, K)]
print(Ma[0].size())

torch.Size([20, 20])


In [85]:
def softmax(x: torch.Tensor, dim: int = -1):
    """Softmax along one axis, so every slice along ``dim`` sums to 1.

    The per-slice maximum is subtracted before exponentiating; this leaves the
    result unchanged mathematically but keeps ``exp`` from overflowing on
    large inputs. ``-inf`` entries come out as exactly 0. A slice made up
    entirely of ``-inf`` yields NaN.

    Args:
        x: Input tensor of logits.
        dim: Axis to normalize over (default: the last one).

    Returns:
        Tensor of the same shape as ``x``.
    """
    shifted = x - x.max(dim=dim, keepdim=True).values
    exps = torch.exp(shifted)
    return exps / exps.sum(dim=dim, keepdim=True)

In [86]:
# Turn every head's masked scores into attention weights.
SoftMa = list(map(softmax, Ma))
# Weight each head's values and lay the head outputs side by side.
H = torch.cat([weights @ values for weights, values in zip(SoftMa, V)], dim=1)
print(H.size())

torch.Size([20, 64])


In [98]:
# Output projection of the concatenated heads, plus the residual connection.
W_O = torch.randn((d, d))
O = X + torch.matmul(H, W_O)
print(O.size())

# Normalize the result before the feed-forward stage.
O = layer_norm(O)

torch.Size([20, 64])


In [96]:
# feed forward
W_1 = torch.randn(d, d)
W_2 = torch.randn(d, d)
b_1 = torch.randn(d)
b_2 = torch.randn(d)

In [101]:
def relu(x: torch.Tensor):
    """Return the element-wise maximum of ``x`` and zero."""
    zero_floor = torch.zeros_like(x)
    return x.max(zero_floor)

In [102]:
# Feed-forward: linear map, ReLU, then a second linear map.
F = torch.matmul(relu(torch.matmul(O, W_1) + b_1), W_2) + b_2
F.size()

torch.Size([20, 64])

In [105]:
# Residual connection around the feed-forward stage.
output = torch.add(O, F)
output.size()

torch.Size([20, 64])

In [109]:
# Score the last position's vector against every row of the embedding matrix.
unembedd = torch.matmul(output[-1], E.t())
unembedd.size()

torch.Size([65])

In [110]:
# Convert the vocabulary scores into a probability vector.
probabilities = softmax(unembedd)
probabilities.size()

torch.Size([65])