In [1]:
import torch
from scratchGPT.utils import Tokenizer, Dataset
import numpy as np

# Seed torch's global RNG so the torch.normal(...) draws in the cells below are repeatable.
torch.manual_seed(41)

<torch._C.Generator at 0x107b9de90>

In [2]:
# Read the whole training corpus into one string.
# The encoding is passed explicitly: without it open() decodes with the
# platform's locale encoding, which differs between machines.
with open('data/shakespeare.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [3]:
# Construct the project Tokenizer from the corpus, then show the first 100
# characters both as raw text and as the ids returned by tokenizer.encode.
tokenizer = Tokenizer(text)
print('Data as full text ------>', text[:100], sep='\n')
print('Data as tokenized integers ------>', tokenizer.encode(text[:100]), sep='\n')

Data as full text ------>
First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You
Data as tokenized integers ------>
[43, 49, 16, 55, 56, 34, 21, 49, 56, 49, 19, 47, 27, 10, 61, 41, 47, 11, 52, 16, 47, 34, 64, 47, 34, 50, 16, 52, 0, 47, 47, 1, 34, 14, 27, 33, 34, 11, 53, 16, 56, 63, 47, 16, 48, 34, 63, 47, 14, 16, 34, 12, 47, 34, 55, 50, 47, 14, 30, 6, 61, 61, 32, 13, 13, 10, 61, 57, 50, 47, 14, 30, 48, 34, 55, 50, 47, 14, 30, 6, 61, 61, 43, 49, 16, 55, 56, 34, 21, 49, 56, 49, 19, 47, 27, 10, 61, 9, 52, 53]


In [4]:
### model parameters
n = 20 # context size
v = len(tokenizer) # vocab size
d = 64 # embedding dimension
a = 4 # number of attention heads

# The a per-head outputs (each d_v wide) are concatenated and multiplied by a
# (d, d) matrix further down, so d has to split evenly across the heads;
# otherwise the floor division below silently drops columns.
assert d % a == 0, "embedding dimension d must be divisible by the number of heads a"
d_k = d // a # query/key width per head
d_v = d_k # value width per head

In [5]:
# Encode the first n characters of the corpus and wrap the resulting ids in a
# tensor; T is used below to index rows of the embedding matrix E.
T = torch.tensor(tokenizer.encode(text[:n])) # tokenized text
print(T.shape)

torch.Size([20])


In [14]:
# Randomly initialised lookup tables, drawn from a normal with mean 0 and std 0.02.
E = torch.normal(mean=0.0, std=0.02, size=(v, d)) # embedding matrix: one row per vocab id
P = torch.normal(mean=0.0, std=0.02, size=(n, d)) # positional encoding: one row per context position
print(E.shape)

torch.Size([65, 64])


In [15]:
def layer_norm(x: torch.Tensor, gamma=0.5, beta=0.5, eps: float = 1e-5):
    """Layer-normalise ``x`` over its last dimension, then scale and shift.

    Each vector along the last dimension (one token's features when ``x`` is
    ``n x d``) is normalised independently to zero mean and unit variance.
    Statistics are NOT shared across rows, so one token's activations cannot
    influence another token's normalisation.

    Args:
        x: Tensor whose last dimension holds the features to normalise.
        gamma: Scale applied after normalisation (scalar or broadcastable tensor).
        beta: Shift applied after scaling (scalar or broadcastable tensor).
        eps: Small constant added to the variance so constant vectors do not
            divide by zero.

    Returns:
        Tensor with the same shape as ``x``.
    """
    mean = x.mean(dim=-1, keepdim=True)
    centered = x - mean
    # Biased (population) variance, as in the standard layer-norm definition.
    var = (centered ** 2).mean(dim=-1, keepdim=True)
    return centered / torch.sqrt(var + eps) * gamma + beta

In [16]:
# Look up the token embeddings, add the positional rows, and layer-normalise the sum.
X = layer_norm(E[T] + P[:n]) # n x d
print(X.shape) # n x d

torch.Size([20, 64])


In [18]:
# query, key, value
W_Q = [torch.normal(0, 0.02, (d, d_k)) for _ in range(a)]
W_K = [torch.normal(0, 0.02, (d, d_k)) for _ in range(a)]
W_V = [torch.normal(0, 0.02, (d, d_v)) for _ in range(a)]

print(W_Q[0].shape)

torch.Size([64, 16])


In [19]:
# Project the inputs X through each head's query / key / value matrix.
Q = [X @ w_q for w_q in W_Q]
K = [X @ w_k for w_k in W_K]
V = [X @ w_v for w_v in W_V]

In [20]:
def masked_querykey(Q, K):
    """Return the causally masked attention scores ``Q @ K^T``.

    Entry ``[i, j]`` is the dot product of query ``i`` with key ``j`` when
    ``j <= i`` and ``-inf`` when ``j > i``, so a later softmax gives zero
    weight to future positions.

    The mask is built from the positions themselves rather than from the
    score values, so a score that happens to be exactly 0 on or below the
    diagonal is kept instead of being turned into ``-inf``.

    Args:
        Q: Query matrix of shape ``(..., n_q, d_k)``.
        K: Key matrix of shape ``(..., n_k, d_k)``.

    Returns:
        Tensor of shape ``(..., n_q, n_k)`` with future positions set to ``-inf``.
    """
    scores = Q @ K.transpose(-2, -1)
    n_q, n_k = scores.shape[-2], scores.shape[-1]
    rows = torch.arange(n_q, device=scores.device).unsqueeze(1)
    cols = torch.arange(n_k, device=scores.device).unsqueeze(0)
    future = cols > rows  # True strictly above the diagonal
    return scores.masked_fill(future, float('-inf'))

In [21]:
# Masked scores for every head, divided by sqrt(d_k).
Ma = [masked_querykey(q, k) / np.sqrt(d_k) for q, k in zip(Q, K)]
print(Ma[0].shape)

torch.Size([20, 20])


In [25]:
def softmax(x: torch.Tensor, dim: int = -1):
    """Softmax of ``x`` along ``dim`` (the last dimension by default).

    Every slice along ``dim`` is normalised independently and sums to 1, so
    for an ``n x n`` score matrix each row becomes its own distribution.
    For a 1-D input this is the same as normalising over the whole vector.

    The per-slice maximum is subtracted before exponentiating; this leaves
    the result unchanged mathematically but keeps ``exp`` from overflowing
    on large inputs. ``-inf`` entries get weight 0. A slice that is entirely
    ``-inf`` has no valid distribution and yields NaN.

    Args:
        x: Input scores.
        dim: Dimension along which the probabilities are normalised.

    Returns:
        Tensor with the same shape as ``x``.
    """
    shifted = x - x.max(dim=dim, keepdim=True).values
    exps = torch.exp(shifted)
    return exps / exps.sum(dim=dim, keepdim=True)

In [26]:
# Apply softmax to each head's scores, weight that head's values with the
# result, and join the per-head outputs side by side along the feature axis.
SoftMa = [softmax(scores) for scores in Ma]
H = torch.cat([weights @ values for weights, values in zip(SoftMa, V)], dim=1)
print(H.shape)

torch.Size([20, 64])


In [28]:
# Output projection of the concatenated heads, added back onto the block
# input X (residual connection) and then layer-normalised.
W_O = torch.normal(mean=0.0, std=0.02, size=(d, d))
O = layer_norm(X + H @ W_O)
print(O.shape)

torch.Size([20, 64])


In [30]:
# feed forward
W_1 = torch.normal(0, 0.02, (d, d))
W_2 = torch.normal(0, 0.02, (d, d))
b_1 = torch.zeros(d)
b_2 = torch.zeros(d)

In [31]:
def relu(x: torch.Tensor):
    """Rectified linear unit: element-wise maximum of ``x`` and 0."""
    floor = torch.zeros_like(x)
    return torch.maximum(x, floor)

In [32]:
# Feed-forward block: affine map with W_1/b_1, ReLU, then affine map with W_2/b_2.
F = relu(O @ W_1 + b_1) @ W_2 + b_2
F.shape

torch.Size([20, 64])

In [34]:
# Residual connection: add the feed-forward result F back onto its input O.
output = O + F
output.shape

torch.Size([20, 64])

In [35]:
# Take the vector at the last position and dot it with every row of E (the same
# matrix used for the input lookup), giving one score per vocab entry.
unembedd = output[-1] @ E.T
unembedd.shape

torch.Size([65])

In [36]:
# Normalise the per-vocab scores with softmax so they form a distribution summing to 1.
probabilities = softmax(unembedd)
probabilities.shape

torch.Size([65])