In [1]:
import torch

In [17]:
torch.manual_seed(1337)
B,T,C = 4,8,2
x = torch.randn(B,T,C)
x.shape

torch.Size([4, 8, 2])

In [18]:
xbow = torch.zeros((B,T,C))
for b in range(B):
    for t in range(T):
        xprev = x[b, :t+1]
        xbow[b,t] = torch.mean(xprev, 0)
xbow

tensor([[[ 0.1808, -0.0700],
         [-0.0894, -0.4926],
         [ 0.1490, -0.3199],
         [ 0.3504, -0.2238],
         [ 0.3525,  0.0545],
         [ 0.0688, -0.0396],
         [ 0.0927, -0.0682],
         [-0.0341,  0.1332]],

        [[ 1.3488, -0.1396],
         [ 0.8173,  0.4127],
         [-0.1342,  0.4395],
         [ 0.2711,  0.4774],
         [ 0.2421,  0.0694],
         [ 0.0084,  0.0020],
         [ 0.0712, -0.1128],
         [ 0.2527,  0.2149]],

        [[-0.6631, -0.2513],
         [ 0.1735, -0.0649],
         [ 0.1685,  0.3348],
         [-0.1621,  0.1765],
         [-0.2312, -0.0436],
         [-0.1015, -0.2855],
         [-0.2593, -0.1630],
         [-0.3015, -0.2293]],

        [[ 1.6455, -0.8030],
         [ 1.4985, -0.5395],
         [ 0.4954,  0.3420],
         [ 1.0623, -0.1802],
         [ 1.1401, -0.4462],
         [ 1.0870, -0.4071],
         [ 1.0430, -0.1299],
         [ 1.1138, -0.1641]]])

In [19]:
a = torch.tril(torch.ones(3,3))
a = a/torch.sum(a, 1, keepdim=True)
b = torch.randint(0,10, (3,2)).float()
c = a @ b
print(c)

tensor([[8.0000, 6.0000],
        [6.5000, 4.0000],
        [5.6667, 4.0000]])


In [20]:
print(a)
print(b)

tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])
tensor([[8., 6.],
        [5., 2.],
        [4., 4.]])


In [21]:
weights = torch.tril(torch.ones(T, T))
weights = weights/weights.sum(1, keepdim=True)
xbow2 = weights @ x

In [22]:
xbow2

tensor([[[ 0.1808, -0.0700],
         [-0.0894, -0.4926],
         [ 0.1490, -0.3199],
         [ 0.3504, -0.2238],
         [ 0.3525,  0.0545],
         [ 0.0688, -0.0396],
         [ 0.0927, -0.0682],
         [-0.0341,  0.1332]],

        [[ 1.3488, -0.1396],
         [ 0.8173,  0.4127],
         [-0.1342,  0.4395],
         [ 0.2711,  0.4774],
         [ 0.2421,  0.0694],
         [ 0.0084,  0.0020],
         [ 0.0712, -0.1128],
         [ 0.2527,  0.2149]],

        [[-0.6631, -0.2513],
         [ 0.1735, -0.0649],
         [ 0.1685,  0.3348],
         [-0.1621,  0.1765],
         [-0.2312, -0.0436],
         [-0.1015, -0.2855],
         [-0.2593, -0.1630],
         [-0.3015, -0.2293]],

        [[ 1.6455, -0.8030],
         [ 1.4985, -0.5395],
         [ 0.4954,  0.3420],
         [ 1.0623, -0.1802],
         [ 1.1401, -0.4462],
         [ 1.0870, -0.4071],
         [ 1.0430, -0.1299],
         [ 1.1138, -0.1641]]])

In [23]:
from torch.nn import functional as F

In [26]:
tril = torch.tril(torch.ones(1,1))
weights = torch.zeros((T,T))
weights = weights.masked_fill(tril==0, float("-inf"))
weights = F.softmax(weights, dim=-1)
xbow3 = weights @ x
xbow3[0]

tensor([[-0.0341,  0.1332],
        [-0.0341,  0.1332],
        [-0.0341,  0.1332],
        [-0.0341,  0.1332],
        [-0.0341,  0.1332],
        [-0.0341,  0.1332],
        [-0.0341,  0.1332],
        [-0.0341,  0.1332]])

In [25]:
xbow2[0]

tensor([[ 0.1808, -0.0700],
        [-0.0894, -0.4926],
        [ 0.1490, -0.3199],
        [ 0.3504, -0.2238],
        [ 0.3525,  0.0545],
        [ 0.0688, -0.0396],
        [ 0.0927, -0.0682],
        [-0.0341,  0.1332]])

In [29]:
with open("tiny_shakespear.txt", "r") as stream:
    text = stream.read()
    
chars = sorted(list(set(text)))
vocab_size = len(chars)
print("".join(chars))
print(vocab_size)


 !',-.:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
62


In [31]:
stoi = {ch:i for i,ch in enumerate(chars)}
itos = {i:ch for i,ch in enumerate(chars)}

encode = lambda x: [stoi[c] for c in x]
decode = lambda x: "".join([itos[c] for c in x])

print(decode(encode("hello")))

hello


In [32]:
data = torch.tensor(encode(text))
data

tensor([29, 43, 36,  ..., 29, 30, 28])

In [35]:
block_size = 8
batch_size = 4
def get_batch():
    ix = torch.randint(len(data)-block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    return x, y

x, y  = get_batch()
print(x)
print(y)

tensor([[ 7,  0, 18, 41,  1, 55, 43, 50],
        [56, 49, 39,  1, 37, 44, 53, 55],
        [36, 46, 40,  1, 60, 50, 56,  1],
        [48, 40,  6,  0,  0, 22, 40, 54]])
tensor([[ 0, 18, 41,  1, 55, 43, 50, 56],
        [49, 39,  1, 37, 44, 53, 55, 43],
        [46, 40,  1, 60, 50, 56,  1, 43],
        [40,  6,  0,  0, 22, 40, 54, 54]])


In [42]:
import torch.nn as nn
B, T = x.shape
tok_emb_table = nn.Embedding(vocab_size, 32)
pos_emb_table = nn.Embedding(block_size, 32)

tok_emb = tok_emb_table(x)
pos_emb = pos_emb_table(torch.arange(T))
X = tok_emb + pos_emb

print(x[0])
print(tok_emb[0][0])
print(pos_emb[0])

tensor([ 7,  0, 18, 41,  1, 55, 43, 50])
tensor([ 0.6144,  0.7587,  1.4836,  1.7352,  0.8804,  1.4760,  0.6311,  0.4358,
        -1.0027,  0.4420, -0.1036, -1.1312, -0.3247, -0.0322, -0.2953,  1.1953,
        -2.2604,  0.7947,  0.0066,  1.4796, -1.3218, -0.0290, -1.2789, -1.4618,
        -1.0839, -1.2710,  0.9819, -0.3834,  0.7289, -0.4512,  2.8492, -0.6424],
       grad_fn=<SelectBackward0>)
tensor([-6.6243e-03, -4.7309e-01, -1.3021e+00, -6.5491e-01,  1.2268e+00,
         9.2907e-01, -8.5324e-01, -1.2480e+00,  4.2771e-01,  5.7282e-01,
        -1.3896e+00, -5.1169e-01, -2.4553e-01, -1.9685e-01,  1.5873e+00,
        -1.8413e+00,  1.0065e-01,  2.7351e-01, -5.8766e-01, -1.4081e+00,
         1.1223e+00, -1.5770e-03,  1.0404e-01, -8.2817e-01, -1.0269e-01,
         6.0356e-01, -8.0791e-01, -2.7952e-01,  9.4910e-01, -1.7974e+00,
        -1.7857e+00, -2.1206e+00], grad_fn=<SelectBackward0>)


In [49]:
len([ 0.6144,  0.7587,  1.4836,  1.7352,  0.8804,  1.4760,  0.6311,  0.4358,
        -1.0027,  0.4420, -0.1036, -1.1312, -0.3247, -0.0322, -0.2953,  1.1953,
        -2.2604,  0.7947,  0.0066,  1.4796, -1.3218, -0.0290, -1.2789, -1.4618,
        -1.0839, -1.2710,  0.9819, -0.3834,  0.7289, -0.4512,  2.8492, -0.6424])

32

In [43]:
len([-6.6243e-03, -4.7309e-01, -1.3021e+00, -6.5491e-01,  1.2268e+00,
         9.2907e-01, -8.5324e-01, -1.2480e+00,  4.2771e-01,  5.7282e-01,
        -1.3896e+00, -5.1169e-01, -2.4553e-01, -1.9685e-01,  1.5873e+00,
        -1.8413e+00,  1.0065e-01,  2.7351e-01, -5.8766e-01, -1.4081e+00,
         1.1223e+00, -1.5770e-03,  1.0404e-01, -8.2817e-01, -1.0269e-01,
         6.0356e-01, -8.0791e-01, -2.7952e-01,  9.4910e-01, -1.7974e+00,
        -1.7857e+00, -2.1206e+00])

32

In [51]:
 0.7587+-4.7309e-01

0.28561000000000003

In [45]:
print(X[0][0])

tensor([ 0.6078,  0.2856,  0.1814,  1.0803,  2.1072,  2.4051, -0.2222, -0.8122,
        -0.5750,  1.0149, -1.4932, -1.6429, -0.5702, -0.2290,  1.2920, -0.6460,
        -2.1598,  1.0682, -0.5810,  0.0715, -0.1994, -0.0306, -1.1749, -2.2900,
        -1.1866, -0.6675,  0.1740, -0.6629,  1.6780, -2.2486,  1.0634, -2.7630],
       grad_fn=<SelectBackward0>)


In [46]:
len([ 0.6078,  0.2856,  0.1814,  1.0803,  2.1072,  2.4051, -0.2222, -0.8122,
        -0.5750,  1.0149, -1.4932, -1.6429, -0.5702, -0.2290,  1.2920, -0.6460,
        -2.1598,  1.0682, -0.5810,  0.0715, -0.1994, -0.0306, -1.1749, -2.2900,
        -1.1866, -0.6675,  0.1740, -0.6629,  1.6780, -2.2486,  1.0634, -2.7630])

32

In [48]:
tok_emb

tensor([[[ 0.6144,  0.7587,  1.4836,  ..., -0.4512,  2.8492, -0.6424],
         [ 0.3889, -0.3313,  0.6967,  ..., -0.7808, -0.6466,  0.8176],
         [-1.3996,  0.7401,  0.3637,  ..., -1.0731, -0.7737, -0.8434],
         ...,
         [ 1.7527, -0.1127,  0.4896,  ..., -0.1318, -0.2942, -1.7126],
         [-0.4028,  0.5934, -0.2005,  ...,  0.5076,  0.8531,  0.3289],
         [-0.5697, -0.5993, -0.5512,  ...,  0.0207, -1.1428,  0.0883]],

        [[ 0.6894, -0.7158, -0.6840,  ..., -0.3224, -0.0434, -0.4698],
         [ 0.2359,  0.0644, -0.2274,  ...,  0.3454, -1.5387, -1.7038],
         [ 0.5790,  1.1480, -1.0644,  ..., -0.1905,  1.4490,  1.8474],
         ...,
         [-1.3031, -0.4404, -1.1598,  ...,  0.0622,  0.7251,  0.3281],
         [-0.8758, -1.8439,  2.6802,  ..., -0.9770, -2.3757, -1.4104],
         [ 1.7527, -0.1127,  0.4896,  ..., -0.1318, -0.2942, -1.7126]],

        [[-1.0945, -2.7225, -1.5922,  ...,  1.1276,  1.3466, -0.4755],
         [ 0.7720, -0.8700,  0.2945,  ...,  1

In [53]:
C

2

In [62]:
# B,T,C = 
n_embd = 32
head_size = 16
key = nn.Linear(32, head_size, bias=False)
queries = nn.Linear(32, head_size, bias=False)
val = nn.Linear(32, head_size, bias=False)

k = key(X)
q = queries(X)
v = val(X)

wei = q @ k.transpose(-2, -1) * head_size**-0.5
tril = torch.tril(torch.ones(8,8))
wei = wei.masked_fill(tril==0, float("-inf"))
wei = F.softmax(wei, dim=-1)
out = wei @ v

wei[0]


tensor([[-0.7729, -1.0834, -0.5939,  1.6812,  2.3110, -0.4750,  3.2580,  3.5340],
        [ 1.3278,  5.2149,  1.7303,  3.6606,  4.2795, -2.9949,  3.8240,  2.6498],
        [-0.4212,  4.3974, -2.1648,  6.0795,  1.4370, -2.3196, -1.7588,  0.0981],
        [ 2.2616,  2.7659,  1.3712,  0.5402,  3.6694, -6.2997,  2.7413,  1.2735],
        [-1.2647, -4.1999,  0.9154, -1.6294, -2.6756,  0.4014, -2.7229, -2.5656],
        [-0.1244,  1.8457,  0.2667,  2.5553,  0.2832, -1.7920,  1.6791,  0.2111],
        [-1.8820, -5.0723, -3.0372, -1.8572,  0.2825,  0.3319,  0.7912,  3.6247],
        [-1.6062, -2.8592,  4.1347, -0.9649, -2.1262,  0.9069,  0.2497, -1.8599]],
       grad_fn=<SelectBackward0>)

In [64]:
out

tensor([[[ 0.6078,  0.2856,  0.1814,  ..., -2.2486,  1.0634, -2.7630],
         [ 0.3609, -0.0511,  1.9099,  ..., -1.6045, -0.2529, -1.7766],
         [ 0.3553, -0.0541,  1.9288,  ..., -1.5950, -0.2721, -1.7657],
         ...,
         [-0.0957,  1.6891, -0.5089,  ..., -1.6220, -1.4316, -1.2145],
         [-0.3532,  0.4330, -0.5455,  ..., -0.3181, -0.4278,  0.3477],
         [-1.3888,  0.7594,  0.1240,  ..., -0.5219, -2.2289, -2.5293]],

        [[ 0.6828, -1.1888, -1.9862,  ..., -2.1198, -1.8291, -2.5904],
         [ 0.4073, -0.3124, -0.2596,  ..., -1.1699, -1.4519, -3.5591],
         [ 0.2211,  0.3041,  0.8983,  ..., -0.5025, -1.1763, -4.1643],
         ...,
         [-0.2331,  0.3844,  0.1388,  ..., -0.0613, -0.2756, -1.3672],
         [ 0.0088, -0.1168, -0.7681,  ..., -0.6990, -0.8922, -0.8986],
         [-0.0214,  0.3691,  0.6458,  ..., -0.4186, -1.3051, -3.3776]],

        [[-1.1011, -3.1956, -2.8943,  ..., -0.6698, -0.4391, -2.5961],
         [ 0.5389, -0.8793,  1.0606,  ...,  0

In [66]:
wei[0]

tensor([[1.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00],
        [2.0091e-02, 9.7991e-01, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00],
        [8.0019e-03, 9.9060e-01, 1.3994e-03, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00],
        [3.0816e-01, 5.1025e-01, 1.2649e-01, 5.5103e-02, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00],
        [9.2265e-02, 4.9013e-03, 8.1626e-01, 6.4068e-02, 2.2506e-02, 0.0000e+00,
         0.0000e+00, 0.0000e+00],
        [3.8577e-02, 2.7664e-01, 5.7042e-02, 5.6247e-01, 5.7987e-02, 7.2796e-03,
         0.0000e+00, 0.0000e+00],
        [2.8794e-02, 1.1851e-03, 9.0699e-03, 2.9517e-02, 2.5081e-01, 2.6352e-01,
         4.1710e-01, 0.0000e+00],
        [2.9880e-03, 8.5358e-04, 9.3039e-01, 5.6746e-03, 1.7765e-03, 3.6885e-02,
         1.9116e-02, 2.3187e-03]], grad_fn=<SelectBackward0>)