In [131]:
pwd

'/Users/altayavci/Desktop'

In [132]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, random_split

In [133]:
torch.backends.mps.is_available()
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

In [134]:
device

device(type='mps')

In [135]:
torch.manual_seed(42)

<torch._C.Generator at 0x11fe39a50>

In [136]:
!wget https://raw.githubusercontent.com/hackerb9/ssa-baby-names/refs/heads/main/allnames.txt /Users/altayavci/Desktop/allnames.txt

--2025-02-12 00:20:42--  https://raw.githubusercontent.com/hackerb9/ssa-baby-names/refs/heads/main/allnames.txt
raw.githubusercontent.com (raw.githubusercontent.com) çözümleniyor... 185.199.109.133, 185.199.108.133, 185.199.110.133, ...
raw.githubusercontent.com (raw.githubusercontent.com)[185.199.109.133]:443 bağlanılıyor... bağlantı kuruldu.
HTTP isteği gönderildi, yanıt bekleniyor... 200 OK
Uzunluk: 755579 (738K) [text/plain]
Kayıt yeri: `allnames.txt.4'


2025-02-12 00:20:42 (3,91 MB/s) - `allnames.txt.4' kaydedildi [755579/755579]

Prepended http:// to '/Users/altayavci/Desktop/allnames.txt'
http:///Users/altayavci/Desktop/allnames.txt: Makine ismi geçersiz.
TAMAMLANDI --2025-02-12 00:20:42--
Toplam duvar saati zamanı: 0,6s
İndirilen: 1 dosya, 738K, 0,2s (3,91 MB/s) içerisinde


In [137]:
with open("allnames.txt", "r") as f:
    names = f.read().splitlines()
    names = [name.lower() for name in names]

In [138]:
alphabet = [' '] + sorted(list(set(''.join(names)))) + ['.']
itoc = {i: c for i, c in enumerate(alphabet)}
ctoi = {c: i for i, c in enumerate(alphabet)}

In [139]:
encode = lambda name : [ctoi[c] for c in name]
decode = lambda tokens : ''.join([itoc[i] for i in tokens])

In [140]:
n=int(0.9*len(names))
train_data, val_data = random_split(names, [n, len(names)-n])

class NameDataset(Dataset):
    def __init__(self, names):
        self.names = names
        self.ctoi = ctoi
        self.alphabet_size = len(alphabet)

    def __len__(self):
        return len(self.names)

    def __getitem__(self, idx):
        name = self.names[idx]
        x = [self.ctoi[c] for c in name]  # Convert characters to indices
        y = x[1:] + [self.ctoi[' ']]  # The next character to predict (shifted version of x)
        x = torch.tensor(x).to(device)
        y = torch.tensor(y).to(device)
        return x, y  

In [141]:
def pad_sequences(batch):
    max_len = max([len(x) for x, _ in batch])  # Find the max length in the batch
    padded_x = []
    padded_y = []

    for x, y in batch:
        padded_x.append(F.pad(x, (0, max_len - len(x)), "constant", ctoi[' ']))  # Pad x
        padded_y.append(F.pad(y, (0, max_len - len(x)), "constant", ctoi['.']))  # Pad y

    # Stack the padded sequences to create the batch
    return torch.stack(padded_x), torch.stack(padded_y)

In [142]:
train_dataset = NameDataset(train_data)
val_dataset = NameDataset(val_data)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=pad_sequences)

name = next(iter(train_loader)) # Tuple of (x, target)

In [143]:
print(decode(name[0].tolist()[0]))
print(name[0][0]) # grab the 0th name
print(name[1][0]) # grab the 0th target

hanalee   
tensor([ 8,  1, 14,  1, 12,  5,  5,  0,  0,  0], device='mps:0')
tensor([ 1, 14,  1, 12,  5,  5,  0, 27, 27, 27], device='mps:0')


In [144]:
n_embd=len(alphabet)

x=torch.tensor(encode('altay')).unsqueeze(0) # to add batch dimension

xenc=F.one_hot(x, num_classes=n_embd).float()
print(xenc)

tensor([[[0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
         [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.]]])


In [145]:
print("xenc.shape",xenc.shape)
print("xenc.transpose(-2,-1).shape",xenc.transpose(-2,-1).shape)
print("xenc @ xenc.transpose(-2,-1)\n",xenc @ xenc.transpose(-2,-1)) # (5x28) * (28x5) -> (5x5)

xenc.shape torch.Size([1, 5, 28])
xenc.transpose(-2,-1).shape torch.Size([1, 28, 5])
xenc @ xenc.transpose(-2,-1)
 tensor([[[1., 0., 0., 1., 0.],
         [0., 1., 0., 0., 0.],
         [0., 0., 1., 0., 0.],
         [1., 0., 0., 1., 0.],
         [0., 0., 0., 0., 1.]]])


In [146]:
print((xenc @ xenc.transpose(-2,-1)) @ xenc) # (1x5x5) x (1x5x28) -> (1x5x28)

tensor([[[0., 2., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
         [0., 2., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.]]])


In [147]:
(xenc @ xenc.transpose(-2,-1)).softmax(dim=-1)


tensor([[[0.3222, 0.1185, 0.1185, 0.3222, 0.1185],
         [0.1488, 0.4046, 0.1488, 0.1488, 0.1488],
         [0.1488, 0.1488, 0.4046, 0.1488, 0.1488],
         [0.3222, 0.1185, 0.1185, 0.3222, 0.1185],
         [0.1488, 0.1488, 0.1488, 0.1488, 0.4046]]])

In [148]:
attn = (xenc @ xenc.transpose(-2,-1)).softmax(dim=-1) @ xenc
print(attn)

tensor([[[0.0000, 0.6444, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
          0.0000, 0.0000, 0.0000, 0.0000, 0.1185, 0.0000, 0.0000, 0.0000,
          0.0000, 0.0000, 0.0000, 0.0000, 0.1185, 0.0000, 0.0000, 0.0000,
          0.0000, 0.1185, 0.0000, 0.0000],
         [0.0000, 0.2977, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
          0.0000, 0.0000, 0.0000, 0.0000, 0.4046, 0.0000, 0.0000, 0.0000,
          0.0000, 0.0000, 0.0000, 0.0000, 0.1488, 0.0000, 0.0000, 0.0000,
          0.0000, 0.1488, 0.0000, 0.0000],
         [0.0000, 0.2977, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
          0.0000, 0.0000, 0.0000, 0.0000, 0.1488, 0.0000, 0.0000, 0.0000,
          0.0000, 0.0000, 0.0000, 0.0000, 0.4046, 0.0000, 0.0000, 0.0000,
          0.0000, 0.1488, 0.0000, 0.0000],
         [0.0000, 0.6444, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
          0.0000, 0.0000, 0.0000, 0.0000, 0.1185, 0.0000, 0.0000, 0.0000,
          0.0000, 0.0000, 0.0000, 0.0000, 0.1185, 0.0000,

In [149]:
print("encode('altay')",encode('altay'))
print("attn.argmax(dim=-1)",attn.argmax(dim=-1))
print("decode(attn.argmax(dim=-1))",decode(attn.argmax(dim=-1).tolist()[0]))

encode('altay') [1, 12, 20, 1, 25]
attn.argmax(dim=-1) tensor([[ 1, 12, 20,  1, 25]])
decode(attn.argmax(dim=-1)) altay


In [150]:
B, T, C = xenc.shape
dk = C

query = nn.Linear(C, dk, bias=False)
key = nn.Linear(C, dk, bias=False) 
value = nn.Linear(C, dk, bias=False) 

Q = query(xenc) # B x T x dk
K = key(xenc) # B x T x dk
V = value(xenc) # B x T x dk

In [151]:
attn = ((Q @ K.transpose(-2,-1))/(dk**0.5)).softmax(dim=-1) @ V


In [152]:
decode(attn.argmax(dim=-1)[0].tolist())

'yyyyy'

In [153]:
for _ in range(10):  
  attn_probs = attn.softmax(dim=-1)  # Apply softmax to get probabilities over the vocabulary
  sampled_indices = torch.multinomial(attn_probs.view(-1, attn_probs.size(-1)), 1)
  print(decode(sampled_indices.T[0].tolist()))

kgmkk
v.o s
glbpn
wvuaa
ejirf
hzkjz
rqsdc
owhay
wbhyh
q.fj.


In [154]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class MultiHeadAttention(nn.Module):
    def __init__(self, embed_dim, num_heads):
        super().__init__()
        assert embed_dim % num_heads == 0, "Embedding dimension must be divisible by the number of heads"
        self.num_heads = num_heads
        self.dk = embed_dim // num_heads

        # Linear layers for query, key, and value (in the case of cross-attention, separate inputs are used)
        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=False)
        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=False)
        self.v_proj = nn.Linear(embed_dim, embed_dim, bias=False)
        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=False)

    def forward(self, q, k, v):
        B, T, C = q.shape  # Assuming q, k, v have the same shape (B: batch size, T: sequence length, C: embedding dim)

        # Project Q, K, V using their respective linear layers
        q = self.q_proj(q)  # Shape: (B, T, C)
        k = self.k_proj(k)  # Shape: (B, T, C)
        v = self.v_proj(v)  # Shape: (B, T, C)

        # Reshape into (B, num_heads, T, dk)
        q = q.view(B, T, self.num_heads, self.dk).transpose(1, 2)  # (B, heads, T, dk)
        k = k.view(B, T, self.num_heads, self.dk).transpose(1, 2)  # (B, heads, T, dk)
        v = v.view(B, T, self.num_heads, self.dk).transpose(1, 2)  # (B, heads, T, dk)

        # Scaled dot-product attention
        attn_weights = (q @ k.transpose(-2, -1)) / (self.dk ** 0.5)  # (B, heads, T, T)
        attn_weights = F.softmax(attn_weights, dim=-1)
        attn_output = attn_weights @ v  # (B, heads, T, dk)

        # Combine heads back to (B, T, C)
        attn_output = attn_output.transpose(1, 2).contiguous().view(B, T, C)

        # Final linear projection
        return self.out_proj(attn_output)


In [155]:
m = MultiHeadAttention(28, 4)
attn = m(xenc,xenc,xenc)

In [156]:
attn

tensor([[[-0.0633, -0.0449, -0.0528, -0.0069,  0.0101, -0.0150, -0.0305,
           0.0099, -0.0229, -0.0366,  0.0186, -0.0557, -0.0105, -0.0334,
          -0.0282,  0.0455, -0.0051,  0.0049,  0.0902, -0.0498, -0.0040,
           0.0289, -0.0061, -0.0033, -0.0384,  0.0400, -0.0423, -0.0429],
         [-0.0630, -0.0442, -0.0530, -0.0066,  0.0099, -0.0152, -0.0299,
           0.0095, -0.0230, -0.0363,  0.0186, -0.0551, -0.0108, -0.0333,
          -0.0279,  0.0458, -0.0044,  0.0054,  0.0898, -0.0498, -0.0039,
           0.0292, -0.0061, -0.0031, -0.0377,  0.0406, -0.0425, -0.0420],
         [-0.0635, -0.0447, -0.0528, -0.0066,  0.0102, -0.0154, -0.0304,
           0.0095, -0.0230, -0.0361,  0.0187, -0.0553, -0.0107, -0.0333,
          -0.0280,  0.0463, -0.0045,  0.0049,  0.0900, -0.0498, -0.0042,
           0.0295, -0.0066, -0.0035, -0.0384,  0.0407, -0.0428, -0.0429],
         [-0.0633, -0.0449, -0.0528, -0.0069,  0.0101, -0.0150, -0.0305,
           0.0099, -0.0229, -0.0366,  0.0186, -0

In [157]:
attn.shape

torch.Size([1, 5, 28])

In [158]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class TransformerEncoderLayer(nn.Module):
    def __init__(self, d_model, nhead, dim_feedforward=4*28, dropout=0.1):
        super(TransformerEncoderLayer, self).__init__()

        # Multi-Head Attention
        self.self_attention = MultiHeadAttention(d_model, nhead)

        # Feedforward layer
        self.feedforward = nn.Sequential(
            nn.Linear(d_model, dim_feedforward),  # First fully connected layer
            nn.ReLU(),                          # Non-linearity
            nn.Linear(dim_feedforward, d_model)  # Second fully connected layer
        )

        # Layer normalization
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)

        # Dropout layer
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        # Self-attention block
        attn_output = self.self_attention(src, src, src)
        src = self.norm1(src + attn_output)  # Add &amp; Norm

        # Feedforward block
        ff_output = self.feedforward(src)
        src = self.norm2(src + self.dropout(ff_output))  # Add &amp; Norm

        return src

In [159]:
encoder_layer = TransformerEncoderLayer(28, 4)
output = encoder_layer(xenc)

In [160]:
output.shape

torch.Size([1, 5, 28])

In [161]:
output.argmax(dim=-1) 

tensor([[ 1, 12, 20,  1, 25]])

In [162]:
output

tensor([[[-3.6392e-02,  4.8592e+00, -1.7832e-01, -1.1640e-01, -4.6500e-01,
          -6.2416e-01, -4.3235e-01,  8.0909e-02, -2.0204e-01, -1.8812e-01,
          -1.7107e-01, -2.5191e-01,  8.2323e-01, -3.7970e-01, -9.2976e-02,
           9.2541e-02, -1.9284e-01, -1.0954e+00, -2.5514e-01, -4.1936e-01,
           1.6585e-01,  1.2001e-01,  2.6042e-01, -7.4013e-01, -4.9338e-01,
          -3.3369e-01,  1.9683e-01,  6.9380e-02],
         [-1.1751e-01, -4.0006e-04, -3.2669e-03, -3.7973e-01, -3.0974e-02,
          -1.9953e-01, -4.6604e-01, -2.3603e-01, -2.8427e-02, -7.5042e-02,
          -1.1010e-01, -2.0972e-01,  5.1366e+00, -4.0702e-01, -9.0749e-02,
          -1.9671e-01, -5.5031e-01, -4.6640e-01, -2.5662e-01, -1.3365e-01,
          -3.2198e-01, -3.4788e-02, -2.5173e-02, -3.0440e-01, -1.2003e-01,
          -1.8262e-01, -1.1008e-01, -7.9275e-02],
         [-1.7732e-01, -3.3213e-01, -1.2561e-01, -2.4379e-01,  2.2722e-01,
           2.3009e-01, -2.4946e-01, -2.7524e-01, -3.8295e-01, -3.2562e-01,


In [163]:
import torch
import math

class PositionalEncoding(nn.Module):
    def __init__(self, embed_dim, max_len=16):
        super(PositionalEncoding, self).__init__()

        # Create a long enough "position" tensor
        position = torch.arange(0, max_len).unsqueeze(1)  # (max_len, 1)
        div_term = torch.exp(torch.arange(0, embed_dim, 2) * -(math.log(10000.0) / embed_dim))  # (embed_dim / 2)

        # Apply the sine and cosine functions
        pe = torch.zeros(max_len, embed_dim)
        pe[:, 0::2] = torch.sin(position * div_term)  # Apply sine to even indices
        pe[:, 1::2] = torch.cos(position * div_term)  # Apply cosine to odd indices

        # Register the positional encoding as a buffer (no gradient updates)
        self.register_buffer('pe', pe)

    def forward(self, x):
        # x: Tensor of shape (batch_size, seq_len, embed_dim)
        return x + self.pe[:x.size(1)]  # Add the positional encoding to the input tensor

In [164]:
m = PositionalEncoding(28)

In [165]:
m.forward(xenc).shape

torch.Size([1, 5, 28])

In [166]:
m.forward(xenc)

tensor([[[ 0.0000e+00,  2.0000e+00,  0.0000e+00,  1.0000e+00,  0.0000e+00,
           1.0000e+00,  0.0000e+00,  1.0000e+00,  0.0000e+00,  1.0000e+00,
           0.0000e+00,  1.0000e+00,  0.0000e+00,  1.0000e+00,  0.0000e+00,
           1.0000e+00,  0.0000e+00,  1.0000e+00,  0.0000e+00,  1.0000e+00,
           0.0000e+00,  1.0000e+00,  0.0000e+00,  1.0000e+00,  0.0000e+00,
           1.0000e+00,  0.0000e+00,  1.0000e+00],
         [ 8.4147e-01,  5.4030e-01,  4.9510e-01,  8.6884e-01,  2.6506e-01,
           9.6423e-01,  1.3850e-01,  9.9036e-01,  7.1906e-02,  9.9741e-01,
           3.7267e-02,  9.9931e-01,  1.0193e+00,  9.9981e-01,  9.9998e-03,
           9.9995e-01,  5.1795e-03,  9.9999e-01,  2.6827e-03,  1.0000e+00,
           1.3895e-03,  1.0000e+00,  7.1969e-04,  1.0000e+00,  3.7276e-04,
           1.0000e+00,  1.9307e-04,  1.0000e+00],
         [ 9.0930e-01, -4.1615e-01,  8.6032e-01,  5.0976e-01,  5.1116e-01,
           8.5948e-01,  2.7434e-01,  9.6163e-01,  1.4344e-01,  9.8966e-01,


In [168]:
wpe = nn.Embedding(len(alphabet),28)

pos = torch.arange(0, T, dtype=torch.long).unsqueeze(0) # shape (1, T)
wpe(pos)

tensor([[[-1.8065, -0.9420,  0.1425,  0.3147,  0.1797,  0.4637, -0.6893,
          -0.3930,  1.8023, -0.0950, -2.7983, -0.8151,  1.1483,  0.6912,
          -0.1588,  0.6424,  0.6942, -0.0144,  0.5377, -0.9814,  0.8732,
           0.3436,  1.9760,  0.4262,  0.7866, -0.0064,  1.1634,  1.3267],
         [-1.3687,  1.2084, -0.9066, -1.1218,  0.1798,  0.4476, -0.2035,
          -1.3437,  2.1148, -0.4132, -0.1699,  0.5577, -0.3864,  0.1074,
          -0.2942,  0.1477,  2.0658,  0.4589, -0.7438, -0.1498, -0.6487,
          -0.8055,  0.7233,  0.3981, -0.6924, -0.7644,  0.6505, -0.3634],
         [-0.5435,  0.6724, -0.8850,  0.2334,  0.2622, -0.7276,  0.9897,
           0.5145, -0.3607, -0.4103, -0.3342,  0.5809, -3.0052, -0.3236,
           2.4828,  1.0565, -0.6862, -0.7511,  0.1387, -0.6260, -0.3275,
           0.5766,  0.6531,  0.5626, -0.1279, -0.1543,  0.4973,  0.2991],
         [-0.5058, -0.7808, -1.3996, -1.5269,  0.5652, -1.1678, -2.3982,
          -0.1564, -0.2024,  0.1575,  1.4432,  1

In [174]:
B, T, C = xenc.shape

print(f"Unmasked attention:\n {(xenc @ xenc.transpose(-2,-1)).softmax(dim=-1)}")
print()
wei = xenc @ xenc.transpose(-2,-1) 
wei = wei.masked_fill(torch.tril(torch.ones(T,T)) == 0, float('-inf')) 
wei = F.softmax(wei, dim=-1) 

print(f"Masked attention:\n {wei}")

Unmasked attention:
 tensor([[[0.3222, 0.1185, 0.1185, 0.3222, 0.1185],
         [0.1488, 0.4046, 0.1488, 0.1488, 0.1488],
         [0.1488, 0.1488, 0.4046, 0.1488, 0.1488],
         [0.3222, 0.1185, 0.1185, 0.3222, 0.1185],
         [0.1488, 0.1488, 0.1488, 0.1488, 0.4046]]])

Masked attention:
 tensor([[[1.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.2689, 0.7311, 0.0000, 0.0000, 0.0000],
         [0.2119, 0.2119, 0.5761, 0.0000, 0.0000],
         [0.3655, 0.1345, 0.1345, 0.3655, 0.0000],
         [0.1488, 0.1488, 0.1488, 0.1488, 0.4046]]])


In [187]:
class MaskedMultiHeadAttention(nn.Module):
    def __init__(self, embed_dim, num_heads):
        super().__init__()
        assert embed_dim % num_heads == 0, "Embedding dimension must be divisible by the number of heads"
        self.num_heads = num_heads
        self.dk = embed_dim // num_heads

        # Linear layers for query, key, and value (in the case of cross-attention, separate inputs are used)
        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=False)
        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=False)
        self.v_proj = nn.Linear(embed_dim, embed_dim, bias=False)
        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=False)

    def forward(self, q, k, v):
        B, T, C = q.shape  # Assuming q, k, v have the same shape (B: batch size, T: sequence length, C: embedding dim)

        # Project Q, K, V using their respective linear layers
        q = self.q_proj(q)  # Shape: (B, T, C)
        k = self.k_proj(k)  # Shape: (B, T, C)
        v = self.v_proj(v)  # Shape: (B, T, C)

        # Reshape into (B, num_heads, T, dk)
        q = q.view(B, T, self.num_heads, self.dk).transpose(1, 2)  # (B, heads, T, dk)
        k = k.view(B, T, self.num_heads, self.dk).transpose(1, 2)  # (B, heads, T, dk)
        v = v.view(B, T, self.num_heads, self.dk).transpose(1, 2)  # (B, heads, T, dk)

        # Scaled dot-product attention with mask
        attn_weights = (q @ k.transpose(-2, -1)) / (self.dk ** 0.5)  # (B, heads, T, T)
        mask = torch.tril(torch.ones(T, T)).to(q.device)
        attn_weights = attn_weights.masked_fill(mask == 0, float('-inf'))
        attn_weights = F.softmax(attn_weights, dim=-1)
        attn_output = attn_weights @ v  # (B, heads, T, dk)

        # Combine heads back to (B, T, C)
        attn_output = attn_output.transpose(1, 2).contiguous().view(B, T, C)

        # Final linear projection
        return self.out_proj(attn_output)

In [188]:
m = MaskedMultiHeadAttention(28,4)
attn = m(xenc,xenc,xenc)

In [186]:
xenc.shape

torch.Size([1, 5, 28])

In [190]:
class TransformerDecoderLayer(nn.Module):
    def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1):
        super(TransformerDecoderLayer, self).__init__()

        # Masked Multi-Head Attention
        self.self_attention = MaskedMultiHeadAttention(d_model, nhead)

        # Feedforward layer
        self.feedforward = nn.Sequential(
            nn.Linear(d_model, dim_feedforward),  # First fully connected layer
            nn.ReLU(),                          # Non-linearity
            nn.Linear(dim_feedforward, d_model)  # Second fully connected layer
        )

        # Layer normalization
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)

        # Dropout layer
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        # Self-attention block
        attn_output = self.self_attention(src, src, src)
        src = self.norm1(src + attn_output)  # Add &amp; Norm

        # Feedforward block
        ff_output = self.feedforward(src)
        src = self.norm2(src + self.dropout(ff_output))  # Add &amp; Norm

        return src

encoder_layer = TransformerDecoderLayer(28, 4)
output = encoder_layer(xenc)

In [191]:
output.shape

torch.Size([1, 5, 28])

In [204]:
class RandomNameGenerator(nn.Module):
  def __init__(self, d_model, nhead, nlayers, max_length):
    super().__init__()

    self.d_model = d_model
    self.nhead = nhead
    self.embed = nn.Embedding(len(alphabet), d_model)
    #self.pe = PositionalEncoding(d_model)
    self.wpe = nn.Embedding(max_length,d_model)
    self.decoder = nn.ModuleList([TransformerDecoderLayer(d_model, nhead) for _ in range(nlayers)])

    self.linear = nn.Linear(d_model, len(alphabet))
    self.softmax = nn.Softmax(dim=-1)

  def forward(self, x):
    B, T = x.size()
    print(x)
    x = self.embed(x)

    #x = self.pe(x)
    pos = torch.arange(0, T, dtype=torch.long, device=x.device).unsqueeze(0) # shape (1, t)
    x = x + self.wpe(pos)

    for layer in self.decoder:
      x = layer(x)
    x = self.linear(x)
    return x

  @torch.no_grad()
  def generate(self, x, max_new_tokens):
    for _ in range(max_new_tokens):
      logits = self(x)
      logits = logits[:, -1, :]
      probs = self.softmax(logits)
      next_token = torch.multinomial(probs, num_samples=1)
      if next_token == ctoi[' ']:
        break
      x = torch.cat((x, next_token), dim=1)
    return x[:,1:] # drop the first seed character

In [194]:
torch.tensor([0]).unsqueeze(0).to(device)

tensor([[0]], device='mps:0')

In [226]:
m = RandomNameGenerator(32, 4,2,16).to(device)

print(decode(m.generate(torch.tensor([0]).unsqueeze(0).to(device),8).tolist()[0]))
print(f"Model Parameters: {sum(p.numel() for p in m.parameters())}")

tensor([[0]], device='mps:0')
tensor([[ 0, 15]], device='mps:0')
tensor([[ 0, 15, 25]], device='mps:0')
tensor([[ 0, 15, 25, 13]], device='mps:0')
tensor([[ 0, 15, 25, 13, 10]], device='mps:0')
tensor([[ 0, 15, 25, 13, 10,  3]], device='mps:0')
tensor([[ 0, 15, 25, 13, 10,  3, 17]], device='mps:0')
tensor([[ 0, 15, 25, 13, 10,  3, 17,  4]], device='mps:0')
oymjcqdo
Model Parameters: 277084


In [227]:
optimizer = torch.optim.AdamW(m.parameters(), lr=5e-4, weight_decay=0.01, betas=(0.9, 0.99), eps=1e-8)

for epoch in range(10):
  for xenc_batch, y_batch in train_loader:
    optimizer.zero_grad()

    logits = m(xenc_batch)
    logits = logits.view(-1, logits.size(-1))  # Shape: [batch_size * max_seq_len, vocab_size]
    y_batch = y_batch.view(-1)  # Shape: [batch_size * max_seq_len]

    # Compute the loss using CrossEntropyLoss
    loss = F.cross_entropy(logits, y_batch, ignore_index=ctoi['.'])

    # Backward pass
    m.zero_grad(set_to_none=True) # make sure ALL the gradients are set to zero
    loss.backward()

    optimizer.step()

  print(f"Epoch {epoch}, Loss: {loss}")

tensor([[13,  1, 12,  1, 11,  0,  0,  0,  0,  0],
        [12,  9, 12,  9, 15, 14, 14,  1,  0,  0],
        [19, 20,  5, 16,  5, 14,  0,  0,  0,  0],
        [20,  1, 11,  5, 12,  9,  1,  0,  0,  0],
        [ 7,  5, 14, 14,  9,  5, 22,  5,  0,  0],
        [20,  1, 25, 19,  8,  1, 23, 14,  1,  0],
        [ 3,  1, 19,  9,  1, 14,  1,  0,  0,  0],
        [10,  1, 19, 13,  5,  8,  0,  0,  0,  0],
        [ 2, 18,  5,  1, 26,  1,  5,  0,  0,  0],
        [19, 15,  8,  1, 14,  9,  0,  0,  0,  0],
        [ 5, 19,  1,  2,  5, 12,  0,  0,  0,  0],
        [ 4,  1, 11, 11, 15, 20,  1,  0,  0,  0],
        [12, 15, 18, 18,  1, 25, 14,  5,  0,  0],
        [11,  5, 14, 14,  5,  4,  9,  0,  0,  0],
        [ 5, 12,  1,  9, 19,  8,  1,  0,  0,  0],
        [26,  8,  1, 14,  5, 12,  0,  0,  0,  0],
        [14,  9,  1, 13,  1,  0,  0,  0,  0,  0],
        [ 1,  8, 19,  5,  5, 13,  0,  0,  0,  0],
        [13,  1, 18,  9, 15, 19,  0,  0,  0,  0],
        [14,  1,  4,  9,  5,  0,  0,  0,  0,  0],


: 

In [None]:
@torch.inference_mode()
def evaluate(model, dataset, batch_size=50, max_batches=None):
    model.eval()
    loader = DataLoader(dataset, shuffle=True, batch_size=batch_size, num_workers=0, collate_fn=pad_sequences)
    losses = []
    for i, batch in enumerate(loader):
        X, Y = batch
        logits = model(X)
        logits = logits.view(-1, logits.size(-1))
        Y = Y.view(-1)  # Shape: [batch_size * max_seq_len]

        # Compute the loss using CrossEntropyLoss
        loss = F.cross_entropy(logits, Y, ignore_index=ctoi['.'])

        losses.append(loss.item())
        if max_batches is not None and i >= max_batches:
            break
    mean_loss = torch.tensor(losses).mean().item()
    model.train() # reset model back to training mode
    return mean_loss

In [None]:
train_loss = evaluate(m, train_dataset, batch_size=100, max_batches=10)
test_loss  = evaluate(m, val_dataset,  batch_size=100, max_batches=10)

print(f"Epoch {epoch}, Train-Loss: {train_loss} Val-Loss: {test_loss}")

In [None]:
# https://towardsdatascience.com/explaining-the-attention-mechanism-29a0e7b448a9/