In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class MaskedSelfAttention(nn.Module):
    """Single-head causal (masked) scaled dot-product self-attention.

    Every position may attend only to itself and to earlier positions.

    Args:
        embed_dim: size of the last dimension of the input.
        atten_dim: size of the query/key/value projections (and of the output).
    """

    def __init__(self, embed_dim, atten_dim):
        super().__init__()
        self.query = nn.Linear(embed_dim, atten_dim, bias=False)
        self.key = nn.Linear(embed_dim, atten_dim, bias=False)
        self.value = nn.Linear(embed_dim, atten_dim, bias=False)

    def forward(self, x):
        """Apply causal self-attention.

        Args:
            x: tensor of shape ``(..., seq_len, embed_dim)``.

        Returns:
            Tensor of shape ``(..., seq_len, atten_dim)``.
        """
        query = self.query(x)
        key = self.key(x)
        value = self.value(x)

        # Scaled dot-product scores: (..., seq_len, seq_len).
        scores = torch.matmul(query, key.transpose(-2, -1))
        scores = scores / key.size(-1) ** 0.5

        # Lower-triangular boolean mask, created directly on the input's device.
        # The sequence axis is taken as -2 so unbatched (seq_len, embed_dim)
        # input works as well as batched input.
        seq_len = x.size(-2)
        causal_mask = torch.tril(
            torch.ones(seq_len, seq_len, dtype=torch.bool, device=x.device)
        )
        # Future positions get -inf so softmax assigns them zero weight.
        masked_scores = scores.masked_fill(~causal_mask, float('-inf'))

        attention_weights = F.softmax(masked_scores, dim=-1)
        weighted_values = torch.matmul(attention_weights, value)

        return weighted_values

class MultiHeadMaskedSelfAttention(nn.Module):
    """Multi-head causal self-attention built from independent ``MaskedSelfAttention`` heads.

    Each head projects the input to ``embed_dim // num_heads`` features; the head
    outputs are concatenated along the last dimension and mixed by a final
    linear layer.

    Args:
        embed_dim: size of the last dimension of the input and of the output.
        num_heads: number of attention heads; must be positive and divide ``embed_dim``.

    Raises:
        ValueError: if ``num_heads`` is not positive or does not divide ``embed_dim``.
    """

    def __init__(self, embed_dim, num_heads):
        super().__init__()
        # Fail fast: with a non-divisible embed_dim the concatenated heads would
        # have fewer than embed_dim features and self.fc would fail in forward().
        if num_heads <= 0 or embed_dim % num_heads != 0:
            raise ValueError(
                f"embed_dim ({embed_dim}) must be divisible by a positive num_heads ({num_heads})"
            )
        atten_dim = embed_dim // num_heads
        self.heads = nn.ModuleList([MaskedSelfAttention(embed_dim, atten_dim) for _ in range(num_heads)])
        self.fc = nn.Linear(embed_dim, embed_dim)

    def forward(self, x):
        """Run every head on ``x``, concatenate on the last axis and project."""
        concatenated_heads = torch.cat([head(x) for head in self.heads], dim=-1)
        return self.fc(concatenated_heads)

In [1]:
!nvidia-smi

Sat May 24 17:57:42 2025       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.183.01             Driver Version: 535.183.01   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 3090        Off | 00000000:01:00.0 Off |                  N/A |
| 30%   28C    P8              13W / 350W |     26MiB / 24576MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class SelfAttention(nn.Module):
    """Single-head (unmasked) scaled dot-product self-attention.

    Args:
        embed_dim: size of the last dimension of the input.
        atten_dim: size of the key/query/value projections (and of the output).
    """

    def __init__(self, embed_dim, atten_dim):
        super().__init__()
        # Creation order (key, query, value) is kept so seeded initialisation
        # produces the same weights.
        self.key = nn.Linear(embed_dim, atten_dim)
        self.query = nn.Linear(embed_dim, atten_dim)
        self.value = nn.Linear(embed_dim, atten_dim)

    def forward(self, x):
        """Return the attention-weighted values, shape ``(..., seq_len, atten_dim)``."""
        k, q, v = self.key(x), self.query(x), self.value(x)

        # Similarity of every query with every key, scaled by sqrt(atten_dim).
        scale = k.size(-1) ** 0.5
        atten_weight = F.softmax(q @ k.transpose(-2, -1) / scale, dim=-1)

        return atten_weight @ v
    
class MultiHeadAttention(nn.Module):
    """Multi-head (unmasked) self-attention built from independent ``SelfAttention`` heads.

    Each head projects the input to ``embed_dim // num_heads`` features; the head
    outputs are concatenated along the last dimension and mixed by a final
    linear layer.

    Args:
        embed_dim: size of the last dimension of the input and of the output.
        num_heads: number of attention heads; must be positive and divide ``embed_dim``.

    Raises:
        ValueError: if ``num_heads`` is not positive or does not divide ``embed_dim``.
    """

    def __init__(self, embed_dim, num_heads):
        super().__init__()
        # Fail fast: with a non-divisible embed_dim the concatenated heads would
        # have fewer than embed_dim features and self.fc would fail in forward().
        if num_heads <= 0 or embed_dim % num_heads != 0:
            raise ValueError(
                f"embed_dim ({embed_dim}) must be divisible by a positive num_heads ({num_heads})"
            )
        atten_dim = embed_dim // num_heads
        self.heads = nn.ModuleList([SelfAttention(embed_dim, atten_dim) for _ in range(num_heads)])
        self.fc = nn.Linear(embed_dim, embed_dim)

    def forward(self, x):
        """Run every head on ``x``, concatenate on the last axis and project."""
        concated_heads = torch.cat([head(x) for head in self.heads], dim=-1)
        return self.fc(concated_heads)

class FeedForward(nn.Module):
    """Position-wise two-layer MLP: Linear -> ReLU -> Linear.

    Args:
        embed_dim: input and output feature size.
        ff_dim: hidden feature size.
    """

    def __init__(self, embed_dim, ff_dim):
        super().__init__()
        expand = nn.Linear(embed_dim, ff_dim)
        project = nn.Linear(ff_dim, embed_dim)
        # Sequential indices stay 0 (expand), 1 (ReLU), 2 (project).
        self.net = nn.Sequential(expand, nn.ReLU(), project)

    def forward(self, x):
        """Apply the MLP to the last dimension of ``x``."""
        return self.net(x)

class TransformerBlock(nn.Module):
    """Transformer block with unmasked multi-head attention and a feed-forward network.

    LayerNorm is applied to the input of each sub-layer, and each sub-layer's
    output is added back to its un-normalised input (residual connection).

    Args:
        embed_dim: feature size of the input and output.
        num_heads: number of attention heads.
        ff_dim: hidden size of the feed-forward network.
    """

    def __init__(self, embed_dim, num_heads, ff_dim):
        super().__init__()
        # Attention sub-layer.
        self.ln1 = nn.LayerNorm(embed_dim)
        self.mha = MultiHeadAttention(embed_dim, num_heads)
        # Feed-forward sub-layer.
        self.ln2 = nn.LayerNorm(embed_dim)
        self.ff = FeedForward(embed_dim, ff_dim)

    def forward(self, x):
        """Return ``x`` after the attention and feed-forward residual updates."""
        attended = x + self.mha(self.ln1(x))
        return attended + self.ff(self.ln2(attended))
        

In [11]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class MaskedSelfAttention(nn.Module):
    """Single-head causal (masked) scaled dot-product self-attention.

    Every position may attend only to itself and to earlier positions.

    Args:
        embed_dim: size of the last dimension of the input.
        atten_dim: size of the query/key/value projections (and of the output).
    """

    def __init__(self, embed_dim, atten_dim):
        super().__init__()
        self.query = nn.Linear(embed_dim, atten_dim, bias=False)
        self.key = nn.Linear(embed_dim, atten_dim, bias=False)
        self.value = nn.Linear(embed_dim, atten_dim, bias=False)

    def forward(self, x):
        """Apply causal self-attention.

        Args:
            x: tensor of shape ``(..., seq_len, embed_dim)``.

        Returns:
            Tensor of shape ``(..., seq_len, atten_dim)``.
        """
        query = self.query(x)
        key = self.key(x)
        value = self.value(x)

        # Scaled dot-product scores: (..., seq_len, seq_len).
        scores = torch.matmul(query, key.transpose(-2, -1))
        scores = scores / key.size(-1) ** 0.5

        # Lower-triangular boolean mask (torch.tril -- `torch.trill` does not
        # exist), created directly on the input's device. The sequence axis is
        # taken as -2 so unbatched input works too.
        seq_len = x.size(-2)
        tril = torch.tril(
            torch.ones(seq_len, seq_len, dtype=torch.bool, device=x.device)
        )
        # Future positions get -inf so softmax assigns them zero weight.
        masked_scores = scores.masked_fill(~tril, float('-inf'))
        atten_weight = F.softmax(masked_scores, dim=-1)

        weighted_values = torch.matmul(atten_weight, value)
        return weighted_values

class MultiHeadMaskedSelfAttention(nn.Module):
    """Multi-head causal self-attention built from independent ``MaskedSelfAttention`` heads.

    Each head projects the input to ``embed_dim // num_heads`` features; the head
    outputs are concatenated along the last dimension and mixed by a final
    linear layer.

    Args:
        embed_dim: size of the last dimension of the input and of the output.
        num_heads: number of attention heads; must be positive and divide ``embed_dim``.

    Raises:
        ValueError: if ``num_heads`` is not positive or does not divide ``embed_dim``.
    """

    def __init__(self, embed_dim, num_heads):
        super().__init__()
        # Fail fast: with a non-divisible embed_dim the concatenated heads would
        # have fewer than embed_dim features and self.fc would fail in forward().
        if num_heads <= 0 or embed_dim % num_heads != 0:
            raise ValueError(
                f"embed_dim ({embed_dim}) must be divisible by a positive num_heads ({num_heads})"
            )
        atten_dim = embed_dim // num_heads
        self.heads = nn.ModuleList([MaskedSelfAttention(embed_dim, atten_dim) for _ in range(num_heads)])
        self.fc = nn.Linear(embed_dim, embed_dim)

    def forward(self, x):
        """Run every head on ``x``, concatenate on the last axis and project."""
        concatenated_heads = torch.cat([head(x) for head in self.heads], dim=-1)
        return self.fc(concatenated_heads)

class FeedForward(nn.Module):
    """Position-wise two-layer MLP: Linear -> ReLU -> Linear.

    Args:
        embed_dim: input and output feature size.
        ff_dim: hidden feature size.
    """

    def __init__(self, embed_dim, ff_dim):
        super().__init__()
        layers = [
            nn.Linear(embed_dim, ff_dim),   # expand to the hidden size
            nn.ReLU(),
            nn.Linear(ff_dim, embed_dim),   # project back to embed_dim
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the MLP to the last dimension of ``x``."""
        return self.net(x)

class TransformerDecoderBlock(nn.Module):
    """Decoder block with causal multi-head attention and a feed-forward network.

    LayerNorm is applied to the input of each sub-layer, and each sub-layer's
    output is added back to its un-normalised input (residual connection).
    The feed-forward hidden size is fixed at ``4 * embed_dim``.

    Args:
        embed_dim: feature size of the input and output.
        num_heads: number of attention heads.
    """

    def __init__(self, embed_dim, num_heads):
        super().__init__()
        # Causal attention sub-layer.
        self.ln1 = nn.LayerNorm(embed_dim)
        self.mha = MultiHeadMaskedSelfAttention(embed_dim, num_heads)
        # Feed-forward sub-layer.
        self.ln2 = nn.LayerNorm(embed_dim)
        self.ff = FeedForward(embed_dim, embed_dim*4)

    def forward(self, x):
        """Return ``x`` after the attention and feed-forward residual updates."""
        attended = x + self.mha(self.ln1(x))
        return attended + self.ff(self.ln2(attended))

In [27]:
class TransformerGen(nn.Module):
    """Character-level decoder-only transformer language model.

    Args:
        char_size: vocabulary size (number of distinct character ids).
        embed_dim: embedding / model width.
        n_heads: attention heads per decoder block.
        n_layers: number of stacked ``TransformerDecoderBlock`` layers.
        block_size: maximum sequence length (size of the position-embedding table).
    """

    def __init__(self, char_size, embed_dim, n_heads, n_layers, block_size):
        super().__init__()
        self.block_size = block_size
        self.char_embedding = nn.Embedding(char_size, embed_dim)
        self.pos_embedding = nn.Embedding(block_size, embed_dim)
        # nn.Sequential (not nn.ModuleList) so the stack can be called directly
        # in forward(); ModuleList takes a single iterable and is not callable.
        self.transformer_blocks = nn.Sequential(
            *[TransformerDecoderBlock(embed_dim, n_heads) for _ in range(n_layers)]
        )
        self.ln_f = nn.LayerNorm(embed_dim)
        self.fc = nn.Linear(embed_dim, char_size)

    def forward(self, x):
        """Compute next-character logits.

        Args:
            x: integer tensor of character ids, shape ``(batch, seq_len)`` with
                ``seq_len <= block_size``.

        Returns:
            Logits of shape ``(batch, seq_len, char_size)``.

        Raises:
            ValueError: if ``seq_len`` exceeds ``block_size``.
        """
        seq_len = x.size(1)
        # Guard explicitly: an out-of-range position index would otherwise fail
        # inside the embedding lookup with a much less helpful error.
        if seq_len > self.block_size:
            raise ValueError(
                f"sequence length {seq_len} exceeds block_size {self.block_size}"
            )
        char_embeddings = self.char_embedding(x)
        positions = torch.arange(seq_len, device=x.device).unsqueeze(0)
        pos_embeddings = self.pos_embedding(positions)
        x = char_embeddings + pos_embeddings  # positions broadcast over the batch
        x = self.transformer_blocks(x)
        x = self.ln_f(x)
        logits = self.fc(x)
        return logits

    @torch.no_grad()
    def generate(self, idx, max_new_tokens):
        """Autoregressively sample ``max_new_tokens`` characters after ``idx``.

        Args:
            idx: integer tensor of character ids, shape ``(batch, seq_len)``,
                used as the prompt.
            max_new_tokens: number of characters to append.

        Returns:
            Integer tensor of shape ``(batch, seq_len + max_new_tokens)``.
        """
        for _ in range(max_new_tokens):
            # The model only has position embeddings for block_size tokens.
            context = idx[:, -self.block_size:]
            last_logits = self(context)[:, -1, :]
            probs = F.softmax(last_logits, dim=-1)
            next_idx = torch.multinomial(probs, num_samples=1)
            idx = torch.cat([idx, next_idx], dim=1)
        return idx

In [28]:
# Model hyperparameters -- presumably passed to TransformerGen; the call site
# is not shown in this part of the notebook, so confirm the mapping there.
n_embed = 32  # NOTE(review): likely TransformerGen's `embed_dim` -- confirm
n_heads = 4  # NOTE(review): likely attention heads per block -- confirm
n_layers = 4  # NOTE(review): likely number of stacked decoder blocks -- confirm
block_size = 16  # NOTE(review): likely context length for get_batch / TransformerGen -- confirm

In [30]:
# Load the whole corpus into memory. The encoding is given explicitly because
# the text contains non-ASCII characters (e.g. "—" and "’"); without it the
# decoding would depend on the platform's locale.
with open("deepLearning/transformer/bible.txt", "r", encoding="utf-8") as f:
    text = f.read()

# Preview the first 1000 characters.
print(text[:1000])

KJV
King James Bible: Pure Cambridge Edition - Text courtesy of www.BibleProtector.com
Genesis 1:1	In the beginning God created the heaven and the earth.
Genesis 1:2	And the earth was without form, and void; and darkness [was] upon the face of the deep. And the Spirit of God moved upon the face of the waters.
Genesis 1:3	And God said, Let there be light: and there was light.
Genesis 1:4	And God saw the light, that [it was] good: and God divided the light from the darkness.
Genesis 1:5	And God called the light Day, and the darkness he called Night. And the evening and the morning were the first day.
Genesis 1:6	And God said, Let there be a firmament in the midst of the waters, and let it divide the waters from the waters.
Genesis 1:7	And God made the firmament, and divided the waters which [were] under the firmament from the waters which [were] above the firmament: and it was so.
Genesis 1:8	And God called the firmament Heaven. And the evening and the morning were the second day.
Genesi

In [39]:
# Vocabulary: every distinct character of the corpus, in sorted order.
chars = sorted(set(text))
vocab_size = len(chars)
print("".join(chars))
print(vocab_size)

	
 !(),-.0123456789:;?ABCDEFGHIJKLMNOPQRSTUVWYZ[]abcdefghijklmnopqrstuvwxyz—’
77


In [40]:
# Character <-> integer-id lookup tables built from the sorted vocabulary.
itos = dict(enumerate(chars))
stoi = {ch: i for i, ch in itos.items()}


def encode(s):
    """Convert a string into the list of integer ids of its characters."""
    return [stoi[c] for c in s]


def decode(x):
    """Convert an iterable of integer ids back into a string."""
    return "".join(itos[i] for i in x)

In [50]:
# Map every character of the corpus to its integer id using `encode`.
encoded_text = encode(text)

# 1-D int64 (torch.long) tensor holding the id of each corpus character.
data = torch.tensor(encoded_text, dtype=torch.long)

In [52]:
# Number of encoded characters in the corpus (length of the 1-D `data` tensor).
data.shape[0]

4602957

In [None]:
batch_size = 4  # number of sequences sampled per batch


def get_batch(data, batch_size, block_size):
    """Sample a random batch of input/target windows from ``data``.

    Args:
        data: 1-D tensor of token ids.
        batch_size: number of windows to sample.
        block_size: length of each window.

    Returns:
        Tuple ``(x, y)`` of tensors with shape ``(batch_size, block_size)``;
        ``y`` is ``x`` shifted one position to the right within ``data``.
    """
    # Start offsets are drawn so that the target window (start + 1 onwards)
    # still fits inside ``data``.
    starts = torch.randint(0, data.shape[0] - block_size, (batch_size,))
    inputs, targets = [], []
    for start in starts:
        inputs.append(data[start:start + block_size])
        targets.append(data[start + 1:start + block_size + 1])
    return torch.stack(inputs), torch.stack(targets)

