<a href="https://colab.research.google.com/github/anshulsinghkamboj-ml/nlp-/blob/main/transfromers_self_attaentiom.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
import math
import torch.nn as nn

In [2]:

class TinySelfAttention(nn.Module):
  def __init__(self,embed_dim):
    super().__init__()
    self.embed_dim=embed_dim
    self.W_q=nn.Linear(embed_dim,embed_dim)
    self.W_k=nn.Linear(embed_dim,embed_dim)
    self.W_v=nn.Linear(embed_dim,embed_dim)

  def forward(self,x):
    q=self.W_q(x)
    k=self.W_k(x)
    v=self.W_v(x)

    scores = q@k.transpose(-2,-1)/math.sqrt(self.embed_dim)
    weights=torch.softmax(scores,dim=-1)
    output=torch.matmul(weights,v)

    return output,weights



In [3]:
# demo
x = torch.randn(1, 4, 8)  # batch=1, seq_len=4, embed_dim=8
attn = TinySelfAttention(embed_dim=8)
out, w = attn(x)
print(out.shape)  # (1,4,8)
print(w.shape)

torch.Size([1, 4, 8])
torch.Size([1, 4, 4])


In [4]:
class TinyMultiHead(nn.Module):
  def __init__(self,embed_dim,num_heads):
    super().__init__()
    self.embed_dim=embed_dim
    self.num_heads=num_heads
    self.head_dim=embed_dim//num_heads

    self.W_q=nn.Linear(embed_dim,embed_dim)
    self.W_k=nn.Linear(embed_dim,embed_dim)
    self.W_v=nn.Linear(embed_dim,embed_dim)

    self.out = nn.Linear(embed_dim, embed_dim)

  def head_split(self,x):
    B,T,C=x.size()
    x=x.view(B,T,self.num_heads,self.head_dim)
    return x.transpose(1,2)

  def combine_heads(self,x):
    B, H, T, D = x.size()
    x = x.transpose(1, 2).contiguous()
    return x.view(B, T, H * D)

  def forward(self,x):
    Q=self.head_split(self.W_q(x))
    K=self.head_split(self.W_k(x))
    V=self.head_split(self.W_v(x))

    scores=Q@K.transpose(-2,-1)/math.sqrt(self.head_dim)
    weights=torch.softmax(scores,dim=-1)

    out=weights@V

    out=self.combine_heads(out)
    out=self.out(out)

    return out,weights





In [5]:
x = torch.randn(1, 4, 8)  # batch=1, seq_len=4, embed_dim=8
attn = TinyMultiHead(embed_dim=8,num_heads=2)
out, w = attn(x)
print(out.shape)  # (1,4,8)
print(w.shape)

torch.Size([1, 4, 8])
torch.Size([1, 2, 4, 4])


In [6]:
class TinyFeedForward(nn.Module):
    def __init__(self, embed_dim, hidden_dim):
        super().__init__()
        self.fc1 = nn.Linear(embed_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, embed_dim)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        return self.fc2(x)

In [7]:
class TinyTransformerBlock(nn.Module):
  def __init__(self, embed_dim, num_heads, ff_hidden_dim):
        super().__init__()
        self.mha=TinyMultiHead(embed_dim,num_heads)
        self.ff=TinyFeedForward(embed_dim, ff_hidden_dim)

        self.ln1=nn.LayerNorm(embed_dim)
        self.ln1=nn.LayerNorm(embed_dim)

  def forward(self,x):
    attn_out, weights = self.mha(x)
    x = self.ln1(x + attn_out)
    ff_out = self.ff(x)
    x = self.ln2(x + ff_out)
    return x, weights

In [8]:
class TinyPositionalEmbedding(nn.Module):
    def __init__(self, max_len, embed_dim):
        super().__init__()
        self.pos_embedding = nn.Embedding(max_len, embed_dim)

    def forward(self, seq_len):
        # positions: [0, 1, 2, ..., seq_len-1]
        positions = torch.arange(seq_len).unsqueeze(0)  # (1, T)
        return self.pos_embedding(positions)            # (1, T, embed_dim)


In [9]:
class TinyInputEmbeddings(nn.Module):
    def __init__(self, vocab_size, max_len, embed_dim):
        super().__init__()
        self.token_embedding = nn.Embedding(vocab_size, embed_dim)
        self.pos_embedding   = nn.Embedding(max_len, embed_dim)

    def forward(self, input_ids):
        # input_ids: (B, T)
        B, T = input_ids.shape

        token_embeds = self.token_embedding(input_ids)   # (B, T, D)

        positions = torch.arange(T, device=input_ids.device).unsqueeze(0)  # (1, T)
        pos_embeds = self.pos_embedding(positions)       # (1, T, D)

        return token_embeds + pos_embeds


In [10]:
class TinyLMHead(nn.Module):
    def __init__(self, embed_dim, vocab_size):
        super().__init__()
        self.fc = nn.Linear(embed_dim, vocab_size)

    def forward(self, x):
        # x: (B, T, D)
        return self.fc(x)  # (B, T, vocab_size)


In [11]:
class TinyMultiHead(nn.Module):
    def __init__(self, embed_dim, num_heads):
        super().__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads

        self.W_q = nn.Linear(embed_dim, embed_dim)
        self.W_k = nn.Linear(embed_dim, embed_dim)
        self.W_v = nn.Linear(embed_dim, embed_dim)

        self.out = nn.Linear(embed_dim, embed_dim)

    def split_heads(self, x):
        B, T, C = x.size()
        x = x.view(B, T, self.num_heads, self.head_dim)
        return x.transpose(1, 2)

    def combine_heads(self, x):
        B, H, T, D = x.size()
        x = x.transpose(1, 2).contiguous()
        return x.view(B, T, H * D)

    def forward(self, x):
        B, T, _ = x.size()

        Q = self.split_heads(self.W_q(x))
        K = self.split_heads(self.W_k(x))
        V = self.split_heads(self.W_v(x))

        scores = Q @ K.transpose(-2, -1) / (self.head_dim ** 0.5)

        # ---- CAUSAL MASK HERE ----
        mask = torch.triu(torch.ones(T, T, device=x.device), diagonal=1).bool()
        scores = scores.masked_fill(mask, float('-inf'))
        # ---------------------------

        weights = torch.softmax(scores, dim=-1)
        out = weights @ V
        out = self.combine_heads(out)

        return self.out(out), weights


In [12]:
import torch
import torch.nn as nn

class TinyGPT(nn.Module):
    def __init__(self, vocab_size, max_len, embed_dim, num_heads, ff_hidden_dim, num_layers):
        super().__init__()

        self.embed = TinyInputEmbeddings(vocab_size, max_len, embed_dim)

        self.layers = nn.ModuleList([
            TinyTransformerBlock(embed_dim, num_heads, ff_hidden_dim)
            for _ in range(num_layers)
        ])

        self.lm_head = TinyLMHead(embed_dim, vocab_size)

    def forward(self, input_ids):
        # input_ids: (B, T)
        x = self.embed(input_ids)  # (B, T, D)

        attn_maps = []
        for layer in self.layers:
            x, weights = layer(x)
            attn_maps.append(weights)

        logits = self.lm_head(x)  # (B, T, vocab_size)
        return logits, attn_maps


In [13]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math

# ---------------------------------------------
# Multi-Head Attention (with causal mask)
# ---------------------------------------------
class TinyMultiHead(nn.Module):
    def __init__(self, embed_dim, num_heads):
        super().__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads

        self.W_q = nn.Linear(embed_dim, embed_dim)
        self.W_k = nn.Linear(embed_dim, embed_dim)
        self.W_v = nn.Linear(embed_dim, embed_dim)

        self.out = nn.Linear(embed_dim, embed_dim)

    def split_heads(self, x):
        B, T, C = x.size()
        x = x.view(B, T, self.num_heads, self.head_dim)
        return x.transpose(1, 2)  # (B, H, T, D)

    def combine_heads(self, x):
        B, H, T, D = x.size()
        x = x.transpose(1, 2).contiguous()
        return x.view(B, T, H * D)

    def forward(self, x):
        B, T, _ = x.size()

        Q = self.split_heads(self.W_q(x))
        K = self.split_heads(self.W_k(x))
        V = self.split_heads(self.W_v(x))

        # Attention scores
        scores = Q @ K.transpose(-2, -1) / math.sqrt(self.head_dim)

        # Causal mask: prevent looking ahead
        mask = torch.triu(torch.ones(T, T, device=x.device), diagonal=1).bool()
        scores = scores.masked_fill(mask, float('-inf'))

        weights = F.softmax(scores, dim=-1)
        out = weights @ V
        out = self.combine_heads(out)
        return self.out(out), weights


# ---------------------------------------------
# Feed-Forward Network
# ---------------------------------------------
class TinyFeedForward(nn.Module):
    def __init__(self, embed_dim, hidden_dim):
        super().__init__()
        self.fc1 = nn.Linear(embed_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, embed_dim)

    def forward(self, x):
        return self.fc2(F.relu(self.fc1(x)))


# ---------------------------------------------
# Transformer Block
# ---------------------------------------------
class TinyTransformerBlock(nn.Module):
    def __init__(self, embed_dim, num_heads, ff_hidden_dim):
        super().__init__()
        self.mha = TinyMultiHead(embed_dim, num_heads)
        self.ff = TinyFeedForward(embed_dim, ff_hidden_dim)
        self.ln1 = nn.LayerNorm(embed_dim)
        self.ln2 = nn.LayerNorm(embed_dim)

    def forward(self, x):
        attn_out, weights = self.mha(x)
        x = self.ln1(x + attn_out)
        ff_out = self.ff(x)
        x = self.ln2(x + ff_out)
        return x, weights


# ---------------------------------------------
# Token + Positional Embeddings
# ---------------------------------------------
class TinyInputEmbeddings(nn.Module):
    def __init__(self, vocab_size, max_len, embed_dim):
        super().__init__()
        self.token_embedding = nn.Embedding(vocab_size, embed_dim)
        self.pos_embedding = nn.Embedding(max_len, embed_dim)

    def forward(self, input_ids):
        B, T = input_ids.shape
        token_embeds = self.token_embedding(input_ids)
        positions = torch.arange(T, device=input_ids.device).unsqueeze(0)
        pos_embeds = self.pos_embedding(positions)
        return token_embeds + pos_embeds


# ---------------------------------------------
# LM Head
# ---------------------------------------------
class TinyLMHead(nn.Module):
    def __init__(self, embed_dim, vocab_size):
        super().__init__()
        self.fc = nn.Linear(embed_dim, vocab_size)

    def forward(self, x):
        return self.fc(x)


# ---------------------------------------------
# Full TinyGPT Model
# ---------------------------------------------
class TinyGPT(nn.Module):
    def __init__(self, vocab_size, max_len, embed_dim, num_heads, ff_hidden_dim, num_layers):
        super().__init__()
        self.embed = TinyInputEmbeddings(vocab_size, max_len, embed_dim)

        self.layers = nn.ModuleList([
            TinyTransformerBlock(embed_dim, num_heads, ff_hidden_dim)
            for _ in range(num_layers)
        ])

        self.lm_head = TinyLMHead(embed_dim, vocab_size)

    def forward(self, input_ids):
        x = self.embed(input_ids)
        attn_maps = []

        for layer in self.layers:
            x, attn = layer(x)
            attn_maps.append(attn)

        logits = self.lm_head(x)
        return logits, attn_maps


In [14]:
# ----------------------------------------------------
# Training Setup
# ----------------------------------------------------
device = "cuda" if torch.cuda.is_available() else "cpu"

vocab_size = 100
max_len = 32
embed_dim = 64
num_heads = 4
ff_hidden_dim = 128
num_layers = 2

model = TinyGPT(vocab_size, max_len, embed_dim, num_heads, ff_hidden_dim, num_layers).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=3e-4)
criterion = nn.CrossEntropyLoss()

# Dummy dataset (replace with your tokenized text)
batch_size = 16
num_batches = 2000

for step in range(num_batches):
    input_ids = torch.randint(0, vocab_size, (batch_size, max_len)).to(device)
    target_ids = input_ids.clone()

    logits, _ = model(input_ids)

    B, T, V = logits.shape
    loss = criterion(logits.view(B*T, V), target_ids.view(B*T))

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if step % 100 == 0:
        print(f"Step {step} — Loss: {loss.item():.4f}")


Step 0 — Loss: 4.7583
Step 100 — Loss: 2.7312
Step 200 — Loss: 0.9432
Step 300 — Loss: 0.3406
Step 400 — Loss: 0.1615
Step 500 — Loss: 0.0940
Step 600 — Loss: 0.0612
Step 700 — Loss: 0.0443
Step 800 — Loss: 0.0338
Step 900 — Loss: 0.0263
Step 1000 — Loss: 0.0211
Step 1100 — Loss: 0.0174
Step 1200 — Loss: 0.0146
Step 1300 — Loss: 0.0125
Step 1400 — Loss: 0.0106
Step 1500 — Loss: 0.0092
Step 1600 — Loss: 0.0082
Step 1700 — Loss: 0.0072
Step 1800 — Loss: 0.0063
Step 1900 — Loss: 0.0057


In [15]:
import torch.nn.functional as F

@torch.no_grad()
def generate(model, input_ids, max_new_tokens, temperature=1.0, top_k=None):
    model.eval()
    for _ in range(max_new_tokens):
        # crop to max_len if sequence grows
        input_condensed = input_ids[:, -model.embed.pos_embedding.num_embeddings:]

        logits, _ = model(input_condensed)

        # take last token's logits
        last_logits = logits[:, -1, :] / temperature

        # optional top-k filtering
        if top_k is not None:
            values, _ = torch.topk(last_logits, top_k)
            min_val = values[:, -1].unsqueeze(-1)
            last_logits = torch.where(last_logits < min_val, torch.full_like(last_logits, -1e10), last_logits)

        probs = F.softmax(last_logits, dim=-1)

        next_token = torch.multinomial(probs, num_samples=1)

        input_ids = torch.cat([input_ids, next_token], dim=1)

    return input_ids


In [16]:
start = torch.tensor([[12]], device=device)  # starting token
output = generate(model, start, max_new_tokens=30, temperature=1.0, top_k=5)
print(output)


tensor([[12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
         12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12]], device='cuda:0')


In [17]:
with open("harry1.txt", "r", encoding="utf-8") as f:
    text = f.read()

In [18]:
chars = sorted(list(set(text)))
vocab_size = len(chars)

stoi = {ch:i for i,ch in enumerate(chars)}
itos = {i:ch for i,ch in enumerate(chars)}

def encode(s):
    return [stoi[c] for c in s]

def decode(tokens):
    return ''.join(itos[t] for t in tokens)


In [19]:
data = torch.tensor(encode(text), dtype=torch.long)
print("Dataset tokens:", len(data))


Dataset tokens: 439478


In [20]:
n = int(0.9 * len(data))
train_data = data[:n]
val_data = data[n:]


In [21]:
def get_batch(split, batch_size=32, block_size=128):
    data_source = train_data if split == "train" else val_data
    ix = torch.randint(0, len(data_source) - block_size - 1, (batch_size,))

    x = torch.stack([data_source[i:i+block_size] for i in ix])
    y = torch.stack([data_source[i+1:i+block_size+1] for i in ix])

    return x.to(device), y.to(device)


In [22]:
embed_dim = 128
num_heads = 4
ff_hidden_dim = 256
num_layers = 4
max_len = 128

model = TinyGPT(vocab_size, max_len, embed_dim, num_heads, ff_hidden_dim, num_layers).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=3e-4)
criterion = nn.CrossEntropyLoss()


In [25]:
steps = 10000
batch_size = 32
block_size = max_len

for step in range(steps):
    x, y = get_batch("train", batch_size, block_size)

    logits, _ = model(x)

    B, T, V = logits.shape
    loss = criterion(logits.view(B*T, V), y.view(B*T))

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Log every so often
    if step % 200 == 0:
        val_x, val_y = get_batch("val", batch_size, block_size)
        with torch.no_grad():
            val_logits, _ = model(val_x)
            val_loss = criterion(val_logits.view(B*T, V), val_y.view(B*T))
        print(f"Step {step} — train loss: {loss.item():.4f} — val loss: {val_loss.item():.4f}")


Step 0 — train loss: 1.4981 — val loss: 1.5019
Step 200 — train loss: 1.4176 — val loss: 1.5171
Step 400 — train loss: 1.4386 — val loss: 1.5397
Step 600 — train loss: 1.4415 — val loss: 1.4658
Step 800 — train loss: 1.4068 — val loss: 1.4790
Step 1000 — train loss: 1.3528 — val loss: 1.4826
Step 1200 — train loss: 1.4247 — val loss: 1.5074
Step 1400 — train loss: 1.3136 — val loss: 1.4883
Step 1600 — train loss: 1.2862 — val loss: 1.4013
Step 1800 — train loss: 1.3701 — val loss: 1.4568
Step 2000 — train loss: 1.3063 — val loss: 1.4039
Step 2200 — train loss: 1.3103 — val loss: 1.3655
Step 2400 — train loss: 1.3174 — val loss: 1.3804
Step 2600 — train loss: 1.2595 — val loss: 1.4321
Step 2800 — train loss: 1.2989 — val loss: 1.3804
Step 3000 — train loss: 1.2845 — val loss: 1.4256
Step 3200 — train loss: 1.3113 — val loss: 1.4051
Step 3400 — train loss: 1.2953 — val loss: 1.4032
Step 3600 — train loss: 1.2877 — val loss: 1.3685
Step 3800 — train loss: 1.2231 — val loss: 1.4272
Step 40

In [26]:
start = torch.tensor([[stoi['T']]], device=device)  # any starting character
output = generate(model, start, max_new_tokens=500, temperature=0.8, top_k=20)
print(decode(output[0].tolist()))


The neast dream, Harry thought broke his head book as the whole are he was out in a bald ball watter full out how to the twin near window. “You could take you think they:

One thought was Snape say to sleep you, to have a corner leave at all.

“Yes, he’s but what we’re just enough to think all right,” said Hagrid chains are waved and something….an…this he had always little back at them, nothing lebes noise at the points wizard with the house cheers.

“His arms and to find out what you got a bear 
