# Transformers, Self-Attention, BERT, GPT

In [None]:
#!pip install transformers torch datasets sentencepiece accelerate

In [None]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import (
    pipeline,
    AutoTokenizer,
    AutoModel,
    AutoModelForMaskedLM,
    AutoModelForSequenceClassification,
    AutoModelForCausalLM
)

## Self-Attention: The Core Idea

### Multi-Head Attention (Why many heads?)

Visualize self-attention using PyTorch

In [None]:
import torch
import torch.nn.functional as F

# Fix the RNG so the random tokens and weights (and therefore the displayed
# attention matrix) are identical on every Restart & Run All.
torch.manual_seed(0)

SEQ_LEN = 5   # number of tokens in the dummy sequence
D_MODEL = 4   # embedding dimension (also the query/key/value width here)

# Dummy sequence of SEQ_LEN tokens, each a D_MODEL-dimensional embedding
x = torch.randn(SEQ_LEN, D_MODEL)

# Random (untrained) projection matrices for queries, keys and values
Wq = torch.randn(D_MODEL, D_MODEL)
Wk = torch.randn(D_MODEL, D_MODEL)
Wv = torch.randn(D_MODEL, D_MODEL)

Q = x @ Wq
K = x @ Wk
V = x @ Wv

# Scaled dot-product attention: the scale is read from K's last dimension so
# it always matches the key width instead of repeating a literal.
scores = (Q @ K.T) / (K.shape[-1] ** 0.5)
attn = F.softmax(scores, dim=-1)   # softmax over the last axis: each row sums to 1

# Each output row is the attention-weighted sum of the value vectors
context_vector = attn @ V

attn, context_vector

### Transformer Architecture (Encoder/Decoder)

BERT Explained (Encoder Only)

In [None]:
from transformers import pipeline

# Build a fill-mask pipeline backed by bert-base-uncased, then ask it to
# fill in the [MASK] slot of the sentence.
bert = pipeline(task="fill-mask", model="bert-base-uncased")
bert("The capital of France is [MASK].")

In [None]:
# Reuse the `bert` fill-mask pipeline created in the previous cell rather than
# importing `pipeline` and loading bert-base-uncased a second time.
bert("The capital of France is [MASK].")

### GPT Explained (Decoder Only)

In [None]:
from transformers import pipeline, set_seed

gpt = pipeline("text-generation", model="gpt2")

# Text generation with this checkpoint samples tokens by default, so seed the
# RNGs right before generating to get the same continuation on every run.
set_seed(42)
gpt("Once upon a time in Bangalore,")[0]["generated_text"]

#### BERT vs GPT (Simple Table)

#### Self-Attention is Parallel → Fast on GPUs

In [None]:
def positional_encoding(seq_len, d_model):
    """Build the sinusoidal positional-encoding table.

    Even columns hold ``sin(pos / 10000 ** (i / d_model))`` and the following
    odd column holds the matching ``cos`` term, where ``i`` is the even column
    index.

    Args:
        seq_len: Number of positions (rows) to encode.
        d_model: Width of each encoding (columns). May be odd; in that case
            the last column is a sine term with no cosine partner.

    Returns:
        A float tensor of shape ``[seq_len, d_model]``.
    """
    PE = torch.zeros(seq_len, d_model)
    pos = torch.arange(seq_len, dtype=torch.float32).unsqueeze(1)   # [seq_len, 1]

    # One frequency per (sin, cos) column pair, computed for all pairs at once.
    even_idx = torch.arange(0, d_model, 2, dtype=torch.float32)     # [ceil(d_model / 2)]
    div_term = torch.pow(10000.0, even_idx / d_model)
    angles = pos / div_term                                         # [seq_len, ceil(d_model / 2)]

    PE[:, 0::2] = torch.sin(angles)
    # With an odd d_model there is one fewer odd column than even columns, so
    # trim the angles instead of writing past the last column.
    PE[:, 1::2] = torch.cos(angles[:, : d_model // 2])
    return PE

# Encode 50 positions at width 16 and draw the table as a heatmap
PE = positional_encoding(50, 16)

fig, ax = plt.subplots(figsize=(8, 4))
sns.heatmap(PE.numpy(), cmap="viridis", ax=ax)
ax.set_title("Positional Encoding Pattern")
plt.show()

Full Transformer Block Diagram

Mini Transformer Attention Visualization (HF)

In [None]:
from transformers import AutoModel, AutoTokenizer
import torch

# output_attentions=True makes the forward pass also return attention weights
model = AutoModel.from_pretrained("bert-base-uncased", output_attentions=True)
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

inputs = tokenizer("Hello I am learning transformers!", return_tensors="pt")

# Inference only: skip autograd tracking so no graph is kept alive behind the
# returned attention tensors.
with torch.no_grad():
    outputs = model(**inputs)

attn = outputs.attentions   # one attention tensor per layer
# Displays: number of layers, and one layer's shape (batch, heads, tokens, tokens)
len(attn), attn[0].shape

In [None]:
#!pip install tf-keras

In [None]:
from transformers import pipeline

# Masked-token predictor; stored as `fill` because a later cell calls it again
fill = pipeline(task="fill-mask", model="bert-base-uncased")
fill("The capital of France is [MASK].")

In [None]:
# Sentiment classifier loaded from the distilbert SST-2 fine-tuned checkpoint
clf = pipeline(
    task="sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",
)

clf("I really love deep learning. It is amazing!")

BERT Sentence Embeddings (Used in RAG, LangChain)

In [None]:
tok = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")

text = "Transformers changed deep learning forever."
tokens = tok(text, return_tensors="pt")

with torch.no_grad():
    output = model(**tokens).last_hidden_state   # (batch, tokens, hidden)

    # Mask-aware mean pooling: weight each token by its attention mask so that
    # padding tokens (present when several texts are batched) do not dilute
    # the sentence vector. For this single unpadded sentence the mask is all
    # ones, so this equals a plain mean over the token axis.
    mask = tokens["attention_mask"].unsqueeze(-1).to(output.dtype)   # (batch, tokens, 1)
    embedding = (output * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)

embedding.shape

BERT Attention Heatmap

In [None]:
tok = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased", output_attentions=True)

inputs = tok("Attention is all you need", return_tensors="pt")
outputs = model(**inputs)

# First layer, then first batch item -> (heads, tokens, tokens)
first_layer = outputs.attentions[0]
attn = first_layer[0]

# Average across heads and convert to a NumPy array for plotting
head_avg = attn.mean(0).detach().numpy()

fig, ax = plt.subplots(figsize=(8, 4))
sns.heatmap(head_avg, cmap="Reds", ax=ax)
ax.set_title("BERT Attention Heatmap (Layer 1, Avg Heads)")
plt.show()

In [None]:
# GPT-2 text generator with max_length=60; stored as `gen` because a later
# cell calls it again.
gen = pipeline(task="text-generation", model="gpt2", max_length=60)

story = gen("Once upon a time in Bangalore")
story[0]["generated_text"]

In [None]:
# Same sentence stem through both models: BERT fills the masked slot,
# GPT-2 continues the text.
print("\n--- BERT (Understanding) ---")
mask_predictions = fill("The weather today is very [MASK].")
print(mask_predictions)

print("\n--- GPT (Generation) ---")
continuations = gen("The weather today is very")
print(continuations[0]["generated_text"])

How Transformers Power Agentic AI