In [None]:
import torch
import numpy as np
import matplotlib.pyplot as plt

from transformers import BertTokenizer, BertModel





In [None]:
# Checkpoint identifier shared by the tokenizer and the encoder below.
model_name = "bert-base-uncased"

tokenizer = BertTokenizer.from_pretrained(model_name)

# output_attentions=True is what makes `outputs.attentions` available after a
# forward pass in the cells further down.
model = BertModel.from_pretrained(model_name, output_attentions=True)

# Put the model in evaluation mode (this notebook only runs inference).
model.eval()


In [None]:
# Example sentence whose self-attention pattern is visualised in the cells below.
sentence = "The mechanic inspected the engine because it was noisy."


In [None]:
# Encode the sentence for the model; return_tensors="pt" requests PyTorch tensors.
# The result is unpacked into the model call (`model(**inputs)`) further down.
inputs = tokenizer(sentence, return_tensors="pt")


In [None]:
# Map the ids of the first (and only) encoded sequence back to token strings;
# these strings are used as axis tick labels for the heatmap.
first_sequence_ids = inputs["input_ids"][0]
tokens = tokenizer.convert_ids_to_tokens(first_sequence_ids)
tokens

In [None]:
# Forward pass without gradient tracking (inference only).
with torch.no_grad():
    outputs = model(**inputs)

# Attention weights, indexed below as attentions[layer][batch, head].
attentions = outputs.attentions


In [None]:
# Which attention map to inspect: index into `attentions`, then the head within it.
layer, head = 0, 0

# Select batch element 0, then the chosen head, and hand the result to
# matplotlib as a NumPy array.
attention_matrix = attentions[layer][0][head].cpu().numpy()

In [None]:
from pathlib import Path

# Heatmap of the attention matrix selected above, with the token strings as
# tick labels on both axes.
plt.figure(figsize=(10, 8))
plt.imshow(attention_matrix, cmap="viridis")

plt.xticks(range(len(tokens)), tokens, rotation=90)
plt.yticks(range(len(tokens)), tokens)

plt.colorbar()
# Build the title from `layer`/`head` so the label always matches the slice
# that was actually plotted (the old hard-coded text said "Layer 1" while
# layer index 0 was shown).
plt.title(f"Self-Attention Heatmap (Layer {layer}, Head {head})")
plt.tight_layout()

# savefig does not create missing directories, so make sure the target
# folder exists before writing the image.
output_path = Path("outputs") / "attention_heatmap.png"
output_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(output_path)
plt.show()

A2 — Understand Positional Encoding

In [None]:
# Same six words in two different orders; the cells below compare their embeddings.
sentence_original, sentence_scrambled = (
    "The cat sat on the mat",
    "Mat the on sat cat the",
)


In [None]:
import torch.nn.functional as F

def get_sentence_embedding(sentence):
    """Return a mean-pooled embedding for ``sentence``.

    The sentence is encoded with the module-level ``tokenizer`` and passed
    through the module-level ``model`` with gradient tracking disabled.

    Args:
        sentence: Text to embed.

    Returns:
        ``last_hidden_state`` (batch, seq_len, hidden_dim) averaged over
        dim 1, i.e. a tensor that still carries the batch dimension. Every
        position the tokenizer emits contributes to the average (presumably
        including special tokens such as [CLS]/[SEP] -- verify if that matters).
    """
    encoded = tokenizer(sentence, return_tensors="pt")

    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state

    # Collapse the sequence axis; the batch axis is kept.
    return hidden_states.mean(dim=1)


In [None]:
# Mean-pooled embeddings for both word orders, computed in the same order as
# the sentences are listed.
emb_original, emb_scrambled = (
    get_sentence_embedding(text)
    for text in (sentence_original, sentence_scrambled)
)

In [None]:
# Cosine similarity between the two pooled sentence vectors. dim=1 (the
# function's default) compares along the hidden dimension and leaves one value
# per batch row.
sentence_similarity = F.cosine_similarity(emb_original, emb_scrambled, dim=1)

sentence_similarity.item()

In [None]:
def get_token_embeddings(sentence):
    """Return the final-layer hidden states for ``sentence``.

    The sentence is encoded with the module-level ``tokenizer`` and passed
    through the module-level ``model`` with gradient tracking disabled.

    Args:
        sentence: Text to encode.

    Returns:
        ``last_hidden_state`` with dim 0 squeezed away; for a single sentence
        that leaves one row per token position.
    """
    encoded = tokenizer(sentence, return_tensors="pt")

    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state

    return hidden_states.squeeze(0)


In [None]:
tokens_original = get_token_embeddings(sentence_original)
tokens_scrambled = get_token_embeddings(sentence_scrambled)

# A position-by-position comparison only makes sense when both sentences
# produce the same number of token rows.
if tokens_original.shape != tokens_scrambled.shape:
    raise ValueError(
        "Token-level comparison needs equally long token sequences, got "
        f"{tuple(tokens_original.shape)} and {tuple(tokens_scrambled.shape)}"
    )

# Compare the two sentences row by row (dim=1 is the hidden dimension of each
# token row). Averaging the rows *before* taking the cosine, as this cell used
# to do, is the same computation as `sentence_similarity` and therefore adds
# no token-level information.
per_token_similarity = F.cosine_similarity(
    tokens_original,
    tokens_scrambled,
    dim=1,
)

# Scalar summary: mean of the per-position similarities.
token_level_similarity = per_token_similarity.mean()

token_level_similarity.item()

A3 — Encoder vs Decoder Architecture

A3.1 — Encoder Task: Fill-Mask (BERT)

In [None]:
from transformers import pipeline

In [None]:
# Masked-token prediction pipeline backed by the same BERT checkpoint used in
# the earlier sections.
fill_mask = pipeline(task="fill-mask", model="bert-base-uncased")

In [None]:
# Input for the fill-mask pipeline: the [MASK] placeholder marks the slot to predict.
text = "Transformers are [MASK] at understanding context."

In [None]:
results = fill_mask(text)

# Print the first five candidates as "<token>  |  score = <probability>".
for prediction in results[:5]:
    print(f"{prediction['token_str']:>12}  |  score = {prediction['score']:.4f}")

A3.2 — Decoder Task: Text Generation (GPT-2)

In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load the GPT-2 tokenizer and language-model head from the "gpt2" checkpoint.
gpt_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
gpt_model = GPT2LMHeadModel.from_pretrained("gpt2")

# Evaluation mode: the model is only used for generation in this notebook.
gpt_model.eval()

In [None]:
# Text that GPT-2 is asked to continue in the next cell.
prompt = "Transformers are powerful because"

In [None]:
# Sampling is stochastic; seed PyTorch's RNG so that a fresh
# "Restart & Run All" reproduces the same three continuations.
torch.manual_seed(42)

# NOTE(review): `inputs` and `outputs` reuse the names of the BERT cells
# earlier in the notebook. After this cell runs they hold GPT-2 data, so
# re-running the BERT cells out of order will pick up the wrong tensors.
inputs = gpt_tokenizer(prompt, return_tensors="pt")

outputs = gpt_model.generate(
    **inputs,
    max_length=50,
    temperature=0.7,
    do_sample=True,
    num_return_sequences=3,
    # GPT-2 has no dedicated padding token; pass EOS explicitly instead of
    # relying on generate()'s implicit fallback (which logs a warning).
    pad_token_id=gpt_tokenizer.eos_token_id,
)

# Decode and print each sampled sequence, numbered from 1.
for i, out in enumerate(outputs, 1):
    print(f"\nOutput {i}:")
    print(gpt_tokenizer.decode(out, skip_special_tokens=True))

B — Tokens, Embeddings & Context Windows

B1 — Tokenization Strategies: Word vs Subword

In [46]:
from transformers import BertTokenizer, GPT2Tokenizer

# Tokenizers compared in this section: BERT ("bert-base-uncased") and GPT-2 ("gpt2").
# They are loaded here under their own names, so this section does not depend
# on the `tokenizer` / `gpt_tokenizer` objects created in part A.
bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
gpt2_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

In [47]:
# Words to split with both tokenizers.
# NOTE(review): "mecahnical" looks like a deliberate misspelling used to probe
# how the tokenizers handle an unknown word -- confirm before "fixing" it.
words = [
    "unbelievable",
    "internationalization",
    "electroencephalography",
    "bioinformatics",
    "mecahnical",
]


In [48]:
import pandas as pd

# One record per word: the pieces each tokenizer produces and how many there are.
# The inner generator tokenizes each word once with BERT, then once with GPT-2.
rows = [
    {
        "Word": word,
        "BERT Tokens": bert_tokens,
        "BERT Token Count": len(bert_tokens),
        "GPT-2 Tokens": gpt2_tokens,
        "GPT-2 Token Count": len(gpt2_tokens),
    }
    for word, bert_tokens, gpt2_tokens in (
        (w, bert_tokenizer.tokenize(w), gpt2_tokenizer.tokenize(w))
        for w in words
    )
]

# Build the frame once from the finished list of records and display it.
token_table = pd.DataFrame(rows)
token_table


Unnamed: 0,Word,BERT Tokens,BERT Token Count,GPT-2 Tokens,GPT-2 Token Count
0,unbelievable,[unbelievable],1,"[un, bel, iev, able]",4
1,internationalization,"[international, ##ization]",2,"[international, ization]",2
2,electroencephalography,"[electro, ##ence, ##pha, ##log, ##raphy]",5,"[elect, ro, ence, phal, ography]",5
3,bioinformatics,"[bio, ##in, ##form, ##atics]",4,"[b, io, in, format, ics]",5
4,mecahnical,"[me, ##ca, ##hn, ##ical]",4,"[m, ec, ahn, ical]",4
