In [9]:
import torch
import torch.nn as nn
import pandas as pd
from transformers import AutoTokenizer

In [10]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

sentence = "The bank approved the loan"
tokens = tokenizer.tokenize(sentence)

print("Subword Tokens:")
print(tokens)

Subword Tokens:
['the', 'bank', 'approved', 'the', 'loan']


In [11]:
token_ids = tokenizer.convert_tokens_to_ids(tokens)
token_ids = torch.tensor([token_ids])  # batch size = 1

print("Token IDs:")
print(token_ids)

Token IDs:
tensor([[1996, 2924, 4844, 1996, 5414]])


In [12]:
vocab_size = tokenizer.vocab_size
embedding_dim = 8

embedding_layer = nn.Embedding(vocab_size, embedding_dim)

embeddings = embedding_layer(token_ids)

print("Embeddings shape:", embeddings.shape)

Embeddings shape: torch.Size([1, 5, 8])


In [13]:
multihead_attn = nn.MultiheadAttention(
    embed_dim=8,
    num_heads=2,
    batch_first=True
)

In [14]:
output, attention_weights = multihead_attn(
    embeddings,  # Query
    embeddings,  # Key
    embeddings   # Value
)

In [15]:
attention_df = pd.DataFrame(
    attention_weights[0].detach().numpy(),
    index=tokens,
    columns=tokens
)

attention_df

Unnamed: 0,the,bank,approved,the.1,loan
the,0.17377,0.224055,0.260956,0.17377,0.167449
bank,0.204626,0.189426,0.18705,0.204626,0.214272
approved,0.184926,0.212523,0.172922,0.184926,0.244702
the,0.17377,0.224055,0.260956,0.17377,0.167449
loan,0.18172,0.221254,0.225934,0.18172,0.189372


## How to Read This Table (Sentence-Based)

- **Rows** → the **current token**
- **Columns** → the tokens it **attends to**

## Example Interpretation

If the row **`bank`** has higher values under:

- `approved`
- `loan`

### What This Means
- To understand **“bank”**, the model focuses more on **approved** and **loan**.
- From this context, the model learns:

> **bank = financial institution**

Not a river bank.


## What Multi-Head Attention Is Doing Here

Because **2 attention heads** are used:

### Head 1
- Focuses on **semantic meaning**
- Example:

bank ↔ loan


### Head 2
- Focuses on **action or relationship**
- Example:

approved ↔ bank


### Final Idea
- Each attention head looks at the sentence **from a different perspective**.
- All heads run **in parallel**.
- Their outputs are **combined** to form a richer, context-aware representation.


In [16]:
print("Final output shape:", output.shape)

Final output shape: torch.Size([1, 5, 8])
