# LLM from scratch
This notebook contains the code for the LLM-from-scratch book.

## Ch 3 - Attention Module

In [1]:
import torch
X = torch.tensor([
    [0.43, 0.15, 0.89], # Your     (x^1)
    [0.55, 0.87, 0.66], # journey  (x^2)
    [0.57, 0.85, 0.64], # starts (x^3)
    [0.22, 0.58, 0.33], # with (x^4)
    [0.77, 0.25, 0.10], # one (x^5)
    [0.05, 0.80, 0.55] # step (x^6)
])

# Simple affinity between two token embeddings: their dot product.
# The dot product is used here as the similarity measure between the vectors.
def affinity(x, y):
    """Return the dot product of the 1-D tensors `x` and `y` as a 0-dim tensor."""
    similarity = x.dot(y)
    return similarity

# step 1 : calculate attention weights
# idea : for a query token q, decide how much each input token (x1, x2, ...)
#        should be weighed in importance by scoring it against q
# score_i = affinity(x_i, query) for every x_i in the input
query_idx = 1
query_token = X[query_idx]

# Raw (unnormalized) scores: one dot product per input row, stacked into a 1-D tensor.
attention_weights = torch.stack([affinity(x_i, query_token) for x_i in X])
# Normalize by the total so the weights sum to 1. This is plain sum-normalization
# (not softmax); the sum is computed once rather than once per element.
attention_weights = attention_weights / attention_weights.sum()
# Reshape to a column (n_tokens, 1) so each weight can scale a whole row of X.
attention_weights = attention_weights.view(-1, 1)

print("\n\n-- attention --")
print(f"token[{query_idx}]: {query_token}")
print("A(.) is affinity")
for idx, score in enumerate(attention_weights):
    print(f"w({idx}) = A(x({query_idx}), x({idx})) : {score}")

# step 2 : compute context vector
# idea : given query q and its attention weights, build an "information context"
#        as the weighted sum of all input tokens
# idea : the "information context" tells the LLM how to make use of all the input tokens
# Derive the query from query_idx instead of a second hard-coded index, so it
# always refers to the same token the attention weights were computed for.
query = X[query_idx]
# (n, 1) weights broadcast against the (n, k) inputs: row i becomes w(i) * x(i).
list_context_vectors = attention_weights * X
# Sum the weighted rows; keepdim=True keeps the result as a (1, k) row vector.
context_vector = list_context_vectors.sum(dim=0, keepdim=True)
print("\n\n-- context --")
print("list_context_vectors : ", list_context_vectors.shape)
for idx, vec in enumerate(list_context_vectors):
    print(f"z({idx}) = w({idx})* x[{idx}] : {vec}")

print("\ncontext_wrt_query: ", context_vector.shape)
print(context_vector)

# step 3 - vectorize: compute attention for every token at once with matrix ops.
# Note: the weights here are normalized with softmax, unlike the
# sum-normalization used for the single query in step 1.
print("\n\n-- vectorize --")
# Pairwise dot products: entry (i, j) is the score between x_i and x_j.
attention_scores = torch.matmul(X, X.T)
# Softmax over the last dimension, so row i holds the attention weights w.r.t. x_i.
attention_weights = attention_scores.softmax(dim=-1)
# (n, n) @ (n, k) -> (n, k): row i is the attention context for x_i.
context_matrix = torch.matmul(attention_weights, X)
print("context shape: ", context_matrix.shape)


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.2.3 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/Users/andylee/anaconda3/envs/llm/lib/python3.11/site-packages/ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "/Users/andylee/anaconda3/envs/llm/lib/python3.11/site-packages/traitlets/config/application.py", line 1075, in launch_instance
    app.start()
  File "/Users/andylee/anaconda3/envs/llm/lib/python3.11/site-packages/ipykernel/kernelapp.py", line 739, in start
    self.io



-- attention --
token[1]: tensor([0.5500, 0.8700, 0.6600])
A(.) is affinity
w(0) = A(x(1), x(0)) : tensor([0.1455])
w(1) = A(x(1), x(1)) : tensor([0.2278])
w(2) = A(x(1), x(2)) : tensor([0.2249])
w(3) = A(x(1), x(3)) : tensor([0.1285])
w(4) = A(x(1), x(4)) : tensor([0.1077])
w(5) = A(x(1), x(5)) : tensor([0.1656])


-- context --
list_context_vectors :  torch.Size([6, 3])
z(0) = w(0)* x[0] : tensor([0.0625, 0.0218, 0.1295])
z(1) = w(1)* x[1] : tensor([0.1253, 0.1982, 0.1504])
z(2) = w(2)* x[2] : tensor([0.1282, 0.1911, 0.1439])
z(3) = w(3)* x[3] : tensor([0.0283, 0.0745, 0.0424])
z(4) = w(4)* x[4] : tensor([0.0830, 0.0269, 0.0108])
z(5) = w(5)* x[5] : tensor([0.0083, 0.1325, 0.0911])

context_wrt_query:  torch.Size([1, 3])
tensor([[0.4355, 0.6451, 0.5680]])


-- vectorize --
context shape:  torch.Size([6, 3])


In [34]:
# Vectorized attention: scores -> softmax weights -> context vectors, for all tokens at once.
# Entry (i, j) of the score matrix is the dot product of x_i and x_j.
attention_scores = torch.matmul(X, X.T)
# Softmax along the last dimension turns each row of scores into weights that sum to 1.
attention_weights = attention_scores.softmax(dim=-1)
# Each output row is the weighted sum of all rows of X, using that row's weights.
context_matrix = torch.matmul(attention_weights, X)
print("\n\n-- attention scores and weights shapes --")
print("attention_scores shape: ", attention_scores.shape)
print("attention_weights shape: ", attention_weights.shape)
print("context shape: ", context_matrix.shape)




-- attention scores and weights shapes --
attention_scores shape:  torch.Size([6, 6])
attention_weights shape:  torch.Size([6, 6])
context shape:  torch.Size([6, 3])
