In [1]:
import numpy as np

In [2]:
# toy key -> value store used for the lookup examples below
d = dict(apple=10, banana=5, chair=2)

In [3]:
d.keys()

dict_keys(['apple', 'banana', 'chair'])

In [4]:
d.values()

dict_values([10, 5, 2])

In [5]:
query = "apple"

In [6]:
d[query]

10

In [7]:
# "fruit" is not a key of d, so an exact lookup d[query] would raise a KeyError;
# what happens if we answer the query with simulated attention instead?
query = "fruit"

In [8]:
d

{'apple': 10, 'banana': 5, 'chair': 2}

In [9]:
# blend the stored values with hand-picked weights (0.6 + 0.4 + 0.0 = 1):
# mostly "apple", some "banana", and nothing from "chair"
0.6 * d["apple"] + 0.4 * d["banana"] + 0.0 * d["chair"]

8.0

In [10]:
# we used 0.6, 0.4 and 0.0 as simulated attention scores (weights that sum to 1) in this example

In [11]:
def softmax(x):
    """Turn a vector of scores into positive weights that sum to 1.

    The largest score is subtracted before exponentiating. The shift cancels
    in the ratio, so the result is mathematically unchanged, but large scores
    (e.g. 1000) no longer overflow ``np.exp`` and turn the output into ``nan``.

    Parameters
    ----------
    x : array_like
        Scores. The normalisation runs over every element, so pass a vector.

    Returns
    -------
    numpy.ndarray
        Weights with the same shape as ``x``.
    """
    x = np.asarray(x)
    if x.size == 0:
        # nothing to normalise; np.max would raise on an empty array
        return np.exp(x)
    exps = np.exp(x - np.max(x))
    return exps / np.sum(exps)

In [12]:
softmax(np.array([4.0, -1.0, 2.1]))

array([0.86482256, 0.00582713, 0.12935032])

In [13]:
def get_word_vector(word, d_k=8, rng=None):
    """Hypothetical mapping that returns a word vector of size d_k for the
    given word.

    For demonstrative purposes the vector is drawn from a standard normal
    distribution and does not depend on ``word``; in practice it would come
    from a learned embedding or some kind of latent representation.

    Parameters
    ----------
    word : str
        The word to embed (unused by this placeholder implementation).
    d_k : int, default 8
        Length of the returned vector.
    rng : numpy.random.Generator, optional
        Random generator to draw from. Pass a seeded generator, e.g.
        ``np.random.default_rng(0)``, to make the demo reproducible.
        When omitted, numpy's global random state is used as before.

    Returns
    -------
    numpy.ndarray
        Vector of shape ``(d_k,)``.
    """
    if rng is None:
        return np.random.normal(size=(d_k,))
    return rng.normal(size=(d_k,))

def softmax(x):
    """Normalise a vector of scores into weights that sum to 1.

    assumes x is a vector (the sum runs over every element of x).

    The maximum score is subtracted before ``np.exp`` so that large scores
    do not overflow to ``inf`` and yield ``nan``; the shift cancels in the
    ratio, so the weights themselves are unchanged.
    """
    x = np.asarray(x)
    if x.size == 0:
        # nothing to normalise; np.max would raise on an empty array
        return np.exp(x)
    exps = np.exp(x - np.max(x))
    return exps / np.sum(exps)

def attention(q, K, v):
    """Soft lookup: weight the values by how well each key matches the query.

    assumes q is a vector of shape (d_k)
    assumes K is a matrix of shape (n_k, d_k)
    assumes v is a vector of shape (n_k)
    """
    scores = q @ K.T            # one dot-product score per key
    weights = softmax(scores)   # scores -> weights that sum to 1
    return weights @ v          # weighted sum of the values

def kv_lookup(query, keys, values):
    """Attention-based lookup of `query` against parallel `keys` / `values`.

    The query and every key are turned into vectors with get_word_vector,
    the key vectors are stacked into one matrix (one row per key), and the
    result of attention() over `values` is returned.
    """
    query_vec = get_word_vector(query)
    key_matrix = np.array([get_word_vector(k) for k in keys])
    return attention(q=query_vec, K=key_matrix, v=values)

# prints a single float; the word vectors are drawn at random, so the
# number itself carries no meaning and generally differs between runs
print(
    kv_lookup(
        query="fruit",
        keys=["apple", "banana", "chair"],
        values=[10, 5, 2],
    )
)

3.947858587289676


In [15]:
# same keys as before, but every value is now a 4-element vector
d = dict(
    apple=[0.9, 0.2, -0.5, 1.0],
    banana=[1.2, 2.0, 0.1, 0.2],
    chair=[-1.2, -2.0, 1.0, -0.2],
)

In [16]:
softmax([3, 2, 1])

array([0.66524096, 0.24472847, 0.09003057])

In [17]:
softmax([30, 20, 10]) 

array([9.99954600e-01, 4.53978686e-05, 2.06106005e-09])

In [18]:
def softmax(x):
    """Row-wise softmax: every row of x becomes weights that sum to 1.

    assumes x is a matrix and we want to take the softmax along each row
    (which is achieved using axis=-1 and keepdims=True).

    Each row's maximum is subtracted before ``np.exp`` so that large scores
    do not overflow to ``inf`` and yield ``nan``; the shift cancels in the
    ratio, so the weights themselves are unchanged.

    Returns an array with the same shape as x.
    """
    x = np.asarray(x)
    if x.size == 0:
        # nothing to normalise; np.max would raise on an empty last axis
        return np.exp(x)
    # keepdims=True keeps a trailing axis of length 1 so the per-row
    # max / sum broadcast back against the rows of x
    exps = np.exp(x - np.max(x, axis=-1, keepdims=True))
    return exps / np.sum(exps, axis=-1, keepdims=True)

def attention(Q, K, V):
    """Scaled dot-product attention for a batch of queries.

    assumes Q is a matrix of shape (n_q, d_k)
    assumes K is a matrix of shape (n_k, d_k)
    assumes V is a matrix of shape (n_k, d_v)
    output is a matrix of shape (n_q, d_v)
    """
    scale = np.sqrt(K.shape[-1])   # sqrt(d_k), read off the key matrix
    scores = Q @ K.T / scale       # one row of key scores per query
    return softmax(scores) @ V     # per-query weighted sum of the value rows
