[toc] 

# Attention_pooling

一般的 pooling，会用一个 `[1, seq_len]` 的权重向量左乘一个 `[batch_size, seq_len, hidden_size]` 的张量，得到一个 `[batch_size, hidden_size]` 的矩阵。

这样的坏处是：batch 中每个样本使用的权重向量都是相同的，无法根据输入内容进行调整。

## normal pool

In [None]:
import numpy as np

# Seed NumPy's global RNG so the random demo tensor is reproducible.
np.random.seed(12345)

# Dimensions of the demo input.
batch_size, seq_len, hidden_size = 2, 3, 4

# Random demo input of shape [batch_size, seq_len, hidden_size].
x = np.random.randn(batch_size, seq_len, hidden_size)
def normal_pooling(x, weight=None):
    """Pool a sequence tensor with one weight vector shared by all samples.

    Args:
        x: Array of shape [batch_size, seq_len, hidden_size].
        weight: Optional pooling weights of shape [1, seq_len]. When omitted,
            a random vector is drawn and divided by the sum of its entries.

    Returns:
        Array of shape [batch_size, hidden_size] when ``weight`` has a single
        row; otherwise the raw ``np.matmul(weight, x)`` result.
    """
    _, seq_len, _ = x.shape
    if weight is None:
        weight = np.random.randn(1, seq_len)
        weight /= np.sum(weight)

    # [1, seq_len] @ [batch_size, seq_len, hidden_size] broadcasts over the
    # batch axis and yields [batch_size, 1, hidden_size].
    pooled = np.matmul(weight, x)

    # Remove only the singleton pooling axis. A bare squeeze() would also
    # drop the batch or hidden axis whenever batch_size or hidden_size is 1.
    if pooled.ndim == 3 and pooled.shape[1] == 1:
        pooled = np.squeeze(pooled, axis=1)
    return pooled

# Pool the demo input; no weight is passed, so normal_pooling draws a random one.
res = normal_pooling(x)
print(res)
# Trailing bare expression; presumably displayed as the notebook cell's output.
res.shape

## attention pooling

attention pooling 从一个 `[1, hidden_size]` 的 query 出发，与每个位置的 key 计算相似度并经过 softmax，构造出 `attention_probs`，使得 batch 中不同样本的权重各不相同。

In [32]:
def dense(x, units, use_bias=False):
    """Apply a randomly initialised linear projection to the last axis of x.

    Args:
        x: Array whose last axis is the input feature dimension.
        units: Size of the output feature dimension.
        use_bias: When True, a random bias of length ``units`` is added.

    Returns:
        ``x`` projected to ``units`` features along its last axis.
    """
    # The kernel is drawn first and the bias second, so the global RNG is
    # consumed in a fixed order.
    kernel = np.random.randn(x.shape[-1], units)
    projected = np.matmul(x, kernel)
    if not use_bias:
        return projected
    offset = np.random.randn(units)
    return projected + offset
    
def reshape_to_matrix(x):
    """Collapse all leading axes of x into one, keeping the last axis as columns."""
    last_dim = x.shape[-1]
    return np.reshape(x, (-1, last_dim))

def softmax(x):
    """Compute a numerically stable softmax over each row of a 2-D array.

    Args:
        x: Array of shape [rows, cols].

    Returns:
        Array of the same shape whose rows are non-negative and sum to 1.

    Raises:
        ValueError: If ``x`` is not 2-dimensional.
    """
    if x.ndim != 2:
        raise ValueError("softmax expects a 2-D array, got ndim=%d" % x.ndim)

    # Subtract the row maximum so the largest exponent is exp(0) == 1; this
    # prevents overflow and guarantees the row sum is at least 1.
    shifted = x - np.max(x, axis=1, keepdims=True)
    exp_x = np.exp(shifted)

    # No epsilon is needed in the denominator (it is >= 1), and adding one
    # would make the probabilities sum to slightly less than 1.
    return exp_x / np.sum(exp_x, axis=1, keepdims=True)
    
def attention_pooling(x,
                      weight=None,
                      input_mask=None):
    """Pool a sequence tensor with attention weights computed per sample.

    A random query vector is projected and compared with a projection of
    every position of ``x``; the softmax of those scores gives each sample
    its own pooling weights over the sequence axis.

    Args:
        x: Array of shape [batch_size, seq_length, hidden_size].
        weight: Currently unused; kept so existing callers keep working.
        input_mask: Optional array of shape [batch_size, seq_length], where
            1 marks a position to attend to and 0 a position to ignore.

    Returns:
        Array of shape [batch_size, hidden_size].
    """
    batch_size, seq_size, hidden_size = x.shape

    # Random stand-in for a trainable query vector, shape [1, hidden_size].
    query = np.random.randn(1, hidden_size)

    # Rescale the query by the sum of its entries. This is only a simple
    # stand-in for layer norm, not a true layer normalisation.
    query /= np.sum(query)

    # Linear projection of the query, shape [1, hidden_size].
    query = dense(query, hidden_size, use_bias=False)

    # Flatten positions: shape [batch_size * seq_length, hidden_size].
    key = reshape_to_matrix(x)

    # Linear projection of the keys (same shape).
    key = dense(key, hidden_size, use_bias=False)

    # Dot-product scores, shape [batch_size * seq_length, 1].
    attention_scores = np.matmul(key, query.T)

    # Scale by 1 / sqrt(d) to keep the scores in a moderate range.
    attention_scores = np.multiply(
        attention_scores, 1.0 / np.sqrt(float(hidden_size)))

    # Back to one row of scores per sample: [batch_size, seq_length].
    attention_scores = np.reshape(attention_scores, [batch_size, seq_size])

    if input_mask is not None:
        # 1 means effective, 0 means not effective. np.cast is not callable
        # (and was removed in NumPy 2.0), so convert with np.asarray instead.
        mask = np.asarray(input_mask, dtype=np.float32)
        # A large negative offset drives masked positions to ~0 after softmax.
        adder = (1.0 - mask) * -10000.0
        attention_scores += adder

    # Per-sample pooling weights, shape [batch_size, seq_length].
    attention_probs = softmax(attention_scores)

    # Add a row axis for the batched matmul: [batch_size, 1, seq_length].
    attention_probs = np.expand_dims(attention_probs, axis=1)

    # Weighted sum over the sequence axis: [batch_size, 1, hidden_size].
    pooling_result = np.matmul(attention_probs, x)

    # Drop the singleton row axis: [batch_size, hidden_size].
    return np.squeeze(pooling_result, axis=1)

# Run attention pooling on the demo input without a mask.
attention_pooling(x)

array([[-0.20455599,  0.47897794, -0.51925441, -0.55570914],
       [-1.99910161, -0.37118293,  1.66855777, -0.43718541]])