In [1]:
import os
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers


# 使用 CPU
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

if tf.test.gpu_device_name():
    print('GPU found')
else:
    print("No GPU found")

No GPU found


# 遮挡

## 填充

In [12]:
raw_inputs = [
    [83, 91, 1, 645, 1253, 927],
    [73, 8, 3215, 55, 927],
    [711, 632, 71],
]

padded_inputs = tf.keras.preprocessing.sequence.pad_sequences(raw_inputs,
                                                              padding='post')
padded_inputs

array([[  83,   91,    1,  645, 1253,  927],
       [  73,    8, 3215,   55,  927,    0],
       [ 711,  632,   71,    0,    0,    0]], dtype=int32)

## 生成遮挡

### 嵌入层生成遮挡
> 设置属性`mask_zero=True`

In [4]:
embedding = layers.Embedding(
    input_dim=5000,
    output_dim=16,
    mask_zero=True,  # 遮挡索引 0 
)

masked_output = embedding(padded_inputs)

print(masked_output._keras_mask)
# 幕后会创建一个遮挡矩阵

tf.Tensor(
[[ True  True  True  True  True  True]
 [ True  True  True  True  True False]
 [ True  True  True False False False]], shape=(3, 6), dtype=bool)


### 遮挡层

In [13]:
masking_layer = layers.Masking()

unmasked_embedding = tf.cast(
    tf.tile(tf.expand_dims(padded_inputs, axis=-1), [1, 1, 10]), tf.float32)
# 模拟批量的词嵌入查找结果，batch，seq_len,embed_size

masked_embedding = masking_layer(unmasked_embedding)
print(masked_embedding._keras_mask)
# mask 结果：batch,seq_len，表征序列中哪个词被遮挡

tf.Tensor(
[[ True  True  True  True  True  True]
 [ True  True  True  True  True False]
 [ True  True  True False False False]], shape=(3, 6), dtype=bool)


## 遮挡的传播

### Sequential 模型接口


In [16]:
model = tf.keras.Sequential([
    layers.Embedding(
        input_dim=5000,
        output_dim=16,
        mask_zero=True, # 设置遮挡
    ),
    layers.LSTM(32) # LSTM 层会自动接受遮挡，忽略填充的值
])

### Functional 模型


In [None]:
inputs = tf.keras.Input(shape=(None, ), dtype='int32')
x = layers.Embedding(
    input_dim=5000,
    output_dim=16,
    mask_zero=True,  # 设置遮挡
)(inputs)
outputs = layers.LSTM(32)(x)

model = tf.keras.Model(inputs, outputs)

### 自定义层

In [17]:

class MyLayer(layers.Layer):
    def __init__(self, **kwargs):
        super(MyLayer, self).__init__(**kwargs)
        self.embedding = layers.Embedding(
            input_dim=5000,
            output_dim=16,
            mask_zero=True,  # 设置遮挡
        )
        self.lstm = layers.LSTM(32)

    def call(self, inputs):
        x = self.embedding(inputs)

        # 创建遮挡
        # 也可以手动创建，只需要形状 batch,timesteps 的 bool 类型张量
        mask = self.embedding.compute_mask(inputs)

        output = self.lstm(
            x,
            mask=mask,  # 传递遮挡参数，自动忽略被遮挡的单词
        )
        return output


layer = MyLayer()
x = np.random.random((32, 10)) * 100
x = x.astype('int32')
layer(x)

<tf.Tensor: shape=(32, 32), dtype=float32, numpy=
array([[-4.2033270e-03, -3.2198273e-03, -3.2618330e-03, ...,
        -5.1559320e-05, -7.5503844e-03, -2.6671903e-03],
       [-1.5757827e-03, -8.7853119e-04, -5.4395376e-03, ...,
         1.3437866e-03,  4.2481534e-03,  1.0843794e-03],
       [-2.6922526e-03,  6.8654576e-03,  3.0769468e-03, ...,
        -6.4012818e-03, -8.7724405e-04,  5.9047979e-03],
       ...,
       [ 3.6077171e-03,  2.4239554e-03, -1.6135219e-03, ...,
        -4.7010854e-03, -7.1414036e-04, -1.5066147e-03],
       [ 1.7981452e-04,  8.5821534e-03, -1.8688419e-04, ...,
        -3.0656459e-03, -1.5515659e-03, -1.7525189e-03],
       [ 2.2413780e-03,  3.6969897e-03,  2.5818348e-03, ...,
         4.7936742e-03, -1.7710224e-03, -1.6857670e-03]], dtype=float32)>

## 支持 mask 的自定义层
> 在层中定义 layer.compute_mask 方法，以默认的遮挡和给定输入，生成新的遮挡

In [23]:
masked_embedding

<tf.Tensor: shape=(3, 6, 10), dtype=float32, numpy=
array([[[8.300e+01, 8.300e+01, 8.300e+01, 8.300e+01, 8.300e+01,
         8.300e+01, 8.300e+01, 8.300e+01, 8.300e+01, 8.300e+01],
        [9.100e+01, 9.100e+01, 9.100e+01, 9.100e+01, 9.100e+01,
         9.100e+01, 9.100e+01, 9.100e+01, 9.100e+01, 9.100e+01],
        [1.000e+00, 1.000e+00, 1.000e+00, 1.000e+00, 1.000e+00,
         1.000e+00, 1.000e+00, 1.000e+00, 1.000e+00, 1.000e+00],
        [6.450e+02, 6.450e+02, 6.450e+02, 6.450e+02, 6.450e+02,
         6.450e+02, 6.450e+02, 6.450e+02, 6.450e+02, 6.450e+02],
        [1.253e+03, 1.253e+03, 1.253e+03, 1.253e+03, 1.253e+03,
         1.253e+03, 1.253e+03, 1.253e+03, 1.253e+03, 1.253e+03],
        [9.270e+02, 9.270e+02, 9.270e+02, 9.270e+02, 9.270e+02,
         9.270e+02, 9.270e+02, 9.270e+02, 9.270e+02, 9.270e+02]],

       [[7.300e+01, 7.300e+01, 7.300e+01, 7.300e+01, 7.300e+01,
         7.300e+01, 7.300e+01, 7.300e+01, 7.300e+01, 7.300e+01],
        [8.000e+00, 8.000e+00, 8.000e+00, 8

In [32]:
class Default(tf.keras.layers.Layer):
    def call(self, inputs):
        # 该层会生成张量列表格，默认的 遮挡 masking(inputs) 将不能满足要求
        return tf.split(inputs, 2, axis=1)
    
    def compute_mask(self, inputs, mask=None):
        if mask is None:
            return None
        # 默认的 mask
        return mask   
    
a, b = Default()(masked_embedding)
print(a._keras_mask) # 默认的 mask，不满足要求
print(b._keras_mask) # 无 mask

tf.Tensor(
[[ True  True  True  True  True  True]
 [ True  True  True  True  True False]
 [ True  True  True False False False]], shape=(3, 6), dtype=bool)


AttributeError: 'tensorflow.python.framework.ops.EagerTensor' object has no attribute '_keras_mask'

In [34]:
class TemporalSplit(tf.keras.layers.Layer):
    def call(self, inputs):
        # 该层会生成张量列别，默认的 遮挡 masking(inputs) 将不能满足要求
        return tf.split(inputs, 2, axis=1)

    def compute_mask(self, inputs, mask=None):
        if mask is None:
            return None
        # 将 mask 也拆分成列表
        return tf.split(mask, 2, axis=1)

first_half, second_half = TemporalSplit()(masked_embedding)
print(first_half._keras_mask)
print(second_half._keras_mask)

tf.Tensor(
[[ True  True  True]
 [ True  True  True]
 [ True  True  True]], shape=(3, 3), dtype=bool)
tf.Tensor(
[[ True  True  True]
 [ True  True False]
 [False False False]], shape=(3, 3), dtype=bool)


In [37]:
class CustomEmbedding(tf.keras.layers.Layer):
    def __init__(self, input_dim, output_dim, mask_zero=False, **kwargs):
        super(CustomEmbedding, self).__init__(**kwargs)
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.mask_zero = mask_zero

    def build(self, input_shape):
        self.embeddings = self.add_weight(
            shape=(self.input_dim, self.output_dim),
            initializer='random_normal',
            dtype='float32',
        )

    def call(self, inputs):
        return tf.nn.embedding_lookup(self.embeddings, inputs)

    def compute_mask(self, inputs, mask=None):
        if not self.mask_zero:
            return None
        return tf.not_equal(inputs, 0)


layer = CustomEmbedding(10, 32, mask_zero=True)
x = np.random.random((3, 10)) * 9
x = x.astype('int32')

y = layer(x)
mask = layer.compute_mask(x)

print(mask)

tf.Tensor(
[[False  True  True False False  True False  True  True  True]
 [ True  True  True  True  True  True False  True  True  True]
 [ True  True  True  True  True  True  True  True  True  True]], shape=(3, 10), dtype=bool)


## 带 mask 参数的层
> 添加 `mask=None` 参数，

In [None]:
class MaskConsumer(tf.keras.layers.Layer):
    def call(self, inputs, mask=None):
        pass

## 总结：
- masking 让层知道应该跳过或忽略输入序列的特定的时间步
- 一些层，自动生成 masking，如：指定mask_zero=True 的 Embedding 层，Masking 层
- 一些层是 mask 的消费者，在其 `__call__` 方法中公开 mask 参数，如 RNN 层
- 函数式及序列式模型接口中，遮挡信息会自动传播

# 注意力中的遮挡
## 填充产生的遮挡
> 输入序列可能是填充为等长

In [42]:
def create_padding_mask(seq):
    seq = tf.cast(tf.math.equal(seq, 0), tf.float32)
    # 添加额外的维度来将填充加到
    # 注意力对数（logits）。
    return seq[:, tf.newaxis, tf.newaxis, :]  # (batch, 1, 1, seq_len)


x = tf.constant([[7, 6, 0], [1, 2, 3], [1, 0, 0]])
create_padding_mask(x)

<tf.Tensor: shape=(3, 1, 1, 3), dtype=float32, numpy=
array([[[[0., 0., 1.]]],


       [[[0., 0., 0.]]],


       [[[0., 1., 1.]]]], dtype=float32)>

In [40]:
def scaled_dot_product_attention(q, k, v, mask):
    """
    q: batch, q_len, q_dim
    k: batch, k_len, q_dim
    v: batch, k_len, v_dim
    
    mask: batch, 1,1, k_len
    """
    attention_score = tf.matmul(q, k, transpose_b=True)  # batch,q_len,k_len
    dk = tf.cast(tf.shape(k)[-1], tf.float32)
    scaled_attetntion_score = attention_score / tf.math.sqrt(dk)

    # 被 mask 的位置对应的权重乘以 -1e9，sofmax时，该处的概率就变为 0 了
    if mask is not None:
        scaled_score += (mask * -1e9)

    attention_distribution = tf.nn.softmax(scaled_attetntion_score, -1)
    output = tf.matmul(attention_distribution, v)  # batch,q_len,v_dim
    return output

## 前瞻遮挡
- 前瞻遮挡（look-ahead mask）用于遮挡一个序列中的后续标记（future tokens）。
> 这意味着要预测第三个词，将仅使用第一个和第二个词。与此类似，预测第四个词，仅使用第一个，第二个和第三个词，依此类推。 

In [41]:
def create_look_ahead_mask(size):
    mask = 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0)
    return mask  # (seq_len, seq_len)


x = tf.random.uniform((1, 3))
temp = create_look_ahead_mask(x.shape[1])
temp

<tf.Tensor: shape=(3, 3), dtype=float32, numpy=
array([[0., 1., 1.],
       [0., 0., 1.],
       [0., 0., 0.]], dtype=float32)>