# coverage

1. attention
2. loss

## Bahdanau Attention

![](img/attention.png)

In [27]:
class BahdanauAttention(tf.keras.layers.Layer):
    """Additive (Bahdanau) attention over the encoder outputs.

    Each encoder position is scored with
    ``V(tanh(W_s(enc_output) + W_h(dec_hidden)))``; the scores are normalised
    with a softmax over axis 1 and used to build a weighted sum of the encoder
    outputs.
    """

    def __init__(self, units):
        """Create the projection layers.

        Args:
            units: Output width of the ``W_s`` and ``W_h`` Dense projections.
        """
        super(BahdanauAttention, self).__init__()
        self.W_s = tf.keras.layers.Dense(units)
        self.W_h = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, dec_hidden, enc_output):
        """Compute the context vector and the attention weights.

        Args:
            dec_hidden: Decoder hidden state used as the query
                (expected shape (batch, hidden) -- TODO confirm against caller).
            enc_output: Encoder outputs used as the values
                (expected shape (batch, max_len, enc_units) -- TODO confirm).

        Returns:
            A tuple ``(context_vector, attention_weights)``: the encoder outputs
            summed over axis 1 after weighting, and the softmax weights whose
            last dimension is 1 (it comes from ``Dense(1)``).
        """
        # Insert an axis at position 1 so the projected query broadcasts
        # against every encoder position when the two projections are added.
        hidden_with_time_axis = tf.expand_dims(dec_hidden, 1)
        score = self.V(tf.nn.tanh(
            self.W_s(enc_output) + self.W_h(hidden_with_time_axis)))

        # Normalise over the sequence axis.
        attention_weights = tf.nn.softmax(score, axis=1)

        # Weighted sum of the encoder outputs.
        context_vector = attention_weights * enc_output
        context_vector = tf.reduce_sum(context_vector, axis=1)
        return context_vector, attention_weights

## Coverage Attention

![](img/attention.png)

![](img/e_t.png)

## 改造$e^t$

In [None]:
class BahdanauAttention(tf.keras.layers.Layer):
    """Additive attention whose score also depends on the previous coverage.

    The score is ``V(tanh(W_s(enc_output) + W_h(dec_hidden) + W_c(prev_coverage)))``.
    """

    def __init__(self, units):
        """Build the three ``units``-wide projections and the scalar scorer."""
        super().__init__()
        self.W_s = tf.keras.layers.Dense(units)
        self.W_h = tf.keras.layers.Dense(units)
        self.W_c = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, dec_hidden, enc_output, enc_pad_mask, use_coverage, prev_coverage):
        """Return ``(context_vector, attention_weights)``.

        ``enc_pad_mask`` and ``use_coverage`` are accepted but not used by this
        version; ``prev_coverage`` is always passed through ``W_c``.

        Shapes noted in the notebook: the projected encoder outputs and coverage
        are [batch_sz, max_len, units], the projected query is
        [batch_sz, 1, units] and the score is [batch_sz, max_len, 1].
        """
        # Query with an extra axis at position 1 for broadcasting.
        query = tf.expand_dims(dec_hidden, axis=1)

        # The three additive terms of the score, projected in this order.
        encoder_term = self.W_s(enc_output)
        decoder_term = self.W_h(query)
        coverage_term = self.W_c(prev_coverage)
        energy = self.V(tf.nn.tanh(encoder_term + decoder_term + coverage_term))

        # Softmax over the sequence axis, then the weighted sum over that axis.
        attention_weights = tf.nn.softmax(energy, axis=1)
        weighted_values = attention_weights * enc_output
        context_vector = tf.reduce_sum(weighted_values, axis=1)
        return context_vector, attention_weights

## mask + coverage

In [None]:
# token ids: 1 2 3 4 5 6 0 0  ->  padding mask: 1 1 1 1 1 1 0 0  (0 marks a padded position)

In [None]:
class BahdanauAttention(tf.keras.layers.Layer):
    """Additive attention with a padding mask and an optional coverage term.

    In the seq2seq model the decoder state is the query and the encoder
    outputs are the values.
    """

    def __init__(self, units):
        """Create the projection layers.

        Args:
            units: Output width of the ``W_s``, ``W_h`` and ``W_c`` projections.
        """
        super(BahdanauAttention, self).__init__()
        self.W_s = tf.keras.layers.Dense(units)
        self.W_h = tf.keras.layers.Dense(units)
        self.W_c = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, dec_hidden, enc_output, enc_pad_mask, use_coverage, prev_coverage):
        """Compute masked attention and, optionally, the updated coverage.

        Args:
            dec_hidden: Decoder hidden state (query), (batch_size, hidden size).
            enc_output: Encoder outputs (values), (batch_size, max_len, enc_units).
            enc_pad_mask: (batch_size, max_len) mask that is non-zero on real
                tokens and zero on padding; it is cast to the score dtype.
            use_coverage: Whether the coverage mechanism is enabled.
            prev_coverage: Coverage accumulated so far, or ``None`` when there
                is none yet.

        Returns:
            ``(context_vector, attention_weights, coverage)``. ``coverage`` is
            ``attention_weights + prev_coverage`` when coverage is enabled and
            a previous coverage exists, ``attention_weights`` when coverage is
            enabled without a previous coverage, and ``None`` when coverage is
            disabled.
        """
        # hidden_with_time_axis shape == (batch_size, 1, hidden size);
        # the extra axis lets the query broadcast over every encoder position.
        hidden_with_time_axis = tf.expand_dims(dec_hidden, 1)

        has_prev_coverage = use_coverage and prev_coverage is not None

        # features shape == (batch_size, max_len, units)
        features = self.W_s(enc_output) + self.W_h(hidden_with_time_axis)
        if has_prev_coverage:
            features = features + self.W_c(prev_coverage)

        # score shape == (batch_size, max_len, 1); the trailing 1 comes from self.V.
        score = self.V(tf.nn.tanh(features))

        # Mask AFTER the softmax and renormalise. Multiplying the raw scores by
        # the mask would only set padded scores to 0, and exp(0) still receives
        # probability mass.
        mask = tf.expand_dims(tf.cast(enc_pad_mask, dtype=score.dtype), axis=-1)
        attention_weights = tf.nn.softmax(score, axis=1) * mask
        # divide_no_nan keeps a fully padded row at zero instead of NaN.
        attention_weights = tf.math.divide_no_nan(
            attention_weights,
            tf.reduce_sum(attention_weights, axis=1, keepdims=True))

        if has_prev_coverage:
            coverage = attention_weights + prev_coverage
        elif use_coverage:
            # No previous coverage yet: coverage starts from these weights.
            coverage = attention_weights
        else:
            # Always bind coverage so the return below cannot fail.
            coverage = None

        # Attention-weighted sum of the encoder outputs; it is later fed to the decoder.
        # context_vector shape after sum == (batch_size, enc_units)
        context_vector = attention_weights * enc_output
        context_vector = tf.reduce_sum(context_vector, axis=1)

        return context_vector, attention_weights, coverage

## 2.2 coverage_loss

![](img/loss_t.png)

## log loss

In [34]:
# 定义损失函数
def loss_function(real, pred):
    """Return the padding-masked loss averaged over every position.

    Positions where ``real`` equals ``pad_index`` are multiplied by zero before
    the mean, so they add nothing to the sum but are still counted in the
    denominator of ``tf.reduce_mean``.

    Args:
        real: Target token ids.
        pred: Predictions passed to ``loss_object`` together with ``real``.

    Returns:
        The mean of the masked values returned by ``loss_object``.
    """
    is_real_token = tf.math.logical_not(tf.math.equal(real, pad_index))
    token_losses = loss_object(real, pred)
    weights = tf.cast(is_real_token, dtype=token_losses.dtype)
    return tf.reduce_mean(token_losses * weights)

## log loss + mask batch loss

In [35]:
# 定义损失函数
def loss_function(real, pred, padding_mask=None):
    """Sum, over time steps, the batch-mean loss of each step.

    Args:
        real: Target token ids, indexed as ``real[:, t]``.
        pred: Predictions, indexed as ``pred[:, t, :]``.
        padding_mask: Optional mask indexed as ``padding_mask[:, t]``; it is
            cast to the loss dtype and multiplied into the per-step loss.
            ``None`` (the default) disables masking.

    Returns:
        The sum over time steps of the per-step mean over the batch.

    Note:
        Assumes ``loss_object`` returns one loss per example (no reduction)
        -- TODO confirm where ``loss_object`` is created.
    """
    loss = 0
    for t in range(real.shape[1]):
        loss_ = loss_object(real[:, t], pred[:, t, :])
        # Test against None explicitly: the truth value of a multi-element
        # tensor is ambiguous, so ``if padding_mask:`` raises.
        if padding_mask is not None:
            loss_ *= tf.cast(padding_mask[:, t], dtype=loss_.dtype)
        loss += tf.reduce_mean(loss_, axis=0)  # batch-wise
    # loss is already a scalar here; reduce_mean leaves its value unchanged.
    return tf.reduce_mean(loss)

Coverage 机制将此前所有时间步的注意力权重累加，得到覆盖向量 $c^t$（coverage vector）：$c^t = \sum_{t'=0}^{t-1} a^{t'}$。用先前的注意力决策来影响当前的注意力决策，可以避免反复关注同一位置，从而避免重复生成文本。计算上，先计算 coverage vector $c^t$：
![](img/c_t.png)
+ $c^t$ 是一个长度等于输入序列长度的向量
+ 它的第一个分量是此前各时刻对输入第一个词的 attention 权重之和，其余分量依此类推

+ 引入 $c^t$ 的目的是把此前已经关注过的位置信息提供给 attention：如果某些词之前已经被关注（生成）过，后续就要抑制对它们的关注。这种抑制通过在 loss 函数中加入惩罚项（coverage loss）实现。

## 两个地方使用$c_t$:

+ 注意力权重的计算过程中 $e^t_i$
+ cov_loss

In [None]:
class BahdanauAttention(tf.keras.layers.Layer):
    # Outline only: it highlights where the coverage vector is updated inside
    # call(). Layer construction and the score computation are left out, and
    # call() has no return statement.
    def __init__(self, units):
        # Layer creation is omitted in this outline.
        pass
        
    def call(self, dec_hidden, enc_output, enc_pad_mask, use_coverage, prev_coverage):
        if use_coverage and prev_coverage is not None:
            # Placeholder for the score computation; `score` is not defined
            # anywhere in this outline.
            pass
            attention_weights = tf.nn.softmax(score, axis=1)
            # Add the current attention weights to the previous coverage.
            coverage = attention_weights + prev_coverage
        else:    
            # NOTE(review): attention_weights is never assigned on this path
            # in the outline.
            if use_coverage:
                # Coverage enabled but no previous coverage: start from the
                # current attention weights.
                coverage = attention_weights

![](img/cobloss.png)

以第一时刻为例，$c^1$ 表示的是 $a^0$，那么 loss 就是对 $a^0$ 与 $a^1$ 逐位置取较小值再求和：$\text{covloss}_1 = \sum_i \min(a_i^1, c_i^1)$。

如果 $a^0$ 和 $a^1$ 关注的是同样的分布，那么 loss 就会比较大；如果它们关注的是不同的分布，由于每个位置取的是两者中较小的那一个，loss 就会比较小。目的是让每一个时刻关注的分布都不一样，这样可以避免重复（repeat）。

原文链接：https://blog.csdn.net/ganxiwu9686/article/details/87521054

`<START> 举起 车辆 左 前轮 缸体 上 <STOP> <PAD> <PAD> `

`padding_mask`（对应去掉 `<START>` 后的目标序列，共 9 个位置）->`[1,1,1,1,1,1,1,0,0]`

In [28]:
def mask_coverage_loss(attn_dists, coverages, padding_mask):
    """Calculate the padding-masked coverage loss from the attention distributions.

    For every decoder step the loss is the sum, over encoder positions, of the
    element-wise minimum of the attention distribution and the coverage vector.

    Args:
        attn_dists: Attention distributions, [max_len_y, batch_sz, max_len_x, 1].
        coverages: Coverage vectors, [max_len_y, batch_sz, max_len_x, 1].
        padding_mask: Decoder-side mask, [batch_sz, max_len_y]; it is cast to
            the loss dtype and multiplied into the per-step losses.

    Returns:
        Scalar: the per-step losses averaged over the batch, then summed over
        the decoder steps.

    Note:
        NOTE(review): ``coverages[t]`` is presumably the coverage accumulated
        *before* step ``t``. If it already includes ``attn_dists[t]``, the
        minimum always equals ``attn_dists[t]`` -- confirm against the caller.
    """
    # Drop the trailing singleton axis: [max_len_y, batch_sz, max_len_x].
    attn_dists = tf.squeeze(attn_dists, axis=3)
    coverages = tf.squeeze(coverages, axis=3)

    # Element-wise minimum summed over max_len_x, for all steps at once:
    # [max_len_y, batch_sz].
    step_losses = tf.reduce_sum(tf.minimum(attn_dists, coverages), axis=-1)

    # Change from [max_len_y, batch_sz] to [batch_sz, max_len_y] to line up
    # with padding_mask.
    cover_losses = tf.transpose(step_losses, perm=[1, 0])

    # Take the dtype from the loss tensor itself rather than from a loop
    # variable, which would be unbound when there are zero decoder steps.
    mask = tf.cast(padding_mask, dtype=cover_losses.dtype)
    cover_losses *= mask

    # Mean loss of each time step over the batch, then summed up.
    loss = tf.reduce_sum(tf.reduce_mean(cover_losses, axis=0))
    tf.print('coverage loss(batch sum):', loss)
    return loss

# loss改变

![](img/loss_t_coverage.png)

In [38]:
# Plain log loss: two-argument call to loss_function; the target is sliced
# with [:, 1:], i.e. its first time step is dropped.
batch_loss = loss_function(dec_target[:, 1:], predictions)

In [39]:
# Total loss = masked log loss + cov_loss_wt * coverage loss.
# The coverage term uses mask_coverage_loss, the helper defined in this
# notebook; no function named `coverage_loss` is defined here.
batch_loss = (
    loss_function(dec_target, predictions, padding_mask)
    + cov_loss_wt * mask_coverage_loss(attentions, coverages, padding_mask)
)