原始链接：https://keras.io/examples/generative/text_generation_with_miniature_gpt/

In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
import numpy as np
import os
import re
import string
import random

# 自注意力层
> TODO：填充的掩码，如何处理？

In [18]:
class MultiHeadAttention(layers.Layer):
    """Multi-head self-attention with a causal (look-ahead) mask.

    The input is projected into queries, keys and values, each projection is
    split into ``num_heads`` heads of width ``embed_dim // num_heads``, scaled
    dot-product attention is applied per head under a causal mask, and the
    heads are merged again through a final dense projection.

    Only the causal mask is applied; padding positions are not masked.

    Args:
        embed_dim: Width of the query/key/value projections and of the output.
        num_heads: Number of attention heads; must divide ``embed_dim``.

    Raises:
        ValueError: If ``embed_dim`` is not divisible by ``num_heads``.
    """

    def __init__(self, embed_dim, num_heads=8):
        super(MultiHeadAttention, self).__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads

        if embed_dim % num_heads != 0:
            raise ValueError(
                f"embedding dimension = {embed_dim} should be divisible by number of heads = "
                f"{num_heads}")
        # Width of a single attention head.
        self.projection_dim = embed_dim // num_heads
        self.query_dense = layers.Dense(embed_dim)
        self.key_dense = layers.Dense(embed_dim)
        self.value_dense = layers.Dense(embed_dim)
        # Output projection applied after the heads are concatenated.
        self.combined_heads = layers.Dense(embed_dim)

    @staticmethod
    def casual_attention_mask(n_dest, n_src, dtype):
        """Build the causal mask (the method keeps its original spelling).

        Args:
            n_dest: Length of the destination (query) sequence.
            n_src: Length of the source (key) sequence.
            dtype: dtype of the returned tensor.

        Returns:
            A ``[n_dest, n_src]`` tensor whose entry ``(i, j)`` is 1 when
            ``i >= j - n_src + n_dest`` and 0 otherwise. For
            ``n_dest == n_src`` this is a lower-triangular matrix of ones.
        """

        i = tf.range(n_dest)[:, None]
        j = tf.range(n_src)
        m = i >= j - n_src + n_dest
        return tf.cast(m, dtype)

    def attention(self, query, key, value):
        """Apply causally masked scaled dot-product attention.

        Args:
            query: Tensor of shape (batch_size, num_heads, seq_len, projection_dim).
            key: Tensor of the same shape as ``query``.
            value: Tensor of the same shape as ``query``.

        Returns:
            A tuple ``(output, weights)`` where ``output`` has shape
            (batch_size, num_heads, seq_len, projection_dim) and ``weights``
            has shape (batch_size, num_heads, seq_len, seq_len).
        """

        # (batch_size, num_heads, seq_len, seq_len)
        score = tf.matmul(query, key, transpose_b=True)
        # Scale in the dtype of the scores so that non-float32 inputs (for
        # example float16 under a mixed-precision policy) do not hit a dtype
        # mismatch in the division below.
        dim_key = tf.cast(tf.shape(key)[-1], score.dtype)
        scaled_score = score / tf.math.sqrt(dim_key)

        # Prevent each position from seeing the tokens that come after it.
        shape = tf.shape(scaled_score)
        dim_dest, dim_src = shape[2], shape[3]
        attention_mask = self.casual_attention_mask(
            dim_dest,
            dim_src,
            scaled_score.dtype,
        )
        # Add leading axes so the mask broadcasts over batch and heads.
        attention_mask = tf.reshape(attention_mask, [1, 1, dim_dest, dim_src])
        # Allowed scores are kept; masked scores are replaced by -1e4 so the
        # softmax gives them a weight close to zero.
        scaled_score = scaled_score * attention_mask - 1e4 * (1 -
                                                              attention_mask)

        # (batch_size, num_heads, seq_len, seq_len)
        weights = tf.nn.softmax(scaled_score, axis=-1)

        # (batch_size, num_heads, seq_len, projection_dim)
        output = tf.matmul(weights, value)
        return output, weights

    def separate_heads(self, x, batch_size):
        """Reshape (batch_size, seq_len, embed_dim) into per-head tensors.

        Returns:
            A tensor of shape (batch_size, num_heads, seq_len, projection_dim).
        """
        x = tf.reshape(
            x,
            (batch_size, -1, self.num_heads, self.projection_dim),
        )
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, inputs, **kwargs):
        """Run causal self-attention over ``inputs``.

        Args:
            inputs: Tensor of shape (batch_size, seq_len, input_dim).

        Returns:
            A tensor of shape (batch_size, seq_len, embed_dim).
        """
        batch_size = tf.shape(inputs)[0]

        # (batch_size, seq_len, embed_dim)
        query = self.query_dense(inputs)
        key = self.key_dense(inputs)
        value = self.value_dense(inputs)

        # (batch_size, num_heads, seq_len, projection_dim)
        query = self.separate_heads(query, batch_size)
        key = self.separate_heads(key, batch_size)
        value = self.separate_heads(value, batch_size)

        # (batch_size, num_heads, seq_len, projection_dim)
        attention, weights = self.attention(query, key, value)
        # Move the head axis next to the feature axis so the heads can be
        # concatenated with a reshape: (batch_size, seq_len, num_heads, projection_dim).
        attention = tf.transpose(attention, perm=[0, 2, 1, 3])
        concat_attention = tf.reshape(attention,
                                      (batch_size, -1, self.embed_dim))
        # (batch_size, seq_len, embed_dim)
        output = self.combined_heads(concat_attention)
        return output

In [15]:
# Build a mask that hides every token after the current one.
# Destination and source sequence lengths used by the demo call below.
n_dest = 4
n_src = 4


def create_attention_mask(n_dest, n_src, dtype=tf.float32):
    """Return an ``[n_dest, n_src]`` causal mask cast to ``dtype``.

    Entry ``(i, j)`` is 1 when ``i >= j - n_src + n_dest`` and 0 otherwise;
    for ``n_dest == n_src`` the result is a lower-triangular matrix of ones.
    """
    # Column vector of destination positions, row vector of source positions.
    dest_positions = tf.expand_dims(tf.range(n_dest), axis=-1)
    src_positions = tf.range(n_src)
    visible = dest_positions >= src_positions - n_src + n_dest
    return tf.cast(visible, dtype)


# Display the mask for the 4x4 case configured above.
create_attention_mask(n_dest, n_src)

<tf.Tensor: shape=(4, 4), dtype=float32, numpy=
array([[1., 0., 0., 0.],
       [1., 1., 0., 0.],
       [1., 1., 1., 0.],
       [1., 1., 1., 1.]], dtype=float32)>

In [19]:
# Masking demo.
# batch_size=1,num_heads=1, seq_len=3
scaled_score = tf.random.uniform((1, 1, 3, 3), 0, 1, dtype=tf.float32)
print("Before mask:\n", scaled_score)

# The last two axes of the score tensor are the destination and source lengths.
_, _, dim_dest, dim_src = tf.shape(scaled_score)
attention_mask = create_attention_mask(dim_dest, dim_src)

# Keep the allowed scores unchanged and replace the masked ones with -1e4.
scaled_score = scaled_score * attention_mask - 1e4 * (1 - attention_mask)
print("After mask:\n", scaled_score)

Before mask:
 tf.Tensor(
[[[[0.10547268 0.7463316  0.45979226]
   [0.95134735 0.31935334 0.34476125]
   [0.6548263  0.9963918  0.38449323]]]], shape=(1, 1, 3, 3), dtype=float32)
After mask:
 tf.Tensor(
[[[[ 1.05472684e-01 -1.00000000e+04 -1.00000000e+04]
   [ 9.51347351e-01  3.19353342e-01 -1.00000000e+04]
   [ 6.54826283e-01  9.96391773e-01  3.84493232e-01]]]], shape=(1, 1, 3, 3), dtype=float32)


In [22]:
# Smoke test for MultiHeadAttention on random inputs.
# batch_size=1, seq_len=3, embed_size=4
inputs = tf.random.uniform((1, 3, 4), 0, 1, dtype=tf.float32)
embed_dim = 10
num_heads = 5  # divides embed_dim, giving heads of width 2
attention = MultiHeadAttention(embed_dim, num_heads)
output = attention(inputs)
print(output)

tf.Tensor(
[[[-0.38252696  0.36132467 -0.13862547 -1.1787404  -0.09665567
    0.02887212 -0.34742364  0.69407684  0.4977634   0.11721583]
  [-0.3253528   0.27771837 -0.38126877 -1.2979437  -0.3482508
    0.20360255 -0.34971687  0.7869941   0.6485061   0.3667486 ]
  [-0.29071248  0.1254045  -0.55469835 -1.0486559  -0.58551824
    0.3572234  -0.20800653  0.6436509   0.69355655  0.6243084 ]]], shape=(1, 3, 10), dtype=float32)


# Transformer 层

In [23]:
class TransformerBlock(layers.Layer):
    """Causal self-attention followed by a two-layer feed-forward network.

    Args:
        embed_dim: Width of the attention output and of the block output.
        num_heads: Number of attention heads.
        ff_dim: Hidden width of the feed-forward network.
        rate: Dropout rate applied after each sub-layer.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super().__init__()
        self.attn = MultiHeadAttention(embed_dim, num_heads)

        # Feed-forward sub-layer: expand to ff_dim with ReLU, project back.
        hidden_layer = layers.Dense(ff_dim, activation='relu')
        output_layer = layers.Dense(embed_dim)
        self.ffn = keras.Sequential([hidden_layer, output_layer])

        # Normalization and dropout, one pair per sub-layer.
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs):
        """Map (batch_size, seq_len, input_dim) to (batch_size, seq_len, embed_dim)."""
        attended = self.dropout1(self.attn(inputs))
        # NOTE(review): only the attention output is normalized here; there is
        # no residual connection from `inputs` around the attention sub-layer
        # (the feed-forward sub-layer below does have one). Confirm this is
        # intentional.
        normed = self.layernorm1(attended)
        fed_forward = self.dropout2(self.ffn(normed))
        return self.layernorm2(normed + fed_forward)

In [24]:
# Smoke test: run the demo `inputs` through a single TransformerBlock.
ff_dim = 10
transformer = TransformerBlock(embed_dim, num_heads, ff_dim)
print(transformer(inputs))

tf.Tensor(
[[[-0.00332756  1.1323926   1.0909868  -1.5782332   0.22285575
    0.91380244  0.5024045  -0.2707299  -0.05485689 -1.9552947 ]
  [-0.00368661  1.4767317   0.65466416  0.23449448 -0.6320796
    0.9992423  -0.26263362 -0.07430581  0.02879716 -2.421224  ]
  [-0.16078229  1.4712099  -0.09554663  1.1715316  -1.1065551
    0.80093515 -0.24299382 -0.16937007  0.40726507 -2.0756936 ]]], shape=(1, 3, 10), dtype=float32)


# 嵌入层

In [25]:
class TokenAndPositionEmbedding(layers.Layer):
    """Sum of a token embedding and a learned position embedding.

    Args:
        maxlen: Number of positions in the position-embedding table.
        vocab_size: Number of rows in the token-embedding table.
        embed_dim: Width of both embeddings.
    """

    def __init__(self, maxlen, vocab_size, embed_dim):
        super().__init__()
        # Token embedding table.
        self.token_embed = layers.Embedding(input_dim=vocab_size,
                                            output_dim=embed_dim)
        # Position embedding table; these are trainable weights as well.
        self.pos_embed = layers.Embedding(input_dim=maxlen,
                                          output_dim=embed_dim)

    def call(self, x):
        """Embed token ids ``x`` and add the embedding of each position."""
        # Positions 0 .. seq_len-1, taken from the last axis of the input.
        seq_len = tf.shape(x)[-1]
        position_ids = tf.range(start=0, limit=seq_len, delta=1)
        position_vectors = self.pos_embed(position_ids)
        token_vectors = self.token_embed(x)
        # The position vectors broadcast over the batch axis.
        return token_vectors + position_vectors

In [26]:
# Smoke test for TokenAndPositionEmbedding with a small table.
maxlen = 5
vocab_size = 20
embed = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)

# batch=2, seq_len=5
# Random token ids drawn from [1, 15), which is inside the 20-entry table.
x = tf.constant(np.random.randint(1, 15, (2, maxlen), dtype=np.int32))
print(embed(x))

tf.Tensor(
[[[ 0.00485352  0.04649875  0.01722922 -0.08977088  0.03237002
    0.03043242  0.0164837  -0.01169515 -0.03283117  0.07614163]
  [ 0.03652454  0.00519327 -0.04325895 -0.03345573 -0.0163762
   -0.01602948  0.06325639 -0.02854483  0.00524993 -0.04182397]
  [ 0.01462177  0.04132431  0.02875835  0.01675186 -0.01151166
    0.05898074  0.05975775  0.08203638  0.03354775 -0.00828075]
  [ 0.00346811  0.0666132   0.0304955  -0.08781236  0.04581798
    0.00912217 -0.01231779 -0.02989997  0.02603679  0.02221246]
  [ 0.0293895  -0.01145194  0.08149004 -0.03042657 -0.02672334
    0.04037578 -0.05371469 -0.04414493 -0.03230675  0.04327554]]

 [[-0.05091096  0.07347768  0.0649236  -0.04824299 -0.02064015
    0.00601267  0.00380222 -0.04921037  0.00894145 -0.00561922]
  [ 0.01696062  0.01604785  0.01192685 -0.04674814 -0.00258112
    0.06363203  0.02451729 -0.05413387  0.07132256 -0.02281509]
  [ 0.0071785   0.05307038  0.05278654  0.01842588 -0.04215529
    0.04731597  0.00161308  0.007695

# GPT 模型

In [27]:
# Hyperparameters; create_model() and the cells below read them as globals.
vocab_size = 20000  # token-embedding rows and width of the output logits
maxlen = 100  # model input length in tokens
embed_dim = 256  # embedding width
num_heads = 2  # attention heads in the transformer block
ff_dim = 256  # hidden width of the transformer's feed-forward network

In [28]:
def create_model():
    """Build and compile the miniature GPT model.

    The model maps ``(batch, maxlen)`` token ids to two outputs: the
    ``(batch, maxlen, vocab_size)`` logits and the ``(batch, maxlen,
    embed_dim)`` output of the transformer block. Only the logits carry a
    loss. Sizes come from the module-level ``maxlen``, ``vocab_size``,
    ``embed_dim``, ``num_heads`` and ``ff_dim``.

    Returns:
        The compiled ``keras.Model``.
    """
    # batch,seq_len --> batch,seq_len,embed_size
    token_ids = layers.Input(shape=(maxlen,), dtype=tf.int32)
    embedded = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)(token_ids)

    # batch,seq_len,embed_size --> batch,seq_len,vocab_size
    hidden = TransformerBlock(embed_dim, num_heads, ff_dim)(embedded)
    logits = layers.Dense(vocab_size)(hidden)
    model = keras.Model(inputs=token_ids, outputs=[logits, hidden])

    # The Dense head emits raw logits; the second output has no loss.
    losses = [
        tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        None,
    ]
    model.compile("adam", loss=losses)
    return model

# 训练数据

In [100]:
batch_size = 32  # number of text lines per dataset batch
filenames = []  # filled with file paths by the next cell
# Dataset sub-directories, relative to base_dir.
directories = [
    "aclImdb/train/pos",
    "aclImdb/train/neg",
    "aclImdb/test/pos",
    "aclImdb/test/neg",
]
base_dir = "../datasets"

In [101]:
# Collect the path of every file found in each dataset directory.
# The loop variable no longer shadows the builtin `dir`.
for sub_dir in directories:
    dir_path = os.path.join(base_dir, sub_dir)
    filenames.extend(
        os.path.join(dir_path, entry) for entry in os.listdir(dir_path))

print(f"{len(filenames)} files")

50000 files


In [102]:
# Build the data pipeline: read the files line by line in random file order,
# shuffle with a 256-element buffer, then batch.
random.shuffle(filenames)
text_ds = (
    tf.data.TextLineDataset(filenames)
    .shuffle(buffer_size=256)
    .batch(batch_size)
)

# Print one batch as a sanity check.
for data in text_ds.take(1):
    tf.print(data)



In [103]:
# Text preprocessing.
def custom_standardization(input_string):
    """Lowercase text, drop HTML line breaks and split off punctuation.

    Args:
        input_string: String tensor holding the raw text.

    Returns:
        A string tensor in which the text is lowercased, every ``<br />`` is
        replaced by a space, and every punctuation character is preceded by
        a space.
    """

    lowercased = tf.strings.lower(input_string)
    stripped_html = tf.strings.regex_replace(lowercased, "<br />", " ")
    # Escape the punctuation before placing it in a character class:
    # unescaped, the "\]" pair inside string.punctuation is parsed as an
    # escaped bracket and the backslash itself is never matched.
    punctuation_class = f"([{re.escape(string.punctuation)}])"
    return tf.strings.regex_replace(stripped_html, punctuation_class, r" \1")

In [104]:
# Vectorize the text data.
vectorize_layer = TextVectorization(
    standardize=custom_standardization,
    max_tokens=vocab_size - 1,
    output_mode="int",
    # One token more than the model input length, so that inputs and labels
    # can be taken from the same row with a one-token offset.
    output_sequence_length=maxlen + 1,
)
# Build the vocabulary from the text dataset.
vectorize_layer.adapt(text_ds)


# Vocabulary list; its elements are bytes objects.
vocab = vectorize_layer.get_vocabulary()  

In [108]:
# Vectorize one batch and map the first row's ids back to vocabulary entries.
for text in text_ds.take(1):
    # The vectorization layer is given one string per row.
    text = tf.expand_dims(text, -1)
    tokenized_sentences = vectorize_layer(text)
    print(tokenized_sentences)
    first_row = tokenized_sentences[0]
    first_row_words = [vocab[token_id] for token_id in first_row]
    print(first_row_words)

tf.Tensor(
[[  13   16    5 ...    0    0    0]
 [   1  592  165 ...  912    3  278]
 [  14   16   34 ...  291    8 1539]
 ...
 [   1   65    1 ...   54  134   30]
 [  13    9    5 ...  498 3874    8]
 [   2  347  466 ...    0    0    0]], shape=(32, 101), dtype=int64)
[b"'s", b'movie', b'of', b'up', b'cannot', b'same', b'by', b'phil', b'to', b'sometimes', b'historic', b'a', b',', b'things', b'in', b'bold', b'is', b'possibly', b'years', b'gives', b'of', b'mike', b'everyone', b'a', b',', b'-the', b'his', b'version', b'to', b'about', b'of', b'minutes', b'statements', b'a', b',', b'michael', b'lives', b'in', b'awful', b'glory', b'either', b'but', b'into', b'yourself', b'a', b',', b'see', b'in', b'surgeon', b'to', b'screen', b'them', b'phil', b'to', b'bass', b'when', b'ff', b'appears', b'a', b'i', b'in', b'conclusion', b'film', b'made', b'move', b'wouldn', b'to', b'.', b'is', b'of', b'forward', b'tale', b'knowledge', b'a', b'that', b'movie', b'up', b'summer', b'a', b'the', b'the', b'the', 

In [88]:
# Convert the training text into model inputs and labels.
def prepare_lm_inputs_labels(text):
    """Vectorize a batch of text and split it into inputs and labels.

    The labels are the inputs shifted by one position: ``x`` drops the last
    token of each vectorized row and ``y`` drops the first.

    Returns:
        A tuple ``(x, y)`` of token-id tensors.
    """
    token_ids = vectorize_layer(tf.expand_dims(text, -1))
    model_inputs = token_ids[:, :-1]
    next_token_labels = token_ids[:, 1:]
    return model_inputs, next_token_labels


# Turn each text batch into (inputs, labels) and prefetch upcoming batches.
text_ds = text_ds.map(prepare_lm_inputs_labels).prefetch(
    tf.data.experimental.AUTOTUNE)

# Print one (inputs, labels) pair as a sanity check.
for data in text_ds.take(1):
    print(data)

(<tf.Tensor: shape=(32, 100), dtype=int64, numpy=
array([[  11,    2, 1182, ...,    0,    0,    0],
       [  30,   15,    1, ..., 7800, 7764,   80],
       [  13,    9,    5, ...,  302,    2,  639],
       ...,
       [  12,  583,   73, ..., 1404,   11,    2],
       [  13,   52,  199, ...,   76,    5,  436],
       [  61, 1361,   22, ...,   14,    1,   52]])>, <tf.Tensor: shape=(32, 100), dtype=int64, numpy=
array([[   2, 1182,    7, ...,    0,    0,    0],
       [  15,    1,    4, ..., 7764,   80, 2533],
       [   9,    5, 2152, ...,    2,  639,   29],
       ...,
       [ 583,   73,  149, ...,   11,    2,   71],
       [  52,  199,    8, ...,    5,  436,    4],
       [1361,   22,    4, ...,    1,   52,   84]])>)


# 文本生成
以回调函数的形式实现

In [89]:
# 文本生成回调函数
class TextGenerator(keras.callbacks.Callback):
    """Callback that prints text sampled from the model being trained.

    At the end of every ``print_every``-th epoch the callback feeds the
    prompt to ``self.model``, repeatedly samples the next token from the
    ``top_k`` highest logits, and prints the prompt followed by the generated
    tokens. Inputs are padded with 0 up to the module-level ``maxlen``.

    Args:
        max_tokens: Number of tokens to generate after the prompt.
        start_tokens: List of token ids used as the prompt.
        index_to_word: Sequence mapping a token id to its vocabulary entry
            (``bytes`` or ``str``).
        top_k: Number of highest-scoring logits to sample from.
        print_every: Generate text once every this many epochs.
    """

    def __init__(self,
                 max_tokens,
                 start_tokens,
                 index_to_word,
                 top_k=10,
                 print_every=1):
        super().__init__()
        self.max_tokens = max_tokens
        self.start_tokens = start_tokens
        self.index_to_word = index_to_word
        self.print_every = print_every
        self.k = top_k

    def sample_from(self, logits):
        """Sample one token id from the ``k`` largest entries of ``logits``."""
        logits, indices = tf.math.top_k(logits, k=self.k, sorted=True)
        indices = np.asarray(indices).astype("int32")
        # Softmax over the retained logits only, giving the sampling weights.
        preds = keras.activations.softmax(tf.expand_dims(logits, 0))[0]
        preds = np.asarray(preds).astype("float32")
        return np.random.choice(indices, p=preds)

    def detokenize(self, number):
        """Return the vocabulary entry stored for token id ``number``."""
        return self.index_to_word[number]

    def on_epoch_end(self, epoch, logs=None):
        """Generate ``max_tokens`` tokens from the prompt and print the text."""
        if (epoch + 1) % self.print_every != 0:
            return
        # Work on a copy so the stored prompt is never mutated.
        context = list(self.start_tokens)
        tokens_generated = []
        # Strict comparison: generate exactly max_tokens tokens.
        while len(tokens_generated) < self.max_tokens:
            # Feed the most recent maxlen tokens so that a long context keeps
            # conditioning on what was just generated.
            window = context[-maxlen:]
            # The prediction for the next token sits at the last real position.
            sample_index = len(window) - 1
            padded = window + [0] * (maxlen - len(window))
            x = np.array([padded])
            y, _ = self.model.predict(x)
            sample_token = self.sample_from(y[0][sample_index])
            tokens_generated.append(sample_token)
            context.append(sample_token)

        words = [self.detokenize(token) for token in context]
        # Vocabulary entries may be bytes or str; decode only the former.
        txt = " ".join(
            word.decode() if isinstance(word, bytes) else word
            for word in words)
        print(f"generated text:\n{txt}\n")

In [90]:
# Reverse lookup from a vocabulary entry to its token id.
word_to_index = {word: index for index, word in enumerate(vocab)}

In [91]:
start_prompt = "this movie is"

# The vocabulary entries are bytes, so every prompt word is encoded before the
# lookup; words missing from the dictionary fall back to id 1.
start_tokens = [
    word_to_index.get(prompt_word.encode(), 1)
    for prompt_word in start_prompt.split()
]
num_tokens_generated = 40
text_gen_callback = TextGenerator(num_tokens_generated, start_tokens, vocab)

# 训练模型

In [92]:
# Train for 30 epochs; the callback prints sampled text at the end of each epoch.
model = create_model()
model.fit(text_ds, verbose=2, epochs=30, callbacks=[text_gen_callback])

Epoch 1/30
generated text:
this movie is , . , . are to , for movie into a i as because because of star a , for as , gets to , for me could this of going is after were his of into see to up more

1575/1575 - 89s - loss: 5.4447 - dense_81_loss: 5.4447
Epoch 2/30
generated text:
this movie is , you intended a , how beautiful is they , for movie of more you and to that something role me , . . . are a i in of into for out , you and to , . are idea

1575/1575 - 87s - loss: 4.7760 - dense_81_loss: 4.7760
Epoch 3/30
generated text:
this movie is be ever this , us to be . a be two his out of take take worst so ? get , . . on it , help a i in of both place john is or ever a 's in of

1575/1575 - 84s - loss: 4.5598 - dense_81_loss: 4.5598
Epoch 4/30
generated text:
this movie is , kind you and that movie up rating she i movie ok this of you a this enough and i movie of both tired a , see movie up than front a that watch , how ; none that effects i

1575/1575 - 73s - loss: 4.4304 - dense_81_loss:

<tensorflow.python.keras.callbacks.History at 0x7fe360711c10>