In [1]:
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import sklearn
import pandas as pd
import os
import sys
import time
import tensorflow as tf

from tensorflow import keras

# Record the interpreter and library versions this notebook was run with.
print(tf.__version__)
print(sys.version_info)
for lib in (mpl, np, pd, sklearn, tf, keras):
    print(lib.__name__, lib.__version__)

2.0.0
sys.version_info(major=3, minor=7, micro=4, releaselevel='final', serial=0)
matplotlib 3.1.1
numpy 1.17.2
pandas 0.25.1
sklearn 0.22.1
tensorflow 2.0.0
tensorflow_core.keras 2.2.4-tf


In [2]:
# Source corpus:
# https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt
input_filepath = "./shakespeare.txt"

# Read the corpus through a context manager so the file handle is closed as
# soon as the text is loaded, and pin the encoding so the result does not
# depend on the platform's default locale.
with open(input_filepath, 'r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

# Quick look: total number of characters and the first 100 of them.
print(len(text))
print(text[0:100])

1115394
First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


In [3]:
# Preprocessing plan:
#   1. build the vocabulary
#   2. build the char -> id mapping
#   3. encode the corpus as ids
#   4. build shifted targets (abcd -> bcd<eos>)
vocab = sorted(set(text))  # sorted list of the distinct characters in the corpus
print(vocab)
print(len(vocab))

# Map each character to its position in the sorted vocabulary (char -> id).
char2idx = dict(zip(vocab, range(len(vocab))))
type(char2idx)

['\n', ' ', '!', '$', '&', "'", ',', '-', '.', '3', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
65


dict

In [4]:
# Reverse lookup used for decoding: idx2char[i] is the character whose id is i.
idx2char = np.array(vocab)
idx2char
# list -> array (vocab is a sorted list, so positions match the ids in char2idx)

array(['\n', ' ', '!', '$', '&', "'", ',', '-', '.', '3', ':', ';', '?',
       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z'],
      dtype='<U1')

In [5]:
# Encode every character of the corpus as its integer id.
text_as_int = np.array(list(map(char2idx.__getitem__, text)))
text_as_int.shape

(1115394,)

In [6]:
# Sanity check: the first ten ids next to the characters they encode.
print(text_as_int[:10])
print(text[:10])

[18 47 56 57 58  1 15 47 58 47]
First Citi


In [7]:
# Build a training pair from one sequence: the input drops the last element,
# the target drops the first one.
def split_input_target(id_text):
    """Split a sequence into (input, target) shifted by one position.

    Example: abcde -> (abcd, bcde)

    Works on anything that supports slicing; the two returned pieces have the
    same type as `id_text` and one element fewer.
    """
    inputs = id_text[:-1]
    targets = id_text[1:]
    return inputs, targets

In [8]:
seq_length = 100

# Stream of single character ids.
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

# Cut the stream into windows of seq_length + 1 ids (the extra id lets each
# window be split into an input and a one-step-shifted target of length
# seq_length). A trailing window that cannot be filled is dropped.
seq_dataset = char_dataset.batch(seq_length + 1, drop_remainder=True)

In [9]:
# Display the id -> character lookup table again for reference.
idx2char

array(['\n', ' ', '!', '$', '&', "'", ',', '-', '.', '3', ':', ';', '?',
       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z'],
      dtype='<U1')

In [10]:
# Peek at the first two elements of the character-level dataset and decode them.
for char_tensor in char_dataset.take(2):
    char_id = char_tensor.numpy()
    print(char_tensor, idx2char[char_id])

tf.Tensor(18, shape=(), dtype=int64) F
tf.Tensor(47, shape=(), dtype=int64) i


In [11]:
# Peek at the first two windows: raw ids first, then the decoded text.
for seq_tensor in seq_dataset.take(2):
    decoded = ''.join(idx2char[seq_tensor.numpy()])  # ids -> string
    print(seq_tensor)
    print(repr(decoded))
    print('------------')

tf.Tensor(
[18 47 56 57 58  1 15 47 58 47 64 43 52 10  0 14 43 44 53 56 43  1 61 43
  1 54 56 53 41 43 43 42  1 39 52 63  1 44 59 56 58 46 43 56  6  1 46 43
 39 56  1 51 43  1 57 54 43 39 49  8  0  0 13 50 50 10  0 31 54 43 39 49
  6  1 57 54 43 39 49  8  0  0 18 47 56 57 58  1 15 47 58 47 64 43 52 10
  0 37 53 59  1], shape=(101,), dtype=int64)
'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou '
------------
tf.Tensor(
[39 56 43  1 39 50 50  1 56 43 57 53 50 60 43 42  1 56 39 58 46 43 56  1
 58 53  1 42 47 43  1 58 46 39 52  1 58 53  1 44 39 51 47 57 46 12  0  0
 13 50 50 10  0 30 43 57 53 50 60 43 42  8  1 56 43 57 53 50 60 43 42  8
  0  0 18 47 56 57 58  1 15 47 58 47 64 43 52 10  0 18 47 56 57 58  6  1
 63 53 59  1 49], shape=(101,), dtype=int64)
'are all resolved rather to die than to famish?\n\nAll:\nResolved. resolved.\n\nFirst Citizen:\nFirst, you k'
------------


In [12]:
# Turn every window of ids into an (input, target) pair.
# NOTE(review): this rebinds seq_dataset to the mapped dataset, so the cell is
# not idempotent -- re-run the cell that creates seq_dataset before running
# this one a second time.
seq_dataset = seq_dataset.map(split_input_target)

# Show the first two pairs, as ids and as decoded text.
for input_ids, target_ids in seq_dataset.take(2):
    input_arr = input_ids.numpy()
    target_arr = target_ids.numpy()
    print(input_arr)
    print(target_arr)
    print('-' * 29)
    print(''.join(idx2char[input_arr]))
    print(''.join(idx2char[target_arr]))

[18 47 56 57 58  1 15 47 58 47 64 43 52 10  0 14 43 44 53 56 43  1 61 43
  1 54 56 53 41 43 43 42  1 39 52 63  1 44 59 56 58 46 43 56  6  1 46 43
 39 56  1 51 43  1 57 54 43 39 49  8  0  0 13 50 50 10  0 31 54 43 39 49
  6  1 57 54 43 39 49  8  0  0 18 47 56 57 58  1 15 47 58 47 64 43 52 10
  0 37 53 59]
[47 56 57 58  1 15 47 58 47 64 43 52 10  0 14 43 44 53 56 43  1 61 43  1
 54 56 53 41 43 43 42  1 39 52 63  1 44 59 56 58 46 43 56  6  1 46 43 39
 56  1 51 43  1 57 54 43 39 49  8  0  0 13 50 50 10  0 31 54 43 39 49  6
  1 57 54 43 39 49  8  0  0 18 47 56 57 58  1 15 47 58 47 64 43 52 10  0
 37 53 59  1]
-----------------------------
First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You
irst Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You 
[39 56 43  1 39 50 50  1 56 43 57 53 50 60 43 42  1 56 39 58 46 43 56  1
 58 53  1 42 47 43  1 58 46 39 52  1 58 53  1 44 39 51 47 57 46 12  0  0
 13 50 50 

In [13]:
batch_size = 64
buffer_size = 10000

# Shuffle the (input, target) pairs using a buffer of buffer_size elements,
# then group them into batches of exactly batch_size; a final partial batch is
# dropped.
# NOTE(review): seq_dataset is rebound here, so re-running this cell batches
# the already-batched dataset again.
seq_dataset = (
    seq_dataset
    .shuffle(buffer_size)
    .batch(batch_size, drop_remainder=True)
)

stateful: Boolean (default False). If True, the last state
    for each sample at index i in a batch will be used as initial
    state for the sample of index i in the following batch.
    
> stateful: 布尔值（默认为 False）。如果为 True，则一个 batch 中索引 i 处样本的最后状态，将用作下一个 batch 中索引 i 处样本的初始状态。


recurrent_initializer: Initializer for the `recurrent_kernel`
    weights matrix, used for the linear transformation of the
    recurrent state.

> recurrent_initializer: `recurrent_kernel` 权值矩阵的初始化器，该矩阵用于对循环状态做线性变换。

return_sequences: Boolean. Whether to return the last output
    in the output sequence, or the full sequence.
> return_sequences: 布尔值。为 False 时只返回输出序列中的最后一个输出，为 True 时返回完整的输出序列。

In [14]:
# Model hyperparameters passed to build_model below.
vocab_size = len(vocab)  # number of distinct characters in the corpus
embedding_dim = 256  # size of each character embedding vector
rnn_units = 1024  # number of units in the recurrent layer

def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    """Build the character-level language model: Embedding -> LSTM -> Dense.

    Args:
        vocab_size: number of distinct character ids; used as the embedding
            input size and as the width of the final Dense layer.
        embedding_dim: size of each character embedding vector.
        rnn_units: number of units in the LSTM layer.
        batch_size: fixed batch size; the input shape is declared as
            [batch_size, None], i.e. the sequence length is left open.

    Returns:
        An uncompiled keras Sequential model. The Dense layer has no
        activation, so the model outputs raw logits for every time step
        (the LSTM uses return_sequences = True).
    """
    model = keras.models.Sequential()

    # Character ids -> dense vectors.
    model.add(keras.layers.Embedding(
        vocab_size, embedding_dim,
        batch_input_shape = [batch_size, None]))

    # stateful = True carries the recurrent state from one batch to the next,
    # which is why the batch size has to be fixed above.
    model.add(keras.layers.LSTM(
        units = rnn_units,
        stateful = True,
        recurrent_initializer = 'glorot_uniform',
        return_sequences = True))

    # One logit per vocabulary entry at every time step.
    model.add(keras.layers.Dense(vocab_size))
    return model

# Training model: uses the training batch size defined earlier.
model = build_model(vocab_size, embedding_dim, rnn_units, batch_size)

# NOTE(review): the saved output of this cell lists a SimpleRNN layer, while
# build_model creates an LSTM -- the stored summary is stale; re-run to refresh.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (64, None, 256)           16640     
_________________________________________________________________
simple_rnn (SimpleRNN)       (64, None, 1024)          1311744   
_________________________________________________________________
dense (Dense)                (64, None, 65)            66625     
Total params: 1,395,009
Trainable params: 1,395,009
Non-trainable params: 0
_________________________________________________________________


In [15]:
# Inspect the element spec of one batch: a pair of (batch_size, seq_length) id tensors.
seq_dataset.take(1)

<TakeDataset shapes: ((64, 100), (64, 100)), types: (tf.int64, tf.int64)>

In [17]:
# Run one batch through the untrained model to check shapes end to end.
# The three names bound here are reused by the cells that follow.
for input_example_batch, target_example_batch in seq_dataset.take(1):
    example_batch_predictions = model(input_example_batch)
    print(
        example_batch_predictions.shape,
        input_example_batch,
        target_example_batch,
        sep = '\n',
    )

(64, 100, 65)
tf.Tensor(
[[63  1 39 ... 54 56 53]
 [41 49  1 ... 43 39 56]
 [ 1 51 53 ... 45 56 47]
 ...
 [53 52  6 ... 59 56  1]
 [41 53 51 ...  0 26 53]
 [42 53 43 ...  1 41 53]], shape=(64, 100), dtype=int64)
tf.Tensor(
[[ 1 39 57 ... 56 53 58]
 [49  1 51 ... 39 56 58]
 [51 53 56 ... 56 47 52]
 ...
 [52  6  0 ... 56  1 58]
 [53 51 43 ... 26 53  6]
 [53 43 57 ... 41 53 52]], shape=(64, 100), dtype=int64)


In [18]:
tf.random.categorical
# Draws samples from a categorical distribution.

<function tensorflow.python.ops.random_ops.categorical(logits, num_samples, dtype=None, seed=None, name=None)>

In [21]:
# Shape of the logits from the forward-pass check: (batch, time steps, vocabulary).
example_batch_predictions.shape

TensorShape([64, 100, 65])

logits: 2-D Tensor with shape `[batch_size, num_classes]`.  Each slice `[i, :]` represents the unnormalized log-probabilities for all classes.

num_samples: 0-D.  Number of independent samples to draw for each row slice.

> logits: 形状为 `[batch_size, num_classes]` 的二维张量。每个切片 `[i, :]` 表示所有类别的未归一化对数概率（log-probabilities）。

> num_samples: 0 维（标量）。对每个行切片独立抽取的样本数目。

In [19]:
# Two ways to pick the next character from the logits: greedy or random
# sampling. Random sampling is used here.

# Draw one id per time step of the first example in the batch:
# logits (100, 65) -> samples (100, 1)
sample_indices = tf.random.categorical(example_batch_predictions[0], 1)
print(sample_indices)

# Drop the trailing sample axis: (100, 1) -> (100,)
sample_indices = tf.squeeze(sample_indices, axis=-1)
print(sample_indices)

tf.Tensor(
[[16]
 [56]
 [26]
 [49]
 [20]
 [ 6]
 [43]
 [48]
 [50]
 [24]
 [ 9]
 [37]
 [55]
 [29]
 [10]
 [40]
 [25]
 [38]
 [40]
 [36]
 [12]
 [45]
 [62]
 [12]
 [55]
 [21]
 [ 4]
 [35]
 [20]
 [23]
 [ 0]
 [ 1]
 [ 4]
 [ 0]
 [39]
 [33]
 [ 5]
 [ 5]
 [17]
 [29]
 [50]
 [19]
 [34]
 [31]
 [14]
 [ 5]
 [51]
 [55]
 [36]
 [40]
 [36]
 [41]
 [43]
 [19]
 [ 7]
 [25]
 [53]
 [ 8]
 [40]
 [25]
 [50]
 [48]
 [29]
 [29]
 [51]
 [34]
 [24]
 [61]
 [52]
 [12]
 [61]
 [60]
 [46]
 [ 2]
 [35]
 [61]
 [21]
 [21]
 [ 4]
 [53]
 [35]
 [ 9]
 [ 7]
 [ 1]
 [ 5]
 [ 1]
 [ 6]
 [25]
 [40]
 [ 7]
 [47]
 [ 1]
 [39]
 [16]
 [40]
 [34]
 [44]
 [24]
 [21]
 [55]], shape=(100, 1), dtype=int64)
tf.Tensor(
[16 56 26 49 20  6 43 48 50 24  9 37 55 29 10 40 25 38 40 36 12 45 62 12
 55 21  4 35 20 23  0  1  4  0 39 33  5  5 17 29 50 19 34 31 14  5 51 55
 36 40 36 41 43 19  7 25 53  8 40 25 50 48 29 29 51 34 24 61 52 12 61 60
 46  2 35 61 21 21  4 53 35  9  7  1  5  1  6 25 40  7 47  1 39 16 40 34
 44 24 21 55], shape=(100,), dtype=int64)


In [22]:
# Decode the first example of the batch: model input, expected target, and the
# characters sampled from the untrained model. A blank line separates the three.
print("Input: ", repr("".join(idx2char[input_example_batch[0]])), end="\n\n")
print("Output: ", repr("".join(idx2char[target_example_batch[0]])), end="\n\n")
print("Predictions: ", repr("".join(idx2char[sample_indices])))

Input:  "y as much before:\nThis proveth Edward's love and Warwick's honesty.\n\nWARWICK:\nKing Lewis, I here pro"

Output:  " as much before:\nThis proveth Edward's love and Warwick's honesty.\n\nWARWICK:\nKing Lewis, I here prot"

Predictions:  "DrNkH,ejlL3YqQ:bMZbX?gx?qI&WHK\n &\naU''EQlGVSB'mqXbXceG-Mo.bMljQQmVLwn?wvh!WwII&oW3- ' ,Mb-i aDbVfLIq"


In [23]:
def loss(labels, logits):
    """Sparse categorical cross-entropy between integer labels and raw logits.

    `from_logits=True` is required because the model's final Dense layer has
    no softmax. No reduction is applied here: the result holds one loss value
    per label position.
    """
    per_position_loss = keras.losses.sparse_categorical_crossentropy(
        labels, logits, from_logits=True)
    return per_position_loss

# Loss of the untrained model on the example batch. With 65 characters a
# uniform guess gives ln(65) ~= 4.17, so a mean close to that is expected.
example_loss = loss(target_example_batch, example_batch_predictions)

model.compile(optimizer = 'adam', loss = loss)

print(example_loss.shape)
print(example_loss.numpy().mean())

(64, 100)
4.1882186


```python
# Training takes a long time, so write checkpoints along the way.
output_dir = "./text_generation_checkpoints"

# makedirs(exist_ok = True) replaces the check-then-mkdir pair: it cannot fail
# if the directory appears between the check and the creation, and it also
# creates missing parent directories.
os.makedirs(output_dir, exist_ok = True)

# '{epoch}' is filled in by the callback, giving one checkpoint name per epoch.
checkpoint_prefix = os.path.join(output_dir, 'ckpt_{epoch}')
checkpoint_callback = keras.callbacks.ModelCheckpoint(
    filepath = checkpoint_prefix,
    save_weights_only = True)  # store weights only, not the full model

epochs = 100
history = model.fit(seq_dataset, epochs = epochs,
                    callbacks = [checkpoint_callback])
```

```python
# Rebuild the same architecture with batch size 1 for generation: the training
# model is tied to the training batch size through its fixed input shape.
model2 = build_model(vocab_size,
                     embedding_dim,
                     rnn_units,
                     batch_size = 1)

# latest_checkpoint returns None when nothing has been saved yet; fail with a
# clear message instead of passing None on to load_weights.
latest_checkpoint = tf.train.latest_checkpoint(output_dir)
if latest_checkpoint is None:
    raise FileNotFoundError(
        "No checkpoint found in {!r}; run the training step first.".format(output_dir))
model2.load_weights(latest_checkpoint)
model2.build(tf.TensorShape([1, None]))

# Generation scheme (autoregressive):
#   start with a character sequence A
#   A -> model -> b
#   A.append(b) -> B
#   B (Ab) -> model -> c
#   B.append(c) -> C
#   C (Abc) -> model -> ...
model2.summary()
```

In [25]:
def generate_text(model, start_string, num_generate = 1000, temperature = 1.0):
    """Generate text by sampling one character at a time from `model`.

    The whole `start_string` is fed to the model once; after that only the
    most recently sampled character is fed back in. This relies on the model
    carrying its recurrent state from one call to the next (build_model
    creates the LSTM with stateful = True).

    Args:
        model: callable mapping a [1, length] tensor of character ids to
            logits of shape [1, length, vocab_size]; must provide
            reset_states().
        start_string: non-empty prompt; every character must be a key of
            char2idx.
        num_generate: number of characters to sample.
        temperature: positive divisor applied to the logits before sampling.
            The default 1.0 samples from the unmodified distribution; values
            below 1 make the output more conservative, values above 1 more
            random.

    Returns:
        `start_string` followed by the `num_generate` sampled characters.

    Raises:
        ValueError: if `start_string` is empty or `temperature` is not
            positive.
        KeyError: if `start_string` contains a character missing from
            char2idx.
    """
    if not start_string:
        raise ValueError("start_string must contain at least one character")
    if temperature <= 0:
        raise ValueError("temperature must be positive")

    # Encode the prompt as a batch holding a single sequence: [1, len(start_string)]
    input_eval = tf.expand_dims([char2idx[ch] for ch in start_string], 0)

    text_generated = []
    # Start from a clean recurrent state so earlier calls do not leak in.
    model.reset_states()

    for _ in range(num_generate):
        # predictions: [batch_size, input_eval_len, vocab_size]
        predictions = model(input_eval)
        # Only the last time step predicts the next character, so sample from
        # that row alone: [1, vocab_size]
        last_logits = tf.squeeze(predictions, 0)[-1:, :] / temperature
        predicted_id = tf.random.categorical(
            last_logits, num_samples = 1)[0, 0].numpy()
        text_generated.append(idx2char[predicted_id])
        # s, x -> rnn -> s', y: the state lives inside the model, so the next
        # input is just the character that was sampled.
        input_eval = tf.expand_dims([predicted_id], 0)
    return start_string + ''.join(text_generated)

# NOTE(review): model2 is only created by the checkpoint-restore snippet, which
# currently sits in a markdown cell and never runs -- hence the NameError
# recorded as this cell's output. Run training and the restore snippet as code
# cells before this one.
new_text = generate_text(model2, start_string = "All: ")
print(new_text)

NameError: name 'model2' is not defined