# RNN诗人

在这个notebook里我们尝试用RNN生成一些古诗。

参考资料：
- http://lib.csdn.net/article/aiframework/60534?knId=1756

目录：
- 准备工作
- 建模
- 训练并观察结果

## 准备工作

In [1]:
import io
import collections
import sys

from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plt

Using TensorFlow backend.


准备数据。

In [2]:

poetry_file = 'poetry.txt'

# A poem whose text contains any of these markers is discarded.
# Note that u'[' is also the start-of-poem marker added below, so it must
# never occur inside the poem text itself.
skip_markers = ('_', u'(', u'（', u'《', u'[')

# Poem collection: every kept poem is stored as u'[' + text + u']'.
poems = []
with io.open(poetry_file, "r", encoding='utf-8') as f:
    for line in f:
        # A usable line splits into exactly two fields on u':'.  Lines with
        # no colon or with more than one make the unpacking raise ValueError
        # and are skipped; any other error is allowed to surface.
        try:
            _title, content = line.strip().split(u':')
        except ValueError:
            continue
        content = content.replace(u' ', u'')
        if any(marker in content for marker in skip_markers):
            continue
        # Keep only poems of 5..79 characters (before the markers are added).
        if len(content) < 5 or len(content) > 79:
            continue
        poems.append(u'[' + content + u']')

# Sort poems by length (shortest first).
poems = sorted(poems, key=len)
MAX_LEN = max(map(len, poems))
print('唐诗总数: ', len(poems))

# Same poems with a space between every character, so that a
# whitespace-splitting tokenizer sees one token per character.
spaced_poems = [' '.join(poem) for poem in poems]

# Training pairs: every proper, non-empty prefix of a poem together with the
# character that follows it.  Both lists are index-aligned.
partial_poems = [poem[:i] for poem in poems for i in range(1, len(poem))]
next_chars = [poem[i] for poem in poems for i in range(1, len(poem))]

# Count how often each character occurs
# all_words = []
# for poem in poems:
#     all_words += [word for word in poem]
# counter = collections.Counter(all_words)
# count_pairs = sorted(counter.items(), key=lambda x: -x[1])
# words, _ = zip(*count_pairs)



唐诗总数:  34647


In [3]:
# Tokenizer fitted on the space-separated poems; it is configured with an
# empty filter string and with lower-casing turned off.
tokenizer = Tokenizer(filters='', lower=False)
tokenizer.fit_on_texts(spaced_poems)


def to_seq(x):
    """Convert texts to integer sequences pre-padded to ``MAX_LEN``."""
    token_ids = tokenizer.texts_to_sequences(x)
    return pad_sequences(token_ids, maxlen=MAX_LEN, padding='pre')

In [4]:
def batch_generator(texts, next_chars, batch_size, tokenizer, max_len):
    """Yield an endless stream of randomly drawn training batches.

    Args:
        texts: indexable collection of input strings (e.g. poem prefixes).
        next_chars: collection aligned with ``texts``; ``next_chars[i]`` is
            the target that follows ``texts[i]``.
        batch_size: number of samples per batch.
        tokenizer: fitted tokenizer providing ``texts_to_sequences`` and
            ``texts_to_matrix``.
        max_len: length every input sequence is pre-padded to.

    Yields:
        ``(x_batch, y_batch)`` where ``x_batch`` is the padded id matrix of
        the inputs and ``y_batch`` is ``tokenizer.texts_to_matrix`` of the
        targets (presumably a one-hot row per single-character target;
        confirm against the tokenizer documentation).
    """
    n = len(texts)
    while True:
        # Indices are sampled with replacement, so a batch may repeat samples.
        inds = np.random.randint(0, n, size=batch_size)
        # Characters are joined with spaces so the tokenizer splits the text
        # into one token per character.
        spaced_texts = [' '.join(texts[ind]) for ind in inds]
        targets = [next_chars[ind] for ind in inds]
        x_batch = pad_sequences(
            tokenizer.texts_to_sequences(spaced_texts),
            maxlen=max_len,
            padding='pre',
        )
        y_batch = tokenizer.texts_to_matrix(targets)
        yield (x_batch, y_batch)
        

In [5]:
def batch_cycle_generator(texts, next_chars, batch_size, tokenizer, max_len):
    """Yield an endless stream of batches that walk the data in order.

    Samples are taken sequentially starting at index 0; after the last
    sample the position wraps around to the beginning, so a batch may span
    the end and the start of the data.

    Args:
        texts: indexable collection of input strings (e.g. poem prefixes).
        next_chars: collection aligned with ``texts``; ``next_chars[i]`` is
            the target that follows ``texts[i]``.
        batch_size: number of samples per batch.
        tokenizer: fitted tokenizer providing ``texts_to_sequences`` and
            ``texts_to_matrix``.
        max_len: length every input sequence is pre-padded to.

    Yields:
        ``(x_batch, y_batch)`` where ``x_batch`` is the padded id matrix of
        the inputs and ``y_batch`` is ``tokenizer.texts_to_matrix`` of the
        targets (presumably a one-hot row per single-character target;
        confirm against the tokenizer documentation).
    """
    n = len(texts)
    ind = 0  # position of the next sample; persists across batches
    while True:
        spaced_texts = []
        targets = []
        for _ in range(batch_size):
            # Characters are joined with spaces so the tokenizer splits the
            # text into one token per character.
            spaced_texts.append(' '.join(texts[ind]))
            targets.append(next_chars[ind])
            ind = (ind + 1) % n
        x_batch = pad_sequences(
            tokenizer.texts_to_sequences(spaced_texts),
            maxlen=max_len,
            padding='pre',
        )
        y_batch = tokenizer.texts_to_matrix(targets)
        yield (x_batch, y_batch)
        

In [6]:
# x = to_seq(spaced_poems)
# y = np.zeros(x.shape, dtype=np.int32)
# y[:, :-1] = x[:, 1:]

# x = to_seq(map(lambda poem: ' '.join(poem), partial_poems))
# y = tokenizer.texts_to_matrix(next_chars)


In [7]:
# Hyperparameters
UNITS = 128  # size passed to each GRU layer
N_LAYERS = 2  # presumably the number of stacked recurrent layers; verify against the model cell
DIM_EMBED = 50  # output dimension of the Embedding layer
VOCAB = len(tokenizer.word_counts) + 1  # distinct tokens plus one extra id (presumably the padding id 0; confirm)
BATCH_SIZE = 64  # samples per batch requested from the batch generator

## 建模

In [8]:
import keras
from keras.layers import Dense, Activation, LSTM, GRU, SimpleRNN, Input, Embedding, Dropout
from keras.models import Model

In [9]:
# Network: token ids -> embedding -> dropout -> N_LAYERS stacked GRUs ->
# softmax over the vocabulary (one next-character prediction per input).
input_shape = (MAX_LEN, )
input_layer = Input(shape=input_shape)
z = input_layer

z = Embedding(VOCAB, DIM_EMBED, input_length=MAX_LEN, trainable=True)(z)
z = Dropout(0.4)(z)

# Stack N_LAYERS GRU layers (N_LAYERS must be >= 1).  Every layer except the
# last returns the full sequence so the next GRU receives one vector per
# time step; the last one returns only its final state for the classifier.
# With N_LAYERS = 2 this is the same two-layer stack as before.
for layer_idx in range(N_LAYERS):
    is_last = layer_idx == N_LAYERS - 1
    z = GRU(UNITS, return_sequences=not is_last)(z)

z = Dense(VOCAB)(z)
z = Activation('softmax')(z)

model = Model(input_layer, z)
model.compile(
    loss=keras.losses.categorical_crossentropy,
    optimizer=keras.optimizers.RMSprop(),
)

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 81)                0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 81, 50)            305500    
_________________________________________________________________
dropout_1 (Dropout)          (None, 81, 50)            0         
_________________________________________________________________
gru_1 (GRU)                  (None, 81, 128)           68736     
_________________________________________________________________
gru_2 (GRU)                  (None, 128)               98688     
_________________________________________________________________
dense_1 (Dense)              (None, 6110)              788190    
_________________________________________________________________
activation_1 (Activation)    (None, 6110)              0         
Total para

## 训练并观察结果

In [10]:
# Training batch stream over the (poem prefix, next character) pairs,
# BATCH_SIZE samples per batch, inputs padded to MAX_LEN.
gen = batch_generator(partial_poems, next_chars, BATCH_SIZE, tokenizer, MAX_LEN)

In [11]:
def sample(preds, temperature=1.0):
    """Sample an index from a probability array re-scaled by ``temperature``.

    The probabilities are raised to the power ``1 / temperature`` and
    re-normalised, so a temperature below 1 sharpens the distribution and a
    temperature above 1 flattens it.

    Args:
        preds: 1-D array-like of probabilities.
        temperature: positive scaling factor; 1.0 leaves the distribution
            unchanged.

    Returns:
        The sampled index into ``preds``.

    Raises:
        ValueError: if ``temperature`` is not positive.
    """
    if temperature <= 0:
        raise ValueError('temperature must be positive, got %r' % (temperature,))

    preds = np.asarray(preds).astype('float64')
    # log(0) is -inf, which is the right logit for an impossible outcome;
    # only the divide-by-zero warning is suppressed.
    with np.errstate(divide='ignore'):
        logits = np.log(preds) / temperature
    # Shift so the largest logit is 0.  This leaves the normalised result
    # unchanged but prevents every exp() from underflowing to 0 (and the
    # probabilities from becoming NaN) when the temperature is very small.
    logits = logits - np.max(logits)
    exp_preds = np.exp(logits)
    probs = exp_preds / np.sum(exp_preds)
    return np.random.choice(len(probs), p=probs)

In [None]:
# Reverse vocabulary lookup: token id -> character.
indices_char = {v: k for k, v in tokenizer.word_index.items()}

# Training loss recorded after every iteration (one entry per fit call).
losses = []
# range(61)
for iteration in range(26):
    print()
    print('-' * 50)
    print('Iteration', iteration)
    # One "iteration" is a single epoch of 1000 generator batches.
    train_info = model.fit_generator(gen, 1000, epochs=1)
    losses.append(train_info.history['loss'][0])

    # Only generate sample poems on every fifth iteration.
    if iteration % 5 != 0:
        continue

    print('Generating text')
    for diversity in [0.2, 0.5, 1.0, 1.2]:
    #     for diversity in [0.2, 0.5, 1.0]:
        print()
        print('----- diversity:', diversity)

        # Every training poem starts with '[' and ends with ']'.
        generated = '['
        print('----- Generating with seed: "' + generated + '"')
        sys.stdout.write(generated)

        # At most MAX_LEN sampling steps per poem.
        for _ in range(MAX_LEN):
            x = pad_sequences(tokenizer.texts_to_sequences([' '.join(generated)]), maxlen=MAX_LEN, padding='pre')

            preds = model.predict(x, verbose=0)[0]
            next_index = sample(preds, diversity)
            # The softmax has len(word_counts) + 1 outputs, so it can emit an
            # id with no entry in word_index (presumably the padding id 0).
            # Skip such a draw instead of crashing with KeyError; the next
            # step samples again from the same prefix.
            next_char = indices_char.get(next_index)
            if next_char is None:
                continue

            generated += next_char

            sys.stdout.write(next_char)
            sys.stdout.flush()
            if next_char == ']':
                break
        print()


--------------------------------------------------
Iteration 0
Epoch 1/1

--------------------------------------------------
Iteration 4
Epoch 1/1

--------------------------------------------------
Iteration 5
Epoch 1/1
Generating text

----- diversity: 0.2
----- Generating with seed: "["
[山去春日去，江人见云中。]

----- diversity: 0.5
----- Generating with seed: "["
[五忆诗园去，天山见自闲。一秋有月日，河下一主人。]

----- diversity: 1.0
----- Generating with seed: "["
[御半日子宗饶南，介摘我处坐丝共。喜襟万云曾花楚，斯色身此住陇言。]

----- diversity: 1.2
----- Generating with seed: "["
[绿玉驭谷蛁，渚阳陶诲登。井聊千故民，相宗画凫珮。积看一津尘，春是卧缊寺。玉寻俱亦赤，天小畔嶂鹤。]

--------------------------------------------------
Iteration 6
Epoch 1/1

--------------------------------------------------
Iteration 10
Epoch 1/1
Generating text

----- diversity: 0.2
----- Generating with seed: "["
[白风春上，云风，]

----- diversity: 0.5
----- Generating with seed: "["
[忆古得家，三去来来。]

----- diversity: 1.0
----- Generating with seed: "["
[筐三明多缨，主对暮毛潺。唯壁文泉西，发方岂子寄。帆食半也底，祖后雄清春。舟还一花亦，空见不复远。]

----- diversity

In [None]:
# Plot the recorded training loss, one point per entry of `losses`.
plt.plot(losses)
plt.xlabel('epoch')
plt.ylabel('loss')
plt.grid()

In [101]:
# Save the trained model to an HDF5 file in the working directory.
model.save('poet_2gru_26epochs.h5')