# Пример генерации текста рекуррентной сетью

In [2]:
import numpy as np
import tensorflow as tf

print(tf.__version__)

2.2.0


## Загружаем исходные данные

In [None]:
# Read the whole corpus as raw bytes; the file handle is only needed for
# the read itself, so decoding happens after it is closed.
with open('data/karenina/karenina.txt', 'rb') as src:
    binary = src.read()

# Decode as UTF-8 and lowercase so both letter cases share one symbol.
text = binary.decode('utf8')
text = text.lower()
print('Corpus length:', len(text))

## Векторизуем текстовый корпус

In [None]:
# Vocabulary: every distinct character of the corpus, sorted so that the
# character <-> index mapping is the same on every run.
chars = sorted(set(text))
print('Total chars:', len(chars))

# Two-way lookup tables between characters and their integer indices.
char_to_index = {c: i for i, c in enumerate(chars)}
index_to_char = dict(enumerate(chars))

SEQ_LENGTH = 40  # number of characters in one training window
STEP = 3  # offset between the starts of consecutive windows

# Slide a window over the corpus: each SEQ_LENGTH-character slice is an
# input sample and the character immediately after it is its target.
sentences = []
next_chars = []
for start in range(0, len(text) - SEQ_LENGTH, STEP):
    end = start + SEQ_LENGTH
    sentences.append(text[start:end])
    next_chars.append(text[end])

print('Train sequences:', len(sentences))

print('Vectorization...')
# Store indices in the narrowest unsigned integer type that can hold the
# largest character index: uint8 while the vocabulary has at most 256
# symbols, a wider type beyond that. A hard-coded uint8 cannot represent
# indices above 255.
index_dtype = np.min_scalar_type(max(len(chars) - 1, 0))
X = np.zeros((len(sentences), SEQ_LENGTH), dtype=index_dtype)
y = np.zeros(len(sentences), dtype=index_dtype)

# X[i, j] is the index of the j-th character of window i;
# y[i] is the index of the character that follows window i.
for i, sentence in enumerate(sentences):
    for j, char in enumerate(sentence):
        X[i, j] = char_to_index[char]
    y[i] = char_to_index[next_chars[i]]

# One-hot encode the targets: one column per vocabulary character.
y = tf.keras.utils.to_categorical(y, num_classes=len(chars))

## Задаем архитектуру сети

In [None]:
# Build the model: character embedding -> two stacked LSTM layers ->
# softmax over the vocabulary.
print('Build model...')
vocab_size = len(chars)

model = tf.keras.Sequential(name='next_char_generator')
# Map every character index to a 64-dimensional vector.
model.add(tf.keras.layers.Embedding(input_dim=vocab_size,
                                    output_dim=64,
                                    input_length=SEQ_LENGTH))
# The first LSTM returns its output at every timestep so that the second
# LSTM receives a sequence; the second returns only its last output.
model.add(tf.keras.layers.LSTM(256, return_sequences=True))
model.add(tf.keras.layers.LSTM(128, return_sequences=False))
# One probability per vocabulary character.
model.add(tf.keras.layers.Dense(vocab_size, activation='softmax'))

model.summary()

In [None]:
# Adam with a step size of 1e-2. `learning_rate` is the documented argument
# name; `lr` is only a deprecated alias for it.
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-2)
# The targets are one-hot vectors, hence categorical cross-entropy.
model.compile(loss='categorical_crossentropy', optimizer=optimizer)

## Возвращает индекс выбранного символа для вектора вероятностей

In [None]:
import sys

def sample(preds, temperature=1.0):
    """Sample an index from a probability vector after temperature scaling.

    The probabilities are re-weighted as ``p ** (1 / temperature)`` and
    renormalized, then a single index is drawn from the result. Values of
    ``temperature`` below 1 sharpen the distribution, values above 1
    flatten it.

    Args:
        preds: 1-D array-like of probabilities.
        temperature: Scaling factor. ``0`` is treated as the limit
            ``temperature -> 0+``, i.e. the most probable index is
            returned without any random draw.

    Returns:
        The sampled index, as returned by ``np.argmax``.
    """
    preds = np.asarray(preds).astype('float64')
    if temperature == 0:
        # Dividing by zero would produce inf/NaN weights; the limiting
        # distribution puts all mass on the largest probability.
        return np.argmax(preds)

    # The small constant keeps log() finite for zero probabilities.
    logits = np.log(preds + 1e-32) / temperature
    # Shift so that the largest exponent is exp(0) == 1. Without the shift
    # a very small temperature makes every exp() underflow to 0 and the
    # normalization below becomes 0 / 0 = NaN.
    logits -= np.max(logits)
    exp_preds = np.exp(logits)
    preds = exp_preds / np.sum(exp_preds)
    # One multinomial trial yields a one-hot row; argmax recovers its index.
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

def generate(model, input_text, diversity, steps=400):
    """Print text generated by `model`, one character at a time.

    The seed is printed first, then `steps` characters are sampled. Each
    new character is predicted from the most recent ``SEQ_LENGTH``
    characters of the text produced so far.

    Args:
        model: Model whose ``predict`` takes an array of shape
            ``(1, SEQ_LENGTH)`` holding character indices and returns one
            probability per vocabulary character.
        input_text: Seed text. Only its last ``SEQ_LENGTH`` characters are
            used as context; a shorter seed is accepted as well.
        diversity: Sampling temperature passed to ``sample``.
        steps: Number of characters to generate.

    Returns:
        None. The text is written to standard output.

    Raises:
        KeyError: If the seed contains a character that is missing from
            ``char_to_index``.
    """
    print()
    print('----- Diversity:', diversity)

    print('----- Generating with seed: "' + input_text + '"')
    sys.stdout.write(input_text)

    # Only SEQ_LENGTH characters fit into the model input, so a longer seed
    # is cut down to its tail instead of indexing past the input buffer.
    sentence = input_text[max(len(input_text) - SEQ_LENGTH, 0):]

    for _ in range(steps):
        x = np.zeros((1, SEQ_LENGTH))
        # Right-align the context so the newest character always occupies
        # the last timestep, as it does in the training windows. When the
        # context is shorter than SEQ_LENGTH the leading positions keep the
        # value 0 (there is no dedicated padding symbol).
        offset = SEQ_LENGTH - len(sentence)
        for j, char in enumerate(sentence):
            x[0, offset + j] = char_to_index[char]

        preds = model.predict(x, verbose=0)[0]
        next_index = sample(preds, diversity)
        next_char = index_to_char[next_index]

        # Append the new character and keep at most SEQ_LENGTH of them, so
        # a short seed grows into a full-length context.
        sentence = sentence + next_char
        if len(sentence) > SEQ_LENGTH:
            sentence = sentence[len(sentence) - SEQ_LENGTH:]

        # Flush after every character so the text appears as it is produced.
        sys.stdout.write(next_char)
        sys.stdout.flush()
    print()

## Запускаем обучение

In [None]:
# Train one epoch at a time (iterations 1..59). After every epoch, print
# text generated from a random corpus fragment at several diversity levels.
for iteration in range(1, 60):
    print('\n' + '-' * 50)
    print('Iteration', iteration)
    model.fit(X, y, batch_size=512, epochs=1)

    # A random SEQ_LENGTH-character slice of the corpus serves as the seed.
    start_index = np.random.randint(0, len(text) - SEQ_LENGTH)
    seed_text = text[start_index:start_index + SEQ_LENGTH]

    for diversity in (0.1, 0.3, 0.5):
        generate(model, seed_text, diversity)

## Предсказания на предобученной модели

In [None]:
# Load previously saved weights into the model defined above.
weights_path = 'data/karenina/text-lstm.hdf5'
model.load_weights(weights_path)

In [None]:
# Use a random SEQ_LENGTH-character slice of the corpus as the seed and
# generate a continuation for each diversity level.
start_index = np.random.randint(0, len(text) - SEQ_LENGTH)
seed_text = text[start_index:start_index + SEQ_LENGTH]

for diversity in (0.1, 0.3, 0.5):
    generate(model, seed_text, diversity)