# Генерация текста

Попробуйте изменить параметры нейронной сети, генерирующей текст, таким образом, чтобы добиться генерации как можно более осмысленного текста. Пришлите лучший текст из получившихся и опишите действия, предпринятые для его получения.

In [1]:
import numpy as np
import matplotlib.pyplot as plt

from keras.preprocessing.text import Tokenizer
from keras.utils.data_utils import pad_sequences
from keras.utils import to_categorical

from keras.layers import Dropout, Bidirectional
from keras import regularizers
from keras.optimizers import Adam
from keras.models import Sequential
from keras.layers import Dense, Embedding, Dropout
from keras.layers import LSTM
from keras.callbacks import EarlyStopping
# from keras.layers import Conv1D

In [2]:
!wget -nc https://github.com/aspushkarev/Intro-neural-network/blob/master/RNN/alice_in_wonderland.txt

File ‘alice_in_wonderland.txt’ already there; not retrieving.



In [3]:
# Read the whole book into memory; the context manager closes the file handle
# deterministically and the explicit encoding avoids locale-dependent decoding.
with open('alice_in_wonderland.txt', encoding='utf-8') as corpus_file:
    data = corpus_file.read()

# One corpus entry per line of the file, lower-cased.
corpus = data.lower().split("\n")
corpus[:10]

['',
 '',
 '',
 '',
 '',
 '',
 '',
 '<!doctype html>',
 '<html lang="en" data-color-mode="auto" data-light-theme="light" data-dark-theme="dark"  data-a11y-animated-images="system">',
 '  <head>']

Создадим объект для токенизации

In [4]:
# Vocabulary cap. The same value is reused later as the Embedding input
# dimension, the one-hot label width and the size of the softmax output.
total_words = 1000
# NOTE(review): per the Keras docs, num_words keeps only the (num_words - 1)
# most frequent words when encoding text — confirm this is the intended cap.
tokenizer = Tokenizer(num_words=total_words)
# Build the word -> index vocabulary from the corpus lines.
tokenizer.fit_on_texts(corpus)

Выполним токенизацию и кодирование

In [5]:
# Sanity check: show the first corpus line and its encoded form.
first_line = corpus[0]
print(first_line)
print(tokenizer.texts_to_sequences([first_line]))

# Look up the vocabulary indices of two sample words.
vocabulary = tokenizer.word_index
print(vocabulary['from'], vocabulary['carroll'])


[[]]
203 1523


Создадим последовательность токенов

In [6]:
# For every encoded corpus line emit all of its prefixes of length >= 2
# (tokens[:2], tokens[:3], ..., tokens); the last token of each prefix later
# serves as the prediction target.
input_sequences = [
    encoded[:end]
    for encoded in tokenizer.texts_to_sequences(corpus)
    for end in range(2, len(encoded) + 1)
]
input_sequences[:10]

[[926, 10],
 [926, 10, 106],
 [926, 10, 106, 281],
 [926, 10, 106, 281, 10],
 [926, 10, 106, 281, 10, 702],
 [926, 10, 106, 281, 10, 702, 703],
 [926, 10, 106, 281, 10, 702, 703, 702],
 [926, 10, 106, 281, 10, 702, 703, 702, 10],
 [926, 10, 106, 281, 10, 702, 703, 702, 10, 460],
 [926, 10, 106, 281, 10, 702, 703, 702, 10, 460, 703]]

In [7]:
# Left-pad every prefix with zeros up to the length of the longest one so the
# prefixes can be stacked into a single 2-D integer array.
max_sequence_len = max(map(len, input_sequences))
print(max_sequence_len)

input_sequences = np.array(
    pad_sequences(input_sequences, padding='pre', maxlen=max_sequence_len)
)
input_sequences[:10]

1955


array([[  0,   0,   0, ...,   0, 926,  10],
       [  0,   0,   0, ..., 926,  10, 106],
       [  0,   0,   0, ...,  10, 106, 281],
       ...,
       [  0,   0,   0, ..., 703, 702,  10],
       [  0,   0,   0, ..., 702,  10, 460],
       [  0,   0,   0, ...,  10, 460, 703]], dtype=int32)

In [8]:
# Inputs are all columns but the last; the last column is the word to predict.
predictors = input_sequences[:, :-1]

# One-hot encode the target word ids over the capped vocabulary.
label = to_categorical(input_sequences[:, -1], num_classes=total_words)
label.shape

(158178, 1000)

In [9]:
# First training sample: the padded input row and the word id encoded by its one-hot label.
(predictors[0], label[0].argmax())

(array([  0,   0,   0, ...,   0,   0, 926], dtype=int32), 10)

In [10]:
# Callback that monitors validation loss with a patience of 10 epochs.
early_stop = EarlyStopping(monitor='val_loss', patience=10)

# Next-word classifier: embedding -> two bidirectional LSTMs -> LSTM ->
# dropout -> dense ReLU -> softmax over the capped vocabulary.
model = Sequential([
    Embedding(input_dim=total_words,
              output_dim=512,
              input_length=max_sequence_len - 1),
    Bidirectional(LSTM(256, return_sequences=True)),
    Bidirectional(LSTM(512, return_sequences=True)),
    LSTM(1024),
    Dropout(0.2),
    Dense(total_words * 2, activation='relu'),
    Dense(total_words, activation='softmax'),
])

# Labels are one-hot vectors, hence categorical cross-entropy.
model.compile(optimizer=Adam(0.001),
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 1954, 512)         512000    
                                                                 
 bidirectional (Bidirectiona  (None, 1954, 512)        1574912   
 l)                                                              
                                                                 
 bidirectional_1 (Bidirectio  (None, 1954, 1024)       4198400   
 nal)                                                            
                                                                 
 lstm_2 (LSTM)               (None, 1024)              8392704   
                                                                 
 dropout (Dropout)           (None, 1024)              0         
                                                                 
 dense (Dense)               (None, 2000)              2

In [None]:
# Train the model and keep the returned History object for the plots below.
# validation_split=0.2 holds out a fifth of the samples for validation metrics.
# NOTE(review): with epochs=1 the EarlyStopping callback (patience=10) can never
# trigger; raise epochs if early stopping is meant to take effect.
history = model.fit(predictors, label,
                    epochs=1,
                    validation_split=0.2,
                    callbacks=[early_stop],
                    verbose=1)



Тест

In [1]:
# Per-epoch metrics recorded by model.fit.
curves = history.history
acc, acc_v = curves['accuracy'], curves['val_accuracy']
loss, loss_v = curves['loss'], curves['val_loss']

epochs = range(len(acc))

# One figure per metric: 'Train' is the training curve, 'Test' the
# validation (val_*) curve.
for position, (title, train_curve, validation_curve) in enumerate(
        (('accuracy', acc, acc_v), ('loss', loss, loss_v))):
    if position:
        # The first metric draws on the implicit current figure;
        # every later metric gets a figure of its own.
        plt.figure()
    plt.plot(epochs, train_curve, label='Train')
    plt.plot(epochs, validation_curve, label='Test')
    plt.title(title)
    plt.legend()

plt.show()

NameError: ignored

Протестируем генератор текста

In [None]:
# Greedy text generation: repeatedly encode the current text, ask the model for
# the next-word distribution and append the most probable word to the text.
seed_text = "Alice goes down a rabbit hole to find"
# Alternative seed: "She is my girl, I see in my hend"
next_words = 10

for _ in range(next_words):
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    print(token_list)
    # Pad (or truncate from the left) to the input length the model was built with.
    token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
    predicted_proba = model.predict(token_list, verbose=0)

    # Class 0 is the padding id and has no entry in tokenizer.index_word, so it
    # is excluded from the argmax; the +1 restores the original class index.
    predicted_index = int(np.argmax(predicted_proba[0, 1:])) + 1

    # Guard the lookup: the vocabulary may be smaller than the softmax width.
    output_word = tokenizer.index_word.get(predicted_index)
    if output_word is None:
        break
    seed_text += " " + output_word

print(seed_text)

In [None]:
# Display the model's output probabilities from the last generation step.
predicted_proba