In [11]:
import numpy
import sys
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM
from keras.utils import np_utils
from keras.callbacks import ModelCheckpoint

In [13]:
# Read the whole corpus into memory. The context manager closes the file
# handle as soon as the text has been read, instead of leaving it open.
# newline='' keeps the line endings exactly as they are stored on disk.
with open("pushkin_pss.txt", encoding='utf-8', newline='') as corpus_file:
    file = corpus_file.read()

In [15]:
def tokenize_words(input):
    """Lowercase the text, split it into word tokens and drop Russian stop words.

    Parameters
    ----------
    input : str
        Raw text to normalise.

    Returns
    -------
    str
        The remaining tokens, in their original order, joined by single spaces.
    """
    # lowercase everything to standardize it
    input = input.lower()

    # r'\w+' keeps runs of word characters as tokens; everything between
    # them (punctuation, whitespace) is dropped
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(input)

    # Fetch the stop-word list once and keep it in a set. The previous version
    # called stopwords.words('russian') again for every single token and then
    # searched the returned list linearly.
    russian_stopwords = set(stopwords.words('russian'))

    # keep only the tokens that are not stop words
    filtered = (token for token in tokens if token not in russian_stopwords)
    return " ".join(filtered)

In [17]:
# Run the raw corpus text through tokenize_words; the result is one long string
processed_inputs = tokenize_words(file)

In [19]:
# Distinct characters of the cleaned corpus in sorted order, plus a lookup
# from each character to its position in that ordering
chars = sorted(set(processed_inputs))
char_to_num = {symbol: index for index, symbol in enumerate(chars)}

In [21]:
# Total number of characters in the cleaned corpus
input_len = len(processed_inputs)
# Number of distinct characters that occur in it
vocab_len = len(chars)

In [23]:
# Number of characters in each input window
seq_length = 100
# Containers for the encoded input windows (x_data) and, for each window,
# the encoded character that follows it (y_data)
x_data = []
y_data = []

In [25]:
# Encode the corpus once, instead of looking every character up again for
# each of the overlapping windows that contain it.
encoded_inputs = [char_to_num[char] for char in processed_inputs]

# Each training pattern is a window of seq_length encoded characters (input)
# paired with the encoded character that immediately follows it (target).
# Both lists are rebuilt from scratch here, so re-running this cell does not
# append a second copy of every pattern.
window_starts = range(0, input_len - seq_length, 1)
x_data = [encoded_inputs[i:i + seq_length] for i in window_starts]
y_data = [encoded_inputs[i + seq_length] for i in window_starts]

In [27]:
# Number of (input window, next character) training patterns
n_patterns = len(x_data)

In [28]:
# Arrange the windows as an (n_patterns, seq_length, 1) array, then divide by
# the vocabulary size so the integer character codes are scaled into [0, 1).
X = numpy.reshape(x_data, (n_patterns, seq_length, 1)) / float(vocab_len)

In [29]:
# One-hot encode the targets. The class count is pinned to the vocabulary size:
# left to its default, to_categorical infers it as max(y_data) + 1, which gives
# too few columns if the highest-numbered character never occurs as a target.
y = np_utils.to_categorical(y_data, num_classes=vocab_len)

In [30]:
# Stacked network: two sequence-returning LSTM layers feed a final LSTM, each
# followed by dropout at rate 0.2, ending in a softmax layer with one unit per
# column of the one-hot targets.
model = Sequential([
    LSTM(256, input_shape=(X.shape[1], X.shape[2]), return_sequences=True),
    Dropout(0.2),
    LSTM(256, return_sequences=True),
    Dropout(0.2),
    LSTM(128),
    Dropout(0.2),
    Dense(y.shape[1], activation='softmax'),
])

In [31]:
# The targets are one-hot vectors, so train with categorical cross-entropy
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [32]:
# Checkpoint file written during training
filepath = "pushkin_model_weights_saved.hdf5"
# Monitor the loss and save only when it reaches a new minimum
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
# Callback list intended for model.fit
desired_callbacks = [checkpoint]

In [35]:
#model.load_weights(filepath)
#model.fit(X, y, epochs=16, batch_size=256, callbacks=desired_callbacks)

Epoch 1/19
Epoch 00001: loss improved from inf to 2.50579, saving model to pushkin_model_weights_saved.hdf5
Epoch 2/19
Epoch 00002: loss improved from 2.50579 to 2.34097, saving model to pushkin_model_weights_saved.hdf5
Epoch 3/19
Epoch 00003: loss improved from 2.34097 to 2.23453, saving model to pushkin_model_weights_saved.hdf5
Epoch 4/19
 163/5374 [..............................] - ETA: 2:47:46 - loss: 2.1839

KeyboardInterrupt: 

In [37]:
#model.load_weights(filepath)
#model.compile(loss='categorical_crossentropy', optimizer='adam')

In [38]:
#num_to_char = dict((i, c) for i, c in enumerate(chars))

In [44]:
#start = numpy.random.randint(0, len(x_data) - 1)
#pattern = x_data[start]
#print("Random Seed:")
#print("\"", ''.join([num_to_char[value] for value in pattern]), "\"")

Random Seed:
" щийся оду x x v i i i ту тему стихотворении пушкина живописцу см ниже примеч этому стих 670 написан  "


In [None]:
"""
4 эпохи:
04 г печатно приветствовал эдипа афинах послании владиславу александ ровичу озерову традиционные фор
5 5 ошибками пропусками н 1 8 9 9 п р м е ч 3 5 8 н 1 9 0 0 2 9 т 1 примеч 3 7 4 п б л л н м й б р ю
ой тьмою участи моей вздохнет урной гробовою стихотворения 1815 г 211 поздняя редакция подражание ви
ию 540 комментарии л 65 венере лаисы посвящении зеркала лаиса венере посвящая свое зеркало заглавие
щийся оду x x v i i i ту тему стихотворении пушкина живописцу см ниже примеч этому стих 670 написан
"""