In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, SimpleRNN, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.data import Dataset, AUTOTUNE
from tensorflow import keras
from keras import models, callbacks, utils, losses

from sklearn.model_selection import train_test_split

In [None]:
# Sanity check before training: report how many GPUs TensorFlow can see.
print(
    "Num GPUs Available: ",
    len(tf.config.list_physical_devices('GPU')),
)

In [None]:
# Read the corpus with a context manager so the file handle is closed
# as soon as the lines are in memory.
with open('data2.txt', mode='r', encoding='utf-8') as corpus_file:
    raw = corpus_file.readlines()

# Keep every line that is not a bare newline and does not contain 'Глава'
# ("Chapter"), dropping its first whitespace-separated token
# (presumably a verse/line number -- TODO confirm against the data file).
# str.split() with no arguments splits on any Unicode whitespace, including
# '\n' and the non-breaking space '\xa0', so the re-joined lines need no
# further whitespace clean-up.
data = [
    ' '.join(line.split()[1:])
    for line in raw
    if line != '\n' and 'Глава' not in line
]

# The whole corpus as one space-separated string.
text = ' '.join(data)

In [None]:
# Peek at the first 100 characters of the cleaned corpus.
text[0:100]

In [None]:
def get_features_target(seq):
    """Split a sequence into next-step prediction inputs and targets.

    Returns a ``(features, target)`` tuple: ``features`` is ``seq`` without its
    last element and ``target`` is ``seq`` without its first element, so
    ``target[i]`` is the element that follows ``features[i]`` in ``seq``.
    """
    return seq[:-1], seq[1:]

# Two independent pipeline settings, kept as separate constants so that
# changing one does not silently change the other.
BATCH_SIZE = 100  # number of sequences per training batch
SEQ_LENGTH = 100  # characters per window; yields SEQ_LENGTH - 1 input/target steps

# Character-level vocabulary: every distinct character of the corpus, sorted.
alphabet = np.array(sorted(set(text)))

# Despite the names, these map single characters (not words) to ids and back.
word_index = {char: i for i, char in enumerate(alphabet)}
index_word = {i: char for i, char in enumerate(alphabet)}

# Encode the corpus as integer ids and cut it into fixed-length windows;
# a trailing window shorter than SEQ_LENGTH is dropped.
encoded_text = np.array([word_index[char] for char in text])
sequences = Dataset.from_tensor_slices(encoded_text).batch(SEQ_LENGTH, drop_remainder=True)

# Each window becomes an (input, target) pair shifted by one character.
dataset = sequences.map(get_features_target)

# Group windows into fixed-size batches (incomplete batch dropped), repeat
# indefinitely for multi-epoch training, and prefetch batches.
data = dataset.batch(BATCH_SIZE, drop_remainder=True).repeat()
data = data.prefetch(AUTOTUNE)

In [None]:
# Model hyperparameters. The embedding width is a property of the model, not
# of the data pipeline, so it has its own constant instead of borrowing
# BATCH_SIZE.
EMBEDDING_DIM = 100
RNN_UNITS = 16

# Character-level language model: embed each character id, run a stateful
# SimpleRNN over the sequence, and emit one logit per vocabulary entry at
# every time step.
model = keras.Sequential([
    keras.layers.Embedding(len(alphabet), EMBEDDING_DIM),
    keras.layers.SimpleRNN(RNN_UNITS, return_sequences=True, stateful=True),
    keras.layers.Dense(len(alphabet)),
])

# The Dense layer has no activation, so the loss is told to expect raw logits.
model.compile(
    optimizer='adam',
    loss=losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'],
)

# `data` repeats forever, so one epoch is bounded explicitly:
# number of windows // windows per batch = full batches per pass.
model.fit(
    data,
    epochs=40,
    verbose=1,
    steps_per_epoch=len(sequences) // BATCH_SIZE,
)

In [None]:
def predict_next(sample, model, tokenizer, vocabulary, n_next, temperature, batch_size):
    """Generate text by sampling ``n_next`` characters that continue ``sample``.

    The seed is fed to the model once; afterwards only the most recently
    sampled character is fed back, relying on the model's recurrent state to
    carry the context.

    Args:
        sample: Seed string; every character must be a key of ``tokenizer``.
        model: Callable that maps a ``(batch_size, steps)`` tensor of token ids
            to per-step logits indexed as ``[row, step, token]``. Layers of the
            model flagged ``stateful`` have their recurrent state cleared first.
        tokenizer: Mapping from character to integer token id.
        vocabulary: Mapping from integer token id back to its character.
        n_next: Number of characters to generate after the seed.
        temperature: Positive divisor applied to the logits before sampling;
            values below 1 make sampling greedier, values above 1 more random.
        batch_size: How many identical copies of the input row are fed to the
            model (presumably the fixed batch size the stateful model was
            trained with -- confirm against the training pipeline).

    Returns:
        ``sample`` followed by the ``n_next`` generated characters.

    Raises:
        ValueError: If ``sample`` is empty or ``temperature`` is not positive.
        KeyError: If ``sample`` contains a character missing from ``tokenizer``.
    """
    if not sample:
        raise ValueError('sample must contain at least one character')
    if temperature <= 0:
        raise ValueError('temperature must be positive')

    predicted = [tokenizer[char] for char in sample]

    # A stateful RNN keeps its hidden state between calls, so without a reset
    # the output would depend on whatever the model processed last (training
    # batches or a previous generation). Keras 2 names the method
    # `reset_states`, Keras 3 `reset_state`.
    for layer in getattr(model, 'layers', ()):
        if getattr(layer, 'stateful', False):
            reset = getattr(layer, 'reset_states', None) or getattr(layer, 'reset_state', None)
            if callable(reset):
                reset()

    def _as_batch(token_ids):
        # Shape (len(token_ids),) -> (batch_size, len(token_ids)), all rows identical.
        row = tf.expand_dims(token_ids, 0)
        return tf.repeat(row, batch_size, axis=0)

    sample_tensor = _as_batch(predicted)
    for _ in range(n_next):
        logits = model(sample_tensor)
        # All rows are identical copies: use row 0 and only its final time
        # step, kept 2-D as (1, vocab) because tf.random.categorical expects
        # [batch, num_classes] logits.
        last_step_logits = logits[0, -1:] / temperature
        sampled = tf.random.categorical(last_step_logits, num_samples=1)
        # Store a plain int so every later input tensor has the same dtype.
        next_id = int(sampled[0, 0].numpy())
        predicted.append(next_id)
        # The recurrent state already encodes the history, so only the newly
        # sampled character is fed back.
        sample_tensor = _as_batch([next_id])

    return ''.join(vocabulary[token_id] for token_id in predicted)

In [None]:
# Generate 200 characters seeded with 'б' at temperature 0.6.
print(
    predict_next(
        model=model,
        tokenizer=word_index,
        vocabulary=index_word,
        batch_size=BATCH_SIZE,
        sample='б',
        n_next=200,
        temperature=0.6,
    )
)

In [None]:
# Generate 100 characters seeded with '1' at the lower temperature 0.2.
print(
    predict_next(
        model=model,
        tokenizer=word_index,
        vocabulary=index_word,
        batch_size=BATCH_SIZE,
        sample='1',
        n_next=100,
        temperature=0.2,
    )
)