In [3]:
import numpy as np
import tensorflow as tf

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, SimpleRNN, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.data import Dataset, AUTOTUNE
from tensorflow import keras
from keras import losses


ModuleNotFoundError: No module named 'numpy'

In [None]:
# Sanity check before training: report how many GPUs TensorFlow can see.
visible_gpus = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(visible_gpus))

In [None]:
# Load the training corpus and flatten it into one long string.
# The file is read inside a `with` block so the handle is closed
# deterministically instead of being left to the garbage collector.
with open('data2.txt', mode='r', encoding='utf-8') as corpus_file:
    raw = corpus_file.readlines()

data = []
for line in raw:
    # Skip empty lines and chapter headings ('Глава' is Russian for "Chapter").
    if line != '\n' and 'Глава' not in line:
        # Drop the first whitespace-separated token of every kept line
        # (presumably a line/verse number - confirm against data2.txt) and
        # collapse the remaining whitespace runs to single spaces.
        data.append(' '.join(line.split()[1:]))

# Defensive normalisation: str.split() above already splits on newlines and
# non-breaking spaces, so this pass normally changes nothing.
data = [line.replace('\n', ' ').replace('\xa0', ' ') for line in data]

# The whole corpus as a single space-separated string.
text = ' '.join(data)

In [None]:
# Preview the first 100 characters of the cleaned corpus string.
text[:100]

In [None]:
def get_features_target(seq):
    """Split a sequence into next-element prediction inputs and labels.

    Args:
        seq: Any sliceable sequence (list, string, array, tensor, ...).

    Returns:
        A pair ``(features, target)``: ``features`` is ``seq`` without its
        last element and ``target`` is ``seq`` without its first element,
        so ``target[i]`` is the element that follows ``features[i]``.
    """
    return seq[:-1], seq[1:]

# Used below both as the length of each character window and as the number
# of windows grouped into one training batch.
BATCH_SIZE = 128

# Character vocabulary: every distinct character of the corpus, sorted,
# with lookup tables in both directions.
alphabet = np.array(sorted(set(text)))
idx_to_sym = dict(enumerate(alphabet))
sym_to_idx = {sym: idx for idx, sym in idx_to_sym.items()}

# The corpus encoded as an array of vocabulary indices.
text_idx = np.array([sym_to_idx[char] for char in text])

# Cut the index stream into consecutive windows of BATCH_SIZE indices;
# a trailing partial window is dropped.
sequences = Dataset.from_tensor_slices(text_idx).batch(BATCH_SIZE, drop_remainder=True)

# Turn every window into an (inputs, next-character labels) pair.
dataset = sequences.map(get_features_target)

# Group the pairs into full batches, loop over them indefinitely and
# prefetch. NOTE: this rebinds `data`, which the loading cell used for the
# list of cleaned lines.
data = (
    dataset
    .batch(BATCH_SIZE, drop_remainder=True)
    .repeat()
    .prefetch(AUTOTUNE)
)

In [None]:
# Character-level language model:
# embedding -> stateful SimpleRNN -> one logit per vocabulary entry per step.
vocab_size = len(alphabet)

rnn_model = keras.Sequential()
# The embedding width reuses BATCH_SIZE as its value.
rnn_model.add(keras.layers.Embedding(vocab_size, BATCH_SIZE))
# stateful=True: the layer carries its hidden state over from one batch to
# the next, so every batch fed to the model must have the same size.
rnn_model.add(keras.layers.SimpleRNN(64, return_sequences=True, stateful=True))
# No activation here: the layer emits raw logits, hence from_logits=True below.
rnn_model.add(keras.layers.Dense(vocab_size))

loss_fn = losses.SparseCategoricalCrossentropy(from_logits=True)
rnn_model.compile(optimizer='adam', loss=loss_fn, metrics=['accuracy'])

# `data` repeats forever, so the epoch length is given explicitly:
# the number of full batches that the windows in `sequences` form.
steps_per_epoch = len(sequences) // BATCH_SIZE
rnn_model.fit(data, epochs=40, verbose=1, steps_per_epoch=steps_per_epoch)

In [None]:
def _reset_recurrent_state(model):
    """Clear hidden state carried over by stateful recurrent layers, if any."""
    model_reset = getattr(model, 'reset_states', None)
    if callable(model_reset):
        model_reset()
        return
    # Fallback for model objects without a model-level reset: reset each
    # stateful layer individually.
    for layer in getattr(model, 'layers', ()):
        if not getattr(layer, 'stateful', False):
            continue
        layer_reset = getattr(layer, 'reset_states', None) or getattr(layer, 'reset_state', None)
        if callable(layer_reset):
            layer_reset()


def predict_next(sample, model, tokenizer, vocabulary, n_next, rnd_power, batch_size):
    """Generate text by sampling characters from a stateful character model.

    The prompt is fed once as a whole; afterwards only the most recently
    sampled character is fed, relying on the model's carried-over recurrent
    state to remember the earlier context.

    Args:
        sample: Non-empty prompt string; every character must be a key of
            ``tokenizer``.
        model: Callable model mapping a ``(batch, time)`` tensor of token
            ids to ``(batch, time, vocab)`` logits.
        tokenizer: Mapping from character to token id.
        vocabulary: Mapping from token id to character.
        n_next: Number of characters to generate after the prompt.
        rnd_power: Sampling temperature (> 0). Logits are divided by it, so
            values below 1 make the sampling more conservative.
        batch_size: Batch size the model expects; the input is replicated
            this many times and only the first row's output is used.

    Returns:
        The prompt followed by ``n_next`` generated characters.

    Raises:
        ValueError: If ``sample`` is empty or ``rnd_power`` is not positive.
        KeyError: If ``sample`` contains a character missing from ``tokenizer``.
    """
    if not sample:
        raise ValueError("sample must contain at least one character")
    if rnd_power <= 0:
        raise ValueError("rnd_power must be positive")

    # Start from a clean state so the output depends only on `sample`, not on
    # whatever the model last saw during training or a previous generation.
    _reset_recurrent_state(model)

    predicted = [tokenizer[char] for char in sample]
    step_input = list(predicted)

    for _ in range(n_next):
        # Replicate the current input across the batch dimension the model expects.
        batch = tf.repeat(tf.expand_dims(step_input, 0), batch_size, axis=0)
        logits = model(batch)
        # All rows are identical; keep row 0 and only its last time step,
        # shaped (1, vocab) as tf.random.categorical requires.
        last_logits = logits[0, -1:] / rnd_power
        next_token = int(tf.random.categorical(last_logits, num_samples=1)[0, 0].numpy())
        predicted.append(next_token)
        # Earlier context lives in the recurrent state, so feed only the new token.
        step_input = [next_token]

    return ''.join(vocabulary[i] for i in predicted)

In [None]:
# Generate 200 characters from the prompt 'б'. rnd_power divides the logits
# before sampling, so 0.6 sharpens the distribution moderately.
generated_b = predict_next(
    'б', rnn_model, sym_to_idx, idx_to_sym,
    n_next=200, rnd_power=0.6, batch_size=BATCH_SIZE,
)
print(generated_b)

In [None]:
# Generate 100 characters from the prompt '1' with a lower rnd_power (0.2),
# i.e. sampling concentrated more strongly on the highest-scoring characters.
generated_1 = predict_next(
    '1', rnn_model, sym_to_idx, idx_to_sym,
    n_next=100, rnd_power=0.2, batch_size=BATCH_SIZE,
)
print(generated_1)