In [11]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.metrics import top_k_categorical_accuracy
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import *
from tensorflow.nn import weighted_cross_entropy_with_logits
from tensorflow.data import Dataset

import re
import glob
import random
import numpy as np

In [15]:
remove = ['\u200e', '[', ']', '(', ')', '\x98', '́', '\r']
replace = {
    '»': '"',
    '«': '"',
    '“': '"',
    '„': '"',
    '...': '…',
}

def preprocess_str(string):
    string = string.lower()
    for x in remove:
        string = string.replace(x, '')
    for key, value in replace.items():
        string = string.replace(key, value)
    string = re.sub(r' +', ' ', string)
    return string.replace('\n ', '\n')

chars = set()
for path in glob.glob('poems/*.txt'):
    with open(path, 'rb') as f:
        text = preprocess_str(f.read().decode('utf-8'))
    chars.update(list(text))

corpus = {value: i for i, value in enumerate(sorted(chars))}
corpus_inv = {value: key for key, value in corpus.items()}

In [12]:
window_size = 100
min_chars = 50
batch_size = 64

def read(path):
    with open(path, 'rb') as f:
        text = preprocess_str(f.read())
    
    X, y = [], []
    for i in range(min_chars, len(text) - min_chars):
        if i < window_size:
            x = list(map(lambda x: corpus[x], text[:max(min_chars, i)]))
        else:
            x = list(map(lambda x: corpus[x], text[i - window_size:i]))
        y.append(corpus[text[i]])
        X.append(x)

    X = pad_sequences(X, window_size)
    np.random.shuffle(X)
    
    return X, y

In [30]:
def read_poem(path):
    with open(path, 'rb') as f:
        text = preprocess_str(f.read().decode('utf-8'))
    text_as_int = [corpus[c] for c in text]

    ds = Dataset.from_tensor_slices(text_as_int)
    ds = ds.batch(window_size + 1, drop_remainder=True).map(lambda x: (x[:-1], x[1:]))
    
    return ds.shuffle(1024).batch(batch_size, drop_remainder=True)

def read_n_poems(n=None):
    total_chars = 0
    total_words = 0
    dataset = None
    
    for path in glob.glob('poems/*.txt')[:n]:
        with open(path, 'rb') as f:
            text = preprocess_str(f.read().decode('utf-8'))
            total_chars += len(text)
            total_words += len(text.split(' '))
        ds = read_poem(path)
        dataset = ds if dataset is None else dataset.concatenate(ds)
    
    print(f'Всего прочитано сивмолов: {total_chars}')
    print(f'Всего прочитано слов: {total_words}')
    
    return dataset

dataset = read_n_poems()

Всего прочитано сивмолов: 443330
Всего прочитано слов: 56780


In [4]:
def read_all(cnt=None):
    X, y = [], []
    total_chars = 0
    total_words = 0

    for path in glob.glob('poems/*.txt')[:cnt]:
        with open(path, 'rb') as f:
            text = preprocess_str(f.read())
            total_chars += len(text)
            total_words += len(text.split(' '))
        a, b = read(path)
        X.extend(a)
        y.extend(b)

    X = np.array(X)
    y = to_categorical(y, len(corpus.keys()))
    
    print(f'Всего прочитано сивмолов: {total_chars}')
    print(f'Всего прочитано слов: {total_words}')
    
    return X, y

In [32]:
from sklearn.utils.class_weight import compute_class_weight

chars = []
for path in glob.glob('poems/*.txt'):
    with open(path, 'rb') as f:
        chars.extend(list(preprocess_str(f.read().decode('utf-8'))))

weights = compute_class_weight('balanced', sorted(list(corpus.keys())), chars)
class_weight = dict(zip(range(len(weights)), weights))

In [35]:
def top_2_acc(x, y):
    return top_k_categorical_accuracy(x, y, 2)

model = Sequential([
    Embedding(len(corpus), 128, batch_input_shape=[batch_size, None]),
    GRU(512, return_sequences=True, stateful=True),
    Dense(len(corpus)),
])

model.compile('adam', loss='sparse_categorical_crossentropy', weighted_metrics=['acc', top_2_acc])

In [7]:
X, y = read_all(300)
print(f'\nX.shape = {X.shape}')
print(f'y.shape = {y.shape}')

Всего прочитано сивмолов: 198487
Всего прочитано слов: 25732

X.shape = (168546, 100)
y.shape = (168546, 45)


In [39]:
next(iter(dataset))

(<tf.Tensor: shape=(64, 100), dtype=int32, numpy=
 array([[32, 10, 26, ..., 24, 26, 24],
        [24, 40,  1, ..., 33, 15, 26],
        [28,  1, 28, ..., 15, 23, 35],
        ...,
        [24,  4,  0, ..., 23, 18, 15],
        [14, 10, 41, ..., 20, 24, 12],
        [10, 34,  1, ..., 15, 23, 18]])>,
 <tf.Tensor: shape=(64, 100), dtype=int32, numpy=
 array([[10, 26, 38, ..., 26, 24, 28],
        [40,  1, 24, ..., 15, 26, 10],
        [ 1, 28, 12, ..., 23, 35, 18],
        ...,
        [ 4,  0, 17, ..., 18, 15, 44],
        [10, 41,  1, ..., 24, 12, 24],
        [34,  1, 18, ..., 23, 18, 28]])>)

In [36]:
%%time 

model.fit(
#     X, y,
    dataset,
    epochs=300,
    class_weight=class_weight,
)

Epoch 1/300


ValueError: in user code:

    C:\Users\Yoskutik\anaconda3\lib\site-packages\tensorflow\python\keras\engine\training.py:806 train_function  *
        return step_function(self, iterator)
    <ipython-input-35-814be9822700>:2 top_2_acc  *
        return top_k_categorical_accuracy(x, y, 2)
    C:\Users\Yoskutik\anaconda3\lib\site-packages\tensorflow\python\util\dispatch.py:201 wrapper  **
        return target(*args, **kwargs)
    C:\Users\Yoskutik\anaconda3\lib\site-packages\tensorflow\python\keras\metrics.py:3335 top_k_categorical_accuracy
        nn.in_top_k(y_pred, math_ops.argmax(y_true, axis=-1), k), K.floatx())
    C:\Users\Yoskutik\anaconda3\lib\site-packages\tensorflow\python\util\dispatch.py:201 wrapper
        return target(*args, **kwargs)
    C:\Users\Yoskutik\anaconda3\lib\site-packages\tensorflow\python\ops\nn_ops.py:5608 in_top_k
        return gen_nn_ops.in_top_kv2(predictions, targets, k, name=name)
    C:\Users\Yoskutik\anaconda3\lib\site-packages\tensorflow\python\ops\gen_nn_ops.py:4691 in_top_kv2
        "InTopKV2", predictions=predictions, targets=targets, k=k, name=name)
    C:\Users\Yoskutik\anaconda3\lib\site-packages\tensorflow\python\framework\op_def_library.py:744 _apply_op_helper
        attrs=attr_protos, op_def=op_def)
    C:\Users\Yoskutik\anaconda3\lib\site-packages\tensorflow\python\framework\func_graph.py:593 _create_op_internal
        compute_device)
    C:\Users\Yoskutik\anaconda3\lib\site-packages\tensorflow\python\framework\ops.py:3485 _create_op_internal
        op_def=op_def)
    C:\Users\Yoskutik\anaconda3\lib\site-packages\tensorflow\python\framework\ops.py:1975 __init__
        control_input_ops, op_def)
    C:\Users\Yoskutik\anaconda3\lib\site-packages\tensorflow\python\framework\ops.py:1815 _create_c_op
        raise ValueError(str(e))

    ValueError: Shape must be rank 2 but is rank 3 for '{{node in_top_k/InTopKV2}} = InTopKV2[T=DT_INT64](sequential_2/dense_2/BiasAdd, ArgMax_1, in_top_k/InTopKV2/k)' with input shapes: [64,100,45], [64], [].


In [10]:
def read_text(text):
    x = list(map(lambda x: corpus[x], text))
    return pad_sequences([x], window_size)

def get_next(value, next_value):
    return np.array([[*value[0][1:], next_value]])

def predict(X):
    return np.argmax(model.predict(X)[0])

In [15]:
initial_text = 'здравствуй, мой друг\nчто хочет покушать?\n'
X = read_text(initial_text)

print(initial_text, end='')
for _ in range(60):
    x = predict(X)
    print(corpus_inv[x], end='')
    X = get_next(X, x)

здравствуй, мой друг
что хочет покушать?
кныяынугвпьдзжнсттлбнолузуйвзатирцсарррсрксв,
йыхляосл
далиь

In [7]:
# num_cores = multiprocessing.cpu_count()

# class DataGenerator(Sequence):
#     def __init__(self, paths, batch_size=8):
#         self.batch_size = batch_size
#         self.paths = paths
#         self.on_epoch_end()

#     def __len__(self):
#         return int(np.floor(len(self.paths) / self.batch_size))

#     def __getitem__(self, index):
#         indexes = self.indexes[index * self.batch_size : (index + 1) * self.batch_size]
#         paths = [self.paths[k] for k in indexes]
        
#         results = Parallel(n_jobs=num_cores)(
#             delayed(read)(path) for path in paths
#         )
        
#         for i in range(0, len(results), 100):
#             X = []
#             y = []
#             for a, b in results[i:i+100]:
#                 X.extend(a)
#                 y.extend(b)
        
#             yield np.array(X)[:, :, np.newaxis], to_categorical(y, len(corpus.keys()))

#     def on_epoch_end(self):
#         self.indexes = np.arange(len(self.paths))
#         np.random.shuffle(self.indexes)