In [1]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.metrics import top_k_categorical_accuracy
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import *
from tensorflow.nn import weighted_cross_entropy_with_logits
from tensorflow.data import Dataset

import re
import glob
import random
import numpy as np

In [2]:
remove = ['\u200e', '[', ']', '(', ')', '\x98', '́', '\r']
replace = {
    '»': '"',
    '«': '"',
    '“': '"',
    '„': '"',
    '...': '…',
}

def preprocess_str(string):
    string = string.lower()
    for x in remove:
        string = string.replace(x, '')
    for key, value in replace.items():
        string = string.replace(key, value)
    string = re.sub(r' +', ' ', string)
    return string.replace('\n ', '\n')

chars = set()
for path in glob.glob('poems/*.txt'):
    with open(path, 'rb') as f:
        text = preprocess_str(f.read().decode('utf-8'))
    chars.update(list(text))

corpus = {value: i for i, value in enumerate(sorted(chars))}
corpus_inv = {value: key for key, value in corpus.items()}

In [3]:
window_size = 100
min_chars = 50
batch_size = 64

def read(path):
    with open(path, 'rb') as f:
        text = preprocess_str(f.read().decode('utf-8'))
    
    X, y = [], []
    for i in range(min_chars, len(text) - min_chars):
        if i < window_size:
            x = list(map(lambda x: corpus[x], text[:max(min_chars, i)]))
        else:
            x = list(map(lambda x: corpus[x], text[i - window_size:i]))
        y.append(corpus[text[i]])
        X.append(x)

    X = pad_sequences(X, window_size)
    np.random.shuffle(X)
    
    return X, y

In [4]:
def read_all(cnt=None):
    X, y = [], []
    total_chars = 0
    total_words = 0

    for path in glob.glob('poems/*.txt')[:cnt]:
        with open(path, 'rb') as f:
            text = preprocess_str(f.read().decode('utf-8'))
            total_chars += len(text)
            total_words += len(text.split(' '))
        a, b = read(path)
        X.extend(a)
        y.extend(b)

    X = np.array(X)
    y = to_categorical(y, len(corpus.keys()))
    
    print(f'Всего прочитано сивмолов: {total_chars}')
    print(f'Всего прочитано слов: {total_words}')
    
    return X, y

In [5]:
from sklearn.utils.class_weight import compute_class_weight

chars = []
for path in glob.glob('poems/*.txt'):
    with open(path, 'rb') as f:
        chars.extend(list(preprocess_str(f.read().decode('utf-8'))))

weights = compute_class_weight('balanced', sorted(list(corpus.keys())), chars)
class_weight = dict(zip(range(len(weights)), weights))

In [6]:
def top_2_acc(x, y):
    return top_k_categorical_accuracy(x, y, 2)

model = Sequential([
    Embedding(len(corpus), 128),
    LSTM(256),
    Dense(len(corpus), activation='softmax'),
])

model.compile('adam', loss='categorical_crossentropy', weighted_metrics=['acc', top_2_acc])

In [7]:
X, y = read_all(300)
print(f'\nX.shape = {X.shape}')
print(f'y.shape = {y.shape}')

Всего прочитано сивмолов: 230840
Всего прочитано слов: 29311

X.shape = (200866, 100)
y.shape = (200866, 45)


In [8]:
%%time 

model.fit(
    X, y,
    epochs=300,
    class_weight=class_weight,
)

Epoch 1/300
Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 9/300
Epoch 10/300
Epoch 11/300
Epoch 12/300
Epoch 13/300
Epoch 14/300
Epoch 15/300
Epoch 16/300
Epoch 17/300
Epoch 18/300
Epoch 19/300
Epoch 20/300
Epoch 21/300
Epoch 22/300
Epoch 23/300
Epoch 24/300
Epoch 25/300
Epoch 26/300
Epoch 27/300

KeyboardInterrupt: 

In [9]:
def read_text(text):
    x = list(map(lambda x: corpus[x], text))
    return pad_sequences([x], window_size)

def get_next(value, next_value):
    return np.array([[*value[0][1:], next_value]])

def predict(X):
    return np.argmax(model.predict(X)[0])

In [10]:
initial_text = 'здравствуй, мой друг\nчто хочет покушать?\n'
X = read_text(initial_text)

print(initial_text, end='')
for _ in range(60):
    x = predict(X)
    print(corpus_inv[x], end='')
    X = get_next(X, x)

здравствуй, мой друг
что хочет покушать?
ым?гшкшшшшццггд!чюз!хгхщххы!ж—чюххж.бгчююююююююююююююююююююю

In [11]:
# num_cores = multiprocessing.cpu_count()

# class DataGenerator(Sequence):
#     def __init__(self, paths, batch_size=8):
#         self.batch_size = batch_size
#         self.paths = paths
#         self.on_epoch_end()

#     def __len__(self):
#         return int(np.floor(len(self.paths) / self.batch_size))

#     def __getitem__(self, index):
#         indexes = self.indexes[index * self.batch_size : (index + 1) * self.batch_size]
#         paths = [self.paths[k] for k in indexes]
        
#         results = Parallel(n_jobs=num_cores)(
#             delayed(read)(path) for path in paths
#         )
        
#         for i in range(0, len(results), 100):
#             X = []
#             y = []
#             for a, b in results[i:i+100]:
#                 X.extend(a)
#                 y.extend(b)
        
#             yield np.array(X)[:, :, np.newaxis], to_categorical(y, len(corpus.keys()))

#     def on_epoch_end(self):
#         self.indexes = np.arange(len(self.paths))
#         np.random.shuffle(self.indexes)