In [1]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.metrics import top_k_categorical_accuracy
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import *
from tensorflow.nn import weighted_cross_entropy_with_logits

import re
import glob
import random
import numpy as np

In [2]:
remove = ['\u200e', '[', ']', '(', ')', '\x98', '́', '\r']
replace = {
    '»': '"',
    '«': '"',
    '“': '"',
    '„': '"',
    '...': '…',
    'ё': 'е',
    'ъ': 'ь',
}

def preprocess_str(string):
    string = string.decode('UTF-8').lower()
    for x in remove:
        string = string.replace(x, '')
    for key, value in replace.items():
        string = string.replace(key, value)
    string = re.sub(r' +', ' ', string)
    return string.replace('\n ', '\n')

chars = set()
for path in glob.glob('poems/*.txt'):
    try:
        with open(path, 'rb') as f:
            text = preprocess_str(f.read())
        chars.update(list(text))
    except:
        print(path)

corpus = {value: i for i, value in enumerate(sorted(chars))}
corpus_inv = {value: key for key, value in corpus.items()}

In [3]:
window_size = 32
min_chars = 16

def read(path):
    with open(path, 'rb') as f:
        text = preprocess_str(f.read())
    
    X, y = [], []
    for i in range(min_chars, len(text) - min_chars):
        if i < window_size:
            x = list(map(lambda x: corpus[x], text[:max(min_chars, i)]))
        else:
            x = list(map(lambda x: corpus[x], text[i - window_size:i]))
        y.append(corpus[text[i]])
        X.append(x)

    X = pad_sequences(X, window_size)
    np.random.shuffle(X)
    
    return X, y

In [4]:
def read_all():
    X, y = [], []

    for path in glob.glob('poems/*.txt'):
        a, b = read(path)
        X.extend(a)
        y.extend(b)

    X = np.array(X)
    y = to_categorical(y, len(corpus.keys()))
    
    return X, y

In [5]:
def top_2_acc(x, y):
    return top_k_categorical_accuracy(x, y, 2)

model = Sequential([
    Input(shape=[window_size]),
    Embedding(input_dim=len(corpus.keys()), output_dim=16),
    LSTM(64, return_sequences=True),
    LSTM(64),
    Dense(len(corpus.keys()), activation='softmax', use_bias=False),
])

model.compile('adam', loss='categorical_crossentropy', metrics=['acc', top_2_acc])

In [6]:
X, y = read_all()

In [7]:
model.fit(
    X, y, 
    epochs=200, 
    batch_size=64
)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200


Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78/200
Epoch 79/200
Epoch 80/200
Epoch 81/200
Epoch 82/200
Epoch 83/200
Epoch 84/200
Epoch 85/200
Epoch 86/200
Epoch 87/200
Epoch 88/200
Epoch 89/200
Epoch 90/200
Epoch 91/200
Epoch 92/200
Epoch 93/200
Epoch 94/200
Epoch 95/200
Epoch 96/200
Epoch 97/200
Epoch 98/200
Epoch 99/200
Epoch 100/200
Epoch 101/200
Epoch 102/200
Epoch 103/200
Epoch 104/200
Epoch 105/200
Epoch 106/200
Epoch 107/200
Epoch 108/200
Epoch 109/200
Epoch 110/200
Epoch 111/200
Epoch 112/200
Epoch 113/200
Epoch 114/200
Epoch 115/200
Epoch 116/200
Epoch 117/200
Epoch 118/200
Epoch 119/200
Epoch 120/200
Epoch 121/200
Epoch 122/200
Epoch 123/200
Epoch 124/200
Epoch 125/200
Epoch 126/200
Epoch 127/200
Epoch 128/200
Epoch 129/200
Epoch 130/200
 617/6633 [=>............................] - ETA: 39s - loss: 3.1398 - acc: 0.1364 - top_2_acc: 0.2239

KeyboardInterrupt: 

In [18]:
weights = []
for i in range(len(corpus.keys())):
    weights.append(counter[corpus_inv[i]])
weights = 1 / np.array(weights)

def loss(y_true, y_pred):
    return tf.reduce_mean(weighted_cross_entropy_with_logits(y_true, y_pred, weights)) * 100

In [8]:
counter = {}
for path in glob.glob('poems/*.txt'):
    try:
        with open(path, 'rb') as f:
            text = preprocess_str(f.read())
        for c in text:
            if c in counter.keys():
                counter[c] += 1
            else:
                counter[c] = 1
        if '  ' in text:
            print(1)
    except:
        print(path)
        break

In [9]:
# def read_text(text):
#     X = []
#     y = []
#     x = list(map(lambda x: corpus[x], text))
#     return pad_sequences([x], window_size)

# def get_next(value, next_value):
#     return np.array([[*value[0][1:], next_value]])

In [33]:
# X = read_text('привет, любовь моя,\nчто делает меня счастливым\n')
# x = predict(X)
# print(corpus_inv[x])
# X = get_next(X, x)
# x = predict(X)
# print(corpus_inv[x])
# X = get_next(X, x)
# x = predict(X)
# print(corpus_inv[x])
# X = get_next(X, x)
# x = predict(X)
# print(corpus_inv[x])
# X = get_next(X, x)
# x = predict(X)
# print(corpus_inv[x])

 
 
 
 
 


In [7]:
# num_cores = multiprocessing.cpu_count()

# class DataGenerator(Sequence):
#     def __init__(self, paths, batch_size=8):
#         self.batch_size = batch_size
#         self.paths = paths
#         self.on_epoch_end()

#     def __len__(self):
#         return int(np.floor(len(self.paths) / self.batch_size))

#     def __getitem__(self, index):
#         indexes = self.indexes[index * self.batch_size : (index + 1) * self.batch_size]
#         paths = [self.paths[k] for k in indexes]
        
#         results = Parallel(n_jobs=num_cores)(
#             delayed(read)(path) for path in paths
#         )
        
#         for i in range(0, len(results), 100):
#             X = []
#             y = []
#             for a, b in results[i:i+100]:
#                 X.extend(a)
#                 y.extend(b)
        
#             yield np.array(X)[:, :, np.newaxis], to_categorical(y, len(corpus.keys()))

#     def on_epoch_end(self):
#         self.indexes = np.arange(len(self.paths))
#         np.random.shuffle(self.indexes)