Устанавливаем и импортируем библиотеки

In [1]:
!pip install tensorflow



In [2]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import pandas as pd

Собираем датасет. Я взяла первую книгу о Гарри Поттере, т.к. уже ее использовала для других проектов по корпусной лингвистике.

In [3]:
import re

with open('harry potter.txt', 'r', encoding='utf-8') as data:
    data = data.read()

pattern = r'\s*Page\s*\d+\s*|[^\w\s.]|\n(?=\w)'
cleantext = re.sub(pattern, '', data.lower()).strip()

sentences = re.split(r'\.\s*', cleantext)
sentences = [sentence.strip() for sentence in sentences if sentence.strip()]

with open('clean_hp.txt', 'w', encoding='utf-8') as f:
    for sentence in sentences:
        f.write(sentence + '\n')

with open('clean_hp.txt', 'r', encoding='utf-8') as hp:
  data = hp.read()

data = data.split('\n')
print(data[:20])

['harry potter and the sorcerers stone', '', 'chapter one', 'the boy who lived', 'mr', 'and mrs', 'dursley of number four privet drive were proud to saythat they were perfectly normal thank you very much', 'they were the lastpeople youd expect to be involved in anything strange or mysteriousbecause they just didnt hold with such nonsense', 'mr', 'dursley was the director of a firm called grunnings which madedrills', 'he was a big beefy man with hardly any neck although he didhave a very large mustache', 'mrs', 'dursley was thin and blonde and hadnearly twice the usual amount of neck which came in very useful as shespent so much of her time craning over garden fences spying on theneighbors', 'the dursleys had a small son called dudley and in theiropinion there was no finer boy anywhere', 'the dursleys had everything they wanted but they also had a secret andtheir greatest fear was that somebody would discover it', 'they didntthink they could bear it if anyone found out about the potters

Подготовка данных

In [4]:
# Инициализируем токенизатор
tokenizer = Tokenizer()

# Обучаем токенизатор на заголовках
tokenizer.fit_on_texts(data)

# Преобразуем заголовки в последовательности чисел
sequences = tokenizer.texts_to_sequences(data)

# Создаем входные и выходные данные
X = []
y = []
for seq in sequences:
    for i in range(1, len(seq)):
        X.append(seq[:i])
        y.append(seq[i])

Числа - индексы слов. Х - это входные последовательности индексов слов, а y - индекс для следующего слова, который модель учится предсказывать.

In [5]:
X[:10], y[:10]

([[7],
  [7, 124],
  [7, 124, 2],
  [7, 124, 2, 1],
  [7, 124, 2, 1, 635],
  [533],
  [1],
  [1, 149],
  [1, 149, 74],
  [2]],
 [124, 2, 1, 635, 146, 39, 149, 74, 909, 232])

In [6]:
# Преобразуем списки в массивы numpy
X = np.asarray(X, dtype="object")
y = np.array(y)

# Дополняем последовательности до одинаковой длины
X = pad_sequences(X)

# Преобразуем y в one-hot encoding
y = tf.keras.utils.to_categorical(y, num_classes=len(tokenizer.word_index) + 1)

Создание и обучение модели

In [7]:
# Создаем модель
model = Sequential()

# Добавляем слой Embedding
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=100, input_length=X.shape[1]))

# Добавляем слой LSTM
model.add(LSTM(150, return_sequences=False))

# Добавляем полносвязный слой
model.add(Dense(len(tokenizer.word_index) + 1, activation='softmax'))

# Компилируем модель
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Выводим информацию о модели
model.summary()



In [8]:
# Обучаем модель
history = model.fit(X, y, epochs=50, batch_size=64, validation_split=0.2)

Epoch 1/50
[1m837/837[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m341s[0m 404ms/step - accuracy: 0.0401 - loss: 7.3990 - val_accuracy: 0.0481 - val_loss: 6.9920
Epoch 2/50
[1m837/837[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m357s[0m 426ms/step - accuracy: 0.0579 - loss: 6.6120 - val_accuracy: 0.0713 - val_loss: 6.8005
Epoch 3/50
[1m837/837[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m358s[0m 428ms/step - accuracy: 0.0883 - loss: 6.1701 - val_accuracy: 0.0883 - val_loss: 6.7311
Epoch 4/50
[1m837/837[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m356s[0m 426ms/step - accuracy: 0.1166 - loss: 5.7517 - val_accuracy: 0.0938 - val_loss: 6.6935
Epoch 5/50
[1m837/837[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m370s[0m 412ms/step - accuracy: 0.1286 - loss: 5.4462 - val_accuracy: 0.0973 - val_loss: 6.7491
Epoch 6/50
[1m837/837[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m383s[0m 413ms/step - accuracy: 0.1426 - loss: 5.1631 - val_accuracy: 0.1028 - val_loss: 6.8095
Epoc

Обновление саммари модели

In [9]:
model.summary()

Тестирование

In [10]:
# Функция для генерации текста
def generate_text(seed_text, next_words, max_sequence_len):
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        predicted = np.argmax(model.predict(token_list), axis=-1)

        output_word = ""
        for word, index in tokenizer.word_index.items():
            if index == predicted:
                output_word = word
                break
        seed_text += " " + output_word
    return seed_text

# Генерируем новый заголовок
generated_text = generate_text("Harry", 10, X.shape[1])
print(generated_text)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 374ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 71ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 70ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 70ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 69ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 68ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 69ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 70ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 90ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 84ms/step
Harry had never seen in things that was even air thesame


In [12]:
generated_text = generate_text("Dumbledore", 30, X.shape[1])
print(generated_text)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 82ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 90ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 87ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 82ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 80ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 74ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 72ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 70ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 71ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 68ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 73ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 88ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 72ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 88

In [13]:
generated_text = generate_text("magic", 10, X.shape[1])
print(generated_text)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 84ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 90ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 82ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 83ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 88ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 84ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 84ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 69ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 69ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 71ms/step
magic carpets all gotpunctures have they found out how to be


Сохранение

In [14]:
model.save('hp_generator.keras')