In [1]:
import pandas as pd

In [2]:
# Read the fairy-tale corpus from the CSV that sits next to the notebook folder.
fairy_tales = pd.read_csv(filepath_or_buffer="../data/fairy_tales.csv")

# Print the column / dtype / non-null summary as a quick sanity check of the load.
fairy_tales.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   title   10 non-null     object
 1   text    10 non-null     object
 2   source  10 non-null     object
dtypes: object(3)
memory usage: 368.0+ bytes


In [3]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [4]:
# One lower-cased string per row of the `text` column.
corpus = fairy_tales['text'].str.lower().tolist()

# Fit the word-level vocabulary on the whole corpus.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)

# One extra slot on top of the fitted vocabulary: word indices start at 1,
# which leaves index 0 free for the padding value.
total_words = 1 + len(tokenizer.word_index)

print(total_words)

3555


In [5]:
import numpy as np
from tensorflow import keras
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [11]:
# Longest training sequence in tokens, counting the target word itself, so the
# model sees at most MAX_SEQ_LEN - 1 words of context.
MAX_SEQ_LEN = 50

# Build (context -> next word) examples with a sliding window.
# Every token after the first one in a text becomes a prediction target whose
# context is the tokens right before it, truncated to the most recent
# MAX_SEQ_LEN - 1. Truncating (instead of discarding sequences longer than
# MAX_SEQ_LEN) keeps the whole text as training data rather than only its
# first MAX_SEQ_LEN tokens, and avoids copying ever-growing prefixes.
input_sequences = []

for line in corpus:
    token_list = tokenizer.texts_to_sequences([line])[0]

    # `end` is exclusive; starting at 2 guarantees at least one context token.
    for end in range(2, len(token_list) + 1):
        start = max(0, end - MAX_SEQ_LEN)
        input_sequences.append(token_list[start:end])

if not input_sequences:
    raise ValueError(
        "No training sequences could be built: every text in `corpus` "
        "has fewer than two known tokens."
    )

# Left-pad so the target word is always the last column.
max_sequence_len = min(max(len(seq) for seq in input_sequences), MAX_SEQ_LEN)
input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')

# Split each row into its context (all but the last column) and its target.
xs = input_sequences[:, :-1]
labels = input_sequences[:, -1]

# One-hot targets, one row of width `total_words` per example.
# NOTE(review): this array grows with (number of examples x vocabulary size);
# consider integer labels with a sparse loss if memory becomes a concern.
ys = keras.utils.to_categorical(labels, num_classes=total_words)

In [7]:
from tensorflow.keras import layers
from tensorflow.keras.callbacks import EarlyStopping

In [12]:
# Next-word classifier: embedding -> bidirectional LSTM -> dropout -> softmax
# over the whole vocabulary.
model = keras.Sequential()
model.add(
    layers.Embedding(
        input_dim=total_words,
        output_dim=100,
        input_length=max_sequence_len - 1,  # context width, i.e. columns of `xs`
    )
)
model.add(layers.Bidirectional(layers.LSTM(units=128)))
model.add(layers.Dropout(rate=0.2))
model.add(layers.Dense(units=total_words, activation='softmax'))

# Categorical cross-entropy matches the one-hot encoded targets in `ys`.
model.compile(
    loss='categorical_crossentropy',
    optimizer=keras.optimizers.Adam(learning_rate=0.01),
    metrics=['accuracy'],
)

# Stop once the training loss has not improved for 5 epochs and roll the model
# back to the weights of the best epoch.
early_stopping = EarlyStopping(
    monitor='loss',
    patience=5,
    restore_best_weights=True,
    verbose=1,
)

history = model.fit(
    x=xs,
    y=ys,
    batch_size=64,
    epochs=100,
    callbacks=[early_stopping],
    verbose=1,
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 50: early stopping


In [19]:
def generate_text(seed_text, next_words, temperature=0.8):
    """Extend ``seed_text`` with words sampled from the trained model.

    Relies on the notebook-level ``tokenizer``, ``model`` and
    ``max_sequence_len`` objects.

    Args:
        seed_text: Text the generation starts from.
        next_words: Number of words to append; exactly this many are added.
        temperature: Sampling temperature. Values below 1 sharpen the model's
            distribution, values above 1 flatten it, and 0 selects the most
            probable word at every step (greedy decoding).

    Returns:
        ``seed_text`` followed by ``next_words`` space-separated words.

    Raises:
        ValueError: If ``temperature`` is negative, or if no model output
            index maps to a word in ``tokenizer.index_word``.
    """
    if temperature < 0:
        raise ValueError("temperature must be non-negative")

    context_len = max_sequence_len - 1
    valid_mask = None  # built once, as soon as the model's output width is known

    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=context_len, padding='pre')

        # float64 keeps the renormalised probabilities within the tolerance
        # that np.random.choice enforces on their sum.
        probs = np.asarray(model.predict(token_list, verbose=0)[0], dtype=np.float64)

        if valid_mask is None:
            # Indices without a word (e.g. the padding index 0) must never be
            # sampled; excluding them up front guarantees that every iteration
            # appends a word instead of being silently skipped.
            valid_mask = np.array(
                [idx in tokenizer.index_word for idx in range(len(probs))],
                dtype=bool,
            )
            if not valid_mask.any():
                raise ValueError("no model output index maps to a known word")

        if temperature == 0:
            # Greedy decoding: dividing by zero below would yield NaNs.
            predicted_index = int(np.argmax(np.where(valid_mask, probs, -np.inf)))
        else:
            scaled = np.log(probs + 1e-10) / temperature
            scaled[~valid_mask] = -np.inf
            # Subtract the maximum before exponentiating so small temperatures
            # cannot underflow every term to zero (which would give 0/0).
            scaled -= scaled.max()
            weights = np.exp(scaled)
            predicted_index = int(
                np.random.choice(len(weights), p=weights / weights.sum())
            )

        seed_text += " " + tokenizer.index_word[predicted_index]

    return seed_text

# Sample a 20-word continuation of the seed word; a temperature above 1
# flattens the distribution for more varied output.
print(
    generate_text(
        seed_text="Ерте",
        next_words=20,
        temperature=1.5,
    )
)

Ерте ертеде бір елде бай саудагер өмір сүріпті оның байлығы күміс үйінде мал арыстан өмір бай үйіне басқа есімді кейін бай
