# Character-Based Text Generation

In [115]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
import time
import os

In [116]:
text_path = "C:\\MyComputer\\uni 2 uj\\SI\\CharacterGeneration\\data\\idiot.txt"

with open(text_path, 'r', encoding='utf-8') as f:
    text = f.read()

text = text[:100000]
print(f"Przyklad: {text[:1500]}...")

Przyklad: ﻿The Project Gutenberg eBook of The Idiot
    
This ebook is for the use of anyone anywhere in the United States and
most other parts of the world at no cost and with almost no restrictions
whatsoever. You may copy it, give it away or re-use it under the terms
of the Project Gutenberg License included with this ebook or online
at www.gutenberg.org. If you are not located in the United States,
you will have to check the laws of the country where you are located
before using this eBook.

Title: The Idiot

Author: Fyodor Dostoyevsky

Translator: Eva Martin

Release date: May 1, 2001 [eBook #2638]
                Most recently updated: June 21, 2021

Language: English

Credits: Martin Adamson, David Widger, with corrections by Andrew Sly


*** START OF THE PROJECT GUTENBERG EBOOK THE IDIOT ***




The Idiot

by Fyodor Dostoyevsky

Translated by Eva Martin


Contents

 PART I
 PART II
 PART III
 PART IV




PART I


I.

Towards the end of November, during a thaw, at nine o’clock o

In [117]:
# slownik
chars = sorted(list(set(text)))  # unikalne znaki
vocab_size = len(chars)
print(f"Liczba unikalnych znakow: {vocab_size}")
print(f"Znaki: {chars}")

# znak -> liczba i odwrotnie
char_to_idx = {ch: i for i, ch in enumerate(chars)}
idx_to_char = {i: ch for i, ch in enumerate(chars)}

Liczba unikalnych znakow: 79
Znaki: ['\n', ' ', '!', '#', '(', ')', '*', ',', '-', '.', '0', '1', '2', '3', '6', '8', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'Y', 'Z', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '—', '‘', '’', '“', '”', '\ufeff']


In [118]:
# parametry dla uczenia
SEQ_LEN = 120  # dlugosc sekwencji
BATCH_SIZE = 256
EPOCHS = 4

# tekst na liczby
text_as_int = np.array([char_to_idx[c] for c in text])
print(f"Przyklad konwersji: '{text[:10]}' -> {text_as_int[:10]}")

Przyklad konwersji: '﻿The Proje' -> [78 38 54 51  1 34 64 61 56 51]


In [119]:
# Tworzenie sekwencji treningowych
def create_sequences(text_as_int, seq_len):
    sequences = []
    for i in range(len(text_as_int) - seq_len):
        sequences.append(text_as_int[i:i+seq_len+1])
    return np.array(sequences)

sequences = create_sequences(text_as_int, SEQ_LEN)

# Podzial na x (input) i y (target)
x = sequences[:, :-1]  # wszystko oprocz ostatniego
y = sequences[:, 1:]   # wszystko oprocz pierwszego

In [120]:
# model stateless
def build_stateless_model(vocab_size, embedding_dim=256, rnn_units=128, num_layers=1):
    model = keras.Sequential()
    
    # zamienia liczby na wektory
    model.add(keras.layers.Embedding(vocab_size, embedding_dim))
    
    # LSTM
    for i in range(num_layers):
        model.add(keras.layers.LSTM(rnn_units, return_sequences=True))

    model.add(keras.layers.Dense(vocab_size))
    
    return model

# Budowa modelu
model_stateless = build_stateless_model(vocab_size, num_layers=2)
model_stateless.summary()

In [121]:
# loss function: sparse categorical crossentropy
def loss_fn(y_true, y_pred):
    return keras.losses.sparse_categorical_crossentropy(y_true, y_pred, from_logits=True) # from_logits = softmax + cross entropy

# kompilacja
model_stateless.compile(
    optimizer='adam',
    loss=loss_fn,
    metrics=['accuracy']
)

In [122]:
# trening stateless
start = time.time()

history_stateless = model_stateless.fit(
    x, y,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_split=0.1
)

end = time.time()
print(f"\nCzas treningu stateless: {end-start:.2f} sekund")

Epoch 1/4
[1m352/352[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m331s[0m 929ms/step - accuracy: 0.3011 - loss: 2.5484 - val_accuracy: 0.3972 - val_loss: 2.0824
Epoch 2/4
[1m352/352[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m359s[0m 1s/step - accuracy: 0.4520 - loss: 1.9135 - val_accuracy: 0.4811 - val_loss: 1.7880
Epoch 3/4
[1m352/352[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m418s[0m 1s/step - accuracy: 0.5125 - loss: 1.6752 - val_accuracy: 0.5102 - val_loss: 1.6634
Epoch 4/4
[1m352/352[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m424s[0m 1s/step - accuracy: 0.5518 - loss: 1.5238 - val_accuracy: 0.5288 - val_loss: 1.6014

Czas treningu stateless: 1531.99 sekund


In [123]:
# stateful model
def build_stateful_model(vocab_size, embedding_dim=256, rnn_units=128, num_layers=1, batch_size=64):
    model = keras.Sequential()

    model.add(keras.layers.Input(batch_shape=(batch_size, None)))

    # embedding
    model.add(keras.layers.Embedding(vocab_size, embedding_dim))
    
    # LSTM
    for i in range(num_layers):
        model.add(keras.layers.LSTM(
            rnn_units,
            return_sequences=True,
            stateful=True
        ))

    model.add(keras.layers.Dense(vocab_size))
    
    return model

# budowa
model_stateful = build_stateful_model(vocab_size, num_layers=2, batch_size=BATCH_SIZE)
model_stateful.summary()

In [124]:
# Kompilacja
model_stateful.compile(
    optimizer='adam',
    loss=loss_fn,
    metrics=['accuracy']
)

In [125]:
dataset_size = (len(x) // BATCH_SIZE) * BATCH_SIZE
x_temp = x[:dataset_size]
y_temp = y[:dataset_size]


num_batches = len(x_temp) // BATCH_SIZE

x_stateful = np.zeros_like(x_temp)
y_stateful = np.zeros_like(y_temp)

for batch_idx in range(num_batches):
    for seq_in_batch in range(BATCH_SIZE):

        new_pos = batch_idx * BATCH_SIZE + seq_in_batch
        old_pos = seq_in_batch * num_batches + batch_idx
        
        x_stateful[new_pos] = x_temp[old_pos]
        y_stateful[new_pos] = y_temp[old_pos]

print(f"Stateful data shape: {x_stateful.shape}")
print(f"Number of batches per epoch: {num_batches}")
print(f"Total sequences: {len(x_stateful)}")


Stateful data shape: (99840, 120)
Number of batches per epoch: 390
Total sequences: 99840


In [126]:
def custom_reset_states(self):
    for layer in self.layers:
        if hasattr(layer, 'reset_states'):
            layer.reset_states()

setattr(keras.Sequential, 'reset_states', custom_reset_states)

In [127]:
# trening stateful
start = time.time()

# Dla stateful musimy recznie resetowac stan po kazdej epoce
for epoch in range(EPOCHS):
    print(f"\nEpoch {epoch+1}/{EPOCHS}")
    history = model_stateful.fit(
        x_stateful, y_stateful,
        batch_size=BATCH_SIZE,
        epochs=1,
        shuffle=False  # nie shufflujemy dla stateful
    )
    model_stateful.reset_states()  # reset stanu po epoce

end = time.time()
print(f"\nCzas treningu stateful: {end-start:.2f} sekund")


Epoch 1/4
[1m390/390[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m439s[0m 1s/step - accuracy: 0.3217 - loss: 2.4565

Epoch 2/4
[1m390/390[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m349s[0m 893ms/step - accuracy: 0.4996 - loss: 1.7490

Epoch 3/4
[1m390/390[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m433s[0m 1s/step - accuracy: 0.5840 - loss: 1.4385

Epoch 4/4
[1m390/390[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m435s[0m 1s/step - accuracy: 0.6401 - loss: 1.2459

Czas treningu stateful: 1656.48 sekund


In [128]:
# generowanie tekstu
# greedy
def generate_greedy(model, start_string, num_generate=100):

    # Reset stanu jesli stateful
    if hasattr(model, 'reset_states'):
        model.reset_states()

    # input do liczb
    current_input = [char_to_idx[ch] for ch in start_string]
    current_input = tf.expand_dims(current_input, 0)

    # generowanie symboli
    generated_chars = []

    for _ in range(num_generate):
        # prawdopodobienstwo dla kazdego symbolu
        predictions = model(current_input)

        # bierzemy najbardziej prawdopodobny
        last_prediction = predictions[0, -1, :]
        next_char_id = tf.argmax(last_prediction).numpy()

        next_char = idx_to_char[next_char_id]
        generated_chars.append(next_char)
        current_input = tf.expand_dims([next_char_id], 0)

    return start_string + ''.join(generated_chars)

In [129]:
# 2. temperature - losowosc
def generate_temperature(model, start_text, num_generate=100, temperature=1.0):

    if hasattr(model, 'reset_states'):
        model.reset_states()

    result = start_text

    current = [char_to_idx[c] for c in start_text]
    current = tf.expand_dims(current, 0)

    for _ in range(num_generate):
        predictions = model(current)
        last_char_logits = predictions[0, -1, :]

        # dzielimy przez temperature
        adjusted_logits = last_char_logits / temperature

        # bierzemy randomowy char
        next_char_id = tf.random.categorical(
            tf.expand_dims(adjusted_logits, 0),
            num_samples=1
        )[0, 0].numpy()

        next_char = idx_to_char[next_char_id]
        result += next_char

        current = tf.expand_dims([next_char_id], 0)

    return result

In [130]:
# 3. Beam Search - trzyma top-k najlepszych sekwencji
def generate_beam_search(model, start_string, num_generate=50, beam_width=3):

    start_indices = [char_to_idx[c] for c in start_string]

    # beam_candidates to [index list, score]
    # the lower score the better
    beam_candidates = [[start_indices, 0.0]]

    for _ in range(num_generate):
        all_new_paths = []

        # dla kazdego kandydata
        for seq, score in beam_candidates:

            input_tensor = tf.expand_dims(seq, 0)
            predictions = model(input_tensor)

            # patrzymy tylko na ostatnie prawdopodob
            last_token_logits = predictions[0, -1, :]
            probs = tf.nn.softmax(last_token_logits).numpy()

            # bierzemy 3 najlepsze
            top_indices = np.argsort(probs)[-beam_width:]

            for idx in top_indices:
                # nowy szlak
                new_seq = seq + [idx]

                # liczymy score
                new_score = score - np.log(probs[idx])

                all_new_paths.append([new_seq, new_score])

        # pruning, zostaja tylko 3 najlepszych
        ordered = sorted(all_new_paths, key=lambda x: x[1])
        beam_candidates = ordered[:beam_width]
    # najlepszy z najlepszych
    best_path_indices = beam_candidates[0][0]

    return ''.join([idx_to_char[i] for i in best_path_indices])

In [131]:
# generowanie

model_for_gen = build_stateful_model(
    vocab_size,
    embedding_dim=256,
    rnn_units=128,
    num_layers=2,
    batch_size=1  # dla generacji
)

# Przepisz wagi z wytrenowanego modelu
model_for_gen.set_weights(model_stateful.get_weights())

In [135]:
# testowanie roznych metod
start_text = ("What is the ")
print(f"Tekst poczatkowy: '{start_text}'\n")

# greedy
print("greedy:")
result_greedy = generate_greedy(model_for_gen, start_text, num_generate=100)
print(result_greedy)
print()

Tekst poczatkowy: 'What is the '

greedy:
What is the general with a good deal and seemed to astonished and strange of the secretary to the common some to



In [136]:
# temperature
temps = [0.5, 1.0, 1.5]
for temp in temps:
    print(f"temperature {temp}")
    result = generate_temperature(model_for_gen, start_text, num_generate=100, temperature=temp)
    print(result)
    print()

temperature 0.5
What is the general and was a fanional instance. I am sure of in this place of the latter had better had sortanc

temperature 1.0
What is the general in curtural
reare the driblad, if I have been Schneideredued to this instanking in a This co

temperature 1.5
What is the cannes he are mning Geras﻿.
‘Atke a itgo go simplacely selirutiant, have strokehefe
frieaby nomly it



In [137]:
# beam search
beam_widths = [2, 3, 5]
for width in beam_widths:
    print(f"beam search, width={width}")
    result = generate_beam_search(model_for_gen, start_text, num_generate=50, beam_width=width)
    print(result)
    print()

beam search, width=2
What is the prince in the commortances, and what she had long 

beam search, width=3
What is the general, with the general, with the general, with 

beam search, width=5
What is the general, and when I have been that they were nothi

