In [1]:
import pathlib, sys
import numpy as np

from lstm import LSTMNextTokenGenerator


In [2]:
# Utilities from 03 RNN/main.py
def build_vocab(text: str):
    """Build the character <-> index lookup tables for ``text``.

    The distinct characters of ``text`` are sorted and numbered from 0.

    Returns:
        A ``(char2idx, idx2char)`` pair of dicts that are inverses of each
        other.
    """
    alphabet = sorted(set(text))
    idx2char = dict(enumerate(alphabet))
    char2idx = {symbol: position for position, symbol in idx2char.items()}
    return char2idx, idx2char

def encode_text(text: str, char2idx: dict[str, int]) -> np.ndarray:
    """Map every character of ``text`` to its integer code.

    Args:
        text: String to encode.
        char2idx: Character -> index table; a character missing from it
            raises ``KeyError``.

    Returns:
        1-D integer array with one entry per character of ``text``.
    """
    codes = list(map(char2idx.__getitem__, text))
    return np.array(codes, dtype=int)

def to_one_hot(indices: np.ndarray, vocab_size: int) -> np.ndarray:
    """One-hot encode a sequence of class indices.

    Args:
        indices: Sequence of integer class indices.
        vocab_size: Number of classes, i.e. the width of each encoded row.

    Returns:
        Float array of shape ``(len(indices), vocab_size)`` whose row ``t``
        holds a single 1.0 in column ``indices[t]``.
    """
    n_steps = len(indices)
    rows = np.arange(n_steps)
    encoded = np.zeros((n_steps, vocab_size), dtype=float)
    # Fancy indexing sets exactly one cell per row.
    encoded[rows, indices] = 1.0
    return encoded

def build_sequences(indices: np.ndarray, seq_len: int, vocab_size: int):
    """Slice an index sequence into overlapping next-token training windows.

    Window ``k`` uses ``indices[k:k + seq_len]`` as input (one-hot encoded)
    and the same span shifted right by one position as the target, so every
    input position is paired with the index that follows it.

    Args:
        indices: 1-D sequence of integer token indices.
        seq_len: Number of positions per window.
        vocab_size: Width of the one-hot encoding.

    Returns:
        ``(X, y)`` where ``X`` has shape ``(num_windows, seq_len, vocab_size)``
        and ``y`` has shape ``(num_windows, seq_len)`` and holds the target
        indices; ``num_windows == len(indices) - seq_len``.

    Raises:
        ValueError: If ``indices`` is too short to yield a single window
            (``len(indices) <= seq_len``).
    """
    n_windows = len(indices) - seq_len
    if n_windows <= 0:
        # Without this guard np.stack would fail on an empty list with the
        # unhelpful "need at least one array to stack".
        raise ValueError(
            "indices is too short to build any window: need len(indices) > seq_len "
            f"(got len(indices)={len(indices)}, seq_len={seq_len})"
        )

    X_list = []
    y_list = []
    for start in range(n_windows):
        x_idx = indices[start:start + seq_len]
        y_idx = indices[start + 1:start + seq_len + 1]
        X_list.append(to_one_hot(x_idx, vocab_size))
        y_list.append(y_idx)  # target indices
    return np.stack(X_list), np.stack(y_list)

def decode_indices(indices: np.ndarray, idx2char: dict[int, str]) -> str:
    """Turn a sequence of indices back into text.

    Each index is looked up in ``idx2char`` (``KeyError`` if absent) and the
    resulting characters are concatenated in order.
    """
    symbols = map(idx2char.__getitem__, indices)
    return "".join(symbols)


In [3]:
# Small character-level corpus: a lower-case, punctuation-free passage.
base_text = (
    "to be or not to be that is the question whether tis nobler in the mind "
    "to suffer the slings and arrows of outrageous fortune "
    "or to take arms against a sea of troubles and by opposing end them "
    "to die to sleep no more and by a sleep to say we end the heartache "
    "and the thousand natural shocks that flesh is heir to "
)

# Enlarge the corpus by repeating the base text
repeats = 15
text = base_text * repeats

# Number of characters in every input/target window.
seq_len = 20

char2idx, idx2char = build_vocab(text)
vocab_size = len(char2idx)
indices = encode_text(text, char2idx)

# Sequential 80/20 split: the first 80% of the encoded text is used for
# training, the remaining tail for testing.
# NOTE(review): `text` is the same passage repeated, so the held-out tail
# consists of text the model also sees during training. Test metrics computed
# on it therefore reflect memorisation rather than generalisation — confirm
# this is intended.
split = int(len(indices) * 0.8)
train_idx = indices[:split]
test_idx = indices[split:]

X_train, y_train = build_sequences(train_idx, seq_len, vocab_size)
X_test, y_test = build_sequences(test_idx, seq_len, vocab_size)

# Displayed as the cell output: (num_windows, seq_len, vocab_size) and
# (num_windows, seq_len).
X_train.shape, y_train.shape


((3736, 20, 23), (3736, 20))

In [7]:
# Build the project's LSTM generator. The inputs are one-hot encoded
# characters, so both the input width and the number of output classes are
# `vocab_size`.
model = LSTMNextTokenGenerator(
    input_dim=vocab_size,
    hidden_dim=256,
    vocab_size=vocab_size,
    lr=1e-3,
    max_epochs=20,
    random_state=42,
    verbose=True,
)
# Train on the one-hot windows and their shifted-by-one target indices.
# The value returned by `fit` is the last expression, so it becomes the
# cell's displayed output.
model.fit(X_train, y_train)


epoch=0 | loss=2.4408
epoch=1 | loss=1.5505
epoch=2 | loss=0.6147
epoch=3 | loss=0.3751
epoch=4 | loss=0.3049
epoch=5 | loss=0.2712
epoch=6 | loss=0.2510
epoch=7 | loss=0.2379
epoch=8 | loss=0.2284
epoch=9 | loss=0.2203
epoch=10 | loss=0.2147
epoch=11 | loss=0.2099
epoch=12 | loss=0.2061
epoch=13 | loss=0.2042
epoch=14 | loss=0.2011
epoch=15 | loss=0.1981
epoch=16 | loss=0.1965
epoch=17 | loss=0.1955
epoch=18 | loss=0.1935
epoch=19 | loss=0.1918


<lstm.LSTMNextTokenGenerator at 0x1187f5d30>

In [8]:
# Evaluate next-character prediction accuracy
# `predict_proba` output is reduced over axis 2 (presumably the vocabulary
# axis — TODO confirm against lstm.py), giving one predicted index per
# position of every test window.
probs_test = model.predict_proba(X_test)
preds = np.argmax(probs_test, axis=2)
# Accuracy is counted over every position of every window (y_test.size),
# not only over the final character of each window.
correct = (preds == y_test).sum()
total = y_test.size
print(f"Test accuracy (next-char): {correct / total:.4f}")

Test accuracy (next-char): 0.9292


In [9]:
def sample(model, seed_text: str, steps: int = 200, rng=None):
    """Generate text by repeatedly sampling the next character from ``model``.

    Starting from ``seed_text``, the last ``seq_len`` characters are one-hot
    encoded and passed to ``model.predict_next_proba``; one character index is
    then drawn from the returned distribution and the matching character is
    appended. This is repeated ``steps`` times.

    Relies on the notebook-level names ``seq_len``, ``char2idx``,
    ``idx2char``, ``vocab_size`` and ``to_one_hot``.

    Args:
        model: Object exposing ``predict_next_proba(x_vec)``; its result is
            used as the probability vector ``p`` over ``vocab_size`` outcomes.
        seed_text: Initial text. Every character must be a key of
            ``char2idx`` (``KeyError`` otherwise).
        steps: Number of characters to generate.
        rng: Optional ``numpy.random.Generator`` used for sampling. Pass
            ``np.random.default_rng(seed)`` to make the output reproducible.
            When ``None`` (default) one unseeded generator is created for
            the call.

    Returns:
        ``seed_text`` followed by ``steps`` sampled characters.
    """
    if rng is None:
        # Create the generator once per call instead of once per step.
        rng = np.random.default_rng()

    prefix = seed_text
    for _ in range(steps):
        # Only the most recent `seq_len` characters are fed to the model.
        window = prefix[-seq_len:]
        x_idx = np.array([char2idx[ch] for ch in window])
        x_vec = to_one_hot(x_idx, vocab_size)
        probs = model.predict_next_proba(x_vec)
        next_idx = rng.choice(vocab_size, p=probs)
        prefix += idx2char[next_idx]
    return prefix

# Continue a seed phrase taken from the start of the corpus. No random
# generator seed is supplied, so the printed text differs between runs.
generated = sample(model, seed_text="to be or not to be ")
print(generated)


to be or not to be that is the question whetherrt sleee nd no that is the suission whether tis nobler in the mind to suffer the slings and arrows of outrageous fortune or to take arms against a sea of troubles and by op
