In [8]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding
from tensorflow.keras.utils import to_categorical

# Sample text data
text = "Hello world. This is a simple language model example."

# Create a mapping of unique characters to integers
chars = sorted(list(set(text)))
char_to_int = {char: idx for idx, char in enumerate(chars)}
int_to_char = {idx: char for idx, char in enumerate(chars)}
print("char_to_int")
print(char_to_int)
print("int_to_char")
print(int_to_char)

# Prepare the dataset of input to output pairs encoded as integers
seq_length = 12
dataX = []
dataY = []

for i in range(len(text) - seq_length):
    seq_in = text[i:i + seq_length]
    seq_out = text[i + seq_length]
    dataX.append([char_to_int[char] for char in seq_in])
    dataY.append(char_to_int[seq_out])
    print("dataX")
    print(dataX)
    print("dataY")
    print(dataY)

n_patterns = len(dataX)
n_vocab = len(chars)

# Reshape X to be [samples, time steps]
X = np.array(dataX)
# One-hot encode the output variable
y = to_categorical(dataY, num_classes=n_vocab)

# Define the model
model = Sequential()
model.add(Embedding(input_dim=n_vocab, output_dim=50, input_length=seq_length))
model.add(LSTM(100))
model.add(Dense(n_vocab, activation='softmax'))

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam')

# Fit the model
model.fit(X, y, epochs=100, batch_size=16, verbose=2)

# Function to generate text
def generate_text(seed_text, gen_length=100):
    result = seed_text
    pattern = [char_to_int[char] for char in seed_text]
    for _ in range(gen_length):
        x_input = np.array([pattern])
        prediction = model.predict(x_input, verbose=0)
        index = np.argmax(prediction)
        char_out = int_to_char[index]
        result += char_out
        pattern.append(index)
        pattern = pattern[1:]
    return result

# Generate text using the trained model
seed = text[:seq_length]
generated_text = generate_text(seed_text=seed, gen_length=50)
print("seed")
print(seed)
print("Generated Text:\n", generated_text)


char_to_int
{' ': 0, '.': 1, 'H': 2, 'T': 3, 'a': 4, 'd': 5, 'e': 6, 'g': 7, 'h': 8, 'i': 9, 'l': 10, 'm': 11, 'n': 12, 'o': 13, 'p': 14, 'r': 15, 's': 16, 'u': 17, 'w': 18, 'x': 19}
int_to_char
{0: ' ', 1: '.', 2: 'H', 3: 'T', 4: 'a', 5: 'd', 6: 'e', 7: 'g', 8: 'h', 9: 'i', 10: 'l', 11: 'm', 12: 'n', 13: 'o', 14: 'p', 15: 'r', 16: 's', 17: 'u', 18: 'w', 19: 'x'}
dataX
[[2, 6, 10, 10, 13, 0, 18, 13, 15, 10, 5, 1]]
dataY
[0]
dataX
[[2, 6, 10, 10, 13, 0, 18, 13, 15, 10, 5, 1], [6, 10, 10, 13, 0, 18, 13, 15, 10, 5, 1, 0]]
dataY
[0, 3]
dataX
[[2, 6, 10, 10, 13, 0, 18, 13, 15, 10, 5, 1], [6, 10, 10, 13, 0, 18, 13, 15, 10, 5, 1, 0], [10, 10, 13, 0, 18, 13, 15, 10, 5, 1, 0, 3]]
dataY
[0, 3, 8]
dataX
[[2, 6, 10, 10, 13, 0, 18, 13, 15, 10, 5, 1], [6, 10, 10, 13, 0, 18, 13, 15, 10, 5, 1, 0], [10, 10, 13, 0, 18, 13, 15, 10, 5, 1, 0, 3], [10, 13, 0, 18, 13, 15, 10, 5, 1, 0, 3, 8]]
dataY
[0, 3, 8, 9]
dataX
[[2, 6, 10, 10, 13, 0, 18, 13, 15, 10, 5, 1], [6, 10, 10, 13, 0, 18, 13, 15, 10, 5, 1, 0], [1