### Text Gen with LSTM 

Notebook does Text Gen with a simple LSTM model with the following layers: Embedding, LSTM, Dense

Data comes from  https://s3.amazonaws.com/text-datasets/nietzsche.txt

Use as prompt: "new faculty and the jubilation reached its climax when kant" to generate text



In [1]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import re

2025-03-28 09:18:10.576935: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1743167890.588786   10948 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1743167890.592310   10948 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1743167890.602278   10948 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1743167890.602292   10948 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1743167890.602294   10948 computation_placer.cc:177] computation placer alr

In [2]:
def preprocess_text(text):
    """Cleans and tokenizes the text."""
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    tokens = text.split()
    return tokens



The function 

        create_Sequences(tokens, seq_lengt)
        
takes a list of tokens and an integer seq_length as input to generate a list of sequences from the given tokens.

+ The primary goal is to transform a flat list of tokens into a set of **overlapping sequences** which are commonly used in NLP and/or time-series prediction.

+ Each sequence consists of seq_length input tokens followed by one output token.

+ tokens: is a list of items (e.g., words, characters, numbers). These are the raw data from which the sequences will be constructed.

+ seq_length: An integer that determines the length of the input portion of each sequence.

+ Example: 

+ If tokens = [1, 2, 3, 4, 5, 6] and seq_length is 3, the output is :

    `[[1, 2, 3, 4], [2, 3, 4, 5], [3, 4, 5, 6]]`

Note that each inner list contains 4 elements. The first three are the input, and the last is the output.


In [None]:
##     Create input-output sequences

def create_sequences(tokens, seq_length):
    """Creates input-output sequences."""
    sequences = []
    for i in range(seq_length, len(tokens)):
        seq = tokens[i - seq_length:i + 1]
        sequences.append(seq)
    return sequences


In [None]:
 ##     Create token-to-index and index-to-token mappings

def create_token_index(tokens):
    """Creates token-to-index and index-to-token mappings."""
    unique_tokens = sorted(list(set(tokens)))
    token_index = {token: index for index, token in enumerate(unique_tokens)}
    index_token = {index: token for index, token in enumerate(unique_tokens)}
    return token_index, index_token

In [None]:
##   Create input and output datasets.

def create_dataset(sequences, token_index, seq_length):
    """Creates input and output datasets."""
    x = []
    y = []
    for seq in sequences:
        input_seq = seq[:-1]
        output_token = seq[-1]
        x.append([token_index[token] for token in input_seq])
        y.append(token_index[output_token])
    x = np.array(x)
    y = np.array(y)
    return x, y

In [None]:
## The code in this cell did not work. Ignore

def build_lstm_model_bad(vocab_size, embedding_dim, rnn_units, batch_size):
    """Builds the LSTM model."""
    model = keras.Sequential([
        keras.layers.Embedding(vocab_size, embedding_dim, batch_input_shape=[batch_size, None]), # Correct usage
        keras.layers.LSTM(rnn_units, return_sequences=True, stateful=True, recurrent_initializer='glorot_uniform'),
        keras.layers.Dense(vocab_size)
    ])
    return model

In [None]:

##    Build the LSTM model

def build_lstm_model(vocab_size, embedding_dim, rnn_units, batch_size):
    """Builds the LSTM model."""
    model = keras.Sequential([
        keras.layers.Embedding(vocab_size, embedding_dim), # Removed batch_input_shape
        # keras.layers.LSTM(rnn_units, return_sequences=True, stateful=True, recurrent_initializer='glorot_uniform'),

        # keras.layers.LSTM(rnn_units, return_sequences=True, recurrent_initializer='glorot_uniform'),
        keras.layers.LSTM(rnn_units, recurrent_initializer='glorot_uniform'),

        keras.layers.Dense(vocab_size)
    ])
    return model

In [None]:

##    Generates text using the trained LSTM model. Note temperature=1.0

def generate_text_lstm(model, start_string, token_index, index_token, num_generate, seq_length, temperature=1.0):
    """Generates text using the trained LSTM model."""
    input_eval = [token_index[s] for s in start_string.split()]
    input_eval = tf.expand_dims(input_eval, 0) #Keep this.
    text_generated = start_string.split()

    for i in range(num_generate):
        predictions = model(input_eval)
        predictions = tf.squeeze(predictions, 0)
        predictions = predictions / temperature
        predicted_id = tf.random.categorical(tf.expand_dims(predictions, 0), num_samples=1)[-1, 0].numpy() #Added expand_dims here.
        text_generated.append(index_token[predicted_id])
        input_eval = tf.expand_dims([predicted_id], 0)

    return ' '.join(text_generated)

In [None]:
## Dataset is the collected works by F. Nietzsche

path = keras.utils.get_file(
'nietzsche.txt',
origin='https://s3.amazonaws.com/text-datasets/nietzsche.txt')
text = open(path).read().lower()
print('Corpus length:', len(text))

Corpus length: 600893


In [None]:

## Simpler text used for testing. 

# Example usage: Use text from cell above 
# #text = """
#The quick brown fox jumps over the lazy dog.
#The dog was very lazy.
#The fox was very quick.
#"""


In [15]:
tokens = preprocess_text(text)
seq_length = 5
sequences = create_sequences(tokens, seq_length)
token_index, index_token = create_token_index(tokens)
vocab_size = len(token_index)
x, y = create_dataset(sequences, token_index, seq_length)

In [16]:
embedding_dim = 256
rnn_units = 1024
batch_size = 64


In [17]:
lstm_model = build_lstm_model(vocab_size, embedding_dim, rnn_units, batch_size)
lstm_model.compile(optimizer='adam', loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True))

I0000 00:00:1743167893.310382   10948 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 9311 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 3060, pci bus id: 0000:08:00.0, compute capability: 8.6


In [18]:
# Train the LSTM model
lstm_model.fit(x, y, epochs=50, batch_size=batch_size)

Epoch 1/50


I0000 00:00:1743167895.132069   11060 cuda_dnn.cc:529] Loaded cuDNN version 90501


[1m1549/1549[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 14ms/step - loss: 6.9784
Epoch 2/50
[1m1549/1549[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 14ms/step - loss: 6.0068
Epoch 3/50
[1m1549/1549[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 14ms/step - loss: 5.4299
Epoch 4/50
[1m1549/1549[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 14ms/step - loss: 4.6785
Epoch 5/50
[1m1549/1549[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 14ms/step - loss: 3.6587
Epoch 6/50
[1m1549/1549[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 14ms/step - loss: 2.5476
Epoch 7/50
[1m1549/1549[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 14ms/step - loss: 1.6134
Epoch 8/50
[1m1549/1549[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 14ms/step - loss: 0.8876
Epoch 9/50
[1m1549/1549[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 14ms/step - loss: 0.4152
Epoch 10/50
[1m1549/1549[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[

<keras.src.callbacks.history.History at 0x798f198577d0>

In [19]:
# Generate Text
# start_string = "the quick brown fox"
start_string = "new faculty and the jubilation reached its climax when kant" 
num_generate = 20
generated_text = generate_text_lstm(lstm_model, start_string, token_index, index_token, num_generate, seq_length)
print(generated_text)

new faculty and the jubilation reached its climax when kant feel all love of the most strength of the most ever absolutely after all his own love of all nature


In [20]:
# Generate Text
# start_string = "the quick brown fox"
start_string = "new faculty and the jubilation reached its climax when kant" 
num_generate = 20
generated_text = generate_text_lstm(lstm_model, start_string, token_index, index_token, num_generate, seq_length, 0.5)
print(generated_text)

new faculty and the jubilation reached its climax when kant further god they are thus than to a right here too much god to a right here too much god


In [21]:
# Generate Text
# start_string = "the quick brown fox"
start_string = "new faculty and the jubilation reached its climax when kant" 
num_generate = 20
generated_text = generate_text_lstm(lstm_model, start_string, token_index, index_token, num_generate, seq_length, 5.0)
print(generated_text)

new faculty and the jubilation reached its climax when kant scourge metaphysics assailant property firstlings reduce gay accepted origin dread tongues rates prey relax shake inapplicable begloom permanent man selfconfidence
