In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical

In [2]:
# 1. Data Preparation
text = "hello world how are you today"

In [3]:
tokenizer = Tokenizer(char_level=True)

In [4]:
tokenizer.fit_on_texts([text])

In [5]:
vocab_size = len(tokenizer.word_index) + 1

In [6]:
tokenizer.word_index

{'o': 1,
 ' ': 2,
 'l': 3,
 'h': 4,
 'e': 5,
 'w': 6,
 'r': 7,
 'd': 8,
 'a': 9,
 'y': 10,
 'u': 11,
 't': 12}

In [7]:
encoded_text = tokenizer.texts_to_sequences([text])[0]

In [8]:
encoded_text

[4,
 5,
 3,
 3,
 1,
 2,
 6,
 1,
 7,
 3,
 8,
 2,
 4,
 1,
 6,
 2,
 9,
 7,
 5,
 2,
 10,
 1,
 11,
 2,
 12,
 1,
 8,
 9,
 10]

In [9]:
# Create input-target pairs
# Example: "hello" -> "ello "
input_sequences = []
target_sequences = []
for i in range(1, len(encoded_text)):
    input_sequences.append(encoded_text[:i])
    target_sequences.append(encoded_text[i])

In [10]:
input_sequences

[[4],
 [4, 5],
 [4, 5, 3],
 [4, 5, 3, 3],
 [4, 5, 3, 3, 1],
 [4, 5, 3, 3, 1, 2],
 [4, 5, 3, 3, 1, 2, 6],
 [4, 5, 3, 3, 1, 2, 6, 1],
 [4, 5, 3, 3, 1, 2, 6, 1, 7],
 [4, 5, 3, 3, 1, 2, 6, 1, 7, 3],
 [4, 5, 3, 3, 1, 2, 6, 1, 7, 3, 8],
 [4, 5, 3, 3, 1, 2, 6, 1, 7, 3, 8, 2],
 [4, 5, 3, 3, 1, 2, 6, 1, 7, 3, 8, 2, 4],
 [4, 5, 3, 3, 1, 2, 6, 1, 7, 3, 8, 2, 4, 1],
 [4, 5, 3, 3, 1, 2, 6, 1, 7, 3, 8, 2, 4, 1, 6],
 [4, 5, 3, 3, 1, 2, 6, 1, 7, 3, 8, 2, 4, 1, 6, 2],
 [4, 5, 3, 3, 1, 2, 6, 1, 7, 3, 8, 2, 4, 1, 6, 2, 9],
 [4, 5, 3, 3, 1, 2, 6, 1, 7, 3, 8, 2, 4, 1, 6, 2, 9, 7],
 [4, 5, 3, 3, 1, 2, 6, 1, 7, 3, 8, 2, 4, 1, 6, 2, 9, 7, 5],
 [4, 5, 3, 3, 1, 2, 6, 1, 7, 3, 8, 2, 4, 1, 6, 2, 9, 7, 5, 2],
 [4, 5, 3, 3, 1, 2, 6, 1, 7, 3, 8, 2, 4, 1, 6, 2, 9, 7, 5, 2, 10],
 [4, 5, 3, 3, 1, 2, 6, 1, 7, 3, 8, 2, 4, 1, 6, 2, 9, 7, 5, 2, 10, 1],
 [4, 5, 3, 3, 1, 2, 6, 1, 7, 3, 8, 2, 4, 1, 6, 2, 9, 7, 5, 2, 10, 1, 11],
 [4, 5, 3, 3, 1, 2, 6, 1, 7, 3, 8, 2, 4, 1, 6, 2, 9, 7, 5, 2, 10, 1, 11, 2],
 [4,
  5,
  3,
  3,
  

In [11]:
target_sequences

[5,
 3,
 3,
 1,
 2,
 6,
 1,
 7,
 3,
 8,
 2,
 4,
 1,
 6,
 2,
 9,
 7,
 5,
 2,
 10,
 1,
 11,
 2,
 12,
 1,
 8,
 9,
 10]

In [12]:
# Pad sequences to a fixed length for batching
max_sequence_len = max(len(seq) for seq in input_sequences)
padded_input_sequences = tf.keras.preprocessing.sequence.pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')
target_sequences_one_hot = to_categorical(target_sequences, num_classes=vocab_size)

In [13]:
# 2. Model Definition
model = Sequential([
    Embedding(vocab_size, 10, input_length=max_sequence_len),
    SimpleRNN(50, return_sequences=False), # return_sequences=False for predicting single next token
    Dense(vocab_size, activation='softmax')
])

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])



In [14]:
# 3. Training
model.fit(padded_input_sequences, target_sequences_one_hot, epochs=100, verbose=0)

<keras.src.callbacks.history.History at 0x799f591110d0>

In [15]:
max_seq_len=max_sequence_len
seed_text = 'th'
generated_text = seed_text
encoded_seed = tokenizer.texts_to_sequences([seed_text])[0]
padded_seed = tf.keras.preprocessing.sequence.pad_sequences([encoded_seed], maxlen=max_seq_len, padding='pre')
predicted_probs = model.predict(padded_seed, verbose=0)[0]

In [16]:
encoded_seed

[12, 4]

In [17]:
padded_seed

array([[ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 12,  4]], dtype=int32)

In [18]:
predicted_probs

array([0.00778203, 0.2669546 , 0.13166967, 0.33389673, 0.02348744,
       0.0435743 , 0.08044758, 0.05700757, 0.03199705, 0.00979185,
       0.00492762, 0.00398928, 0.00447428], dtype=float32)

In [19]:
predicted_token_index = np.argmax(predicted_probs)
predicted_token_index

np.int64(3)

In [20]:
predicted_char = tokenizer.index_word[predicted_token_index]
predicted_char

'l'

In [21]:
generated_text += predicted_char
generated_text

'thl'

In [22]:
seed_text += predicted_char
seed_text

'thl'

In [23]:
# 4. Generation
def generate_text(model, tokenizer, seed_text, num_generate=10, max_seq_len=max_sequence_len):
    generated_text = seed_text
    for _ in range(num_generate):
        encoded_seed = tokenizer.texts_to_sequences([seed_text])[0]
        padded_seed = tf.keras.preprocessing.sequence.pad_sequences([encoded_seed], maxlen=max_seq_len, padding='pre')
        predicted_probs = model.predict(padded_seed, verbose=0)[0]
        predicted_token_index = np.argmax(predicted_probs)
        predicted_char = tokenizer.index_word[predicted_token_index]
        generated_text += predicted_char
        seed_text += predicted_char # Update seed for next prediction
    return generated_text

# Example generation
generated_output = generate_text(model, tokenizer, "h", num_generate=15)
print(f"Generated text: {generated_output}")


Generated text: hllloooolld how 


In [24]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Assuming 'text' is already defined, e.g., text = "hello world how are you today"

tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts([text])
vocab_size = len(tokenizer.word_index) + 1
encoded_text = tokenizer.texts_to_sequences([text])[0]

print("Tokenizer word_index:", tokenizer.word_index)
print("Vocab Size:", vocab_size)
print("Encoded Text:", encoded_text)

Tokenizer word_index: {'o': 1, ' ': 2, 'l': 3, 'h': 4, 'e': 5, 'w': 6, 'r': 7, 'd': 8, 'a': 9, 'y': 10, 'u': 11, 't': 12}
Vocab Size: 13
Encoded Text: [4, 5, 3, 3, 1, 2, 6, 1, 7, 3, 8, 2, 4, 1, 6, 2, 9, 7, 5, 2, 10, 1, 11, 2, 12, 1, 8, 9, 10]
