In [None]:
import numpy as np
import tensorflow as tf

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense

# --- 1. Training corpus ---
# Three short sentences are enough to demonstrate the pipeline quickly.
# Joining with empty first/last entries reproduces the leading and trailing
# newline that a triple-quoted literal would have.
corpus_lines = [
    "",
    "Hello world",
    "This is a simple recurrent neural network example",
    "It can predict the next word",
    "",
]
text_corpus = "\n".join(corpus_lines)

print("--- Text Corpus Loaded ---")


In [None]:
# --- 2. Tokenization ---
# Fit a word-level tokenizer on the corpus; word_index maps each word to an
# integer id starting at 1.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text_corpus])
word_index = tokenizer.word_index

# Vocabulary size includes one extra slot because id 0 is reserved for padding.
total_words = 1 + len(word_index)

# The corpus is passed as a single text, so exactly one id sequence comes back.
(token_list,) = tokenizer.texts_to_sequences([text_corpus])


In [None]:
# --- 3. Build training pairs: input prefix (X) -> next word (y) ---
# Every prefix of the token stream with at least two tokens becomes one sample:
# all tokens but the last are the input, the last token is the target.
input_sequences = [token_list[:end] for end in range(2, len(token_list) + 1)]

# Left-pad with zeros so every sample has the length of the longest prefix.
max_sequence_len = max(len(prefix) for prefix in input_sequences)
input_sequences = np.asarray(
    pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')
)

# Split off the final column as the label and one-hot encode it.
X, y_int = input_sequences[:, :-1], input_sequences[:, -1]
y = to_categorical(y_int, num_classes=total_words)

print(f"total_words: {total_words}, max_sequence_len: {max_sequence_len}")
print(f"input: {X.shape}, target: {y.shape}")

In [None]:
# --- 4. Build the SimpleRNN Model ---
embedding_dim = 100  # Size of each word vector
rnn_units = 100      # Size of the SimpleRNN hidden state

# Seed Python, NumPy and TensorFlow so that weight initialisation and the
# shuffling done during training are reproducible across kernel restarts.
tf.keras.utils.set_random_seed(42)

model = Sequential([
    # Explicit input spec: one sample is a sequence of (max_sequence_len - 1)
    # token ids. This replaces the deprecated `input_length` argument of
    # Embedding and keeps the model built so summary() can show shapes.
    tf.keras.Input(shape=(max_sequence_len - 1,)),

    # Embedding layer: maps token ids to dense vectors
    Embedding(total_words, embedding_dim),

    # SimpleRNN layer: the recurrent core of the model
    SimpleRNN(rnn_units),

    # Dense output layer: probability for each word in the vocabulary
    Dense(total_words, activation='softmax')
])

# Targets are one-hot encoded, hence categorical cross-entropy.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Display model structure
print("\n--- Model Summary ---")
model.summary()


In [None]:
# --- 5. Train the Model ---
# Kept as the cell's last expression so the returned History object is displayed.
model.fit(x=X, y=y, epochs=5, verbose=1)

In [None]:
# --- 6. Prediction Function and Example ---
def predict_next_word(seed_text, n_words):
    """Extend ``seed_text`` with up to ``n_words`` words predicted by the model.

    Each iteration tokenizes the current text, fits it to the model's input
    length with ``pad_sequences``, picks the most probable next-word index and
    appends the matching word, so every prediction feeds into the next one.

    Args:
        seed_text: Starting text.
        n_words: Maximum number of words to append; 0 returns the seed unchanged.

    Returns:
        The seed text followed by the predicted words, separated by spaces.
    """
    for _ in range(n_words):
        # Convert the current text to token ids and pad to the input length
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        padded = pad_sequences([token_list], maxlen=max_sequence_len - 1, padding='pre')

        # Most probable next-word index for the single sample in the batch
        predicted_probs = model.predict(padded, verbose=0)
        predicted_word_index = int(np.argmax(predicted_probs, axis=-1)[0])

        # index_word is the tokenizer's reverse of word_index
        output_word = tokenizer.index_word.get(predicted_word_index)
        if output_word is None:
            # No word maps to this index (e.g. the padding slot 0), so there
            # is nothing meaningful to append; stop generating.
            break

        # Append the predicted word so it becomes context for the next step
        seed_text += " " + output_word

    return seed_text


In [None]:
# --- 7. Prediction Example ---
# Ask the model to continue a phrase taken from the training corpus.
print("\n--- Prediction Example ---")
start_text = "this is a simple"
predicted_sentence = predict_next_word(seed_text=start_text, n_words=2)

print(
    f"Start Text: '{start_text}'",
    f"Predicted Sentence: '{predicted_sentence}'",
    sep="\n",
)
