In [1]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, Embedding, Dense
import numpy as np

# Load and preprocess text
def load_data(file_path):
    """Read the UTF-8 text file at ``file_path`` and return its contents as one string."""
    with open(file_path, mode='r', encoding='utf-8') as source:
        return source.read()


# Location of the training corpus (a relative path).
file_path = "Book1.txt"  # Update this path as needed
# Lower-case the whole corpus once so upper- and lower-case letters end up
# as the same character in the vocabulary.
text = load_data(file_path).lower()

# Quick sanity check of what was loaded: total size and the opening characters.
print(f"Length of text: {len(text)} characters")
print(f"First 100 characters: {text[:100]}")

# Tokenization
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences

# Create a character-level tokenizer: with char_level=True every character
# (not every word) becomes one token.
tokenizer = Tokenizer(char_level=True, oov_token='<OOV>')  # Out-of-Vocabulary token
# Note: a character that was not seen while fitting the tokenizer is replaced
# with <OOV> later on instead of being ignored.

# Fit the tokenizer on the corpus. This builds `word_index`, which - despite its
# name - maps each *character* to a unique integer in this notebook.
tokenizer.fit_on_texts([text])
# The ids in `word_index` start at 1 (see the mapping printed below), so the
# vocabulary size is len(word_index) + 1 to cover id 0 as well
# (presumably the id Keras keeps free for padding - confirm in the Keras docs).
total_words = len(tokenizer.word_index) + 1

print(f"Total unique characters: {total_words}")
print(f"Character to index mapping (first 10): {dict(list(tokenizer.word_index.items())[:10])}")

# Encode the whole corpus as one long list of integer character ids
tokens = tokenizer.texts_to_sequences([text])[0]
seq_length = 50  # Number of context characters the model sees per sample

# Slide a window of (seq_length + 1) ids over the corpus, one position at a time:
#   - the first seq_length ids of a window are the model input
#   - the last id of a window is the label the model tries to predict
input_sequences = [
    tokens[start:start + seq_length + 1]
    for start in range(len(tokens) - seq_length)
]

print(f"Total sequences created: {len(input_sequences)}")
print(f"Example sequence: {input_sequences[0]}")

# Stack the windows and split inputs/targets
# Every window built by the loop above already holds exactly seq_length + 1 ids,
# so padding would be a no-op. The windows are stacked directly into one 2-D
# int32 array (the dtype pad_sequences would have produced), which avoids a
# Python-level pass over every sequence. The reshape keeps the array 2-D even
# when the corpus is too short to yield a single window.
input_sequences = np.asarray(input_sequences, dtype='int32').reshape(-1, seq_length + 1)
# After this X holds the seq_length input ids of each sample and y the id that follows them
X, y = input_sequences[:, :-1], input_sequences[:, -1]

print(f"Input shape: {X.shape}")
print(f"Target shape: {y.shape}")

# One-hot encode the labels so they match the softmax output / categorical cross-entropy loss
# Note: there are other ways of encoding, e.g. pre-trained embeddings
y = tf.keras.utils.to_categorical(y, num_classes=total_words)

print(f"One-hot encoded target shape: {y.shape}")

# Build the Simple RNN model
# The input shape is declared with an explicit Input object instead of the
# Embedding layer's `input_length` argument, which recent Keras releases
# deprecate. Declaring the input up front also means the model is built
# before summary() is called, so the summary can report output shapes and
# parameter counts.
model = Sequential([
    tf.keras.Input(shape=(seq_length,)),  # seq_length character ids per sample
    Embedding(input_dim=total_words, output_dim=64),  # Character embeddings
    SimpleRNN(256, return_sequences=False),  # RNN layer
    Dense(256, activation='relu'),  # Fully Connected Layer
    Dense(total_words, activation='softmax')  # Output Layer
])

print("Model Architecture:")
model.summary()

# Key points about the architecture:
# - Embedding layer: Converts character indices to dense vectors
# - SimpleRNN with 256 units: The number of hidden units (size of the hidden state vector)
# - return_sequences=False: The RNN will only return the final hidden state after processing the entire sequence
# - Dense layers: For final prediction

# Configure training: Adam optimizer, categorical cross-entropy against the
# one-hot targets, and accuracy reported next to the loss
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Fit the network; `history` keeps the per-epoch metrics returned by fit()
print("Starting training...")
history = model.fit(
    X,
    y,
    batch_size=128,
    epochs=30,
    validation_split=0.1,  # Hold out 10% of the data for validation
    verbose=1,
)

# Function to generate text using RNN
def generate_text(seed_text, next_words=50):
    """
    Extend ``seed_text`` one character at a time using the trained model.

    Decoding is greedy: at every step the character with the highest
    predicted probability is appended. The function relies on the
    notebook-level ``tokenizer``, ``model`` and ``seq_length``.

    Args:
        seed_text (str): Starting text to seed the generation
        next_words (int): Number of characters to generate (the tokenizer is
            character-level, so despite the name this counts characters)

    Returns:
        str: ``seed_text`` followed directly by the generated characters.
            Fewer than ``next_words`` characters are added if the model
            predicts an id that has no character in the tokenizer.
    """
    # Tokenize the seed once; afterwards the predicted ids are appended
    # directly instead of re-tokenizing the growing string on every step.
    context = list(tokenizer.texts_to_sequences([seed_text])[0])
    generated_chars = []

    for _ in range(next_words):
        # Bring the context to the seq_length ids the model was trained on
        model_input = pad_sequences([context], maxlen=seq_length, padding='pre')

        # Probability distribution over the vocabulary for the single sample
        predicted_probs = model.predict(model_input, verbose=0)[0]

        # Greedy choice: id of the most probable next character
        predicted_index = int(np.argmax(predicted_probs))

        predicted_char = tokenizer.index_word.get(predicted_index)
        if predicted_char is None:
            # The id has no character (e.g. id 0). The same context would give
            # the same prediction again, so further steps cannot make progress.
            break

        # Characters are concatenated without a separator: inserting a space
        # here would add a character the model never predicted and distort
        # the context used for the following predictions.
        generated_chars.append(predicted_char)
        context.append(predicted_index)

    return seed_text + "".join(generated_chars)

# Generate text using the trained model
# Continue the seed "harry looked at" by 100 steps of generate_text and show the result.
print("\nGenerating text...")
generated = generate_text("harry looked at", 100)
print("Generated text:")
print(generated)


Length of text: 474429 characters
First 100 characters: / 




the boy who lived 

mr. and mrs. dursley, of number four, privet drive, 
were proud to say th
Total unique characters: 61
Character to index mapping (first 10): {'<OOV>': 1, ' ': 2, 'e': 3, 't': 4, 'o': 5, 'a': 6, 'h': 7, 'r': 8, 'n': 9, 'i': 10}
Total sequences created: 474379
Example sequence: [57, 2, 14, 14, 14, 14, 14, 4, 7, 3, 2, 25, 5, 17, 2, 18, 7, 5, 2, 13, 10, 28, 3, 12, 2, 14, 14, 19, 8, 21, 2, 6, 9, 12, 2, 19, 8, 11, 21, 2, 12, 15, 8, 11, 13, 3, 17, 24, 2, 5, 22]
Input shape: (474379, 50)
Target shape: (474379,)
One-hot encoded target shape: (474379, 61)
Model Architecture:




Starting training...
Epoch 1/30
[1m3336/3336[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m175s[0m 52ms/step - accuracy: 0.4603 - loss: 1.8436 - val_accuracy: 0.5377 - val_loss: 1.5434
Epoch 2/30
[1m3336/3336[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m177s[0m 53ms/step - accuracy: 0.5565 - loss: 1.4597 - val_accuracy: 0.5694 - val_loss: 1.4313
Epoch 3/30
[1m3336/3336[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m154s[0m 46ms/step - accuracy: 0.5790 - loss: 1.3697 - val_accuracy: 0.5779 - val_loss: 1.3920
Epoch 4/30
[1m3336/3336[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m165s[0m 49ms/step - accuracy: 0.5911 - loss: 1.3218 - val_accuracy: 0.5850 - val_loss: 1.3613
Epoch 5/30
[1m3336/3336[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m165s[0m 49ms/step - accuracy: 0.5990 - loss: 1.2909 - val_accuracy: 0.5909 - val_loss: 1.3515
Epoch 6/30
[1m3336/3336[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m165s[0m 49ms/step - accuracy: 0.6048 - loss: 1.2689 - val_accuracy: 0.5

KeyboardInterrupt: 

In [2]:
# Generate text using the trained model
# NOTE: generate_text is defined in the first cell *after* the model.fit(...) call.
# If that cell is interrupted during training (as the KeyboardInterrupt in its
# output shows), the definition is never executed and the call below raises
# NameError. Run the generate_text definition before running this cell.
print("\nGenerating text...")
generated = generate_text("harry looked at", 100)
print("Generated text:")
print(generated)


Generating text...


NameError: name 'generate_text' is not defined