Step 1


In [16]:
import os

import requests

# URL for Alice's Adventures in Wonderland from Project Gutenberg
url = "https://www.gutenberg.org/files/11/11-0.txt"
text_file_name = "alice_in_wonderland.txt"

try:
    # Without a timeout, requests can wait indefinitely on a stalled connection.
    response = requests.get(url, timeout=30)
    response.raise_for_status()  # Raise an exception for HTTP errors
    with open(text_file_name, "w", encoding="utf-8") as f:
        f.write(response.text)
    print(f"'{text_file_name}' downloaded successfully!")
except requests.exceptions.RequestException as e:
    print(f"Error downloading the file: {e}")
    print("Please check your internet connection or the URL.")

# Preview the first few characters of the text on disk.
# Guarded: if the download failed and no earlier copy exists, report it
# instead of letting open() raise FileNotFoundError right after the error message.
if os.path.exists(text_file_name):
    with open(text_file_name, "r", encoding="utf-8") as f:
        sample_text = f.read(500)  # Read the first 500 characters
    print("\n--- Sample of the downloaded text ---")
    print(sample_text)
    print("------------------------------------")
else:
    print(f"'{text_file_name}' is not available, so there is no sample to show.")

'alice_in_wonderland.txt' downloaded successfully!

--- Sample of the downloaded text ---
*** START OF THE PROJECT GUTENBERG EBOOK 11 ***

[Illustration]




Alice’s Adventures in Wonderland

by Lewis Carroll

THE MILLENNIUM FULCRUM EDITION 3.0

Contents

 CHAPTER I.     Down the Rabbit-Hole
 CHAPTER II.    The Pool of Tears
 CHAPTER III.   A Caucus-Race and a Long Tale
 CHAPTER IV.    The Rabbit Sends in a Little Bill
 CHAPTER V.     Advice from a Caterpillar
 CHAPTER VI.    Pig and Pepper
 CHAPTER VII.   A Mad Tea-Party
 CHAPTER VIII.  The Queen’s Croquet-Ground
 CHAPTER IX.    The
------------------------------------


Step 2


In [17]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer # New import for tokenization
from tensorflow.keras.utils import to_categorical # New import for one-hot encoding
import re

# 1. Read the raw book text from disk
text_file_name = "alice_in_wonderland.txt"
with open(text_file_name, "r", encoding="utf-8") as f:
    text = f.read()

# --- 2. Text cleaning for word-level modelling ---
# Lowercase everything and drop every character that is not a-z or whitespace
# (digits and punctuation disappear; nothing is inserted in their place).
text = re.sub(r'[^a-z\s]', '', text.lower())
# Collapse every whitespace run into one space and trim both ends
text = re.sub(r'\s+', ' ', text).strip()

print(f"Total length of cleaned text: {len(text)} characters")

# --- 3. Word tokenization ---
# The tokenizer builds its vocabulary from the cleaned text; '<unk>' is the
# placeholder token for words outside that vocabulary.
tokenizer = Tokenizer(oov_token='<unk>')
tokenizer.fit_on_texts([text])

# The whole book as one flat list of word indices
word_sequences = tokenizer.texts_to_sequences([text])[0]

# Lookup tables in both directions
word_to_int = tokenizer.word_index
int_to_word = {index: word for word, index in word_to_int.items()}

# Keras tokenizer indices start at 1, so one extra slot is needed for index 0
vocab_size = len(word_to_int) + 1
print(f"Total unique words (vocabulary size): {vocab_size}")

print("\n--- Sample Word Vocabulary (first 20) ---")
print(list(word_to_int)[:20])
print("-------------------------\n")

# --- 4. Context window size, measured in words ---
seq_length = 50

# --- 5. Sliding-window training pairs ---
# Each input is seq_length consecutive word indices; its target is the index
# of the word that immediately follows that window.
window_starts = range(len(word_sequences) - seq_length)
dataX = [word_sequences[start:start + seq_length] for start in window_starts]
dataY = [word_sequences[start + seq_length] for start in window_starts]

n_patterns = len(dataX)
print(f"Total patterns (sequences) for training: {n_patterns}")

# --- 6. Convert to arrays ---
# Inputs stay as integer indices with shape (samples, timesteps)
X = np.array(dataX)

# Targets are one-hot encoded over the whole vocabulary
y = to_categorical(dataY, num_classes=vocab_size)

print(f"Shape of X (input sequences): {X.shape}")
print(f"Shape of y (target words, one-hot encoded): {y.shape}")
print(f"Vocabulary Size for One-Hot Encoding: {vocab_size}")

Total length of cleaned text: 134617 characters
Total unique words (vocabulary size): 2764

--- Sample Word Vocabulary (first 20) ---
['<unk>', 'the', 'and', 'to', 'a', 'she', 'it', 'of', 'said', 'i', 'alice', 'in', 'you', 'was', 'that', 'as', 'her', 'at', 'on', 'with']
-------------------------

Total patterns (sequences) for training: 26426
Shape of X (input sequences): (26426, 50)
Shape of y (target words, one-hot encoded): (26426, 2764)
Vocabulary Size for One-Hot Encoding: 2764


In [18]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Input, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import ModelCheckpoint

# Dimensionality of the learned word vectors (try 100, 256, 512)
embedding_dim = 256

# Define the model for word-level generation
model = Sequential()

# Explicit input: one row of seq_length word indices per sample.
# Declaring the shape here (instead of the deprecated `input_length` argument
# of Embedding) builds the model immediately, so the summary below can show
# real output shapes and parameter counts.
model.add(Input(shape=(seq_length,)))

# Embedding layer:
# input_dim: size of the vocabulary (already includes the +1 for index 0)
# output_dim: dimensionality of the word embeddings
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim))

# Two stacked LSTM layers; the first returns the full sequence so the second
# can consume it. Dropout after each one to limit overfitting.
model.add(LSTM(256, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(256))
model.add(Dropout(0.2))

# Output layer: a probability for every word in the vocabulary
model.add(Dense(vocab_size, activation='softmax'))

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam')

# summary() prints the table itself and returns None, so it must not be
# wrapped in print() (that only adds a stray "None" line to the output).
model.summary()

None


In [19]:
from tensorflow.keras.callbacks import ModelCheckpoint

# Checkpointing: the model is written to disk every time the training loss
# reaches a new minimum; epoch number and loss are embedded in the filename.
filepath="word-level-weights-improvement-{epoch:02d}-{loss:.4f}.keras"
checkpoint = ModelCheckpoint(
    filepath,
    monitor='loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)
callbacks_list = [checkpoint]

# Fit on the word-level arrays X / y built during preprocessing.
# epochs and batch_size are the main knobs to experiment with.
print("Starting WORD-LEVEL model training. This will take significantly longer...")
model.fit(
    X,
    y,
    batch_size=128,
    epochs=100,
    callbacks=callbacks_list,
)
print("Word-Level Model training complete!")

Starting WORD-LEVEL model training. This will take significantly longer...
Epoch 1/100
[1m207/207[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step - loss: 6.5923
Epoch 1: loss improved from inf to 6.29122, saving model to word-level-weights-improvement-01-6.2912.keras
[1m207/207[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 23ms/step - loss: 6.5909
Epoch 2/100
[1m205/207[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 21ms/step - loss: 5.9390
Epoch 2: loss improved from 6.29122 to 5.91979, saving model to word-level-weights-improvement-02-5.9198.keras
[1m207/207[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 21ms/step - loss: 5.9387
Epoch 3/100
[1m205/207[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 21ms/step - loss: 5.6874
Epoch 3: loss improved from 5.91979 to 5.71024, saving model to word-level-weights-improvement-03-5.7102.keras
[1m207/207[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 22ms/step - loss: 5.6878
Epoch 4/100
[

In [23]:
import sys
import os
import glob
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer

# --- Ensure the following are already defined from previous steps ---
# text, tokenizer, model, int_to_word, vocab_size, seq_length

# Restore the most recently created checkpoint, if there is one.
# Any failure is reported and the notebook carries on with the current weights.
try:
    list_of_files = glob.glob('word-level-weights-improvement-*.keras')
    latest_file = max(list_of_files, key=os.path.getctime, default=None)

    if latest_file is None:
        print("No word-level model weights found. Please ensure training completed and files were saved.")
        print("Attempting to generate text with potentially untrained model (will likely be random).")
    else:
        model.load_weights(latest_file)
        print(f"Loaded word-level model weights from: {latest_file}")
except Exception as e:
    print(f"Error loading model weights: {e}")

# Re-compile with the same loss and optimizer once the weights are in place
model.compile(optimizer='adam', loss='categorical_crossentropy')

# Word-level text generator with temperature sampling
def generate_word_text(model, text_data, tokenizer_obj, int_to_word_map, vocab_size, seq_length, num_words_to_generate, diversity=1.0):
    """Generate text word by word, seeded with a random passage of ``text_data``.

    Args:
        model: Object whose ``predict(x, verbose=0)`` returns, for an input of
            shape (1, seq_length), one row of next-word probabilities.
        text_data: Text the seed passage is drawn from.
        tokenizer_obj: Object whose ``texts_to_sequences`` maps a list of
            texts to lists of word indices.
        int_to_word_map: Mapping from word index to word; indices that are
            missing are rendered as ``"<unk>"``.
        vocab_size: Number of classes, used only for the uniform fallback.
        seq_length: Number of word indices fed to the model per prediction.
        num_words_to_generate: How many words to append after the seed.
        diversity: Sampling temperature (> 0). Values below 1 sharpen the
            distribution, values above 1 flatten it.

    Returns:
        The seed words followed by the generated words, joined by spaces.

    Raises:
        ValueError: If ``diversity`` is not positive, or ``text_data`` has too
            few tokens to draw a seed of ``seq_length`` words.
    """
    # A zero or negative temperature turns the probabilities into inf/NaN and
    # would silently end in garbage sampling, so reject it up front.
    if diversity <= 0:
        raise ValueError(f"diversity must be greater than 0, got {diversity}")

    original_word_sequences = tokenizer_obj.texts_to_sequences([text_data])[0]

    max_start = len(original_word_sequences) - seq_length - 1
    if max_start < 1:
        raise ValueError(
            f"text_data has {len(original_word_sequences)} tokens; "
            f"at least {seq_length + 2} are needed to draw a seed of {seq_length} words"
        )

    start_index = np.random.randint(0, max_start)
    pattern_indices = list(original_word_sequences[start_index:start_index + seq_length])
    seed_words = [int_to_word_map.get(idx, "<unk>") for idx in pattern_indices]

    print(f"\n--- Seed: \"{' '.join(seed_words)}\" ---")

    generated_words = seed_words.copy()

    for _ in range(num_words_to_generate):
        x = np.array([pattern_indices])  # Shape: (1, seq_length)

        # float64 so the normalised vector passes multinomial's sum check;
        # lower-precision probabilities can sum to slightly more than 1.
        prediction = np.asarray(model.predict(x, verbose=0)[0], dtype='float64')

        # Temperature scaling in log space. Subtracting the maximum before
        # exponentiating keeps the largest term at exp(0) = 1, so the
        # normalising sum can neither overflow nor collapse to zero.
        scaled = np.log(prediction + 1e-10) / diversity
        weights = np.exp(scaled - np.max(scaled))
        prediction = weights / np.sum(weights)

        # Sample next word index
        try:
            probas = np.random.multinomial(1, prediction, 1)
        except ValueError:
            # As backup: use uniform distribution if probabilities are invalid
            print("Warning: Probabilities invalid. Falling back to uniform sampling.")
            prediction = np.ones(vocab_size) / vocab_size
            probas = np.random.multinomial(1, prediction, 1)

        next_word_index = int(np.argmax(probas))
        generated_words.append(int_to_word_map.get(next_word_index, '<unk>'))

        # Slide the window: drop the oldest index, append the new one
        pattern_indices = pattern_indices[1:] + [next_word_index]

    return " ".join(generated_words)

# Demo settings: how much text to produce and which temperatures to compare
num_words_to_generate = 100
diversity_values = [0.2, 0.5, 0.8]

# One generated sample per temperature, each seeded from a random passage
for diversity_val in diversity_values:
    print(f"\n--- Generated Text (Diversity: {diversity_val}) ---")
    output = generate_word_text(
        model=model,
        text_data=text,
        tokenizer_obj=tokenizer,
        int_to_word_map=int_to_word,
        vocab_size=vocab_size,
        seq_length=seq_length,
        num_words_to_generate=num_words_to_generate,
        diversity=diversity_val,
    )
    print(output)
    print("--------------------------------------------------")


Loaded word-level model weights from: word-level-weights-improvement-100-1.1974.keras

--- Generated Text (Diversity: 0.2) ---

--- Seed: "very easy to take more than nothing nobody asked your opinion said alice whos making personal remarks now the hatter asked triumphantly alice did not quite know what to say to this so she helped herself to some tea and breadandbutter and then turned to the dormouse and repeated her" ---
very easy to take more than nothing nobody asked your opinion said alice whos making personal remarks now the hatter asked triumphantly alice did not quite know what to say to this so she helped herself to some tea and breadandbutter and then turned to the dormouse and repeated her question why did they live at the bottom of a well take a table with the first day said the gryphon i mean what makes the matter worse you must have been changed for mabel ill try the dormouse say how her hurried back to the game the queen merely remarking that a moments delay would cost t

In [24]:
import pickle
import json

# Output locations for the deployment artefacts
model_save_path = "word_level_text_generator_model.keras"
tokenizer_save_path = "tokenizer.pkl"
int_to_word_map_path = "int_to_word.json"

# The trained model itself is already on disk via the ModelCheckpoint files.
# For a single clean copy, call model.save(model_save_path) here.

# Tokenizer: pickled as-is
with open(tokenizer_save_path, 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)
print(f"Tokenizer saved to {tokenizer_save_path}")

# Index-to-word table: written as JSON.
# JSON object keys are always strings, so the integer indices are stored as
# strings and have to be converted back to int by whatever loads this file.
with open(int_to_word_map_path, 'w', encoding='utf-8') as map_file:
    json.dump(int_to_word, map_file, ensure_ascii=False, indent=4)
print(f"int_to_word map saved to {int_to_word_map_path}")

print("All necessary assets saved for deployment.")

Tokenizer saved to tokenizer.pkl
int_to_word map saved to int_to_word.json
All necessary assets saved for deployment.


In [25]:
%%writefile app.py
import glob
import json
import os
import pickle
import re

import gradio as gr
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer # Import here for loading tokenizer

# --- Constants and File Paths ---
# All paths are relative to the directory the app is started from.
MODEL_PATH = 'word-level-weights-improvement-100-1.1974.keras' # IMPORTANT: Update with your best model filename
TOKENIZER_PATH = 'tokenizer.pkl'  # Pickled tokenizer object
INT_TO_WORD_MAP_PATH = 'int_to_word.json'  # JSON map from word index to word
SEQ_LENGTH = 50 # Must match the seq_length used during training

# --- Global Variables for Model and Mappings ---
# Placeholders only; load_assets() assigns the real objects.
model = None
tokenizer = None
int_to_word = None
vocab_size = 0 # Will be derived from tokenizer after loading

# --- Load Model and Assets Function ---
def load_assets():
    """Load the model, tokenizer and index-to-word map into the module globals.

    The full model is loaded from MODEL_PATH first. If that fails, the training
    architecture is rebuilt and the weights of the most recently created
    checkpoint matching 'word-level-weights-improvement-*.keras' are loaded
    into it (MODEL_PATH itself when no checkpoint matches).

    Side effects:
        Assigns the globals ``model``, ``tokenizer``, ``int_to_word`` and
        ``vocab_size``.

    Raises:
        FileNotFoundError: If the fallback cannot find a weights file, or the
            tokenizer / map file is missing.
    """
    global model, tokenizer, int_to_word, vocab_size

    # Load the model
    try:
        model = tf.keras.models.load_model(MODEL_PATH)
        print(f"Model loaded from {MODEL_PATH}")
    except Exception as e:
        print(f"Error loading model: {e}")
        # Try loading weights if the full model save failed or if it's just weights file
        print(f"Attempting to load weights directly into a new model structure.")
        from tensorflow.keras.models import Sequential
        from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

        # The tokenizer is needed first: the layer sizes depend on vocab_size.
        # NOTE(security): pickle.load can execute arbitrary code; only load
        # tokenizer files produced by your own training run.
        with open(TOKENIZER_PATH, 'rb') as f:
            tokenizer = pickle.load(f)
        vocab_size = len(tokenizer.word_index) + 1
        embedding_dim = 256 # Must match the embedding_dim used during training

        temp_model = Sequential()
        temp_model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=SEQ_LENGTH))
        temp_model.add(LSTM(256, return_sequences=True))
        temp_model.add(Dropout(0.2))
        temp_model.add(LSTM(256))
        temp_model.add(Dropout(0.2))
        temp_model.add(Dense(vocab_size, activation='softmax'))
        temp_model.compile(loss='categorical_crossentropy', optimizer='adam') # Compile before loading weights

        # Pick the most recently created checkpoint; fall back to MODEL_PATH
        list_of_files = glob.glob('word-level-weights-improvement-*.keras')
        latest_weights_file = max(list_of_files, key=os.path.getctime) if list_of_files else MODEL_PATH
        if os.path.exists(latest_weights_file):
            temp_model.load_weights(latest_weights_file)
            print(f"Loaded weights from {latest_weights_file}")
            model = temp_model
        else:
            raise FileNotFoundError(f"Neither {MODEL_PATH} nor {latest_weights_file} found.")

    # Load the tokenizer (see the security note above about pickle)
    with open(TOKENIZER_PATH, 'rb') as f:
        tokenizer = pickle.load(f)
    print(f"Tokenizer loaded from {TOKENIZER_PATH}")

    # Load the int_to_word map.
    # JSON stores object keys as strings, so the indices come back as "1",
    # "2", ...; convert them to int, otherwise every lookup with a predicted
    # integer index misses and falls through to '<unk>'.
    with open(INT_TO_WORD_MAP_PATH, 'r', encoding='utf-8') as f:
        raw_map = json.load(f)
    int_to_word = {int(index): word for index, word in raw_map.items()}
    print(f"int_to_word map loaded from {INT_TO_WORD_MAP_PATH}")

    vocab_size = len(tokenizer.word_index) + 1 # +1 because tokenizer indices start at 1
    print(f"Vocabulary size: {vocab_size}")

# --- Text Generation Function (adapted for Gradio) ---
def generate_text_for_gradio(seed_text, num_words_to_generate, diversity):
    """Continue ``seed_text`` word by word using the loaded model.

    Args:
        seed_text: User-supplied text; must contain at least SEQ_LENGTH words
            after cleaning. Only the last SEQ_LENGTH words are used.
        num_words_to_generate: Number of words to append (coerced to int).
        diversity: Sampling temperature, must be greater than 0.

    Returns:
        The seed words (as the model saw them) followed by the generated
        words, joined by spaces, or a human-readable message when the model is
        not loaded or the inputs are unusable.
    """
    if model is None:
        return "Model not loaded. Please wait or check logs."

    # UI sliders may deliver floats; range() below needs an int.
    num_words_to_generate = int(num_words_to_generate)
    diversity = float(diversity)
    if diversity <= 0:
        return "Diversity must be greater than 0."

    # Clean the seed exactly like the training text: lowercase, keep only a-z
    # and whitespace, collapse whitespace. Otherwise words such as "who's"
    # would not map to the token ("whos") the model was trained on.
    seed_text = (seed_text or "").lower()
    seed_text = re.sub(r'[^a-z\s]', '', seed_text)
    seed_text = re.sub(r'\s+', ' ', seed_text).strip()
    seed_sequence = tokenizer.texts_to_sequences([seed_text])[0]

    # The model needs a full window of SEQ_LENGTH words
    if len(seed_sequence) < SEQ_LENGTH:
        return f"Seed text must be at least {SEQ_LENGTH} words long to generate meaningful text. Please try a longer seed."

    # Keep only the last SEQ_LENGTH words of a longer seed
    pattern_indices = list(seed_sequence[-SEQ_LENGTH:])

    def lookup_word(index):
        # The map is loaded from JSON, where keys are strings; accept both
        # integer and string keys so the lookup works either way.
        word = int_to_word.get(index)
        if word is None:
            word = int_to_word.get(str(index), '<unk>')
        return word

    generated_words = [lookup_word(idx) for idx in pattern_indices]

    for _ in range(num_words_to_generate):
        x = np.array([pattern_indices])

        # float64: lower-precision probabilities can sum to slightly more
        # than 1 and make np.random.multinomial raise ValueError.
        prediction = np.asarray(model.predict(x, verbose=0)[0], dtype='float64')

        # Temperature scaling in log space; subtracting the maximum keeps the
        # exponentials in range.
        scaled = np.log(prediction + 1e-10) / diversity
        weights = np.exp(scaled - np.max(scaled))
        total = np.sum(weights)

        if not np.isfinite(total) or total <= 0:
            next_word_index = int(np.random.randint(0, vocab_size)) # Fallback to random
        else:
            probas = np.random.multinomial(1, weights / total, 1)
            next_word_index = int(np.argmax(probas))

        generated_words.append(lookup_word(next_word_index))

        # Slide the window: drop the oldest index, append the new one
        pattern_indices = pattern_indices[1:] + [next_word_index]

    return " ".join(generated_words)

# --- Gradio Interface ---
# Load assets before building the interface so the first request can be served
load_assets()

# Note: the initial value of a component is passed as `value=`; `default=` is
# not a parameter of gr.Slider in current Gradio releases.
iface = gr.Interface(
    fn=generate_text_for_gradio,
    inputs=[
        gr.Textbox(lines=2, placeholder=f"Enter a seed text (at least {SEQ_LENGTH} words) here...", label="Seed Text"),
        gr.Slider(minimum=10, maximum=500, step=10, value=100, label="Number of words to generate"),
        gr.Slider(minimum=0.1, maximum=1.5, step=0.1, value=0.5, label="Diversity (Creativity)", info="Lower values are more conservative, higher values are more creative/random.")
    ],
    outputs="text",
    title="Alice in Wonderland Word-Level Text Generator",
    description="Generate text in the style of 'Alice's Adventures in Wonderland' using an LSTM neural network. Provide a seed text, and the model will continue the story word by word."
)

if __name__ == "__main__":
    iface.launch(debug=True) # debug=True is useful for local testing

Writing app.py


In [26]:
%%writefile requirements.txt
tensorflow
numpy
gradio
scipy
keras # Although part of tensorflow, sometimes good to explicitly list

Writing requirements.txt
