In [1]:
import tensorflow as tf
from collections import Counter # May not be needed for inference, but keeping for consistency
import keras # Often imported with TF, but good to be explicit
import Levenshtein # Used for Levenshtein Distance (evaluation)
import pickle
import json
# zipfile was not used in the provided code, can be removed
from tqdm.auto import tqdm # Useful for progress bars if processing large amounts of data later
import os
# gensim Word2Vec related imports might not be strictly needed if just loading the model and embeddings
# but keeping them for completeness if you might use Word2Vec directly later.
from gensim.models import Word2Vec
import numpy as np

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# --- Imports for Evaluation Metrics (if you plan to use them) ---
# from rouge import Rouge # For ROUGE scores - install with pip if needed
import nltk
# import word_tokenize # Not directly used in decode/sequences_to_text, but useful generally
# import corpus_bleu # For BLEU score
# from scipy.spatial.distance import cosine # For Cosine Similarity


print("Basic imports complete.")

# --- Inference constants ---
# These values must mirror the ones used by the training notebook; the loaded
# tokenizer and model were built with them.
# MAX_RECORDS_TO_LOAD is only relevant for initial data loading, not inference setup
MAX_ARTICLE_LEN = 500       # maxlen passed to pad_sequences and length of the encoder inference input
MAX_HEADLINE_LEN = 50       # Default number of decoding steps in generate_headline
MAX_NUM_WORDS = 100000      # Vocabulary cap used at training time; keep in sync with the training notebook
OOV_TOKEN = "<OOV>"         # Word substituted for IDs missing from reverse_word_index when decoding
START_TOKEN = "<start>"     # Looked up in tokenizer.word_index to seed the decoder; stripped from decoded text
END_TOKEN = "<end>"         # Looked up in tokenizer.word_index; decoding stops when it is predicted
RNN_SIZE = 256              # Size of the decoder state inputs (h and c) of the inference model

print("Constants defined.")
# Note: EMBEDDING_DIM is not set here; it is read from the loaded model's Embedding layer.


# --- Make sure the NLTK "punkt" tokenizer data is available ---
# nltk.data.find raises LookupError when the resource is missing; in that case
# a download is attempted. nltk.download reports failure through its boolean
# return value (it does not raise by default), so the result is checked
# instead of assuming the download succeeded.
try:
    nltk.data.find('tokenizers/punkt')
    print("NLTK Punkt tokenizer found.")
    _punkt_ready = True
except LookupError:
    print("NLTK Punkt tokenizer not found. Downloading...")
    _punkt_ready = bool(nltk.download('punkt'))
    if _punkt_ready:
        print("NLTK Punkt tokenizer downloaded.")
    else:
        print("Warning: NLTK Punkt tokenizer could not be downloaded.")
except Exception as e:
    # Any other failure while probing for the resource is reported, not raised,
    # so the rest of the notebook can still run.
    print(f"An unexpected error occurred during NLTK check/download: {e}")
    _punkt_ready = False


if _punkt_ready:
    print("NLTK setup complete.")
else:
    print("NLTK setup finished, but the Punkt tokenizer is unavailable.")

2025-05-23 11:29:01.988547: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-05-23 11:29:02.420520: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-05-23 11:29:02.499591: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2025-05-23 11:29:02.499610: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore 

Basic imports complete.
Constants defined.
NLTK Punkt tokenizer found.
NLTK setup complete.


In [2]:
tokenizer_path = 'data/tokenizer.pkl'  # Tokenizer pickled by the training notebook

if not os.path.exists(tokenizer_path):
    print(f"Error: Tokenizer file not found at '{tokenizer_path}'.")
    print("Please ensure you saved the tokenizer in your original training notebook.")
    # Nothing below is defined in this case; later cells check for the missing names.
else:
    print(f"Loading tokenizer from '{tokenizer_path}'...")
    # NOTE(security): pickle.load can execute arbitrary code. Only load a
    # tokenizer file that you produced yourself.
    with open(tokenizer_path, 'rb') as handle:
        tokenizer = pickle.load(handle)
    print("Tokenizer loaded successfully.")

    # Vocabulary size: the tokenizer's num_words cap when one was set, otherwise
    # every known word plus one slot for index 0, which the Keras Tokenizer
    # reserves and never assigns to a word (the OOV token already has its own
    # entry in word_index).
    VOCAB_SIZE = tokenizer.num_words
    if VOCAB_SIZE is None:
        VOCAB_SIZE = len(tokenizer.word_index) + 1

    # IDs of the sequence markers. Either is None when the token is absent from
    # the tokenizer vocabulary; generate_headline refuses to run in that case.
    START_TOKEN_ID = tokenizer.word_index.get(START_TOKEN)
    END_TOKEN_ID = tokenizer.word_index.get(END_TOKEN)
    if START_TOKEN_ID is None or END_TOKEN_ID is None:
        print(f"Warning: START_TOKEN ('{START_TOKEN}') or END_TOKEN ('{END_TOKEN}') not found in tokenizer vocabulary.")

    print(f"Tokenizer vocabulary size: {VOCAB_SIZE}")
    print(f"Start Token ID: {START_TOKEN_ID}")
    print(f"End Token ID: {END_TOKEN_ID}")

    # ID -> word mapping used to turn predicted IDs back into text.
    reverse_word_index = {v: k for k, v in tokenizer.word_index.items()}
    print("Reverse word index created.")

Loading tokenizer from 'data/tokenizer.pkl'...
Tokenizer loaded successfully.
Tokenizer vocabulary size: 100000
Start Token ID: 36
End Token ID: 37
Reverse word index created.


In [3]:
# --- Cell 3: load the trained Seq2Seq model from a checkpoint ---

# Checkpoint file written by the training notebook. Edit this path to load a
# different epoch.
model_save_path = 'training_checkpoints/epoch_50_val_loss_1.4797.h5'

if os.path.exists(model_save_path):
    print(f"Loading trained model from '{model_save_path}'...")
    # Architecture and weights are restored together; compile=False skips the
    # optimizer/loss setup, which inference does not need.
    loaded_model = tf.keras.models.load_model(model_save_path, compile=False)
    print("Model loaded successfully.")

    # Read the embedding dimension from the first Embedding layer of the model.
    # If no such layer exists, or inspecting the model fails, fall back to 100.
    try:
        embedding_layer = None
        for layer in loaded_model.layers:
            if not isinstance(layer, tf.keras.layers.Embedding):
                continue
            embedding_layer = layer
            break

        if embedding_layer:
            EMBEDDING_DIM = embedding_layer.output_dim
            print(f"Embedding Dimension from loaded model: {EMBEDDING_DIM}")
        else:
            print(f"Error: Could not find an Embedding layer in the loaded model.\n")
            EMBEDDING_DIM = 100
            print(f"Using fallback EMBEDDING_DIM: {EMBEDDING_DIM}")
    except Exception as e:
        print(f"Error getting embedding dimension from loaded model: {e}")
        EMBEDDING_DIM = 100
        print(f"Using fallback EMBEDDING_DIM: {EMBEDDING_DIM}")

    # Optional: uncomment to inspect the architecture.
    # loaded_model.summary()
else:
    print(f"Error: Trained model file not found at '{model_save_path}'")
    print("Please ensure the path is correct and the training notebook saved this file.")

# --- End of Cell 3 ---

Loading trained model from 'training_checkpoints/epoch_50_val_loss_1.4797.h5'...


2025-05-23 11:29:06.772440: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2025-05-23 11:29:06.772836: W tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:265] failed call to cuInit: UNKNOWN ERROR (303)
2025-05-23 11:29:06.772851: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (daniel-diaz-Latitude-3520): /proc/driver/nvidia/version does not exist
2025-05-23 11:29:06.773775: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Model loaded successfully.
Embedding Dimension from loaded model: 100


In [4]:
print("\nSetting up inference models from loaded model layers...")

# Build the two inference-time models by reusing the trained layers of
# loaded_model. Without a loaded model there is nothing to build, so the whole
# construction is skipped instead of failing with an uncaught NameError.
if 'loaded_model' not in globals():
    print("Error: Trained model not loaded. Please run the previous cell.")
else:
    try:
        # Trained layers, looked up by the names given in the training notebook.
        # get_layer raises ValueError when a name does not exist.
        encoder_embedding_layer = loaded_model.get_layer('shared_embedding0')
        encoder_lstm1_layer = loaded_model.get_layer('encoder_lstm_1')
        encoder_lstm2_layer = loaded_model.get_layer('encoder_lstm_2')
        encoder_lstm3_layer = loaded_model.get_layer('encoder_lstm_3')

        decoder_embedding_layer = loaded_model.get_layer('shared_embedding1')
        decoder_lstm_layer = loaded_model.get_layer('decoder_lstm_1')
        decoder_dense_layer = loaded_model.get_layer('output_layer')

        # --- Encoder inference model ---
        # Maps a padded article sequence to the final (h, c) states of the
        # third encoder LSTM.
        encoder_inputs_inf = tf.keras.Input(shape=(MAX_ARTICLE_LEN,), name='encoder_input_inf')
        encoder_embedding_out = encoder_embedding_layer(encoder_inputs_inf)

        # Only the sequence output of LSTM 1 and 2 feeds the next layer; their
        # states are discarded. The last LSTM's states are the encoder output.
        encoder_lstm1_output, _, _ = encoder_lstm1_layer(encoder_embedding_out)
        encoder_lstm2_output, _, _ = encoder_lstm2_layer(encoder_lstm1_output)
        _, state_h3_enc_inf, state_c3_enc_inf = encoder_lstm3_layer(encoder_lstm2_output)

        encoder_states_inf = [state_h3_enc_inf, state_c3_enc_inf]
        encoder_model = tf.keras.Model(encoder_inputs_inf, encoder_states_inf, name='encoder_inference_model')

        print("Encoder Inference Model set up.")
        # encoder_model.summary()  # Optional summary

        # --- Decoder inference model ---
        # One decoding step: takes the previous token and the previous (h, c)
        # states, returns the next-token scores and the updated states.
        decoder_single_word_input_inf = tf.keras.Input(shape=(1,), name='decoder_single_word_input_inf')
        decoder_state_input_h_inf = tf.keras.Input(shape=(RNN_SIZE,), name='decoder_state_input_h_inf')
        decoder_state_input_c_inf = tf.keras.Input(shape=(RNN_SIZE,), name='decoder_state_input_c_inf')
        decoder_states_inputs_inf = [decoder_state_input_h_inf, decoder_state_input_c_inf]

        decoder_single_word_embedding_inf = decoder_embedding_layer(decoder_single_word_input_inf)

        # The trained decoder LSTM is applied to a one-timestep input, starting
        # from the states supplied by the caller.
        decoder_outputs_inf, state_h_dec_inf, state_c_dec_inf = decoder_lstm_layer(
            decoder_single_word_embedding_inf, initial_state=decoder_states_inputs_inf
        )

        # Dense output for the single timestep, then drop the time axis so the
        # model returns one score vector per batch item.
        decoder_pred_output_inf = decoder_dense_layer(decoder_outputs_inf)
        decoder_pred_output_inf = decoder_pred_output_inf[:, 0, :]

        decoder_model = tf.keras.Model(
            [decoder_single_word_input_inf] + decoder_states_inputs_inf,
            [decoder_pred_output_inf, state_h_dec_inf, state_c_dec_inf],
            name='decoder_inference_model'
        )

        print("Decoder Inference Model set up.")
        print("Inference models built by reusing trained layers.")
        # decoder_model.summary()  # Optional summary

    except ValueError as e:
        print(f"Error setting up inference models. Could not get layer by name: {e}")
        print("Please check the layer names in your original training model summary and ensure they match the names used in this cell.")


Setting up inference models from loaded model layers...
Encoder Inference Model set up.
Decoder Inference Model set up.
Inference models built by reusing trained layers.


In [5]:
# --- Cell 5 (Decoding Helper Function) - FINAL CORRECTED VERSION with Debug Prints ---

def sequences_to_text(sequence, stop_on_end_token=True, verbose=True):
    """
    Convert a sequence of word IDs back into a space-separated sentence.

    The ID-to-word mapping and the special tokens are read from the notebook's
    global scope (reverse_word_index, OOV_TOKEN, START_TOKEN, END_TOKEN).

    Args:
        sequence: Iterable of integer word IDs. ID 0 is treated as padding and
            skipped; IDs missing from reverse_word_index become OOV_TOKEN.
        stop_on_end_token: If True, decoding stops at the first END_TOKEN and
            the token itself is not emitted. If False, END_TOKEN is emitted
            like any other word.
        verbose: If True (default), print the debug trace of the decoding.

    Returns:
        The decoded text, or a string starting with "Decoding error:" when one
        of the required globals is not defined.
    """
    env = globals()

    # Each required global, with the notebook cell that defines it.
    required_globals = (
        ('reverse_word_index', 2),
        ('OOV_TOKEN', 1),
        ('START_TOKEN', 1),
        ('END_TOKEN', 1),
    )
    for name, cell in required_globals:
        if name not in env:
            print(f"Error [sequences_to_text]: '{name}' is not defined in global scope. Please run testingMonograph.ipynb Cell {cell}.")
            return f"Decoding error: Missing {name}."

    id_to_word = env['reverse_word_index']
    oov_token = env['OOV_TOKEN']
    start_token = env['START_TOKEN']
    end_token = env['END_TOKEN']

    if verbose:
        print(f"--- sequences_to_text Debug ---")
        print(f"Input sequence IDs: {sequence}")
        print(f"Stop on END token: {stop_on_end_token}")
        print(f"START_TOKEN: '{start_token}', END_TOKEN: '{end_token}', OOV_TOKEN: '{oov_token}'")

    words = []
    for word_id in sequence:
        if word_id == 0:  # Padding ID (assuming 0 is padding)
            continue

        word = id_to_word.get(word_id, oov_token)

        # Stop at the END token when asked to; it is never part of the text then.
        if stop_on_end_token and word == end_token:
            if verbose:
                print(f"  Encountered END token ('{end_token}'). Stopping decoding.")
            break

        # The START token is a decoder marker, not part of the headline.
        if word != start_token:
            words.append(word)

    generated_text = " ".join(words).strip()
    if verbose:
        print(f"Final generated text: '{generated_text}'")
        print(f"--- sequences_to_text Debug End ---")
    return generated_text


# Training notes. This free-form text had been pasted into the middle of the
# function body above; it is kept here as comments so the cell is valid Python.
# - A small vocabulary can lead to OOV-only output (the original note read "VOO").
# - When the vocabulary is too big it may lead to OOM.
# - There is a limit to how much a model can be perfected (convergence point).
# - The time per epoch will grow constantly in a semi-exponential way (the more
#   epochs, the longer each one starts taking).
# - The loss can be an inaccurate measurement if it is based on similarity.

print("sequences_to_text helper function defined (with debug prints).")

sequences_to_text helper function defined (with debug prints).


In [6]:
# --- Cell 6 (Headline Generation Function) - Definition with debug prints ---

def generate_headline(input_text, max_headline_length=MAX_HEADLINE_LEN, simulate_output=False):
    """
    Generate a headline for an article with the encoder/decoder inference models.

    The article is tokenized and padded, encoded into the initial decoder
    states, and then decoded greedily one token at a time (highest score at
    each step) until END_TOKEN_ID is predicted or max_headline_length steps
    have run. Progress is reported through debug prints.

    Args:
        input_text: A string containing the article content.
        max_headline_length: Maximum number of decoding steps.
        simulate_output: If True, the decoder is bypassed and a fixed list of
            token IDs is replayed instead. This exercises the decoding pipeline
            without a trained model. The encoder is still run when available.

    Returns:
        The generated headline text. Every failure is reported as a string
        starting with "Generation failed:" or "Simulation failed:" rather than
        an exception.
    """
    env = globals()
    start_token_id = env.get('START_TOKEN_ID')
    end_token_id = env.get('END_TOKEN_ID')

    def _zero_states():
        # Placeholder (h, c) states used only when simulating without an encoder.
        return [np.zeros((1, RNN_SIZE)), np.zeros((1, RNN_SIZE))]

    def _word_for(token_id):
        # Best-effort ID -> word lookup for the debug trace only.
        return env.get('reverse_word_index', {}).get(token_id, env.get('OOV_TOKEN', '<Unknown>'))

    print("\n--- Debugging generate_headline ---")
    print(f"'tokenizer' in globals(): {'tokenizer' in env}")
    print(f"'encoder_model' in globals(): {'encoder_model' in env}")
    print(f"'decoder_model' in globals(): {'decoder_model' in env}")
    print(f"'START_TOKEN_ID' in globals(): {start_token_id is not None}")
    print(f"'END_TOKEN_ID' in globals(): {end_token_id is not None}")
    print(f"Simulation mode active: {simulate_output}")
    print(f"MAX_HEADLINE_LEN: {max_headline_length}")

    # The tokenizer and both token IDs are always required; the inference
    # models are only required when real predictions are requested.
    models_missing = not simulate_output and ('encoder_model' not in env or 'decoder_model' not in env)
    if 'tokenizer' not in env or models_missing or start_token_id is None or end_token_id is None:
        print("Error: Necessary components (tokenizer, inference models - unless simulating, token IDs) are not loaded or defined correctly.")
        return "Generation failed: Model components or token IDs missing."

    # A record without content yields None; the tokenizer cannot process that.
    if not isinstance(input_text, str):
        print(f"Error: input_text must be a string, got {type(input_text).__name__}.")
        return "Generation failed: Input text is not a string."

    # Preprocess the article exactly like the model input: token IDs, padded
    # at the end up to MAX_ARTICLE_LEN.
    # NOTE(review): truncation uses the pad_sequences default; confirm it
    # matches the training notebook.
    input_seq = tokenizer.texts_to_sequences([input_text])
    input_seq = pad_sequences(input_seq, maxlen=MAX_ARTICLE_LEN, padding='post')

    # Encode the article into the initial decoder states (h, c).
    try:
        if 'encoder_model' in env and encoder_model:
            print("Running encoder_model.predict...")
            decoder_states = encoder_model.predict(input_seq, verbose=0)
            print("Encoder prediction complete.")
        elif simulate_output:
            print("Encoder model not available, using dummy states in simulation.")
            decoder_states = _zero_states()
        else:
            print("Error: Encoder model not available and not in simulation mode.")
            return "Generation failed: Encoder model missing."
    except Exception as e:
        if isinstance(e, NameError):
            print(f"Error during encoder_model.predict: {e}. encoder_model likely not available.")
        else:
            print(f"Unexpected error during encoder_model.predict: {e}")
        if not simulate_output:
            return "Generation failed: Encoder model predict error."
        print("Ignoring encoder predict error in simulation mode.")
        decoder_states = _zero_states()

    # The decoder is seeded with the START token.
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = start_token_id

    generated_sequence_ids = []

    # --- Simulation setup: a fixed sentence followed by the END token ---
    if simulate_output:
        if 'reverse_word_index' not in env:
            print("Error: reverse_word_index not available for simulation.")
            return "Simulation failed: reverse_word_index missing."
        if 'OOV_TOKEN' not in env:
            print("Error: OOV_TOKEN not available for simulation.")
            return "Simulation failed: OOV_TOKEN missing."

        oov_token_id = tokenizer.word_index.get(env['OOV_TOKEN'], 1)
        sample_words = ["this", "is", "a", "simulated", "headline", "for", "testing", "this", "decoding", "pipeline"]
        simulated_ids = [tokenizer.word_index.get(word.lower(), oov_token_id) for word in sample_words]
        simulated_ids.append(end_token_id)

        print(f"Simulating prediction with IDs: {simulated_ids}")

    print("Starting decoding loop...")
    for step in range(max_headline_length):
        if simulate_output:
            # Replay the canned IDs; once they run out, emit END so the loop stops.
            predicted_token_id = simulated_ids[step] if step < len(simulated_ids) else end_token_id
            print(f"Sim Step {step}: Predicted ID {predicted_token_id} ({_word_for(predicted_token_id)})")
        else:
            if 'decoder_model' not in env or not decoder_model:
                print(f"Error during decoder_model.predict step {step}: decoder_model is not available.")
                return "Generation failed: Decoder model missing."
            try:
                output_tokens, state_h, state_c = decoder_model.predict(
                    [target_seq, *decoder_states], verbose=0
                )
            except Exception as e:
                print(f"Unexpected error during decoder_model.predict step {step}: {e}")
                return "Generation failed: Decoder model predict error."

            # Greedy search: take the highest-scoring token. int() keeps the
            # collected IDs as plain Python ints.
            predicted_token_id = int(np.argmax(output_tokens[0, :]))
            print(f"Step {step}: Predicted ID {predicted_token_id} ({_word_for(predicted_token_id)})")

            # Feed the prediction and the new states into the next step.
            target_seq[0, 0] = predicted_token_id
            decoder_states = [state_h, state_c]

        generated_sequence_ids.append(predicted_token_id)

        if predicted_token_id == end_token_id:
            print("END token predicted. Stopping generation.")
            break

    # sequences_to_text performs the same check; failing here keeps the
    # "Generation failed" prefix that callers test for.
    if any(name not in env for name in ('reverse_word_index', 'OOV_TOKEN', 'START_TOKEN', 'END_TOKEN')):
        print("Error: Decoding components (reverse_word_index, OOV_TOKEN, tokens) are not defined.")
        return "Generation failed: Decoding components missing."

    print("Decoding sequence IDs to text...")
    generated_headline_text = sequences_to_text(generated_sequence_ids, stop_on_end_token=True)

    print("Decoding complete.")
    print("--- Debugging generate_headline End ---\n")

    return generated_headline_text

print("generate_headline function defined (with simulation option and corrected syntax).")

generate_headline function defined (with simulation option and corrected syntax).


In [7]:
# --- Cell 7: load the original title/content pairs used for testing ---
data_filepath = 'data/title_content_pair.pkl'  # Pickled list of records with 'title' and 'content' keys

if os.path.exists(data_filepath):
    print(f"Loading original data from '{data_filepath}'...")
    # NOTE(security): pickle.load can execute arbitrary code. Only load data
    # files that you produced yourself.
    with open(data_filepath, 'rb') as fp:
        loaded_data = pickle.load(fp)

    # Parallel lists: title_list[i] is the reference headline of content_list[i].
    # A record lacking a key contributes None.
    title_list = [record.get('title') for record in loaded_data]
    content_list = [record.get('content') for record in loaded_data]
    print(f"Loaded {len(loaded_data)} records.\n")
else:
    # Without the data file, Cell 8 needs an article supplied by hand.
    print(f"Error: Data file not found at '{data_filepath}'.")
    print("Cannot load original data for testing.")

Loading original data from 'data/title_content_pair.pkl'...
Loaded 50000 records.



In [8]:
# --- Cell 8 (Generate and Display Headline) ---

# True replays canned token IDs to test the decoding pipeline; False runs the
# trained model. The output labels below follow this flag, so they no longer
# claim "Simulated" for real predictions.
SIMULATE_OUTPUT = False

if 'content_list' not in globals() or not content_list: # Check globals() as data loading might be in a separate cell
    print("\nError: 'content_list' is not loaded. Please run Cell 7 (Load Original Data) or provide input_text manually.")
else:
    # Index of the article to generate a headline for.
    article_index_to_test = 15

    if article_index_to_test < 0 or article_index_to_test >= len(content_list):
        print(f"Error: Article index {article_index_to_test} is out of bounds.")
    else:
        input_article_text = content_list[article_index_to_test]
        original_headline = title_list[article_index_to_test] # Includes <start>/<end> if modified in Cell 2
        mode_suffix = " (Simulated)" if SIMULATE_OUTPUT else ""

        print(f"\nGenerating headline for article index {article_index_to_test}...")

        # generate_headline must have been defined by an earlier cell.
        if 'generate_headline' in globals():
            generated_headline = generate_headline(input_article_text, simulate_output=SIMULATE_OUTPUT)
        else:
            print("Error: 'generate_headline' function is not defined. Please run the cell containing its definition.")
            generated_headline = "Generation failed: Function not defined."

        # A record without content is None; show an empty preview instead of crashing.
        article_preview = (input_article_text or '')[:500]
        print("\n--- Original Article (First 500 chars) ---\n", article_preview + '...')
        print("\n--- Original Headline ---\n", original_headline)
        print(f"\n--- Generated Headline{mode_suffix} ---\n", generated_headline)

        # Optional comparison with the reference headline: character-level
        # Levenshtein distance, plus the same distance normalized to [0, 1].
        can_compare = (
            'Levenshtein' in globals()
            and original_headline
            and generated_headline
            and "Generation failed" not in generated_headline
        )
        if can_compare:
            # Strip the <start>/<end> markers from the reference before comparing.
            if 'START_TOKEN' in globals() and 'END_TOKEN' in globals():
                cleaned_original_headline = original_headline.replace(START_TOKEN, '').replace(END_TOKEN, '').strip()
            else:
                print("Warning: START_TOKEN or END_TOKEN not defined, cannot clean original headline for comparison.")
                cleaned_original_headline = original_headline

            if cleaned_original_headline:
                ld = Levenshtein.distance(cleaned_original_headline, generated_headline)
                # max_len is at least 1 here because the cleaned original is non-empty.
                max_len = max(len(cleaned_original_headline), len(generated_headline))
                similarity_score = (max_len - ld) / max_len
                print(f"\n--- Comparison{mode_suffix} ---")
                print(f"Levenshtein Distance: {ld}")
                print(f"Normalized Similarity (higher is better): {similarity_score:.4f}")
            else:
                print(f"\n--- Comparison{mode_suffix} ---\nOriginal headline is empty after cleaning.")
        elif "Generation failed" in generated_headline:
            print("\nSkipping comparison due to generation failure.")
        else:
            print("\nSkipping comparison: Levenshtein not imported, or headlines/original empty.")


Generating headline for article index 15...

--- Debugging generate_headline ---
'tokenizer' in globals(): True
'encoder_model' in globals(): True
'decoder_model' in globals(): True
'START_TOKEN_ID' in globals(): True
'END_TOKEN_ID' in globals(): True
Simulation mode active: False
MAX_HEADLINE_LEN: 50
Running encoder_model.predict...
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
Encoder prediction complete.
Starting decoding loop...
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'argume