In [16]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# ============================================================
# 0. Install & Imports (Colab)
# ============================================================
!pip install tensorflow==2.15.0 joblib --quiet

import numpy as np
import pandas as pd
import joblib
import os

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import (
    Input,
    Embedding,
    LSTM,
    Dense,
    AdditiveAttention,
    Concatenate
)
from tensorflow.keras.optimizers import Adam

print("TensorFlow version:", tf.__version__)


[31mERROR: Could not find a version that satisfies the requirement tensorflow==2.15.0 (from versions: 2.16.0rc0, 2.16.1, 2.16.2, 2.17.0rc0, 2.17.0rc1, 2.17.0, 2.17.1, 2.18.0rc0, 2.18.0rc1, 2.18.0rc2, 2.18.0, 2.18.1, 2.19.0rc0, 2.19.0, 2.19.1, 2.20.0rc0, 2.20.0)[0m[31m
[0m[31mERROR: No matching distribution found for tensorflow==2.15.0[0m[31m
[0mTensorFlow version: 2.19.0


In [None]:
# ============================================================
# 1. Load Data
# ============================================================

CSV_PATH = "/content/drive/MyDrive/news_project/data/cleaned/pens_clean_with_summaries.csv"   # change if needed

df = pd.read_csv(CSV_PATH)

# Adjust column names if needed
SOURCE_COLUMN = "text"
TARGET_COLUMN = "summary"

df = df[[SOURCE_COLUMN, TARGET_COLUMN]].dropna()
df[SOURCE_COLUMN] = df[SOURCE_COLUMN].astype(str)
df[TARGET_COLUMN] = df[TARGET_COLUMN].astype(str)

print("Total rows:", len(df))
df.head()


Total rows: 9467


Unnamed: 0,text,summary
0,The following list is based on travel alerts a...,The following list is based on travel alerts a...
1,Pharrell Williams surprised the 2019 graduatin...,"The Academy in Harlem, N.Y., is a charter scho..."
2,"GRAND RAPIDS, Mich. - One of Grand Rapids' tas...",For people with an adventurous palate who love...
3,WASHINGTON Border Patrol agents are on track t...,WASHINGTON Border Patrol agents are on track t...
4,Katy Perry's latest look works for a night in ...,The 33-year-old pop star was spotted out in Lo...


In [None]:
# ============================================================
# 2. Prepare text with <sos>/<eos>
# ============================================================

src_texts = df[SOURCE_COLUMN].tolist()
tgt_texts = df[TARGET_COLUMN].tolist()

tgt_texts = [f"<sos> {t.strip()} <eos>" for t in tgt_texts]

print(src_texts[0][:200])
print(tgt_texts[0])




In [None]:
# ============================================================
# 3. Tokenize & Pad
# ============================================================

# ---- Hyperparameters ----
SRC_VOCAB_LIMIT = 30000
TGT_VOCAB_LIMIT = 15000

MAX_SRC_LEN = 150   # max encoder timesteps
MAX_TGT_LEN = 40    # max decoder timesteps (incl <sos> & <eos>)

# ---- Source tokenizer ----
src_tokenizer = Tokenizer(num_words=SRC_VOCAB_LIMIT, oov_token="<OOV>")
src_tokenizer.fit_on_texts(src_texts)
src_sequences = src_tokenizer.texts_to_sequences(src_texts)

src_vocab_size = min(SRC_VOCAB_LIMIT, len(src_tokenizer.word_index) + 1)
print("Source vocab size (used):", src_vocab_size)

# ---- Target tokenizer ----
tgt_tokenizer = Tokenizer(num_words=TGT_VOCAB_LIMIT, oov_token="<OOV>")
tgt_tokenizer.fit_on_texts(tgt_texts)
tgt_sequences = tgt_tokenizer.texts_to_sequences(tgt_texts)

tgt_vocab_size = min(TGT_VOCAB_LIMIT, len(tgt_tokenizer.word_index) + 1)
print("Target vocab size (used):", tgt_vocab_size)

# ---- Pad sequences ----
encoder_input_data = pad_sequences(
    src_sequences,
    maxlen=MAX_SRC_LEN,
    padding="post",
    truncating="post"
)

decoder_input_data = pad_sequences(
    tgt_sequences,
    maxlen=MAX_TGT_LEN,
    padding="post",
    truncating="post"
)

# Decoder target is the next token (shifted)
decoder_target_data = np.zeros_like(decoder_input_data)
decoder_target_data[:, :-1] = decoder_input_data[:, 1:]
decoder_target_data[:, -1] = 0  # last token target is padding

print("Encoder input shape:", encoder_input_data.shape)
print("Decoder input shape:", decoder_input_data.shape)
print("Decoder target shape:", decoder_target_data.shape)


Source vocab size (used): 30000
Target vocab size (used): 15000
Encoder input shape: (9467, 150)
Decoder input shape: (9467, 40)
Decoder target shape: (9467, 40)


In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Layer, Dense

class BahdanauAttention(Layer):
    def __init__(self, units, **kwargs):
        super().__init__(**kwargs)
        self.W1 = Dense(units)
        self.W2 = Dense(units)
        self.V = Dense(1)

    def call(self, query, values):
        """
        query:  decoder outputs (batch, T_dec, hidden)
        values: encoder outputs (batch, T_enc, hidden)
        """
        # Add time axes so we can broadcast add
        # query:  (batch, T_dec, 1, hidden)
        # values: (batch, 1, T_enc, hidden)
        query_with_time_axis = tf.expand_dims(query, axis=2)
        values_with_time_axis = tf.expand_dims(values, axis=1)

        # score: (batch, T_dec, T_enc, 1)
        score = self.V(
            tf.nn.tanh(
                self.W1(values_with_time_axis) + self.W2(query_with_time_axis)
            )
        )
        score = tf.squeeze(score, axis=-1)  # (batch, T_dec, T_enc)

        # attention weights: (batch, T_dec, T_enc)
        attention_weights = tf.nn.softmax(score, axis=-1)

        # context vector: (batch, T_dec, hidden)
        context = tf.matmul(attention_weights, values)
        return context, attention_weights


In [None]:
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

# ============================================================
# 4. Build LSTM + Custom Bahdanau Attention Model
# ============================================================

EMBED_DIM = 256
LSTM_UNITS = 256

# ---- Encoder ----
encoder_inputs = Input(shape=(MAX_SRC_LEN,), name="encoder_inputs")
enc_emb = Embedding(
    input_dim=src_vocab_size,
    output_dim=EMBED_DIM,
    # mask_zero=False  # default is False; we'll keep it simple
    name="encoder_embedding"
)(encoder_inputs)

encoder_lstm = LSTM(
    LSTM_UNITS,
    return_sequences=True,
    return_state=True,
    name="encoder_lstm"
)
encoder_outputs, state_h, state_c = encoder_lstm(enc_emb)

# ---- Decoder ----
decoder_inputs = Input(shape=(MAX_TGT_LEN,), name="decoder_inputs")
dec_emb_layer = Embedding(
    input_dim=tgt_vocab_size,
    output_dim=EMBED_DIM,
    # mask_zero=False
    name="decoder_embedding"
)
dec_emb = dec_emb_layer(decoder_inputs)

decoder_lstm = LSTM(
    LSTM_UNITS,
    return_sequences=True,
    return_state=True,
    name="decoder_lstm"
)

decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=[state_h, state_c])

# ---- Custom Bahdanau Attention ----
attention_layer = BahdanauAttention(LSTM_UNITS, name="bahdanau_attention")
context_vectors, attention_weights = attention_layer(decoder_outputs, encoder_outputs)
# context_vectors: (batch, T_dec, LSTM_UNITS)

# Concatenate decoder outputs with context
decoder_combined_context = Concatenate(axis=-1, name="concat_layer")(
    [decoder_outputs, context_vectors]
)

# ---- Output layer ----
decoder_dense = Dense(tgt_vocab_size, activation="softmax", name="output_dense")
decoder_outputs_final = decoder_dense(decoder_combined_context)

# ---- Full training model ----
model = Model(
    [encoder_inputs, decoder_inputs],
    decoder_outputs_final
)

model.summary()


In [None]:
# ============================================================
# 5. Compile & Train
# ============================================================

BATCH_SIZE = 64
EPOCHS = 25   # change as per your Colab time

model.compile(
    optimizer=Adam(learning_rate=1e-3),
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"]
)

history = model.fit(
    [encoder_input_data, decoder_input_data],
    decoder_target_data,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_split=0.1
)


Epoch 1/25
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 189ms/step - accuracy: 0.3594 - loss: 3.3495 - val_accuracy: 0.2248 - val_loss: 5.9888
Epoch 2/25
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 194ms/step - accuracy: 0.3986 - loss: 3.0766 - val_accuracy: 0.2243 - val_loss: 6.0673
Epoch 3/25
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 192ms/step - accuracy: 0.4170 - loss: 2.9506 - val_accuracy: 0.1975 - val_loss: 6.4167
Epoch 4/25
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 183ms/step - accuracy: 0.3224 - loss: 3.7472 - val_accuracy: 0.2200 - val_loss: 6.0290
Epoch 5/25
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 187ms/step - accuracy: 0.3952 - loss: 3.1232 - val_accuracy: 0.2197 - val_loss: 6.1193
Epoch 6/25
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 194ms/step - accuracy: 0.4380 - loss: 2.8406 - val_accuracy: 0.2204 - val_loss: 6.2138
Epoch 7/25

In [None]:
# ============================================================
# 6. SAVE TRAINED MODEL & TOKENIZERS
# ============================================================

import os
import joblib

SAVE_DIR = "/content/drive/MyDrive/news_project/lstm_bahdanau_summarizer"
os.makedirs(SAVE_DIR, exist_ok=True)

# ---- Save model ----
MODEL_PATH = os.path.join(SAVE_DIR, "lstm_bahdanau_model.h5")
model.save(MODEL_PATH)

# ---- Save tokenizers ----
SRC_TOKENIZER_PATH = os.path.join(SAVE_DIR, "src_tokenizer.pkl")
TGT_TOKENIZER_PATH = os.path.join(SAVE_DIR, "tgt_tokenizer.pkl")

joblib.dump(src_tokenizer, SRC_TOKENIZER_PATH)
joblib.dump(tgt_tokenizer, TGT_TOKENIZER_PATH)

print("✅ Model & Tokenizers Saved Successfully")
print("Model:", MODEL_PATH)
print("Src Tokenizer:", SRC_TOKENIZER_PATH)
print("Tgt Tokenizer:", TGT_TOKENIZER_PATH)




✅ Model & Tokenizers Saved Successfully
Model: /content/drive/MyDrive/news_project/lstm_bahdanau_summarizer/lstm_bahdanau_model.h5
Src Tokenizer: /content/drive/MyDrive/news_project/lstm_bahdanau_summarizer/src_tokenizer.pkl
Tgt Tokenizer: /content/drive/MyDrive/news_project/lstm_bahdanau_summarizer/tgt_tokenizer.pkl


#### Evaluation

In [None]:
!pip install rouge-score --quiet


  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone


In [25]:

import numpy as np
import pandas as pd
import joblib
import tensorflow as tf
from rouge_score import rouge_scorer
from tensorflow.keras.layers import Layer, Dense


# ---------------- PATHS ----------------

MODEL_PATH = "/content/drive/MyDrive/news_project/lstm_bahdanau_summarizer/lstm_bahdanau_model.h5"
SRC_TOKENIZER_PATH = "/content/drive/MyDrive/news_project/lstm_bahdanau_summarizer/src_tokenizer.pkl"
TGT_TOKENIZER_PATH = "/content/drive/MyDrive/news_project/lstm_bahdanau_summarizer/tgt_tokenizer.pkl"

DATA_PATH = "/content/drive/MyDrive/news_project/data/cleaned/pens_clean_with_summaries.csv"
# OUTPUT_CSV = "/content/drive/MyDrive/news_project/lstm_eval/rouge_eval_results.csv"
OUTPUT_CSV = "/content/drive/MyDrive/news_project/lstm_eval/rouge_eval_results.csv"


MAX_SRC_LEN = 150
MAX_TGT_LEN = 40
EVAL_SAMPLES = 100


In [26]:
# =================== ATTENTION ===================

class BahdanauAttention(Layer):
    def __init__(self, units, **kwargs):
        super().__init__(**kwargs)
        self.W1 = Dense(units)
        self.W2 = Dense(units)
        self.V  = Dense(1)

    def call(self, query, values):
        q = tf.expand_dims(query, axis=2)
        v = tf.expand_dims(values, axis=1)

        score = self.V(tf.nn.tanh(self.W1(v) + self.W2(q)))
        score = tf.squeeze(score, axis=-1)

        weights = tf.nn.softmax(score, axis=-1)
        context = tf.matmul(weights, values)

        return context, weights


In [27]:
# =================== LOAD MODEL ===================

model = tf.keras.models.load_model(
    MODEL_PATH,
    custom_objects={"BahdanauAttention": BahdanauAttention}
)

src_tokenizer = joblib.load(SRC_TOKENIZER_PATH)
tgt_tokenizer = joblib.load(TGT_TOKENIZER_PATH)



In [28]:
# =================== LOAD DATA ===================

df = pd.read_csv(DATA_PATH)

SRC_COL = "text"
TGT_COL = "summary"

df = df[[SRC_COL, TGT_COL]].dropna().head(EVAL_SAMPLES)

texts = df[SRC_COL].astype(str).tolist()
true_summaries = df[TGT_COL].astype(str).tolist()


In [31]:
# =================== PREPROCESS ===================

src_seq = src_tokenizer.texts_to_sequences(texts)

encoder_input_data = tf.keras.preprocessing.sequence.pad_sequences(
    src_seq,
    maxlen=MAX_SRC_LEN,
    padding="post",
    truncating="post"
)


In [30]:
print(list(tgt_tokenizer.word_index.keys())[:50])


['<OOV>', 'the', 'and', 'to', 'of', 'a', 'in', 'on', 'for', 'that', 'sos', 'eos', 'with', 'is', 'at', 'as', 'was', 'it', 'from', 'you', 'be', 'i', 'by', 'this', 'his', 'are', 'he', 'but', 'have', 'an', 'her', 'has', 'said', 'will', 'or', 'one', 'who', 'more', 'new', 'out', '2019', 'up', 'all', 'they', 'their', 'when', 'which', 'not', 'she', 'your']


In [32]:

# =================== GREEDY DECODING ===================

index_word = tgt_tokenizer.index_word
word_index = tgt_tokenizer.word_index

SOS_ID = word_index["sos"]
EOS_ID = word_index["eos"]

def generate_summary(enc_seq):

    decoder_input = np.zeros((1, MAX_TGT_LEN))
    decoder_input[0,0] = SOS_ID

    result = []

    for t in range(1, MAX_TGT_LEN):

        preds = model.predict(
            [enc_seq, decoder_input],
            verbose=0
        )

        token_id = np.argmax(preds[0, t-1])

        if token_id == EOS_ID or token_id == 0:
            break

        result.append(index_word.get(token_id,""))

        decoder_input[0,t] = token_id

    return " ".join(result)



In [33]:
tf.config.list_physical_devices("GPU")


[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [34]:
# =================== GENERATE SUMMARIES ===================

predicted_summaries = []

for i in range(len(encoder_input_data)):
    s = generate_summary(
        encoder_input_data[i:i+1]
    )
    predicted_summaries.append(s)

In [35]:
# =================== ROUGE SCORING ===================

scorer = rouge_scorer.RougeScorer(
    ["rouge1","rouge2","rougeL"],
    use_stemmer=True
)

r1, r2, rl = [], [], []

for ref, hyp in zip(true_summaries, predicted_summaries):
    scores = scorer.score(ref, hyp)
    r1.append(scores["rouge1"].fmeasure)
    r2.append(scores["rouge2"].fmeasure)
    rl.append(scores["rougeL"].fmeasure)


# =================== SAVE ONLY OVERALL RESULT ===================

OUTPUT_CSV = "/content/drive/MyDrive/news_project/lstm_eval/rouge_eval_results.csv"

# ---- Compute averages ----
avg_r1 = float(np.mean(r1))
avg_r2 = float(np.mean(r2))
avg_rl = float(np.mean(rl))

# ---- Create single result row ----
new_df = pd.DataFrame([{
    "Model": "lstm_bahdanau",
    "rouge1": avg_r1,
    "rouge2": avg_r2,
    "rougeL": avg_rl,
    "rougeLsum": avg_rl
}])

# ---- Append below previous rows (NO duplicates / NO new file) ----
if os.path.exists(OUTPUT_CSV):
    old_df = pd.read_csv(OUTPUT_CSV)
    final_df = pd.concat([old_df, new_df], ignore_index=True)
else:
    final_df = new_df

final_df.to_csv(OUTPUT_CSV, index=False)

print("Completed")



Completed


In [36]:
data= pd.read_csv(OUTPUT_CSV)
data

Unnamed: 0,Model,rouge1,rouge2,rougeL,rougeLsum
0,bert,0.748193,0.686271,0.688875,0.688424
1,lstm_bahdanau,0.30082,0.131604,0.225389,0.225389
