## 1. Load Data from Your Directory

In [None]:
import os

In [4]:
def safe_read_file(file_path):
    """Read a text file and return its contents with surrounding whitespace stripped.

    The file is decoded with the first encoding in the candidate list that
    does not raise ``UnicodeDecodeError``.

    Args:
        file_path: Path of the file to read.

    Returns:
        The stripped file contents, or ``""`` if no candidate encoding could
        decode the file (a warning is printed in that case).

    Raises:
        OSError: Propagated from ``open`` (e.g. the file does not exist).
    """
    # Order matters: ISO-8859-1 assigns a character to every possible byte, so
    # it never raises UnicodeDecodeError. It has to be the last candidate,
    # otherwise cp1252 is never tried and bytes 0x80-0x9F (curly quotes,
    # dashes, ...) come back as control characters.
    encodings = ['utf-8', 'cp1252', 'ISO-8859-1']
    for enc in encodings:
        try:
            with open(file_path, 'r', encoding=enc) as f:
                return f.read().strip()
        except UnicodeDecodeError:
            continue
    # Defensive fallback; only reachable if the catch-all encoding is removed.
    print(f"[Warning] Could not decode: {file_path}")
    return ""

def load_data(base_path='./datasets/text_sum/'):
    """Load paired news articles and reference summaries from disk.

    Expected layout: ``<base_path>/News_Articles/<category>/<file>`` with the
    matching summary at ``<base_path>/Summaries/<category>/<file>``.

    Args:
        base_path: Directory containing the ``News_Articles`` and
            ``Summaries`` folders.

    Returns:
        A tuple ``(articles, summaries)`` of equally long lists of strings.
        Only pairs where both texts are non-empty are kept. Pairs are ordered
        by category name, then file name.
    """
    articles_path = os.path.join(base_path, 'News_Articles')
    summaries_path = os.path.join(base_path, 'Summaries')

    articles, summaries = [], []

    # os.listdir returns entries in arbitrary order; sorting keeps the sample
    # order (and therefore any later positional train/validation split)
    # identical between runs and machines.
    for category in sorted(os.listdir(articles_path)):
        article_dir = os.path.join(articles_path, category)
        summary_dir = os.path.join(summaries_path, category)

        # Ignore stray files (README, .DS_Store, ...) next to the category folders.
        if not os.path.isdir(article_dir):
            continue

        for filename in sorted(os.listdir(article_dir)):
            article_path = os.path.join(article_dir, filename)
            summary_path = os.path.join(summary_dir, filename)

            if not os.path.isfile(article_path):
                continue
            # An article without a summary cannot form a training pair; skip
            # it instead of aborting the whole load.
            if not os.path.isfile(summary_path):
                print(f"[Warning] Missing summary for: {article_path}")
                continue

            article = safe_read_file(article_path)
            summary = safe_read_file(summary_path)

            if article and summary:
                articles.append(article)
                summaries.append(summary)

    return articles, summaries

# Read every article/summary pair from the default dataset directory.
articles, summaries = load_data()
print("Loaded {} valid pairs.".format(len(articles)))


Loaded 2225 valid pairs.


## 2. Preprocessing + Tokenization

In [5]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [6]:
# Add special tokens to summary.
# Guarded so that re-running this cell does not wrap the summaries a second time.
summaries = [
    s if s.startswith('<start> ') else '<start> ' + s + ' <end>'
    for s in summaries
]

# Tokenizer settings
VOCAB_SIZE = 10000       # passed as num_words to both tokenizers
MAX_LEN_ARTICLE = 300    # article length limit, in tokens
MAX_LEN_SUMMARY = 50     # summary length limit, in tokens (incl. special tokens)

article_tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token='<unk>')

# The Tokenizer's default `filters` remove '<' and '>', which would turn the
# markers into the ordinary words "start"/"end" and leave no '<start>' entry
# in word_index. Use the default filter set minus those two characters so the
# markers survive as distinct tokens.
SUMMARY_FILTERS = '!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n'
summary_tokenizer = Tokenizer(
    num_words=VOCAB_SIZE,
    oov_token='<unk>',
    filters=SUMMARY_FILTERS,
)

In [7]:
# Build each vocabulary from its own corpus: articles and summaries are
# indexed by separate tokenizers, so their word ids are not interchangeable.
article_tokenizer.fit_on_texts(articles)
summary_tokenizer.fit_on_texts(summaries)

In [8]:
# Convert to sequences
# Each text becomes a list of integer word ids produced by the tokenizer that
# was fitted on the same corpus.
article_seq = article_tokenizer.texts_to_sequences(articles)
summary_seq = summary_tokenizer.texts_to_sequences(summaries)

In [9]:

# Pad sequences
encoder_input = pad_sequences(article_seq, maxlen=MAX_LEN_ARTICLE, padding='post')
# pad_sequences truncates from the FRONT by default (truncating='pre'), which
# would cut the leading start marker off every summary longer than
# MAX_LEN_SUMMARY. Truncate at the end instead so each decoder input begins
# with the start marker.
# NOTE(review): summaries longer than MAX_LEN_SUMMARY now lose their end
# marker instead; raise MAX_LEN_SUMMARY if most summaries exceed it.
decoder_input = pad_sequences(
    summary_seq,
    maxlen=MAX_LEN_SUMMARY,
    padding='post',
    truncating='post',
)

# Decoder target (shifted left): position t holds the token the decoder should
# predict after reading decoder_input[:, t]. The last column stays 0 (padding).
import numpy as np
decoder_target = np.zeros_like(decoder_input)
decoder_target[:, :-1] = decoder_input[:, 1:]


## 3. Define Encoder-Decoder LSTM Model


In [11]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense

In [12]:
# Model hyperparameters shared by the encoder and the decoder.
EMBEDDING_DIM = 256  # output dimension of both Embedding layers
LSTM_UNITS = 512  # number of units in both LSTM layers

In [13]:
# Encoder
# Reads a padded article of MAX_LEN_ARTICLE token ids and keeps only the
# final LSTM states (hidden and cell); the per-step outputs are discarded.
encoder_inputs = Input(shape=(MAX_LEN_ARTICLE,))
# mask_zero=True makes id 0 a padding value that downstream layers skip.
enc_emb = Embedding(VOCAB_SIZE, EMBEDDING_DIM, mask_zero=True)(encoder_inputs)
encoder_lstm = LSTM(LSTM_UNITS, return_state=True)
_, state_h, state_c = encoder_lstm(enc_emb)
encoder_states = [state_h, state_c]

# Decoder
# Reads the summary token ids and starts from the encoder's final states.
# return_sequences=True yields one output vector per summary position.
decoder_inputs = Input(shape=(MAX_LEN_SUMMARY,))
dec_emb = Embedding(VOCAB_SIZE, EMBEDDING_DIM, mask_zero=True)(decoder_inputs)
decoder_lstm = LSTM(LSTM_UNITS, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=encoder_states)

In [14]:
# Project every decoder step onto the vocabulary: one softmax distribution
# over VOCAB_SIZE token ids per summary position.
decoder_dense = Dense(VOCAB_SIZE, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

In [15]:
# Model
# Training model: inputs are [article ids, summary ids], output is the
# per-position token distribution from the decoder.
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
# The sparse loss variant takes integer token ids as targets (no one-hot encoding).
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')
model.summary()


## 4. Train the Model

In [16]:
# Train on the padded id arrays; the last 10% of the samples are held out
# for validation.
model.fit(
    x=[encoder_input, decoder_input],
    y=decoder_target[..., np.newaxis],  # trailing axis -> (batch, seq_len, 1)
    validation_split=0.1,
    epochs=10,
    batch_size=64,
)


Epoch 1/10
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m257s[0m 8s/step - loss: 8.3013 - val_loss: 7.1793
Epoch 2/10
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m448s[0m 14s/step - loss: 6.8834 - val_loss: 6.9908
Epoch 3/10
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m475s[0m 15s/step - loss: 6.7458 - val_loss: 6.8341
Epoch 4/10
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m489s[0m 15s/step - loss: 6.6300 - val_loss: 6.7447
Epoch 5/10
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m463s[0m 14s/step - loss: 6.4720 - val_loss: 6.6475
Epoch 6/10
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m392s[0m 11s/step - loss: 6.3117 - val_loss: 6.5647
Epoch 7/10
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m335s[0m 10s/step - loss: 6.2052 - val_loss: 6.4988
Epoch 8/10
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m320s[0m 10s/step - loss: 6.0643 - val_loss: 6.4562
Epoch 9/10
[1m32/32[0m [32m━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x1d8ac8391d0>

In [17]:
# Save the model
# Writes the trained model to a single file in the working directory.
# NOTE(review): the '.h5' suffix selects the HDF5 format, which recent Keras
# releases treat as legacy — confirm whether '.keras' is preferred here.
model.save('text_summarization_model.h5')




In [18]:
# Save tokenizers
import pickle

# The fitted tokenizers hold the word <-> id mappings the model was trained
# with, so they are stored next to the model for later inference.
with open('article_tokenizer.pkl', 'wb') as f:
    pickle.dump(article_tokenizer, f)

with open('summary_tokenizer.pkl', 'wb') as f:
    pickle.dump(summary_tokenizer, f)



In [19]:
# Load the model and tokenizers for inference
from tensorflow.keras.models import load_model
import pickle

In [None]:
# Restore the trained model and both tokenizers from the files written by the
# save cells, replacing the in-memory objects of the same names.
model = load_model('text_summarization_model.h5')
# SECURITY: pickle.load can execute arbitrary code from the file. Only load
# .pkl files that this notebook produced itself or that come from a trusted source.
with open('article_tokenizer.pkl', 'rb') as f:
    article_tokenizer = pickle.load(f)
with open('summary_tokenizer.pkl', 'rb') as f:  
    summary_tokenizer = pickle.load(f)

In [None]:



# Holds (trained model, encoder model, decoder model) for the last model seen.
_INFERENCE_CACHE = {}


def _first_layer(layers, predicate, description):
    """Return the first layer matching `predicate`, or raise a descriptive ValueError."""
    for layer in layers:
        if predicate(layer):
            return layer
    raise ValueError(f"Could not find the {description} in the trained model.")


def _build_inference_models(trained_model):
    """Derive step-wise encoder and decoder models from the trained seq2seq model.

    The training model maps (article ids, full summary ids) to per-position
    token distributions. Greedy decoding instead needs:
      * an encoder model: article ids -> [state_h, state_c]
      * a decoder model:  (one token id, state_h, state_c)
                          -> (token distribution, new state_h, new state_c)
    Both reuse the trained layers, so no weights are copied or re-trained.
    The result is cached per model object because building it on every call
    would be wasteful.
    """
    cached = _INFERENCE_CACHE.get('models')
    if cached is not None and cached[0] is trained_model:
        return cached[1], cached[2]

    # Local imports keep this cell usable in a fresh kernel that only ran the
    # "load model" cells.
    from tensorflow.keras.layers import Dense, Embedding, Input, LSTM
    from tensorflow.keras.models import Model

    encoder_in, decoder_in = trained_model.inputs
    layers = trained_model.layers

    # Layers are located by type and role rather than by position in
    # `model.layers`, whose ordering is an implementation detail.
    encoder_lstm = _first_layer(
        layers,
        lambda l: isinstance(l, LSTM) and not l.return_sequences,
        "encoder LSTM",
    )
    decoder_lstm = _first_layer(
        layers,
        lambda l: isinstance(l, LSTM) and l.return_sequences,
        "decoder LSTM",
    )
    decoder_embedding = _first_layer(
        layers,
        lambda l: isinstance(l, Embedding) and l.input is decoder_in,
        "decoder Embedding",
    )
    output_dense = _first_layer(
        reversed(layers),
        lambda l: isinstance(l, Dense),
        "output Dense layer",
    )

    # The encoder LSTM was built with return_state=True, so its symbolic
    # output is [last_output, state_h, state_c]; only the states are needed.
    encoder_model = Model(encoder_in, list(encoder_lstm.output[1:]))

    token_in = Input(shape=(1,))
    state_h_in = Input(shape=(decoder_lstm.units,))
    state_c_in = Input(shape=(decoder_lstm.units,))
    step_out, state_h_out, state_c_out = decoder_lstm(
        decoder_embedding(token_in),
        initial_state=[state_h_in, state_c_in],
    )
    decoder_model = Model(
        [token_in, state_h_in, state_c_in],
        [output_dense(step_out), state_h_out, state_c_out],
    )

    _INFERENCE_CACHE['models'] = (trained_model, encoder_model, decoder_model)
    return encoder_model, decoder_model


def decode_sequence(input_seq):
    """Greedily decode a summary for one encoded article.

    Args:
        input_seq: Padded article token ids for a single article (the code
            reads batch row 0 only), as produced by `pad_sequences`.

    Returns:
        The generated summary as a space-separated string, without the
        start/end markers. At most MAX_LEN_SUMMARY words are generated.

    Raises:
        KeyError: If the summary tokenizer contains no start marker.
        ValueError: If the expected layers cannot be found in `model`.
    """
    encoder_model, decoder_model = _build_inference_models(model)

    # With Keras' default tokenizer filters the angle brackets are stripped
    # and the markers are stored as plain 'start'/'end'; accept both spellings.
    word_index = summary_tokenizer.word_index
    if '<start>' in word_index:
        start_word, end_word = '<start>', '<end>'
    else:
        start_word, end_word = 'start', 'end'
    if start_word not in word_index:
        raise KeyError("No start marker ('<start>' or 'start') in summary_tokenizer.word_index")

    # Encode the input as state vectors
    states_value = encoder_model.predict(input_seq, verbose=0)

    # Target sequence of length 1, seeded with the start marker
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = word_index[start_word]

    decoded_words = []
    # Cap the output by number of generated tokens, not by characters.
    while len(decoded_words) < MAX_LEN_SUMMARY:
        output_tokens, h, c = decoder_model.predict(
            [target_seq] + list(states_value), verbose=0
        )

        # Greedy choice: most probable next token
        sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))
        sampled_word = summary_tokenizer.index_word.get(sampled_token_index, '')

        # Id 0 is the padding id and maps to no word; treat it like the end marker.
        if sampled_word == end_word or sampled_token_index == 0:
            break
        decoded_words.append(sampled_word)

        # Feed the chosen token and the updated states into the next step
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
        states_value = [h, c]

    return ' '.join(decoded_words).strip()



In [None]:
def summarize_article(article):
    """Generate a summary for one raw article string.

    The text is converted to token ids with `article_tokenizer`, padded to
    MAX_LEN_ARTICLE (post-padding) and handed to `decode_sequence`.

    Args:
        article: The article text.

    Returns:
        The summary string returned by `decode_sequence`.
    """
    token_ids = article_tokenizer.texts_to_sequences([article])
    padded_ids = pad_sequences(
        token_ids,
        maxlen=MAX_LEN_ARTICLE,
        padding='post',
    )
    return decode_sequence(padded_ids)


In [None]:
# Example usage: summarise the first loaded article and show both texts.
article = articles[0]  # Replace with your article
summary = summarize_article(article)
print(f"Article: {article}")
print(f"Summary: {summary}")