In [2]:
import os
import re
import nltk
import joblib
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from rouge_score import rouge_scorer

# sent_tokenize needs the Punkt sentence model. Newer NLTK releases load it
# from the "punkt_tab" package while older ones use "punkt", so fetch both to
# keep the notebook runnable on either. A list (not a generator) is used so
# every download is attempted; the cell evaluates to True only if all succeed.
all([nltk.download(resource, quiet=True) for resource in ("punkt", "punkt_tab")])

True

In [3]:
def sentence_tokenize(text):
    """Break ``text`` into sentences and tidy the whitespace of each one.

    Sentence boundaries come from ``nltk.sent_tokenize``. Within every
    sentence, each run of whitespace is collapsed to a single space and the
    ends are stripped.

    Args:
        text: The string to split.

    Returns:
        A list of cleaned sentence strings, in their original order.
    """
    cleaned = []
    for raw_sentence in nltk.sent_tokenize(text):
        collapsed = re.sub(r"\s+", " ", raw_sentence)
        cleaned.append(collapsed.strip())
    return cleaned

In [4]:
class ExtractiveSummarizer:
    """Extractive summarizer that ranks sentences by TF-IDF similarity.

    Each sentence is vectorized with a ``TfidfVectorizer``; its score is the
    sum of its cosine similarities to every sentence of the same text
    (itself included). The highest-scoring sentences form the summary.

    Attributes:
        vectorizer: The vectorizer used for scoring, or ``None`` until one is
            fitted or supplied.
    """

    def __init__(self, vectorizer=None):
        """Create a summarizer, optionally around an existing vectorizer.

        Args:
            vectorizer: An already-fitted vectorizer to reuse, or ``None`` to
                have one fitted on demand.
        """
        self.vectorizer = vectorizer

    def fit_vectorizer(self, sentences):
        """Fit a fresh ``TfidfVectorizer`` on ``sentences`` and store it.

        Args:
            sentences: Iterable of sentence strings.

        Returns:
            The newly fitted vectorizer (also stored on ``self.vectorizer``).
        """
        self.vectorizer = TfidfVectorizer()
        self.vectorizer.fit(sentences)
        return self.vectorizer

    def score_sentences(self, sentences):
        """Score every sentence by its total similarity to all sentences.

        Uses the current vectorizer, fitting one on ``sentences`` first if
        none is set.

        Args:
            sentences: List of sentence strings.

        Returns:
            One score per sentence: the row sums of the pairwise
            cosine-similarity matrix of the TF-IDF vectors.
        """
        if self.vectorizer is None:
            self.fit_vectorizer(sentences)

        tfidf_matrix = self.vectorizer.transform(sentences)
        sim_matrix = cosine_similarity(tfidf_matrix)

        # Score sentences by total similarity
        return sim_matrix.sum(axis=1)

    def summarize(self, text, num_sentences=3, refit=True):
        """Return the ``num_sentences`` highest-scoring sentences of ``text``.

        Selected sentences are emitted in their original document order and
        joined with single spaces.

        Args:
            text: Document to summarize.
            num_sentences: Maximum number of sentences to keep. Values of
                zero or below yield an empty string.
            refit: When true (the default, matching the previous behavior),
                a new vectorizer is fitted on this text's sentences and
                replaces ``self.vectorizer``. When false, the existing
                vectorizer is reused as-is and one is fitted only if none
                is set.

        Returns:
            The summary string. If the text has no more than
            ``num_sentences`` sentences, all of them are returned and no
            vectorizer is fitted.
        """
        sentences = sentence_tokenize(text)

        # A negative count would otherwise slice from the end of the ranking
        # and return nearly the whole text.
        if num_sentences <= 0:
            return ""

        if len(sentences) <= num_sentences:
            return " ".join(sentences)

        if refit or self.vectorizer is None:
            self.fit_vectorizer(sentences)

        scores = self.score_sentences(sentences)

        # Rank by descending score, then restore document order.
        ranked_idx = np.argsort(scores)[::-1]
        top_idx = sorted(ranked_idx[:num_sentences])

        return " ".join(sentences[i] for i in top_idx)

In [5]:
# Toy corpus: two short documents, each paired with a hand-written reference
# summary used later for ROUGE scoring.
documents = [
    "Artificial intelligence is transforming industries. It helps in automation, efficiency, and decision making. Many companies are adopting AI to stay competitive.",
    "Climate change is one of the biggest global challenges. Governments are working together to reduce emissions. Renewable energy adoption is increasing worldwide.",
]
reference_summaries = [
    "AI is transforming industries with automation and efficiency.",
    "Climate change is a global challenge with efforts in emission reduction and renewable energy.",
]

data = {"document": documents, "reference_summary": reference_summaries}

df = pd.DataFrame(data)
df

Unnamed: 0,document,reference_summary
0,Artificial intelligence is transforming indust...,AI is transforming industries with automation ...
1,Climate change is one of the biggest global ch...,Climate change is a global challenge with effo...


In [6]:
# A single summarizer and a single ROUGE scorer are shared by all documents.
summarizer = ExtractiveSummarizer()
scorer = rouge_scorer.RougeScorer(["rouge1", "rougeL"], use_stemmer=True)

# Two-sentence extractive summary for every document, in row order.
generated_summaries = [
    summarizer.summarize(document, num_sentences=2)
    for document in df["document"]
]

# Compute ROUGE: the reference summary is passed first, the generated one second.
rouge_scores = [
    scorer.score(reference, generated)
    for reference, generated in zip(df["reference_summary"], generated_summaries)
]

df["generated_summary"] = generated_summaries
df

Unnamed: 0,document,reference_summary,generated_summary
0,Artificial intelligence is transforming indust...,AI is transforming industries with automation ...,"It helps in automation, efficiency, and decisi..."
1,Climate change is one of the biggest global ch...,Climate change is a global challenge with effo...,Climate change is one of the biggest global ch...


In [7]:
def avg_rouge(scores, metric):
    """Average the F-measure of one ROUGE metric over all scored examples.

    Args:
        scores: Sequence of mappings from metric name to a score object that
            exposes an ``fmeasure`` attribute.
        metric: Key of the metric to average, e.g. ``"rouge1"``.

    Returns:
        The mean F-measure as computed by ``np.mean``.
    """
    f_measures = []
    for entry in scores:
        f_measures.append(entry[metric].fmeasure)
    return np.mean(f_measures)

# Report the corpus-level mean F1 for each ROUGE variant that was scored.
rouge1_f1 = avg_rouge(rouge_scores, "rouge1")
print("ROUGE-1 F1:", rouge1_f1)
rougeL_f1 = avg_rouge(rouge_scores, "rougeL")
print("ROUGE-L F1:", rougeL_f1)


ROUGE-1 F1: 0.40804597701149425
ROUGE-L F1: 0.32471264367816094


In [8]:
# Persist the summarizer's current vectorizer with joblib, creating the
# output directory first if it does not exist yet.
os.makedirs(name="models", exist_ok=True)
joblib.dump(value=summarizer.vectorizer, filename="models/vectorizer.pkl")
print("Vectorizer saved at models/vectorizer.pkl")

Vectorizer saved at models/vectorizer.pkl


In [9]:
# Unseen three-sentence text used to exercise the reloaded summarizer.
test_text = (
    "Space exploration has led to many technological advancements. "
    "Satellites provide weather forecasting, communication, and navigation. "
    "Future missions aim for Mars colonization."
)

# Reload the persisted vectorizer and wrap it in a new summarizer.
loaded_vectorizer = joblib.load("models/vectorizer.pkl")
summarizer_loaded = ExtractiveSummarizer(vectorizer=loaded_vectorizer)

print("\nOriginal Text:\n", test_text)

# NOTE(review): summarize() as called here refits the vectorizer on test_text,
# so the loaded vectorizer does not influence this summary.
loaded_summary = summarizer_loaded.summarize(test_text, num_sentences=2)
print("\nGenerated Summary:\n", loaded_summary)


Original Text:
 Space exploration has led to many technological advancements. Satellites provide weather forecasting, communication, and navigation. Future missions aim for Mars colonization.

Generated Summary:
 Satellites provide weather forecasting, communication, and navigation. Future missions aim for Mars colonization.
