In [8]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize
import re
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from rouge import Rouge
import joblib

In [9]:
# Download the NLTK resources used by the summarizer.
# - 'punkt' / 'punkt_tab': sentence and word tokenizer models. NLTK >= 3.8.2 loads
#   'punkt_tab' instead of the pickled 'punkt' models, so both are requested to keep
#   the notebook working across NLTK versions.
# - 'stopwords': English stop-word list.
# - 'wordnet': lexical database used by WordNetLemmatizer.
# nltk.download returns a bool rather than raising, so an id that a given NLTK
# installation cannot resolve does not abort the cell.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\yakuma\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\yakuma\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\yakuma\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [10]:
# Create the ROUGE scorer (from the `rouge` package imported at the top of the
# notebook). The evaluation cell further down calls rouge_scorer.get_scores(...) on it.
rouge_scorer = Rouge()

In [11]:
class TextSummarizer:
    """Extractive summarizer that keeps the sentences with the highest TF-IDF weight.

    The TF-IDF vectorizer is re-fitted on the sentences of each text passed to
    ``get_sentence_scores``; scores are therefore relative to that single text and
    nothing fitted on one text is reused for the next.
    """

    def __init__(self):
        # NLTK's English stop-word list, applied in preprocess_text.
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()
        # scikit-learn's built-in English stop-word list is applied again at
        # vectorization time.
        self.vectorizer = TfidfVectorizer(stop_words='english')

    def preprocess_text(self, text):
        """Normalize one piece of text for TF-IDF vectorization.

        Lowercases the text, deletes every character that is not an ASCII letter
        or whitespace (digits and punctuation are removed, not replaced by a
        space), tokenizes, drops NLTK stop words and lemmatizes what is left.

        Args:
            text: The string to normalize.

        Returns:
            The remaining lemmas joined by single spaces (possibly an empty string).
        """
        text = text.lower()
        text = re.sub(r'[^a-zA-Z\s]', '', text)
        words = word_tokenize(text)
        words = [self.lemmatizer.lemmatize(word) for word in words if word not in self.stop_words]
        return ' '.join(words)

    def get_sentence_scores(self, text):
        """Score every sentence of ``text`` by its mean TF-IDF weight.

        Args:
            text: The document to split into sentences and score.

        Returns:
            A list of ``(sentence, score)`` tuples in document order. The score is
            the mean of the sentence's TF-IDF row over the vocabulary fitted on
            this text. The list is empty when no sentence is found, and every
            score is ``0.0`` when the text yields an empty vocabulary.
        """
        sentences = sent_tokenize(text)
        if not sentences:
            # e.g. whitespace-only input: nothing to vectorize.
            return []

        processed_sentences = [self.preprocess_text(sentence) for sentence in sentences]

        try:
            tfidf_matrix = self.vectorizer.fit_transform(processed_sentences)
        except ValueError:
            # TfidfVectorizer raises ValueError ("empty vocabulary") when every
            # sentence is reduced to nothing but stop words / removed characters.
            # Fall back to equal scores so callers still get the sentences back.
            return [(sentence, 0.0) for sentence in sentences]

        # Row-wise mean on the sparse matrix; avoids densifying each row separately.
        row_means = np.asarray(tfidf_matrix.mean(axis=1)).ravel()
        return [(sentence, float(score)) for sentence, score in zip(sentences, row_means)]

    def summarize(self, text, num_sentences=3):
        """Return the ``num_sentences`` best-scoring sentences in document order.

        Args:
            text: The document to summarize. Falsy values yield ``""``.
            num_sentences: Maximum number of sentences to keep. Values <= 0
                yield ``""``.

        Returns:
            The selected sentences joined by single spaces, or ``""`` when there
            is nothing to select.
        """
        if not text or num_sentences <= 0:
            return ""

        sentence_scores = self.get_sentence_scores(text)
        if not sentence_scores:
            return ""

        # Rank sentence *positions* rather than sentence strings: selecting by
        # text would emit every copy of a repeated sentence and could return more
        # than num_sentences sentences. sorted() is stable, so ties keep document
        # order.
        ranked_positions = sorted(
            range(len(sentence_scores)),
            key=lambda position: sentence_scores[position][1],
            reverse=True,
        )
        # Restore the original reading order of the chosen sentences.
        chosen_positions = sorted(ranked_positions[:num_sentences])

        return ' '.join(sentence_scores[position][0] for position in chosen_positions)

In [15]:
# NOTE(review): this import rebinds the name TextSummarizer, shadowing the class
# defined in the previous cell. From here on `summarizer` is an instance of whatever
# text_summarizer.py defines, and this cell fails on a fresh kernel if that module
# is not importable. Presumably the module mirrors the notebook class and is imported
# so that the object saved later references an importable module rather than the
# notebook's __main__ — confirm, and keep the two definitions in sync.
from text_summarizer import TextSummarizer

# Create the summarizer instance used by the cells below.
summarizer = TextSummarizer()


In [16]:
import pandas as pd

# Read the three dataset splits from CSV files in the working directory,
# in the order train -> validation -> test.
train_df, val_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('train.csv', 'validation.csv', 'test.csv')
)

In [17]:
# "Training" pass: call summarize() once for every article in the training split.
# The returned summaries are discarded.
# NOTE(review): if the imported TextSummarizer matches the class shown earlier in
# this notebook, every summarize() call re-fits the TF-IDF vectorizer on that single
# article, so this loop would leave no learned state behind — confirm it is needed.
print("Training the summarizer...")
train_sample = train_df['article'].tolist()
for article in train_sample:
    summarizer.summarize(article)

Training the summarizer...


In [18]:
# Score 3-sentence summaries of the validation articles against their reference
# highlights. `val_sample` is currently the whole validation frame, not a subset.
print("Evaluating the summarizer...")
val_sample = val_df
val_summaries = [
    summarizer.summarize(article, num_sentences=3)
    for article in val_sample['article'].values
]
val_highlights = val_sample['highlights'].tolist()
# avg=True asks the scorer for one aggregated result per ROUGE variant; the loop
# below reads its 'p', 'r' and 'f' entries.
val_scores = rouge_scorer.get_scores(val_summaries, val_highlights, avg=True)

# Report precision / recall / F1 for each ROUGE variant.
print("\nValidation ROUGE Scores:")
print("=" * 40)

for rouge_type, scores in val_scores.items():
    print(f"\n{rouge_type.upper()}:")
    print("-" * 20)
    precision, recall, f1 = scores['p'], scores['r'], scores['f']
    print(f"Precision: {precision:.4f}")
    print(f"Recall:    {recall:.4f}")
    print(f"F1-Score:  {f1:.4f}")

Evaluating the summarizer...

Validation ROUGE Scores:

ROUGE-1:
--------------------
Precision: 0.2207
Recall:    0.4000
F1-Score:  0.2762

ROUGE-2:
--------------------
Precision: 0.0721
Recall:    0.1376
F1-Score:  0.0911

ROUGE-L:
--------------------
Precision: 0.1997
Recall:    0.3616
F1-Score:  0.2498


In [10]:
# Persist the summarizer object to 'text_summarizer_model.joblib' in the working
# directory; the next cell reloads it with joblib.load.
joblib.dump(summarizer, 'text_summarizer_model.joblib')

['text_summarizer_model.joblib']

In [12]:
# Smoke test: reload the object written by the previous cell and summarize one
# hand-written complaint letter down to two sentences.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code —
# only load model files from a trusted source.
print("\nTesting the saved model...")
loaded_summarizer = joblib.load('text_summarizer_model.joblib')
test_text = "I am writing to express my deep frustration and concern regarding the deplorable condition of the main road in our neighborhood. Over the past few months, the road has deteriorated significantly, and it is now riddled with numerous potholes of varying sizes. These potholes pose a serious hazard to both drivers and pedestrians. Every day, I witness vehicles swerving dangerously to avoid these craters, which increases the risk of accidents. The situation is particularly dire during the rainy season when the potholes fill with water, making them even more difficult to see and navigate. This not only damages vehicles but also endangers the lives of those who use the road. Despite multiple complaints to the local authorities, no action has been taken to repair the road. The lack of maintenance and timely repairs is unacceptable and reflects poorly on the administration's commitment to public safety. I urge the concerned authorities to prioritize the repair of this road to prevent any further accidents and ensure the safety of all residents."
summary = loaded_summarizer.summarize(test_text, num_sentences=2)
print("Test Summary:", summary)


Testing the saved model...
Test Summary: I am writing to express my deep frustration and concern regarding the deplorable condition of the main road in our neighborhood. The lack of maintenance and timely repairs is unacceptable and reflects poorly on the administration's commitment to public safety.
