In [1]:
import re

import joblib
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize
from rouge import Rouge
from scipy.sparse import vstack as sparse_vstack
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
class ExtractiveTextSummarizer:
    """Extractive summarizer that keeps the sentences with the highest TF-IDF weight.

    A document is split into sentences, each sentence is normalized
    (lowercased, non-letters removed, stop words dropped, lemmatized) and
    scored by the mean of its TF-IDF row. The best-scoring sentences are
    returned in the order in which they appear in the document.
    """

    def __init__(self):
        """Download the required NLTK resources and build the helper objects."""
        # Download required NLTK data
        nltk.download('punkt')
        nltk.download('stopwords')
        nltk.download('wordnet')

        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()
        self.vectorizer = TfidfVectorizer(stop_words='english')
        self.rouge = Rouge()

    def preprocess_text(self, text):
        """Normalize ``text`` into a space-separated string of lemmas.

        Args:
            text: Raw text. Non-string values (e.g. NaN/None coming from a
                DataFrame cell) are treated as empty text.

        Returns:
            The lowercased text with every character other than ASCII letters
            and whitespace removed, stop words dropped and the remaining
            tokens lemmatized.
        """
        if not isinstance(text, str):
            return ''
        # Convert to lowercase, then remove special characters and numbers
        text = re.sub(r'[^a-zA-Z\s]', '', text.lower())
        # Tokenize, remove stopwords and lemmatize
        return ' '.join(
            self.lemmatizer.lemmatize(word)
            for word in word_tokenize(text)
            if word not in self.stop_words
        )

    def fit_transform_vectorizer(self, texts, batch_size=1000):
        """Fit ``self.vectorizer`` on ``texts`` and return their TF-IDF matrix.

        The vectorizer is fitted exactly once on the whole preprocessed
        corpus so that every row shares one vocabulary; only the transform
        step is carried out in batches.

        Args:
            texts: Iterable of raw documents.
            batch_size: Number of documents transformed per batch (>= 1).

        Returns:
            A sparse CSR matrix with one row per document, or ``None`` when
            ``texts`` is empty.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1, or (raised by the
                vectorizer) if no vocabulary term survives preprocessing.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")
        processed = [self.preprocess_text(text) for text in texts]
        if not processed:
            return None

        # A single fit keeps the vocabulary (and therefore the column layout)
        # identical for every batch produced below.
        self.vectorizer.fit(processed)
        blocks = [
            self.vectorizer.transform(processed[start:start + batch_size])
            for start in range(0, len(processed), batch_size)
        ]
        # The blocks are scipy sparse matrices, which np.vstack cannot merge.
        return sparse_vstack(blocks, format='csr')

    def get_sentence_scores(self, text):
        """Score every sentence of ``text`` by its mean TF-IDF weight.

        The TF-IDF statistics are computed over the sentences of this one
        document, using a fresh vectorizer configured like ``self.vectorizer``
        so that a vectorizer fitted by ``train`` is left untouched.

        Args:
            text: The document to score.

        Returns:
            A list of ``(sentence, score)`` tuples in document order. Every
            score is ``0.0`` when no vocabulary term survives preprocessing.
        """
        sentences = sent_tokenize(text)
        if not sentences:
            return []

        processed_sentences = [self.preprocess_text(sentence) for sentence in sentences]
        scorer = TfidfVectorizer(**self.vectorizer.get_params())
        try:
            tfidf_matrix = scorer.fit_transform(processed_sentences)
        except ValueError:
            # Raised for an empty vocabulary, e.g. only stop words or digits.
            return [(sentence, 0.0) for sentence in sentences]

        # Mean over the vocabulary axis gives one score per sentence.
        row_means = np.asarray(tfidf_matrix.mean(axis=1)).ravel()
        return [(sentence, float(score)) for sentence, score in zip(sentences, row_means)]

    def summarize(self, text, num_sentences=3):
        """Return the ``num_sentences`` best-scoring sentences in document order.

        Args:
            text: The document to summarize. Empty, whitespace-only or
                non-string input yields an empty summary.
            num_sentences: Maximum number of sentences to keep.

        Returns:
            The selected sentences joined by single spaces, or ``""``.
        """
        if not isinstance(text, str) or not text.strip() or num_sentences <= 0:
            return ""

        sentence_scores = self.get_sentence_scores(text)

        # Rank by position rather than by sentence text so that a repeated
        # sentence cannot push the summary past num_sentences. The sort is
        # stable, so ties are resolved in favour of earlier sentences.
        ranked = sorted(
            range(len(sentence_scores)),
            key=lambda idx: sentence_scores[idx][1],
            reverse=True,
        )
        chosen = sorted(ranked[:num_sentences])
        return ' '.join(sentence_scores[idx][0] for idx in chosen)

    def train(self, train_data, val_data, batch_size=1000):
        """Fit the vectorizer on the training articles and print validation ROUGE.

        Args:
            train_data: Mapping/DataFrame with an ``'article'`` column.
            val_data: Mapping/DataFrame with ``'article'`` and ``'highlights'``
                columns.
            batch_size: Forwarded to ``fit_transform_vectorizer``.

        Returns:
            ``self``, to allow call chaining.
        """
        # Fit the vectorizer on the training data
        all_train_texts = train_data['article'].tolist()
        self.fit_transform_vectorizer(all_train_texts, batch_size=batch_size)

        # Evaluate on validation data
        val_summaries = [self.summarize(text, num_sentences=3) for text in val_data['article'].values]
        val_highlights = val_data['highlights'].tolist()
        # The ROUGE scorer rejects empty hypotheses/references, so only pairs
        # in which both sides contain text are scored.
        scored_pairs = [
            (summary, highlight)
            for summary, highlight in zip(val_summaries, val_highlights)
            if summary.strip() and isinstance(highlight, str) and highlight.strip()
        ]
        val_scores = None
        if scored_pairs:
            hypotheses = [summary for summary, _ in scored_pairs]
            references = [highlight for _, highlight in scored_pairs]
            val_scores = self.rouge.get_scores(hypotheses, references, avg=True)
        print("Validation ROUGE Scores:", val_scores)

        return self

    def save_model(self, filename):
        """Serialize this summarizer to ``filename`` with joblib."""
        joblib.dump(self, filename)

    @staticmethod
    def load_model(filename):
        """Load a summarizer previously written by ``save_model``.

        joblib files are pickle-based; only load files from a trusted source.
        """
        return joblib.load(filename)

In [None]:
# Fetch the NLTK resources needed for tokenizing, stop-word filtering and lemmatizing
for _nltk_package in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(_nltk_package)

# Module-level helpers shared by the stand-alone functions in this notebook
rouge = Rouge()
vectorizer = TfidfVectorizer(stop_words='english')
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\yakuma\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\yakuma\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\yakuma\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
def preprocess_text(text):
    """Normalize ``text`` into a space-separated string of lemmas.

    Args:
        text: Raw text. Non-string values (e.g. NaN/None coming from a
            DataFrame cell) are treated as empty text.

    Returns:
        The lowercased text with every character other than ASCII letters and
        whitespace removed, stop words dropped and the remaining tokens
        lemmatized.
    """
    if not isinstance(text, str):
        return ''
    # Convert to lowercase, then remove special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text.lower())
    # Tokenize, remove stopwords and lemmatize
    return ' '.join(
        lemmatizer.lemmatize(word)
        for word in word_tokenize(text)
        if word not in stop_words
    )

def fit_transform_vectorizer(texts, batch_size=1000):
    """Fit the shared ``vectorizer`` on ``texts`` and return their TF-IDF matrix.

    The vectorizer is fitted exactly once on the whole preprocessed corpus so
    that every row shares one vocabulary; only the transform step is carried
    out in batches.

    Args:
        texts: Iterable of raw documents.
        batch_size: Number of documents transformed per batch (>= 1).

    Returns:
        A sparse CSR matrix with one row per document, or ``None`` when
        ``texts`` is empty.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1, or (raised by the
            vectorizer) if no vocabulary term survives preprocessing.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")
    processed = [preprocess_text(text) for text in texts]
    if not processed:
        return None

    # A single fit keeps the vocabulary (and therefore the column layout)
    # identical for every batch produced below.
    vectorizer.fit(processed)
    blocks = [
        vectorizer.transform(processed[start:start + batch_size])
        for start in range(0, len(processed), batch_size)
    ]
    # The blocks are scipy sparse matrices, which np.vstack cannot merge.
    return sparse_vstack(blocks, format='csr')

def get_sentence_scores(text):
    """Score every sentence of ``text`` by its mean TF-IDF weight.

    The TF-IDF statistics are computed over the sentences of this one
    document, using a fresh vectorizer configured like the shared
    ``vectorizer`` so that a corpus-level fit of that object is left intact.

    Args:
        text: The document to score.

    Returns:
        A list of ``(sentence, score)`` tuples in document order. Every score
        is ``0.0`` when no vocabulary term survives preprocessing.
    """
    sentences = sent_tokenize(text)
    if not sentences:
        return []

    # Preprocess each sentence once
    processed_sentences = [preprocess_text(sentence) for sentence in sentences]
    scorer = TfidfVectorizer(**vectorizer.get_params())
    try:
        tfidf_matrix = scorer.fit_transform(processed_sentences)
    except ValueError:
        # Raised for an empty vocabulary, e.g. only stop words or digits.
        return [(sentence, 0.0) for sentence in sentences]

    # Mean over the vocabulary axis gives one score per sentence.
    row_means = np.asarray(tfidf_matrix.mean(axis=1)).ravel()
    return [(sentence, float(score)) for sentence, score in zip(sentences, row_means)]

def summarize(text, num_sentences=3):
    """Return the ``num_sentences`` best-scoring sentences in document order.

    Args:
        text: The document to summarize. Empty, whitespace-only or non-string
            input yields an empty summary.
        num_sentences: Maximum number of sentences to keep.

    Returns:
        The selected sentences joined by single spaces, or ``""``.
    """
    if not isinstance(text, str) or not text.strip() or num_sentences <= 0:
        return ""

    # Sentence/score pairs arrive in document order
    sentence_scores = get_sentence_scores(text)

    # Rank by position rather than by sentence text so that a repeated
    # sentence cannot push the summary past num_sentences. The sort is
    # stable, so ties are resolved in favour of earlier sentences.
    ranked = sorted(
        range(len(sentence_scores)),
        key=lambda idx: sentence_scores[idx][1],
        reverse=True,
    )
    # Restore the original sentence order among the selected positions
    chosen = sorted(ranked[:num_sentences])
    return ' '.join(sentence_scores[idx][0] for idx in chosen)

In [5]:
# Read the train / validation / test splits from CSV files in the working directory
train_df, val_df, test_df = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'validation.csv', 'test.csv')
)

In [6]:
# Fit the shared vectorizer on every article of the training split
all_train_texts = train_df['article'].tolist()
fit_transform_vectorizer(texts=all_train_texts, batch_size=1000)

NameError: name 'fit_transform_vectorizer' is not defined

In [7]:
# Evaluate on validation data
val_summaries = [summarize(text, num_sentences=3) for text in val_df['article'].values]
val_highlights = val_df['highlights'].tolist()

# The ROUGE scorer rejects empty hypotheses/references, so only pairs in which
# both the generated summary and the reference highlight contain text are scored.
scored_pairs = [
    (summary, highlight)
    for summary, highlight in zip(val_summaries, val_highlights)
    if summary.strip() and isinstance(highlight, str) and highlight.strip()
]
val_scores = (
    rouge.get_scores(
        [summary for summary, _ in scored_pairs],
        [highlight for _, highlight in scored_pairs],
        avg=True,
    )
    if scored_pairs
    else None
)
print("Validation ROUGE Scores:", val_scores)

Validation ROUGE Scores: {'rouge-1': {'r': 0.3999497644136857, 'p': 0.22068170337896695, 'f': 0.27617508641236926}, 'rouge-2': {'r': 0.1376090268116563, 'p': 0.07204719465355827, 'f': 0.09109734677975852}, 'rouge-l': {'r': 0.3615805745357685, 'p': 0.1996810625084078, 'f': 0.24978781526751093}}


In [8]:
# Persist the `summarize` entry point to disk with joblib.
# NOTE(review): a plain function is pickled as a reference to its name, not as
# its code or the state it uses, so `summarize` presumably has to be defined
# again in the session that loads this file — confirm this is intended.
joblib.dump(value=summarize, filename='text_summarizer_v3.joblib')

['text_summarizer_v3.joblib']

In [5]:
# Preview the first five rows of the test split
test_df.head(n=5)

Unnamed: 0,id,article,highlights
0,92c514c913c0bdfe25341af9fd72b29db544099b,Ever noticed how plane seats appear to be gett...,Experts question if packed out planes are put...
1,2003841c7dc0e7c5b1a248f9cd536d727f27a45a,A drunk teenage boy had to be rescued by secur...,Drunk teenage boy climbed into lion enclosure ...
2,91b7d2311527f5c2b63a65ca98d21d9c92485149,Dougie Freedman is on the verge of agreeing a ...,Nottingham Forest are close to extending Dougi...
3,caabf9cbdf96eb1410295a673e953d304391bfbb,Liverpool target Neto is also wanted by PSG an...,Fiorentina goalkeeper Neto has been linked wit...
4,3da746a7d9afcaa659088c8366ef6347fe6b53ea,Bruce Jenner will break his silence in a two-h...,"Tell-all interview with the reality TV star, 6..."


In [6]:

# Build the class-based summarizer, fit it on the training split and
# print the ROUGE scores obtained on the validation split
summarizer = ExtractiveTextSummarizer()
summarizer.train(train_data=train_df, val_data=val_df, batch_size=1000)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\yakuma\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\yakuma\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\yakuma\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Validation ROUGE Scores: {'rouge-1': {'r': 0.3999497644136857, 'p': 0.22068170337896695, 'f': 0.27617508641236926}, 'rouge-2': {'r': 0.1376090268116563, 'p': 0.07204719465355827, 'f': 0.09109734677975852}, 'rouge-l': {'r': 0.3615805745357685, 'p': 0.1996810625084078, 'f': 0.24978781526751093}}


<__main__.ExtractiveTextSummarizer at 0x23cbc7c3040>

In [7]:
# Serialize the trained summarizer instance with joblib
joblib.dump(value=summarizer, filename='text_summarizerv2.joblib')

['text_summarizerv2.joblib']

In [9]:
# Load the object stored in 'text_summarizer_v3.joblib'; the example cell
# below invokes it directly as a callable.
# NOTE(review): this rebinds `summarizer`, which earlier held an
# ExtractiveTextSummarizer instance. joblib files are pickle-based, so only
# load files from a trusted source.
summarizer = joblib.load(filename='text_summarizer_v3.joblib')

In [12]:
# Example: summarize a short text and print the result next to a reference summary
test_text = 'Potholes are not a big issues. But it is. Potholes are big issue.'
summary = summarizer(test_text, num_sentences=2)
print("Summary:", summary)
print(
    "Original Summary:",
    'The main road in our neighborhood is full of dangerous potholes, posing serious risks to drivers and pedestrians, and despite multiple complaints, no repairs have been made.',
)



Summary: Potholes are not a big issues. Potholes are big issue.
Original Summary: The main road in our neighborhood is full of dangerous potholes, posing serious risks to drivers and pedestrians, and despite multiple complaints, no repairs have been made.
