In [1]:
import pandas as pd
import numpy as np
import regex as re
import nltk
from utils import cleaned_list_of_sentences
from datasets import load_dataset
import re
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from utils import calculate_scores, sum_metrices

  from .autonotebook import tqdm as notebook_tqdm


# Load data

In [2]:
# Download (or read from the local cache) CNN/DailyMail v3.0.0 and turn every
# split into a DataFrame without its 'id' column.
cnn_daily_dataset = load_dataset('cnn_dailymail', '3.0.0')

train_df, val_df, test_df = (
    pd.DataFrame.from_dict(cnn_daily_dataset[split_name]).drop(columns='id')
    for split_name in ('train', 'validation', 'test')
)

# One combined frame, stacked in train -> test -> validation order with a fresh
# 0..n-1 index.
df = pd.concat([train_df, test_df, val_df], ignore_index=True)

In [3]:
# Preview the first rows of the combined frame (columns: article, highlights).
df.head()

Unnamed: 0,article,highlights
0,"LONDON, England (Reuters) -- Harry Potter star...",Harry Potter star Daniel Radcliffe gets £20M f...
1,Editor's note: In our Behind the Scenes series...,Mentally ill inmates in Miami are housed on th...
2,"MINNEAPOLIS, Minnesota (CNN) -- Drivers who we...","NEW: ""I thought I was going to die,"" driver sa..."
3,WASHINGTON (CNN) -- Doctors removed five small...,"Five small polyps found during procedure; ""non..."
4,(CNN) -- The National Football League has ind...,"NEW: NFL chief, Atlanta Falcons owner critical..."


In [None]:
# Print the raw text of the article stored under row label 0, to see what the
# unprocessed input looks like before any cleaning.
print(df['article'][0])

LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won't cast a spell on him. Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. "I don't plan to be one of those people who, as soon as they turn 18, suddenly buy themselves a massive sports car collection or something similar," he told an Australian interviewer earlier this month. "I don't think I'll be particularly extravagant. "The things I like buying are things that cost about 10 pounds -- books and CDs and DVDs." At 18, Radcliffe will be able to gamble in a casino, buy a drink in a pub or see the horror film "Hostel: Part II," currently six places below his number one movie on the UK box office chart. Details of how

# Preprocess data

In [11]:
class TextPreprocessor:
    """
    Lowercase a DataFrame and turn its 'article' column into stemmed tokens.

    After ``preprocess()`` every cell of 'article' is a list of sentences, and
    each sentence is a list of stemmed, stopword-free, alphanumeric tokens.
    All other string cells are only lowercased.
    """

    # Any run of characters that is NOT an ASCII letter, digit, dot or whitespace.
    _DISALLOWED_CHARS = re.compile(r'[^A-Za-z0-9.\s]+')

    def __init__(self, dataframe):
        """
        Initialize the TextPreprocessor with a DataFrame.

        Args:
            dataframe: frame that contains at least an 'article' column of
                strings. It is not modified; ``preprocess()`` works on a new
                frame.
        """
        self.dataframe = dataframe
        # NOTE(review): stopwords.words() presumably needs the NLTK 'stopwords'
        # corpus to be installed; no download step is visible in this notebook.
        self.stop_words = set(stopwords.words('english'))
        self.stemmer = PorterStemmer()

    def preprocess(self):
        """
        Apply all preprocessing steps to the DataFrame.

        Steps, in order: lowercase every string cell, strip disallowed
        characters from 'article', then tokenize/stem 'article'.

        Returns:
            The new, processed DataFrame (also stored on ``self.dataframe``).
        """
        frame = self._lowercase_strings(self.dataframe)
        frame['article'] = frame['article'].apply(self._clean_text)
        frame['article'] = frame['article'].apply(self._cleaned_list_of_sentences)
        self.dataframe = frame
        return self.dataframe

    @staticmethod
    def _lowercase_strings(dataframe):
        """
        Return a new DataFrame in which every ``str`` cell is lowercased.

        Uses a column-wise ``Series.map`` instead of ``DataFrame.applymap``,
        which is deprecated in recent pandas releases. Non-string cells are
        passed through unchanged.
        """
        def lower_if_str(value):
            return value.lower() if isinstance(value, str) else value

        return dataframe.apply(lambda column: column.map(lower_if_str))

    def _clean_text(self, text):
        """
        Remove every character that is not an ASCII letter, digit, dot or
        whitespace. Dots are kept so sentence boundaries survive for
        ``sent_tokenize``.
        """
        return self._DISALLOWED_CHARS.sub('', text)

    def _cleaned_list_of_sentences(self, text):
        """
        Tokenize sentences, remove stopwords, and apply stemming.

        Returns:
            A list with one entry per sentence; each entry is the list of
            stemmed tokens that are alphanumeric and not in ``self.stop_words``.
        """
        cleaned_sentences = []
        for sentence in sent_tokenize(text):
            cleaned_sentences.append([
                self.stemmer.stem(word)
                for word in word_tokenize(sentence)
                # isalnum() drops leftover punctuation tokens such as '.'
                if word.isalnum() and word not in self.stop_words
            ])
        return cleaned_sentences


class SummaryGenerator:
    """
    Extractive summarizer that scores sentences by how many distinct words
    they share with the other sentences.

    Call order: ``create_matrix()`` -> ``rank_sentences()`` ->
    ``produce_summary(k)``.
    """

    def __init__(self, sentences):
        """
        Initialize the SummaryGenerator with a list of tokenized sentences.

        Args:
            sentences: list of sentences, each one a list of word tokens.
        """
        self.sentences = sentences
        self.matrix = None
        self.ranked_sentences = None

    def create_matrix(self):
        """
        Create a similarity matrix based on common words between sentences.

        Entry (i, j) is the number of distinct words that sentences i and j
        share; the matrix is symmetric and its diagonal stays 0.

        Returns:
            The n x n numpy array (also stored on ``self.matrix``).
        """
        # Build each sentence's word set once rather than once per pair.
        vocabularies = [set(sentence) for sentence in self.sentences]
        n = len(vocabularies)
        self.matrix = np.zeros((n, n))
        for i in range(n):
            for j in range(i + 1, n):
                shared = len(vocabularies[i] & vocabularies[j])
                self.matrix[i, j] = self.matrix[j, i] = shared
        return self.matrix

    def rank_sentences(self):
        """
        Rank sentences based on the similarity matrix.

        A sentence's score is the sum of its matrix row. Ties keep their
        original sentence order because the sort is stable.

        Returns:
            List of (sentence_index, score) pairs, highest score first.

        Raises:
            ValueError: if ``create_matrix()`` has not been called yet.
        """
        if self.matrix is None:
            raise ValueError("Matrix not created. Call create_matrix() first.")
        scores = self.matrix.sum(axis=1)
        self.ranked_sentences = sorted(
            enumerate(scores), key=lambda pair: pair[1], reverse=True
        )
        return self.ranked_sentences

    def produce_summary(self, summary_threshold):
        """
        Produce a summary based on the ranked sentences.

        Args:
            summary_threshold: how many top-ranked sentences to keep.

        Returns:
            The selected sentences, restored to document order, with tokens
            and sentences joined by single spaces.

        Raises:
            ValueError: if ``rank_sentences()`` has not been called yet.
        """
        if self.ranked_sentences is None:
            raise ValueError("Sentences not ranked. Call rank_sentences() first.")
        top_indices = [index for index, _ in self.ranked_sentences[:summary_threshold]]
        # sorted() puts the chosen sentences back in their original order.
        return " ".join(" ".join(self.sentences[i]) for i in sorted(top_indices))


In [17]:
# Work on only the first five training rows.
# NOTE: this rebinds `df`, which until now held the concatenated
# train/test/validation frame built in the loading cell.
df = train_df[:5]

In [18]:
# Step 1: Preprocess the articles using TextPreprocessor
preprocessor = TextPreprocessor(df)
df = preprocessor.preprocess()


# Step 2: Generate summaries using SummaryGenerator
def summarize_article(tokenized_sentences, n_sentences=2):
    """
    Build an extractive summary for one preprocessed article.

    Args:
        tokenized_sentences: list of token lists, as produced by
            TextPreprocessor for one 'article' cell.
        n_sentences: number of top-ranked sentences to keep.

    Returns:
        The summary string returned by SummaryGenerator.produce_summary.
    """
    summarizer = SummaryGenerator(tokenized_sentences)
    summarizer.create_matrix()  # Create the similarity matrix
    summarizer.rank_sentences()  # Rank sentences based on the matrix
    return summarizer.produce_summary(summary_threshold=n_sentences)


# One summary per article (top-2 sentences by default), assigned as a whole
# column instead of writing row by row inside an iterrows() loop.
df['summary'] = df['article'].apply(summarize_article)

# Evaluate solution

In [None]:
# `rouge_scorer` was used in this cell without being imported anywhere in the
# notebook, so the cell raised NameError on a fresh kernel.
from rouge_score import rouge_scorer

scorer = rouge_scorer.RougeScorer(
    ['rouge1', 'rouge2', 'rougeL', 'rougeLsum'],
    use_stemmer=True
)

# RougeScorer.score expects (target, prediction): the reference highlights come
# first and the generated summary second. Passing them the other way round
# swaps precision and recall.
# Each result is a dict keyed by ROUGE type whose values carry
# precision / recall / fmeasure.
all_scores = [
    scorer.score(reference, candidate)
    for reference, candidate in zip(df['highlights'], df['summary'])
]

# Attach these raw scores back to the DataFrame as a new column
# NOTE(review): the later calculate_scores(df, 'summary', 'highlights') cell
# appears to produce the same 'rouge_scores' column — confirm whether both
# cells are needed.
df['rouge_scores'] = all_scores

In [16]:
# Display the working frame.
# NOTE(review): this cell's execution count (16) is lower than the In [17] /
# In [18] cells above it, so the saved output appears to come from an earlier
# run — re-run or remove before sharing.
df

Unnamed: 0,article,highlights,summary
0,"[[london, england, reuter, harri, potter, star...","[[harri, potter, star, daniel, radcliff, get, ...",london england reuter harri potter star daniel...
1,"[[editor, note, behind, scene, seri, cnn, corr...","[[mental, ill, inmat, miami, hous, forgotten, ...",soledad obrien take user insid jail mani inmat...
2,"[[minneapoli, minnesota, cnn, driver, minneapo...","[[new, thought, go, die, driver, say], [man, s...",whole bridg one side mississippi complet gave ...
3,"[[washington, cnn, doctor, remov, five, small,...","[[five, small, polyp, found, procedur, none, w...",washington cnn doctor remov five small polyp p...
4,"[[cnn, nation, footbal, leagu, indefinit, susp...","[[new, nfl, chief, atlanta, falcon, owner, cri...",vick said would plead guilti one count conspir...


In [19]:
# Show the frame after preprocessing and summary generation
# (columns: article, highlights, summary).
df

Unnamed: 0,article,highlights,summary
0,"[[london, england, reuter, harri, potter, star...",harry potter star daniel radcliffe gets £20m f...,london england reuter harri potter star daniel...
1,"[[editor, note, behind, scene, seri, cnn, corr...",mentally ill inmates in miami are housed on th...,soledad obrien take user insid jail mani inmat...
2,"[[minneapoli, minnesota, cnn, driver, minneapo...","new: ""i thought i was going to die,"" driver sa...",whole bridg one side mississippi complet gave ...
3,"[[washington, cnn, doctor, remov, five, small,...","five small polyps found during procedure; ""non...",washington cnn doctor remov five small polyp p...
4,"[[cnn, nation, footbal, leagu, indefinit, susp...","new: nfl chief, atlanta falcons owner critical...",vick said would plead guilti one count conspir...


In [20]:
# Score each generated summary against its reference highlights.
# NOTE(review): calculate_scores comes from the local utils module; judging by
# the output below it returns the frame with an added 'rouge_scores' column —
# confirm the argument order (candidate column, reference column) against utils.
df = calculate_scores(df, 'summary', 'highlights')

In [21]:
# Inspect the frame returned by calculate_scores, including the per-row ROUGE scores.
df

Unnamed: 0,article,highlights,summary,rouge_scores
0,"[[london, england, reuter, harri, potter, star...",harry potter star daniel radcliffe gets £20m f...,london england reuter harri potter star daniel...,"{'rouge1': (0.46153846153846156, 0.375, 0.4137..."
1,"[[editor, note, behind, scene, seri, cnn, corr...",mentally ill inmates in miami are housed on th...,soledad obrien take user insid jail mani inmat...,"{'rouge1': (0.22448979591836735, 0.29729729729..."
2,"[[minneapoli, minnesota, cnn, driver, minneapo...","new: ""i thought i was going to die,"" driver sa...",whole bridg one side mississippi complet gave ...,"{'rouge1': (0.07317073170731707, 0.13043478260..."
3,"[[washington, cnn, doctor, remov, five, small,...","five small polyps found during procedure; ""non...",washington cnn doctor remov five small polyp p...,"{'rouge1': (0.4166666666666667, 0.370370370370..."
4,"[[cnn, nation, footbal, leagu, indefinit, susp...","new: nfl chief, atlanta falcons owner critical...",vick said would plead guilti one count conspir...,"{'rouge1': (0.12195121951219512, 0.10204081632..."


In [22]:
# Aggregate the per-row ROUGE results into one value per metric.
# NOTE(review): sum_metrices is defined in the local utils module; the variable
# name suggests it returns means rather than sums — verify against utils.
mean_scores = sum_metrices(df, 'rouge_scores')

In [23]:
# Display the aggregated precision / recall / fmeasure for each ROUGE variant.
mean_scores

{'rouge1': {'precision': 0.2595633750686016,
  'recall': 0.2550286533205788,
  'fmeasure': 0.2533250061585714},
 'rouge2': {'precision': 0.09512204424103739,
  'recall': 0.0877636843062375,
  'fmeasure': 0.09043581068791151},
 'rougeL': {'precision': 0.20576125384487756,
  'recall': 0.19880537147617888,
  'fmeasure': 0.199350952088935},
 'rougeLsum': {'precision': 0.24139385585378617,
  'recall': 0.2359695431900401,
  'fmeasure': 0.2349805349833063}}