In [14]:
import pandas as pd
import numpy as np
import regex as re
import nltk
from datasets import load_dataset
import re
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from utils import calculate_scores, sum_metrices

# Load data

In [15]:
# All together
cnn_daily_dataset = load_dataset('cnn_dailymail', '3.0.0')

train_df = pd.DataFrame.from_dict(cnn_daily_dataset['train']).drop(columns='id')
val_df = pd.DataFrame.from_dict(cnn_daily_dataset['validation']).drop(columns='id')
test_df = pd.DataFrame.from_dict(cnn_daily_dataset['test']).drop(columns='id')

#df = pd.concat([train_df,test_df, val_df], ignore_index=True)

In [3]:
df.head()

Unnamed: 0,article,highlights
0,"LONDON, England (Reuters) -- Harry Potter star...",Harry Potter star Daniel Radcliffe gets £20M f...
1,Editor's note: In our Behind the Scenes series...,Mentally ill inmates in Miami are housed on th...
2,"MINNEAPOLIS, Minnesota (CNN) -- Drivers who we...","NEW: ""I thought I was going to die,"" driver sa..."
3,WASHINGTON (CNN) -- Doctors removed five small...,"Five small polyps found during procedure; ""non..."
4,(CNN) -- The National Football League has ind...,"NEW: NFL chief, Atlanta Falcons owner critical..."


In [4]:
# Example of first article
print(df['article'][0])

LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won't cast a spell on him. Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. "I don't plan to be one of those people who, as soon as they turn 18, suddenly buy themselves a massive sports car collection or something similar," he told an Australian interviewer earlier this month. "I don't think I'll be particularly extravagant. "The things I like buying are things that cost about 10 pounds -- books and CDs and DVDs." At 18, Radcliffe will be able to gamble in a casino, buy a drink in a pub or see the horror film "Hostel: Part II," currently six places below his number one movie on the UK box office chart. Details of how

# Preprocess data

In [3]:
class TextPreprocessor:
    def __init__(self, dataframe):
        self.dataframe = dataframe
        self.stop_words = set(stopwords.words('english'))
        self.stemmer = PorterStemmer()

    def preprocess(self):
        self.dataframe = self.dataframe.applymap(lambda x: x.lower() if isinstance(x, str) else x)
        self.dataframe['article'] = self.dataframe['article'].apply(self._clean_text)
        self.dataframe['article'] = self.dataframe['article'].apply(self._cleaned_list_of_sentences)

        return self.dataframe

    def _clean_text(self, text):
        """
        Remove non-alphabetic, non-digit, and non-dot characters from text.
        """
        pattern = r'[^A-Za-z0-9.\s]+'

        return re.sub(pattern, '', text)

    def _cleaned_list_of_sentences(self, text):
        """
        Tokenize sentences, remove stopwords, and apply stemming.
        """
        sentences = sent_tokenize(text)
        cleaned_sentences = []
        for sentence in sentences:
            words = word_tokenize(sentence)
            cleaned_words = [
                self.stemmer.stem(word) for word in words if word.isalnum() and word not in self.stop_words
            ]
            cleaned_sentences.append(cleaned_words)

        return cleaned_sentences


class SummaryGenerator:
    def __init__(self, sentences):
        self.sentences = sentences
        self.matrix = None
        self.ranked_sentences = None

    def create_matrix(self):
        """
        Create a similarity matrix based on common words between sentences.
        """
        n = len(self.sentences)
        self.matrix = np.zeros((n, n))
        for i in range(n):
            for j in range(i + 1, n):
                common_words = set(self.sentences[i]) & set(self.sentences[j])
                self.matrix[i][j] = self.matrix[j][i] = len(common_words)

        return self.matrix

    def rank_sentences(self):
        """
        Rank sentences based on the similarity matrix.
        """
        ranking_vector = self.matrix.sum(axis=1)
        self.ranked_sentences = sorted(enumerate(ranking_vector), key=lambda x: x[1], reverse=True)

        return self.ranked_sentences

    def produce_summary(self, summary_threshold):
        """
        Produce a summary based on the ranked sentences.
        """
        summary_indices = [index for index, _ in self.ranked_sentences[:summary_threshold]]
        summary = " ".join([" ".join(self.sentences[i]) for i in sorted(summary_indices)])

        return summary


In [32]:
df = test_df

In [33]:
# 1. Preprocess the articles 
preprocessor = TextPreprocessor(df)
df = preprocessor.preprocess()

# Generate summaries 
df['summary'] = None  

for index, row in df.iterrows():
    tokenized_sentences = row['article']  
    summarizer = SummaryGenerator(tokenized_sentences) 
    summarizer.create_matrix()  
    summarizer.rank_sentences()  
    summary = summarizer.produce_summary(summary_threshold=3)  
    df.loc[index, 'summary'] = summary  

# Evaluate solution

In [25]:
df = calculate_scores(df, 'summary', 'highlights')

In [26]:
df.head()

Unnamed: 0,article,highlights,summary,rouge_scores
0,"[[cnnthe, palestinian, author, offici, becam, ...",membership gives the icc jurisdiction over all...,palestinian sign icc found rome statut januari...,"{'rouge1': (0.4117647058823529, 0.368421052631..."
1,"[[cnnnever, mind, cat, nine, live], [stray, po...","theia, a bully breed mix, was apparently hit b...",veterinari hospit good samaritan fund committe...,"{'rouge1': (0.06976744186046512, 0.13636363636..."
2,"[[cnnif, youv, follow, news, late, certain, th...",mohammad javad zarif has spent more time with ...,anoth way say outsid countri demonstr shah ira...,"{'rouge1': (0.11428571428571428, 0.1, 0.106666..."
3,"[[cnnfive, american, monitor, three, week, oma...",17 americans were exposed to the ebola virus w...,cnnfive american monitor three week omaha nebr...,"{'rouge1': (0.09523809523809523, 0.21052631578..."
4,"[[cnna, duke, student, admit, hang, noos, made...",student is no longer on duke university campus...,cnna duke student admit hang noos made rope tr...,"{'rouge1': (0.19047619047619047, 0.5, 0.275862..."


In [27]:
mean_scores = sum_metrices(df, 'rouge_scores', results_folder='Results', file_name='Multi_edges_graph_1.csv')

In [28]:
mean_scores

{'rouge1': {'precision': {'mean': 0.14901270748928494,
   'std': 0.10140257309110637},
  'recall': {'mean': 0.22866884813650046, 'std': 0.1490334420598135},
  'fmeasure': {'mean': 0.1726083445431439, 'std': 0.1094986335156275}},
 'rouge2': {'precision': {'mean': 0.034803296419381326,
   'std': 0.05122864909744376},
  'recall': {'mean': 0.053524391738245414, 'std': 0.07698983362898121},
  'fmeasure': {'mean': 0.040419390951128904, 'std': 0.05803994756892757}},
 'rougeL': {'precision': {'mean': 0.115656084041827,
   'std': 0.08357053086786688},
  'recall': {'mean': 0.1786383828705406, 'std': 0.12569986428318355},
  'fmeasure': {'mean': 0.13436040410253372, 'std': 0.0918524154004193}},
 'rougeLsum': {'precision': {'mean': 0.1287863784838933,
   'std': 0.09201604623071462},
  'recall': {'mean': 0.1987300399604231, 'std': 0.13830964888462138},
  'fmeasure': {'mean': 0.14957387398520297, 'std': 0.10101652971880591}}}