In [None]:
import pandas as pd
import numpy as np
import regex as re
import nltk
from datasets import load_dataset
import re
import numpy as np
import os
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from utils import calculate_scores, sum_metrices

# Load data

In [2]:
# Download the CNN/DailyMail 3.0.0 dataset and turn each split into a DataFrame
# without its 'id' column.
cnn_daily_dataset = load_dataset('cnn_dailymail', '3.0.0')

train_df, val_df, test_df = (
    pd.DataFrame.from_dict(cnn_daily_dataset[split_name]).drop(columns='id')
    for split_name in ('train', 'validation', 'test')
)

# Note: a single frame combining all three splits (pd.concat) is deliberately not
# built here; `df` is assigned further down from a slice of `test_df`.

In [None]:
# `df` does not exist yet at this point of a fresh top-to-bottom run (it is
# first assigned in a later cell), so preview the test split that it is built from.
test_df.head()

In [None]:
# Example: full text of the first article in the test split.
# (`df` is not defined yet on a fresh kernel; `.iloc[0]` picks the first row by position.)
print(test_df['article'].iloc[0])

# Preprocess data

In [3]:
class TextPreprocessor:
    """Lower-case a DataFrame and tokenize its 'article' column.

    After `preprocess()` every string cell of the frame is lower-cased and each
    'article' value is replaced by a list of sentences, where a sentence is a
    list of stemmed, stop-word-free tokens.
    """

    def __init__(self, dataframe):
        """
        Args:
            dataframe: pandas DataFrame that contains an 'article' column of strings.
        """
        self.dataframe = dataframe
        self.stop_words = set(stopwords.words('english'))
        self.stemmer = PorterStemmer()

    def preprocess(self):
        """
        Run the whole pipeline and return the processed DataFrame.

        The frame passed to the constructor is not modified: the element-wise
        lower-casing step produces a new frame, which is stored on
        `self.dataframe` and returned.
        """
        # DataFrame.applymap was deprecated in pandas 2.1 in favour of DataFrame.map;
        # pick whichever this pandas version offers so no FutureWarning is raised.
        # The check is made on the class so a column named 'map' cannot interfere.
        if hasattr(pd.DataFrame, 'map'):
            elementwise = pd.DataFrame.map
        else:
            elementwise = pd.DataFrame.applymap

        # Lower-case every string cell (all columns); other values are left untouched.
        self.dataframe = elementwise(
            self.dataframe, lambda x: x.lower() if isinstance(x, str) else x
        )
        self.dataframe['article'] = self.dataframe['article'].apply(self._clean_text)
        self.dataframe['article'] = self.dataframe['article'].apply(self._cleaned_list_of_sentences)

        return self.dataframe

    def _clean_text(self, text):
        """
        Remove every character that is not an ASCII letter, a digit, a dot or
        whitespace. Dots are kept so sentences can still be split afterwards.
        """
        pattern = r'[^A-Za-z0-9.\s]+'

        return re.sub(pattern, '', text)

    def _cleaned_list_of_sentences(self, text):
        """
        Tokenize sentences, remove stopwords, and apply stemming.

        Returns:
            A list with one entry per sentence; each entry is the list of stemmed
            tokens of that sentence. Tokens that are not purely alphanumeric or
            that are English stop words are dropped (the stop-word check happens
            before stemming).
        """
        sentences = sent_tokenize(text)
        cleaned_sentences = []
        for sentence in sentences:
            words = word_tokenize(sentence)
            cleaned_words = [
                self.stemmer.stem(word) for word in words if word.isalnum() and word not in self.stop_words
            ]
            cleaned_sentences.append(cleaned_words)

        return cleaned_sentences


class SummaryGenerator:
    """Extractive summarizer that scores sentences by shared vocabulary.

    Each sentence is a list of tokens. Two sentences are connected with a weight
    equal to the number of distinct tokens they share; a sentence's score is the
    sum of its weights, and the summary is made of the top-scoring sentences.
    """

    def __init__(self, sentences):
        """
        Args:
            sentences: list of sentences, each one a list of token strings.
        """
        self.sentences = sentences
        self.matrix = None            # filled by create_matrix()
        self.ranked_sentences = None  # filled by rank_sentences()

    def create_matrix(self):
        """
        Create a similarity matrix based on common words between sentences.

        Returns:
            An (n, n) symmetric numpy array with a zero diagonal, where entry
            [i][j] is the number of distinct tokens sentences i and j share.
        """
        # Build each sentence's token set once instead of once per pair.
        word_sets = [set(sentence) for sentence in self.sentences]
        n = len(word_sets)
        self.matrix = np.zeros((n, n))
        for i in range(n):
            for j in range(i + 1, n):
                self.matrix[i][j] = self.matrix[j][i] = len(word_sets[i] & word_sets[j])

        return self.matrix

    def rank_sentences(self):
        """
        Rank sentences based on the similarity matrix.

        The matrix is built first if `create_matrix()` has not been called yet.

        Returns:
            A list of (sentence_index, score) pairs sorted by descending score;
            sentences with equal scores keep their original relative order.
        """
        if self.matrix is None:
            self.create_matrix()
        ranking_vector = self.matrix.sum(axis=1)
        self.ranked_sentences = sorted(enumerate(ranking_vector), key=lambda x: x[1], reverse=True)

        return self.ranked_sentences

    def produce_summary(self, summary_threshold):
        """
        Produce a summary based on the ranked sentences.

        The ranking is computed first if `rank_sentences()` has not been called yet.

        Args:
            summary_threshold: number of top-ranked sentences to keep.

        Returns:
            The selected sentences in their original document order, with tokens
            and sentences joined by single spaces.
        """
        if self.ranked_sentences is None:
            self.rank_sentences()
        summary_indices = [index for index, _ in self.ranked_sentences[:summary_threshold]]
        # sorted() restores document order among the selected sentences.
        summary = " ".join([" ".join(self.sentences[i]) for i in sorted(summary_indices)])

        return summary


In [5]:
# Number of leading test-set rows used for the experiment.
N_ARTICLES = 1000
# .copy() makes the subset independent of test_df, so later column edits
# on `df` can neither alias test_df nor trigger SettingWithCopyWarning.
df = test_df.iloc[:N_ARTICLES].copy()

In [6]:
# Number of top-ranked sentences kept in each generated summary.
SUMMARY_SENTENCES = 3


def summarize_article(tokenized_sentences, n_sentences=SUMMARY_SENTENCES):
    """Return the extractive summary of one article given as a list of token lists."""
    summarizer = SummaryGenerator(tokenized_sentences)
    summarizer.create_matrix()
    summarizer.rank_sentences()
    return summarizer.produce_summary(summary_threshold=n_sentences)


# 1. Preprocess the articles
# NOTE: this overwrites `df` with the tokenized frame, so re-run the cell that
# selects the subset before running this cell a second time.
preprocessor = TextPreprocessor(df)
df = preprocessor.preprocess()

# 2. Generate one summary per article (column-wise apply instead of iterrows + .loc)
df['summary'] = df['article'].apply(summarize_article)

# Evaluate solution

In [7]:
# Score each generated summary against its reference highlights.
# NOTE(review): calculate_scores comes from utils; presumably it adds the
# 'rouge_scores' column that is passed to sum_metrices later — confirm in utils.
df = calculate_scores(df, 'summary', 'highlights')

# Persist the per-article results; exist_ok avoids the check-then-create race
# and makes the cell safe to re-run.
directory = 'Results_df'
os.makedirs(directory, exist_ok=True)
df.to_csv(os.path.join(directory, 'Multi_graph.csv'))

In [None]:
# Show the first five scored rows.
df.head(5)

In [27]:
# Aggregate the per-article 'rouge_scores' column.
# NOTE(review): sum_metrices comes from utils; it presumably also writes the
# aggregate to <results_folder>/<file_name> — confirm in utils.
mean_scores = sum_metrices(
    df,
    'rouge_scores',
    results_folder='Results',
    file_name='Multi_edges_graph_1.csv',
)

In [None]:
# Display the aggregated scores returned by sum_metrices.
mean_scores