In [26]:
import os
from collections import Counter
from math import log, sqrt

import networkx as nx
import nltk
import numpy as np
import pandas as pd
from datasets import load_dataset
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer

from utils import calculate_scores, sum_metrices

# Load data

In [22]:
# Fetch CNN/DailyMail (config '3.0.0') and turn each split into a DataFrame,
# dropping the 'id' column from every split.
cnn_daily_dataset = load_dataset('cnn_dailymail', '3.0.0')

train_df, val_df, test_df = (
    pd.DataFrame.from_dict(cnn_daily_dataset[split_name]).drop(columns='id')
    for split_name in ('train', 'validation', 'test')
)

# Optional: evaluate on all splits at once instead of a single one.
#df = pd.concat([train_df,test_df, val_df], ignore_index=True)

# Modified TextRank model

In [23]:
class TextRankSummarizer:
    """
    Extractive summarizer: a TextRank variant over an ISF-weighted sentence graph.

    Pipeline (see ``summarize``):
      1. split the document into sentences and preprocess each one,
      2. compute an inverse sentence frequency (ISF) for every word,
      3. connect sentences whose TF*ISF cosine similarity is positive,
      4. drop edges lighter than the average edge weight,
      5. seed each node with the mean weight of its remaining edges,
      6. iterate a PageRank-style update and return the top ``n`` sentences
         in their original document order.
    """

    def __init__(self, document, n):
        """
        Initialize the TextRankSummarizer with a document and the number of sentences for the summary.

        Args:
            document: Raw text to summarize.
            n: Maximum number of sentences to put in the summary.
        """
        self.document = document
        self.n = n
        self.stop_words = set(stopwords.words('english'))
        self.sentences = nltk.sent_tokenize(document)
        self.processed_sentences = self._preprocess_sentences()
        self.isf_dict = self._compute_isf()
        self.graph = self._build_graph()
        # Pruning is destructive, so remember whether summarize() already did it.
        self._pruned = False

    def _preprocess_sentences(self):
        """
        Preprocess sentences (tokenization, lowercase, removing stopwords and non-alphanumeric words).

        Returns:
            list[str]: one space-joined string of kept tokens per original
            sentence (possibly empty), in the same order as ``self.sentences``.
        """
        preprocessed_sentences = []
        for sentence in self.sentences:
            tokens = nltk.word_tokenize(sentence.lower())
            filtered_tokens = [word for word in tokens if word.isalnum() and word not in self.stop_words]
            preprocessed_sentences.append(" ".join(filtered_tokens))

        return preprocessed_sentences

    @staticmethod
    def _isf_value(total_sentences, sentence_count):
        """Return log(total / (1 + count)); 0.0 when there are no sentences at all."""
        if total_sentences == 0:
            return 0.0
        return log(total_sentences / (1 + sentence_count))

    def _compute_isf(self):
        """
        Compute the ISF for all words in the processed sentences.

        Returns:
            dict[str, float]: word -> ISF. The sentence frequency counts each
            word at most once per sentence.
        """
        # Processed sentences are space-joined tokens, so they must be split
        # back into words; iterating the string itself would yield characters.
        sentence_counts = Counter()
        for sentence in self.processed_sentences:
            sentence_counts.update(set(sentence.split()))

        total_sentences = len(self.processed_sentences)
        return {
            word: self._isf_value(total_sentences, count)
            for word, count in sentence_counts.items()
        }

    def _isf(self, word):
        """
        Compute ISF for a single word.

        The word is matched against whole tokens, not substrings, so "cat"
        is not counted in a sentence that only contains "category".
        """
        sentence_count = sum(
            1 for sentence in self.processed_sentences if word in sentence.split()
        )
        return self._isf_value(len(self.processed_sentences), sentence_count)

    def _tf_isf_vector(self, sentence):
        """Map each distinct word of a processed sentence to (term count * ISF)."""
        return {
            word: count * self.isf_dict[word]
            for word, count in Counter(sentence.split()).items()
        }

    @staticmethod
    def _cosine(vector1, vector2):
        """Cosine similarity of two sparse word->weight dicts; 0.0 if either has zero norm."""
        norm1 = sqrt(sum(weight ** 2 for weight in vector1.values()))
        norm2 = sqrt(sum(weight ** 2 for weight in vector2.values()))
        if norm1 == 0 or norm2 == 0:
            return 0.0
        dot = sum(weight * vector2[word] for word, weight in vector1.items() if word in vector2)
        return dot / (norm1 * norm2)

    def _build_graph(self):
        """
        Build a graph where nodes represent sentences and edges represent ISF-modified cosine similarity.

        Every sentence index becomes a node, even when it has no edges, so the
        graph size always equals the number of sentences. Each unordered pair
        is scored once because the similarity is symmetric and the graph is
        undirected.
        """
        vectors = [self._tf_isf_vector(sentence) for sentence in self.processed_sentences]
        sentence_total = len(vectors)

        graph = nx.Graph()
        graph.add_nodes_from(range(sentence_total))
        for i in range(sentence_total):
            for j in range(i + 1, sentence_total):
                similarity = self._cosine(vectors[i], vectors[j])
                if similarity > 0:
                    graph.add_edge(i, j, weight=similarity)

        return graph

    def _compute_similarity(self, sentence1, sentence2):
        """
        Compute the ISF-modified cosine similarity between two sentences.

        Args:
            sentence1, sentence2: processed sentences (space-joined tokens)
                whose words are all present in ``self.isf_dict``.

        Returns:
            float: sum over shared words of tf1*tf2*isf^2, divided by the
            product of the two TF*ISF vector norms; 0.0 when either norm is 0.
        """
        return self._cosine(self._tf_isf_vector(sentence1), self._tf_isf_vector(sentence2))

    def summarize(self):
        """
        Summarize the document using the TextRank algorithm.

        Safe to call more than once: the graph is pruned only on the first
        call, so repeated calls return the same summary.
        """
        if not getattr(self, '_pruned', False):
            self._prune_graph()
            self._pruned = True
        self._initialize_scores()
        self._update_scores()
        return self._extract_summary()

    def _prune_graph(self):
        """
        Remove edges with weights below the average weight.
        """
        edges_weight = [data['weight'] for _, _, data in self.graph.edges(data=True)]
        if not edges_weight:
            # No edges: np.mean([]) would be NaN (with a RuntimeWarning).
            return
        average_weight = np.mean(edges_weight)
        edges_to_remove = [(u, v) for u, v, data in self.graph.edges(data=True) if data['weight'] < average_weight]

        self.graph.remove_edges_from(edges_to_remove)

    def _initialize_scores(self):
        """
        Initialize TextRank scores for all nodes in the graph.

        A node's initial score is the mean weight of its incident edges, or 0
        when it has none.
        """
        for node in self.graph.nodes():
            node_weights = [data['weight'] for _, _, data in self.graph.edges(node, data=True)]
            node_score = np.mean(node_weights) if node_weights else 0
            self.graph.nodes[node]['score'] = node_score

    def _update_scores(self, damping_factor=0.15, max_iter=100):
        """
        Iteratively update TextRank scores for all nodes in the graph.

        Args:
            damping_factor: weight of the uniform term ``damping_factor / N``;
                neighbors contribute with weight ``1 - damping_factor``. Note
                this is the complement of the usual PageRank "d", so the
                default 0.15 corresponds to d = 0.85.
            max_iter: number of synchronous update rounds (no early stopping).

        Each neighbor contributes ``score / degree``; edge weights are not used
        in this step (they only affect pruning and the initial scores).
        """
        node_count = len(self.graph)
        for _ in range(max_iter):
            new_scores = {}
            for node in self.graph.nodes():
                score_sum = sum(
                    self.graph.nodes[neighbor]['score'] / self.graph.degree(neighbor)
                    for neighbor in self.graph[node]
                )
                new_scores[node] = damping_factor / node_count + (1 - damping_factor) * score_sum
            # Apply all updates at once so a round only sees the previous round's scores.
            for node, score in new_scores.items():
                self.graph.nodes[node]['score'] = score

    def _extract_summary(self):
        """
        Extract the top-n sentences as the summary.

        Returns:
            str: the ``n`` highest-scored sentences joined by single spaces,
            in their original document order.
        """
        ranked_sentences = sorted(
            self.graph.nodes(data=True), key=lambda item: item[1]['score'], reverse=True
        )
        # Node ids are sentence positions, so sorting them restores document
        # order directly (list.index would misplace duplicated sentences).
        top_indices = sorted(node for node, _ in ranked_sentences[:self.n])

        return ' '.join(self.sentences[index] for index in top_indices)


In [24]:
# Evaluate on the first 1000 test articles. Take an explicit copy so that the
# columns added later go into an independent frame rather than a slice of
# test_df (avoids SettingWithCopyWarning and leaves test_df untouched).
df = test_df[:1000].copy()

In [None]:
# Produce a 3-sentence summary for every article. The column is built in one
# pass and attached with assign(), which returns a new frame: values are
# aligned by position, so this no longer relies on the index being 0..N-1
# (the old enumerate() counter was used as a .loc label).
summaries = [
    TextRankSummarizer(document_text, n=3).summarize()
    for document_text in df['article']
]
df = df.assign(summary=summaries)

# Calculate metrics

In [None]:
# Score each generated summary against its reference highlights, then persist
# the scored frame. calculate_scores comes from the local utils module; the
# 'rouge_scores' column read further down is presumably added by it — verify.
df = calculate_scores(df, 'summary', 'highlights')
directory = 'Results_df'
# exist_ok=True creates the folder if needed without a separate (racy) existence check.
os.makedirs(directory, exist_ok=True)
df.to_csv(os.path.join(directory, 'Modified_Text_Rank.csv'))

In [None]:
# Show the scored DataFrame as this cell's output.
df

In [11]:
# Aggregate the per-article scores with the local utils helper.
# NOTE(review): the meaning of the last two arguments (presumably an output
# folder and a run label) is not visible here — confirm against utils.sum_metrices.
mean_scores = sum_metrices(
    df,
    'rouge_scores',
    'Results',
    'modified_text_rank_l11.ipynb',
)

In [None]:
# Inspect the scores of the first article. .iloc[0] is explicitly positional,
# whereas integer [] on a Series is label-based or positional depending on the index.
df['rouge_scores'].iloc[0]

In [None]:
# Show the aggregated metrics returned by sum_metrices.
mean_scores