In [26]:
import pandas as pd
import numpy as np
import nltk
import nltk
import numpy as np
import networkx as nx
from nltk.corpus import stopwords
from utils import calculate_scores, sum_metrices
from datasets import load_dataset
from sklearn.feature_extraction.text import CountVectorizer
from math import log, sqrt
import os

# Load data

In [22]:
# All together
cnn_daily_dataset = load_dataset('cnn_dailymail', '3.0.0')

train_df = pd.DataFrame.from_dict(cnn_daily_dataset['train']).drop(columns='id')
val_df = pd.DataFrame.from_dict(cnn_daily_dataset['validation']).drop(columns='id')
test_df = pd.DataFrame.from_dict(cnn_daily_dataset['test']).drop(columns='id')

#df = pd.concat([train_df,test_df, val_df], ignore_index=True)

# Modified TextRank model

In [23]:
class TextRankSummarizer:
    def __init__(self, document, n):
        """
        Initialize the TextRankSummarizer with a document and the number of sentences for the summary.
        """
        self.document = document
        self.n = n
        self.stop_words = set(stopwords.words('english'))
        self.sentences = nltk.sent_tokenize(document)
        self.processed_sentences = self._preprocess_sentences()
        self.isf_dict = self._compute_isf()
        self.graph = self._build_graph()


    def _preprocess_sentences(self):
        """
        Preprocess sentences (tokenization, lowercase, removing stopwords and non-alphanumeric words).
        """
        preprocessed_sentences = []
        for sentence in self.sentences:
            tokens = nltk.word_tokenize(sentence.lower())
            filtered_tokens = [word for word in tokens if word.isalnum() and word not in self.stop_words]
            preprocessed_sentences.append(" ".join(filtered_tokens))

        return preprocessed_sentences


    def _compute_isf(self):
        """
        Compute the ISF for all words in the processed sentences.
        """
        isf_dict = {}
        for sentence in self.processed_sentences:
            for word in set(sentence):
                if word not in isf_dict:
                    isf_dict[word] = self._isf(word)

        return isf_dict

    def _isf(self, word):
        """
        Compute ISF for a single word.
        """
        sentence_count = sum(1 for sentence in self.processed_sentences if word in sentence)
        total_sentences = len(self.processed_sentences)
        isf = log(total_sentences / (1 + sentence_count))

        return isf

    def _build_graph(self):
        """
        Build a graph where nodes represent sentences and edges represent ISF-modified cosine similarity.
        """
        graph = nx.Graph()
        for i, sentence1 in enumerate(self.processed_sentences):
            for j, sentence2 in enumerate(self.processed_sentences):
                if i != j:
                    similarity = self._compute_similarity(sentence1, sentence2)
                    if similarity > 0:
                        graph.add_edge(i, j, weight=similarity)

        return graph

    def _compute_similarity(self, sentence1, sentence2):
        """
        Compute the ISF-modified cosine similarity between two sentences.
        """
        words1 = set(sentence1)
        words2 = set(sentence2)
        common_words = words1.intersection(words2)
        
        numerator = sum(
            (sentence1.count(word) * sentence2.count(word) * self.isf_dict[word]**2) for word in common_words
        )
        denominator1 = sqrt(sum((sentence1.count(word) * self.isf_dict[word])**2 for word in words1))
        denominator2 = sqrt(sum((sentence2.count(word) * self.isf_dict[word])**2 for word in words2))
        
        if denominator1 == 0 or denominator2 == 0:
            return 0
        isf_modified = numerator / (denominator1 * denominator2)

        return isf_modified
    
    def summarize(self):
        """
        Summarize the document using the TextRank algorithm.
        """
        self._prune_graph()
        self._initialize_scores()
        self._update_scores()
        return self._extract_summary()

    def _prune_graph(self):
        """
        Remove edges with weights below the average weight.
        """
        average_weight = np.mean([data['weight'] for _, _, data in self.graph.edges(data=True)])
        edges_to_remove = [(u, v) for u, v, data in self.graph.edges(data=True) if data['weight'] < average_weight]
        self.graph.remove_edges_from(edges_to_remove)

    def _initialize_scores(self):
        """
        Initialize TextRank scores for all nodes in the graph.
        """
        for node in self.graph.nodes():
            self.graph.nodes[node]['score'] = np.mean([data['weight'] for _, _, data in self.graph.edges(node, data=True)])

    def _update_scores(self, damping_factor=0.15, max_iter=100):
        """
        Iteratively update TextRank scores for all nodes in the graph.
        """
        for _ in range(max_iter):
            new_scores = {}
            for node in self.graph.nodes():
                neighbors = self.graph[node]
                score_sum = sum(self.graph.nodes[neighbor]['score'] / self.graph.degree(neighbor) for neighbor in neighbors)
                new_scores[node] = damping_factor / len(self.graph) + (1 - damping_factor) * score_sum
            for node in self.graph.nodes():
                self.graph.nodes[node]['score'] = new_scores[node]

    def _extract_summary(self):
        """
        Extract the top-n sentences as the summary.
        """
        ranked_sentences = sorted(self.graph.nodes(data=True), key=lambda x: x[1]['score'], reverse=True)
        top_sentences = [self.sentences[node[0]] for node in ranked_sentences[:self.n]]
        summary = sorted(top_sentences, key=lambda sentence: self.sentences.index(sentence))
        return ' '.join(summary)


In [24]:
df = test_df[:1000]

In [25]:
df['summary'] = None

for index, row in enumerate(df['article']):
    document_text = row
    summarizer = TextRankSummarizer(document_text, n=3)
    summary = summarizer.summarize()
    df.loc[index, 'summary'] = summary

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['summary'] = None
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


# Calulate metrices

In [27]:
df = calculate_scores(df, 'summary', 'highlights')
directory = 'Results_df'
if not os.path.exists(directory):
    os.makedirs(directory)
df.to_csv(os.path.join(directory, 'Modified_Text_Rank.csv'))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['rouge_scores'] = all_scores


In [20]:
df

Unnamed: 0,article,highlights,summary,rouge_scores
0,(CNN)The Palestinian Authority officially beca...,Membership gives the ICC jurisdiction over all...,"""As Palestine formally becomes a State Party t...","{'rouge1': (0.17647058823529413, 0.16666666666..."
1,(CNN)Never mind cats having nine lives. A stra...,"Theia, a bully breed mix, was apparently hit b...","Four days after her apparent death, the dog ma...","{'rouge1': (0.2558139534883721, 0.323529411764..."
2,"(CNN)If you've been following the news lately,...",Mohammad Javad Zarif has spent more time with ...,"The first sentence of his official biography, ...","{'rouge1': (0.22857142857142856, 0.25806451612..."
3,(CNN)Five Americans who were monitored for thr...,17 Americans were exposed to the Ebola virus w...,They all had contact with a colleague who was ...,"{'rouge1': (0.3333333333333333, 0.538461538461..."
4,(CNN)A Duke student has admitted to hanging a ...,Student is no longer on Duke University campus...,"In February, a noose was hung around the neck ...","{'rouge1': (0.16666666666666666, 0.30434782608..."
...,...,...,...,...
995,Washington (CNN)The flight voice recorder abo...,Autopilot could have taken control of Germanwi...,"In fact, more than 10 years ago, following 9/1...","{'rouge1': (0.16666666666666666, 0.13953488372..."
996,(CNN)At least 54 people have died and 15 other...,Fishing vessels are searching for 15 people st...,The trawler is also thought to have keeled ove...,"{'rouge1': (0.1951219512195122, 0.216216216216..."
997,"(The Hollywood Reporter)Stan Freberg, whose fr...","Stan Freberg was famed comedian, song parodist...",Freberg died of natural causes at a Santa Moni...,"{'rouge1': (0.14814814814814814, 0.17391304347..."
998,(CNN)Indiana's controversial religious freedom...,Gov. Mike Pence is making the right call to fi...,These are wholesome American values that every...,"{'rouge1': (0.02631578947368421, 0.1, 0.041666..."


In [11]:
mean_scores = sum_metrices(df, 'rouge_scores', 'Results', 'modified_text_rank_l11.ipynb')

In [16]:
df['rouge_scores'][0]

{'rouge1': Score(precision=0.17647058823529413, recall=0.16666666666666666, fmeasure=0.17142857142857143),
 'rouge2': Score(precision=0.0, recall=0.0, fmeasure=0.0),
 'rougeL': Score(precision=0.11764705882352941, recall=0.1111111111111111, fmeasure=0.11428571428571428),
 'rougeLsum': Score(precision=0.14705882352941177, recall=0.1388888888888889, fmeasure=0.14285714285714288)}

In [12]:
mean_scores

{'rouge1': {'mean_f1': 0.21373879638134693, 'std_f1': 0.13244827447700827},
 'rouge2': {'mean_f1': 0.05779133285873173, 'std_f1': 0.11638147040880105},
 'rougeL': {'mean_f1': 0.15314325838189918, 'std_f1': 0.11611689024023021},
 'rougeLsum': {'mean_f1': 0.17293722382996837, 'std_f1': 0.12100826411934285}}