In [2]:
import pandas as pd
import numpy as np
import nltk
import nltk
import numpy as np
import networkx as nx
from nltk.corpus import stopwords
from utils import calculate_scores, sum_metrices
from datasets import load_dataset
from sklearn.feature_extraction.text import CountVectorizer
from math import log, sqrt

  from .autonotebook import tqdm as notebook_tqdm


# Load data

In [4]:
# All together
cnn_daily_dataset = load_dataset('cnn_dailymail', '3.0.0')

train_df = pd.DataFrame.from_dict(cnn_daily_dataset['train']).drop(columns='id')
val_df = pd.DataFrame.from_dict(cnn_daily_dataset['validation']).drop(columns='id')
test_df = pd.DataFrame.from_dict(cnn_daily_dataset['test']).drop(columns='id')

#df = pd.concat([train_df,test_df, val_df], ignore_index=True)

# Modified TextRank model

In [5]:
class TextRankSummarizer:
    def __init__(self, document, n):
        """
        Initialize the TextRankSummarizer with a document and the number of sentences for the summary.
        """
        self.document = document
        self.n = n
        self.stop_words = set(stopwords.words('english'))
        self.sentences = nltk.sent_tokenize(document)
        self.processed_sentences = self._preprocess_sentences()
        self.isf_dict = self._compute_isf()
        self.graph = self._build_graph()


    def _preprocess_sentences(self):
        """
        Preprocess sentences (tokenization, lowercase, removing stopwords and non-alphanumeric words).
        """
        preprocessed_sentences = []
        for sentence in self.sentences:
            tokens = nltk.word_tokenize(sentence.lower())
            filtered_tokens = [word for word in tokens if word.isalnum() and word not in self.stop_words]
            preprocessed_sentences.append(" ".join(filtered_tokens))

        return preprocessed_sentences


    def _compute_isf(self):
        """
        Compute the ISF for all words in the processed sentences.
        """
        isf_dict = {}
        for sentence in self.processed_sentences:
            for word in set(sentence):
                if word not in isf_dict:
                    isf_dict[word] = self._isf(word)

        return isf_dict

    def _isf(self, word):
        """
        Compute ISF for a single word.
        """
        sentence_count = sum(1 for sentence in self.processed_sentences if word in sentence)
        total_sentences = len(self.processed_sentences)
        isf = log(total_sentences / (1 + sentence_count))

        return isf

    def _build_graph(self):
        """
        Build a graph where nodes represent sentences and edges represent ISF-modified cosine similarity.
        """
        graph = nx.Graph()
        for i, sentence1 in enumerate(self.processed_sentences):
            for j, sentence2 in enumerate(self.processed_sentences):
                if i != j:
                    similarity = self._compute_similarity(sentence1, sentence2)
                    if similarity > 0:
                        graph.add_edge(i, j, weight=similarity)

        return graph

    def _compute_similarity(self, sentence1, sentence2):
        """
        Compute the ISF-modified cosine similarity between two sentences.
        """
        words1 = set(sentence1)
        words2 = set(sentence2)
        common_words = words1.intersection(words2)
        
        numerator = sum(
            (sentence1.count(word) * sentence2.count(word) * self.isf_dict[word]**2) for word in common_words
        )
        denominator1 = sqrt(sum((sentence1.count(word) * self.isf_dict[word])**2 for word in words1))
        denominator2 = sqrt(sum((sentence2.count(word) * self.isf_dict[word])**2 for word in words2))
        
        if denominator1 == 0 or denominator2 == 0:
            return 0
        isf_modified = numerator / (denominator1 * denominator2)

        return isf_modified
    
    def summarize(self):
        """
        Summarize the document using the TextRank algorithm.
        """
        self._prune_graph()
        self._initialize_scores()
        self._update_scores()
        return self._extract_summary()

    def _prune_graph(self):
        """
        Remove edges with weights below the average weight.
        """
        average_weight = np.mean([data['weight'] for _, _, data in self.graph.edges(data=True)])
        edges_to_remove = [(u, v) for u, v, data in self.graph.edges(data=True) if data['weight'] < average_weight]
        self.graph.remove_edges_from(edges_to_remove)

    def _initialize_scores(self):
        """
        Initialize TextRank scores for all nodes in the graph.
        """
        for node in self.graph.nodes():
            self.graph.nodes[node]['score'] = np.mean([data['weight'] for _, _, data in self.graph.edges(node, data=True)])

    def _update_scores(self, damping_factor=0.15, max_iter=100):
        """
        Iteratively update TextRank scores for all nodes in the graph.
        """
        for _ in range(max_iter):
            new_scores = {}
            for node in self.graph.nodes():
                neighbors = self.graph[node]
                score_sum = sum(self.graph.nodes[neighbor]['score'] / self.graph.degree(neighbor) for neighbor in neighbors)
                new_scores[node] = damping_factor / len(self.graph) + (1 - damping_factor) * score_sum
            for node in self.graph.nodes():
                self.graph.nodes[node]['score'] = new_scores[node]

    def _extract_summary(self):
        """
        Extract the top-n sentences as the summary.
        """
        ranked_sentences = sorted(self.graph.nodes(data=True), key=lambda x: x[1]['score'], reverse=True)
        top_sentences = [self.sentences[node[0]] for node in ranked_sentences[:self.n]]
        summary = sorted(top_sentences, key=lambda sentence: self.sentences.index(sentence))
        return ' '.join(summary)


In [6]:
df = test_df

In [18]:
df['summary'] = None

for index, row in enumerate(df['article']):
    document_text = row
    summarizer = TextRankSummarizer(document_text, n=1)
    summary = summarizer.summarize()
    df.loc[index, 'summary'] = summary

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['summary'] = None
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


# Calulate metrices

In [16]:
df = calculate_scores(df, 'summary', 'highlights')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['rouge_scores'] = all_scores


In [21]:
df.head()

Unnamed: 0,article,highlights,summary,rouge_scores
0,(CNN)The Palestinian Authority officially beca...,Membership gives the ICC jurisdiction over all...,(CNN)The Palestinian Authority officially beca...,"{'rouge1': (0.5882352941176471, 0.408163265306..."
1,(CNN)Never mind cats having nine lives. A stra...,"Theia, a bully breed mix, was apparently hit b...",The veterinary hospital's Good Samaritan Fund ...,"{'rouge1': (0.27906976744186046, 0.17647058823..."
2,"(CNN)If you've been following the news lately,...",Mohammad Javad Zarif has spent more time with ...,"The website of the Iranian Foreign Ministry, w...","{'rouge1': (0.4, 0.358974358974359, 0.37837837..."
3,(CNN)Five Americans who were monitored for thr...,17 Americans were exposed to the Ebola virus w...,(CNN)Five Americans who were monitored for thr...,"{'rouge1': (0.5714285714285714, 0.4, 0.4705882..."
4,(CNN)A Duke student has admitted to hanging a ...,Student is no longer on Duke University campus...,This is no Duke we want.,"{'rouge1': (0.38095238095238093, 0.53333333333..."


In [19]:
mean_scores = sum_metrices(df, 'rouge_scores', 'Results', 'modified_text_rank_l1.ipynb')

In [20]:
mean_scores

{'rouge1': {'precision': {'mean': 0.349721157973672,
   'std': 0.16822640408811385},
  'recall': {'mean': 0.24459074078729662, 'std': 0.11804769516658276},
  'fmeasure': {'mean': 0.2729937750506234, 'std': 0.1176311324889716}},
 'rouge2': {'precision': {'mean': 0.11218582621939728,
   'std': 0.12854905002893363},
  'recall': {'mean': 0.07439882843170897, 'std': 0.08724707217803106},
  'fmeasure': {'mean': 0.08538332469707353, 'std': 0.09660336210372207}},
 'rougeL': {'precision': {'mean': 0.2379708185464808,
   'std': 0.1370852609754649},
  'recall': {'mean': 0.16738825239284638, 'std': 0.09945698866492356},
  'fmeasure': {'mean': 0.18581368725603975, 'std': 0.09835825455550519}},
 'rougeLsum': {'precision': {'mean': 0.27365592405151956,
   'std': 0.14654244411077277},
  'recall': {'mean': 0.19291156326798492, 'std': 0.1060331666338659},
  'fmeasure': {'mean': 0.21427480055503914, 'std': 0.10492829488303518}}}