In [12]:
import pandas as pd
import numpy as np
import nltk
import nltk
import numpy as np
import networkx as nx
from nltk.corpus import stopwords
from utils import calculate_scores, sum_metrices
from datasets import load_dataset
from sklearn.feature_extraction.text import CountVectorizer
from math import log, sqrt
from rouge_score import rouge_scorer

# Load data

In [3]:
# All together
cnn_daily_dataset = load_dataset('cnn_dailymail', '3.0.0')

train_df = pd.DataFrame.from_dict(cnn_daily_dataset['train']).drop(columns='id')
val_df = pd.DataFrame.from_dict(cnn_daily_dataset['validation']).drop(columns='id')
test_df = pd.DataFrame.from_dict(cnn_daily_dataset['test']).drop(columns='id')

df = pd.concat([train_df,test_df, val_df], ignore_index=True)

# Modified TextRank model

In [4]:
class TextRankSummarizer:
    def __init__(self, document, n):
        """
        Initialize the TextRankSummarizer with a document and the number of sentences for the summary.
        """
        self.document = document
        self.n = n
        self.stop_words = set(stopwords.words('english'))
        self.sentences = nltk.sent_tokenize(document)
        self.processed_sentences = self._preprocess_sentences()
        self.isf_dict = self._compute_isf()
        self.graph = self._build_graph()

    def _preprocess_sentences(self):
        """
        Preprocess sentences by tokenizing, converting to lowercase, and removing stopwords and non-alphanumeric words.
        """
        return [
            [word for word in nltk.word_tokenize(sentence.lower()) if word.isalnum() and word not in self.stop_words]
            for sentence in self.sentences
        ]

    def _compute_isf(self):
        """
        Compute the Inverse Sentence Frequency (ISF) for all words in the processed sentences.
        """
        isf_dict = {}
        for sentence in self.processed_sentences:
            for word in set(sentence):
                if word not in isf_dict:
                    isf_dict[word] = self._isf(word)
        return isf_dict

    def _isf(self, word):
        """
        Compute ISF for a single word.
        """
        sentence_count = sum(1 for sentence in self.processed_sentences if word in sentence)
        total_sentences = len(self.processed_sentences)
        return log(total_sentences / (1 + sentence_count))

    def _build_graph(self):
        """
        Build a graph where nodes represent sentences and edges represent ISF-modified cosine similarity.
        """
        graph = nx.Graph()
        for i, sentence1 in enumerate(self.processed_sentences):
            for j, sentence2 in enumerate(self.processed_sentences):
                if i != j:
                    similarity = self._compute_similarity(sentence1, sentence2)
                    if similarity > 0:
                        graph.add_edge(i, j, weight=similarity)
        return graph

    def _compute_similarity(self, sentence1, sentence2):
        """
        Compute the ISF-modified cosine similarity between two sentences.
        """
        words1 = set(sentence1)
        words2 = set(sentence2)
        common_words = words1.intersection(words2)
        
        numerator = sum(
            (sentence1.count(word) * sentence2.count(word) * self.isf_dict[word]**2) for word in common_words
        )
        denominator1 = sqrt(sum((sentence1.count(word) * self.isf_dict[word])**2 for word in words1))
        denominator2 = sqrt(sum((sentence2.count(word) * self.isf_dict[word])**2 for word in words2))
        
        if denominator1 == 0 or denominator2 == 0:
            return 0
        return numerator / (denominator1 * denominator2)

    def summarize(self):
        """
        Summarize the document using the TextRank algorithm.
        """
        self._prune_graph()
        self._initialize_scores()
        self._update_scores()
        return self._extract_summary()

    def _prune_graph(self):
        """
        Remove edges with weights below the average weight.
        """
        average_weight = np.mean([data['weight'] for _, _, data in self.graph.edges(data=True)])
        edges_to_remove = [(u, v) for u, v, data in self.graph.edges(data=True) if data['weight'] < average_weight]
        self.graph.remove_edges_from(edges_to_remove)

    def _initialize_scores(self):
        """
        Initialize TextRank scores for all nodes in the graph.
        """
        for node in self.graph.nodes():
            self.graph.nodes[node]['score'] = np.mean([data['weight'] for _, _, data in self.graph.edges(node, data=True)])

    def _update_scores(self, damping_factor=0.15, max_iter=100):
        """
        Iteratively update TextRank scores for all nodes in the graph.
        """
        for _ in range(max_iter):
            new_scores = {}
            for node in self.graph.nodes():
                neighbors = self.graph[node]
                score_sum = sum(
                    self.graph.nodes[neighbor]['score'] / self.graph.degree(neighbor) for neighbor in neighbors
                )
                new_scores[node] = damping_factor / len(self.graph) + (1 - damping_factor) * score_sum
            for node in self.graph.nodes():
                self.graph.nodes[node]['score'] = new_scores[node]

    def _extract_summary(self):
        """
        Extract the top-n sentences as the summary.
        """
        ranked_sentences = sorted(
            self.graph.nodes(data=True), key=lambda x: x[1]['score'], reverse=True
        )
        top_sentences = [self.sentences[node[0]] for node in ranked_sentences[:self.n]]
        summary = sorted(top_sentences, key=lambda sentence: self.sentences.index(sentence))
        return ' '.join(summary)


In [5]:
df = train_df[:5]

In [6]:
df['summary'] = None

for index, row in enumerate(df['article']):
    document_text = row
    summarizer = TextRankSummarizer(document_text, n=2)
    summary = summarizer.summarize()
    df.loc[index, 'summary'] = summary

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['summary'] = None
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


# Calulate metrices

In [15]:
df = calculate_scores(df, 'summary', 'highlights')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['rouge_scores'] = all_scores


In [16]:
df

Unnamed: 0,article,highlights,summary,rouge_scores
0,"LONDON, England (Reuters) -- Harry Potter star...",Harry Potter star Daniel Radcliffe gets £20M f...,"Daniel Radcliffe as Harry Potter in ""Harry Pot...","{'rouge1': (0.5384615384615384, 0.28, 0.368421..."
1,Editor's note: In our Behind the Scenes series...,Mentally ill inmates in Miami are housed on th...,"Here, Soledad O'Brien takes users inside a jai...","{'rouge1': (0.3673469387755102, 0.352941176470..."
2,"MINNEAPOLIS, Minnesota (CNN) -- Drivers who we...","NEW: ""I thought I was going to die,"" driver sa...","""I could see the whole bridge as it was going ...","{'rouge1': (0.24390243902439024, 0.3125, 0.273..."
3,WASHINGTON (CNN) -- Doctors removed five small...,"Five small polyps found during procedure; ""non...",WASHINGTON (CNN) -- Doctors removed five small...,"{'rouge1': (0.375, 0.2571428571428571, 0.30508..."
4,(CNN) -- The National Football League has ind...,"NEW: NFL chief, Atlanta Falcons owner critical...",In papers filed Friday with a federal court in...,"{'rouge1': (0.3170731707317073, 0.178082191780..."


In [None]:
mean_scores = sum_metrices(df, 'rouge_scores', 'Results', 'modified_text_rank.ipynb')

Metrics saved in Results/metrics_results.csv


In [54]:
mean_scores

{'rouge1': {'precision': 0.36835681739862924,
  'recall': 0.2761332450788535,
  'fmeasure': 0.3071097153145227},
 'rouge2': {'precision': 0.17617467581998475,
  'recall': 0.1296997566826789,
  'fmeasure': 0.14511845509145097},
 'rougeL': {'precision': 0.2535752574951181,
  'recall': 0.18522172978780554,
  'fmeasure': 0.20854486576320067},
 'rougeLsum': {'precision': 0.3093464027261937,
  'recall': 0.2311547235332489,
  'fmeasure': 0.25730674259365616}}

In [None]:
import os
import pandas as pd

def sum_metrices(df, metrics_column='rouge_scores', results_folder='Results', file_name='metrics_results.csv'):
    if not os.path.exists(results_folder):
        os.makedirs(results_folder)

    # Dictionary for metrics sums
    metric_sums = {
        'rouge1': {'precision': 0, 'recall': 0, 'fmeasure': 0},
        'rouge2': {'precision': 0, 'recall': 0, 'fmeasure': 0},
        'rougeL': {'precision': 0, 'recall': 0, 'fmeasure': 0},
        'rougeLsum': {'precision': 0, 'recall': 0, 'fmeasure': 0},
    }
    
    metrics_count = {key: 0 for key in metric_sums}

    # Sum up the metrics
    for scores in df[metrics_column]:
        for rouge_type, metric in scores.items():
            metric_sums[rouge_type]['precision'] += metric.precision
            metric_sums[rouge_type]['recall'] += metric.recall
            metric_sums[rouge_type]['fmeasure'] += metric.fmeasure
            metrics_count[rouge_type] += 1

    # Calculate the mean of each metric
    metrics_mean = {
        rouge_type: {key: value / metrics_count[rouge_type] for key, value in metric.items()}
        for rouge_type, metric in metric_sums.items()
    }

    results_file = os.path.join(results_folder, file_name)    
    metrics_df = pd.DataFrame(metrics_mean).transpose()
    metrics_df.to_csv(results_file, index=True)
    
    return metrics_mean
