In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install datasets rouge rouge_score

In [None]:
import nltk
from datasets import load_dataset, load_metric
from nltk.tokenize import sent_tokenize
nltk.download("punkt")
import numpy as np
import csv
import pandas as pd
import numpy as np
from rouge_score import rouge_scorer


In [16]:
def predict_lead(article,n):
  """
  input a string of multiple sentences,
  store the sentences in a list of sentences through sentence tokenisation and
  return a string of the n- first sentences of the list of setences
  """
  sentences =sent_tokenize(article,language='english')
  return " ".join(sentences[:n])

def generate_lead2_summaries(dataset_name, dataset_version, dataset_split):
  """
  given the dataset name, version and split part,
  load it and extract from it the content of the columns 'id','article', 'highlights' is separate variables,
  use the strings form the article and extract the 2 first sentences from it and store this content in the predicted_summary variable. Repeat through for-loop for all rows of the dataframe
  store the contents of the dataset's 'ids', 'predicted summaries','gold summaries' into lists
  and return them
  """
  dataset = load_dataset(dataset_name, dataset_version)
  predictions = []
  references = []
  article_ids = []

  for example in dataset[dataset_split]:
      article_id = example["id"]
      article = example["article"]
      gold_summary = example["highlights"]
      predicted_summary = predict_lead(article, 2)
      predictions.append(predicted_summary)
      references.append(gold_summary)
      article_ids.append(article_id)
  return article_ids,predictions, references

def compute_metrics(predictions, references, article_ids, output_txt_path):
    """
    uses the lists of ids, predicted summaries and gold summaries from the previous function,
    initialize a list to store information for each summary,
    iterate over each pair of reference and hypothesis sentences,
    computes the Rouge-2 score, rounds up the rouge scores to 3 decimal points for precision, recall,fmeasure,
    creates a dictionary with the summary information
    writes the lead2 summaries to a .txt file, which we specify when calling the function
    returns the list of dictionaries with the summary information for each article
    """

    scorer = rouge_scorer.RougeScorer(['rouge2'], use_stemmer=True)
    summary_info = []

    for article_id, prediction, reference in zip(article_ids, predictions, references):
        scores = scorer.score(prediction, reference)
        rouge2_score = scores['rouge2']
        rounded_rouge2_score = {
            'precision': round(rouge2_score.precision, 3),
            'recall': round(rouge2_score.recall, 3),
            'fmeasure': round(rouge2_score.fmeasure, 3)
        }

        summary_info.append({
            'id': article_id,
            'highlight': reference,
            'lead2-summary': prediction,
            'rouge2_score(Lead_2)': rounded_rouge2_score
        })

    with open(output_txt_path, 'w', encoding='utf-8') as txt_file:
        for article_id, prediction in zip(article_ids, predictions):
            txt_file.write(f'({article_id}) {prediction}\n')

    return summary_info

def create_dataframe(dataset_name, dataset_version, dataset_split, output_txt_path):
    """
    given the dataset's name, version, split part and a specific txt filepath
    apply the function to get the first 2 sentences of each article,
    compute the rouge2 and write the id and lead2 summary of each article in
    the specific txt pilepath
    transform the summary info list into a dataframe and return it
    """
    dataset = load_dataset(dataset_name, dataset_version)

    article_ids,predictions, references = generate_lead2_summaries(
        dataset_name=dataset_name,
        dataset_version=dataset_version,
        dataset_split=dataset_split
    )

    summary_info = compute_metrics(predictions, references, article_ids, output_txt_path)

    summaries = pd.DataFrame(summary_info)

    return summaries

In [17]:
lead2_dataframe = create_dataframe(dataset_name= "cnn_dailymail",
    dataset_version="3.0.0",
    dataset_split='test',
    output_txt_path = '/content/drive/MyDrive/NLU/lead2_summs.txt')


In [19]:
lead2_dataframe.to_csv('/content/drive/MyDrive/NLU/lead2_data.csv')

In [25]:
def compute_mean_std_metrics(dataframe, column_name):
  """
  given the dataframe and the (metric)column name
  calculate and return the mean and standard deviation for the 3 metrics
  (precision, recall, f-measure) on all rows of the dataframe
  """
  precision_values = []
  recall_values = []
  fmeasure_values = []

  for index, row in dataframe.iterrows():
        # Extract precision, recall, and fmeasure from the row
      precision = row[column_name]['precision']
      recall = row[column_name]['recall']
      fmeasure = row[column_name]['fmeasure']
      precision_values.append(precision)
      recall_values.append(recall)
      fmeasure_values.append(fmeasure)

  mean_precision = np.mean(precision_values)
  mean_recall = np.mean(recall_values)
  mean_fmeasure = np.mean(fmeasure_values)

  std_precision = np.std(precision_values)
  std_recall = np.std(recall_values)
  std_fmeasure = np.std(fmeasure_values)

  mean_metrics = {
        'mean_precision': mean_precision,
        'mean_recall': mean_recall,
        'mean_fmeasure': mean_fmeasure
    }

  std_metrics = {
        'std_precision': std_precision,
        'std_recall': std_recall,
        'std_fmeasure': std_fmeasure
    }

  return mean_metrics, std_metrics

mean_metrics, std_metrics = compute_mean_std_metrics(lead2_dataframe, 'rouge2_score(Lead_2)')
print("Mean Metrics:")
print(f"Mean Precision: {mean_metrics['mean_precision']:.3f}")
print(f"Mean Recall: {mean_metrics['mean_recall']:.3f}")
print(f"Mean F-measure: {mean_metrics['mean_fmeasure']:.3f}")

print("\nStandard Deviation Metrics:")
print(f"Std Precision: {std_metrics['std_precision']:.3f}")
print(f"Std Recall: {std_metrics['std_recall']:.3f}")
print(f"Std F-measure: {std_metrics['std_fmeasure']:.3f}")


Mean Metrics:
Mean Precision: 0.166
Mean Recall: 0.159
Mean F-measure: 0.156

Standard Deviation Metrics:
Std Precision: 0.125
Std Recall: 0.121
Std F-measure: 0.112


**ROUGE_2**

Given an reference R and a candidate C summmary:
ROUGE-2 precision is the ratio of the number of 2-grams in C that appear also in R , over the number of 2-grams in C.
ROUGE-2 recall is the ratio of the number of 2-grams in R that appear also in C , over the number of 2-grams in R.

Given the mean and standard deviation values in all 3 rouge_2 metrics, we see that the generated summaries have an average 15% ratio of bigrams also found in the gold summary, meaning that the lead2 baseline system  has a consistent low performance. This was an expected behavior, since the lead2 system extracts the first 2 senteces of the artiles, whereas the 'highlights', which are considered the gold summaries in our data capture several points throughout the article.

**ROUGE**

Pros: it correlates positively with human evaluation, it’s inexpensive to compute and language-independent.
Cons: ROUGE does not manage different words that have the same meaning, as it measures syntactical matches rather than semantics.