In [2]:
import os
from transformers import AutoTokenizer, LongT5ForConditionalGeneration
import pandas as pd
import spacy
import torch
import accelerate

torch.cuda.is_available()

True

In [3]:
model_name = 'google/long-t5-tglobal-base'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = LongT5ForConditionalGeneration.from_pretrained(model_name,
                                                       low_cpu_mem_usage=False)

nlp = spacy.load('en_core_web_sm')

In [4]:
from datasets import load_dataset

cnn_daily_dataset = load_dataset('cnn_dailymail', '3.0.0')

train_df = pd.DataFrame.from_dict(cnn_daily_dataset['train'])
val_df = pd.DataFrame.from_dict(cnn_daily_dataset['validation'])
test_df = pd.DataFrame.from_dict(cnn_daily_dataset['test'])

df = pd.concat([train_df, test_df, val_df]).drop(columns='id')

In [5]:
from transformers import DataCollatorForSeq2Seq, pipeline

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)
summarizer = pipeline('summarization', model=model_name)

Device set to use cuda:0


In [12]:
from transformers import StoppingCriteria, StoppingCriteriaList
import unicodedata
import re

TOKENS_THRESHOLD = 2048
MEAN_TOKEN_COUNT = 56
SUMM_RATIO = 0.125


def clean_article(article):
    article = unicodedata.normalize('NFD', article)
    article = re.sub(r'[^\x00-\x7F]+', '', article)
    article = re.sub(r'\s+', ' ', article)

    return article.strip()


class SentenceStoppingCriteria(StoppingCriteria):
    def __init__(self, tokenizer, max_sentences):
        self.tokenizer = tokenizer
        self.max_sentences = max_sentences
        self.sentence_count = 0

    def __call__(self, input_ids, scores, **kwargs):
        decoded_text = self.tokenizer.decode(input_ids[0], skip_special_tokens=True)

        sentences = list(nlp(decoded_text).sents)
        self.sentence_count = len(sentences)

        return self.sentence_count >= self.max_sentences


def summ_article(article, sentence_count=None):
    article = 'summarize: ' + clean_article(article)
    summary_len = int(len(list(nlp(article).sents)) * SUMM_RATIO)
    inputs = tokenizer(article,
                       return_tensors='pt',
                       truncation=True,
                       padding='longest',
                       max_length=TOKENS_THRESHOLD).input_ids

    stopping_criteria = SentenceStoppingCriteria(tokenizer, sentence_count or summary_len)
    summary_ids = model.generate(inputs,
                                 stopping_criteria=StoppingCriteriaList([stopping_criteria]),
                                 max_length=TOKENS_THRESHOLD
                                 # max_length=min(TOKENS_THRESHOLD, summary_len)
                                 )

    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    return summary

In [10]:
from rouge import Rouge


def evaluate_article(article, ref_summary, sentence_count):
    summary = summ_article(article, sentence_count)
    ref_summary = clean_article(ref_summary)
    scores = Rouge().get_scores(summary, ref_summary)

    return summary, scores

In [13]:
TEST_ARTICLE_COUNT = 1000
SENTENCE_COUNT = 3
test_articles = df[:TEST_ARTICLE_COUNT]
evaluation_inputs = [(row['article'], row['highlights']) for _, row in test_articles.iterrows()]

scores = []
summaries = []
for i, (_, row) in enumerate(test_articles.iterrows()):
    summary, score = evaluate_article(row['article'], row['highlights'], SENTENCE_COUNT)
    scores.append(score)
    summaries.append(summary)
    print(f'Article {i + 1} summarized')
    print(summary + '\n')
test_articles.loc['gen_summaries'] = summaries

Article 1 summarized
"I don't plan to be one of those people who, as soon as they turn 18, suddenly buy themselves a massive sports car collection or something similar," he told an Australian interviewer earlier this month.

Article 2 summarized
MIAMI, Florida (CNN) -- The ninth floor of the Miami-Dade pretrial detention facility is dubbed the "forgotten floor." Here

Article 3 summarized
"The whole bridge from one side of the Mississippi to the other just completely gave way, fell all the way down," survivor Gary Babineau told CNN. Occasionally, a pickup truck with a medic inside would drive to get an injured person and bring him back up even ground, Hink told CNN. "So

Article 4 summarized
summarize: WASHINGTON (CNN) -- Doctors removed five small polyps from President Bush's colon on Saturday, and "none appeared worrisome," a White House spokesman said.

Article 5 summarized
In papers filed Friday with a federal court in Virginia, Vick also admitted that he and two co-conspirators ki

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_articles['gen_summaries'] = summaries


In [18]:
from Summarizers.utils import *

test_articles = calculate_scores(test_articles, 'gen_summaries', 'highlights')
mean_scores = sum_metrices(test_articles, 'rouge_scores', results_folder='Results', file_name='T5_summarizer.csv')

In [19]:
test_articles.to_csv(
    'Results/T5_summarizer-all.csv',
    columns=['gen_summaries', 'rouge_scores']
)

In [30]:
rouge_df = pd.DataFrame()
for i, row in test_articles.iterrows():
    rouge_df.loc[i, 'rouge1'] = row['rouge_scores']['rouge1'].fmeasure
    rouge_df.loc[i, 'rouge2'] = row['rouge_scores']['rouge2'].fmeasure
    rouge_df.loc[i, 'rouge-l'] = row['rouge_scores']['rougeL'].fmeasure
    
rouge1_std = rouge_df['rouge1'].std()
rouge2_std = rouge_df['rouge2'].std()
rougeL_std = rouge_df['rouge-l'].std()

print(f'ROUGE-1 std = {rouge1_std}\nROUGE-2 std = {rouge2_std}\nROUGE-L std = {rougeL_std}')

ROUGE-1 std = 0.1123989420848567
ROUGE-2 std = 0.09590077062759982
ROUGE-L std = 0.09834351638319519
