In [1]:
import os
from transformers import AutoTokenizer, LongT5ForConditionalGeneration
import pandas as pd
import spacy
import torch
import accelerate

# Ask PyTorch whether a CUDA device is usable; the value is shown as this cell's output.
torch.cuda.is_available()

True

In [2]:
# Checkpoint name shared by the tokenizer and the seq2seq model below.
model_name = 'google/long-t5-tglobal-base'

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = LongT5ForConditionalGeneration.from_pretrained(
    model_name,
    low_cpu_mem_usage=False,
)

# Small English spaCy pipeline; summ_article uses it to count article tokens.
nlp = spacy.load('en_core_web_sm')

In [3]:
from datasets import load_dataset

cnn_daily_dataset = load_dataset('cnn_dailymail', '3.0.0')

# One DataFrame per split, built in the order train -> validation -> test.
train_df, val_df, test_df = (
    pd.DataFrame.from_dict(cnn_daily_dataset[split_name])
    for split_name in ('train', 'validation', 'test')
)

# Pool every split into one frame (train, test, validation order) and drop
# the 'id' column; the original per-split row indexes are kept as-is.
all_splits = [train_df, test_df, val_df]
df = pd.concat(all_splits).drop(columns='id')

In [4]:
from transformers import DataCollatorForSeq2Seq, pipeline

# Seq2seq batch collator bound to the tokenizer and model loaded earlier.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
)

# Stand-alone summarization pipeline created from the same checkpoint name.
# NOTE(review): neither object is referenced by the cells visible here --
# confirm they are still needed.
summarizer = pipeline(task='summarization', model=model_name)

Device set to use cuda:0


In [5]:
import unicodedata
import re

# Token cap: summ_article uses it as the upper bound for the generated
# summary's max_length.
TOKENS_THRESHOLD = 2048
# Fraction of the article's spaCy token count that summ_article uses as the
# target summary length.
SUMM_RATIO = 0.125


def clean_article(article):
    """Return an ASCII-only, whitespace-normalised copy of ``article``.

    The text is NFD-decomposed so accented letters keep their base letter,
    every remaining non-ASCII character is dropped, and each run of
    whitespace is collapsed to a single space with no leading or trailing
    whitespace.
    """
    decomposed = unicodedata.normalize('NFD', article)

    # Encoding with errors='ignore' discards every character outside ASCII.
    ascii_only = decomposed.encode('ascii', 'ignore').decode('ascii')

    # split() with no argument cuts on whitespace runs and ignores the edges,
    # so re-joining with one space both collapses and strips.
    return ' '.join(ascii_only.split())


def summ_article(article):
    """Generate a summary of ``article`` with the loaded LongT5 model.

    The article is cleaned with ``clean_article``, prefixed with
    ``'summarize: '`` and truncated to ``TOKENS_THRESHOLD`` tokens before
    generation. The generation ``max_length`` is ``SUMM_RATIO`` times the
    spaCy token count of the prefixed text, capped at ``TOKENS_THRESHOLD``.

    Args:
        article: Raw article text.

    Returns:
        The decoded summary string with special tokens removed.
    """
    # NOTE(review): the LongT5 documentation says the model does not use task
    # prefixes -- confirm the 'summarize: ' prefix is intentional.
    article = 'summarize: ' + clean_article(article)

    # Only the token count is needed, so run the spaCy tokenizer alone rather
    # than the full tagger/parser/NER pipeline.
    summary_len = int(len(nlp.make_doc(article)) * SUMM_RATIO)

    # Very short inputs would otherwise yield max_length 0 or 1, which leaves
    # generate() no room for a token after the decoder start token.
    max_summary_len = max(2, min(TOKENS_THRESHOLD, summary_len))

    # Truncate with the shared constant instead of a second hard-coded 2048,
    # and keep the tensors on the same device as the model.
    encoded = tokenizer(article,
                        return_tensors='pt',
                        truncation=True,
                        padding='longest',
                        max_length=TOKENS_THRESHOLD).to(model.device)

    summary_ids = model.generate(encoded.input_ids,
                                 attention_mask=encoded.attention_mask,
                                 max_length=max_summary_len)

    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

In [6]:
from rouge import Rouge


def evaluate_article(article, reference):
    """Summarise ``article`` and score the result against ``reference``.

    Args:
        article: Raw article text, passed to ``summ_article``.
        reference: Reference summary; cleaned with ``clean_article`` before
            scoring.

    Returns:
        A ``(summary, scores)`` tuple: the generated summary and the value
        returned by ``Rouge().get_scores`` with the summary as hypothesis.
    """
    generated = summ_article(article)
    target = clean_article(reference)
    return generated, Rouge().get_scores(generated, target)

In [8]:
TEST_ARTICLE_COUNT = 1000

# Fixed-seed sample so the same articles are drawn on every run.
test_articles = (
    df.sample(n=TEST_ARTICLE_COUNT, random_state=420)
    .reset_index(drop=True)
)
# (article, reference highlights) pairs in sample order.
evaluation_inputs = list(zip(test_articles['article'], test_articles['highlights']))

scores = []
summaries = []
for i, (article_text, highlights) in enumerate(evaluation_inputs):
    summary, score = evaluate_article(article_text, highlights)
    scores.append(score)
    summaries.append(summary)
    print(f'Article {i + 1} summarized')

# Keep each generated summary next to the article it was produced from.
test_articles['gen_summaries'] = summaries

Article 1 summarized
Article 2 summarized
Article 3 summarized
Article 4 summarized
Article 5 summarized
Article 6 summarized
Article 7 summarized
Article 8 summarized
Article 9 summarized
Article 10 summarized
Article 11 summarized
Article 12 summarized
Article 13 summarized
Article 14 summarized
Article 15 summarized
Article 16 summarized
Article 17 summarized
Article 18 summarized
Article 19 summarized
Article 20 summarized
Article 21 summarized
Article 22 summarized
Article 23 summarized
Article 24 summarized
Article 25 summarized
Article 26 summarized
Article 27 summarized
Article 28 summarized
Article 29 summarized
Article 30 summarized
Article 31 summarized
Article 32 summarized
Article 33 summarized
Article 34 summarized
Article 35 summarized
Article 36 summarized
Article 37 summarized
Article 38 summarized
Article 39 summarized
Article 40 summarized
Article 41 summarized
Article 42 summarized
Article 43 summarized
Article 44 summarized
Article 45 summarized
Article 46 summariz

In [9]:
# Running totals of the per-article F1 values; each entry of `scores` is a
# one-element list holding the per-metric dict for that article.
rouge2_f1 = rougel_f1 = 0
for score in scores:
    article_scores = score[0]
    rouge2_f1 += article_scores['rouge-2']['f']
    rougel_f1 += article_scores['rouge-l']['f']

# Print the means as percentages: ROUGE-2 first, then ROUGE-L.
n_scored = len(scores)
for f1_total in (rouge2_f1, rougel_f1):
    print(f1_total / n_scored * 100)

8.119387877289746
23.065415680558456


In [24]:
import json

# Metrics exported to the CSV, in column order.
ROUGE_METRICS = ('rouge-1', 'rouge-2', 'rouge-l')

# One row per article holding the F1 of each metric as a percentage.
score_rows = []
for score in scores:
    try:
        article_scores = score[0]
        row = {metric: article_scores[metric]['f'] * 100
               for metric in ROUGE_METRICS}
    except (KeyError, IndexError, TypeError):
        # Malformed or missing score entry: keep the row so the table stays
        # aligned with the evaluated articles, but leave the values empty.
        # Plain indexing is used so these failures are actually caught;
        # chained .get() calls would raise AttributeError/TypeError instead
        # of KeyError.
        row = dict.fromkeys(ROUGE_METRICS)
    score_rows.append(row)

# Passing the columns explicitly keeps them present even when `scores` is empty.
score_df = pd.DataFrame(score_rows, columns=list(ROUGE_METRICS))
score_df.to_csv('rouge_1000-rand_seed420.csv', index=False)
score_df

Unnamed: 0,rouge-1,rouge-2,rouge-l
0,28.985507,2.666666,14.492753
1,13.157894,0.000000,10.526315
2,32.258064,9.210526,25.806451
3,15.789473,0.000000,10.526315
4,26.666666,6.060606,24.444444
...,...,...,...
995,24.827586,6.703910,24.827586
996,30.232558,12.244897,30.232558
997,16.666666,0.000000,12.500000
998,10.810810,0.000000,10.810810


In [25]:
# Report the column means of score_df, one line per ROUGE variant.
for label, column in (('ROUGE-1', 'rouge-1'),
                      ('ROUGE-2', 'rouge-2'),
                      ('ROUGE-l', 'rouge-l')):
    print(f'{label}: {score_df[column].mean() : .2f}')

ROUGE-1:  25.78
ROUGE-2:  8.12
ROUGE-l:  23.07
