In [4]:
import os
import sys
import argparse
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.metrics import f1_score
from datasets import load_dataset, load_metric
from transformers import DataCollatorForSeq2Seq, AdamWeightDecay, \
    TFT5ForConditionalGeneration, T5Tokenizer

In [8]:
def preprocess_function(examples):
    """ Use tokenizer to preprocess data. """
    
    tokenizer = T5Tokenizer.from_pretrained("t5-small")
    prefix = "summarize: "

    inputs = [prefix + doc for doc in examples["article"]]
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True)

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["highlights"], max_length=80, truncation=True)

    model_inputs["labels"] = labels["input_ids"]

    return model_inputs


def download_and_preprocess_data():
    """ Load dataset from HuggingFace and preprocess. """
    
    news_ds = load_dataset("cnn_dailymail", "3.0.0", split="test")

    # Tokenized using preprocess_function
    tokenized_news = news_ds.map(preprocess_function, batched=True)

    return tokenized_news


In [9]:
tokenizer = T5Tokenizer.from_pretrained("t5-small")
tokenized_news = download_and_preprocess_data()

optimizer = AdamWeightDecay(
    learning_rate=2e-5, 
    weight_decay_rate=0.01
)

model = TFT5ForConditionalGeneration.from_pretrained("t5_small_news")
model.compile(optimizer=optimizer)

data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer, 
    model=model, 
    return_tensors="tf",
)

Reusing dataset cnn_dailymail (C:\Users\andyl\.cache\huggingface\datasets\cnn_dailymail\3.0.0\3.0.0\3cb851bf7cf5826e45d49db2863f627cba583cbc32342df7349dfe6c38060234)
100%|██████████| 12/12 [00:42<00:00,  3.58s/ba]
All model checkpoint layers were used when initializing TFT5ForConditionalGeneration.

All the layers of TFT5ForConditionalGeneration were initialized from the model checkpoint at t5_small_news.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.
No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour, please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.


In [10]:
tokenized_news

Dataset({
    features: ['article', 'highlights', 'id', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 11490
})

In [44]:
test_ds = tokenized_news.to_tf_dataset(
    columns=["attention_mask", "input_ids", "labels"],
    shuffle=False,
    batch_size=8,
    collate_fn=data_collator,
)

In [45]:
def compute_metrics(metric, pred, actual):
    """ Compute the model's rouge performance on an instance. """

    metric.add(predictions=pred, references=actual)
    final_score = metric.compute()
    
    return final_score

In [46]:
metric = load_metric('rouge')
result = [[] for x in range(3)]
cnt = 100

for item in test_ds:
    if cnt < 0:
        break
    cnt -= 1

    article = item['input_ids']
    actual = item['labels']
    
    pred = model.generate(
        do_sample=True,
        input_ids=article,
        min_length=56,
        max_length=128,
        temperature=0.8, 
        top_k=45,
        no_repeat_ngram_size=3,
        num_beams=5,
        early_stopping=True
    )

    rouge_score = compute_metrics(metric, pred, actual)
    rouge1 = 100 * rouge_score['rouge1'][1][2]
    rouge2 = 100 * rouge_score['rouge2'][1][2]
    rougeL = 100 * rouge_score['rougeL'][1][2]

    result[0].append(rouge1)
    result[1].append(rouge2)
    result[2].append(rougeL)

    print(f"rouge1: {rouge1}")
    print(f"rouge2: {rouge2}")
    print(f"rougeL: {rougeL}\n")

rouge1: 43.52517985611511
rouge2: 19.63963963963964
rougeL: 26.258992805755394

rouge1: 39.58333333333333
rouge2: 18.138041733547354
rougeL: 23.076923076923077

rouge1: 37.738853503184714
rouge2: 17.38437001594896
rougeL: 23.2484076433121

rouge1: 40.816326530612244
rouge2: 18.56899488926746
rougeL: 22.10884353741497

rouge1: 39.67391304347826
rouge2: 12.522686025408348
rougeL: 19.746376811594203

rouge1: 40.136054421768705
rouge2: 15.332197614991482
rougeL: 21.42857142857143

rouge1: 32.6530612244898
rouge2: 11.41396933560477
rougeL: 17.346938775510207

rouge1: 44.48529411764706
rouge2: 19.337016574585633
rougeL: 24.264705882352942

rouge1: 42.41379310344828
rouge2: 22.970639032815203
rougeL: 25.517241379310345

rouge1: 45.78125
rouge2: 21.909233176838807
rougeL: 24.53125

rouge1: 34.640522875817
rouge2: 14.566284779050736
rougeL: 20.098039215686274

rouge1: 44.99999999999999
rouge2: 20.408163265306122
rougeL: 27.77777777777778

rouge1: 41.24203821656051
rouge2: 17.38437001594896
roug

In [47]:
result_df = pd.DataFrame(data = np.array(result).T, columns = ['Rouge1','Rouge2','RougeL'])
result_df

Unnamed: 0,Rouge1,Rouge2,RougeL
0,43.525180,19.639640,26.258993
1,39.583333,18.138042,23.076923
2,37.738854,17.384370,23.248408
3,40.816327,18.568995,22.108844
4,39.673913,12.522686,19.746377
...,...,...,...
96,44.463087,27.226891,30.201342
97,36.858974,16.853933,20.512821
98,46.626984,23.856859,29.761905
99,45.270270,20.473773,27.364865


In [48]:
result_df.describe()

Unnamed: 0,Rouge1,Rouge2,RougeL
count,101.0,101.0,101.0
mean,41.122402,18.848548,23.775518
std,3.732857,3.693768,3.519158
min,28.965517,11.413969,17.346939
25%,39.137931,16.695652,21.656051
50%,41.134752,18.568995,23.310811
75%,43.387097,20.582524,25.178571
max,56.21118,38.102644,42.236025


## Play around with text summarization

In [55]:
article = """ 
(CNN)Ancient humans ate copious quantities of oysters -- shucking billions of shells over thousands of years in a way that did
not appear to cause oyster populations to collapse as they have in many places today.
New research, based on an analysis of dozens of archaeological sites in the United States and Australia,
suggested that oysters were sustainably farmed on a massive scale by Indigenous groups. 
The mollusks were an abundant source of food despite being harvested intensively.
The authors of the study, which published Tuesday in the scientific journal Nature Communications, 
said these sites were a "forgotten resource" that could inform the future management of oyster beds.
"The fact that there are so many oysters at archaeological sites in so many different regions is an important lesson," 
said study author Leslie Reeder-Myers, an assistant professor of anthropology at Temple University in Philadelphia, in a statement.
"These systems have a ton of potential and huge quantities of oysters can be sustainably harvested over long time periods if the ecosystem is healthy," 
said Reeder-Myers, who is also director of Temple's anthropology laboratory. The amount of oysters consumed in some places was staggering, the study found.
"""

In [54]:
tokenizer = T5Tokenizer.from_pretrained("t5-small")
tokenized_input = tokenizer("summarize: " + article, max_length=1024, truncation=True, return_tensors='tf')

pred = model.generate(
    do_sample=True,
    input_ids=tokenized_input['input_ids'],
    min_length=56,
    max_length=128,
    temperature=0.8, 
    top_k=45,
    no_repeat_ngram_size=3,
    num_beams=5,
    early_stopping=True
)

pred_sentence = tokenizer.decode(pred[0], skip_special_tokens=True)

print(f"original = {article}\n")
print(f"pred = {pred_sentence}\n")

original =  
(CNN)Ancient humans ate copious quantities of oysters -- shucking billions of shells over thousands of years in a way that did
not appear to cause oyster populations to collapse as they have in many places today.
New research, based on an analysis of dozens of archaeological sites in the United States and Australia,
suggested that oysters were sustainably farmed on a massive scale by Indigenous groups. 
The mollusks were an abundant source of food despite being harvested intensively.
The authors of the study, which published Tuesday in the scientific journal Nature Communications, 
said these sites were a "forgotten resource" that could inform the future management of oyster beds.
"The fact that there are so many oysters at archaeological sites in so many different regions is an important lesson," 
said study author Leslie Reeder-Myers, an assistant professor of anthropology at Temple University in Philadelphia, in a statement.
"These systems have a ton of potential and hu