In [1]:
import os
import sys
import argparse
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.metrics import f1_score
from datasets import load_dataset, load_metric
from transformers import DataCollatorForSeq2Seq, AdamWeightDecay, \
    TFT5ForConditionalGeneration, T5Tokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def preprocess_function(examples):
    """ Use tokenizer to preprocess data. """
    
    tokenizer = T5Tokenizer.from_pretrained("t5-small")
    prefix = "summarize: "

    inputs = [prefix + doc for doc in examples["article"]]
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True)

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["highlights"], max_length=80, truncation=True)

    model_inputs["labels"] = labels["input_ids"]

    return model_inputs


def download_and_preprocess_data():
    """ Load dataset from HuggingFace and preprocess. """
    
    news_ds = load_dataset("cnn_dailymail", "3.0.0", split="test")

    # Tokenized using preprocess_function
    tokenized_news = news_ds.map(preprocess_function, batched=True)

    return tokenized_news

In [3]:
tokenizer = T5Tokenizer.from_pretrained("t5-small")

optimizer = AdamWeightDecay(
    learning_rate=2e-5, 
    weight_decay_rate=0.01
)

model = TFT5ForConditionalGeneration.from_pretrained("t5_small_news")
model.compile(optimizer=optimizer)

data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer, 
    model=model, 
    return_tensors="tf",
)

All model checkpoint layers were used when initializing TFT5ForConditionalGeneration.

All the layers of TFT5ForConditionalGeneration were initialized from the model checkpoint at t5_small_news.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.
No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour, please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.


In [4]:
tokenized_news = download_and_preprocess_data()
tokenized_news

Reusing dataset cnn_dailymail (C:\Users\andyl\.cache\huggingface\datasets\cnn_dailymail\3.0.0\3.0.0\3cb851bf7cf5826e45d49db2863f627cba583cbc32342df7349dfe6c38060234)
Loading cached processed dataset at C:\Users\andyl\.cache\huggingface\datasets\cnn_dailymail\3.0.0\3.0.0\3cb851bf7cf5826e45d49db2863f627cba583cbc32342df7349dfe6c38060234\cache-55793dd6c08d73ff.arrow


Dataset({
    features: ['article', 'highlights', 'id', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 11490
})

In [8]:
test_ds = tokenized_news.to_tf_dataset(
    columns=["attention_mask", "input_ids", "labels"],
    shuffle=False,
    batch_size=4,
    collate_fn=data_collator,
)

In [9]:
def compute_metrics(metric, pred, actual):
    """ Compute the model's rouge performance on an instance. """

    metric.add(predictions=pred, references=actual)
    final_score = metric.compute()
    
    return final_score

In [13]:
metric = load_metric('rouge')
result = [[] for x in range(3)]

cnt = 0
for item in test_ds:
    article = item['input_ids']
    actual = item['labels']
    
    pred = model.generate(
        do_sample=True,
        input_ids=article,
        # min_length=56,
        max_length=80,
        temperature=0.8, 
        top_k=45,
        no_repeat_ngram_size=3,
        num_beams=5,
        early_stopping=True
    )

    rouge_score = compute_metrics(metric, pred, actual)
    rouge1 = 100 * rouge_score['rouge1'][1][2]
    rouge2 = 100 * rouge_score['rouge2'][1][2]
    rougeL = 100 * rouge_score['rougeL'][1][2]

    cnt += 1 
    if cnt % 25 == 0:
        print(f'Round: {cnt * 4}')

    result[0].append(rouge1)
    result[1].append(rouge2)
    result[2].append(rougeL)

Round: 100
Round: 200
Round: 300
Round: 400
Round: 500
Round: 600
Round: 700
Round: 800
Round: 900
Round: 1000
Round: 1100
Round: 1200
Round: 1300
Round: 1400
Round: 1500
Round: 1600
Round: 1700
Round: 1800
Round: 1900
Round: 2000
Round: 2100
Round: 2200
Round: 2300
Round: 2400
Round: 2500
Round: 2600
Round: 2700
Round: 2800
Round: 2900
Round: 3000
Round: 3100
Round: 3200
Round: 3300
Round: 3400
Round: 3500
Round: 3600
Round: 3700
Round: 3800
Round: 3900
Round: 4000
Round: 4100
Round: 4200
Round: 4300
Round: 4400
Round: 4500
Round: 4600
Round: 4700
Round: 4800
Round: 4900
Round: 5000
Round: 5100
Round: 5200
Round: 5300
Round: 5400
Round: 5500
Round: 5600
Round: 5700
Round: 5800
Round: 5900
Round: 6000
Round: 6100
Round: 6200
Round: 6300
Round: 6400
Round: 6500
Round: 6600
Round: 6700
Round: 6800
Round: 6900
Round: 7000
Round: 7100
Round: 7200
Round: 7300
Round: 7400
Round: 7500
Round: 7600
Round: 7700
Round: 7800
Round: 7900
Round: 8000
Round: 8100
Round: 8200
Round: 8300
Round: 8400
R

In [14]:
result_df = pd.DataFrame(data = np.array(result).T, columns = ['Rouge1','Rouge2','RougeL'])
result_df

Unnamed: 0,Rouge1,Rouge2,RougeL
0,43.043478,21.834061,29.130435
1,43.362832,20.444444,25.663717
2,40.849673,20.983607,27.450980
3,43.478261,20.087336,29.565217
4,32.352941,11.475410,18.627451
...,...,...,...
2868,54.375000,33.855799,39.375000
2869,52.500000,25.391850,30.000000
2870,54.375000,29.153605,31.875000
2871,35.312500,11.285266,19.375000


In [15]:
result_df.describe()

Unnamed: 0,Rouge1,Rouge2,RougeL
count,2873.0,2873.0,2873.0
mean,48.644851,25.545884,30.80656
std,6.285275,6.257816,5.866729
min,23.553719,6.639004,12.809917
25%,44.921875,21.316614,26.875
50%,48.75,25.07837,30.254777
75%,52.8125,29.153605,34.375
max,69.6875,50.15674,54.6875


## Play around with text summarization

In [16]:
article = """ 
NEW DELHI, India (CNN) -- A high court in northern India on Friday acquitted a wealthy businessman facing the death sentence for the killing of a teen in a case dubbed "the house of horrors."
Moninder Singh Pandher was sentenced to death by a lower court in February.
The teen was one of 19 victims -- children and young women -- in one of the most gruesome serial killings in India in recent years.
The Allahabad high court has acquitted Moninder Singh Pandher, his lawyer Sikandar B. Kochar told CNN.
Pandher and his domestic employee Surinder Koli were sentenced to death in February by a lower court for the rape and murder of the 14-year-old.
The high court upheld Koli's death sentence, Kochar said.
"""

In [17]:
tokenizer = T5Tokenizer.from_pretrained("t5-small")
tokenized_input = tokenizer("summarize: " + article, max_length=1024, truncation=True, return_tensors='tf')

pred = model.generate(
    do_sample=True,
    input_ids=tokenized_input['input_ids'],
    min_length=56,
    max_length=128,
    temperature=0.8, 
    top_k=45,
    no_repeat_ngram_size=3,
    num_beams=5,
    early_stopping=True
)

pred_sentence = tokenizer.decode(pred[0], skip_special_tokens=True)

print(f"original = {article}\n")
print(f"pred = {pred_sentence}\n")

original =  
NEW DELHI, India (CNN) -- A high court in northern India on Friday acquitted a wealthy businessman facing the death sentence for the killing of a teen in a case dubbed "the house of horrors."
Moninder Singh Pandher was sentenced to death by a lower court in February.
The teen was one of 19 victims -- children and young women -- in one of the most gruesome serial killings in India in recent years.
The Allahabad high court has acquitted Moninder Singh Pandher, his lawyer Sikandar B. Kochar told CNN.
Pandher and his domestic employee Surinder Koli were sentenced to death in February by a lower court for the rape and murder of the 14-year-old.
The high court upheld Koli's death sentence, Kochar said.


pred = Moninder Singh Pandher was one of 19 victims in one of most gruesome serial killings in India. Pandhe and his domestic employee Surinder Koli were sentenced to death in February. High court upheld Koli's death sentence, lawyer says.

