# Modeling with T5

## Packages

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import math

from datasets import load_dataset
import evaluate

import inspect

#let's make longer output readable without horizontal scrolling
from pprint import pprint

import warnings

import regex as re

import os, re
import time

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# These auto classes load the right type of tokenizer and model based on a model name
from transformers import AutoTokenizer, TFAutoModel
from transformers import pipeline
from transformers import AutoModel

  from .autonotebook import tqdm as notebook_tqdm


## Necessary Functions

In [2]:
rouge = evaluate.load('rouge')

In [3]:
chrf = evaluate.load("chrf")

In [4]:
def get_default_args(func):
    signature = inspect.signature(func)
    return {
        k: v.default
        for k, v in signature.parameters.items()
        if v.default is not inspect.Parameter.empty
    }

In [5]:
# This function is used to generate candidates and scores using a Huggingface model. 

# Default hyperparameters in this function are the same as those in the Huggingface models.  
# https://huggingface.co/docs/transformers/main_classes/text_generation

def t5_scores(mod, data, do_sample = False, num_beams = 1, top_k = 50, num_beam_groups = 1):

    bart_r1 = []
    bart_r2 = []
    bart_rL = []
    bart_rLs = []
    bart_chrf = []

    for i in range(int(len(data['text']))):


        candidate = mod(data['text'][i], 
                               truncation = True, # truncated to first 1024 words, because that is all the model can handle
                               max_length = 256, # same as one of the max reference lengths used in PEGASUS training
                               min_length = 0, 
                               do_sample = do_sample,
                               num_beams = num_beams, 
                               top_k = top_k,
                               num_beam_groups = num_beam_groups,
                                )[0]
        candidate = [candidate['summary_text']]
        #pprint(candidate[0], compact=True)

        ref = [data['summary'][i]]

        results = rouge.compute(predictions=candidate,
                                references=ref)

        bart_r1.append(results['rouge1'])
        bart_r2.append(results['rouge2'])
        bart_rL.append(results['rougeL'])
        bart_rLs.append(results['rougeLsum'])

        results = chrf.compute(predictions=candidate,
                                references=ref)

        bart_chrf.append(results['score'])
    
    print('Last Article', df['text'][i])
    print('Last Reference Summary', ref)
    print('Last Candidate Summary', candidate)

    print('rouge1 average :', np.mean(bart_r1))
    print('rouge2 average :', np.mean(bart_r2))
    print('rougeL average :', np.mean(bart_rL))
    print('rougeLs average :', np.mean(bart_rLs))
    print('chrf average :', np.mean(bart_chrf))


## Huggingface Transformers Training Script

https://github.com/huggingface/transformers/blob/main/examples/pytorch/summarization/run_summarization.py

## Data

In [6]:
# Validation Set
df = pd.read_csv('../Data/xl_sum_sample_val.csv')
df.head(5)

Unnamed: 0,text,summary
0,Anthony ZurcherNorth America reporter@awzurche...,On day three of public hearings in the impeach...
1,It made a net profit of $281m (£185m) in the t...,"Yum Brands, owner of KFC and Pizza Hut restaur..."
2,Police sources told local media that the boy h...,Four members of the same family have been arre...
3,Zelda Perkins told the Financial Times she sig...,A British former assistant of Harvey Weinstein...
4,Bus workers walked out on Monday over changes ...,Bus drivers in Jersey have agreed to meet with...


In [7]:
# Test set 
dft = pd.read_csv('../Data/xl_sum_sample_test.csv')
#dft.head(5)

## Models (Untrained)

### Model 0

In [8]:
t5_base_summarizer = pipeline("summarization", model="t5-base")

Downloading pytorch_model.bin: 100%|██████████| 892M/892M [00:01<00:00, 479MB/s]
Downloading (…)/main/tokenizer.json: 100%|█| 1.39M/1.39M [00:00<00:00, 68.6MB/s]
For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [10]:
t5_scores(t5_base_summarizer, df, do_sample = True)

Your max_length is set to 256, but you input_length is only 180. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=90)
Your max_length is set to 256, but you input_length is only 129. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=64)
Your max_length is set to 256, but you input_length is only 232. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=116)
Your max_length is set to 256, but you input_length is only 66. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=33)
Your max_length is set to 256, but you input_length is only 174. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=87)
Your max_length is set to 256, but you input_length is only 242. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=121)
Your max_length is set to 256, but you input_length is only 230. You might 

Last Article She tweeted that the clothes would be "archived & expertly cared for in the spirit & love of Michael Jackson, his bravery, & fans worldwide". The auction included a jacket worn during Jackson's Bad tour, that went for $240,000 (£148,000) and two crystal gloves. The items were all made by designers Dennis Tompkins and Michael Bush. Lady Gaga also tweeted a picture of herself and her bidding paddle at the auction. More than $5m (£3.1m) was raised by the sale, according to LA-based Julien's Auctions. Other items that went under the hammer included jackets from Michael Jackson's Dangerous and Thriller tours and a pair of jeans that went for $50,000 (£31,000). Some of the money raised by the auction is being donated to a guide dogs charity and a hospice in Las Vegas. American costume designers Michael Bush and Dennis Tompkins created thousands of original pieces for Michael Jackson during his career. However, despite Lady Gaga's assurances, some fans expressed their anger onlin

### Model 1

In [None]:
t5_base_summarizer = pipeline("summarization", model="t5-base")

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/892M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [11]:
t5_scores(t5_base_summarizer, df, do_sample = True, num_beams = 4, top_k = 75)

Your max_length is set to 256, but you input_length is only 180. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=90)
Your max_length is set to 256, but you input_length is only 129. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=64)
Your max_length is set to 256, but you input_length is only 232. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=116)
Your max_length is set to 256, but you input_length is only 66. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=33)
Your max_length is set to 256, but you input_length is only 174. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=87)
Your max_length is set to 256, but you input_length is only 242. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=121)
Your max_length is set to 256, but you input_length is only 230. You might 

Last Article She tweeted that the clothes would be "archived & expertly cared for in the spirit & love of Michael Jackson, his bravery, & fans worldwide". The auction included a jacket worn during Jackson's Bad tour, that went for $240,000 (£148,000) and two crystal gloves. The items were all made by designers Dennis Tompkins and Michael Bush. Lady Gaga also tweeted a picture of herself and her bidding paddle at the auction. More than $5m (£3.1m) was raised by the sale, according to LA-based Julien's Auctions. Other items that went under the hammer included jackets from Michael Jackson's Dangerous and Thriller tours and a pair of jeans that went for $50,000 (£31,000). Some of the money raised by the auction is being donated to a guide dogs charity and a hospice in Las Vegas. American costume designers Michael Bush and Dennis Tompkins created thousands of original pieces for Michael Jackson during his career. However, despite Lady Gaga's assurances, some fans expressed their anger onlin

### Model 2 **BEST MODEL**

In [None]:
t5_base_summarizer = pipeline("summarization", model="t5-base")

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/892M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [12]:
t5_scores(t5_base_summarizer, df, do_sample = False, num_beams = 4, top_k = 75, num_beam_groups = 2)

Your max_length is set to 256, but you input_length is only 180. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=90)
Your max_length is set to 256, but you input_length is only 129. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=64)
Your max_length is set to 256, but you input_length is only 232. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=116)
Your max_length is set to 256, but you input_length is only 66. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=33)
Your max_length is set to 256, but you input_length is only 174. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=87)
Your max_length is set to 256, but you input_length is only 242. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=121)
Your max_length is set to 256, but you input_length is only 230. You might 

Last Article She tweeted that the clothes would be "archived & expertly cared for in the spirit & love of Michael Jackson, his bravery, & fans worldwide". The auction included a jacket worn during Jackson's Bad tour, that went for $240,000 (£148,000) and two crystal gloves. The items were all made by designers Dennis Tompkins and Michael Bush. Lady Gaga also tweeted a picture of herself and her bidding paddle at the auction. More than $5m (£3.1m) was raised by the sale, according to LA-based Julien's Auctions. Other items that went under the hammer included jackets from Michael Jackson's Dangerous and Thriller tours and a pair of jeans that went for $50,000 (£31,000). Some of the money raised by the auction is being donated to a guide dogs charity and a hospice in Las Vegas. American costume designers Michael Bush and Dennis Tompkins created thousands of original pieces for Michael Jackson during his career. However, despite Lady Gaga's assurances, some fans expressed their anger onlin

### Testing Best Untrained Model From Above

In [13]:
t5_base_summarizer = pipeline("summarization", model="t5-base")

In [14]:
t5_scores(t5_base_summarizer, dft, do_sample = False, num_beams = 4, top_k = 75, num_beam_groups = 2)

Your max_length is set to 256, but you input_length is only 99. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=49)
Your max_length is set to 256, but you input_length is only 96. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=48)
Your max_length is set to 256, but you input_length is only 204. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=102)
Your max_length is set to 256, but you input_length is only 213. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=106)
Your max_length is set to 256, but you input_length is only 243. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=121)
Your max_length is set to 256, but you input_length is only 192. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=96)
Your max_length is set to 256, but you input_length is only 223. You might 

Last Article She tweeted that the clothes would be "archived & expertly cared for in the spirit & love of Michael Jackson, his bravery, & fans worldwide". The auction included a jacket worn during Jackson's Bad tour, that went for $240,000 (£148,000) and two crystal gloves. The items were all made by designers Dennis Tompkins and Michael Bush. Lady Gaga also tweeted a picture of herself and her bidding paddle at the auction. More than $5m (£3.1m) was raised by the sale, according to LA-based Julien's Auctions. Other items that went under the hammer included jackets from Michael Jackson's Dangerous and Thriller tours and a pair of jeans that went for $50,000 (£31,000). Some of the money raised by the auction is being donated to a guide dogs charity and a hospice in Las Vegas. American costume designers Michael Bush and Dennis Tompkins created thousands of original pieces for Michael Jackson during his career. However, despite Lady Gaga's assurances, some fans expressed their anger onlin

## Models (Trained)

### Model 0

In [28]:
!python3 transformers/examples/pytorch/summarization/run_summarization.py \
    --model_name_or_path T5-small \
    --do_train \
    --train_file 'w266_project/Datasets/xl_sum_sample_train.csv' \
    --text_column text \
    --summary_column summary \
    --max_source_length 512 \
    --max_target_length 256 \
    --num_beams 5 \
    --num_train_epochs 5 \
    --source_prefix "summarize: " \
    --per_device_train_batch_size=32 \
    --output_dir='T5/model_0/' \
    --overwrite_output_dir=True \
    --predict_with_generate 

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
04/08/2023 03:12:28 - INFO - __main__ - Training/evaluation parameters Seq2SeqTrainingArguments(
_n_gpu=0,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=False,
do_predict=False,
do_train=True,
eval_accumulation_steps=None,
eval_delay=0,
eval_steps=None,
evaluation_strategy=no,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_config={'fsdp_min_num_params': 0, 'xla'

In [25]:
summarizer = pipeline("summarization", model="arisanguyen/finetuned_T5_all_categories", revision = 'model_0')

In [26]:
t5_scores(summarizer, df)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Your max_length is set to 256, but you input_length is only 180. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=90)
Your max_length is set to 256, but you input_length is only 129. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=64)
Your max_length is set to 256, but you input_length is only 232. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=116)
Your max_length is set to 256, but you input_length is only 66. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=33)
Your max_length is set to 256, but you input_length is only 174. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=87)
Your max_length is set to 256, but you input_length is only 242. You might consider decre

Last Article She tweeted that the clothes would be "archived & expertly cared for in the spirit & love of Michael Jackson, his bravery, & fans worldwide". The auction included a jacket worn during Jackson's Bad tour, that went for $240,000 (£148,000) and two crystal gloves. The items were all made by designers Dennis Tompkins and Michael Bush. Lady Gaga also tweeted a picture of herself and her bidding paddle at the auction. More than $5m (£3.1m) was raised by the sale, according to LA-based Julien's Auctions. Other items that went under the hammer included jackets from Michael Jackson's Dangerous and Thriller tours and a pair of jeans that went for $50,000 (£31,000). Some of the money raised by the auction is being donated to a guide dogs charity and a hospice in Las Vegas. American costume designers Michael Bush and Dennis Tompkins created thousands of original pieces for Michael Jackson during his career. However, despite Lady Gaga's assurances, some fans expressed their anger onlin

### Model 1: Increased epochs

In [22]:
!python3 ~/transformers/examples/pytorch/summarization/run_summarization.py \
    --model_name_or_path T5-small \
    --do_train \
    --train_file '../Data/xl_sum_sample_train.csv' \
    --text_column text \
    --summary_column summary \
    --max_source_length 512 \
    --max_target_length 256 \
    --num_beams 1 \
    --num_train_epochs 15 \
    --source_prefix "summarize: " \
    --per_device_train_batch_size=32 \
    --output_dir='T5/model_1/' \
    --overwrite_output_dir=True \
    --predict_with_generate 

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
04/16/2023 07:14:12 - INFO - __main__ - Training/evaluation parameters Seq2SeqTrainingArguments(
_n_gpu=0,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=False,
do_predict=False,
do_train=True,
eval_accumulation_steps=None,
eval_delay=0,
eval_steps=None,
evaluation_strategy=no,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_config={'fsdp_min_num_params': 0, 'xla'

In [23]:
summarizer = pipeline("summarization", model="arisanguyen/finetuned_T5_all_categories", revision = 'model_1')

Downloading pytorch_model.bin: 100%|██████████| 242M/242M [00:02<00:00, 103MB/s]
Downloading (…)del_1/tokenizer.json: 100%|█| 2.42M/2.42M [00:00<00:00, 91.6MB/s]


In [27]:
t5_scores(summarizer, df)

Your max_length is set to 256, but you input_length is only 180. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=90)
Your max_length is set to 256, but you input_length is only 129. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=64)
Your max_length is set to 256, but you input_length is only 232. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=116)
Your max_length is set to 256, but you input_length is only 66. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=33)
Your max_length is set to 256, but you input_length is only 174. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=87)
Your max_length is set to 256, but you input_length is only 242. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=121)
Your max_length is set to 256, but you input_length is only 230. You might 

Last Article She tweeted that the clothes would be "archived & expertly cared for in the spirit & love of Michael Jackson, his bravery, & fans worldwide". The auction included a jacket worn during Jackson's Bad tour, that went for $240,000 (£148,000) and two crystal gloves. The items were all made by designers Dennis Tompkins and Michael Bush. Lady Gaga also tweeted a picture of herself and her bidding paddle at the auction. More than $5m (£3.1m) was raised by the sale, according to LA-based Julien's Auctions. Other items that went under the hammer included jackets from Michael Jackson's Dangerous and Thriller tours and a pair of jeans that went for $50,000 (£31,000). Some of the money raised by the auction is being donated to a guide dogs charity and a hospice in Las Vegas. American costume designers Michael Bush and Dennis Tompkins created thousands of original pieces for Michael Jackson during his career. However, despite Lady Gaga's assurances, some fans expressed their anger onlin

### Model 2: Added beam groups for diversification. **BEST MODEL**

In [29]:
summarizer = pipeline("summarization", model="arisanguyen/finetuned_T5_all_categories", revision = 'model_0')

Downloading pytorch_model.bin: 100%|██████████| 242M/242M [00:02<00:00, 102MB/s]
Downloading (…)del_0/tokenizer.json: 100%|█| 2.42M/2.42M [00:00<00:00, 81.0MB/s]


In [32]:
t5_scores(summarizer, df, do_sample = False, num_beams = 4, top_k = 75, num_beam_groups = 2)

Your max_length is set to 256, but you input_length is only 180. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=90)
Your max_length is set to 256, but you input_length is only 129. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=64)
Your max_length is set to 256, but you input_length is only 232. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=116)
Your max_length is set to 256, but you input_length is only 66. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=33)
Your max_length is set to 256, but you input_length is only 174. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=87)
Your max_length is set to 256, but you input_length is only 242. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=121)
Your max_length is set to 256, but you input_length is only 230. You might 

Last Article She tweeted that the clothes would be "archived & expertly cared for in the spirit & love of Michael Jackson, his bravery, & fans worldwide". The auction included a jacket worn during Jackson's Bad tour, that went for $240,000 (£148,000) and two crystal gloves. The items were all made by designers Dennis Tompkins and Michael Bush. Lady Gaga also tweeted a picture of herself and her bidding paddle at the auction. More than $5m (£3.1m) was raised by the sale, according to LA-based Julien's Auctions. Other items that went under the hammer included jackets from Michael Jackson's Dangerous and Thriller tours and a pair of jeans that went for $50,000 (£31,000). Some of the money raised by the auction is being donated to a guide dogs charity and a hospice in Las Vegas. American costume designers Michael Bush and Dennis Tompkins created thousands of original pieces for Michael Jackson during his career. However, despite Lady Gaga's assurances, some fans expressed their anger onlin