# Modeling with BART

## Packages

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import math

from datasets import load_dataset
import evaluate

import inspect

#let's make longer output readable without horizontal scrolling
from pprint import pprint

import warnings

import regex as re

# NOTE: the `import os, re` below rebinds the name `re` to the standard-library module,
# so the `regex` alias imported above is shadowed from this point on.
import os, re
import time

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# These auto classes load the right type of tokenizer and model based on a model name
from transformers import AutoTokenizer, TFAutoModel
from transformers import pipeline
from transformers import AutoModel

  from .autonotebook import tqdm as notebook_tqdm


## Necessary Functions

In [2]:
# ROUGE metric object; bart_scores and the scoring cells below call rouge.compute(...) on it.
rouge = evaluate.load('rouge')

In [3]:
# chrF metric object; its compute(...) result is read through the 'score' key below.
chrf = evaluate.load("chrf")

In [4]:
def get_default_args(func):
    """Return a dict mapping each parameter of ``func`` to its declared default.

    Parameters that have no default value are left out. Entries follow the order
    of the signature.
    """
    defaults = {}
    for name, param in inspect.signature(func).parameters.items():
        # `Parameter.empty` is the sentinel inspect uses for "no default given".
        if param.default is inspect.Parameter.empty:
            continue
        defaults[name] = param.default
    return defaults

In [5]:
# This function is used to generate candidates and scores using a Huggingface model. 

# Default hyperparameters in this function are the same as those in the Huggingface models.  
# https://huggingface.co/docs/transformers/main_classes/text_generation

def bart_scores(mod, data, do_sample = False, num_beams = 1, top_k = 50, num_beam_groups = 1):
    """Summarize every row of ``data`` with ``mod`` and print the average ROUGE / chrF scores.

    Each text is summarized on its own, scored against its reference summary with the
    module-level ``rouge`` and ``chrf`` metric objects, and the per-example scores are
    averaged. The last article, its reference and its candidate are printed as a spot check.

    Args:
        mod: Callable used like a Hugging Face summarization pipeline. It is called with one
            text plus generation keyword arguments and must return a sequence whose first
            item is a mapping with a ``'summary_text'`` entry.
        data: Object exposing ``data['text']`` and ``data['summary']`` as equally long
            iterables (for example a pandas DataFrame).
        do_sample: Forwarded unchanged to ``mod``.
        num_beams: Forwarded unchanged to ``mod``.
        top_k: Forwarded unchanged to ``mod``.
        num_beam_groups: Forwarded unchanged to ``mod``.

    Returns:
        None. All results are printed.
    """
    bart_r1 = []
    bart_r2 = []
    bart_rL = []
    bart_rLs = []
    bart_chrf = []

    # Iterate by position rather than by label so the function also works when `data`
    # does not carry a default 0..n-1 index (e.g. a filtered or sampled DataFrame).
    for text, summary in zip(data['text'], data['summary']):

        generated = mod(text,
                        truncation = True, # inputs longer than the model's limit are cut (the limit is counted in tokens, not words)
                        max_length = 256, # same as pegasus
                        min_length = 0,
                        do_sample = do_sample,
                        num_beams = num_beams,
                        top_k = top_k,
                        num_beam_groups = num_beam_groups,
                        )[0]

        # The metric objects are given one-element lists: one prediction, one reference.
        candidate = [generated['summary_text']]
        ref = [summary]

        results = rouge.compute(predictions=candidate,
                                references=ref)

        bart_r1.append(results['rouge1'])
        bart_r2.append(results['rouge2'])
        bart_rL.append(results['rougeL'])
        bart_rLs.append(results['rougeLsum'])

        results = chrf.compute(predictions=candidate,
                               references=ref)

        bart_chrf.append(results['score'])

    # With no rows there is no "last" example and nothing to average; say so instead of
    # failing on unbound loop variables.
    if not bart_r1:
        print('No examples to score.')
        return

    print('Last Article', text)
    print('Last Reference Summary', ref)
    print('Last Candidate Summary', candidate)

    print('rouge1 average :', np.mean(bart_r1))
    print('rouge2 average :', np.mean(bart_r2))
    print('rougeL average :', np.mean(bart_rL))
    print('rougeLs average :', np.mean(bart_rLs))
    print('chrf average :', np.mean(bart_chrf))

## Huggingface Transformers Training Script

https://github.com/huggingface/transformers/blob/main/examples/pytorch/summarization/run_summarization.py

## Data

In [6]:
# Validation set: read the sampled validation CSV (`xl_sum_sample_val.csv`) into `df`
# and preview its first five rows. The scoring cells below read its 'text' and 'summary' columns.
df = pd.read_csv('../Data/xl_sum_sample_val.csv')
df.head(5)

Unnamed: 0,text,summary
0,Anthony ZurcherNorth America reporter@awzurche...,On day three of public hearings in the impeach...
1,It made a net profit of $281m (£185m) in the t...,"Yum Brands, owner of KFC and Pizza Hut restaur..."
2,Police sources told local media that the boy h...,Four members of the same family have been arre...
3,Zelda Perkins told the Financial Times she sig...,A British former assistant of Harvey Weinstein...
4,Bus workers walked out on Monday over changes ...,Bus drivers in Jersey have agreed to meet with...


## Model (Untrained)

### Model 0

In [8]:
# Summarization pipeline built on the facebook/bart-base checkpoint, used here without any fine-tuning from this notebook.
summarizer = pipeline("summarization", model="facebook/bart-base")

In [9]:
# Untrained Model 0: no sampling, a single beam, top_k = 10.
# NOTE(review): top_k is described in the linked HF generation docs as a sampling filter, so it
# presumably has no effect while do_sample = False — confirm.
bart_scores(summarizer, df, do_sample = False, num_beams = 1, top_k = 10, num_beam_groups = 1)

Your max_length is set to 256, but you input_length is only 172. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=86)
Your max_length is set to 256, but you input_length is only 123. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=61)
Your max_length is set to 256, but you input_length is only 221. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=110)
Your max_length is set to 256, but you input_length is only 255. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=127)
Your max_length is set to 256, but you input_length is only 56. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=28)
Your max_length is set to 256, but you input_length is only 151. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=75)
Your max_length is set to 256, but you input_length is only 220. You might 

Last Article She tweeted that the clothes would be "archived & expertly cared for in the spirit & love of Michael Jackson, his bravery, & fans worldwide". The auction included a jacket worn during Jackson's Bad tour, that went for $240,000 (£148,000) and two crystal gloves. The items were all made by designers Dennis Tompkins and Michael Bush. Lady Gaga also tweeted a picture of herself and her bidding paddle at the auction. More than $5m (£3.1m) was raised by the sale, according to LA-based Julien's Auctions. Other items that went under the hammer included jackets from Michael Jackson's Dangerous and Thriller tours and a pair of jeans that went for $50,000 (£31,000). Some of the money raised by the auction is being donated to a guide dogs charity and a hospice in Las Vegas. American costume designers Michael Bush and Dennis Tompkins created thousands of original pieces for Michael Jackson during his career. However, despite Lady Gaga's assurances, some fans expressed their anger onlin

### Model 1

In [10]:
# Re-creates the same facebook/bart-base summarization pipeline as the Model 0 cell.
summarizer = pipeline("summarization", model="facebook/bart-base")

In [12]:
# Untrained Model 1: sampling enabled with two beams and top_k = 10.
# NOTE(review): no random seed is set in the visible cells, so these scores may differ between runs — confirm.
bart_scores(summarizer, df, do_sample = True, num_beams = 2, top_k = 10, num_beam_groups = 1)

Your max_length is set to 256, but you input_length is only 172. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=86)
Your max_length is set to 256, but you input_length is only 123. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=61)
Your max_length is set to 256, but you input_length is only 221. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=110)
Your max_length is set to 256, but you input_length is only 255. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=127)
Your max_length is set to 256, but you input_length is only 56. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=28)
Your max_length is set to 256, but you input_length is only 151. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=75)
Your max_length is set to 256, but you input_length is only 220. You might 

Last Article She tweeted that the clothes would be "archived & expertly cared for in the spirit & love of Michael Jackson, his bravery, & fans worldwide". The auction included a jacket worn during Jackson's Bad tour, that went for $240,000 (£148,000) and two crystal gloves. The items were all made by designers Dennis Tompkins and Michael Bush. Lady Gaga also tweeted a picture of herself and her bidding paddle at the auction. More than $5m (£3.1m) was raised by the sale, according to LA-based Julien's Auctions. Other items that went under the hammer included jackets from Michael Jackson's Dangerous and Thriller tours and a pair of jeans that went for $50,000 (£31,000). Some of the money raised by the auction is being donated to a guide dogs charity and a hospice in Las Vegas. American costume designers Michael Bush and Dennis Tompkins created thousands of original pieces for Michael Jackson during his career. However, despite Lady Gaga's assurances, some fans expressed their anger onlin

### Model 2 **BEST MODEL TIED WITH MODEL 0**

In [None]:
# Re-creates the same facebook/bart-base summarization pipeline as the Model 0 cell.
summarizer = pipeline("summarization", model="facebook/bart-base")

In [13]:
# Untrained Model 2: identical to the untrained Model 0 call except top_k = 50 instead of 10.
# NOTE(review): with do_sample = False the top_k change presumably does not alter generation,
# which would explain the tie noted in the heading — confirm.
bart_scores(summarizer, df, do_sample = False, num_beams = 1, top_k = 50, num_beam_groups = 1)

Your max_length is set to 256, but you input_length is only 172. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=86)
Your max_length is set to 256, but you input_length is only 123. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=61)
Your max_length is set to 256, but you input_length is only 221. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=110)
Your max_length is set to 256, but you input_length is only 255. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=127)
Your max_length is set to 256, but you input_length is only 56. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=28)
Your max_length is set to 256, but you input_length is only 151. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=75)
Your max_length is set to 256, but you input_length is only 220. You might 

Last Article She tweeted that the clothes would be "archived & expertly cared for in the spirit & love of Michael Jackson, his bravery, & fans worldwide". The auction included a jacket worn during Jackson's Bad tour, that went for $240,000 (£148,000) and two crystal gloves. The items were all made by designers Dennis Tompkins and Michael Bush. Lady Gaga also tweeted a picture of herself and her bidding paddle at the auction. More than $5m (£3.1m) was raised by the sale, according to LA-based Julien's Auctions. Other items that went under the hammer included jackets from Michael Jackson's Dangerous and Thriller tours and a pair of jeans that went for $50,000 (£31,000). Some of the money raised by the auction is being donated to a guide dogs charity and a hospice in Las Vegas. American costume designers Michael Bush and Dennis Tompkins created thousands of original pieces for Michael Jackson during his career. However, despite Lady Gaga's assurances, some fans expressed their anger onlin

## Model (Trained)

### Model 0

In [None]:
# Fine-tune facebook/bart-base with the Hugging Face run_summarization.py example script:
# 1 epoch, batch size 32 for train and eval, source/target length limits 128/32,
# training and evaluation both enabled, result pushed to the Hub.
!python3 transformers/examples/pytorch/summarization/run_summarization.py \
    --model_name_or_path=facebook/bart-base \
    --do_train \
    --do_eval \
    --train_file 'w266_project/xl_sum_sample_train.csv' \
    --validation_file 'w266_project/xl_sum_sample_val.csv' \
    --text_column text \
    --summary_column summary \
    --push_to_hub=True \
    --max_source_length 128 \
    --max_target_length 32 \
    --num_train_epochs 1 \
    --per_device_train_batch_size=32 \
    --per_device_eval_batch_size=32 \
    --output_dir='w266_project/finetuned-BART-all-categories/model 0/finetuned-BART-all-categories' \
    --overwrite_output_dir=True \
    --predict_with_generate 

In [93]:
# Load the fine-tuned summarizer, pinned to the 'model_0' revision of the repository.
summarizer = pipeline("summarization", model="arisanguyen/finetuned-BART-all-categories", revision = 'model_0')

In [94]:
# Per-example metric accumulators for the fine-tuned "model_0" summarizer.
bart_r1, bart_r2, bart_rL, bart_rLs, bart_chrf = [], [], [], [], []

n_examples = len(df['text'])
for i in range(n_examples):

    # Generate with the pipeline's own defaults; the only option set is truncation of
    # inputs that exceed the model's input limit.
    generated = summarizer(df['text'][i], truncation = True)
    candidate = [generated[0]['summary_text']]

    ref = [df['summary'][i]]

    # ROUGE variants for this single (candidate, reference) pair.
    results = rouge.compute(predictions=candidate, references=ref)
    for bucket, key in ((bart_r1, 'rouge1'),
                        (bart_r2, 'rouge2'),
                        (bart_rL, 'rougeL'),
                        (bart_rLs, 'rougeLsum')):
        bucket.append(results[key])

    # chrF for the same pair; `results` is rebound because the ROUGE values are already stored.
    results = chrf.compute(predictions=candidate, references=ref)
    bart_chrf.append(results['score'])

Your max_length is set to 128, but you input_length is only 123. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=61)
Your max_length is set to 128, but you input_length is only 56. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=28)
Your max_length is set to 128, but you input_length is only 80. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=40)


In [95]:
# Spot check: `i`, `ref` and `candidate` still hold the values from the final iteration
# of the scoring loop in the previous cell.
print('Last Article', df['text'][i])
print('Last Reference Summary', ref)
print('Last Candidate Summary', candidate)

Last Article She tweeted that the clothes would be "archived & expertly cared for in the spirit & love of Michael Jackson, his bravery, & fans worldwide". The auction included a jacket worn during Jackson's Bad tour, that went for $240,000 (£148,000) and two crystal gloves. The items were all made by designers Dennis Tompkins and Michael Bush. Lady Gaga also tweeted a picture of herself and her bidding paddle at the auction. More than $5m (£3.1m) was raised by the sale, according to LA-based Julien's Auctions. Other items that went under the hammer included jackets from Michael Jackson's Dangerous and Thriller tours and a pair of jeans that went for $50,000 (£31,000). Some of the money raised by the auction is being donated to a guide dogs charity and a hospice in Las Vegas. American costume designers Michael Bush and Dennis Tompkins created thousands of original pieces for Michael Jackson during his career. However, despite Lady Gaga's assurances, some fans expressed their anger onlin

In [96]:
# Report the mean of every per-example metric list collected by the scoring loop.
for label, values in (
    ('rouge1 average :', bart_r1),
    ('rouge2 average :', bart_r2),
    ('rougeL average :', bart_rL),
    ('rougeLs average :', bart_rLs),
    ('chrf average :', bart_chrf),
):
    print(label, np.mean(values))

rouge1 average : 0.3192639042917483
rouge2 average : 0.10466960084268205
rougeL average : 0.24501046685959502
rougeLs average : 0.24501046685959502
chrf average : 27.774860738085867


### Model 1 (number of epochs increased to 62 (~2k training steps in total) to match PEGASUS)

In [None]:
# Fine-tune facebook/bart-base for 62 epochs with source/target length limits 512/256 and
# train batch size 32; the result is pushed to the Hub.
# NOTE(review): neither --do_eval nor a validation file is passed here, so
# --predict_with_generate presumably has no effect on this run — confirm against run_summarization.py.
!python3 transformers/examples/pytorch/summarization/run_summarization.py \
    --model_name_or_path=facebook/bart-base \
    --do_train \
    --train_file 'w266_project/xl_sum_sample_train.csv' \
    --text_column text \
    --summary_column summary \
    --max_source_length 512 \
    --max_target_length 256 \
    --num_train_epochs 62 \
    --per_device_train_batch_size=32 \
    --push_to_hub=True \
    --output_dir='w266_project/finetuned-BART-all-categories/model 1/finetuned-BART-all-categories' \
    --overwrite_output_dir=True \
    --predict_with_generate 

In [97]:
# Load the fine-tuned summarizer, pinned to the "model_1" revision of the repository.
summarizer = pipeline("summarization", model="arisanguyen/finetuned-BART-all-categories", revision = "model_1")

In [103]:
# Reload the validation set that the fine-tuned "model_1" summarizer is scored on.
# NOTE(review): this path differs from the one used in the Data section — confirm both point
# at the same file.
df = pd.read_csv('./w266_project/Datasets/xl_sum_sample_val.csv')

# Per-example metric values, one entry per row of `df`.
bart_r1 = []
bart_r2 = []
bart_rL = []
bart_rLs = []
bart_chrf = []

# Column layout of the scores CSV. The dict holds references to the lists above, so a
# DataFrame built from it always reflects every score collected so far.
# NOTE(review): 'rogueL' / 'rogueLs' look like typos for 'rougeL' / 'rougeLs'; they are kept
# unchanged because they become the column names of the written CSV — confirm before renaming.
data = {'rouge1': bart_r1, 'rouge2': bart_r2, 'rogueL': bart_rL, 'rogueLs': bart_rLs, 'chrf': bart_chrf}

for i in range(len(df['text'])):

    candidate = summarizer(df['text'][i],
                           truncation = True, # inputs longer than the model's limit are cut (the limit is counted in tokens, not words)
                           max_length = 256, # same as pegasus
                           min_length = 0,
                           )[0]
    candidate = [candidate['summary_text']]

    ref = [df['summary'][i]]

    results = rouge.compute(predictions=candidate,
                            references=ref)

    bart_r1.append(results['rouge1'])
    bart_r2.append(results['rouge2'])
    bart_rL.append(results['rougeL'])
    bart_rLs.append(results['rougeLsum'])

    results = chrf.compute(predictions=candidate,
                           references=ref)

    bart_chrf.append(results['score'])

    # Checkpoint the partial scores every 100 rows (0, 100, 200, ...) so a long run can be
    # inspected or salvaged. A modulo test avoids building and scanning an array on every
    # iteration and keeps checkpointing beyond row 2000.
    if i % 100 == 0:
        scores = pd.DataFrame(data)
        scores.to_csv(r'BART_trained_1_scores.csv', index=False)
        print(i)

# Final write containing the complete score lists.
scores = pd.DataFrame(data)
scores.to_csv(r'BART_trained_1_scores.csv', index=False)

# Spot check on the last example processed by the loop.
print('Last Article', df['text'][i])
print('Last Reference Summary', ref)
print('Last Candidate Summary', candidate)

print('rouge1 average :', np.mean(bart_r1))
print('rouge2 average :', np.mean(bart_r2))
print('rougeL average :', np.mean(bart_rL))
print('rougeLs average :', np.mean(bart_rLs))
print('chrf average :', np.mean(bart_chrf))



0


Your max_length is set to 256, but you input_length is only 172. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=86)
Your max_length is set to 256, but you input_length is only 123. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=61)
Your max_length is set to 256, but you input_length is only 221. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=110)
Your max_length is set to 256, but you input_length is only 255. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=127)
Your max_length is set to 256, but you input_length is only 56. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=28)
Your max_length is set to 256, but you input_length is only 151. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=75)
Your max_length is set to 256, but you input_length is only 220. You might 

Last Article She tweeted that the clothes would be "archived & expertly cared for in the spirit & love of Michael Jackson, his bravery, & fans worldwide". The auction included a jacket worn during Jackson's Bad tour, that went for $240,000 (£148,000) and two crystal gloves. The items were all made by designers Dennis Tompkins and Michael Bush. Lady Gaga also tweeted a picture of herself and her bidding paddle at the auction. More than $5m (£3.1m) was raised by the sale, according to LA-based Julien's Auctions. Other items that went under the hammer included jackets from Michael Jackson's Dangerous and Thriller tours and a pair of jeans that went for $50,000 (£31,000). Some of the money raised by the auction is being donated to a guide dogs charity and a hospice in Las Vegas. American costume designers Michael Bush and Dennis Tompkins created thousands of original pieces for Michael Jackson during his career. However, despite Lady Gaga's assurances, some fans expressed their anger onlin

### Model 2 (added beams, changed epochs, added validation target length)

In [37]:
# Fine-tune facebook/bart-base for 5 epochs with source/target length limits 512/256 and
# train batch size 32; the result is pushed to the Hub.
# NOTE(review): --do_eval / --do_predict are not passed (the logged training arguments below
# show do_eval=False and do_predict=False), so --val_max_target_length, --num_beams and
# --predict_with_generate presumably do not influence this training run — confirm against
# run_summarization.py.
!python3 transformers/examples/pytorch/summarization/run_summarization.py \
    --model_name_or_path=facebook/bart-base \
    --do_train \
    --train_file 'w266_project/Datasets/xl_sum_sample_train.csv' \
    --text_column text \
    --summary_column summary \
    --max_source_length 512 \
    --max_target_length 256 \
    --val_max_target_length 256 \
    --num_beams 5 \
    --num_train_epochs 5 \
    --per_device_train_batch_size=32 \
    --push_to_hub=True \
    --output_dir='w266_project/finetuned-BART-all-categories/model 2/finetuned-BART-all-categories' \
    --overwrite_output_dir=True \
    --predict_with_generate 

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
04/07/2023 15:57:38 - INFO - __main__ - Training/evaluation parameters Seq2SeqTrainingArguments(
_n_gpu=0,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=False,
do_predict=False,
do_train=True,
eval_accumulation_steps=None,
eval_delay=0,
eval_steps=None,
evaluation_strategy=no,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_config={'fsdp_min_num_params': 0, 'xla'

In [40]:
# Load the fine-tuned summarizer, pinned to the "model_2" revision of the repository.
summarizer = pipeline("summarization", model="arisanguyen/finetuned-BART-all-categories", revision = "model_2")

In [56]:
# Score with bart_scores' own defaults: do_sample = False, num_beams = 1, top_k = 50, num_beam_groups = 1.
bart_scores(summarizer, df)

Your max_length is set to 256, but you input_length is only 172. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=86)
Your max_length is set to 256, but you input_length is only 123. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=61)
Your max_length is set to 256, but you input_length is only 221. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=110)
Your max_length is set to 256, but you input_length is only 255. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=127)
Your max_length is set to 256, but you input_length is only 56. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=28)
Your max_length is set to 256, but you input_length is only 151. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=75)
Your max_length is set to 256, but you input_length is only 220. You might 

Last Article She tweeted that the clothes would be "archived & expertly cared for in the spirit & love of Michael Jackson, his bravery, & fans worldwide". The auction included a jacket worn during Jackson's Bad tour, that went for $240,000 (£148,000) and two crystal gloves. The items were all made by designers Dennis Tompkins and Michael Bush. Lady Gaga also tweeted a picture of herself and her bidding paddle at the auction. More than $5m (£3.1m) was raised by the sale, according to LA-based Julien's Auctions. Other items that went under the hammer included jackets from Michael Jackson's Dangerous and Thriller tours and a pair of jeans that went for $50,000 (£31,000). Some of the money raised by the auction is being donated to a guide dogs charity and a hospice in Las Vegas. American costume designers Michael Bush and Dennis Tompkins created thousands of original pieces for Michael Jackson during his career. However, despite Lady Gaga's assurances, some fans expressed their anger onlin

### Model 3 (best training parameters above with new task-specific parameters) **BEST MODEL**

In [106]:
# "Model 3" reuses the "model_2" revision of the fine-tuned summarizer; only the generation
# parameters passed to bart_scores in the next cell differ.
summarizer = pipeline("summarization", model="arisanguyen/finetuned-BART-all-categories", revision = "model_2")

In [108]:
# Sampling enabled with four beams and top_k = 75; num_beam_groups keeps its default of 1.
# NOTE(review): no random seed is set in the visible cells, so these scores may differ between runs — confirm.
bart_scores(summarizer, df, do_sample = True, num_beams = 4, top_k = 75)

Your max_length is set to 256, but you input_length is only 172. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=86)
Your max_length is set to 256, but you input_length is only 123. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=61)
Your max_length is set to 256, but you input_length is only 221. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=110)
Your max_length is set to 256, but you input_length is only 255. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=127)
Your max_length is set to 256, but you input_length is only 56. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=28)
Your max_length is set to 256, but you input_length is only 151. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=75)
Your max_length is set to 256, but you input_length is only 220. You might 

Last Article She tweeted that the clothes would be "archived & expertly cared for in the spirit & love of Michael Jackson, his bravery, & fans worldwide". The auction included a jacket worn during Jackson's Bad tour, that went for $240,000 (£148,000) and two crystal gloves. The items were all made by designers Dennis Tompkins and Michael Bush. Lady Gaga also tweeted a picture of herself and her bidding paddle at the auction. More than $5m (£3.1m) was raised by the sale, according to LA-based Julien's Auctions. Other items that went under the hammer included jackets from Michael Jackson's Dangerous and Thriller tours and a pair of jeans that went for $50,000 (£31,000). Some of the money raised by the auction is being donated to a guide dogs charity and a hospice in Las Vegas. American costume designers Michael Bush and Dennis Tompkins created thousands of original pieces for Michael Jackson during his career. However, despite Lady Gaga's assurances, some fans expressed their anger onlin