**BART Fine-Tuned on XL-Sum**

**Packages**

In [48]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import math

from datasets import load_dataset
import evaluate

import inspect

#let's make longer output readable without horizontal scrolling
from pprint import pprint

import warnings

import regex as re

import os, re
import time

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# These auto classes load the right type of tokenizer and model based on a model name
from transformers import AutoTokenizer, TFAutoModel

**Necessary Functions**

In [15]:
# ROUGE metric loaded through `evaluate`; used later to score generated summaries.
rouge = evaluate.load("rouge")

In [16]:
# chrF metric loaded through `evaluate`.
chrf = evaluate.load('chrf')

In [17]:
def get_default_args(func):
    """Map each parameter of ``func`` that declares a default to that default.

    Parameters without a default value are left out of the result. Keys
    appear in the same order as in the function signature.
    """
    no_default = inspect.Parameter.empty
    defaults = {}
    for name, param in inspect.signature(func).parameters.items():
        if param.default is no_default:
            continue
        defaults[name] = param.default
    return defaults

In [18]:
def data_organize(sample_index, data=None):
    """Collect article texts and reference summaries for the sampled rows.

    Parameters
    ----------
    sample_index : mapping or DataFrame
        ``sample_index["index"]`` is iterated and each value is used as a
        row position into ``data``.
    data : indexable of row mappings, optional
        Each row must provide the keys ``'text'`` and ``'summary'``.
        When omitted, the notebook-level ``dataset["train"]`` is used, which
        is what the original implementation always read from.

    Returns
    -------
    tuple(list, list)
        ``(article, summary)``, both in the order the positions were given.
    """
    if data is None:
        data = dataset["train"]

    article = []
    summary = []

    for i in sample_index["index"]:
        # Look the row up once and reuse it for both fields instead of
        # indexing the dataset twice per sampled position.
        row = data[i]
        summary.append(row['summary'])
        article.append(row['text'])

    return article, summary

**Data**

In [19]:
# English configuration of the XL-Sum dataset from the Hugging Face Hub.
dataset = load_dataset(path="csebuetnlp/xlsum", name="english")

Found cached dataset xlsum (/home/ubuntu/.cache/huggingface/datasets/csebuetnlp___xlsum/english/2.0.0/518ab0af76048660bcc2240ca6e8692a977c80e384ffb18fdddebaca6daebdce)
100%|████████████████████████████████████████████| 3/3 [00:00<00:00, 330.22it/s]


**Sampling the Data**

**Train, Val, and Test sets for all XL Sum**

In [20]:
# One row per position in the train split, then a seeded draw of 2,000
# distinct positions (no replacement) so the sample is reproducible.
index = pd.DataFrame({"index": range(len(dataset['train']))})
sample_index = index.sample(n=2000, random_state=1004, replace=False)
sample_index.head(5)

Unnamed: 0,index
235420,235420
172024,172024
253546,253546
224954,224954
214134,214134


In [21]:
# Pull the text and reference summary for every sampled position.
article, summary = data_organize(sample_index=sample_index)

In [35]:
# Training split: the first 1,000 sampled articles with their summaries.
d = dict(text=article[:1000], summary=summary[:1000])
df = pd.DataFrame(d)
df.to_csv('w266_project/xl_sum_sample_train.csv', index=False)

In [36]:
# Validation split: sampled positions 1000-1099 (100 articles).
d = dict(text=article[1000:1100], summary=summary[1000:1100])
df = pd.DataFrame(d)
df.to_csv('w266_project/xl_sum_sample_val.csv', index=False)
df.head(5)

Unnamed: 0,text,summary
0,Anthony ZurcherNorth America reporter@awzurche...,On day three of public hearings in the impeach...
1,It made a net profit of $281m (£185m) in the t...,"Yum Brands, owner of KFC and Pizza Hut restaur..."
2,Police sources told local media that the boy h...,Four members of the same family have been arre...
3,Zelda Perkins told the Financial Times she sig...,A British former assistant of Harvey Weinstein...
4,Bus workers walked out on Monday over changes ...,Bus drivers in Jersey have agreed to meet with...


In [37]:
# Test split: sampled positions 1100-1199 (100 articles).
d = dict(text=article[1100:1200], summary=summary[1100:1200])
df = pd.DataFrame(d)
df.to_csv('w266_project/xl_sum_sample_test.csv', index=False)
df.head(5)

Unnamed: 0,text,summary
0,"In a statement, the White House said more than...",Hackers who breached US government networks st...
1,Fourteen fire engines were sent to tackle the ...,"A ""major"" fire on Bognor Regis seafront has be..."
2,The 240-turbine Atlantic Array would be double...,The European Commission (EC) has launched an i...
3,It is believed the patient contracted the infe...,A patient has been diagnosed with the rare vir...
4,By Chris JohnstonBusiness reporter The rise in...,UK inflation will quadruple to about 4% in the...


**Train, Val, and Test sets for each category**

In [25]:
def find_indices(list_to_check, item_to_find):
    """Return, in ascending order, every position whose element equals ``item_to_find``.

    An empty list is returned when nothing in ``list_to_check`` matches.
    """
    return [
        position
        for position, element in enumerate(list_to_check)
        if element == item_to_find
    ]

In [None]:
# Derive a coarse category label for every training article from its 'id':
# strip all digits, drop the final character, then keep what precedes the
# first '-' and the first '.'.
# The 'id' column is read directly so that the full row (text and summary
# included) is not loaded for each article just to inspect its id.
categories = []

for cat in dataset['train']['id']:
    # Raw string: '\d' inside a plain literal is an invalid escape sequence.
    result = re.sub(r'\d', '', cat)[:-1]
    result = result.split('-')[0].split('.')[0]
    categories.append(result)

**Category 1: uk**

In [None]:
# Build train / validation / test CSVs for the 'uk' category using the same
# seed, slice boundaries and output conventions as the all-category splits.
uk = find_indices(categories, 'uk')
index = pd.DataFrame({"index": uk})
# Raises ValueError if fewer than 2,000 'uk' articles exist (replace=False).
sample_index = index.sample(n=2000, replace=False, random_state=1004)

article, summary = data_organize(sample_index)

# The files are written with a '.csv' suffix, without the pandas index column,
# and into w266_project/ like the all-category files: the training command
# passes CSV paths from that directory, and an index column would add a stray
# unnamed first column to the file.
d = {'text': article[:1000],  'summary': summary[:1000]}
df = pd.DataFrame(data=d)
df.to_csv('w266_project/xl_sum_sample_train_uk.csv', index=False)

d = {'text': article[1000:1100],  'summary': summary[1000:1100]}
df = pd.DataFrame(data=d)
df.to_csv('w266_project/xl_sum_sample_val_uk.csv', index=False)

d = {'text': article[1100:1200],  'summary': summary[1100:1200]}
df = pd.DataFrame(data=d)
df.to_csv('w266_project/xl_sum_sample_test_uk.csv', index=False)
df.head(5)

**Training the Model**

Fine-tuning uses the Hugging Face `run_summarization.py` example script: https://github.com/huggingface/transformers/blob/main/examples/pytorch/summarization/run_summarization.py

**All Categories**

All Categories: Model 0

In [None]:
    # Scratch fragment of a run_summarization.py command-line option, kept as a
    # comment so this cell is valid Python when the notebook is run top to bottom.
    # --push_to_hub_model_id finetuned-BART-all-categories \

In [73]:
# Fine-tune facebook/bart-base with the Hugging Face run_summarization.py
# example script, run as a shell command from the notebook:
#   - trains on the sampled all-category train CSV and evaluates on the
#     sampled validation CSV written earlier in this notebook;
#   - writes to w266_project/finetuned-BART-all-categories, overwriting any
#     existing contents (--overwrite_output_dir);
#   - uses a per-device batch size of 32 for both training and evaluation;
#   - requests generation-based evaluation (--predict_with_generate) and a
#     push of the result to the Hub (--push_to_hub).
! python3 transformers/examples/pytorch/summarization/run_summarization.py \
    --model_name_or_path=facebook/bart-base \
    --do_train \
    --do_eval \
    --train_file='w266_project/xl_sum_sample_train.csv' \
    --validation_file='w266_project/xl_sum_sample_val.csv' \
    --push_to_hub \
    --output_dir='w266_project/finetuned-BART-all-categories' \
    --overwrite_output_dir \
    --per_device_train_batch_size=32 \
    --per_device_eval_batch_size=32 \
    --predict_with_generate 

04/02/2023 09:25:10 - INFO - __main__ - Training/evaluation parameters Seq2SeqTrainingArguments(
_n_gpu=0,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=False,
do_train=True,
eval_accumulation_steps=None,
eval_delay=0,
eval_steps=None,
evaluation_strategy=no,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False},
fsdp_min_num_params=0,
fsdp_transformer_layer_cls_to_wrap=None,
full_determinism=False,
generation_config=None,
generation_max_length=None,
generation_num_beams=None,
gradient_accumulation_steps=1,
gradient_checkpointing=False,
greater_is_better

In [None]:
    # Scratch fragment of a run_summarization.py command-line option (names the
    # Hub repository), kept as a comment so this cell is valid Python.
    # --hub_model_id='arisanguyen/finetuned-BART-all-categories' \

In [74]:
! repo.git_pull()

/bin/bash: -c: line 1: syntax error: unexpected end of file


In [72]:
from transformers import pipeline
from transformers import AutoModel

# Summarisation pipeline around the fine-tuned checkpoint. The loop below
# calls `summarizer`, which was never created: the previous code only loaded a
# bare AutoModel and did not build a pipeline from it.
# NOTE(review): loading needs config.json and weights in the Hub repository,
# i.e. the training run's push must have completed.
summarizer = pipeline(
    "summarization",
    model="arisanguyen/finetuned-BART-all-categories",
)

# Score against the all-category test split written earlier in this notebook
# (columns 'text' and 'summary') rather than whichever `df` was assigned last.
test_df = pd.read_csv('w266_project/xl_sum_sample_test.csv')

bart_r1 = []
bart_r2 = []
bart_rL = []
bart_rLs = []
bart_chrf = []

# Column name -> score list. The lists are shared with the names above, so a
# DataFrame built from `data` at any point holds every score appended so far.
# The 'rogueL' / 'rogueLs' spellings are kept as-is because they are the
# column names of the CSV that is written out.
data = {'rouge1': bart_r1, 'rouge2': bart_r2, 'rogueL': bart_rL, 'rogueLs': bart_rLs, 'chrf': bart_chrf}

# NOTE(review): only the first half of the rows is scored, as in the original
# loop bound - confirm that this is intended.
n_eval = len(test_df) // 2

for i in range(n_eval):

    # truncation=True lets the pipeline cut inputs that exceed the model's
    # maximum input length instead of failing on long articles.
    candidate = summarizer(test_df['text'][i], truncation=True)[0]
    candidate = [candidate['summary_text']]

    ref = [test_df['summary'][i]]

    results = rouge.compute(predictions=candidate,
                            references=ref)

    bart_r1.append(results['rouge1'])
    bart_r2.append(results['rouge2'])
    bart_rL.append(results['rougeL'])
    bart_rLs.append(results['rougeLsum'])

    # chrF must come from the chrf metric (the ROUGE result has no 'score'
    # key) and expects one list of references per prediction.
    results = chrf.compute(predictions=candidate,
                           references=[ref])

    bart_chrf.append(results['score'])

    # Checkpoint the scores every 100 articles so a crash loses little work.
    if i % 100 == 0:
        scores = pd.DataFrame(data)
        scores.to_csv(r'BART_trained_0_scores.csv', index=False)
        print(i)

scores = pd.DataFrame(data)
scores.to_csv(r'BART_trained_0_scores.csv', index=False)

# The loop variables only exist if at least one article was scored.
if n_eval:
    print('Last Article', test_df['text'][i])
    print('Last Reference Summary', ref)
    print('Last Candidate Summary', candidate)

OSError: arisanguyen/finetuned-BART-all-categories does not appear to have a file named config.json. Checkout 'https://huggingface.co/arisanguyen/finetuned-BART-all-categories/main' for available files.