### Pegasus-Finetuned and Automatic Evaluation
#### Fine-tune pretrained Pegasus using labeled process data
- Experiment:
    - Summarization as a downstream task to generate process labels 
    - Model is fine-tuned with labeled process data
    - Generated labels are automatically evaluated with the BERTScore metric
- Process data:
    - Document (process text) and summary (process label)
- Outline:
    - Track the experiment and its results with WandB (Weights & Biases)
    - Define the experiment, data loading, training and validation 
    - Validation using BERTScore after each training epoch to decide the final output model
    - Automatic evaluation using BERTScore


#### Reference
- Pegasus Hugging Face: 
https://huggingface.co/docs/transformers/model_doc/pegasus
- Hugging Face Transformers fine-tuning tutorial:
https://huggingface.co/docs/transformers/training
- BERTScore GitHub: 
https://github.com/Tiiiger/bert_score
- WandB pipeline:
https://colab.research.google.com/github/wandb/examples/blob/master/colabs/pytorch/Simple_PyTorch_Integration.ipynb#scrollTo=FH61NWlVR_SL


#### Environment Setup 
- Amazon SageMaker Studio
- Kernel - Python 3 (Data Science)

In [None]:
# %%capture
# !pip3 install torch==1.10.1+cu113 torchvision==0.11.2+cu113 torchaudio==0.10.1+cu113 -f https://download.pytorch.org/whl/cu113/torch_stable.html
# !pip3 install transformers
# !pip3 install sentencepiece
# !pip3 install wandb --upgrade
# !pip3 install bert-score

#### Import Libraries

In [1]:
import os
import random
import json
import numpy as np
import pandas as pd
import torch
import sys

import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm
from transformers import PegasusConfig, PegasusModel, PegasusForConditionalGeneration, PegasusTokenizerFast, get_scheduler
from transformers.optimization import Adafactor

from sklearn.model_selection import train_test_split
from bert_score import BERTScorer

import wandb

#### WandB Setup

In [2]:
# Ensure deterministic behavior.
# The seed is a fixed integer literal on purpose: the built-in hash() of a str
# is salted per interpreter process (see PYTHONHASHSEED), so seeds derived from
# hash("...") change on every kernel restart and make runs non-repeatable.
SEED = 42
torch.backends.cudnn.deterministic = True
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

In [3]:
# Device configuration: first CUDA GPU when one is available, otherwise the CPU
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

In [4]:
# Log in to Weights & Biases (wandb.init is called later in this notebook)
wandb.login()

ERROR:wandb.jupyter:Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33myentingwang[0m ([33myenting-thesis[0m). Use [1m`wandb login --relogin`[0m to force relogin


True

#### Define the Experiment and Pipeline 


In [5]:
# Configuration of the fine-tuning experiment (handed to the training pipeline)
config = {
    "epochs": 15,
    "batch_size": 20,  # or 22
    "optimizer": "adafactor",
    # loss calculated given ground truth summaries (process names)
    "loss_function": "summarization-loss",
    "dataset": "bpmai-29-10-2019",
    "architecture": "seq2seq-pegasus",
    # True if continue training from checkpoint of previous iteration
    "retrain": False,
    # specify path of input model if continue training or left blank
    "input_model": "",
    # path template for the saved model; "{}" is filled with the epoch number
    "output_model": "./model_summarization/summarization_{}_epoch.pth",
}

##### Track metadata and hyperparameters with wandb.init

In [6]:
# Training pipeline entry point
def model_pipeline(hyperparameters):
    """Run one W&B-tracked fine-tuning experiment and return the trained model.

    Opens a W&B run with ``hyperparameters`` as its config, builds the model,
    training loader, tokenizer and optimizer via ``make`` and then hands them
    to ``train_and_val``. The validation split is read from the module-level
    ``doc_val`` / ``sum_val`` variables.
    """
    with wandb.init(project="wandb-project-name", entity="yenting-thesis", config=hyperparameters):
        # read the settings back through wandb so the run records what was used
        run_config = wandb.config
        components = make(run_config)
        model, train_loader, tokenizer, optimizer = components
        # train, validating after every epoch
        train_and_val(model, train_loader, doc_val, sum_val, tokenizer, optimizer, run_config)

    return model

##### Set model, data loaders and optimizer with defined configuration

In [7]:
def make(config):
    """Build the model, training data loader, tokenizer and optimizer for a run.

    Args:
        config: run configuration; ``retrain``, ``input_model`` and
            ``batch_size`` are read from it.

    Returns:
        Tuple ``(model, train_loader, tokenizer, optimizer)``. The training
        data comes from the module-level ``doc_train`` / ``sum_train``.
    """
    # set pretrained tokenizer, model and optimizer
    model_name = 'google/pegasus-large' # 'google/pegasus-xsum'
    tokenizer = PegasusTokenizerFast.from_pretrained(model_name)
    model = PegasusForConditionalGeneration.from_pretrained(model_name, return_dict=True)
    optimizer = Adafactor(model.parameters(), scale_parameter=True, relative_step=True, warmup_init=True, lr=None)

    # if continue training from checkpoint of previous iteration
    if config.retrain:
        # Restore weights and optimizer state into the objects that are
        # returned. The model must NOT be re-created after this call: doing so
        # would discard the checkpoint and leave the optimizer updating the
        # parameters of a model that is no longer used.
        load(model, optimizer, config.input_model)
        if torch.cuda.device_count() > 1:
            # NOTE(review): the training/validation code calls loss.item() and
            # model.generate(), which presumably do not work on an
            # nn.DataParallel wrapper -- confirm before relying on multi-GPU.
            print("Let's use", torch.cuda.device_count(), "GPUs!")
            model = nn.DataParallel(model)
    model.to(device)

    # set data loaders
    train_loader = make_loader(doc_train, sum_train, tokenizer, shuffle=True, batch_size=config.batch_size)
    # sanity check: print the tensor shapes of the first batch (if there is one)
    first_batch = next(iter(train_loader), None)
    if first_batch is not None:
        print({k: v.shape for k, v in first_batch.items()})

    return model, train_loader, tokenizer, optimizer

#### Define Data Loading
#### Load data


In [8]:
# Read the labeled dataset (documents and their summaries) from disk
with open('./data/train_test_labeled_dataset.json', 'r') as f:
    process = json.load(f)

# Hold out 17.5% of the training portion for validation (fixed random_state)
doc_train, doc_val, sum_train, sum_val = train_test_split(
    process['document_train'],
    process['summary_train'],
    test_size=0.175,
    random_state=41,
)
# The test portion is used as provided
doc_test, sum_test = process['document_test'], process['summary_test']

##### Load augmented train, val and test dataset

In [9]:
# with open('./data/aug_train_val_test_labeled_dataset.json', 'r') as f:
#     process = json.load(f)
# doc_train = process['document_train']
# sum_train = process['summary_train']
# doc_val = process['document_val']
# sum_val = process['summary_val']
# doc_test = process['document_test']
# sum_test = process['summary_test']

##### Define Process Dataset

In [10]:
class ProcessDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenized process texts with tokenized labels.

    Args:
        encodings: mapping of field name -> per-example sequences for the
            source documents; every field is returned as a tensor.
        labels: mapping for the target summaries; must contain
            ``'input_ids'`` and may contain ``'attention_mask'``.
    """

    # Label value the loss skips (default ``ignore_index`` of cross-entropy).
    IGNORE_INDEX = -100

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return one example as a dict of tensors including a ``'labels'`` entry."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        label_ids = torch.tensor(self.labels['input_ids'][idx])
        # Padded label positions must not count as prediction targets,
        # otherwise the loss is dominated by padding. Positions whose
        # attention mask is 0 are therefore replaced with IGNORE_INDEX.
        label_mask = self.labels.get('attention_mask')
        if label_mask is not None:
            is_padding = torch.tensor(label_mask[idx]) == 0
            label_ids = label_ids.masked_fill(is_padding, self.IGNORE_INDEX)
        item['labels'] = label_ids
        return item # input_ids, attention_mask, labels

    def __len__(self):
        return len(self.labels['input_ids'])

##### Define Process Data Loader

In [11]:
def make_loader(texts, labels, tokenizer, shuffle, batch_size):
    """Tokenize ``texts``/``labels`` and wrap the resulting dataset in a DataLoader."""
    return DataLoader(
        process_data(texts, labels, tokenizer),
        batch_size=batch_size,
        shuffle=shuffle,
    )

In [12]:
# Tokenization helper used by make_loader
def process_data(texts, labels, tokenizer):
    """Tokenize documents and their labels and return them as a ProcessDataset.

    Both inputs are tokenized with truncation and padding enabled.
    """
    tokenizer_kwargs = dict(truncation=True, padding=True)
    source_encodings = tokenizer(texts, **tokenizer_kwargs)
    target_encodings = tokenizer(labels, **tokenizer_kwargs)
    return ProcessDataset(source_encodings, target_encodings)

#### Define Training Logic
##### Track gradients and weights with wandb.watch and everything else, i.e. loss, with wandb.log

In [13]:
def train_and_val(model, train_loader, doc_val, sum_val, tokenizer, optimizer, config):
    """Fine-tune ``model`` for ``config.epochs`` epochs, validating after each one.

    The mean training loss of every 5 consecutive batches is logged through
    ``train_log``. After each epoch the model is scored on the validation
    data with ``val``, the scores are logged with ``val_log`` and a checkpoint
    is written to ``config.output_model`` formatted with the 1-based epoch.
    """
    # let wandb record gradients and weights of the model
    wandb.watch(model, log="all", log_freq=10)

    num_training_steps = len(train_loader) * config.epochs
    print('num_training_steps', num_training_steps)
    progress_bar = tqdm(range(num_training_steps))

    log_every = 5          # number of batches averaged per logged loss value
    first_saved_epoch = 0  # checkpoints are written from this epoch index on
    steps_done = 0
    loss_accum = 0.
    for epoch in range(config.epochs):
        model.train()
        for idx, process_batch in enumerate(train_loader):
            batch_loss = train_batch(idx, process_batch, model, optimizer, progress_bar)
            steps_done += 1
            loss_accum += batch_loss.item()
            if steps_done % log_every:
                continue
            # report the average loss of the last `log_every` batches
            train_log(loss_accum / log_every, steps_done, epoch)
            loss_accum = 0.

        # score the model on the validation data once the epoch is finished
        model.eval()
        P, R, F1 = val(model, tokenizer, doc_val, sum_val)
        val_log(P, R, F1)

        # write a checkpoint for this epoch
        if epoch < first_saved_epoch:
            continue
        save(model, optimizer, config.output_model.format(epoch + 1))

##### Define functions needed in the training loop

In [14]:
def train_batch(idx, batch, model, optimizer, progress_bar):
    """Run one optimization step on ``batch`` and return the loss tensor.

    ``idx`` is accepted for the caller's convenience and is not used here.
    """
    # move every tensor of the batch to the configured device
    inputs = {}
    for name, tensor in batch.items():
        inputs[name] = tensor.to(device)

    # forward pass; the model output carries the loss
    loss = model(**inputs).loss

    # clear old gradients, backpropagate, then update the parameters
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    progress_bar.update(1)
    return loss

In [15]:
def val(model, tokenizer, doc_val, sum_val, batch_size=7):
    """Generate summaries for the validation documents and score them with BERTScore.

    Args:
        model: model exposing ``generate``.
        tokenizer: tokenizer used to encode the documents and decode the output.
        doc_val: sequence of validation documents.
        sum_val: sequence of reference summaries, aligned with ``doc_val``.
        batch_size: number of documents generated per step. Any input length
            is accepted; the last batch may be smaller.

    Returns:
        Tuple ``(P, R, F1)``: each the mean of the per-example BERTScore
        values over all examples.

    Raises:
        ValueError: if ``batch_size`` is not positive, the inputs are empty,
            or ``doc_val`` and ``sum_val`` differ in length.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")
    if len(doc_val) == 0 or len(doc_val) != len(sum_val):
        raise ValueError("doc_val and sum_val must be non-empty and of equal length")

    scorer = BERTScorer(lang="en", rescale_with_baseline=True)
    precisions, recalls, f1_scores = [], [], []
    with torch.no_grad():
        for start in range(0, len(doc_val), batch_size):
            documents = list(doc_val[start:start + batch_size])
            references = list(sum_val[start:start + batch_size])
            doc_tokenized = tokenizer(documents, truncation=True, padding='longest', return_tensors="pt").to(device)
            translated = model.generate(**doc_tokenized)
            generated = tokenizer.batch_decode(translated, skip_special_tokens=True)
            P_temp, R_temp, F1_temp = scorer.score(generated, references)
            precisions.append(P_temp)
            recalls.append(R_temp)
            f1_scores.append(F1_temp)
    # average over all examples, not over batches
    P = torch.cat(precisions).mean()
    R = torch.cat(recalls).mean()
    F1 = torch.cat(f1_scores).mean()
    return P, R, F1

In [16]:
def train_log(loss, batch_num, epoch):
    """Log the training loss to wandb at step ``batch_num`` and print it."""
    metrics = {"epoch": epoch, "loss": loss}
    wandb.log(metrics, step=batch_num)
    step_label = str(batch_num).zfill(5)
    print(f"Loss after {step_label} steps: {loss:.3f}")
    
def val_log(P, R, F1):
    """Log the validation BERTScore precision, recall and F1 to wandb and print them."""
    scores = {"bert_score_P": P, "bert_score_R": R, "bert_score_F1": F1}
    wandb.log(scores)
    print(f"bert_score_P: {P:.3f}, bert_score_R, {R:.3f}, bert_score_F1, {F1:.3f}")

In [17]:
def save(model, optimizer, output_model):
    """Write the model and optimizer state dicts to ``output_model``.

    The parent directory of ``output_model`` is created when it does not
    exist, so a missing output folder does not abort a run after training.

    Args:
        model: object exposing ``state_dict()``.
        optimizer: object exposing ``state_dict()``.
        output_model: path of the checkpoint file to write.
    """
    parent_dir = os.path.dirname(output_model)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    torch.save({
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict()
    }, output_model)

def load(model, optimizer, output_model):
    """Restore model and optimizer state in place from a checkpoint file.

    The checkpoint must contain ``'model_state_dict'`` and
    ``'optimizer_state_dict'``. Its tensors are mapped onto the device of the
    model's parameters, so a checkpoint written on a GPU can also be restored
    on a machine without one.

    Args:
        model: module whose ``load_state_dict`` receives the model state.
        optimizer: optimizer whose ``load_state_dict`` receives its state.
        output_model: path of the checkpoint file to read.
    """
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else None
    # NOTE: torch.load unpickles the file; only load checkpoints you trust.
    checkpoint = torch.load(output_model, map_location=target_device)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

#### Build, train and analyze the model with the pipeline

In [None]:
# Build, train and analyze the model with the pipeline.
# `config` is the experiment configuration dict; the model returned by the
# pipeline is kept in `model`.
model = model_pipeline(config)


#### Automatic Evaluation on Test Data

##### Define functions needed in the evaluation

In [19]:
def evaluate(model, tokenizer, doc_test, sum_test, batch_size=9):
    """Generate summaries for the test documents and score them with BERTScore.

    Args:
        model: model exposing ``generate``.
        tokenizer: tokenizer used to encode the documents and decode the output.
        doc_test: sequence of test documents.
        sum_test: sequence of reference summaries, aligned with ``doc_test``.
        batch_size: number of documents generated per step. Any input length
            is accepted; the last batch may be smaller.

    Returns:
        Tuple ``(P, R, F1, generated_summary)``: the mean per-example
        BERTScore precision, recall and F1, and the list of generated
        summaries in input order.

    Raises:
        ValueError: if ``batch_size`` is not positive, the inputs are empty,
            or ``doc_test`` and ``sum_test`` differ in length.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")
    if len(doc_test) == 0 or len(doc_test) != len(sum_test):
        raise ValueError("doc_test and sum_test must be non-empty and of equal length")

    scorer = BERTScorer(lang="en", rescale_with_baseline=True)
    generated_summary = []
    precisions, recalls, f1_scores = [], [], []
    with torch.no_grad():
        for start in range(0, len(doc_test), batch_size):
            documents = list(doc_test[start:start + batch_size])
            references = list(sum_test[start:start + batch_size])
            doc_tokenized = tokenizer(documents, truncation=True, padding='longest', return_tensors="pt").to(device)
            translated = model.generate(**doc_tokenized)
            generated = tokenizer.batch_decode(translated, skip_special_tokens=True)
            P_temp, R_temp, F1_temp = scorer.score(generated, references)
            precisions.append(P_temp)
            recalls.append(R_temp)
            f1_scores.append(F1_temp)
            generated_summary.extend(generated)
    # average over all examples, not over batches
    P = torch.cat(precisions).mean()
    R = torch.cat(recalls).mean()
    F1 = torch.cat(f1_scores).mean()

    return P, R, F1, generated_summary

In [20]:
def eval_log(P, R, F1):
    """Log the test-set BERTScore precision, recall and F1 to wandb and print them."""
    scores = {"bert_score_P_eval": P, "bert_score_R_eval": R, "bert_score_F1_eval": F1}
    wandb.log(scores)
    print(f"bert_score_P: {P:.3f}, bert_score_R, {R:.3f}, bert_score_F1, {F1:.3f}")
    

##### Track metadata with wandb.init and run evaluation


In [21]:
# Configuration of the evaluation run
config = {
    # specify path of the trained input model, i.e. "./model_summarization/summarization_7_epoch.pth"
    "input_model": "",
    # name the output file of the produced labels, i.e. "generated_labels.txt"
    "file_name": "",
}

In [None]:
with wandb.init(project="wandb-project-name", entity="yenting-thesis"):
    model_name = 'google/pegasus-large'
    tokenizer = PegasusTokenizerFast.from_pretrained(model_name)
    model = PegasusForConditionalGeneration.from_pretrained(model_name, return_dict=True)
    model = model.to(device)
    # load() also restores optimizer state, so an optimizer has to exist
    optimizer = Adafactor(model.parameters(), scale_parameter=True, relative_step=True, warmup_init=True, lr=None)
    wandb.watch(model, log="all", log_freq=10)

    # restore the trained weights from the configured checkpoint
    input_model = config['input_model']
    load(model, optimizer, input_model)

    # generate labels for the test documents and score them
    model.eval()
    P, R, F1, generated_summary = evaluate(model, tokenizer, doc_test, sum_test)

    # write the generated labels to the output file, one per line
    with open(config['file_name'], 'w') as f:
        for line in generated_summary:
            f.write(line + '\n')
    eval_log(P, R, F1)

#### Inspect the Generated Labels

In [None]:
# Print every reference label next to the label generated for it
for ref, hypo in zip(sum_test, generated_summary):
    print(f"ref: {ref}\nhypo: {hypo}\n")
    