In [None]:
# !pip3 install virtualenv
# !virtualenv my_env
# !python -m ipykernel install --user --name=my_env

In [1]:
%%capture
!pip3 install torch==1.10.1+cu113 torchvision==0.11.2+cu113 torchaudio==0.10.1+cu113 -f https://download.pytorch.org/whl/cu113/torch_stable.html
!pip3 install transformers
!pip3 install sentencepiece
!pip3 install wandb --upgrade

In [2]:
import os
import random
import json
import numpy as np
import pandas as pd
import torch
import sys

import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm
from transformers import PegasusConfig, PegasusModel, PegasusForConditionalGeneration, PegasusTokenizerFast, get_scheduler
from transformers.optimization import Adafactor

In [3]:
# Ensure deterministic behavior
torch.backends.cudnn.deterministic = True
random.seed(hash("setting random seeds") % 2**32 - 1)
np.random.seed(hash("improves reproducibility") % 2**32 - 1)
torch.manual_seed(hash("by removing stochasticity") % 2**32 - 1)
torch.cuda.manual_seed_all(hash("so runs are repeatable") % 2**32 - 1)

# Device configuration
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [4]:
import wandb

wandb.login()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
wandb: Currently logged in as: yenting-thesis (use `wandb login --relogin` to force relogin)


True

#### Define the Experiment and Pipeline
##### Track metadata and hyperparameters with wandb.init

In [5]:
config = dict(
    epochs = 20,
    batch_size = 2,
    optimizer = "Adafactor",
    learning_rate = 5e-5,
    weight_decay = 0.01,
    es_patience = 5,
    loss_function = "triplet-margin-loss",
    dataset = "BPMAI-29-10-2019",
    architecture = "encoder-seq2seq-Pegasus"
)

In [6]:
def model_pipeline(hyperparameters):

    # tell wandb to get started
    with wandb.init(project="thesis-TML-maskedSent", entity="yenting-thesis", config=hyperparameters):
        # access all HPs through wandb.config, so logging matches execution!
        config = wandb.config

        # make the model, data, and optimization problem
        model, train_loader, test_loader, optimizer = make(config)
#         print(model)

        # make the early stopping
        es = EarlyStopping(patience = config.es_patience)
        
        # and use them to train the model
        train_and_val(model, train_loader, test_loader, optimizer, es, config)

        # and test its final performance
#         test(model, test_loader)

    return model

In [7]:
def make(config):
    # Make the pretrained tokenizer and model
    model_name = 'google/pegasus-large' # 'google/pegasus-xsum'
    tokenizer = PegasusTokenizerFast.from_pretrained(model_name)
    model = PegasusForConditionalGeneration.from_pretrained(model_name, return_dict=True)
#     if torch.cuda.device_count() > 1:
#         print("Let's use", torch.cuda.device_count(), "GPUs!")
#         model = nn.DataParallel(model)
    model.to(device)
    
    # Make the data
    train_loader = make_loader(train_data, tokenizer, shuffle=True, batch_size=config.batch_size)
    test_loader = make_loader(test_data, tokenizer, shuffle=True, batch_size=config.batch_size)
    for batch in train_loader:
        break
    print({k: v.shape for k, v in batch.items()})
#     for anchor, positive, negative in train_loader:
#         break
#     print({k: v.shape for k, v in anchor.items()})

    # Make the model
    kepler_pegasus_model = KeplerPegasusModel(model)
    
    # Make optimizer
#     optimizer = torch.optim.AdamW(model.parameters(), weight_decay=config.weight_decay, lr=config.learning_rate)
    # replace AdamW with Adafactor
    optimizer = Adafactor(model.parameters(), scale_parameter=True, relative_step=True, warmup_init=True, lr=None)
#     optimizer = Adafactor(model.parameters(), lr=config.learning_rate, weight_decay=config.weight_decay, relative_step=False)
    
    return kepler_pegasus_model, train_loader, test_loader, optimizer

#### Define the Data Loading and Model

In [8]:
with open('./masked_sent_train.json', 'r') as f:
    train_data = json.load(f)
with open('./masked_sent_val.json', 'r') as f:
    test_data = json.load(f)
    
# # slice data - can't random pick dataset
# train_graph_data = train_data['easy_negatives'] + train_data['negatives'] + train_data['one_step_away_negs'] + train_data['hard_negatives']
# test_graph_data = test_data['easy_negatives'] + test_data['negatives'] + test_data['one_step_away_negs'] + test_data['hard_negatives']

In [9]:
class SubprocessDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels['input_ids'][idx])
        return item # input_ids, attention_mask, labels
    def __len__(self):
        return len(self.labels['input_ids'])

In [10]:
class GraphDataset(torch.utils.data.Dataset):
    def __init__(self, triplets, tokenizer):
        self.triplets = triplets
        self.tokenizer = tokenizer
    def __getitem__(self, idx):
        triplet = self.triplets[idx]
        triplet_encodings = self.tokenizer(triplet, truncation=True, padding='max_length', max_length=50)
        anchor = {key: torch.tensor(val[0]) for key, val in triplet_encodings.items()}
        positive = {key: torch.tensor(val[1]) for key, val in triplet_encodings.items()}
        negative = {key: torch.tensor(val[2]) for key, val in triplet_encodings.items()}
        anchor['labels'] = torch.tensor(triplet_encodings['input_ids'][0])
        positive['labels'] = torch.tensor(triplet_encodings['input_ids'][1])
        negative['labels'] = torch.tensor(triplet_encodings['input_ids'][2])
        return anchor, positive, negative # input_ids, attention_mask, labels
    def __len__(self):
        return len(self.triplets)

In [11]:
def tokenize_data(texts, labels, tokenizer):
    encodings = tokenizer(texts, truncation=True, padding=True)
    decodings = tokenizer(labels, truncation=True, padding=True)
    return encodings, decodings

In [12]:
def make_loader(dataset, tokenizer, shuffle, batch_size):
    texts = dataset['document']
    labels = dataset['summary']
    encodings, decodings = tokenize_data(texts, labels, tokenizer)
    subprocess_dataset = SubprocessDataset(encodings, decodings)
    subprocess_dataloader = DataLoader(
        dataset=subprocess_dataset, shuffle=shuffle, batch_size=batch_size
    )
#     graph_dataset = GraphDataset(dataset, tokenizer)
#     graphdata_dataloader = DataLoader(
#         dataset=graph_dataset, shuffle=shuffle, batch_size=batch_size
#     )    

    return subprocess_dataloader

In [13]:
def get_eos_idx(batch):
    for input_ids in batch['input_ids']:
        eos_id = input_ids == 1
        idx = eos_id.nonzero()[0]
        if 'eos_idx' in locals():
            eos_idx = torch.cat((eos_idx, idx), 0)
        else:
            eos_idx = eos_id.nonzero()[0]
    return eos_idx

In [14]:
class KeplerPegasusModel(nn.TripletMarginLoss):
    
    def __init__(self, model, margin: float = 1.0, p: float = 2., eps: float = 1e-6, 
                 swap: bool = False, size_average=None, reduce=None, reduction: str = 'mean'):
        super().__init__(margin, p, eps, swap, size_average, reduce, reduction)
        self.model = model
        
        # forward(self, graphdata_batch, subprocess_batch)
    def forward(self, subprocess_batch):
        MLM_output = self.model(**subprocess_batch)
        maskedSent_loss = MLM_output.loss
        
#         anchors, positives, negatives = graphdata_batch[0], graphdata_batch[1], graphdata_batch[2]
#         anchor_eos, positive_eos, negative_eos = get_eos_idx(anchors), get_eos_idx(positives), get_eos_idx(negatives)
#         # Triplet margin loss
#         model_output_a = self.model(**anchors)
#         model_output_p = self.model(**positives)
#         model_output_n = self.model(**negatives)
        
#         encoder_output_a = model_output_a.encoder_last_hidden_state
#         encoder_output_p = model_output_p.encoder_last_hidden_state
#         encoder_output_n = model_output_n.encoder_last_hidden_state
        
#         a_eos = torch.vstack([encoder_output_a[i][anchor_eos[i]] for i in range(encoder_output_a.size(0))])
#         p_eos = torch.vstack([encoder_output_p[i][positive_eos[i]] for i in range(encoder_output_p.size(0))])
#         n_eos = torch.vstack([encoder_output_n[i][negative_eos[i]] for i in range(encoder_output_n.size(0))])     
        
#         # compute the loss
#         triplet_margin_loss = F.triplet_margin_loss(a_eos, p_eos, n_eos, 
#                                                     margin=self.margin, p=self.p,
#                                                     eps=self.eps, swap=self.swap, 
#                                                     reduction=self.reduction)        
#         loss = MLM_output.loss + triplet_margin_loss         

        return maskedSent_loss
    

#### Early Stopping
##### Track validation loss to prevent overfitting

In [15]:
class EarlyStopping(object):
    def __init__(self, mode='min', min_delta=0, patience=10, percentage=False):
        self.mode = mode
        self.min_delta = min_delta
        self.patience = patience
        self.best = None
        self.num_bad_epochs = 0
        self.is_better = None
        self._init_is_better(mode, min_delta, percentage)

        if patience == 0:
            self.is_better = lambda a, b: True
            self.step = lambda a: False

    def step(self, metrics):
        if self.best is None:
            self.best = metrics
            return False

        if torch.isnan(metrics):
            return True

        if self.is_better(metrics, self.best):
            self.num_bad_epochs = 0
            self.best = metrics
        else:
            self.num_bad_epochs += 1

        if self.num_bad_epochs >= self.patience:
            return True

        return False

    def _init_is_better(self, mode, min_delta, percentage):
        if mode not in {'min', 'max'}:
            raise ValueError('mode ' + mode + ' is unknown!')
        if not percentage:
            if mode == 'min':
                self.is_better = lambda a, best: a < best - min_delta
            if mode == 'max':
                self.is_better = lambda a, best: a > best + min_delta
        else:
            if mode == 'min':
                self.is_better = lambda a, best: a < best - (
                            best * min_delta / 100)
            if mode == 'max':
                self.is_better = lambda a, best: a > best + (
                            best * min_delta / 100)

#### Define Training Logic
##### Track gradients with wandb.watch and everything else with wandb.log

In [16]:
def train_and_val(model, train_loader, test_loader, optimizer, es, config):
    # Tell wandb to watch what the model gets up to: gradients, weights, and more!
    wandb.watch(model, log="all", log_freq=10)

    # Run training and track with wandb
    total_batches = len(train_loader) * config.epochs
    print('num_training_steps', total_batches)
    progress_bar = tqdm(range(total_batches))
#     lr_scheduler = get_scheduler(
#         "linear",
#         optimizer=optimizer,
#         num_warmup_steps=500,
#         num_training_steps=total_batches,
#     )
    batch_ct = 0
    for epoch in range(config.epochs):
        model.train()
#         for _, graphdata_batch in enumerate(train_loader):
        for _, subprocess_batch in enumerate(train_loader):  
            loss = train_batch(subprocess_batch, model, optimizer, progress_bar)
#             loss = train_batch(graphdata_batch, model, optimizer, lr_scheduler, progress_bar)
            batch_ct += 1
            # Report metrics every 25th batch
            if ((batch_ct) % 25) == 0:
                train_log(loss, batch_ct, epoch)

        metric = test(model, test_loader, batch_ct, epoch)
        if es.step(metric):
            output_model = './models_maskedSent/maskedSent_{}_epoch.pth'.format(epoch+1)
#             output_model = './models_TML/TML_{}_epoch.pth'.format(epoch+1)
            save(model, optimizer, output_model)
            break
        if epoch == (config.epochs-1):
            output_model = './models_maskedSent/maskedSent_{}_epoch.pth'.format(epoch+1)
#             output_model = './models_TML/TML_{}_epoch.pth'.format(config.epochs)
            save(model, optimizer, output_model)

# train_batch(batch, model, optimizer, lr_scheduler, progress_bar)
def train_batch(batch, model, optimizer, progress_bar):
                                                                                    
#     graphdata_items = []
#     for item in batch:
#         graphdata_items.append({k: v.to(device) for k, v in item.items()})
    subprocess_item = {k: v.to(device) for k, v in batch.items()}
    
    # Forward pass ➡
    # model(graphdata_items, subprocess_item)
    loss = model(subprocess_item)                                        
    
    # Backward pass ⬅
    optimizer.zero_grad()
    loss.backward()

    # Step with optimizer and lr_scheduler
    optimizer.step()
#     lr_scheduler.step()
    progress_bar.update(1)

    return loss

In [17]:
def test(model, test_loader, batch_num, epoch):
    model.eval()

    # Run the model on some test examples
    with torch.no_grad():
        loss = 0
#         for _, graphdata_batch in enumerate(test_loader):
        for _, subprocess_batch in enumerate(test_loader):
#             graphdata_items = []
#             for item in graphdata_batch:
#                 graphdata_items.append({k: v.to(device) for k, v in item.items()})
            subprocess_item = {k: v.to(device) for k, v in subprocess_batch.items()}
            loss += model(subprocess_item)
        
        loss /= len(test_loader)
        test_log(loss, batch_num, epoch)
    
    return loss

#     # Save the model in the exchangeable ONNX format
#     torch.onnx.export(model, images, "model.onnx")
#     wandb.save("model.onnx")

In [18]:
def train_log(loss, batch_num, epoch):
    # Where the magic happens
    wandb.log({"epoch": epoch, "loss": loss}, step=batch_num)
    print(f"Loss after " + str(batch_num).zfill(5) + f" steps: {loss:.3f}")

def test_log(loss, batch_num, epoch):
    wandb.log({"val_loss": loss})
    print(f"Validation Loss after " + str(batch_num).zfill(5) + f" training steps: {loss:.3f}")

In [19]:
def save(model, optimizer, output_model):
    torch.save({
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict()
    }, output_model)

def load(output_model):
    checkpoint = torch.load(output_model)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

In [None]:
# Build, train and analyze the model with the pipeline
model = model_pipeline(config)

{'input_ids': torch.Size([2, 849]), 'attention_mask': torch.Size([2, 849]), 'labels': torch.Size([2, 88])}
num_training_steps 17060


HBox(children=(FloatProgress(value=0.0, max=17060.0), HTML(value='')))

Loss after 00025 steps: 11.063
Loss after 00050 steps: 11.186
Loss after 00075 steps: 10.681
Loss after 00100 steps: 10.317
Loss after 00125 steps: 9.836
Loss after 00150 steps: 9.006
Loss after 00175 steps: 7.937
Loss after 00200 steps: 3.222
Loss after 00225 steps: 0.636
Loss after 00250 steps: 0.327
Loss after 00275 steps: 0.314
Loss after 00300 steps: 0.495
Loss after 00325 steps: 0.351
Loss after 00350 steps: 1.250
Loss after 00375 steps: 0.347
Loss after 00400 steps: 0.308
Loss after 00425 steps: 1.126
Loss after 00450 steps: 0.551
Loss after 00475 steps: 0.238
Loss after 00500 steps: 0.326
Loss after 00525 steps: 0.192
Loss after 00550 steps: 0.209
Loss after 00575 steps: 0.201
Loss after 00600 steps: 0.321
Loss after 00625 steps: 0.257
Loss after 00650 steps: 0.253
Loss after 00675 steps: 0.526
Loss after 00700 steps: 0.647
Loss after 00725 steps: 0.164
Loss after 00750 steps: 0.395
Loss after 00775 steps: 1.297
Loss after 00800 steps: 0.170
Loss after 00825 steps: 0.070
Loss a

In [None]:
# train model with whole dataset once epoch number is set