In [None]:
# !pip3 install virtualenv
# !virtualenv my_env
# !python -m ipykernel install --user --name=my_env

In [2]:
%%capture
# Install the notebook's dependencies; %%capture hides pip's output.
# torch / torchvision / torchaudio are pinned to the CUDA 11.3 wheels.
!pip3 install torch==1.10.1+cu113 torchvision==0.11.2+cu113 torchaudio==0.10.1+cu113 -f https://download.pytorch.org/whl/cu113/torch_stable.html
# NOTE(review): the packages below are not version-pinned, so a fresh run may
# pick up newer releases than the ones this notebook was developed against.
# NOTE(review): `!pip3` runs whichever pip3 is first on the shell PATH, which is
# presumably (but not necessarily) the kernel's environment -- confirm.
!pip3 install transformers
!pip3 install sentencepiece
!pip3 install wandb --upgrade

In [3]:
import os
import random
import json
import numpy as np
import pandas as pd
import torch
import sys

import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm
from transformers import PegasusModel, PegasusForConditionalGeneration, PegasusTokenizerFast, get_scheduler

In [4]:
# Ensure deterministic behavior.
# The seed is a fixed integer on purpose: the built-in hash() of a str is salted
# per interpreter process (see PYTHONHASHSEED), so a seed derived from it changes
# on every kernel restart and the run is not repeatable.
SEED = 42


def seed_everything(seed=SEED):
    """Seed every random number generator this notebook relies on.

    Args:
        seed: Non-negative integer used for Python's `random`, NumPy, and the
            PyTorch CPU and CUDA generators.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Safe to call without a GPU; the call is then ignored.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Keep the cuDNN autotuner off (its default) so kernel selection cannot
    # differ between runs.
    torch.backends.cudnn.benchmark = False


seed_everything()

# Device configuration
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [5]:
import wandb

# Log in to Weights & Biases up front; model_pipeline() below opens a wandb run.
wandb.login()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
wandb: Currently logged in as: yenting-thesis (use `wandb login --relogin` to force relogin)


True

#### Define the Experiment and Pipeline
##### Track metadata and hyperparameters with wandb.init

In [6]:
# Hyperparameters and run metadata; handed to wandb.init() by model_pipeline()
# so that what is logged matches what is executed.
config = {
    "epochs": 10,
    "batch_size": 16,
    "optimizer": "AdamW",
    "learning_rate": 5e-5,
    "weight_decay": 0.01,
    "loss_function": "triplet-margin-loss",
    "dataset": "BPMAI-29-10-2019",
    "architecture": "encoder-seq2seq-Pegasus",
}

In [7]:
def model_pipeline(hyperparameters):
    """Run one tracked experiment end to end and return the trained model.

    A wandb run is opened for project "thesis" with `hyperparameters` as its
    config. Inside the run the model, training loader and optimizer are built
    with `make` and then trained with `train`. No evaluation step is run.

    Args:
        hyperparameters: Mapping of hyperparameters passed to `wandb.init`.

    Returns:
        The model object produced by `make` after training.
    """
    with wandb.init(project="thesis", entity="yenting-thesis", config=hyperparameters):
        # Read every hyperparameter back through wandb.config so the values
        # that are logged are exactly the values that are used.
        run_config = wandb.config

        # Build the model, the data loader and the optimizer.
        trained_model, triplet_loader, optimizer = make(run_config)

        # Fit the model on the training data.
        train(trained_model, triplet_loader, optimizer, run_config)

    return trained_model

In [8]:
def make(config):
    """Build the model, the training DataLoader and the optimizer for a run.

    Loads the 'google/pegasus-large' tokenizer and seq2seq model, wraps the
    model in `nn.DataParallel` when more than one GPU is visible, moves it to
    the module-level `device`, builds a shuffled loader over the module-level
    `graph_data`, prints the tensor shapes of the first anchor batch, and wraps
    the network in `KeplerPegasusModel`.

    Args:
        config: Object exposing `batch_size`, `weight_decay` and
            `learning_rate` as attributes.

    Returns:
        Tuple `(kepler_pegasus_model, train_loader, optimizer)`.
    """
    # Pretrained tokenizer and network ('google/pegasus-xsum' is an alternative).
    checkpoint = 'google/pegasus-large'
    tokenizer = PegasusTokenizerFast.from_pretrained(checkpoint)
    pegasus = PegasusForConditionalGeneration.from_pretrained(checkpoint, return_dict=True)

    gpu_count = torch.cuda.device_count()
    if gpu_count > 1:
        print("Let's use", gpu_count, "GPUs!")
        pegasus = nn.DataParallel(pegasus)
    pegasus.to(device)

    # Training data
    train_loader = make_loader(
        graph_data, tokenizer, shuffle=True, batch_size=config.batch_size
    )

    # Sanity check: pull a single batch and show the shape of each anchor tensor.
    anchor, _, _ = next(iter(train_loader))
    print({name: tensor.shape for name, tensor in anchor.items()})

    # Triplet-loss wrapper around the network
    kepler_pegasus_model = KeplerPegasusModel(pegasus)

    # The optimizer updates the parameters of the wrapped network.
    optimizer = torch.optim.AdamW(
        pegasus.parameters(),
        lr=config.learning_rate,
        weight_decay=config.weight_decay,
    )

    return kepler_pegasus_model, train_loader, optimizer

#### Define the Data Loading and Model

In [9]:
# Load the triplet training set. The encoding is stated explicitly so the file
# is decoded the same way regardless of the platform's locale default.
with open('./triplet_train_dataset.json', 'r', encoding='utf-8') as f:
    triplet_splits = json.load(f)

# Train on the 'negatives' and 'hard_negatives' entries together.
# presumably each entry is an (anchor, positive, negative) text triplet --
# confirm against the dataset file.
graph_data = triplet_splits['negatives'] + triplet_splits['hard_negatives']

In [10]:
class GraphDataset(torch.utils.data.Dataset):
    """Dataset that tokenizes one triplet per item.

    Each item of `triplets` is passed to the tokenizer as a single batch. The
    first three encoded rows are returned as three feature dicts (named anchor,
    positive and negative by the code). Every dict holds one tensor per
    tokenizer output key plus a 'labels' tensor that copies 'input_ids'.

    Args:
        triplets: Indexable, sized collection of triplets.
        tokenizer: Callable accepting `(texts, truncation=..., padding=...,
            max_length=...)` and returning a mapping from feature name to one
            row per input text; it must include 'input_ids'.
        max_length: Length every sequence is padded or truncated to.
            Defaults to 50.
    """

    def __init__(self, triplets, tokenizer, max_length=50):
        self.triplets = triplets
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __getitem__(self, idx):
        """Return `(anchor, positive, negative)` feature dicts for item `idx`."""
        encodings = self.tokenizer(
            self.triplets[idx],
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
        )
        anchor = self._features_at(encodings, 0)
        positive = self._features_at(encodings, 1)
        negative = self._features_at(encodings, 2)
        return anchor, positive, negative

    @staticmethod
    def _features_at(encodings, position):
        """Build the tensor dict for the text at `position` within the triplet."""
        features = {key: torch.tensor(rows[position]) for key, rows in encodings.items()}
        # 'labels' gets its own tensor rather than sharing storage with 'input_ids'.
        features['labels'] = torch.tensor(encodings['input_ids'][position])
        return features

    def __len__(self):
        """Number of triplets in the dataset."""
        return len(self.triplets)

In [11]:
def make_loader(dataset, tokenizer, shuffle, batch_size):
    """Wrap raw triplets in a `GraphDataset` and return a `DataLoader` over it.

    Args:
        dataset: Collection of triplets handed to `GraphDataset`.
        tokenizer: Tokenizer handed to `GraphDataset`.
        shuffle: Whether the loader reshuffles the data every epoch.
        batch_size: Number of triplets per batch.

    Returns:
        The configured `DataLoader`.
    """
    return DataLoader(
        dataset=GraphDataset(dataset, tokenizer),
        batch_size=batch_size,
        shuffle=shuffle,
    )

In [12]:
def get_eos_idx(batch, eos_token_id=1):
    """Return the position of the first EOS token in every sequence of a batch.

    Args:
        batch: Mapping whose 'input_ids' entry is a 2-D integer tensor with one
            row per sequence.
        eos_token_id: Token id to search for. Defaults to 1, the value that was
            previously hard-coded here (presumably the Pegasus tokenizer's EOS
            id -- confirm against `tokenizer.eos_token_id`).

    Returns:
        1-D int64 tensor with one entry per row, on the same device as
        'input_ids'. An empty batch yields an empty tensor.

    Raises:
        IndexError: If any row does not contain `eos_token_id`.
    """
    input_ids = batch['input_ids']
    if input_ids.size(0) == 0:
        return torch.empty(0, dtype=torch.long, device=input_ids.device)

    is_eos = input_ids == eos_token_id
    rows_without_eos = ~is_eos.any(dim=1)
    if rows_without_eos.any():
        bad_rows = rows_without_eos.nonzero().flatten().tolist()
        raise IndexError(
            f"no EOS token (id {eos_token_id}) found in batch rows {bad_rows}"
        )

    # Replace every non-EOS position with an out-of-range sentinel, then take
    # the row-wise minimum: what remains is the first EOS position per row.
    seq_len = input_ids.size(1)
    positions = torch.arange(seq_len, device=input_ids.device).expand_as(input_ids)
    return positions.masked_fill(~is_eos, seq_len).min(dim=1).values

In [13]:
class KeplerPegasusModel(nn.TripletMarginLoss):
    """Triplet-margin loss computed on a seq2seq model's encoder EOS embeddings.

    The wrapped `model` is called once each for the anchor, positive and
    negative inputs. From each output's `encoder_last_hidden_state` the vector
    at the position reported by `get_eos_idx` is taken per sequence, and the
    three resulting matrices are fed to `F.triplet_margin_loss`.

    Args:
        model: Network called as `model(**inputs)`; its output must expose
            `encoder_last_hidden_state`.
        margin, p, eps, swap, size_average, reduce, reduction: Forwarded
            unchanged to `nn.TripletMarginLoss`.
    """

    def __init__(self, model, margin: float = 1.0, p: float = 2., eps: float = 1e-6,
                 swap: bool = False, size_average=None, reduce=None, reduction: str = 'mean'):
        super().__init__(margin, p, eps, swap, size_average, reduce, reduction)
        self.model = model

    def _eos_embeddings(self, inputs, eos_positions):
        """Run the model on `inputs` and stack each sequence's EOS hidden state."""
        hidden_states = self.model(**inputs).encoder_last_hidden_state
        per_sequence = [
            hidden_states[row][eos_positions[row]]
            for row in range(hidden_states.size(0))
        ]
        return torch.vstack(per_sequence)

    def forward(self, graphdata_batch):
        """Return the triplet margin loss for one batch.

        Args:
            graphdata_batch: Sequence whose first three items are the anchor,
                positive and negative input dicts.

        Returns:
            The loss tensor, reduced according to `self.reduction`.
        """
        anchors = graphdata_batch[0]
        positives = graphdata_batch[1]
        negatives = graphdata_batch[2]

        # Locate the EOS token of every sequence before any forward pass.
        anchor_eos = get_eos_idx(anchors)
        positive_eos = get_eos_idx(positives)
        negative_eos = get_eos_idx(negatives)

        # Encode anchor, positive and negative (in that order) and keep only
        # the encoder state at each sequence's EOS position.
        a_eos = self._eos_embeddings(anchors, anchor_eos)
        p_eos = self._eos_embeddings(positives, positive_eos)
        n_eos = self._eos_embeddings(negatives, negative_eos)

        return F.triplet_margin_loss(
            a_eos, p_eos, n_eos,
            margin=self.margin,
            p=self.p,
            eps=self.eps,
            swap=self.swap,
            reduction=self.reduction,
        )
    

#### Define Training Logic
##### Track gradients with wandb.watch and everything else with wandb.log

In [14]:
def train(model, loader, optimizer, config):
    """Train `model` for `config.epochs` passes over `loader`, logging to wandb.

    Args:
        model: Module whose forward pass returns the loss for one batch.
        loader: Sized iterable of batches.
        optimizer: Optimizer that updates the model's parameters.
        config: Object exposing `epochs` as an attribute.
    """
    # Models loaded with Hugging Face `from_pretrained` start in evaluation
    # mode (dropout disabled), so training mode has to be enabled explicitly.
    model.train()

    # Tell wandb to watch what the model gets up to: gradients, weights, and more!
    wandb.watch(model, log="all", log_freq=10)

    # One scheduler step is taken per batch, so the schedule spans every batch
    # of every epoch.
    total_batches = len(loader) * config.epochs
    print('num_training_steps', total_batches)
    progress_bar = tqdm(range(total_batches))
    lr_scheduler = get_scheduler(
        "linear",
        optimizer=optimizer,
        num_warmup_steps=500,
        num_training_steps=total_batches,
    )

    batch_ct = 0
    try:
        for epoch in range(config.epochs):
            for graphdata_batch in loader:
                loss = train_batch(graphdata_batch, model, optimizer, lr_scheduler, progress_bar)
                batch_ct += 1

                # Report metrics every 25th batch
                if batch_ct % 25 == 0:
                    train_log(loss, batch_ct, epoch)
    finally:
        # Release the progress bar even when training is interrupted.
        progress_bar.close()


def train_batch(batch, model, optimizer, lr_scheduler, progress_bar):
    """Run one optimization step on a single batch and return its loss.

    Args:
        batch: Iterable of dicts mapping feature names to tensors.
        model: Module called with the list of device-resident dicts; returns
            the loss tensor.
        optimizer: Optimizer stepped once.
        lr_scheduler: Learning-rate scheduler stepped once.
        progress_bar: Object whose `update(1)` is called after the step.

    Returns:
        The loss tensor produced by the forward pass.
    """
    # Move every tensor of every part of the batch onto the training device.
    graphdata_batch = [
        {name: tensor.to(device) for name, tensor in part.items()}
        for part in batch
    ]

    # Forward pass
    loss = model(graphdata_batch)

    # Backward pass: clear stale gradients, then backpropagate.
    optimizer.zero_grad()
    loss.backward()

    # Advance optimizer, learning-rate schedule and progress display.
    optimizer.step()
    lr_scheduler.step()
    progress_bar.update(1)

    return loss

In [15]:
def train_log(loss, batch_num, epoch):
    """Send the current epoch and loss to wandb and print a one-line summary.

    Args:
        loss: Loss value for the most recent batch.
        batch_num: Number of batches processed so far; used as the wandb step
            and printed zero-padded to five digits.
        epoch: Index of the current epoch.
    """
    metrics = {"epoch": epoch, "loss": loss}
    wandb.log(metrics, step=batch_num)

    step_label = str(batch_num).zfill(5)
    print(f"Loss after {step_label} steps: {loss:.3f}")

In [None]:
# Build, train and analyze the model with the pipeline.
# model_pipeline() opens a wandb run configured with `config`, builds and trains
# the model inside it, and returns the trained model.
model = model_pipeline(config)

{'input_ids': torch.Size([16, 50]), 'attention_mask': torch.Size([16, 50]), 'labels': torch.Size([16, 50])}
num_training_steps 1230


HBox(children=(FloatProgress(value=0.0, max=1230.0), HTML(value='')))

Loss after 00025 steps: 0.874
Loss after 00050 steps: 0.886
Loss after 00075 steps: 0.809
Loss after 00100 steps: 0.979
Loss after 00125 steps: 0.958
Loss after 00150 steps: 0.863
Loss after 00175 steps: 0.919
Loss after 00200 steps: 0.966
Loss after 00225 steps: 0.611
Loss after 00250 steps: 0.954
Loss after 00275 steps: 0.785
Loss after 00300 steps: 0.955
Loss after 00325 steps: 0.897
Loss after 00350 steps: 0.923
Loss after 00375 steps: 0.711
Loss after 00400 steps: 0.635
Loss after 00425 steps: 0.768
Loss after 00450 steps: 0.782
Loss after 00475 steps: 0.716
Loss after 00500 steps: 0.759
Loss after 00525 steps: 1.182
Loss after 00550 steps: 0.841
Loss after 00575 steps: 0.681
Loss after 00600 steps: 0.820
Loss after 00625 steps: 0.276
Loss after 00650 steps: 0.693
Loss after 00675 steps: 0.162
Loss after 00700 steps: 0.595
