In [1]:
# !pip3 install virtualenv
# !virtualenv my_env
# !python -m ipykernel install --user --name=my_env

In [2]:
# !pip3 install torch==1.10.1+cu113 torchvision==0.11.2+cu113 torchaudio==0.10.1+cu113 -f https://download.pytorch.org/whl/cu113/torch_stable.html
# !pip3 install transformers
# !pip3 install sentencepiece

In [1]:
import json
import numpy as np
import pandas as pd
import torch

from transformers import PegasusModel, PegasusForConditionalGeneration, PegasusTokenizerFast, AdamW, get_scheduler
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm

##### Load Train Dataset, Tokenizer & Model

In [5]:
# Summarisation pairs; later cells read text_data['document'] and text_data['summary'].
with open('./train_dataset.json', 'r') as text_file:
    text_data = json.load(text_file)

# Graph data: the file holds a 'negatives' and a 'hard_negatives' list.
with open('./triplet_train_dataset.json', 'r') as triplet_file:
    graph_data = json.load(triplet_file)

# Keep a fixed-size prefix of each list and concatenate them into one flat list.
# NOTE(review): both cut-offs are multiples of 3, presumably so that consecutive
# items stay grouped as (anchor, positive, negative) triplets - confirm.
negatives = graph_data['negatives'][:4263]
hard_negatives = graph_data['hard_negatives'][:2133]
graph_data = negatives + hard_negatives

In [7]:
# Checkpoint name on the Hugging Face hub, shared by the tokenizer and the model.
model_name = 'google/pegasus-large'

# Prefer the GPU when one is visible to PyTorch.
if torch.cuda.is_available():
    torch_device = 'cuda'
else:
    torch_device = 'cpu'

tokenizer = PegasusTokenizerFast.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name, return_dict=True)
model = model.to(torch_device)

##### Define Custom Dataset, DataLoader

In [20]:
class SubprocessDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenized source texts with tokenized targets.

    Args:
        encodings: mapping of field name (e.g. 'input_ids', 'attention_mask')
            to a per-example sequence; every field is returned as a tensor.
        labels: mapping holding the tokenized targets under 'input_ids' and,
            optionally, their 'attention_mask'.
        ignore_index: value written over padded label positions (those whose
            label attention mask is 0) so the loss skips them. -100 is the
            index ignored by the Hugging Face seq2seq loss. Pass ``None`` to
            return the raw label ids unchanged.
    """
    def __init__(self, encodings, labels, ignore_index=-100):
        self.encodings = encodings
        self.labels = labels
        self.ignore_index = ignore_index
    def __getitem__(self, idx):
        """Return a dict with the encoding fields plus 'labels' for example ``idx``."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        label_ids = torch.tensor(self.labels['input_ids'][idx])
        # Without this, padded positions keep the pad token id and the model is
        # trained to predict padding after the end of every target.
        if self.ignore_index is not None and 'attention_mask' in self.labels:
            is_padding = torch.tensor(self.labels['attention_mask'][idx]) == 0
            label_ids = label_ids.masked_fill(is_padding, self.ignore_index)
        item['labels'] = label_ids
        return item # input_ids, attention_mask, labels
    def __len__(self):
        """Number of examples, taken from the label ids."""
        return len(self.labels['input_ids'])

In [8]:
class GraphDataset(torch.utils.data.Dataset):
    """Map-style dataset over tokenized graph items.

    Each example reuses its own 'input_ids' as 'labels', so the items can be
    passed to a seq2seq model that expects a labels field.
    """

    def __init__(self, encodings):
        # Mapping of field name -> per-example sequences; must contain 'input_ids'.
        self.encodings = encodings

    def __len__(self):
        """Number of examples, taken from the 'input_ids' field."""
        return len(self.encodings['input_ids'])

    def __getitem__(self, idx):
        """Return every encoding field for example ``idx`` as a tensor, plus 'labels'."""
        sample = {}
        for field, rows in self.encodings.items():
            sample[field] = torch.tensor(rows[idx])
        # Labels are an independent tensor copy of the input ids.
        sample['labels'] = torch.tensor(self.encodings['input_ids'][idx])
        return sample

In [9]:
def tokenize_data(tasks, texts, labels):
    """Tokenize the graph items and the summarisation pairs into two datasets.

    Uses the module-level ``tokenizer``. Each of the three inputs is tokenized
    in a single call with truncation and padding enabled.

    Args:
        tasks: graph-side strings, wrapped in a ``GraphDataset``.
        texts: source documents for the summarisation task.
        labels: target summaries for the summarisation task.

    Returns:
        Tuple ``(graph_dataset, subprocess_dataset)``.
    """
    def _encode(strings):
        return tokenizer(strings, truncation=True, padding=True)

    graphdata_tokenized = GraphDataset(_encode(tasks))
    subprocess_tokenized = SubprocessDataset(_encode(texts), _encode(labels))
    return graphdata_tokenized, subprocess_tokenized

In [10]:
def get_eos_idx(batch, eos_token_id=1):
    """Return the position of the first EOS token in every row of a batch.

    Args:
        batch: mapping whose 'input_ids' entry is iterable row by row, each row
            being a 1-D tensor of token ids.
        eos_token_id: id treated as end-of-sequence. Defaults to 1
            (presumably the Pegasus tokenizer's EOS id - confirm against
            ``tokenizer.eos_token_id``).

    Returns:
        1-D long tensor with one index per row; empty when the batch has no rows.

    Raises:
        IndexError: if a row does not contain ``eos_token_id``.
    """
    first_positions = []
    for row, input_ids in enumerate(batch['input_ids']):
        positions = (input_ids == eos_token_id).nonzero()
        if positions.numel() == 0:
            raise IndexError(
                'row {} of the batch contains no EOS token (id {})'.format(row, eos_token_id)
            )
        # positions has one row per match; the first row is the earliest EOS.
        first_positions.append(positions[0])
    if not first_positions:
        return torch.empty(0, dtype=torch.long)
    return torch.cat(first_positions, 0)

### KEPLER Pegasus Model
Reference implementation of the triplet margin loss used below: https://pytorch.org/docs/stable/_modules/torch/nn/modules/loss.html#TripletMarginLoss

In [11]:
import torch.nn as nn
import torch.nn.functional as F

class KeplerPegasusModel(nn.TripletMarginLoss):
    """Joint objective: triplet margin loss on encoder states plus the seq2seq loss.

    Subclasses ``nn.TripletMarginLoss`` only to reuse its hyper-parameter
    handling (margin, p, eps, swap, reduction); ``model`` is the wrapped
    network that produces both encoder states and the language-model loss.
    """

    def __init__(self, model, margin: float = 1.0, p: float = 2., eps: float = 1e-6, swap: bool = False, size_average=None,
                 reduce=None, reduction: str = 'mean'):
        super().__init__(margin, p, eps, swap, size_average, reduce, reduction)
        self.model = model

    def forward(self, graphdata_eos_idx, graphdata_batch, subprocess_batch):
        """Compute the combined loss for one step.

        Args:
            graphdata_eos_idx: per-row token position used as the sequence
                representation (the caller passes first-EOS positions).
            graphdata_batch: keyword arguments for ``self.model``; rows 0, 1 and
                2 of its encoder output are used as anchor, positive and negative.
            subprocess_batch: keyword arguments for ``self.model`` whose output
                must expose ``.loss``.

        Returns:
            ``subprocess loss + triplet margin loss``.
        """
        # Knowledge-embedding part: one encoder vector per triplet member.
        hidden = self.model(**graphdata_batch).encoder_last_hidden_state
        anchor, positive, negative = (
            hidden[row][graphdata_eos_idx[row]] for row in range(3)
        )
        ke_loss = F.triplet_margin_loss(
            anchor, positive, negative,
            margin=self.margin, p=self.p, eps=self.eps,
            swap=self.swap, reduction=self.reduction,
        )

        # Language-model part: the loss the wrapped model reports for the text batch.
        lm_loss = self.model(**subprocess_batch).loss

        return lm_loss + ke_loss
    

In [12]:
graphdata_train_dataset, subprocess_train_dataset = tokenize_data(
    graph_data, text_data['document'], text_data['summary']
)

# Graph items are read in order, three per batch, so each batch is one group of
# three consecutive items; the text side is shuffled and fed one example at a time.
# (num_workers=4 was tried and left disabled for both loaders.)
graphdata_train_dataloader = DataLoader(graphdata_train_dataset, batch_size=3, shuffle=False)
subprocess_train_dataloader = DataLoader(subprocess_train_dataset, batch_size=1, shuffle=True)

# Peek at the first pair of batches to sanity-check the tensor shapes.
graphdata_batch, subprocess_batch = next(
    iter(zip(graphdata_train_dataloader, subprocess_train_dataloader))
)
for peeked_batch in (graphdata_batch, subprocess_batch):
    print({k: v.shape for k, v in peeked_batch.items()})

{'input_ids': torch.Size([3, 34]), 'attention_mask': torch.Size([3, 34]), 'labels': torch.Size([3, 34])}
{'input_ids': torch.Size([1, 849]), 'attention_mask': torch.Size([1, 849]), 'labels': torch.Size([1, 134])}


In [13]:
# All model parameters are optimised.
optimizer = AdamW(model.parameters(), lr=5e-5, weight_decay=0.01)

num_epochs = 10
# The scheduler horizon is one step per graph-data batch, per epoch.
num_training_steps = len(graphdata_train_dataloader) * num_epochs

# Linear schedule with 500 warm-up steps.
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_training_steps=num_training_steps,
    num_warmup_steps=500,
)
print(num_training_steps)

21320


In [35]:
def save(model, optimizer, output_model):
    """Write a checkpoint holding the model and optimizer state dicts.

    Args:
        model: object exposing ``state_dict()``; stored under 'model_state_dict'.
        optimizer: object exposing ``state_dict()``; stored under
            'optimizer_state_dict'.
        output_model: destination path or writable file-like object, passed to
            ``torch.save``. For a path, missing parent directories are created.
    """
    import os  # local import keeps this cell self-contained

    # torch.save does not create directories; without this a missing checkpoint
    # folder only fails after a whole epoch of training has already run.
    if isinstance(output_model, (str, os.PathLike)):
        parent_dir = os.path.dirname(os.fspath(output_model))
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    torch.save({
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict()
    }, output_model)

# # load
# checkpoint = torch.load(output_model, map_location='cpu')
# model.load_state_dict(checkpoint['model_state_dict'])
# optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

In [None]:
progress_bar = tqdm(range(num_training_steps))

# The wrapper only stores the model and the loss hyper-parameters, so it is
# built once here rather than once per training step.
kepler_pegasus_model = KeplerPegasusModel(model)

model.train()
for epoch in range(num_epochs):
    # zip() ends with the shorter loader, so each epoch runs one step per paired batch.
    for graphdata_batch, subprocess_batch in zip(graphdata_train_dataloader, subprocess_train_dataloader):
        # EOS positions are looked up before the batch is moved to the device.
        graphdata_eos_idx = get_eos_idx(graphdata_batch)
        graphdata_batch = {k: v.to(torch_device) for k, v in graphdata_batch.items()}
        subprocess_batch = {k: v.to(torch_device) for k, v in subprocess_batch.items()}
        loss = kepler_pegasus_model(graphdata_eos_idx, graphdata_batch, subprocess_batch)
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)
    # Checkpoint every second epoch, and always after the last one so the fully
    # trained weights are not lost when num_epochs is even.
    if epoch % 2 == 0 or epoch == num_epochs - 1:
        output_model = './models/kepler_pegasus_{}_epoch.pth'.format(epoch)
        save(model, optimizer, output_model)

HBox(children=(FloatProgress(value=0.0, max=21320.0), HTML(value='')))

### Further finetuning using labeled data

In [2]:
class SubprocessDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenized source texts with tokenized targets.

    Args:
        encodings: mapping of field name (e.g. 'input_ids', 'attention_mask')
            to a per-example sequence; every field is returned as a tensor.
        labels: mapping holding the tokenized targets under 'input_ids' and,
            optionally, their 'attention_mask'.
        ignore_index: value written over padded label positions (those whose
            label attention mask is 0) so the loss skips them. -100 is the
            index ignored by the Hugging Face seq2seq loss. Pass ``None`` to
            return the raw label ids unchanged.
    """
    def __init__(self, encodings, labels, ignore_index=-100):
        self.encodings = encodings
        self.labels = labels
        self.ignore_index = ignore_index
    def __getitem__(self, idx):
        """Return a dict with the encoding fields plus 'labels' for example ``idx``."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        label_ids = torch.tensor(self.labels['input_ids'][idx])
        # Without this, padded positions keep the pad token id and the model is
        # trained to predict padding after the end of every target.
        if self.ignore_index is not None and 'attention_mask' in self.labels:
            is_padding = torch.tensor(self.labels['attention_mask'][idx]) == 0
            label_ids = label_ids.masked_fill(is_padding, self.ignore_index)
        item['labels'] = label_ids
        return item # input_ids, attention_mask, labels
    def __len__(self):
        """Number of examples, taken from the label ids."""
        return len(self.labels['input_ids'])

In [3]:
# Checkpoint name on the Hugging Face hub, shared by the tokenizer and the model.
model_name = 'google/pegasus-large'

# Prefer the GPU when one is visible to PyTorch.
if torch.cuda.is_available():
    torch_device = 'cuda'
else:
    torch_device = 'cpu'

tokenizer = PegasusTokenizerFast.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name, return_dict=True)
model = model.to(torch_device)

In [4]:
# Restore the KEPLER-pretrained weights and optimizer state from a saved epoch.
epoch = 4
optimizer = AdamW(model.parameters(), weight_decay=0.01, lr=5e-5)
output_model = './models/kepler_pegasus_{}_epoch.pth'.format(epoch)
# map_location lets a checkpoint written on a GPU machine load on a CPU-only one
# (and vice versa); without it torch.load tries to restore the original device.
# NOTE(review): torch.load unpickles the file - only load checkpoints you trust.
checkpoint = torch.load(output_model, map_location=torch_device)
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

In [4]:
# Labeled fine-tuning data; later cells read its 'document', 'summary' and
# 'flows_extracted' entries.
with open('./train_subprocess.json', 'r') as f:
    subprocess = json.load(f)

In [5]:
# Keep one example per distinct summary. np.unique returns the index of the
# first occurrence of each unique value, ordered by the sorted unique values,
# so the resulting lists are ordered by summary text rather than by file order.
unique_idx = np.unique(subprocess['summary'], return_index=True)[1]
subprocess_text, subprocess_summary, subprocess_flows_extracted = (
    [subprocess[field][i] for i in unique_idx]
    for field in ('document', 'summary', 'flows_extracted')
)
# subprocess_text = [subprocess_text[x] for x in np.sort(unique_idx)]
# subprocess_summary = [subprocess_summary[x] for x in np.sort(unique_idx)]
# subprocess_flows_extracted = [subprocess['flows_extracted'][x] for x in np.sort(unique_idx)]

In [6]:
# Number of examples left after de-duplicating by summary.
len(subprocess_summary)

153

In [8]:
def subprocess_tokenize_data(texts, labels):
    """Tokenize source texts and target summaries into a ``SubprocessDataset``.

    Uses the module-level ``tokenizer``; both inputs are tokenized in a single
    call each, with truncation and padding enabled.

    Args:
        texts: source strings.
        labels: target strings, aligned with ``texts``.

    Returns:
        ``SubprocessDataset`` built from the two tokenizer outputs.
    """
    source_encodings = tokenizer(texts, truncation=True, padding=True)
    target_encodings = tokenizer(labels, truncation=True, padding=True)
    return SubprocessDataset(source_encodings, target_encodings)

In [9]:
# The first 133 de-duplicated examples are used for fine-tuning; the remainder
# is used by the inference cells further down.
subprocess_labeled_dataset = subprocess_tokenize_data(
    subprocess_text[:133], subprocess_summary[:133]
)
subprocess_labeled_dataloader = DataLoader(
    subprocess_labeled_dataset, batch_size=1, shuffle=True
)

# Peek at one batch to sanity-check the tensor shapes.
subprocess_batch = next(iter(subprocess_labeled_dataloader))
print({k: v.shape for k, v in subprocess_batch.items()})

{'input_ids': torch.Size([1, 105]), 'attention_mask': torch.Size([1, 105]), 'labels': torch.Size([1, 22])}


In [10]:
num_epochs = 11
# One optimizer step per labeled batch, per epoch.
num_training_steps = len(subprocess_labeled_dataloader) * num_epochs

# Fresh linear schedule (500 warm-up steps) attached to the existing optimizer.
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_training_steps=num_training_steps,
    num_warmup_steps=500,
)
print(num_training_steps)

1463


In [11]:
def save(model, optimizer, output_model):
    """Write a checkpoint holding the model and optimizer state dicts.

    Args:
        model: object exposing ``state_dict()``; stored under 'model_state_dict'.
        optimizer: object exposing ``state_dict()``; stored under
            'optimizer_state_dict'.
        output_model: destination path or writable file-like object, passed to
            ``torch.save``. For a path, missing parent directories are created.
    """
    import os  # local import keeps this cell self-contained

    # torch.save does not create directories; without this a missing checkpoint
    # folder only fails after a whole epoch of training has already run.
    if isinstance(output_model, (str, os.PathLike)):
        parent_dir = os.path.dirname(os.fspath(output_model))
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    torch.save({
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict()
    }, output_model)


In [13]:
# Supervised fine-tuning on the labeled pairs: only the model's own seq2seq loss
# is optimised here.
progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    for subprocess_batch in subprocess_labeled_dataloader:        
        # Move every tensor of the batch to the training device.
        batch = {k: v.to(torch_device) for k, v in subprocess_batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        # One optimizer + scheduler step per batch, then clear the gradients.
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)
        
    # Checkpoint on even epochs (0, 2, 4, ...).
    # NOTE(review): the target directory is not created here - confirm it exists.
    if epoch % 2 == 0:
        output_model = './models_subprocess_2/kepler_pegasus_subprocess_{}_epoch.pth'.format(epoch)
        save(model, optimizer, output_model)

HBox(children=(FloatProgress(value=0.0, max=1463.0), HTML(value='')))

##### Check results -- Inference

In [7]:
# Restore the fine-tuned weights of the chosen epoch and switch to inference mode.
epoch = 6
output_model = './models_subprocess_2/kepler_pegasus_subprocess_{}_epoch.pth'.format(epoch)
# map_location lets a checkpoint written on a GPU machine load on a CPU-only one
# (and vice versa); without it torch.load tries to restore the original device.
# NOTE(review): torch.load unpickles the file - only load checkpoints you trust.
checkpoint = torch.load(output_model, map_location=torch_device)
model.load_state_dict(checkpoint['model_state_dict'])
model.eval()

PegasusForConditionalGeneration(
  (model): PegasusModel(
    (shared): Embedding(96103, 1024, padding_idx=0)
    (encoder): PegasusEncoder(
      (embed_tokens): Embedding(96103, 1024, padding_idx=0)
      (embed_positions): PegasusSinusoidalPositionalEmbedding(1024, 1024)
      (layers): ModuleList(
        (0): PegasusEncoderLayer(
          (self_attn): PegasusAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
          (final_layer_norm): LayerNorm((1024,), eps=1e-05, element

In [18]:
# Generate summaries for every example after the first 133 in a single batch.
batch = tokenizer(
    subprocess_text[133:],
    padding='longest',
    truncation=True,
    return_tensors="pt",
).to(torch_device)
translated = model.generate(**batch)
# Decode the generated ids back to text, dropping special tokens.
tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True)

In [50]:
# Position, within the slice after the first 133 examples, of the example to
# inspect (negative values count from the end).
n = -4

In [51]:
# Source text of the inspected example.
subprocess_text[133:][n]

'payment by creditcard, payment by bank-account transfer, payment by PayPal'

In [52]:
# Reference summary of the inspected example.
subprocess_summary[133:][n]

'payment'

In [40]:
# Decoded model outputs, one per example after the first 133.
tgt_text

['Manage account',
 'Editing and proofreading',
 'Check compliance',
 'Check for duplicates',
 'create insurance proposal',
 'Check completeness',
 'Fill journey information',
 'Fill loan proposal',
 'write a letter to applicant',
 'Membership process',
 'Check if information is complete',
 'NummerZwei()',
 'archive invoice per payment date',
 'Booking',
 'Negotiate terms and conditions',
 'Check if cash is available',
 'payment by installments',
 'Check completeness and car existence',
 'Traffic infringement resolution. processing traffic infringement',
 'Booking']

In [53]:
# Collect inputs, references, model outputs and extracted flows for the examples
# after the first 133, then persist them as JSON.
held_out = slice(133, None)
results = {
    'subprocess_text': subprocess_text[held_out],
    'subprocess_summary': subprocess_summary[held_out],
    'model_output': tgt_text,
    'subprocess_flows_extracted': subprocess_flows_extracted[held_out],
}

with open('results_subprocess_2.json', 'w') as results_file:
    json.dump(results, results_file)

In [None]:
# taking eos as sentence representation
# use encodings itself (or random generated text) as labels, decoder_input_ids needed for graph data output...
# create some sort of assertions? to make sure that the loss is calculated correctly
# input data design questions (input data repetitions of subprocess side)
# summarisation evaluation metrics questions

# list of planned experiments - hypothesis vs. realities
# thesis - experiment sections - data & sentence representation & KE loss definitions & evaluation metrics
# github colab...
# https://wandb.ai/site

In [None]:
# def freeze_params(model: nn.Module):
#     """Set requires_grad=False for each of model.parameters()"""
#     for par in model.parameters():
#         par.requires_grad = False

# def freeze_embeds(model):
#     """Freeze token embeddings and positional embeddings for bart, just token embeddings for t5."""
#     model_type = model.config.model_type

#     if model_type in ["t5", "mt5"]:
#         freeze_params(model.shared)
#         for d in [model.encoder, model.decoder]:
#             freeze_params(d.embed_tokens)
#     elif model_type == "fsmt":
#         for d in [model.model.encoder, model.model.decoder]:
#             freeze_params(d.embed_positions)
#             freeze_params(d.embed_tokens)
#     else:
#         freeze_params(model.model.shared)
#         for d in [model.model.encoder, model.model.decoder]:
#             freeze_params(d.embed_positions)
#             freeze_params(d.embed_tokens)
# freeze_embeds(model)

In [20]:
class KeplerPegasusModel_test(nn.TripletMarginLoss):
    """Test double that exposes only the triplet-margin part of the objective.

    Stores ``model`` but never calls it; ``forward`` takes the three embedding
    tensors directly so the result can be compared with ``nn.TripletMarginLoss``.
    """

    def __init__(self, model, margin: float = 1.0, p: float = 2., eps: float = 1e-6, swap: bool = False, size_average=None,
                 reduce=None, reduction: str = 'mean'):
        super().__init__(margin, p, eps, swap, size_average, reduce, reduction)
        self.model = model

    def forward(self, anchor, positive, negative):
        """Return the triplet margin loss using this instance's hyper-parameters."""
        # The parent forward applies F.triplet_margin_loss with the stored
        # margin, p, eps, swap and reduction.
        return super().forward(anchor, positive, negative)

In [21]:
# Pick one vector per triplet member: row i of the encoder output at position eos_idx[i].
# NOTE(review): `encoder_output` and `eos_idx` are not assigned in any visible
# cell - presumably left over from an interactive session; define them before
# running this on a fresh kernel.
anchor = encoder_output[0][eos_idx[0]]
positive = encoder_output[1][eos_idx[1]]
negative = encoder_output[2][eos_idx[2]]

In [22]:
# Loss computed through the custom wrapper class (default hyper-parameters).
model_K = KeplerPegasusModel_test(model)
loss_1 = model_K(anchor, positive, negative)
loss_1

tensor(0.0858, grad_fn=<MeanBackward0>)

In [23]:
# Reference value from the stock PyTorch loss module with the same margin and p.
triplet_margin_loss = nn.TripletMarginLoss(margin=1.0, p=2)
loss_2 = triplet_margin_loss(anchor, positive, negative)
loss_2

tensor(0.0858, grad_fn=<MeanBackward0>)

In [24]:
# Exact equality check between the wrapper's loss and the reference loss.
loss_1 == loss_2

tensor(True)

In [None]:
# triplet_loss = nn.TripletMarginLoss(margin=1.0, p=2)
# anchor = torch.randn(100, 128, requires_grad=True)
# positive = torch.randn(100, 128, requires_grad=True)
# negative = torch.randn(100, 128, requires_grad=True)
# output = triplet_loss(anchor, positive, negative)
# output.backward()

In [None]:
# anchor_output = ...  # shape [None, 128]
# positive_output = ...  # shape [None, 128]
# negative_output = ...  # shape [None, 128]

# d_pos = tf.reduce_sum(tf.square(anchor_output - positive_output), 1)
# d_neg = tf.reduce_sum(tf.square(anchor_output - negative_output), 1)

# loss = tf.maximum(0., margin + d_pos - d_neg)
# loss = tf.reduce_mean(loss)

# #         d_pos = torch.sum(torch.square(anchor_output - positive_output), -1)
# #         d_neg = torch.sum(torch.square(anchor_output - negative_output), -1)
# #         tripletloss = torch.maximum(0., margin + d_pos - d_neg)
# #         loss = torch.mean(loss)

In [69]:
# class SubprocessDataset(Dataset):
#     def __init__(
#         self,
#         data: pd.DataFrame,
#         tokenizer: PegasusTokenizerFast
#     ):                
#         self.data = data
#         self.tokenizer = tokenizer
        
#     def __len__(self):
#         return len(self.data)  
    
#     def __getitem__(self, idx:int):
#         data_row = self.data.iloc[idx]
#         text = data_row['text']
#         masked_sent = data_row['masked_sent']
#         text_encoding = self.tokenizer(text, truncation=True, padding=True)
#         masked_sent_encoding = self.tokenizer(text, truncation=True, padding=True)
#         return dict(
#             text=text,
#             masked_sent=masked_sent,
#             text_input_ids=text_encoding['input_ids'].flatten(),
#             text_attetion_mask=text_encoding['attention_mask'].flatten(),
#             labels = masked_sent_encoding['input_ids'].flatten()
#         )

In [142]:
# import pytorch_lightning as pl

# class SubprocessDataModule(pl.LightningDataModule):
#     def __init__(
#         self,
#         train_df: pd.DataFrame,
#         tokenizer: PegasusTokenizerFast,
#         batch_size: int = 1
#     ):
#         super().__init__()
        
#         self.train_df = train_df
#         self.tokenizer = tokenizer
#         self.batch_size = batch_size
        
#     def setup(self, stage=None):
#         self.train_dataset = SubprocessDataset(self.train_df, self.tokenizer)
        
#     def train_dataloader(self):
#         return DataLoader(
#             self.train_dataset,
#             batch_size = self.batch_size,
#             shuffle=True,
#             num_workers=8
#         )

In [143]:
# n_epoch = 1
# batch_size = 1
# data_module = SubprocessDataModule(train_df, tokenizer, batch_size=batch_size)

### Model

In [144]:
# class KeplerPegasusModel(pl.LightningModule):
    
#     def __init__(self):
#         super().__init__()
#         self.model = PegasusForConditionalGeneration.from_pretrained(model_name, return_dict=True).to(torch_device)
        
#     def forward(self, input_ids, attention_mask, labels):
#         output = self.model(input_ids, attention_mask=attention_mask, labels=labels)
#         return output.loss, output.logits
    
#     def training_step(self, batch, batch_idx):
#         input_ids = batch['text_input_ids']
#         attention_mask = batch['text_attetion_mask']
#         labels = batch['labels']
        
#         loss, outputs = self(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
#         self.log("train_loss", loss, prog_bar=True, logger=True)
#         return loss
    
#     def configure_optimizers(self):
#         return AdamW(self.parameters(), lr=0.0001)
        

In [145]:
# model = KeplerPegasusModel()

In [149]:
# from pytorch_lightning.callbacks import ModelCheckpoint
# from pytorch_lightning.loggers import TensorBoardLogger

# %load_ext tensorboard
# %tensorboard --logdir ./lightning_logs

In [147]:
# checkpoint_callback = ModelCheckpoint(
#     dirpath = 'checkpoints',
#     filename = 'best-checkpoint',
#     save_top_k = 1,
#     verbose = True,
# #     monitor = 'val_loss',
#     mode = 'min'
# )

# logger = TensorBoardLogger('lightning_logs', name='process-abstraction-pretrain')

# trainer = pl.Trainer(
#     logger=logger,
#     enable_checkpointing=checkpoint_callback,
#     max_epochs=n_epoch,
#     gpus=0,
#     progress_bar_refresh_rate=30,
#     log_every_n_steps=1
# )

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


In [148]:
# trainer.fit(model, data_module)


  | Name  | Type                            | Params
----------------------------------------------------------
0 | model | PegasusForConditionalGeneration | 570 M 
----------------------------------------------------------
568 M     Trainable params
2.1 M     Non-trainable params
570 M     Total params
2,283.188 Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/Users/I543118/opt/anaconda3/lib/python3.9/multiprocessing/spawn.py", line 116, in spawn_main
    exitcode = _main(fd, parent_sentinel)
  File "/Users/I543118/opt/anaconda3/lib/python3.9/multiprocessing/spawn.py", line 126, in _main
    self = reduction.pickle.load(from_parent)
AttributeError: Can't get attribute 'SubprocessDataset' on <module '__main__' (built-in)>
  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")


In [68]:
# train_texts = ['<mask_1>, Make sure there is no clash between social event and orientation sessions, Advertise social events for new students, Conduct the social event, Post event pictures in social network',
#  '<mask_1>, Verify Sales Quote, Analyze RFQ and produce L&M estimates, Generate overhead costs and determine pricing options, Review Pricing Options and Generate Sales Quote, Sales Quote Delivered',
#  '<mask_1>, <mask_1>, deny request, accept request, check security, accept request, accept request']

# train_labels = ['Create social events',
#  'RFQ Recieved',
#  'check assigment rule , check credit rating']

# subprocess_dataset = pd.DataFrame(list(zip(train_texts, train_labels)), columns = ['text', 'masked_sent'])
# subprocess_dataset

In [9]:
# class PegasusDataset(torch.utils.data.Dataset):
#     def __init__(self, encodings, labels):
#         self.encodings = encodings
#         self.labels = labels
#     def __getitem__(self, idx):
#         item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
#         item['labels'] = torch.tensor(self.labels['input_ids'][idx])  # torch.tensor(self.labels[idx])
#         return item
#     def __len__(self):
#         return len(self.labels['input_ids'])  # len(self.labels)

In [11]:
# def tokenize_data(texts, labels):
#         encodings = tokenizer(texts, truncation=True, padding=True)
#         decodings = tokenizer(labels, truncation=True, padding=True)
#         dataset_tokenized = PegasusDataset(encodings, decodings)
#         return dataset_tokenized

# train_dataset = tokenize_data(train_texts, train_labels)

In [20]:
# train_dataloader = DataLoader(
#     train_dataset, shuffle=True, batch_size=2, collate_fn=DataCollator
# )
# for batch in train_dataloader:
#     break
# print({k: v.shape for k, v in batch.items()})
# print(batch)