In [1]:
import json
import pandas as pd
import torch

from transformers import PegasusModel, PegasusForConditionalGeneration, PegasusTokenizerFast, AdamW, get_scheduler
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm

##### Load Train Dataset, Tokenizer & Model

In [2]:
with open('./train_dataset.json', 'r') as f:
    data = json.load(f)

In [3]:
model_name = 'google/pegasus-large'
torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
tokenizer = PegasusTokenizerFast.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name, return_dict=True).to(torch_device)

##### Define Custom Dataset, DataLoader

In [4]:
class SubprocessDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels['input_ids'][idx]) # torch.tensor(self.labels[idx])
        return item # input_ids, attention_mask, labels
    def __len__(self):
        return len(self.labels['input_ids']) # len(self.labels)

In [5]:
class GraphDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # 1 training step - see if decoder weights change
        item['labels'] = torch.tensor(self.encodings['input_ids'][idx])
        return item # input_ids, attention_mask, labels
    def __len__(self):
        return len(self.encodings['input_ids']) # len(self.labels)

In [6]:
def tokenize_data(texts, labels):
        encodings = tokenizer(texts, truncation=True, padding=True)
        decodings = tokenizer(labels, truncation=True, padding=True)
        graphdata_tokenized = GraphDataset(encodings)
        subprocess_tokenized = SubprocessDataset(encodings, decodings)
        return graphdata_tokenized, subprocess_tokenized

graphdata_train_dataset, subprocess_train_dataset = tokenize_data(data['document'][:9], data['summary'][:9])

In [7]:
graphdata_train_dataloader = DataLoader(
    graphdata_train_dataset, shuffle=False, batch_size=3
#     num_workers=4
)
subprocess_train_dataloader = DataLoader(
    subprocess_train_dataset, shuffle=True, batch_size=3
#     num_workers=4
)

for graphdata_batch, subprocess_batch in zip(graphdata_train_dataloader, subprocess_train_dataloader):
    break
print({k: v.shape for k, v in graphdata_batch.items()})
print({k: v.shape for k, v in subprocess_batch.items()})

{'input_ids': torch.Size([3, 77]), 'attention_mask': torch.Size([3, 77]), 'labels': torch.Size([3, 77])}
{'input_ids': torch.Size([3, 77]), 'attention_mask': torch.Size([3, 77]), 'labels': torch.Size([3, 24])}


In [8]:
decoded_string = tokenizer.decode(graphdata_batch['input_ids'][0])
print(decoded_string)

10 am,<mask_1>, In system, Not in system, Record Vendor master,<mask_1>, Send to outbox, Send to priority outbox, Urgent</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>


In [9]:
graphdata_batch['input_ids'][0]

tensor([  377,   346,   108,     2,   110,   108,   222,   327,   108,  1089,
          115,   327,   108, 10297, 26658,  2080,   110,   108,     2,   110,
          108,  8462,   112,   165,  4835,   108,  8462,   112,  3559,   165,
         4835,   108, 42298,     1,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0])

In [10]:
def get_eos_idx(batch):
    for input_ids in batch['input_ids']:
        eos_id = input_ids == 1
        idx = eos_id.nonzero()[0]
        if 'eos_idx' in locals():
            eos_idx = torch.cat((eos_idx, idx), 0)
        else:
            eos_idx = eos_id.nonzero()[0]
    return eos_idx
get_eos_idx(graphdata_batch)

tensor([33, 42, 66])

##### Extract Model Output - encoder_last_hidden_state & loss

In [10]:
output = model(**batch)
output.loss

tensor(8.3893, grad_fn=<NllLossBackward0>)

In [11]:
output.encoder_last_hidden_state

tensor([[[-2.4870e-02, -4.7812e-02,  1.0340e-01,  ...,  4.9686e-02,
          -5.3139e-02, -3.3579e-02],
         [-1.5876e-02,  8.8630e-02, -1.5995e-02,  ...,  2.4677e-01,
          -2.0305e-01, -5.9902e-03],
         [-2.2632e-02,  1.6212e-03,  5.9650e-02,  ..., -7.6430e-02,
           9.9388e-02, -3.3480e-02],
         ...,
         [ 1.0401e-01, -7.4977e-02,  5.6770e-02,  ...,  1.9212e-01,
           2.0357e-01,  1.0390e-01],
         [ 4.4121e-03, -1.8117e-02,  3.4443e-02,  ...,  1.6542e-01,
           1.1914e-01,  2.3607e-01],
         [ 2.3097e-02, -2.0389e-02,  2.1372e-02,  ...,  1.3392e-01,
           1.5417e-01,  2.0417e-01]],

        [[ 6.8591e-02, -8.6850e-02,  3.6081e-03,  ..., -3.4109e-02,
          -8.6330e-02,  1.4675e-01],
         [-1.7537e-02,  2.3032e-01,  2.4777e-02,  ..., -2.2457e-02,
           3.6170e-02,  1.5164e-02],
         [ 1.1273e-01, -1.7025e-01,  1.7403e-03,  ..., -3.4095e-02,
          -2.1881e-02,  1.7849e-01],
         ...,
         [ 8.9253e-02,  5

In [12]:
output.encoder_last_hidden_state.shape

torch.Size([3, 77, 1024])

In [13]:
eos_idx = get_eos_ids(batch)
encoder_output = output.encoder_last_hidden_state
encoder_output[0][eos_idx[0]].shape

torch.Size([1024])

##### Try Model Fine-tuning

In [9]:
optimizer = AdamW(model.parameters(), lr=5e-5)

In [10]:
num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
print(num_training_steps)

15


In [12]:
progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v.to(torch_device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

  0%|          | 0/15 [00:00<?, ?it/s]

### KEPLER Pegasus Model 
https://pytorch.org/docs/stable/_modules/torch/nn/modules/loss.html#TripletMarginLoss

In [11]:
import torch.nn as nn
import torch.nn.functional as F

class KeplerPegasusModel(nn.TripletMarginLoss):
    
    def __init__(self, model, margin: float = 1.0, p: float = 2., eps: float = 1e-6, swap: bool = False, size_average=None,
                 reduce=None, reduction: str = 'mean'):
        super().__init__(margin, p, eps, swap, size_average, reduce, reduction)
        self.model = model
        
    def forward(self, graphdata_eos_idx, graphdata_batch, subprocess_batch):
        # Triplet margin loss
        model_output = self.model(**graphdata_batch)
        encoder_output = model_output.encoder_last_hidden_state
        anchor = encoder_output[0][graphdata_eos_idx[0]]
        positive = encoder_output[1][graphdata_eos_idx[1]]
        negative = encoder_output[2][graphdata_eos_idx[2]]
        triplet_margin_loss = F.triplet_margin_loss(anchor, positive, negative, margin=self.margin, p=self.p,
                                 eps=self.eps, swap=self.swap, reduction=self.reduction)
        # Pegasus loss
        MLM_output = self.model(**subprocess_batch)
        
        loss = MLM_output.loss + triplet_margin_loss 
        # monitor losses
        # assertions?
        
        return loss
    

In [12]:
optimizer = AdamW(model.parameters(), lr=5e-5)

num_epochs = 3
num_training_steps = num_epochs * len(graphdata_train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
print(num_training_steps)

9


In [13]:
progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    for graphdata_batch, subprocess_batch in zip(graphdata_train_dataloader, subprocess_train_dataloader):        
        graphdata_eos_idx = get_eos_idx(graphdata_batch)
        graphdata_batch = {k: v.to(torch_device) for k, v in graphdata_batch.items()}
        subprocess_batch = {k: v.to(torch_device) for k, v in subprocess_batch.items()}
        kepler_pegasus_model = KeplerPegasusModel(model)
        loss = kepler_pegasus_model(graphdata_eos_idx, graphdata_batch, subprocess_batch)
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/9 [00:00<?, ?it/s]

In [None]:
# taking eos as sentence representation
# use encodings itself (or random generated text) as labels, decoder_input_ids needed for graph data output...
# create some sort of assertions? to make sure that the loss is calculated correctly
# input data design questions (input data repetitions of subprocess side)
# summarisation evaluation metrics questions

# list of planned experiments - hypothesis vs. realities
# thesis - experiment sections - data & sentence representation & KE loss definitions & evaluation metrics
# github colab...
# https://wandb.ai/site

In [20]:
class KeplerPegasusModel_test(nn.TripletMarginLoss):
    
    def __init__(self, model, margin: float = 1.0, p: float = 2., eps: float = 1e-6, swap: bool = False, size_average=None,
                 reduce=None, reduction: str = 'mean'):
        super().__init__(margin, p, eps, swap, size_average, reduce, reduction)
        self.model = model
        
    def forward(self, anchor, positive, negative):
        triplet_margin_loss = F.triplet_margin_loss(anchor, positive, negative, margin=self.margin, p=self.p,
                                 eps=self.eps, swap=self.swap, reduction=self.reduction)       
        return triplet_margin_loss

In [21]:
anchor = encoder_output[0][eos_idx[0]]
positive = encoder_output[1][eos_idx[1]]
negative = encoder_output[2][eos_idx[2]]

In [22]:
model_K = KeplerPegasusModel_test(model)
loss_1 = model_K(anchor, positive, negative)
loss_1

tensor(0.0858, grad_fn=<MeanBackward0>)

In [23]:
triplet_margin_loss = nn.TripletMarginLoss(margin=1.0, p=2)
loss_2 = triplet_margin_loss(anchor, positive, negative)
loss_2

tensor(0.0858, grad_fn=<MeanBackward0>)

In [24]:
loss_1 == loss_2

tensor(True)

In [None]:
# triplet_loss = nn.TripletMarginLoss(margin=1.0, p=2)
# anchor = torch.randn(100, 128, requires_grad=True)
# positive = torch.randn(100, 128, requires_grad=True)
# negative = torch.randn(100, 128, requires_grad=True)
# output = triplet_loss(anchor, positive, negative)
# output.backward()

In [None]:
# anchor_output = ...  # shape [None, 128]
# positive_output = ...  # shape [None, 128]
# negative_output = ...  # shape [None, 128]

# d_pos = tf.reduce_sum(tf.square(anchor_output - positive_output), 1)
# d_neg = tf.reduce_sum(tf.square(anchor_output - negative_output), 1)

# loss = tf.maximum(0., margin + d_pos - d_neg)
# loss = tf.reduce_mean(loss)

# #         d_pos = torch.sum(torch.square(anchor_output - positive_output), -1)
# #         d_neg = torch.sum(torch.square(anchor_output - negative_output), -1)
# #         tripletloss = torch.maximum(0., margin + d_pos - d_neg)
# #         loss = torch.mean(loss)

In [69]:
# class SubprocessDataset(Dataset):
#     def __init__(
#         self,
#         data: pd.DataFrame,
#         tokenizer: PegasusTokenizerFast
#     ):                
#         self.data = data
#         self.tokenizer = tokenizer
        
#     def __len__(self):
#         return len(self.data)  
    
#     def __getitem__(self, idx:int):
#         data_row = self.data.iloc[idx]
#         text = data_row['text']
#         masked_sent = data_row['masked_sent']
#         text_encoding = self.tokenizer(text, truncation=True, padding=True)
#         masked_sent_encoding = self.tokenizer(text, truncation=True, padding=True)
#         return dict(
#             text=text,
#             masked_sent=masked_sent,
#             text_input_ids=text_encoding['input_ids'].flatten(),
#             text_attetion_mask=text_encoding['attention_mask'].flatten(),
#             labels = masked_sent_encoding['input_ids'].flatten()
#         )

In [142]:
# import pytorch_lightning as pl

# class SubprocessDataModule(pl.LightningDataModule):
#     def __init__(
#         self,
#         train_df: pd.DataFrame,
#         tokenizer: PegasusTokenizerFast,
#         batch_size: int = 1
#     ):
#         super().__init__()
        
#         self.train_df = train_df
#         self.tokenizer = tokenizer
#         self.batch_size = batch_size
        
#     def setup(self, stage=None):
#         self.train_dataset = SubprocessDataset(self.train_df, self.tokenizer)
        
#     def train_dataloader(self):
#         return DataLoader(
#             self.train_dataset,
#             batch_size = self.batch_size,
#             shuffle=True,
#             num_workers=8
#         )

In [143]:
# n_epoch = 1
# batch_size = 1
# data_module = SubprocessDataModule(train_df, tokenizer, batch_size=batch_size)

### Model

In [144]:
# class KeplerPegasusModel(pl.LightningModule):
    
#     def __init__(self):
#         super().__init__()
#         self.model = PegasusForConditionalGeneration.from_pretrained(model_name, return_dict=True).to(torch_device)
        
#     def forward(self, input_ids, attention_mask, labels):
#         output = self.model(input_ids, attention_mask=attention_mask, labels=labels)
#         return output.loss, output.logits
    
#     def training_step(self, batch, batch_idx):
#         input_ids = batch['text_input_ids']
#         attention_mask = batch['text_attetion_mask']
#         labels = batch['labels']
        
#         loss, outputs = self(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
#         self.log("train_loss", loss, prog_bar=True, logger=True)
#         return loss
    
#     def configure_optimizers(self):
#         return AdamW(self.parameters(), lr=0.0001)
        

In [145]:
# model = KeplerPegasusModel()

In [149]:
# from pytorch_lightning.callbacks import ModelCheckpoint
# from pytorch_lightning.loggers import TensorBoardLogger

# %load_ext tensorboard
# %tensorboard --logdir ./lightning_logs

In [147]:
# checkpoint_callback = ModelCheckpoint(
#     dirpath = 'checkpoints',
#     filename = 'best-checkpoint',
#     save_top_k = 1,
#     verbose = True,
# #     monitor = 'val_loss',
#     mode = 'min'
# )

# logger = TensorBoardLogger('lightning_logs', name='process-abstraction-pretrain')

# trainer = pl.Trainer(
#     logger=logger,
#     enable_checkpointing=checkpoint_callback,
#     max_epochs=n_epoch,
#     gpus=0,
#     progress_bar_refresh_rate=30,
#     log_every_n_steps=1
# )

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


In [148]:
# trainer.fit(model, data_module)


  | Name  | Type                            | Params
----------------------------------------------------------
0 | model | PegasusForConditionalGeneration | 570 M 
----------------------------------------------------------
568 M     Trainable params
2.1 M     Non-trainable params
570 M     Total params
2,283.188 Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/Users/I543118/opt/anaconda3/lib/python3.9/multiprocessing/spawn.py", line 116, in spawn_main
    exitcode = _main(fd, parent_sentinel)
  File "/Users/I543118/opt/anaconda3/lib/python3.9/multiprocessing/spawn.py", line 126, in _main
    self = reduction.pickle.load(from_parent)
AttributeError: Can't get attribute 'SubprocessDataset' on <module '__main__' (built-in)>
  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")


In [68]:
# train_texts = ['<mask_1>, Make sure there is no clash between social event and orientation sessions, Advertise social events for new students, Conduct the social event, Post event pictures in social network',
#  '<mask_1>, Verify Sales Quote, Analyze RFQ and produce L&M estimates, Generate overhead costs and determine pricing options, Review Pricing Options and Generate Sales Quote, Sales Quote Delivered',
#  '<mask_1>, <mask_1>, deny request, accept request, check security, accept request, accept request']

# train_labels = ['Create social events',
#  'RFQ Recieved',
#  'check assigment rule , check credit rating']

# subprocess_dataset = pd.DataFrame(list(zip(train_texts, train_labels)), columns = ['text', 'masked_sent'])
# subprocess_dataset

In [9]:
# class PegasusDataset(torch.utils.data.Dataset):
#     def __init__(self, encodings, labels):
#         self.encodings = encodings
#         self.labels = labels
#     def __getitem__(self, idx):
#         item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
#         item['labels'] = torch.tensor(self.labels['input_ids'][idx])  # torch.tensor(self.labels[idx])
#         return item
#     def __len__(self):
#         return len(self.labels['input_ids'])  # len(self.labels)

In [11]:
# def tokenize_data(texts, labels):
#         encodings = tokenizer(texts, truncation=True, padding=True)
#         decodings = tokenizer(labels, truncation=True, padding=True)
#         dataset_tokenized = PegasusDataset(encodings, decodings)
#         return dataset_tokenized

# train_dataset = tokenize_data(train_texts, train_labels)

In [20]:
# train_dataloader = DataLoader(
#     train_dataset, shuffle=True, batch_size=2, collate_fn=DataCollator
# )
# for batch in train_dataloader:
#     break
# print({k: v.shape for k, v in batch.items()})
# print(batch)