In [9]:
"""Script for fine-tuning Pegasus
Example usage:
  # use XSum dataset as example, with first 1000 docs as training data
  from datasets import load_dataset
  dataset = load_dataset("xsum")
  train_texts, train_labels = dataset['train']['document'][:1000], dataset['train']['summary'][:1000]
  
  # use Pegasus Large model as base for fine-tuning
  model_name = 'google/pegasus-large'
  train_dataset, _, _, tokenizer = prepare_data(model_name, train_texts, train_labels)
  trainer = prepare_fine_tuning(model_name, tokenizer, train_dataset)
  trainer.train()
 
Reference:
  https://huggingface.co/transformers/master/custom_datasets.html
"""

from transformers import PegasusForConditionalGeneration, PegasusTokenizerFast, Trainer, TrainingArguments
import torch


class PegasusDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels['input_ids'][idx])  # torch.tensor(self.labels[idx])
        return item
    def __len__(self):
        return len(self.labels['input_ids'])  # len(self.labels)

    

In [10]:
def prepare_data(model_name, 
                 train_texts, train_labels, 
                 val_texts=None, val_labels=None, 
                 test_texts=None, test_labels=None):
    """
    Prepare input data for model fine-tuning
    """
    tokenizer = PegasusTokenizerFast.from_pretrained(model_name)

    prepare_val = False if val_texts is None or val_labels is None else True
    prepare_test = False if test_texts is None or test_labels is None else True

    def tokenize_data(texts, labels):
        encodings = tokenizer(texts, truncation=True, padding=True)
        decodings = tokenizer(labels, truncation=True, padding=True)
        dataset_tokenized = PegasusDataset(encodings, decodings)
        return dataset_tokenized

    train_dataset = tokenize_data(train_texts, train_labels)
    val_dataset = tokenize_data(val_texts, val_labels) if prepare_val else None
    test_dataset = tokenize_data(test_texts, test_labels) if prepare_test else None

    return train_dataset, val_dataset, test_dataset, tokenizer

In [11]:
def prepare_fine_tuning(model_name, tokenizer, train_dataset, val_dataset=None, freeze_encoder=False, output_dir='./results'):
    """
    Prepare configurations and base model for fine-tuning
    """
    torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model = PegasusForConditionalGeneration.from_pretrained(model_name).to(torch_device)

    if freeze_encoder:
        for param in model.model.encoder.parameters():
            param.requires_grad = False

    if val_dataset is not None:
        training_args = TrainingArguments(
          output_dir=output_dir,           # output directory
          num_train_epochs=2000,           # total number of training epochs
          per_device_train_batch_size=1,   # batch size per device during training, can increase if memory allows
          per_device_eval_batch_size=1,    # batch size for evaluation, can increase if memory allows
          save_steps=500,                  # number of updates steps before checkpoint saves
          save_total_limit=5,              # limit the total amount of checkpoints and deletes the older checkpoints
          evaluation_strategy='steps',     # evaluation strategy to adopt during training
          eval_steps=100,                  # number of update steps before evaluation
          warmup_steps=500,                # number of warmup steps for learning rate scheduler
          weight_decay=0.01,               # strength of weight decay
          logging_dir='./logs',            # directory for storing logs
          logging_steps=10,
        )

        trainer = Trainer(
          model=model,                         # the instantiated 🤗 Transformers model to be trained
          args=training_args,                  # training arguments, defined above
          train_dataset=train_dataset,         # training dataset
          eval_dataset=val_dataset,            # evaluation dataset
          tokenizer=tokenizer
        )

    else:
        training_args = TrainingArguments(
          output_dir=output_dir,           # output directory
          num_train_epochs=3, #2000      # total number of training epochs
          per_device_train_batch_size=1,   # batch size per device during training, can increase if memory allows
          save_steps=500,                  # number of updates steps before checkpoint saves
          save_total_limit=5,              # limit the total amount of checkpoints and deletes the older checkpoints
          warmup_steps=500,                # number of warmup steps for learning rate scheduler
          weight_decay=0.01,               # strength of weight decay
          logging_dir='./logs',            # directory for storing logs
          logging_steps=10,
        )

        trainer = Trainer(
          model=model,                         # the instantiated 🤗 Transformers model to be trained
          args=training_args,                  # training arguments, defined above
          train_dataset=train_dataset,         # training dataset
          tokenizer=tokenizer
        )

    return trainer

In [12]:
# !pip3 install datasets
# from datasets import load_dataset
# dataset = load_dataset("xsum")
# train_texts, train_labels = dataset['train']['document'][:1000], dataset['train']['summary'][:1000]

In [42]:
import json
with open('train_dataset.json') as json_file:
    data = json.load(json_file)
    input_texts, input_labels, flows_extracted, models_skipped = data['document'], data['summary'], data['flows_extracted'], data['models_skipped']

In [16]:
print(len(train_texts))
print(len(train_labels))
print(len(models_skipped))

2132
2132
100


In [17]:
import numpy as np
unique_train_labels, label_ind = np.unique(input_labels, return_index=True)
train_labels = [input_labels[x] for x in sorted(label_ind)]
train_texts = [t for i, t in enumerate(input_texts) if i in label_ind]
print(len(unique_train_labels))
print(len(train_texts))

1640
1640


In [62]:
train_texts[50:55]

['<mask_1>, Login, Redirected to homepage, Click "Create Event", Enter information about event, Save',
 'Validate and process resource request, <mask_1>, Task or Subprocess, Advertise job, Assess applications & shortlist',
 'Claim Received, <mask_1>, Request more Information, <mask_1>, <mask_1>, Send Payment to Customer, Reject Claim, <mask_1>, Send Payment to Customer, Reject Claim',
 '<mask_1>, Iterview, Define process models and related metrics, Publsih models to PCE, Provide comments & feedback, Provide comments & feedback, Collect and review feedback',
 '<mask_1>, create meeting folder, upload, -1 week, reminder, attend, prepare minutes']

In [63]:
train_labels[50:55]

['Call www.keskispass.ch',
 'Consider candidates at cross-site resource meeting',
 'Review Claim, Review Claim, Review Claim, Review Claim',
 'Collect process list and scope',
 'meeting due']

In [64]:
flows_extracted[53]

'1048832532'

In [34]:
[i for i, t in enumerate(train_labels) if t == 'RFQ Recieved']

[1, 157, 379, 385, 1142, 1380, 1535, 1641]

In [35]:
for i in [1, 157, 379, 385, 1142, 1380, 1535, 1641]:
    print(flows_extracted[i])

1000982613
1138902023
1321534353
1330479125
2019585323
282222542
450107762
558814631


In [16]:
# use Pegasus Large model as base for fine-tuning
model_name = 'google/pegasus-large'
train_dataset, _, _, tokenizer = prepare_data(model_name, train_texts, train_labels)
trainer = prepare_fine_tuning(model_name, tokenizer, train_dataset)
trainer.train()

***** Running training *****
  Num examples = 1758
  Num Epochs = 3
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 5274


Step,Training Loss
10,12.3348
20,12.5731


KeyboardInterrupt: 

In [32]:
flows_extracted[500]

'1430644864'

In [139]:
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
import torch

torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = 'result-1000ds/checkpoint-6000'

tokenizer = PegasusTokenizer.from_pretrained(model)
model = PegasusForConditionalGeneration.from_pretrained(model).to(torch_device)

OSError: Error no file named ['pytorch_model.bin', 'tf_model.h5', 'model.ckpt.index', 'flax_model.msgpack'] found in directory result-1000ds/checkpoint-6000 or `from_tf` and `from_flax` set to False.

In [83]:
# # Check results
# src_text = input_texts[500]
# batch = tokenizer(src_text, truncation=True, padding='longest', return_tensors="pt").to(torch_device)
# translated = model.generate(**batch)
# tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True)

# print(tgt_text)

In [68]:
# importing dependencies for transformers
from transformers import PegasusForConditionalGeneration, PegasusTokenizer

In [69]:
# load tokenizer
tokenizer = PegasusTokenizer.from_pretrained('google/pegasus-xsum')
# load model
model = PegasusForConditionalGeneration.from_pretrained('google/pegasus-xsum')

In [73]:
# load tokenizer
tokenizer_a = PegasusTokenizer.from_pretrained('google/pegasus-aeslc')
# load model
model_a = PegasusForConditionalGeneration.from_pretrained('google/pegasus-aeslc')

Downloading:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/87.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.07k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.12G [00:00<?, ?B/s]

In [110]:
input_texts[500]

'Vacancy, Creat staff requisition, send for approval , <mask_1>, Advertise through Agencies, Advertise through Internal NGA.net, Advertise through External NGA.net, Received applications, 14 days since advertisment , Close Advertisement , Send applications to the Recruitment Manager , Received Nominated applications for Interview , Create short list, <mask_1>, psychometric test, <mask_1>, second interview, Select the Candidate, Determine terms and conditions, Complete Appointment Approval , Offer Appointment, Close Off the Recruitment and Selection Process'

In [111]:
input_labels[500]

'Approval received , face to face Interview , Conduct two reference check'

In [112]:
text = 'Vacancy, Creat staff requisition, send for approval , Approval received , Advertise through Agencies, Advertise through Internal NGA.net, Advertise through External NGA.net, Received applications, 14 days since advertisment , Close Advertisement , Send applications to the Recruitment Manager , Received Nominated applications for Interview , Create short list, face to face Interview , psychometric test, Conduct two reference check, second interview, Select the Candidate, Determine terms and conditions, Complete Appointment Approval , Offer Appointment, Close Off the Recruitment and Selection Process'

In [113]:
tokens = tokenizer_a(text, truncation=True, padding='longest', return_tensors='pt')
summary = model_a.generate(**tokens)
tokenizer.decode(summary[0])

'URGENT REQUIRES IMMEDIATE ACTION, URGENT'

In [131]:
[i for i, t in enumerate(input_labels) if t == 'Daily']

[47, 61, 83, 96, 125]

In [133]:
for i in [47, 61, 83, 96, 125]:
    print(flows_extracted[i])

1312704923
1421639530
1568206019
1671787493
2065109784


In [15]:
import json
with open('train_masked_optimized.json') as json_file:
    data = json.load(json_file)
    input_texts, input_labels, flows_extracted, models_skipped = data['document'], data['summary'], data['flows_extracted'], data['models_skipped']

In [17]:
print(len(input_texts))
print(len(input_labels))
print(len(flows_extracted))
print(len(models_skipped))

1901
1901
1901
100


In [18]:
import numpy as np
unique_train_labels, label_ind = np.unique(input_labels, return_index=True)
train_labels = [input_labels[x] for x in sorted(label_ind)]
train_texts = [t for i, t in enumerate(input_texts) if i in label_ind]
train_process_models = [t for i, t in enumerate(flows_extracted) if i in label_ind]
print(len(train_labels))
print(len(train_texts))
print(len(train_process_models))

1451
1451
1451


In [12]:
with open('results-masked-optimized-step1.json') as json_file:
    data = json.load(json_file)
data['test_process_models'] = train_process_models[1000:]

In [14]:
data.keys()

dict_keys(['test_texts', 'test_labels', 'test_process_models', 'results'])

In [11]:
len(data['test_labels'])

445

In [143]:
train_labels[-20:]

['Invoices with errors',
 "Sort invoice per Vendor, Enter vendor name, client name, date of arrival at SSP on invoice entry form, Enter invoice process date according to client's SLA on invoice entry form, Follow up further error",
 'Generation of ID Code',
 'Give Administration/Director payment details',
 'Receive details, Transaction Approved',
 'Check quote',
 'Request for quote received',
 'Check process model , Extent process to L 2, 3',
 'decide if person can become member',
 '6.1. Shipment documents verification, 6.7. Verification of prepared invoice and accounted invoice ',
 'phone rings, receive letter',
 'takes all letters from Charles',
 'Check customer data, Check customers liquidity',
 '1 week before meeting ',
 'prepares plan and valuation',
 'check available dishes, ask for tiramisu, ask for check',
 'Begin Patent Process',
 'Pays for application, Receives info, Checks entrance test and exceptions',
 'Personal Information',
 'Check Customer, Enter Customer Requirements']

In [144]:
train_texts[-20:]

['<mask_1>, Next business day, Process invoice, Invoice ready for Validation, Vendor master record needed, Vendor Master Record exception, 11am and 3pm, Invoices collected',
 "Invoices from Mail Centre Clerk, Sort invoice per Client, <mask_1>, Staple 'Invoice Entry Form' to invoice, <mask_1>, Post invoice back to Client, Look it up in excel file, <mask_1>, Highlight Invoice as 'Urgent', Send invoice back to Client, Send reminder, Follow up by phone, Disregard invoice, Invoices ready and allocated to Data Entry Officers, Send invoice back to Client, Send reminder, Send reminder, <mask_1>, Disregard invoice, Don't follow up, Don't send reminders, Scalate to experienced SSP member, Invoice from Experienced SSP member",
 'Receive a Cash transfer order, Check reachebility of destination & maximum transfer limit, Insert money to bank account, <mask_1>, min. 2 days, Bank Withdrawal, Receive a requirement for withdrawal., Dispensing cash, Cash',
 'BIA office hours, Call BIA office, Request for