In [1]:
from transformers import PegasusForConditionalGeneration, PegasusTokenizer, Trainer, TrainingArguments
import torch


class PegasusDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels['input_ids'][idx])  # torch.tensor(self.labels[idx])
        return item
    def __len__(self):
        return len(self.labels['input_ids'])  # len(self.labels)

      
def prepare_data(model_name, 
                 train_texts, train_labels, 
                 val_texts=None, val_labels=None, 
                 test_texts=None, test_labels=None):
  """
  Prepare input data for model fine-tuning
  """
  tokenizer = PegasusTokenizer.from_pretrained(model_name)

  prepare_val = False if val_texts is None or val_labels is None else True
  prepare_test = False if test_texts is None or test_labels is None else True

  def tokenize_data(texts, labels):
    encodings = tokenizer(texts, truncation=True, padding=True)
    decodings = tokenizer(labels, truncation=True, padding=True)
    dataset_tokenized = PegasusDataset(encodings, decodings)
    return dataset_tokenized

  train_dataset = tokenize_data(train_texts, train_labels)
  val_dataset = tokenize_data(val_texts, val_labels) if prepare_val else None
  test_dataset = tokenize_data(test_texts, test_labels) if prepare_test else None

  return train_dataset, val_dataset, test_dataset, tokenizer


def prepare_fine_tuning(model_name, tokenizer, train_dataset, val_dataset=None, freeze_encoder=False, output_dir='./results'):
  """
  Prepare configurations and base model for fine-tuning
  """
  torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
  model = PegasusForConditionalGeneration.from_pretrained(model_name).to(torch_device)

  if freeze_encoder:
    for param in model.model.encoder.parameters():
      param.requires_grad = False

  if val_dataset is not None:
    training_args = TrainingArguments(
      output_dir=output_dir,           # output directory
      num_train_epochs=2000,           # total number of training epochs
      per_device_train_batch_size=1,   # batch size per device during training, can increase if memory allows
      per_device_eval_batch_size=1,    # batch size for evaluation, can increase if memory allows
      save_steps=500,                  # number of updates steps before checkpoint saves
      save_total_limit=5,              # limit the total amount of checkpoints and deletes the older checkpoints
      evaluation_strategy='steps',     # evaluation strategy to adopt during training
      eval_steps=100,                  # number of update steps before evaluation
      warmup_steps=500,                # number of warmup steps for learning rate scheduler
      weight_decay=0.01,               # strength of weight decay
      logging_dir='./logs',            # directory for storing logs
      logging_steps=10,
    )

    trainer = Trainer(
      model=model,                         # the instantiated 🤗 Transformers model to be trained
      args=training_args,                  # training arguments, defined above
      train_dataset=train_dataset,         # training dataset
      eval_dataset=val_dataset,            # evaluation dataset
      tokenizer=tokenizer
    )

  else:
    training_args = TrainingArguments(
      output_dir=output_dir,           # output directory
      num_train_epochs=2000,           # total number of training epochs
      per_device_train_batch_size=1,   # batch size per device during training, can increase if memory allows
      save_steps=500,                  # number of updates steps before checkpoint saves
      save_total_limit=5,              # limit the total amount of checkpoints and deletes the older checkpoints
      warmup_steps=500,                # number of warmup steps for learning rate scheduler
      weight_decay=0.01,               # strength of weight decay
      logging_dir='./logs',            # directory for storing logs
      logging_steps=10,
    )

    trainer = Trainer(
      model=model,                         # the instantiated 🤗 Transformers model to be trained
      args=training_args,                  # training arguments, defined above
      train_dataset=train_dataset,         # training dataset
      tokenizer=tokenizer
    )

  return trainer


In [2]:
import pandas as pd
df = pd.read_csv(
    r'D:\backup_user\crypto\thesis\my-repos\cryptocurrencies-kpa\data\processed\ARG_KP_2021\all_complete.csv')
df = df[["topic", "stance", "argument", "key_point"]]

In [3]:
import ast
all_texts = []
all_sum = []
for i, row in df.iterrows():
    all_texts.append(row['argument'])
    all_sum.append(row['key_point'])
  


In [4]:
# use XSum dataset as example, with first 1000 docs as training data
#from datasets import load_dataset
#dataset = load_dataset("xsum")

train_texts, train_labels = all_texts[:1000], all_sum[:1000]

# use Pegasus Large model as base for fine-tuning
model_name = 'google/pegasus-large'
train_dataset, _, _, tokenizer = prepare_data(model_name, train_texts, train_labels)
trainer = prepare_fine_tuning(model_name, tokenizer, train_dataset)
trainer.train()

***** Running training *****
  Num examples = 1000
  Num Epochs = 2000
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 2000000


Step,Training Loss
10,7.8886
20,9.0096
30,8.4217
40,9.2245
50,7.5653
60,7.2943
70,7.6768
80,7.3406
90,5.97
100,5.8386


KeyboardInterrupt: 

In [None]:
# fine tuning of KPA failed due to gpu memory limitation, looks it takes very long time to fine tune

In [6]:
len(all_texts)

56

In [7]:
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
import torch


model_name = "google/pegasus-xsum"
tokenizer = PegasusTokenizer.from_pretrained(model_name)
device = "cuda" if torch.cuda.is_available() else "cpu"
model = PegasusForConditionalGeneration.from_pretrained(model_name).to(device)
summaries = []
batch_size = 1
for i in range(0, len(all_texts), batch_size):
    batch = tokenizer(all_texts[i:i+batch_size], truncation=True, padding="longest", return_tensors="pt" ).to(device)
    translated = model.generate(**batch)
    summaries.append(translated)


In [10]:
all_preds = []
for s in summaries:
    tgt_text = tokenizer.batch_decode(s, skip_special_tokens=True)
    all_preds.extend(tgt_text)

In [11]:
kpa_test_df['pegasus_sum'] = all_preds

In [17]:
kpa_test_df.iloc[4]['pegasus_sum']

'The vow of celibacy should not be abandoned.'

In [18]:
kpa_test_df.to_csv('./PPLM/kpa_dataset/processed_kpa.csv', index=None)

In [19]:
kpa_test_df.head()

Unnamed: 0,topic,stance,argument,key_point,extracted_kps,pegasus_sum
0,Assisted suicide should be a criminal offence,-1,{'if a patient is suffering with cancer or oth...,"{'Assisted suicide reduces suffering', 'People...",if a patient is suffering with cancer or other...,People have the right to die with dignity and ...
1,Assisted suicide should be a criminal offence,1,{'assisted suicide is killing by another name ...,{'Assisted suicide should not be allowed becau...,assisted suicide is killing by another name ...,What do you think about assisted suicide?
2,Homeschooling should be banned,-1,{'homeschooling removes bullying from a childs...,{'Homeschools can be personalized to the child...,homeschooling removes bullying from a childs l...,Should homeschooled children be allowed to att...
3,Homeschooling should be banned,1,"{'homeschooling is not government regulated ',...",{'Homeschools cannot be regulated standardized...,homeschooling is not government regulated. the...,What is the best way to educate a child at home?
4,The vow of celibacy should be abandoned,-1,{'celibacy before marriage protects young peop...,{'Religious experiences and traditions should ...,celibacy before marriage protects young people...,The vow of celibacy should not be abandoned.


In [9]:
import pandas as pd
terra_df = pd.read_csv('./PPLM/kpa_dataset/processed_terra.csv', index_col=False)

In [10]:
terra_df.head(5)

Unnamed: 0,text,section,extracted_kps
0,While many see the benefits of a price-stable ...,Abstract,While many see the benefits of a price-stable ...
1,The price-volatility of cryptocurrencies is a ...,Introduction,The price-volatility of cryptocurrencies is a ...
2,A stable-coin mechanism must answer three key ...,Multi-fiat peg monetary policy,A stable-coin mechanism must answer three key ...
3,The existential objective of a stable-coin is ...,Defining stability against regional fiat curre...,The existential objective of a stable-coin is ...
4,Since the price of Terra currencies in seconda...,Measuring stability with miner oracles,Since the price of Terra currencies in seconda...


In [11]:
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
import torch


model_name = "google/pegasus-xsum"
tokenizer = PegasusTokenizer.from_pretrained(model_name)
device = "cuda" if torch.cuda.is_available() else "cpu"
model = PegasusForConditionalGeneration.from_pretrained(model_name).to(device)
summaries = []
all_texts = terra_df['text'].tolist()
batch_size = 5
for i in range(0, len(all_texts), batch_size):
    batch = tokenizer(all_texts[i:i+batch_size], truncation=True, padding="longest", return_tensors="pt" ).to(device)
    translated = model.generate(**batch)
    summaries.append(translated)


In [12]:
all_preds = []
for s in summaries:
    tgt_text = tokenizer.batch_decode(s, skip_special_tokens=True)
    all_preds.extend(tgt_text)

In [13]:
all_preds

['The adoption of cryptocurrencies is on the rise.',
 'In this paper, we present the Terra Protocol, an elastic monetary policy for cryptocurrencies.',
 'A stable-coin is an asset with a long-term track record of price stability.',
 'The Terra Protocol aims to create a stable-coin ecosystem.',
 'A price oracle is a key part of the Terra protocol.',
 'The Terra money market is a pegged fiat system in which the price of a Terra currency is pegged to the price of money.',
 'The Terra Protocol is a Proof of Stake (PoS) blockchain, where miners need to stake a native cryptocurrency Luna to mine Terra transactions. The Terra Protocol runs on a Proof of Stake (PoS) blockchain, where miners need to stake a native cryptocurrency Luna to mine Terra transactions.',
 'The Terra protocol aims to provide stable and predictable rewards to miners.',
 'The Terra Platform DApps will offer a stable platform for building financial applications that use Terra as their underlying currency. Terra will offer 

In [14]:
terra_df['pegasus_sum'] = all_preds

In [15]:
terra_df.to_csv('./PPLM/kpa_dataset/processed_terra.csv', index=None)

In [6]:
import pandas as pd
kpa = pd.read_csv('./PPLM/kpa_dataset/processed_kpa.csv', index_col=False)
btc = pd.read_csv('./PPLM/kpa_dataset/processed_btc.csv', index_col=False)
terra = pd.read_csv('./PPLM/kpa_dataset/processed_terra.csv', index_col=False)

In [2]:
kpa.iloc[3]['pegasus_sum']

'What is the best way to educate a child at home?'

In [7]:
terra.iloc[9]['pegasus_sum']

'Terra is a digital currency that is designed to complement both existing fiat and cryptocurrencies as a way to transact and store value.'