### Text summarization using PEGASUS
This notebook introduces a text summarization technique using PEGASUS, a powerful model for abstractive summarization. During PEGASUS pre-training, important sentences are removed/masked from an input text (similar in spirit to the token masking used in BERT) and are then generated together as one output sequence from the remaining sentences, similar to an extractive summary.

ArXiv: https://arxiv.org/abs/1912.08777

![](https://ai2-s2-public.s3.amazonaws.com/figures/2017-08-08/f4061bd225b3be5b3f5b18eb1a229ce991efefeb/2-Figure1-1.png)

In [1]:
!pip install pyarrow>=6.1.0
!pip install transformers
!pip install sentencepiece

In [2]:
import os
# Set WANDB_DISABLED for this process before the training code below runs.
# NOTE(review): presumably this stops the Trainer from logging to Weights & Biases —
# confirm against the installed transformers version.
os.environ["WANDB_DISABLED"] = "true"

In [3]:
import pandas as pd


In [4]:
# Load the training CSV into the notebook-wide DataFrame `df` (used by later cells)
# and preview its first rows.
df=pd.read_csv("../input/snerhack/train.csv")
df.head()


In [5]:
# Pick the row with index label 15 of the "Abstract" column as the sample text `tx`;
# `tx` is the input summarized by the inference cell at the end of the notebook.
n=15
tx=df["Abstract"][n]

In [6]:
# Display the selected abstract.
tx

In [7]:
# from transformers import PegasusForConditionalGeneration, AutoTokenizer
# import torch

# # You can choose a model from the following list
# # https://huggingface.co/models?sort=downloads&search=google%2Fpegasus
# model_name = 'google/pegasus-cnn_dailymail'
# device = 'cuda' if torch.cuda.is_available() else 'cpu'
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = PegasusForConditionalGeneration.from_pretrained(model_name).to(device)
# batch = tokenizer(tx, truncation=True, padding='longest', return_tensors="pt").to(device)
# translated = model.generate(**batch)
# tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True)

# tgt_text

In [8]:
# from rouge_score import rouge_scorer

# scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
# scores = scorer.score(df["RHS"][n],
#                       tgt_text[0])

In [9]:
# scores

In [10]:
"""Script for fine-tuning Pegasus
Example usage:
  # use XSum dataset as example, with first 1000 docs as training data
  from datasets import load_dataset
  dataset = load_dataset("xsum")
  train_texts, train_labels = dataset['train']['document'][:1000], dataset['train']['summary'][:1000]
  
  # use Pegasus Large model as base for fine-tuning
  model_name = 'google/pegasus-large'
  train_dataset, _, _, tokenizer = prepare_data(model_name, train_texts, train_labels)
  trainer = prepare_fine_tuning(model_name, tokenizer, train_dataset)
  trainer.train()
 
Reference:
  https://huggingface.co/transformers/master/custom_datasets.html
"""

from transformers import PegasusForConditionalGeneration, PegasusTokenizer, Trainer, TrainingArguments
import torch


class PegasusDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenized inputs with tokenized targets.

    Args:
        encodings: Mapping from field name to a per-example sequence of values;
            every field is returned as a tensor for the requested example.
        labels: Mapping whose ``'input_ids'`` entry holds the per-example
            target token ids.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of examples is taken from the target side.
        target_ids = self.labels['input_ids']
        return len(target_ids)

    def __getitem__(self, idx):
        """Return a dict of tensors for example ``idx``, with targets under ``'labels'``."""
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        # Assigned last, so it overrides any 'labels' field present in the encodings.
        sample['labels'] = torch.tensor(self.labels['input_ids'][idx])
        return sample

      
def prepare_data(model_name,
                 train_texts, train_labels,
                 val_texts=None, val_labels=None,
                 test_texts=None, test_labels=None,
                 *, ignore_pad_token_for_loss=True):
    """
    Prepare input data for model fine-tuning

    Loads the Pegasus tokenizer for ``model_name`` and tokenizes each
    (texts, labels) pair into a ``PegasusDataset``.

    Args:
        model_name: Name or path passed to ``PegasusTokenizer.from_pretrained``.
        train_texts, train_labels: Source texts and target texts for training.
        val_texts, val_labels: Optional validation pair; the validation
            dataset is ``None`` unless both are given.
        test_texts, test_labels: Optional test pair; the test dataset is
            ``None`` unless both are given.
        ignore_pad_token_for_loss: When true (default), padding positions in
            the target ids are replaced with -100 so that the model's loss
            skips them. Pass ``False`` to keep the raw pad token ids.

    Returns:
        Tuple ``(train_dataset, val_dataset, test_dataset, tokenizer)``.
    """
    tokenizer = PegasusTokenizer.from_pretrained(model_name)
    pad_token_id = tokenizer.pad_token_id

    def tokenize_data(texts, labels):
        # A split is only built when both its texts and its labels are provided.
        if texts is None or labels is None:
            return None
        encodings = tokenizer(texts, truncation=True, padding=True)
        decodings = tokenizer(labels, truncation=True, padding=True)
        if ignore_pad_token_for_loss:
            # Targets are padded to a common length; without masking, the loss
            # would also be computed on the padding positions.
            decodings['input_ids'] = [
                [-100 if token_id == pad_token_id else token_id for token_id in sequence]
                for sequence in decodings['input_ids']
            ]
        return PegasusDataset(encodings, decodings)

    train_dataset = tokenize_data(train_texts, train_labels)
    val_dataset = tokenize_data(val_texts, val_labels)
    test_dataset = tokenize_data(test_texts, test_labels)

    return train_dataset, val_dataset, test_dataset, tokenizer

In [11]:
def prepare_fine_tuning(model_name, tokenizer, train_dataset, val_dataset=None, freeze_encoder=False, output_dir='./results'):
    """
    Prepare configurations and base model for fine-tuning

    Loads ``model_name`` as a ``PegasusForConditionalGeneration`` model, moves
    it to CUDA when available (CPU otherwise), optionally freezes the encoder
    parameters, and returns a ``Trainer`` configured for one training epoch.
    Step-based evaluation settings and the evaluation dataset are only
    supplied when ``val_dataset`` is given.

    Args:
        model_name: Name or path passed to ``from_pretrained``.
        tokenizer: Tokenizer handed to the ``Trainer``.
        train_dataset: Dataset used for training.
        val_dataset: Optional dataset used for evaluation during training.
        freeze_encoder: When true, ``requires_grad`` is set to ``False`` on
            every encoder parameter.
        output_dir: Directory passed to ``TrainingArguments`` as ``output_dir``.

    Returns:
        The configured ``Trainer`` (training is not started here).
    """
    torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model = PegasusForConditionalGeneration.from_pretrained(model_name).to(torch_device)

    if freeze_encoder:
        for weight in model.model.encoder.parameters():
            weight.requires_grad = False

    # Settings shared by the with-validation and without-validation setups.
    arg_settings = dict(
        output_dir=output_dir,           # output directory
        num_train_epochs=1,              # total number of training epochs
        per_device_train_batch_size=2,   # batch size per device during training
        save_steps=100,                  # number of update steps between checkpoint saves
        save_total_limit=2,              # keep at most this many checkpoints
        warmup_steps=200,                # warmup steps for the learning rate scheduler
        weight_decay=0.01,               # strength of weight decay
        logging_dir='./logs',            # directory for storing logs
        logging_steps=10,
    )
    trainer_settings = dict(
        model=model,
        train_dataset=train_dataset,
        tokenizer=tokenizer,
    )

    if val_dataset is not None:
        # Evaluation-specific options are added only when there is data to evaluate on.
        arg_settings.update(
            per_device_eval_batch_size=2,    # batch size for evaluation
            evaluation_strategy='steps',     # evaluate every `eval_steps` update steps
            eval_steps=100,
        )
        trainer_settings['eval_dataset'] = val_dataset

    training_args = TrainingArguments(**arg_settings)
    return Trainer(args=training_args, **trainer_settings)


if __name__=='__main__':
    # Use the first 1000 rows of the DataFrame `df` loaded earlier in the notebook:
    # the 'Abstract' column supplies the input texts and 'RHS' the target texts.
    train_texts, train_labels = list(df['Abstract'][:1000].values), list(df['RHS'][:1000].values)

    # Use the Pegasus Large model as the base for fine-tuning; no validation or
    # test split is prepared here, so only the training dataset is kept.
    model_name = 'google/pegasus-large'
    train_dataset, _, _, tokenizer = prepare_data(model_name, train_texts, train_labels)
    trainer = prepare_fine_tuning(model_name, tokenizer, train_dataset)
    trainer.train()

In [13]:
from transformers import PegasusForConditionalGeneration, PegasusTokenizer, Trainer, TrainingArguments
import torch

# Summarize the sample text `tx` with a model and tokenizer loaded from a local checkpoint directory.
# You can also choose a pretrained model from the following list
# https://huggingface.co/models?sort=downloads&search=google%2Fpegasus
# NOTE(review): './results/checkpoint-500' assumes the fine-tuning run above wrote a
# checkpoint at step 500 into './results' — confirm the directory exists before running.
model_name = './results/checkpoint-500'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name).to(device)
# Tokenize the single sample into PyTorch tensors on the same device as the model.
batch = tokenizer(tx, truncation=True, padding='longest', return_tensors="pt").to(device)
# Generate output token ids, then decode them to text without special tokens.
translated = model.generate(**batch)
tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True)

# Display the generated summary text(s).
tgt_text