# DL implementation of abstractive summarization by finetuning PEGASUS model

## Required libraries are imported or installed

In [None]:
!pip install sentencepiece
!pip install transformers
!pip install datasets
!pip install rouge_score

Collecting rouge_score
  Downloading https://files.pythonhosted.org/packages/1f/56/a81022436c08b9405a5247b71635394d44fe7e1dbedc4b28c740e09c2840/rouge_score-0.0.4-py2.py3-none-any.whl
Installing collected packages: rouge-score
Successfully installed rouge-score-0.0.4


In [None]:
from transformers import PegasusForConditionalGeneration, PegasusTokenizer, Trainer, TrainingArguments
from datasets import load_metric
from rouge_score import rouge_scorer
from tqdm import tqdm
import torch
import pandas as pd
import numpy as np

## Required functions

In [None]:
class PegasusDataset(torch.utils.data.Dataset):
    """Torch dataset pairing tokenized source texts with tokenized target summaries.

    Args:
        encodings: Mapping of field name (e.g. ``'input_ids'``) to a per-example
            indexable container (tensor or nested list) for the source texts.
        labels: Mapping for the target texts; only its ``'input_ids'`` entry is read.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    @staticmethod
    def _as_tensor(value):
        """Return ``value`` as a tensor, copying an existing tensor explicitly."""
        if isinstance(value, torch.Tensor):
            # torch.tensor(<tensor>) emits a copy-construct UserWarning.
            return value.clone().detach()
        return torch.tensor(value)

    def __getitem__(self, idx):
        """Return one example: every encoding field plus ``'labels'`` (target ids)."""
        item = {key: self._as_tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self._as_tensor(self.labels['input_ids'][idx])
        return item

    def __len__(self):
        # len(self.labels) would count the mapping's keys, not the examples,
        # so measure the number of label sequences instead.
        return len(self.labels['input_ids'])

In [None]:
def tokenize_data(texts, labels):
  """Tokenize source texts and target summaries and wrap them in a PegasusDataset.

  Both inputs go through the module-level `tokenizer` with truncation,
  padding and PyTorch tensor output enabled.
  """
  tokenizer_kwargs = dict(truncation=True, padding=True, return_tensors="pt")
  source_batch = tokenizer(texts, **tokenizer_kwargs)
  target_batch = tokenizer(labels, **tokenizer_kwargs)
  return PegasusDataset(source_batch, target_batch)

In [None]:
def prepare_data(model_name, train_texts, train_labels, 
                 val_texts, val_labels, test_texts, test_labels):
  """Tokenize the train, validation and test splits.

  `model_name` is accepted but not used by this function.

  Returns:
    The tuple (train_dataset, val_dataset, test_dataset), each produced by
    `tokenize_data` from the matching texts/labels pair.
  """
  splits = (
      (train_texts, train_labels),
      (val_texts, val_labels),
      (test_texts, test_labels),
  )
  train_dataset, val_dataset, test_dataset = (
      tokenize_data(texts, labels) for texts, labels in splits
  )
  return train_dataset, val_dataset, test_dataset

In [None]:
def prepare_fine_tuning(model, train_dataset, val_dataset, freeze_encoder=False, output_dir='./results'):
  """Build a `Trainer` configured to fine-tune `model`.

  Args:
    model: Model to fine-tune; when `freeze_encoder` is set, the parameters
      under `model.model.encoder` are frozen.
    train_dataset: Dataset used for training.
    val_dataset: Dataset evaluated every `eval_steps` steps, or None to
      train without periodic evaluation.
    freeze_encoder: If True, disable gradients for the encoder parameters.
    output_dir: Directory passed to `TrainingArguments` as `output_dir`.

  Returns:
    The configured `Trainer`; training is not started here.
  """
  if freeze_encoder:
    for param in model.model.encoder.parameters():
      param.requires_grad = False

  # Step-wise evaluation needs an eval dataset; turn it off when none is given.
  eval_strategy = 'steps' if val_dataset is not None else 'no'

  training_args = TrainingArguments(
    output_dir=output_dir,
    do_train=True,
    num_train_epochs=2,
    per_device_train_batch_size=1,
    save_steps=500,
    save_total_limit=5,
    evaluation_strategy=eval_strategy,
    eval_steps=100,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
  )

  trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
  )

  return trainer

In [None]:
def compute_rouge(txt,ref):
  """Score predictions against references with ROUGE-1, ROUGE-2 and ROUGE-L.

  Uses the module-level `rouge` metric, computed once for all three types
  rather than once per type.

  Args:
    txt: List of predicted summaries.
    ref: List of reference summaries, aligned with `txt`.

  Returns:
    Tuple (rouge1, rouge2, rougeL), each the `.high` entry of the result.
  """
  # NOTE(review): `.high` is presumably the upper bound of the aggregated
  # score; confirm whether `.mid` is intended when several pairs are passed.
  scores = rouge.compute(predictions=txt, references=ref,
                         rouge_types=["rouge1", "rouge2", "rougeL"])
  return scores["rouge1"].high, scores["rouge2"].high, scores["rougeL"].high

## Data modification

### Data cleaning and preparing

In [None]:
!gdown https://drive.google.com/uc?id=1kY6AEpZHEbqqFh_te_ITybsE4R9xEOa9

Downloading...
From: https://drive.google.com/uc?id=1kY6AEpZHEbqqFh_te_ITybsE4R9xEOa9
To: /content/archive.zip
20.7MB [00:00, 42.2MB/s]


In [None]:
!unzip '/content/archive.zip'
!rm /content/archive.zip

Archive:  /content/archive.zip
  inflating: news_summary.csv        
  inflating: news_summary_more.csv   


In [None]:
# Load the news-summary table; the CSV is read with the ISO-8859-1 encoding.
df = pd.read_csv(filepath_or_buffer='news_summary.csv', encoding='ISO-8859-1')

In [None]:
print(df.shape)

(4514, 6)


In [None]:
# Keep only rows that have a full article text in "ctext".
df = df.dropna(subset=["ctext"])

In [None]:
df.head()

Unnamed: 0,author,date,headlines,read_more,text,ctext
0,Chhavi Tyagi,"03 Aug 2017,Thursday",Daman & Diu revokes mandatory Rakshabandhan in...,http://www.hindustantimes.com/india-news/raksh...,The Administration of Union Territory Daman an...,The Daman and Diu administration on Wednesday ...
1,Daisy Mowke,"03 Aug 2017,Thursday",Malaika slams user who trolled her for 'divorc...,http://www.hindustantimes.com/bollywood/malaik...,Malaika Arora slammed an Instagram user who tr...,"From her special numbers to TV?appearances, Bo..."
2,Arshiya Chopra,"03 Aug 2017,Thursday",'Virgin' now corrected to 'Unmarried' in IGIMS...,http://www.hindustantimes.com/patna/bihar-igim...,The Indira Gandhi Institute of Medical Science...,The Indira Gandhi Institute of Medical Science...
3,Sumedha Sehra,"03 Aug 2017,Thursday",Aaj aapne pakad liya: LeT man Dujana before be...,http://indiatoday.intoday.in/story/abu-dujana-...,Lashkar-e-Taiba's Kashmir commander Abu Dujana...,Lashkar-e-Taiba's Kashmir commander Abu Dujana...
4,Aarushi Maheshwari,"03 Aug 2017,Thursday",Hotel staff to get training to spot signs of s...,http://indiatoday.intoday.in/story/sex-traffic...,Hotels in Maharashtra will train their staff t...,Hotels in Mumbai and other Indian cities are t...


In [None]:
# Renumber rows 0..len(df)-1 after the dropna so positions and labels agree.
df = df.reset_index(drop=True)

In [None]:
print(df.shape)

(4396, 6)


### Train-validation-Test data splitting

In [None]:
# Fixed, order-based split: rows 0-3499 train, 3500-3999 validation, rest test.
# "ctext" is the model input and "text" the target summary.
train_rows, val_rows, test_rows = slice(None, 3500), slice(3500, 4000), slice(4000, None)
train_texts, train_labels = df["ctext"][train_rows], df["text"][train_rows]
val_texts, val_labels = df["ctext"][val_rows], df["text"][val_rows]
test_texts, test_labels = df["ctext"][test_rows], df["text"][test_rows]

In [None]:
# Convert every split to a plain Python list before tokenization.
train_texts, train_labels = list(train_texts), list(train_labels)
val_texts, val_labels = list(val_texts), list(val_labels)
test_texts, test_labels = list(test_texts), list(test_labels)

## Preparing the data according to PEGASUS model requirements

In [None]:
# Use the GPU when one is available.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Load the tokenizer first: prepare_data -> tokenize_data reads the global `tokenizer`.
model_name = 'google/pegasus-large'
tokenizer = PegasusTokenizer.from_pretrained(model_name)
train_dataset, val_dataset, test_dataset = prepare_data(
    model_name=model_name,
    train_texts=train_texts,
    train_labels=train_labels,
    val_texts=val_texts,
    val_labels=val_labels,
    test_texts=test_texts,
    test_labels=test_labels,
)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1912529.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=65.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=88.0, style=ProgressStyle(description_w…




## Finetune model and train

In [None]:
# Load the pretrained checkpoint onto the selected device and build the Trainer.
model = PegasusForConditionalGeneration.from_pretrained(model_name).to(device)
trainer = prepare_fine_tuning(model, train_dataset, val_dataset)
# Run the fine-tuning itself; without this call every later evaluation
# would score the unmodified pretrained checkpoint.
trainer.train()

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=3093.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=2275327883.0, style=ProgressStyle(descr…




## Testing the finetuned model on test dataset 

In [None]:
# Report the Trainer's evaluation metrics on the held-out test split.
trainer.evaluate(eval_dataset=test_dataset)

  
  import sys


{'eval_loss': 4.043250560760498,
 'eval_mem_cpu_alloc_delta': 10063872,
 'eval_mem_cpu_peaked_delta': 0,
 'eval_mem_gpu_alloc_delta': 0,
 'eval_mem_gpu_peaked_delta': 454349824,
 'eval_runtime': 1.1716,
 'eval_samples_per_second': 1.707,
 'init_mem_cpu_alloc_delta': 28672,
 'init_mem_cpu_peaked_delta': 0,
 'init_mem_gpu_alloc_delta': 0,
 'init_mem_gpu_peaked_delta': 0}

In [None]:
# Generate a summary for every test article and collect its ROUGE scores.
rouge = load_metric("rouge")
Rouge_1, Rouge_2, Rouge_L = [], [], []
for i in tqdm(range(len(test_texts))):
  # Reshape the i-th pre-tokenized example into a batch of one on the device.
  gen_inputs = {
      field: test_dataset.encodings[field][i].view(1, -1).to(device)
      for field in ('input_ids', 'attention_mask')
  }
  generated_ids = trainer.model.generate(**gen_inputs)
  txt = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
  ref = [test_labels[i]]
  # compute_rouge returns (rouge1, rouge2, rougeL) in that order.
  for bucket, score in zip((Rouge_1, Rouge_2, Rouge_L), compute_rouge(txt, ref)):
    bucket.append(score)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=2170.0, style=ProgressStyle(description…

  0%|          | 0/396 [00:00<?, ?it/s]




100%|██████████| 396/396 [32:13<00:00,  4.88s/it]


In [None]:
def _sum_scores(scores):
  """Sum the fields at positions 0, 1, 2 (precision, recall, F-measure) over `scores`."""
  p_total = r_total = f_total = 0
  for score in scores:
    p_total += score[0]
    r_total += score[1]
    f_total += score[2]
  return p_total, r_total, f_total


def _mean_percent(total, count):
  """Return the mean as a percentage, or NaN when nothing was scored."""
  return total / count * 100 if count else float('nan')


# Totals per ROUGE type, then averages computed once after the summation
# (defined even for empty score lists, so the report cell never hits a NameError).
k1_p, k1_r, k1_f = _sum_scores(Rouge_1)
k2_p, k2_r, k2_f = _sum_scores(Rouge_2)
kl_p, kl_r, kl_f = _sum_scores(Rouge_L)

R1_p, R1_r, R1_f = (_mean_percent(t, len(Rouge_1)) for t in (k1_p, k1_r, k1_f))
R2_p, R2_r, R2_f = (_mean_percent(t, len(Rouge_2)) for t in (k2_p, k2_r, k2_f))
RL_p, RL_r, RL_f = (_mean_percent(t, len(Rouge_L)) for t in (kl_p, kl_r, kl_f))

In [None]:
# Print the averaged precision / recall / F1 percentages per ROUGE variant.
report_rows = (
    (".........Rouge1.........", (R1_p, R1_r, R1_f)),
    (".........Rouge2.........", (R2_p, R2_r, R2_f)),
    (".........RougeL.........", (RL_p, RL_r, RL_f)),
)
for header, values in report_rows:
  print(header)
  print("Precision: {:.2f}, Recall: {:.2f}, F1 measure: {:.2f}".format(*values))
  print("-"*50)

.........Rouge1.........
Precision: 37.09, Recall: 41.43, F1 measure: 35.73
--------------------------------------------------
.........Rouge2.........
Precision: 17.01, Recall: 19.09, F1 measure: 16.21
--------------------------------------------------
.........RougeL.........
Precision: 25.28, Recall: 27.78, F1 measure: 24.03
--------------------------------------------------


## Demonstration on four example summaries

### Example 1

In [None]:
# Summarize test article 0 and score the result against its reference (ROUGE-1).
src, ref = [test_texts[0]], [test_labels[0]]
batch = tokenizer(src, truncation=True, return_tensors="pt").to(device)
summarized = trainer.model.generate(**batch)
txt = tokenizer.batch_decode(summarized, skip_special_tokens=True)
rouge1_result = rouge.compute(predictions=txt, references=ref, rouge_types=["rouge1"])
rouge_score = rouge1_result["rouge1"].high

In [None]:
divider = "-"*50
print(ref)  # reference (original) summary
print(divider)
print(txt)  # summary generated by the model
print(divider)
print(f"Rouge1 score {rouge_score}")

['Actress-turned-author Twinkle Khanna, while speaking at an event, said that sex is important at every stage of life. "The things I found extremely attractive in Akshay have changed over time," she added. Twinkle and Akshay, who got married in 2001, completed 16 years of marriage in January this year. They have a 14-year-old son Aarav and a 4-year-old daughter Nitara.']
--------------------------------------------------
["From tracing her story as a little girl who was asked to smile a lot to be liked by everyone to her first kiss resulting in a lot of Maths homework to the funny hastags running on social media by men asking for equality--Swara Bhaskar nails the society's hypocrisy towards women.She then speaks as Sexism to the woman, about impure grapes being sour."]
--------------------------------------------------
Rouge1 score Score(precision=0.10606060606060606, recall=0.10606060606060606, fmeasure=0.10606060606060608)


### Example 2

In [None]:
# Summarize test article 120 and score the result against its reference (ROUGE-1).
src, ref = [test_texts[120]], [test_labels[120]]
batch = tokenizer(src, truncation=True, return_tensors="pt").to(device)
summarized = trainer.model.generate(**batch)
txt = tokenizer.batch_decode(summarized, skip_special_tokens=True)
rouge1_result = rouge.compute(predictions=txt, references=ref, rouge_types=["rouge1"])
rouge_score = rouge1_result["rouge1"].high

In [None]:
divider = "-"*50
print(ref)  # reference (original) summary
print(divider)
print(txt)  # summary generated by the model
print(divider)
print(f"Rouge1 score {rouge_score}")

['Singer Shreya Ghoshal is set to get a wax figure at the Madame Tussauds in New Delhi. Ghoshal said, "it is an honour to be featured among such talented stars, artists, historians and renowned celebrities." The wax museum, which will open later this year, will also feature wax figures of Bollywood actors Amitabh Bachchan and Shah Rukh Khan. ']
--------------------------------------------------
['With Madame Tussauds all set to come to Delhi, one of the unexpected names of celeb statues that has surfaced is that of singer Shreya Ghoshal.The statue will be created in a distinctive singing pose, and will be open to the public when the museum opens at Regal Palace, in the heart of Delhi later this year.']
--------------------------------------------------
Rouge1 score Score(precision=0.43103448275862066, recall=0.43103448275862066, fmeasure=0.43103448275862066)


### Example 3

In [None]:
# Summarize the 20th-from-last test article and score it against its reference (ROUGE-1).
src, ref = [test_texts[-20]], [test_labels[-20]]
batch = tokenizer(src, truncation=True, return_tensors="pt").to(device)
summarized = trainer.model.generate(**batch)
txt = tokenizer.batch_decode(summarized, skip_special_tokens=True)
rouge1_result = rouge.compute(predictions=txt, references=ref, rouge_types=["rouge1"])
rouge_score = rouge1_result["rouge1"].high

In [None]:
divider = "-"*50
print(ref)  # reference (original) summary
print(divider)
print(txt)  # summary generated by the model
print(divider)
print(f"Rouge1 score {rouge_score}")

['A 26-year-old who has been sitting on an indefinite hunger strike for nearly 10 days for Special Backward Classes quota got married at the protest site in Rajasthan. Devraj Gujjar continued with his "fast unto death" after the rituals while his wife left with her in-laws. Meanwhile, his wife said she would join him if the demands were not met.']
--------------------------------------------------
['Jaipur, Feb 24 (PTI) The site of an indefinite hunger strike for Special Backward Class quota turned into the marriage venue for 26-year-old Devraj Gujjar, who tied the nuptial knot here, as he chose not to leave the protest.']
--------------------------------------------------
Rouge1 score Score(precision=0.5121951219512195, recall=0.3333333333333333, fmeasure=0.40384615384615385)


### Example 4

In [None]:
# Summarize test article 300 and score the result against its reference (ROUGE-1).
src, ref = [test_texts[300]], [test_labels[300]]
batch = tokenizer(src, truncation=True, return_tensors="pt").to(device)
summarized = trainer.model.generate(**batch)
txt = tokenizer.batch_decode(summarized, skip_special_tokens=True)
rouge1_result = rouge.compute(predictions=txt, references=ref, rouge_types=["rouge1"])
rouge_score = rouge1_result["rouge1"].high

In [None]:
divider = "-"*50
print(ref)  # reference (original) summary
print(divider)
print(txt)  # summary generated by the model
print(divider)
print(f"Rouge1 score {rouge_score}")

["The Delhi Metro is planning to play instrumental music in its stations on the New Delhi-Dwarka Airport Line. An official said the decision was taken after a public survey revealed that 80% people wanted light music in stations. The Metro, which has applied for the required license, will introduce music on other stations and inside trains based on users' feedback. "]
--------------------------------------------------
['Delhiites may soon look forward to a soothing Metro commute during rush hours.The Delhi Metro Rail Corporation has decided to start playing music in stations on the New Delhi-Dwarka airport line and plans to extend it inside trains and across its network gradually, depending on user feedback.The decision to roll out instrumental music on the airport line? Depending on feedback, the music may soon be extended inside trains.?These two companies give license if a public transport facility wants to play music.']
--------------------------------------------------
Rouge1 scor

# BONUS part

## 5-Fold manual assessment

In [None]:
# Import required libraries
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import sklearn

# Import necessary modules
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import LeavePOut
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import StratifiedKFold

In [None]:
# Build a 5-fold splitter for cross-validation.
kfold = model_selection.KFold(n_splits=5)
# Use the model currently held by the Trainer as the object to be scored.
model_kfold = trainer.model
# NOTE(review): cross_val_score presumably expects a scikit-learn style estimator
# (fit/score), whereas trainer.model is the PEGASUS torch module and the inputs are
# raw text lists, so this call likely raises instead of scoring. No output is
# recorded for this cell. Confirm, and replace with a fold-wise ROUGE evaluation if so.
results_kfold = model_selection.cross_val_score(model_kfold, train_texts, train_labels, cv=kfold)
# The label says "Accuracy", but which metric is reported depends on the estimator's default scorer.
print("Accuracy: %.2f%%" % (results_kfold.mean()*100.0)) 