**Packages**

In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 

from datasets import load_dataset
import evaluate

#let's make longer output readable without horizontal scrolling
from pprint import pprint

**Necessary Functions**

In [5]:
# Load the ROUGE metric through the `evaluate` library. The resulting object
# is used by the scoring loops below via `rouge.compute(predictions=..., references=...)`.
rouge = evaluate.load('rouge')

**Data**

In [6]:
# Load the BBC news summary dataset by its Hugging Face dataset id.
# Later cells read the 'train' split and the 'File_path', 'Articles' and
# 'Summaries' fields of each record.
dataset = load_dataset("gopalkalpande/bbc-news-summary")

Found cached dataset csv (C:/Users/arisa/.cache/huggingface/datasets/gopalkalpande___csv/gopalkalpande--bbc-news-summary-f610c9f6377bc0fc/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)


  0%|          | 0/1 [00:00<?, ?it/s]

In [8]:
# EDA: number of records in the 'train' split.
len(dataset['train'])

2224

In [59]:
# EDA: look at one raw record (index 1) to see which fields it carries and
# how the headline and body are packed together in the 'Articles' string.
dataset['train'][1]

{'File_path': 'politics',
 'Articles': 'Army chiefs in regiments decision..Military chiefs are expected to meet to make a final decision on the future of Scotland\'s Army regiments...A committee of the Army Board, which is made up of the most senior defence figures, will discuss plans for restructuring regiments on Monday. The proposals include cutting Scotland\'s six single-battalion regiments to five and merging these into a super regiment. The plans have faced stiff opposition from campaigners and politicians alike. The committee\'s decision must be ratified by Defence Secretary Geoff Hoon and Prime Minister Tony Blair. It is expected that it will be made public next week. When ministers announced a reorganisation of the Army it drew a question mark over the futures of the Black Watch, the Kings Own Scottish Borderers, the Royal Scots, the Royal Highland Fusiliers and the Argyll and Sutherland Highlanders. In October, the Council of Scottish Colonels proposed the merger of the Royal

In [7]:
# Unpack every raw record into four parallel lists (one entry per article).
# The 'Articles' field holds the headline and the body in a single string,
# with the first ".." acting as the separator between them.
category = []
title = []
article = []
summary = []

for data in dataset['train']:
    # partition() locates the first ".." once and returns
    # (headline, separator, body); this replaces the separate split() and
    # index() searches on the same string.
    headline, sep, body = data['Articles'].partition('..')
    if not sep:
        # No separator found: treat the whole text as the body with an empty
        # headline instead of raising ValueError, so the lists stay aligned.
        headline, body = '', data['Articles']

    category.append(data['File_path'])
    title.append(headline)
    article.append(body)
    summary.append(data['Summaries'])

In [8]:
# Combine the four parallel lists into a single DataFrame (one row per
# article) and preview the first five rows.
df = pd.DataFrame(
    {
        'category': category,
        'title': title,
        'article': article,
        'summary': summary,
    }
)
df.head()

Unnamed: 0,category,title,article,summary
0,politics,Budget to set scene for election,Gordon Brown will seek to put the economy at t...,- Increase in the stamp duty threshold from £6...
1,politics,Army chiefs in regiments decision,Military chiefs are expected to meet to make a...,"""They are very much not for the good and will ..."
2,politics,Howard denies split over ID cards,Michael Howard has denied his shadow cabinet w...,Michael Howard has denied his shadow cabinet w...
3,politics,Observers to monitor UK election,Ministers will invite international observers ...,The report said individual registration should...
4,politics,Kilroy names election seat target,Ex-chat show host Robert Kilroy-Silk is to con...,"UKIP's leader, Roger Knapman, has said he is g..."


**Baseline**

In [80]:
# Lead-3 baseline: use the first three sentences of each article as the
# candidate summary and score it against the reference summary with ROUGE.
# One score per article is collected for each ROUGE variant.
base_r1, base_r2, base_rL, base_rLs = [], [], [], []

for i in range(len(df)):
    # Turn "..." into ". " so the ". " split below also breaks at those points.
    text = df['article'][i].replace('...', '. ')
    sentences = text.split('. ')

    # First three sentences, re-joined, with a closing period appended.
    candidate = [". ".join(sentences[:3]) + "."]
    ref = [df['summary'][i]]

    results = rouge.compute(predictions=candidate, references=ref)

    # Route each ROUGE variant to its own per-article score list.
    for scores, key in (
        (base_r1, 'rouge1'),
        (base_r2, 'rouge2'),
        (base_rL, 'rougeL'),
        (base_rLs, 'rougeLsum'),
    ):
        scores.append(results[key])

In [82]:
# Report the mean of the per-article baseline scores for each ROUGE variant.
for label, scores in (
    ('rouge1 average: ', base_r1),
    ('rouge2 average: ', base_r2),
    ('rougeL average: ', base_rL),
    ('rougeLs average:', base_rLs),
):
    print(label, np.mean(scores))

rouge1 average:  0.443348604683322
rouge2 average:  0.3602253823635416
rougeL average:  0.345243750111791
rougeLs average: 0.345243750111791


**BERT Extractive Model**

In [None]:
# `Summarizer` is provided by the third-party `summarizer` package; it is
# imported in this cell rather than with the imports at the top of the notebook.
from summarizer import Summarizer

# Built with default arguments; `model` is called on each article text in the
# scoring loop that follows.
model = Summarizer()

In [None]:
# Score the BERT extractive summarizer: summarize each article with `model`
# and compare the result to the reference summary with ROUGE, collecting one
# score per article for each ROUGE variant.
bert_r1 = []
bert_r2 = []
bert_rL = []
bert_rLs = []

for i in range(len(df['article'])):

    # Calling the Summarizer returns the summary text as a plain string, so
    # it is used directly. Indexing that string with ["summary"] raises
    # TypeError (string indices must be integers).
    candidate = model(df['article'][i])
    candidate = [candidate]
    #pprint(candidate[0], compact=True)

    ref = [df['summary'][i]]

    results = rouge.compute(predictions=candidate,
                            references=ref)

    bert_r1.append(results['rouge1'])
    bert_r2.append(results['rouge2'])
    bert_rL.append(results['rougeL'])
    bert_rLs.append(results['rougeLsum'])

In [None]:
# Report the mean of the per-article BERT scores for each ROUGE variant.
# The label and the value are separate print() arguments; writing them as
# 'label': value inside the call is a SyntaxError.
print('rouge1 average: ', np.mean(bert_r1))
print('rouge2 average: ', np.mean(bert_r2))
print('rougeL average: ', np.mean(bert_rL))
print('rougeLs average:', np.mean(bert_rLs))

**T5**

In [1]:
# TensorFlow T5 model and its tokenizer, both loaded from the "t5-base"
# checkpoint. These are used by the T5 scoring loop below.
# NOTE(review): this import lives in a mid-notebook cell rather than with the
# imports at the top — confirm the notebook still runs top-to-bottom on a fresh kernel.
from transformers import T5Tokenizer, TFT5ForConditionalGeneration

t5model = TFT5ForConditionalGeneration.from_pretrained("t5-base")
t5tokenizer = T5Tokenizer.from_pretrained("t5-base")

All model checkpoint layers were used when initializing TFT5ForConditionalGeneration.

All the layers of TFT5ForConditionalGeneration were initialized from the model checkpoint at t5-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


In [None]:
# Score T5: generate a summary for each article and compare it to the
# reference summary with ROUGE, collecting one score per article for each
# ROUGE variant.
t5_r1, t5_r2, t5_rL, t5_rLs = [], [], [], []

for i in range(len(df)):
    # The article text is prefixed with "summarize: " before tokenization.
    prompt = 'summarize: ' + df['article'][i]

    # Tokenize as TensorFlow tensors, truncating inputs longer than 1024 tokens.
    encoded = t5tokenizer(
        prompt,
        max_length=1024,
        truncation=True,
        return_tensors="tf",
    )

    # ADD HYPER PARAMETERS HERE
    # NOTE(review): no generation arguments are passed, so generate() runs
    # with its defaults — confirm the default output length is adequate.
    summary_ids = t5model.generate(encoded["input_ids"])

    candidate = t5tokenizer.batch_decode(
        summary_ids,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )
    #pprint(candidate[0], compact=True)

    ref = [df['summary'][i]]
    results = rouge.compute(predictions=candidate, references=ref)

    # Route each ROUGE variant to its own per-article score list.
    for scores, key in (
        (t5_r1, 'rouge1'),
        (t5_r2, 'rouge2'),
        (t5_rL, 'rougeL'),
        (t5_rLs, 'rougeLsum'),
    ):
        scores.append(results[key])

In [None]:
# Report the mean of the per-article T5 scores for each ROUGE variant.
# The label and the value are separate print() arguments; writing them as
# 'label': value inside the call is a SyntaxError.
print('rouge1 average: ', np.mean(t5_r1))
print('rouge2 average: ', np.mean(t5_r2))
print('rougeL average: ', np.mean(t5_rL))
print('rougeLs average:', np.mean(t5_rLs))

**Pegasus**

In [None]:
# TensorFlow Pegasus model and its tokenizer, both loaded from the
# "google/pegasus-cnn_dailymail" checkpoint. `from_pt=True` is passed to both calls.
# NOTE(review): `from_pt` looks like a model-weights option — confirm it has
# any effect when passed to the tokenizer.
from transformers import PegasusTokenizer, TFPegasusForConditionalGeneration

cnnmodel = TFPegasusForConditionalGeneration.from_pretrained("google/pegasus-cnn_dailymail", from_pt=True)
cnntokenizer = PegasusTokenizer.from_pretrained("google/pegasus-cnn_dailymail", from_pt=True)

In [None]:
# Score Pegasus: generate a summary for each article and compare it to the
# reference summary with ROUGE, collecting one score per article for each
# ROUGE variant.
p_r1, p_r2, p_rL, p_rLs = [], [], [], []

for i in range(len(df)):
    # Tokenize the raw article text as TensorFlow tensors, truncating inputs
    # longer than 1024 tokens.
    cnninputs = cnntokenizer(
        df['article'][i],
        max_length=1024,
        truncation=True,
        return_tensors="tf",
    )

    # ADD HYPER PARAMETERS HERE
    # NOTE(review): no generation arguments are passed, so generate() runs
    # with its defaults — confirm that is the intended setup.
    summary_ids = cnnmodel.generate(cnninputs["input_ids"])

    candidate = cnntokenizer.batch_decode(
        summary_ids,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )
    #pprint(candidate[0], compact=True)

    ref = [df['summary'][i]]
    results = rouge.compute(predictions=candidate, references=ref)

    # Route each ROUGE variant to its own per-article score list.
    for scores, key in (
        (p_r1, 'rouge1'),
        (p_r2, 'rouge2'),
        (p_rL, 'rougeL'),
        (p_rLs, 'rougeLsum'),
    ):
        scores.append(results[key])

**BART**