**Packages**

In [26]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import math

from datasets import load_dataset
import evaluate

#let's make longer output readable without horizontal scrolling
from pprint import pprint

import warnings

import regex as re

**Necessary Functions**

In [2]:
# Load the ROUGE metric through the `evaluate` library; the model cells below
# call rouge.compute(...) to score each generated summary against its reference.
rouge = evaluate.load('rouge')

In [None]:
# Load the chrF metric through the `evaluate` library.
# NOTE(review): not referenced by any of the cells visible here — confirm it is used later.
chrf = evaluate.load("chrf")

In [3]:
import inspect

def get_default_args(func):
    """Return a dict of ``{parameter name: default value}`` for a callable.

    Only parameters that declare a default are included; the dict keeps the
    order in which the parameters appear in the signature.
    """
    defaults = {}
    for name, param in inspect.signature(func).parameters.items():
        # Parameters without a default carry the `Parameter.empty` sentinel.
        if param.default is inspect.Parameter.empty:
            continue
        defaults[name] = param.default
    return defaults

**Data**

In [4]:
# Load the "english" configuration of the csebuetnlp/xlsum dataset
# (the recorded output shows it being served from the local Hugging Face cache).
dataset = load_dataset("csebuetnlp/xlsum", "english")

Found cached dataset xlsum (/home/ubuntu/.cache/huggingface/datasets/csebuetnlp___xlsum/english/2.0.0/518ab0af76048660bcc2240ca6e8692a977c80e384ffb18fdddebaca6daebdce)
100%|█████████████████████████████████████████████| 3/3 [00:00<00:00,  4.39it/s]


In [7]:
# Draw a fixed, reproducible sample of 2000 row positions from the train split.
n_train_rows = len(dataset['train'])
index = pd.DataFrame({"index": list(range(n_train_rows))})
sample_index = index.sample(n=2000, replace=False, random_state=1004)
sample_index[:5]

Unnamed: 0,index
235420,235420
172024,172024
253546,253546
224954,224954
214134,214134


In [8]:
# Column buffers for the sampled articles; one entry is appended per sampled row.
# NOTE(review): `id` shadows the Python builtin of the same name. The name is kept
# because the cell that builds the DataFrame refers to these lists by name.
id = []
url = []
title = []
article = []
article_num_sentences = []
article_num_characters = []
article_num_words = []
summary = []
summary_num_sentences = []
summary_num_characters = []
summary_num_words = []

train_split = dataset["train"]

for i in sample_index["index"]:
    # Fetch the row once; the original re-indexed the dataset for every field.
    row = train_split[i]
    row_summary = row['summary']
    row_text = row['text']

    id.append(row['id'])
    url.append(row['url'])
    title.append(row['title'])

    # Length statistics are rough: they split on "." and " " only, so a text
    # ending in "." contributes one extra (empty) "sentence".
    summary.append(row_summary)
    summary_num_sentences.append(len(row_summary.split(".")))
    summary_num_words.append(len(row_summary.split(" ")))
    summary_num_characters.append(len(row_summary))

    article.append(row_text)
    article_num_sentences.append(len(row_text.split(".")))
    article_num_characters.append(len(row_text))
    article_num_words.append(len(row_text.split(" ")))
    

In [9]:
# Assemble the per-article lists into one table; the key order below is the
# column order of the resulting DataFrame.
d = {
    'id': id,
    'url': url,
    "title": title,
    'article': article,
    "article_num_sentences": article_num_sentences,
    "article_num_words": article_num_words,
    "article_num_characters": article_num_characters,
    'summary': summary,
    "summary_num_sentences": summary_num_sentences,
    "summary_num_words": summary_num_words,
    "summary_num_characters": summary_num_characters,
}
df = pd.DataFrame(d)
df.head()

Unnamed: 0,id,url,title,article,article_num_sentences,article_num_words,article_num_characters,summary,summary_num_sentences,summary_num_words,summary_num_characters
0,uk-england-cornwall-55191422,https://www.bbc.com/news/uk-england-cornwall-5...,Care home manager: 'It felt like we were losin...,By Rebecca Ricks & Johnny O'SheaBBC Spotlight ...,37,697,3755,"During the spring, at the height of the Covid-...",2,27,147
1,uk-43893709,https://www.bbc.com/news/uk-43893709,Tafida Raqeeb: Who decides the care of sick ch...,"By Rachel SchraerBBC Reality Check So, why did...",33,760,4531,"The parents of five-year-old Tafida Raqeeb, wh...",2,20,121
2,uk-politics-57050659,https://www.bbc.com/news/uk-politics-57050659,Labour reshuffle: Anneliese Dodds out in Starm...,Anneliese Dodds will now become the Labour Par...,36,846,4845,Sir Keir Starmer has sacked his shadow chancel...,2,20,115
3,entertainment-arts-38221420,https://www.bbc.com/news/entertainment-arts-38...,Vinyl sales made more than downloads last week,By Mark SavageBBC Music reporter Vinyl sales m...,27,354,2082,More money was spent on vinyl than downloaded ...,2,15,83
4,entertainment-arts-24046991,https://www.bbc.com/news/entertainment-arts-24...,Pirates of the Caribbean sequel delayed,Disney's Pirates of The Caribbean: Dead Men Te...,14,253,1569,The next Pirates of the Caribbean film has bee...,2,15,88


**Default Hyperparameters**

In [10]:
from transformers import pipeline

# Build a summarization pipeline backed by the facebook/bart-base checkpoint.
summarizer = pipeline("summarization", model="facebook/bart-base")

In [11]:
# Show the keyword defaults declared by the pipeline() factory function.
get_default_args(pipeline)

{'task': None,
 'model': None,
 'config': None,
 'tokenizer': None,
 'feature_extractor': None,
 'image_processor': None,
 'framework': None,
 'revision': None,
 'use_fast': True,
 'use_auth_token': None,
 'device': None,
 'device_map': None,
 'torch_dtype': None,
 'trust_remote_code': None,
 'model_kwargs': None,
 'pipeline_class': None}

In [12]:
# Same inspection on the pipeline object itself; the recorded result is an empty
# dict, i.e. its call signature declares no parameters with defaults.
get_default_args(summarizer)

{}

**BART Models**

Model 0. BART base model, run with default hyperparameters

In [39]:
%%time

# Silence library warnings for the rest of the session (this filter is global).
warnings.filterwarnings("ignore")

summarizer = pipeline("summarization", model="facebook/bart-base")

# Per-article ROUGE scores, appended in evaluation order.
bart_r1 = []
bart_r2 = []
bart_rL = []
bart_rLs = []

CHECKPOINT_EVERY = 100  # write partial scores to disk every this many articles
n_eval = len(df) // 2   # only the first half of the sampled articles is scored

for i in range(n_eval):

    # truncation=True clips inputs that exceed the model's maximum input
    # length; that limit is measured in tokens, not words.
    generated = summarizer(df['article'][i], truncation=True)[0]
    candidate = [generated['summary_text']]

    ref = [df['summary'][i]]

    results = rouge.compute(predictions=candidate,
                            references=ref)

    bart_r1.append(results['rouge1'])
    bart_r2.append(results['rouge2'])
    bart_rL.append(results['rougeL'])
    bart_rLs.append(results['rougeLsum'])

    # Periodic checkpoint so a crash does not lose the scores computed so far.
    # NOTE(review): 'rogueL'/'rogueLs' look like typos for 'rougeL'/'rougeLsum';
    # kept unchanged because the CSV header may be read elsewhere.
    if i % CHECKPOINT_EVERY == 0:
        data = {'rouge1': bart_r1, 'rouge2': bart_r2, 'rogueL': bart_rL, 'rogueLs': bart_rLs}
        scores = pd.DataFrame(data)
        scores.to_csv(r'BART_0_scores.csv', index=False)
        print(i)

# Final write with the complete set of scores.
data = {'rouge1': bart_r1, 'rouge2': bart_r2, 'rogueL': bart_rL, 'rogueLs': bart_rLs}
scores = pd.DataFrame(data)
scores.to_csv(r'BART_0_scores.csv', index=False)

# Show the last evaluated example as a qualitative sanity check.
print('Last Article', df['article'][i])
print('Last Reference Summary', ref)
print('Last Candidate Summary', candidate)

0


Your max_length is set to 128, but you input_length is only 71. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=35)
Your max_length is set to 128, but you input_length is only 108. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=54)
Your max_length is set to 128, but you input_length is only 118. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=59)
Your max_length is set to 128, but you input_length is only 103. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=51)
Your max_length is set to 128, but you input_length is only 114. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=57)
Your max_length is set to 128, but you input_length is only 80. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=40)


100


Your max_length is set to 128, but you input_length is only 108. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=54)
Your max_length is set to 128, but you input_length is only 98. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=49)
Your max_length is set to 128, but you input_length is only 63. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=31)
Your max_length is set to 128, but you input_length is only 96. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=48)


200


Your max_length is set to 128, but you input_length is only 115. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=57)
Your max_length is set to 128, but you input_length is only 113. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=56)
Your max_length is set to 128, but you input_length is only 122. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=61)
Your max_length is set to 128, but you input_length is only 111. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=55)
Your max_length is set to 128, but you input_length is only 77. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=38)


300


Your max_length is set to 128, but you input_length is only 87. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=43)
Your max_length is set to 128, but you input_length is only 98. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=49)
Your max_length is set to 128, but you input_length is only 126. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=63)
Your max_length is set to 128, but you input_length is only 65. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=32)
Your max_length is set to 128, but you input_length is only 123. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=61)


400


Your max_length is set to 128, but you input_length is only 117. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=58)
Your max_length is set to 128, but you input_length is only 77. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=38)
Your max_length is set to 128, but you input_length is only 80. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=40)


500


Your max_length is set to 128, but you input_length is only 105. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=52)
Your max_length is set to 128, but you input_length is only 87. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=43)
Your max_length is set to 128, but you input_length is only 126. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=63)
Your max_length is set to 128, but you input_length is only 89. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=44)
Your max_length is set to 128, but you input_length is only 119. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=59)


600


Your max_length is set to 128, but you input_length is only 84. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=42)
Your max_length is set to 128, but you input_length is only 106. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=53)
Your max_length is set to 128, but you input_length is only 95. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=47)
Your max_length is set to 128, but you input_length is only 102. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=51)
Your max_length is set to 128, but you input_length is only 108. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=54)


700


Your max_length is set to 128, but you input_length is only 82. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=41)
Your max_length is set to 128, but you input_length is only 118. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=59)
Your max_length is set to 128, but you input_length is only 79. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=39)
Your max_length is set to 128, but you input_length is only 122. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=61)
Your max_length is set to 128, but you input_length is only 102. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=51)


800


Your max_length is set to 128, but you input_length is only 109. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=54)
Your max_length is set to 128, but you input_length is only 118. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=59)
Your max_length is set to 128, but you input_length is only 71. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=35)
Your max_length is set to 128, but you input_length is only 75. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=37)
Your max_length is set to 128, but you input_length is only 86. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=43)


900


Your max_length is set to 128, but you input_length is only 104. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=52)
Your max_length is set to 128, but you input_length is only 76. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=38)


Last Article A spokesman said Mr Duterte missed four scheduled events on Wednesday because he was "catching up on sleep". Mr Duterte and other world leaders are in Singapore attending the Association of South East Asian Nations (Asean) meeting. The 73-year-old defended his actions, saying: "What's wrong with my nap?" Mr Duterte has previously missed events at other international summits as well as in the Philippines. His health has been a constant source of speculation over the year, and he revealed in October that he had had a cancer scare. 'In top physical shape' Spokesman Salvador Panelo said the president had "worked late and had only less than three hours of sleep". When asked if he was fully rested, Mr Duterte added that the naps were "still not good enough, but enough to sustain the endurance for the last days". Mr Panelo also clarified that Mr Duterte's absence had "nothing to do with his health". "The president's constantly punishing work schedule is proof that he is in top ph

In [40]:
# Report the mean of each per-article ROUGE score list from the run above.
for label, values in (('rouge1', bart_r1),
                      ('rouge2', bart_r2),
                      ('rougeL', bart_rL),
                      ('rougeLs', bart_rLs)):
    print(label + ' average :', np.mean(values))

rouge1 average : 0.16257622621656392
rouge2 average : 0.028702447415317477
rougeL average : 0.10383911035216536
rougeLs average : 0.10383911035216536


Model 1. BART base model, with max_length and min_length set to the largest and smallest summary word counts in the sampled data

In [41]:
%%time

# Silence library warnings for the rest of the session (this filter is global).
warnings.filterwarnings("ignore")

summarizer = pipeline("summarization", model="facebook/bart-base")

# Per-article ROUGE scores, appended in evaluation order.
bart_r1 = []
bart_r2 = []
bart_rL = []
bart_rLs = []

CHECKPOINT_EVERY = 100  # write partial scores to disk every this many articles
n_eval = len(df) // 2   # only the first half of the sampled articles is scored

# Generation length bounds, computed once instead of on every iteration.
# NOTE(review): these come from whitespace word counts, whereas generation
# lengths are counted in tokens — confirm the mismatch is acceptable.
max_summary_len = int(df["summary_num_words"].max())
min_summary_len = int(df["summary_num_words"].min())

for i in range(n_eval):

    # truncation=True clips inputs that exceed the model's maximum input
    # length; that limit is measured in tokens, not words.
    generated = summarizer(df['article'][i],
                           truncation=True,
                           max_length=max_summary_len,
                           min_length=min_summary_len,
                           )[0]
    candidate = [generated['summary_text']]

    ref = [df['summary'][i]]

    results = rouge.compute(predictions=candidate,
                            references=ref)

    bart_r1.append(results['rouge1'])
    bart_r2.append(results['rouge2'])
    bart_rL.append(results['rougeL'])
    bart_rLs.append(results['rougeLsum'])

    # Periodic checkpoint so a crash does not lose the scores computed so far.
    # NOTE(review): 'rogueL'/'rogueLs' look like typos for 'rougeL'/'rougeLsum';
    # kept unchanged because the CSV header may be read elsewhere.
    if i % CHECKPOINT_EVERY == 0:
        data = {'rouge1': bart_r1, 'rouge2': bart_r2, 'rogueL': bart_rL, 'rogueLs': bart_rLs}
        scores = pd.DataFrame(data)
        scores.to_csv(r'BART_1_scores.csv', index=False)
        print(i)

# Final write with the complete set of scores.
data = {'rouge1': bart_r1, 'rouge2': bart_r2, 'rogueL': bart_rL, 'rogueLs': bart_rLs}
scores = pd.DataFrame(data)
scores.to_csv(r'BART_1_scores.csv', index=False)
print(i)

# Show the last evaluated example as a qualitative sanity check.
print('Last Article', df['article'][i])
print('Last Reference Summary', ref)
print('Last Candidate Summary', candidate)

0


Your max_length is set to 88, but you input_length is only 71. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=35)
Your max_length is set to 88, but you input_length is only 80. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=40)


100


Your max_length is set to 88, but you input_length is only 63. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=31)


200


Your max_length is set to 88, but you input_length is only 77. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=38)


300


Your max_length is set to 88, but you input_length is only 87. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=43)
Your max_length is set to 88, but you input_length is only 65. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=32)


400


Your max_length is set to 88, but you input_length is only 77. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=38)
Your max_length is set to 88, but you input_length is only 80. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=40)


500


Your max_length is set to 88, but you input_length is only 87. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=43)


600


Your max_length is set to 88, but you input_length is only 84. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=42)


700


Your max_length is set to 88, but you input_length is only 82. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=41)
Your max_length is set to 88, but you input_length is only 79. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=39)


800


Your max_length is set to 88, but you input_length is only 71. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=35)
Your max_length is set to 88, but you input_length is only 75. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=37)
Your max_length is set to 88, but you input_length is only 86. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=43)


900


Your max_length is set to 88, but you input_length is only 76. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=38)


999
Last Article A spokesman said Mr Duterte missed four scheduled events on Wednesday because he was "catching up on sleep". Mr Duterte and other world leaders are in Singapore attending the Association of South East Asian Nations (Asean) meeting. The 73-year-old defended his actions, saying: "What's wrong with my nap?" Mr Duterte has previously missed events at other international summits as well as in the Philippines. His health has been a constant source of speculation over the year, and he revealed in October that he had had a cancer scare. 'In top physical shape' Spokesman Salvador Panelo said the president had "worked late and had only less than three hours of sleep". When asked if he was fully rested, Mr Duterte added that the naps were "still not good enough, but enough to sustain the endurance for the last days". Mr Panelo also clarified that Mr Duterte's absence had "nothing to do with his health". "The president's constantly punishing work schedule is proof that he is in to

In [42]:
# Report the mean of each per-article ROUGE score list from the run above.
for label, values in (('rouge1', bart_r1),
                      ('rouge2', bart_r2),
                      ('rougeL', bart_rL),
                      ('rougeLs', bart_rLs)):
    print(label + ' average :', np.mean(values))

rouge1 average : 0.1826260348179354
rouge2 average : 0.02811462587745744
rougeL average : 0.11675478118385149
rougeLs average : 0.11675478118385149


Model 2. Model 1 + beam search (num_beams = 3)

In [43]:
%%time

# Silence library warnings for the rest of the session (this filter is global).
warnings.filterwarnings("ignore")

summarizer = pipeline("summarization", model="facebook/bart-base")

# Per-article ROUGE scores, appended in evaluation order.
bart_r1 = []
bart_r2 = []
bart_rL = []
bart_rLs = []

CHECKPOINT_EVERY = 100  # write partial scores to disk every this many articles
n_eval = len(df) // 2   # only the first half of the sampled articles is scored

# Generation length bounds, computed once instead of on every iteration.
# NOTE(review): these come from whitespace word counts, whereas generation
# lengths are counted in tokens — confirm the mismatch is acceptable.
max_summary_len = int(df["summary_num_words"].max())
min_summary_len = int(df["summary_num_words"].min())

for i in range(n_eval):

    # truncation=True clips inputs that exceed the model's maximum input
    # length; that limit is measured in tokens, not words.
    generated = summarizer(df['article'][i],
                           truncation=True,
                           max_length=max_summary_len,
                           min_length=min_summary_len,
                           num_beams=3,
                           )[0]
    candidate = [generated['summary_text']]

    ref = [df['summary'][i]]

    results = rouge.compute(predictions=candidate,
                            references=ref)

    bart_r1.append(results['rouge1'])
    bart_r2.append(results['rouge2'])
    bart_rL.append(results['rougeL'])
    bart_rLs.append(results['rougeLsum'])

    # Periodic checkpoint so a crash does not lose the scores computed so far.
    # NOTE(review): 'rogueL'/'rogueLs' look like typos for 'rougeL'/'rougeLsum';
    # kept unchanged because the CSV header may be read elsewhere.
    if i % CHECKPOINT_EVERY == 0:
        data = {'rouge1': bart_r1, 'rouge2': bart_r2, 'rogueL': bart_rL, 'rogueLs': bart_rLs}
        scores = pd.DataFrame(data)
        scores.to_csv(r'BART_2_scores.csv', index=False)
        print(i)

# Final write with the complete set of scores.
data = {'rouge1': bart_r1, 'rouge2': bart_r2, 'rogueL': bart_rL, 'rogueLs': bart_rLs}
scores = pd.DataFrame(data)
scores.to_csv(r'BART_2_scores.csv', index=False)
print(i)

# Show the last evaluated example as a qualitative sanity check.
print('Last Article', df['article'][i])
print('Last Reference Summary', ref)
print('Last Candidate Summary', candidate)

0


Your max_length is set to 88, but you input_length is only 71. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=35)
Your max_length is set to 88, but you input_length is only 80. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=40)


100


Your max_length is set to 88, but you input_length is only 63. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=31)


200


Your max_length is set to 88, but you input_length is only 77. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=38)


300


Your max_length is set to 88, but you input_length is only 87. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=43)
Your max_length is set to 88, but you input_length is only 65. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=32)


400


Your max_length is set to 88, but you input_length is only 77. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=38)
Your max_length is set to 88, but you input_length is only 80. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=40)


500


Your max_length is set to 88, but you input_length is only 87. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=43)


600


Your max_length is set to 88, but you input_length is only 84. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=42)


700


Your max_length is set to 88, but you input_length is only 82. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=41)
Your max_length is set to 88, but you input_length is only 79. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=39)


800


Your max_length is set to 88, but you input_length is only 71. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=35)
Your max_length is set to 88, but you input_length is only 75. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=37)
Your max_length is set to 88, but you input_length is only 86. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=43)


900


Your max_length is set to 88, but you input_length is only 76. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=38)


999
Last Article A spokesman said Mr Duterte missed four scheduled events on Wednesday because he was "catching up on sleep". Mr Duterte and other world leaders are in Singapore attending the Association of South East Asian Nations (Asean) meeting. The 73-year-old defended his actions, saying: "What's wrong with my nap?" Mr Duterte has previously missed events at other international summits as well as in the Philippines. His health has been a constant source of speculation over the year, and he revealed in October that he had had a cancer scare. 'In top physical shape' Spokesman Salvador Panelo said the president had "worked late and had only less than three hours of sleep". When asked if he was fully rested, Mr Duterte added that the naps were "still not good enough, but enough to sustain the endurance for the last days". Mr Panelo also clarified that Mr Duterte's absence had "nothing to do with his health". "The president's constantly punishing work schedule is proof that he is in to

In [44]:
# Report the mean of each per-article ROUGE score list from the run above.
for label, values in (('rouge1', bart_r1),
                      ('rouge2', bart_r2),
                      ('rougeL', bart_rL),
                      ('rougeLs', bart_rLs)):
    print(label + ' average :', np.mean(values))

rouge1 average : 0.18207739272187834
rouge2 average : 0.02802670430092509
rougeL average : 0.11668340007295606
rougeLs average : 0.11668340007295606


Model 3. Model 1 + do_sample

In [13]:
%%time

from transformers import set_seed

# Silence library warnings for the rest of the session (this filter is global).
warnings.filterwarnings("ignore")

summarizer = pipeline("summarization", model="facebook/bart-base")

# Sampling-based decoding is stochastic; fix the seed so the scores can be
# reproduced. 1004 matches the random_state used when sampling the articles.
SEED = 1004
set_seed(SEED)

# Per-article ROUGE scores, appended in evaluation order.
bart_r1 = []
bart_r2 = []
bart_rL = []
bart_rLs = []

CHECKPOINT_EVERY = 100  # write partial scores to disk every this many articles
n_eval = len(df) // 2   # only the first half of the sampled articles is scored

# Generation length bounds, computed once instead of on every iteration.
# NOTE(review): these come from whitespace word counts, whereas generation
# lengths are counted in tokens — confirm the mismatch is acceptable.
max_summary_len = int(df["summary_num_words"].max())
min_summary_len = int(df["summary_num_words"].min())

for i in range(n_eval):

    # truncation=True clips inputs that exceed the model's maximum input
    # length; that limit is measured in tokens, not words.
    generated = summarizer(df['article'][i],
                           truncation=True,
                           max_length=max_summary_len,
                           min_length=min_summary_len,
                           do_sample=True,
                           )[0]
    candidate = [generated['summary_text']]

    ref = [df['summary'][i]]

    results = rouge.compute(predictions=candidate,
                            references=ref)

    bart_r1.append(results['rouge1'])
    bart_r2.append(results['rouge2'])
    bart_rL.append(results['rougeL'])
    bart_rLs.append(results['rougeLsum'])

    # Periodic checkpoint so a crash does not lose the scores computed so far.
    # NOTE(review): 'rogueL'/'rogueLs' look like typos for 'rougeL'/'rougeLsum';
    # kept unchanged because the CSV header may be read elsewhere.
    if i % CHECKPOINT_EVERY == 0:
        data = {'rouge1': bart_r1, 'rouge2': bart_r2, 'rogueL': bart_rL, 'rogueLs': bart_rLs}
        scores = pd.DataFrame(data)
        scores.to_csv(r'BART_3_scores.csv', index=False)
        print(i)

# Final write with the complete set of scores.
data = {'rouge1': bart_r1, 'rouge2': bart_r2, 'rogueL': bart_rL, 'rogueLs': bart_rLs}
scores = pd.DataFrame(data)
scores.to_csv(r'BART_3_scores.csv', index=False)
print(i)

# Show the last evaluated example as a qualitative sanity check.
print('Last Article', df['article'][i])
print('Last Reference Summary', ref)
print('Last Candidate Summary', candidate)

0


Your max_length is set to 88, but you input_length is only 71. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=35)
Your max_length is set to 88, but you input_length is only 80. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=40)


100


Your max_length is set to 88, but you input_length is only 63. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=31)


200


Your max_length is set to 88, but you input_length is only 77. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=38)


300


Your max_length is set to 88, but you input_length is only 87. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=43)
Your max_length is set to 88, but you input_length is only 65. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=32)


400


Your max_length is set to 88, but you input_length is only 77. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=38)
Your max_length is set to 88, but you input_length is only 80. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=40)


500


Your max_length is set to 88, but you input_length is only 87. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=43)


600


Your max_length is set to 88, but you input_length is only 84. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=42)


700


Your max_length is set to 88, but you input_length is only 82. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=41)
Your max_length is set to 88, but you input_length is only 79. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=39)


800


Your max_length is set to 88, but you input_length is only 71. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=35)
Your max_length is set to 88, but you input_length is only 75. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=37)
Your max_length is set to 88, but you input_length is only 86. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=43)


900


Your max_length is set to 88, but you input_length is only 76. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=38)


999
Last Article A spokesman said Mr Duterte missed four scheduled events on Wednesday because he was "catching up on sleep". Mr Duterte and other world leaders are in Singapore attending the Association of South East Asian Nations (Asean) meeting. The 73-year-old defended his actions, saying: "What's wrong with my nap?" Mr Duterte has previously missed events at other international summits as well as in the Philippines. His health has been a constant source of speculation over the year, and he revealed in October that he had had a cancer scare. 'In top physical shape' Spokesman Salvador Panelo said the president had "worked late and had only less than three hours of sleep". When asked if he was fully rested, Mr Duterte added that the naps were "still not good enough, but enough to sustain the endurance for the last days". Mr Panelo also clarified that Mr Duterte's absence had "nothing to do with his health". "The president's constantly punishing work schedule is proof that he is in to

In [14]:
# Report the mean of each per-article ROUGE score list from the run above.
for label, values in (('rouge1', bart_r1),
                      ('rouge2', bart_r2),
                      ('rougeL', bart_rL),
                      ('rougeLs', bart_rLs)):
    print(label + ' average :', np.mean(values))

rouge1 average : 0.18211818372243202
rouge2 average : 0.02774788864496601
rougeL average : 0.11674524650559745
rougeLs average : 0.11674524650559745


Model 4. Model 1 + diverse (group) beam search (num_beams = 4, num_beam_groups = 2, diversity_penalty = 2.0) + max_length capped at each article's word count

In [21]:
%%time

# Silence library warnings for the rest of the session (this filter is global).
warnings.filterwarnings("ignore")

summarizer = pipeline("summarization", model="facebook/bart-base")

# Per-article ROUGE scores, appended in evaluation order.
bart_r1 = []
bart_r2 = []
bart_rL = []
bart_rLs = []

CHECKPOINT_EVERY = 100  # write partial scores to disk every this many articles
n_eval = len(df) // 2   # only the first half of the sampled articles is scored

# Generation length bounds, computed once instead of on every iteration.
# NOTE(review): these come from whitespace word counts, whereas generation
# lengths are counted in tokens — confirm the mismatch is acceptable.
max_summary_len = int(df["summary_num_words"].max())
min_summary_len = int(df["summary_num_words"].min())

for i in range(n_eval):

    # Cap max_length at the article's own word count so the requested summary
    # length never exceeds the length of a short article.
    article_max_len = int(min(max_summary_len, df['article_num_words'][i]))

    # truncation=True clips inputs that exceed the model's maximum input
    # length; that limit is measured in tokens, not words.
    generated = summarizer(df['article'][i],
                           truncation=True,
                           max_length=article_max_len,
                           min_length=min_summary_len,
                           num_beams=4,
                           num_beam_groups=2,
                           diversity_penalty=2.0,
                           do_sample=False,  # has to be False with diverse beam search
                           )[0]
    candidate = [generated['summary_text']]

    ref = [df['summary'][i]]

    results = rouge.compute(predictions=candidate,
                            references=ref)

    bart_r1.append(results['rouge1'])
    bart_r2.append(results['rouge2'])
    bart_rL.append(results['rougeL'])
    bart_rLs.append(results['rougeLsum'])

    # Periodic checkpoint so a crash does not lose the scores computed so far.
    # NOTE(review): 'rogueL'/'rogueLs' look like typos for 'rougeL'/'rougeLsum';
    # kept unchanged because the CSV header may be read elsewhere.
    if i % CHECKPOINT_EVERY == 0:
        data = {'rouge1': bart_r1, 'rouge2': bart_r2, 'rogueL': bart_rL, 'rogueLs': bart_rLs}
        scores = pd.DataFrame(data)
        scores.to_csv(r'BART_4_scores.csv', index=False)
        print(i)

# Final write with the complete set of scores.
data = {'rouge1': bart_r1, 'rouge2': bart_r2, 'rogueL': bart_rL, 'rogueLs': bart_rLs}
scores = pd.DataFrame(data)
scores.to_csv(r'BART_4_scores.csv', index=False)
print(i)

# Show the last evaluated example as a qualitative sanity check.
print('Last Article', df['article'][i])
print('Last Reference Summary', ref)
print('Last Candidate Summary', candidate)

0
100
200
300
400
500
600
700
800
900
999
Last Article A spokesman said Mr Duterte missed four scheduled events on Wednesday because he was "catching up on sleep". Mr Duterte and other world leaders are in Singapore attending the Association of South East Asian Nations (Asean) meeting. The 73-year-old defended his actions, saying: "What's wrong with my nap?" Mr Duterte has previously missed events at other international summits as well as in the Philippines. His health has been a constant source of speculation over the year, and he revealed in October that he had had a cancer scare. 'In top physical shape' Spokesman Salvador Panelo said the president had "worked late and had only less than three hours of sleep". When asked if he was fully rested, Mr Duterte added that the naps were "still not good enough, but enough to sustain the endurance for the last days". Mr Panelo also clarified that Mr Duterte's absence had "nothing to do with his health". "The president's constantly punishing w

In [23]:
# Report the mean of each per-article ROUGE score currently held in the
# bart_* lists.
for label, values in (
    ('rouge1 average :', bart_r1),
    ('rouge2 average :', bart_r2),
    ('rougeL average :', bart_rL),
    ('rougeLs average :', bart_rLs),
):
    print(label, np.mean(values))

rouge1 average : 0.18262546186561612
rouge2 average : 0.028182282803885305
rougeL average : 0.11704318718639922
rougeLs average : 0.11704318718639922


Model 4. Model 1 + beam_search + group_beam_search + diversity penalty + changes in max_length 

In [21]:
%%time

warnings.filterwarnings("ignore")

summarizer = pipeline("summarization", model="facebook/bart-base")

bart_r1 = []
bart_r2 = []
bart_rL = []
bart_rLs = []

for i in range(int(len(df['article'])/2)):
    
    #art = ' '.join(df['article'][i].split(' ')[0:1024]) #truncated to first 1024 words, because that is all the model can handle
    
    candidate = summarizer(df['article'][i], #limiting to the first 1024 tokens, because that is all the model can handle
                           truncation = True, #truncated to first 1024 words, because that is all the model can handle
                           max_length=min(max(df["summary_num_words"]), df['article_num_words'][i]), 
                           min_length=min(df["summary_num_words"]), 
                           num_beams = 4,
                           num_beam_groups = 2,
                           diversity_penalty= 2.0,
                           do_sample=False #has to be set to false w/ diverse beam search
                            )[0]
    candidate = [candidate['summary_text']]
    #pprint(candidate[0], compact=True)
    
    ref = [df['summary'][i]]
    
    results = rouge.compute(predictions=candidate,
                            references=ref)
    
    bart_r1.append(results['rouge1'])
    bart_r2.append(results['rouge2'])
    bart_rL.append(results['rougeL'])
    bart_rLs.append(results['rougeLsum'])
    
    if i in np.arange(0, 2000, 100):
        data = {'rouge1': bart_r1, 'rouge2': bart_r2, 'rogueL': bart_rL, 'rogueLs': bart_rLs}
        scores = pd.DataFrame(data)
        scores.to_csv(r'BART_4_scores.csv', index=False)
        print(i)
        
data = {'rouge1': bart_r1, 'rouge2': bart_r2, 'rogueL': bart_rL, 'rogueLs': bart_rLs}
scores = pd.DataFrame(data)
scores.to_csv(r'BART_4_scores.csv', index=False)
print(i)

print('Last Article', df['article'][i])
print('Last Reference Summary', ref)
print('Last Candidate Summary', candidate)

0
100
200
300
400
500
600
700
800
900
999
Last Article A spokesman said Mr Duterte missed four scheduled events on Wednesday because he was "catching up on sleep". Mr Duterte and other world leaders are in Singapore attending the Association of South East Asian Nations (Asean) meeting. The 73-year-old defended his actions, saying: "What's wrong with my nap?" Mr Duterte has previously missed events at other international summits as well as in the Philippines. His health has been a constant source of speculation over the year, and he revealed in October that he had had a cancer scare. 'In top physical shape' Spokesman Salvador Panelo said the president had "worked late and had only less than three hours of sleep". When asked if he was fully rested, Mr Duterte added that the naps were "still not good enough, but enough to sustain the endurance for the last days". Mr Panelo also clarified that Mr Duterte's absence had "nothing to do with his health". "The president's constantly punishing w

In [23]:
# Report the mean of each per-article ROUGE score currently held in the
# bart_* lists.
for label, values in (
    ('rouge1 average :', bart_r1),
    ('rouge2 average :', bart_r2),
    ('rougeL average :', bart_rL),
    ('rougeLs average :', bart_rLs),
):
    print(label, np.mean(values))

rouge1 average : 0.18262546186561612
rouge2 average : 0.028182282803885305
rougeL average : 0.11704318718639922
rougeLs average : 0.11704318718639922


Model 5. Model 4 + max_length and min_length derived from the summary-to-article length ratio (see the EDA file for the calculations)

In [28]:
%%time

warnings.filterwarnings("ignore")

summarizer = pipeline("summarization", model="facebook/bart-base")

bart_r1 = []
bart_r2 = []
bart_rL = []
bart_rLs = []

for i in range(int(len(df['article'])/2)):
    
    #art = ' '.join(df['article'][i].split(' ')[0:1024]) #truncated to first 1024 words, because that is all the model can handle
    
    candidate = summarizer(df['article'][i], #limiting to the first 1024 tokens, because that is all the model can handle
                           truncation = True, #truncated to first 1024 words, because that is all the model can handle
                           max_length=math.ceil(df['article_num_words'][i]*0.1), #most summaries are below 0.1 according to histogram
                           min_length=0, #min summary ratio was 0.002
                           num_beams = 4,
                           num_beam_groups = 2,
                           diversity_penalty= 2.0,
                           do_sample=False #has to be set to false w/ diverse beam search
                            )[0]
    candidate = [candidate['summary_text']]
    #pprint(candidate[0], compact=True)
    
    ref = [df['summary'][i]]
    
    results = rouge.compute(predictions=candidate,
                            references=ref)
    
    bart_r1.append(results['rouge1'])
    bart_r2.append(results['rouge2'])
    bart_rL.append(results['rougeL'])
    bart_rLs.append(results['rougeLsum'])
    
    if i in np.arange(0, 2000, 100):
        data = {'rouge1': bart_r1, 'rouge2': bart_r2, 'rogueL': bart_rL, 'rogueLs': bart_rLs}
        scores = pd.DataFrame(data)
        scores.to_csv(r'BART_4_scores.csv', index=False)
        print(i)
        
data = {'rouge1': bart_r1, 'rouge2': bart_r2, 'rogueL': bart_rL, 'rogueLs': bart_rLs}
scores = pd.DataFrame(data)
scores.to_csv(r'BART_4_scores.csv', index=False)
print(i)

print('Last Article', df['article'][i])
print('Last Reference Summary', ref)
print('Last Candidate Summary', candidate)

0
100
200
300
400
500
600
700
800
900
999
Last Article A spokesman said Mr Duterte missed four scheduled events on Wednesday because he was "catching up on sleep". Mr Duterte and other world leaders are in Singapore attending the Association of South East Asian Nations (Asean) meeting. The 73-year-old defended his actions, saying: "What's wrong with my nap?" Mr Duterte has previously missed events at other international summits as well as in the Philippines. His health has been a constant source of speculation over the year, and he revealed in October that he had had a cancer scare. 'In top physical shape' Spokesman Salvador Panelo said the president had "worked late and had only less than three hours of sleep". When asked if he was fully rested, Mr Duterte added that the naps were "still not good enough, but enough to sustain the endurance for the last days". Mr Panelo also clarified that Mr Duterte's absence had "nothing to do with his health". "The president's constantly punishing w

In [29]:
# Report the mean of each per-article ROUGE score currently held in the
# bart_* lists.
for label, values in (
    ('rouge1 average :', bart_r1),
    ('rouge2 average :', bart_r2),
    ('rougeL average :', bart_rL),
    ('rougeLs average :', bart_rLs),
):
    print(label, np.mean(values))

rouge1 average : 0.16340185875114246
rouge2 average : 0.018879911043092612
rougeL average : 0.11635047232727201
rougeLs average : 0.11635047232727201
