**Packages**

In [41]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 

from datasets import load_dataset
import evaluate

#let's make longer output readable without horizontal scrolling
from pprint import pprint

import warnings

**Necessary Functions**

In [42]:
import inspect

def get_default_args(func):
    """Return a dict of ``func``'s parameters that declare a default value.

    Keys are parameter names and values are the declared defaults.
    Parameters without a default are omitted.
    """
    defaults = {}
    for name, param in inspect.signature(func).parameters.items():
        # inspect marks "no default" with the Parameter.empty sentinel.
        if param.default is inspect.Parameter.empty:
            continue
        defaults[name] = param.default
    return defaults

In [43]:
# Load the ROUGE metric through `evaluate`; the scoring loops below call
# rouge.compute(predictions=..., references=...) on each generated summary.
rouge = evaluate.load('rouge')

**Data**

In [44]:
# Load the "english" configuration of the csebuetnlp/xlsum dataset.
dataset = load_dataset("csebuetnlp/xlsum", "english")

100%|████████████████████████████████████████████| 3/3 [00:00<00:00, 335.20it/s]


In [45]:
# EDA: number of examples in the training split
len(dataset['train'])

306522

In [46]:
# EDA: inspect one raw training record (fields: id, url, title, summary, text)
dataset['train'][1]

{'id': 'uk-scotland-highlands-islands-11069985',
 'url': 'https://www.bbc.com/news/uk-scotland-highlands-islands-11069985',
 'title': 'Huge tidal turbine installed at Orkney test site',
 'summary': 'The massive tidal turbine AK1000 has been installed in 35m (114.8ft) of water at a test site in Orkney.',
 'text': 'Atlantis Resources unveiled the marine energy device at Invergordon ahead of it being shipped to Kirkwall. Trials on the device will now be run at the European Marine Energy Centre test site off Eday. The device stands 22.5m (73ft) tall, weighs 1,300 tonnes and has two sets of blades on a single unit. It could generate enough power for 1,000 homes.'}

In [47]:
# Draw a reproducible random sample (fixed seed, no replacement) of 2000
# row positions from the training split.
n_train = len(dataset['train'])
index = pd.DataFrame({"index": list(range(n_train))})
sample_index = index.sample(n=2000, random_state=1004, replace=False)
sample_index.head(5)

Unnamed: 0,index
235420,235420
172024,172024
253546,253546
224954,224954
214134,214134


In [48]:
# Column-wise accumulators for the sampled articles; the next cell assembles
# them into a DataFrame, so the names are kept as-is.
# NOTE(review): `id` shadows the Python builtin of the same name.
id = []
url = []
title = []
article = []
article_num_sentences = []
article_num_characters = []
summary = []
summary_num_sentences = []
summary_num_characters = []

train_split = dataset["train"]

for i in sample_index["index"]:
    # Fetch the record once instead of re-indexing the dataset for every field.
    row = train_split[i]
    row_summary = row['summary']
    row_text = row['text']

    id.append(row['id'])
    url.append(row['url'])
    title.append(row['title'])

    # NOTE(review): the "sentence" counts are the number of "."-separated
    # pieces. A text ending in "." therefore counts one extra (a single-sentence
    # summary yields 2), and decimals such as "114.8ft" also add to the count.
    # Kept unchanged because the settings used further down are derived from it.
    summary.append(row_summary)
    summary_num_sentences.append(len(row_summary.split(".")))
    summary_num_characters.append(len(row_summary))

    article.append(row_text)
    article_num_sentences.append(len(row_text.split(".")))
    article_num_characters.append(len(row_text))

In [49]:
# Assemble the per-article lists into one table, one row per sampled article.
# Column order follows the insertion order of the dict.
d = {
    'id': id,
    'url': url,
    "title": title,
    'article': article,
    "article_num_sentences": article_num_sentences,
    "article_num_characters": article_num_characters,
    'summary': summary,
    "summary_num_sentences": summary_num_sentences,
    "summary_num_characters": summary_num_characters,
}
df = pd.DataFrame(d)
df.head()

Unnamed: 0,id,url,title,article,article_num_sentences,article_num_characters,summary,summary_num_sentences,summary_num_characters
0,uk-england-cornwall-55191422,https://www.bbc.com/news/uk-england-cornwall-5...,Care home manager: 'It felt like we were losin...,By Rebecca Ricks & Johnny O'SheaBBC Spotlight ...,37,3755,"During the spring, at the height of the Covid-...",2,147
1,uk-43893709,https://www.bbc.com/news/uk-43893709,Tafida Raqeeb: Who decides the care of sick ch...,"By Rachel SchraerBBC Reality Check So, why did...",33,4531,"The parents of five-year-old Tafida Raqeeb, wh...",2,121
2,uk-politics-57050659,https://www.bbc.com/news/uk-politics-57050659,Labour reshuffle: Anneliese Dodds out in Starm...,Anneliese Dodds will now become the Labour Par...,36,4845,Sir Keir Starmer has sacked his shadow chancel...,2,115
3,entertainment-arts-38221420,https://www.bbc.com/news/entertainment-arts-38...,Vinyl sales made more than downloads last week,By Mark SavageBBC Music reporter Vinyl sales m...,27,2082,More money was spent on vinyl than downloaded ...,2,83
4,entertainment-arts-24046991,https://www.bbc.com/news/entertainment-arts-24...,Pirates of the Caribbean sequel delayed,Disney's Pirates of The Caribbean: Dead Men Te...,14,1569,The next Pirates of the Caribbean film has bee...,2,88


**BERT Extractive Model**

Tuning summarizer settings on the first 1,000 sampled articles

1. Baseline BERT Extractive Model

In [50]:
from summarizer import Summarizer

In [51]:
# Show the default arguments of the Summarizer constructor.
get_default_args(Summarizer)

{'model': 'bert-large-uncased',
 'custom_model': None,
 'custom_tokenizer': None,
 'hidden': -2,
 'reduce_option': 'mean',
 'sentence_handler': <summarizer.sentence_handler.SentenceHandler at 0x7ff4f591a2e0>,
 'random_state': 12345,
 'hidden_concat': False}

In [52]:
# Using bert-base instead of bert-large (the constructor default) to reduce run times
model = Summarizer(model='bert-base-uncased')

loading configuration file config.json from cache at /home/ubuntu/.cache/huggingface/hub/models--bert-base-uncased/snapshots/0a6aa9128b6194f4f3c4db429b6cb4891cdb421b/config.json
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.27.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file pytorch_model.bin from cache at /home/ubuntu/.cache/huggingface/hub/models--bert-base-uncased/snapshots/0a6aa9128b6194f4f3c4db429b6cb4891cdb421b/pyt

In [53]:
# Show the default arguments used when the model instance is called to summarize.
get_default_args(model)

{'ratio': 0.2,
 'min_length': 40,
 'max_length': 600,
 'use_first': True,
 'algorithm': 'kmeans',
 'num_sentences': None,
 'return_as_list': False}

In [54]:
import warnings
warnings.filterwarnings("ignore")

# Per-article ROUGE scores for the baseline run (experiment 1).
bert_r1 = []
bert_r2 = []
bert_rL = []
bert_rLs = []

# Evaluate on the first half of the sampled articles.
n_eval = int(len(df['article'])/2)

# Summarizer settings derived from the reference summaries of the evaluated
# rows. They do not change per article, so compute them once up front.
# NOTE(review): min_length/max_length come from reference-summary character
# counts; confirm this matches what the Summarizer expects for these parameters.
ref_num_sentences = df["summary_num_sentences"].iloc[:n_eval]
ref_num_characters = df["summary_num_characters"].iloc[:n_eval]
target_num_sentences = round(ref_num_sentences.mean())
min_chars = min(ref_num_characters)
max_chars = max(ref_num_characters)

for i in range(n_eval):

    # Limit each generated summary to the rounded mean sentence count
    # of the reference summaries.
    candidate = model(df['article'][i],
                      num_sentences = target_num_sentences,
                      min_length = min_chars,
                      max_length = max_chars,
                      ratio = None,
                      use_first = None,
                     )
    candidate = [candidate]

    ref = [df['summary'][i]]

    results = rouge.compute(predictions=candidate,
                            references=ref)

    bert_r1.append(results['rouge1'])
    bert_r2.append(results['rouge2'])
    bert_rL.append(results['rougeL'])
    bert_rLs.append(results['rougeLsum'])

    # Checkpoint every 100 articles, and once more after the final article so
    # the CSV on disk contains every score rather than stopping at the last
    # multiple of 100.
    at_checkpoint = i % 100 == 0
    if at_checkpoint or i == n_eval - 1:
        # NOTE(review): 'rogueL'/'rogueLs' are misspelled column names; kept
        # unchanged in case other code reads these files by column name.
        data = {'rouge1': bert_r1, 'rouge2': bert_r2, 'rogueL': bert_rL, 'rogueLs': bert_rLs}
        scores = pd.DataFrame(data)
        scores.to_csv(r'BERT_1_scores.csv', index=False)
    if at_checkpoint:
        print(i)

0
100
200
300
400
500
600
700
800
900


In [55]:
# Report the mean of each ROUGE variant over the evaluated articles.
for label, values in (
    ('rouge1 average :', bert_r1),
    ('rouge2 average :', bert_r2),
    ('rougeL average :', bert_rL),
    ('rougeLs average :', bert_rLs),
):
    print(label, np.mean(values))

rouge1 average : 0.178192403298457
rouge2 average : 0.026733936113222955
rougeL average : 0.11966303460985027
rougeLs average : 0.11966303460985027


Tuning Hyperparameters

2. Set num_sentences per article from the optimal number of clusters (calculate_optimal_k) instead of the average sentence count of the reference summaries

In [56]:
from summarizer import Summarizer

# Fresh bert-base summarizer instance for experiment 2.
model = Summarizer(model='bert-base-uncased')

loading configuration file config.json from cache at /home/ubuntu/.cache/huggingface/hub/models--bert-base-uncased/snapshots/0a6aa9128b6194f4f3c4db429b6cb4891cdb421b/config.json
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.27.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file pytorch_model.bin from cache at /home/ubuntu/.cache/huggingface/hub/models--bert-base-uncased/snapshots/0a6aa9128b6194f4f3c4db429b6cb4891cdb421b/pyt

In [None]:
warnings.filterwarnings("ignore")

# Per-article ROUGE scores for experiment 2 (cluster-based num_sentences).
bert_r1 = []
bert_r2 = []
bert_rL = []
bert_rLs = []

# Evaluate on the first half of the sampled articles.
n_eval = int(len(df['article'])/2)

# Length bounds derived from the reference summaries of the evaluated rows.
# They do not change per article, so compute them once up front.
ref_num_characters = df["summary_num_characters"].iloc[:n_eval]
min_chars = min(ref_num_characters)
max_chars = max(ref_num_characters)

for i in range(n_eval):

    # NOTE(review): calculate_optimal_k is called without min_length/max_length,
    # so it presumably uses its own defaults rather than the bounds passed to
    # model() below -- confirm the two operate on the same sentences.
    res = model.calculate_optimal_k(df['article'][i], k_max=10)

    candidate = model(df['article'][i],
                      num_sentences = res, # number of sentences determined by number of clusters
                      min_length = min_chars,
                      max_length = max_chars,
                      ratio = None,
                      use_first = None,)
    candidate = [candidate]

    ref = [df['summary'][i]]

    results = rouge.compute(predictions=candidate,
                            references=ref)

    bert_r1.append(results['rouge1'])
    bert_r2.append(results['rouge2'])
    bert_rL.append(results['rougeL'])
    bert_rLs.append(results['rougeLsum'])

    # Checkpoint every 100 articles, and once more after the final article so
    # the CSV on disk contains every score rather than stopping at the last
    # multiple of 100.
    at_checkpoint = i % 100 == 0
    if at_checkpoint or i == n_eval - 1:
        # NOTE(review): 'rogueL'/'rogueLs' are misspelled column names; kept
        # unchanged in case other code reads these files by column name.
        data = {'rouge1': bert_r1, 'rouge2': bert_r2, 'rogueL': bert_rL, 'rogueLs': bert_rLs}
        scores = pd.DataFrame(data)
        scores.to_csv(r'BERT_2_scores.csv', index=False)
    if at_checkpoint:
        print(i)

0


In [None]:
# Report the mean of each ROUGE variant over the evaluated articles.
for label, values in (
    ('rouge1 average :', bert_r1),
    ('rouge2 average :', bert_r2),
    ('rougeL average :', bert_rL),
    ('rougeLs average :', bert_rLs),
):
    print(label, np.mean(values))

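3. TODO: Choose the next experiment based on the findings above. One option is to set use_first to False, because many articles begin with metadata (e.g. reporter bylines) rather than content. Until that is decided, the cell below uses the same settings as experiment 2.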

In [None]:
from summarizer import Summarizer

# Fresh bert-base summarizer instance for experiment 3.
model = Summarizer(model='bert-base-uncased')

In [None]:
warnings.filterwarnings("ignore")

# Per-article ROUGE scores for experiment 3.
# NOTE(review): the summarizer settings here are still identical to
# experiment 2; only the output file name differs.
bert_r1 = []
bert_r2 = []
bert_rL = []
bert_rLs = []

# Evaluate on the first half of the sampled articles.
n_eval = int(len(df['article'])/2)

# Length bounds derived from the reference summaries of the evaluated rows.
# They do not change per article, so compute them once up front.
ref_num_characters = df["summary_num_characters"].iloc[:n_eval]
min_chars = min(ref_num_characters)
max_chars = max(ref_num_characters)

for i in range(n_eval):

    # NOTE(review): calculate_optimal_k is called without min_length/max_length,
    # so it presumably uses its own defaults rather than the bounds passed to
    # model() below -- confirm the two operate on the same sentences.
    res = model.calculate_optimal_k(df['article'][i], k_max=10)

    candidate = model(df['article'][i],
                      num_sentences = res, # number of sentences determined by number of clusters
                      min_length = min_chars,
                      max_length = max_chars,
                      ratio = None,
                      use_first = None,)
    candidate = [candidate]

    ref = [df['summary'][i]]

    results = rouge.compute(predictions=candidate,
                            references=ref)

    bert_r1.append(results['rouge1'])
    bert_r2.append(results['rouge2'])
    bert_rL.append(results['rougeL'])
    bert_rLs.append(results['rougeLsum'])

    # Checkpoint every 100 articles, and once more after the final article so
    # the CSV on disk contains every score rather than stopping at the last
    # multiple of 100.
    at_checkpoint = i % 100 == 0
    if at_checkpoint or i == n_eval - 1:
        # NOTE(review): 'rogueL'/'rogueLs' are misspelled column names; kept
        # unchanged in case other code reads these files by column name.
        data = {'rouge1': bert_r1, 'rouge2': bert_r2, 'rogueL': bert_rL, 'rogueLs': bert_rLs}
        scores = pd.DataFrame(data)
        scores.to_csv(r'BERT_3_scores.csv', index=False)
    if at_checkpoint:
        print(i)

In [None]:
# Report the mean of each ROUGE variant over the evaluated articles.
for label, values in (
    ('rouge1 average :', bert_r1),
    ('rouge2 average :', bert_r2),
    ('rougeL average :', bert_rL),
    ('rougeLs average :', bert_rLs),
):
    print(label, np.mean(values))