**Packages**

In [39]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 

from datasets import load_dataset
import evaluate

#let's make longer output readable without horizontal scrolling
from pprint import pprint

import warnings

import regex as re

**Necessary Functions**

In [40]:
import inspect

def get_default_args(func):
    signature = inspect.signature(func)
    return {
        k: v.default
        for k, v in signature.parameters.items()
        if v.default is not inspect.Parameter.empty
    }

In [41]:
rouge = evaluate.load('rouge')

In [42]:
chrf = evaluate.load("chrf")

**Data**

In [43]:
%pwd
%cd Datasets
%pwd

[Errno 2] No such file or directory: 'Datasets'
/home/ubuntu/w266_project/Datasets


'/home/ubuntu/w266_project/Datasets'

In [44]:
df = pd.read_csv('xl_sum_sample_val.csv')
df.rename(columns={"text": "article"}, inplace=True)

In [45]:
dft = pd.read_csv('xl_sum_sample_test.csv')
dft.rename(columns={"text": "article"}, inplace=True)

**Default Hyperparameters**

In [46]:
warnings.filterwarnings("ignore")

from summarizer import Summarizer

model = Summarizer()

loading configuration file config.json from cache at /home/ubuntu/.cache/huggingface/hub/models--bert-large-uncased/snapshots/80792f8e8216b29f3c846b653a0ff0a37c210431/config.json
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.28.0.dev0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file pytorch_model.bin from cache at /home/ubuntu/.cache/huggingface/hub/models--bert-large-uncased/snapshots/80792f8e8216b29f3c846b653a0ff0a37c21

In [47]:
get_default_args(Summarizer)

{'model': 'bert-large-uncased',
 'custom_model': None,
 'custom_tokenizer': None,
 'hidden': -2,
 'reduce_option': 'mean',
 'sentence_handler': <summarizer.sentence_handler.SentenceHandler at 0x7f081a6b7760>,
 'random_state': 12345,
 'hidden_concat': False}

In [48]:
get_default_args(model)

{'ratio': 0.2,
 'min_length': 40,
 'max_length': 600,
 'use_first': True,
 'algorithm': 'kmeans',
 'num_sentences': None,
 'return_as_list': False}

**BERT Extractive Model**

Model 0: Default Hyperparameters

In [49]:
warnings.filterwarnings("ignore")

from summarizer import Summarizer

model = Summarizer()

loading configuration file config.json from cache at /home/ubuntu/.cache/huggingface/hub/models--bert-large-uncased/snapshots/80792f8e8216b29f3c846b653a0ff0a37c210431/config.json
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.28.0.dev0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file pytorch_model.bin from cache at /home/ubuntu/.cache/huggingface/hub/models--bert-large-uncased/snapshots/80792f8e8216b29f3c846b653a0ff0a37c21

In [50]:
import warnings
warnings.filterwarnings("ignore")

bert_r1 = []
bert_r2 = []
bert_rL = []
bert_rLs = []
bert_chrf = []

for i in range(int(len(df['article']))):
    
    candidate = model(df['article'][i],
                     )
    candidate = [candidate]
    #pprint(candidate[0], compact=True)
    
    ref = [df['summary'][i]]
    
    results = rouge.compute(predictions=candidate,
                            references=ref)
    
    results2 = chrf.compute(predictions=candidate,
                            references= ref)
    
    bert_r1.append(results['rouge1'])
    bert_r2.append(results['rouge2'])
    bert_rL.append(results['rougeL'])
    bert_rLs.append(results['rougeLsum'])
    
    bert_chrf.append(results2['score'])
    
#     if i in np.arange(0, (len(df['article']) + 101), 100):
#         data = {'rouge1': bert_r1, 'rouge2': bert_r2, 'rogueL': bert_rL, 'rogueLs': bert_rLs, 'chrf': bert_chrf}
#         scores = pd.DataFrame(data)
#         scores.to_csv(r'BERT_0_scores.csv', index=False)
#         print(i)

# data = {'rouge1': bert_r1, 'rouge2': bert_r2, 'rogueL': bert_rL, 'rogueLs': bert_rLs, 'chrf': bert_chrf}
# scores = pd.DataFrame(data)
# scores.to_csv(r'BERT_0_scores.csv', index=False)
# print(i)

In [51]:
print('rouge1 average :', np.mean(bert_r1))
print('rouge2 average :', np.mean(bert_r2))
print('rougeL average :', np.mean(bert_rL))
print('rougeLs average :', np.mean(bert_rLs))
print('chrf average:', np.mean(bert_chrf))

rouge1 average : 0.1600066478049013
rouge2 average : 0.026148022779671355
rougeL average : 0.1076876885358689
rougeLs average : 0.1076876885358689
chrf average: 23.9904498468375


Model 1. Adjusted min_length, max_length, num_sentences (which overrides ratio)

In [52]:
from summarizer import Summarizer

In [53]:
# Using bert-base instead of bert-large to reduce run times
model = Summarizer(model='bert-base-uncased')

loading configuration file config.json from cache at /home/ubuntu/.cache/huggingface/hub/models--bert-base-uncased/snapshots/0a6aa9128b6194f4f3c4db429b6cb4891cdb421b/config.json
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.28.0.dev0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file pytorch_model.bin from cache at /home/ubuntu/.cache/huggingface/hub/models--bert-base-uncased/snapshots/0a6aa9128b6194f4f3c4db429b6cb4891cdb421

In [54]:
import warnings
warnings.filterwarnings("ignore")

bert_r1 = []
bert_r2 = []
bert_rL = []
bert_rLs = []
bert_chrf = []

for i in range(len(df['article'])):
    
    # Limiting number sentences in each summary generated to 2 sentences (average number of sentences per summary in train set)
    candidate = model(df['article'][i], 
                      num_sentences = 2, 
                      min_length = 0,
                      max_length = 256,        
                      ratio = None,
                      use_first = None,
                     )
    candidate = [candidate]
    #pprint(candidate[0], compact=True)
    
    ref = [df['summary'][i]]
    
    results = rouge.compute(predictions=candidate,
                            references=ref)
    
    results2 = chrf.compute(predictions=candidate,
                            references= ref)
    
    bert_r1.append(results['rouge1'])
    bert_r2.append(results['rouge2'])
    bert_rL.append(results['rougeL'])
    bert_rLs.append(results['rougeLsum'])
    
    bert_chrf.append(results2['score'])
    
#     if i in np.arange(0, (len(df['article']) + 101), 100):
#         data = {'rouge1': bert_r1, 'rouge2': bert_r2, 'rogueL': bert_rL, 'rogueLs': bert_rLs, 'chrf': bert_chrf}
#         scores = pd.DataFrame(data)
#         scores.to_csv(r'BERT_1_scores.csv', index=False)
#         print(i)
        
# data = {'rouge1': bert_r1, 'rouge2': bert_r2, 'rogueL': bert_rL, 'rogueLs': bert_rLs, 'chrf': bert_chrf}
# scores = pd.DataFrame(data)
# scores.to_csv(r'BERT_1_scores.csv', index=False)
# print(i)

In [55]:
print('rouge1 average :', np.mean(bert_r1))
print('rouge2 average :', np.mean(bert_r2))
print('rougeL average :', np.mean(bert_rL))
print('rougeLs average :', np.mean(bert_rLs))
print('chrf average:', np.mean(bert_chrf))

rouge1 average : 0.18574665627167028
rouge2 average : 0.02606738930501224
rougeL average : 0.12140470038266893
rougeLs average : 0.12140470038266893
chrf average: 26.329590625959923


Tuning Hyperparameters

2. Made num_sentences based off number of clusters instead of average 

In [56]:
warnings.filterwarnings("ignore")

from summarizer import Summarizer

model = Summarizer(model='bert-base-uncased',
                  )

loading configuration file config.json from cache at /home/ubuntu/.cache/huggingface/hub/models--bert-base-uncased/snapshots/0a6aa9128b6194f4f3c4db429b6cb4891cdb421b/config.json
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.28.0.dev0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file pytorch_model.bin from cache at /home/ubuntu/.cache/huggingface/hub/models--bert-base-uncased/snapshots/0a6aa9128b6194f4f3c4db429b6cb4891cdb421

In [57]:
warnings.filterwarnings("ignore")

bert_r1 = []
bert_r2 = []
bert_rL = []
bert_rLs = []
bert_chrf = []

for i in range(len(df['article'])):

    res = model.calculate_optimal_k(df['article'][i], k_max=10)
    
    candidate = model(df['article'][i], 
                      num_sentences = res, # number of sentences determined by number of clusters
                      min_length = 0,
                      max_length = 256,        
                      ratio = None,
                      use_first = None,)
    candidate = [candidate]
    #pprint(candidate[0], compact=True)
    
    ref = [df['summary'][i]]
    
    results = rouge.compute(predictions=candidate,
                            references=ref)
    
    results2 = chrf.compute(predictions=candidate,
                            references= ref)
    
    bert_chrf.append(results2['score'])
    
#     bert_r1.append(results['rouge1'])
#     bert_r2.append(results['rouge2'])
#     bert_rL.append(results['rougeL'])
#     bert_rLs.append(results['rougeLsum'])
    
#     if i in np.arange(0, (len(df['article']) + 101), 100):
#         data = {'rouge1': bert_r1, 'rouge2': bert_r2, 'rogueL': bert_rL, 'rogueLs': bert_rLs, 'chrf': bert_chrf}
#         scores = pd.DataFrame(data)
#         scores.to_csv(r'BERT_2_scores.csv', index=False)
#         print(i)
        
# data = {'rouge1': bert_r1, 'rouge2': bert_r2, 'rogueL': bert_rL, 'rogueLs': bert_rLs, 'chrf': bert_chrf}
# scores = pd.DataFrame(data)
# scores.to_csv(r'BERT_2_scores.csv', index=False)
# print(i)

In [58]:
print('rouge1 average :', np.mean(bert_r1))
print('rouge2 average :', np.mean(bert_r2))
print('rougeL average :', np.mean(bert_rL))
print('rougeLs average :', np.mean(bert_rLs))
print('chrf average:', np.mean(bert_chrf))

rouge1 average : nan
rouge2 average : nan
rougeL average : nan
rougeLs average : nan
chrf average: 25.487034086359877


3. Setting number of sentences with clusters didn't help in second model, so we are reverting back to first model but setting use_first = False bc many examples have meta data in first sentence.

In [59]:
from summarizer import Summarizer

warnings.filterwarnings("ignore")

model = Summarizer(model='bert-base-uncased',
                  )

loading configuration file config.json from cache at /home/ubuntu/.cache/huggingface/hub/models--bert-base-uncased/snapshots/0a6aa9128b6194f4f3c4db429b6cb4891cdb421b/config.json
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.28.0.dev0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file pytorch_model.bin from cache at /home/ubuntu/.cache/huggingface/hub/models--bert-base-uncased/snapshots/0a6aa9128b6194f4f3c4db429b6cb4891cdb421

In [60]:
import warnings
warnings.filterwarnings("ignore")

bert_r1 = []
bert_r2 = []
bert_rL = []
bert_rLs = []
bert_chrf = []

for i in range(len(df['article'])):
    
    # Limiting number sentences in each summary generated to 2 sentences 
    candidate = model(df['article'][i], 
                      num_sentences = 2, 
                      min_length = 0,
                      max_length = 256,        
                      ratio = None,
                      use_first = False,
                     )
    candidate = [candidate]
    #pprint(candidate[0], compact=True)
    
    ref = [df['summary'][i]]
    
    results = rouge.compute(predictions=candidate,
                            references=ref)
    
    results2 = chrf.compute(predictions=candidate,
                            references= ref)
    
    bert_chrf.append(results2['score'])
                            
    bert_r1.append(results['rouge1'])
    bert_r2.append(results['rouge2'])
    bert_rL.append(results['rougeL'])
    bert_rLs.append(results['rougeLsum'])
    
#     if i in np.arange(0, (len(df['article']) + 101), 100):
#         data = {'rouge1': bert_r1, 'rouge2': bert_r2, 'rogueL': bert_rL, 'rogueLs': bert_rLs, 'chrf': bert_chrf}
#         scores = pd.DataFrame(data)
#         scores.to_csv(r'BERT_3_scores.csv', index=False)
#         print(i)

# data = {'rouge1': bert_r1, 'rouge2': bert_r2, 'rogueL': bert_rL, 'rogueLs': bert_rLs, 'chrf': bert_chrf}
# scores = pd.DataFrame(data)
# scores.to_csv(r'BERT_3_scores.csv', index=False)
# print(i)

In [61]:
print('rouge1 average :', np.mean(bert_r1))
print('rouge2 average :', np.mean(bert_r2))
print('rougeL average :', np.mean(bert_rL))
print('rougeLs average :', np.mean(bert_rLs))
print('chrf average:', np.mean(bert_chrf))

rouge1 average : 0.18574665627167028
rouge2 average : 0.02606738930501224
rougeL average : 0.12140470038266893
rougeLs average : 0.12140470038266893
chrf average: 26.329590625959923


4. Setting use_first = True

In [62]:
warnings.filterwarnings("ignore")

from summarizer import Summarizer

model = Summarizer(model='bert-base-uncased',
                  )

loading configuration file config.json from cache at /home/ubuntu/.cache/huggingface/hub/models--bert-base-uncased/snapshots/0a6aa9128b6194f4f3c4db429b6cb4891cdb421b/config.json
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.28.0.dev0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file pytorch_model.bin from cache at /home/ubuntu/.cache/huggingface/hub/models--bert-base-uncased/snapshots/0a6aa9128b6194f4f3c4db429b6cb4891cdb421

In [63]:
import warnings
warnings.filterwarnings("ignore")

bert_r1 = []
bert_r2 = []
bert_rL = []
bert_rLs = []
bert_chrf = []

for i in range(len(df['article'])):
    
    # Limiting number sentences in each summary generated to 2 sentences 
    candidate = model(df['article'][i], 
                      num_sentences = 2, 
                      min_length = 0,
                      max_length = 256,        
                      ratio = None,
                      use_first = True,
                     )
    candidate = [candidate]
    #pprint(candidate[0], compact=True)
    
    ref = [df['summary'][i]]
    
    results = rouge.compute(predictions=candidate,
                            references=ref)
    
    results2 = chrf.compute(predictions=candidate,
                            references= ref)
    
    bert_chrf.append(results2['score'])
    
    bert_r1.append(results['rouge1'])
    bert_r2.append(results['rouge2'])
    bert_rL.append(results['rougeL'])
    bert_rLs.append(results['rougeLsum'])
    
#     if i in np.arange(0, (len(df['article']) + 101), 100):
#         data = {'rouge1': bert_r1, 'rouge2': bert_r2, 'rogueL': bert_rL, 'rogueLs': bert_rLs, 'chrf': bert_chrf}
#         scores = pd.DataFrame(data)
#         scores.to_csv(r'BERT_3_scores.csv', index=False)
#         print(i)

# data = {'rouge1': bert_r1, 'rouge2': bert_r2, 'rogueL': bert_rL, 'rogueLs': bert_rLs, 'chrf': bert_chrf}
# scores = pd.DataFrame(data)
# scores.to_csv(r'BERT_3_scores.csv', index=False)
# print(i)

In [64]:
print('rouge1 average :', np.mean(bert_r1))
print('rouge2 average :', np.mean(bert_r2))
print('rougeL average :', np.mean(bert_rL))
print('rougeLs average :', np.mean(bert_rLs))
print('chrf average:', np.mean(bert_chrf))

rouge1 average : 0.18420413444525
rouge2 average : 0.029384449091168774
rougeL average : 0.1187318906362462
rougeLs average : 0.1187318906362462
chrf average: 26.276875760389853


5. Use bert-large-uncased w/ best from above

In [65]:
warnings.filterwarnings("ignore")

from summarizer import Summarizer

model = Summarizer(model='bert-large-uncased',
                  )

loading configuration file config.json from cache at /home/ubuntu/.cache/huggingface/hub/models--bert-large-uncased/snapshots/80792f8e8216b29f3c846b653a0ff0a37c210431/config.json
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.28.0.dev0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file pytorch_model.bin from cache at /home/ubuntu/.cache/huggingface/hub/models--bert-large-uncased/snapshots/80792f8e8216b29f3c846b653a0ff0a37c21

In [66]:
import warnings
warnings.filterwarnings("ignore")

bert_r1 = []
bert_r2 = []
bert_rL = []
bert_rLs = []
bert_chrf = []

for i in range(len(df['article'])):
    
    # Limiting number sentences in each summary generated to 2 sentences 
    candidate = model(df['article'][i], 
                      num_sentences = 2, 
                      min_length = 0,
                      max_length = 256,        
                      ratio = None,
                      use_first = False,
                     )
    candidate = [candidate]
    #pprint(candidate[0], compact=True)
    
    ref = [df['summary'][i]]
    
    results = rouge.compute(predictions=candidate,
                            references=ref)
    
    results2 = chrf.compute(predictions=candidate,
                            references= ref)
    
    bert_chrf.append(results2['score'])
    
    bert_r1.append(results['rouge1'])
    bert_r2.append(results['rouge2'])
    bert_rL.append(results['rougeL'])
    bert_rLs.append(results['rougeLsum'])
    
#     if i in np.arange(0, (len(df['article']) + 101), 100):
#         data = {'rouge1': bert_r1, 'rouge2': bert_r2, 'rogueL': bert_rL, 'rogueLs': bert_rLs, 'chrf': bert_chrf}
#         scores = pd.DataFrame(data)
#         scores.to_csv(r'BERT_4_scores.csv', index=False)
#         print(i)

# data = {'rouge1': bert_r1, 'rouge2': bert_r2, 'rogueL': bert_rL, 'rogueLs': bert_rLs, 'chrf': bert_chrf}
# scores = pd.DataFrame(data)
# scores.to_csv(r'BERT_4_scores.csv', index=False)
# print(i)

In [67]:
# this model took longer and performed worse
print('rouge1 average :', np.mean(bert_r1))
print('rouge2 average :', np.mean(bert_r2))
print('rougeL average :', np.mean(bert_rL))
print('rougeLs average :', np.mean(bert_rLs))
print('chrf average:', np.mean(bert_chrf))

rouge1 average : 0.18045099952614682
rouge2 average : 0.02579665252992228
rougeL average : 0.1260598528415354
rougeLs average : 0.1260598528415354
chrf average: 25.93990117052502


6. Reduce option median

In [68]:
warnings.filterwarnings("ignore")

from summarizer import Summarizer

model = Summarizer(model='bert-base-uncased', reduce_option = 'median'
                  )

loading configuration file config.json from cache at /home/ubuntu/.cache/huggingface/hub/models--bert-base-uncased/snapshots/0a6aa9128b6194f4f3c4db429b6cb4891cdb421b/config.json
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.28.0.dev0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file pytorch_model.bin from cache at /home/ubuntu/.cache/huggingface/hub/models--bert-base-uncased/snapshots/0a6aa9128b6194f4f3c4db429b6cb4891cdb421

In [69]:
import warnings
warnings.filterwarnings("ignore")

bert_r1 = []
bert_r2 = []
bert_rL = []
bert_rLs = []
bert_chrf = []

for i in range(len(df['article'])):
    
    # Limiting number sentences in each summary generated to 2 sentences 
    candidate = model(df['article'][i], 
                      num_sentences = 2, 
                      min_length = 0,
                      max_length = 256,        
                      ratio = None,
                      use_first = None,
                     )
    candidate = [candidate]
    #pprint(candidate[0], compact=True)
    
    ref = [df['summary'][i]]
    
    results = rouge.compute(predictions=candidate,
                            references=ref)
    
    results2 = chrf.compute(predictions=candidate,
                            references= ref)
    
    bert_chrf.append(results2['score'])
    
    bert_r1.append(results['rouge1'])
    bert_r2.append(results['rouge2'])
    bert_rL.append(results['rougeL'])
    bert_rLs.append(results['rougeLsum'])
    
#     if i in np.arange(0, (len(df['article']) + 101), 100):
#         data = {'rouge1': bert_r1, 'rouge2': bert_r2, 'rogueL': bert_rL, 'rogueLs': bert_rLs, 'chrf': bert_chrf}
#         scores = pd.DataFrame(data)
#         scores.to_csv(r'BERT_4_scores.csv', index=False)
#         print(i)

# data = {'rouge1': bert_r1, 'rouge2': bert_r2, 'rogueL': bert_rL, 'rogueLs': bert_rLs, 'chrf': bert_chrf}
# scores = pd.DataFrame(data)
# scores.to_csv(r'BERT_6_scores.csv', index=False)
# print(i)

In [70]:
print('rouge1 average :', np.mean(bert_r1))
print('rouge2 average :', np.mean(bert_r2))
print('rougeL average :', np.mean(bert_rL))
print('rougeLs average :', np.mean(bert_rLs))
print('chrf average:', np.mean(bert_chrf))

rouge1 average : 0.18240691681643514
rouge2 average : 0.02531288951923227
rougeL average : 0.11788137317284689
rougeLs average : 0.11788137317284689
chrf average: 25.955566975088626


**7. Reduce option max**

In [71]:
warnings.filterwarnings("ignore")

from summarizer import Summarizer

model = Summarizer(model='bert-base-uncased', reduce_option = 'max'
                  )

loading configuration file config.json from cache at /home/ubuntu/.cache/huggingface/hub/models--bert-base-uncased/snapshots/0a6aa9128b6194f4f3c4db429b6cb4891cdb421b/config.json
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.28.0.dev0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file pytorch_model.bin from cache at /home/ubuntu/.cache/huggingface/hub/models--bert-base-uncased/snapshots/0a6aa9128b6194f4f3c4db429b6cb4891cdb421

In [None]:
import warnings
warnings.filterwarnings("ignore")

bert_r1 = []
bert_r2 = []
bert_rL = []
bert_rLs = []
bert_chrf = []

for i in range(len(df['article'])):
    
    # Limiting number sentences in each summary generated to 2 sentences 
    candidate = model(df['article'][i], 
                      num_sentences = 2, 
                      min_length = 0,
                      max_length = 256,        
                      ratio = None,
                      use_first = None,
                     )
    candidate = [candidate]
    #pprint(candidate[0], compact=True)
    
    ref = [df['summary'][i]]
    
    results = rouge.compute(predictions=candidate,
                            references=ref)
    
    results2 = chrf.compute(predictions=candidate,
                            references= ref)
    
    bert_chrf.append(results2['score'])
    
    bert_r1.append(results['rouge1'])
    bert_r2.append(results['rouge2'])
    bert_rL.append(results['rougeL'])
    bert_rLs.append(results['rougeLsum'])
    
#     if i in np.arange(0, (len(df['article']) + 101), 100):
#         data = {'rouge1': bert_r1, 'rouge2': bert_r2, 'rogueL': bert_rL, 'rogueLs': bert_rLs, 'chrf': bert_chrf}
#         scores = pd.DataFrame(data)
#         scores.to_csv(r'BERT_7_scores.csv', index=False)
#         print(i)

# data = {'rouge1': bert_r1, 'rouge2': bert_r2, 'rogueL': bert_rL, 'rogueLs': bert_rLs, 'chrf': bert_chrf}
# scores = pd.DataFrame(data)
# scores.to_csv(r'BERT_7_scores.csv', index=False)
# print(i)

In [None]:
print('rouge1 average :', np.mean(bert_r1))
print('rouge2 average :', np.mean(bert_r2))
print('rougeL average :', np.mean(bert_rL))
print('rougeLs average :', np.mean(bert_rLs))
print('chrf average:', np.mean(bert_chrf))

**Best BERT Extractive model applied to one category**

**Category 1: uk**

In [None]:
print('rouge1 average: ', np.mean(bert_r1))
print('rouge2 average: ', np.mean(bert_r2))
print('rougeL average: ', np.mean(bert_rL))
print('rougeLs average:', np.mean(bert_rLs))
print('chrf average:', np.mean(bert_chrf))

**Category 2: world**

In [None]:
print('rouge1 average: ', np.mean(bert_r1))
print('rouge2 average: ', np.mean(bert_r2))
print('rougeL average: ', np.mean(bert_rL))
print('rougeLs average:', np.mean(bert_rLs))
print('chrf average:', np.mean(bert_chrf))

**Category 3: business**

In [None]:
print('rouge1 average: ', np.mean(bert_r1))
print('rouge2 average: ', np.mean(bert_r2))
print('rougeL average: ', np.mean(bert_rL))
print('rougeLs average:', np.mean(bert_rLs))
print('chrf average:', np.mean(bert_chrf))

**Category 4: entertainment**

In [None]:
print('rouge1 average: ', np.mean(bert_r1))
print('rouge2 average: ', np.mean(bert_r2))
print('rougeL average: ', np.mean(bert_rL))
print('rougeLs average:', np.mean(bert_rLs))
print('chrf average:', np.mean(bert_chrf))

**Category 5: technology**

In [73]:
print('rouge1 average: ', np.mean(bert_r1))
print('rouge2 average: ', np.mean(bert_r2))
print('rougeL average: ', np.mean(bert_rL))
print('rougeLs average:', np.mean(bert_rLs))
print('chrf average:', np.mean(bert_chrf))

rouge1 average:  0.16956305823103446
rouge2 average:  0.020127432450957147
rougeL average:  0.11438383130539752
rougeLs average: 0.11438383130539752
chrf average: 26.085576408431866
