**Packages**

In [None]:
! pip install datasets --quiet
! pip install evaluate --quiet
! pip install rouge_score --quiet
! pip install sacrebleu --quiet
! pip install transformers --quiet
! pip install -q sentencepiece --quiet
! pip install summarizer --quiet
! pip install bert-extractive-summarizer --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m468.7/468.7 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.9/132.9 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.2/212.2 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.1/200.1 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m264.6/264.6 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m158.8/158.8 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import math

from datasets import load_dataset
import evaluate

import inspect

#let's make longer output readable without horizontal scrolling
from pprint import pprint

import warnings

import regex as re

import os, re
import time

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# These auto classes load the right type of tokenizer and model based on a model name
from transformers import AutoTokenizer, TFAutoModel
from transformers import pipeline
from transformers import AutoModel

**Necessary Functions**

In [None]:
# ROUGE metric object; the scoring functions in this notebook call `rouge.compute(...)` on it.
rouge = evaluate.load('rouge')

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [None]:
# chrF metric object; the scoring functions in this notebook call `chrf.compute(...)` on it.
chrf = evaluate.load("chrf")

Downloading builder script:   0%|          | 0.00/9.01k [00:00<?, ?B/s]

In [None]:
def get_default_args(func):
    """Collect the default values declared in the signature of ``func``.

    Returns a dict mapping parameter name to default value, in signature
    order. Parameters that declare no default are left out.
    """
    defaults = {}
    for name, param in inspect.signature(func).parameters.items():
        # `Parameter.empty` is the sentinel inspect uses for "no default given".
        if param.default is inspect.Parameter.empty:
            continue
        defaults[name] = param.default
    return defaults

**Huggingface Transformers Training Resources**

https://github.com/huggingface/transformers/blob/main/examples/pytorch/summarization/run_summarization.py

https://www.databricks.com/blog/2023/03/20/fine-tuning-large-language-models-hugging-face-and-deepspeed.html

https://gitlab.com/nicolalandro/summarization

**Load Data**

In [None]:
# Evaluation sample used under the "All Categories" headings; the scoring functions read its
# 'text' and 'summary' columns.
df = pd.read_csv('../Data/xl_sum_sample_test.csv')

In [None]:
# Second evaluation sample, scored under the "Proper Nouns" headings.
# NOTE(review): the file and variable say "pronouns" while the headings say "Proper Nouns" —
# confirm which one is meant.
df_pronouns = pd.read_csv('../Data/xl_sum_sample_test_pronouns.csv')

In [None]:
def generate_scores(mod, data, do_sample = False, num_beams = 1, top_k = 50, num_beam_groups = 1):
    """Summarize every text in ``data`` with ``mod`` and score it against its reference.

    Parameters
    ----------
    mod : callable
        Summarizer called as ``mod(text, truncation=True, max_length=256, min_length=0, ...)``.
        It must return a sequence whose first element is a mapping with a
        ``'summary_text'`` entry (e.g. a Hugging Face summarization pipeline).
    data : mapping of column name -> iterable
        Must provide ``'text'`` (inputs) and ``'summary'`` (references), e.g. a DataFrame.
    do_sample, num_beams, top_k, num_beam_groups
        Forwarded unchanged to ``mod`` as decoding options.

    Returns
    -------
    pandas.DataFrame
        One row per input with columns ``target``, ``candidate``, ``rouge1``, ``rouge2``,
        ``rougeL``, ``rougeLs`` and ``chrf``. ``target`` and ``candidate`` each hold a
        one-element list, i.e. exactly what was passed to the metrics.

    Notes
    -----
    Uses the module-level ``rouge`` and ``chrf`` metric objects.
    """
    columns = ['target', 'candidate', 'rouge1', 'rouge2', 'rougeL', 'rougeLs', 'chrf']
    rows = []

    # Pair inputs and references by position. Indexing with `data['text'][i]` is label-based
    # on a pandas Series and fails (or misaligns) when the index is not 0..n-1, e.g. after
    # filtering or sampling the frame.
    for text, reference in zip(data['text'], data['summary']):
        generated = mod(text,
                        truncation = True,
                        max_length = 256,
                        min_length = 0,
                        do_sample = do_sample,
                        num_beams = num_beams,
                        top_k = top_k,
                        num_beam_groups = num_beam_groups,
                        )[0]

        # The metrics take parallel lists of predictions and references.
        candidate = [generated['summary_text']]
        ref = [reference]

        rouge_result = rouge.compute(predictions=candidate, references=ref)
        chrf_result = chrf.compute(predictions=candidate, references=ref)

        rows.append({
            'target': ref,
            'candidate': candidate,
            'rouge1': rouge_result['rouge1'],
            'rouge2': rouge_result['rouge2'],
            'rougeL': rouge_result['rougeL'],
            'rougeLs': rouge_result['rougeLsum'],
            'chrf': chrf_result['score'],
        })

    # Passing `columns` keeps the schema stable even when `data` is empty.
    return pd.DataFrame(rows, columns=columns)

# Baseline

In [None]:
def baseline_scores(data):
    """Score a "first three sentences" baseline against the reference summaries.

    The candidate for each text is its first three segments when split on ``'. '``.

    Parameters
    ----------
    data : mapping of column name -> iterable
        Must provide ``'text'`` (inputs) and ``'summary'`` (references), e.g. a DataFrame.

    Returns
    -------
    pandas.DataFrame
        One row per input with columns ``target``, ``candidate``, ``rouge1``, ``rouge2``,
        ``rougeL``, ``rougeLs`` and ``chrf``. ``target`` and ``candidate`` each hold a
        one-element list, i.e. exactly what was passed to the metrics.

    Notes
    -----
    Uses the module-level ``rouge`` and ``chrf`` metric objects.
    """
    columns = ['target', 'candidate', 'rouge1', 'rouge2', 'rougeL', 'rougeLs', 'chrf']
    rows = []

    # Pair inputs and references by position; `data['text'][i]` is label-based on a pandas
    # Series and fails when the index is not 0..n-1.
    for text, reference in zip(data['text'], data['summary']):
        # first three sentences
        sentences = text.split('. ')
        lead = '. '.join(sentences[:3])
        # Splitting on '. ' removes the period of the third sentence only when more text
        # follows it, so restore it only in that case. Appending it unconditionally turned
        # short texts such as "One. Two." into "One. Two..".
        if len(sentences) > 3:
            lead += '.'

        # The metrics take parallel lists of predictions and references.
        candidate = [lead]
        ref = [reference]

        rouge_result = rouge.compute(predictions=candidate, references=ref)
        chrf_result = chrf.compute(predictions=candidate, references=ref)

        rows.append({
            'target': ref,
            'candidate': candidate,
            'rouge1': rouge_result['rouge1'],
            'rouge2': rouge_result['rouge2'],
            'rougeL': rouge_result['rougeL'],
            'rougeLs': rouge_result['rougeLsum'],
            'chrf': chrf_result['score'],
        })

    # Passing `columns` keeps the schema stable even when `data` is empty.
    return pd.DataFrame(rows, columns=columns)

*All Categories*

In [None]:
# First-three-sentences baseline on the all-categories sample.
baseline_score_df = baseline_scores(df)

In [None]:
# Save the baseline scores as CSV (relative path, so it lands in the working directory).
baseline_score_df.to_csv('baseline_all_categories.csv')

*Proper Nouns*

In [None]:
# Same baseline on the `df_pronouns` sample.
baseline_pronouns = baseline_scores(df_pronouns)

In [None]:
# Save the baseline scores for the `df_pronouns` sample as CSV.
baseline_pronouns.to_csv('baseline_pronouns.csv')

# T5 SCORES

### Base

In [None]:
# Summarization pipeline around the stock "t5-base" checkpoint (the non-fine-tuned reference).
t5_base_summarizer = pipeline("summarization", model="t5-base")

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/892M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


*All Categories*

In [None]:
# Score stock T5 on the all-categories sample: 4 beams in 2 beam groups, no sampling.
# NOTE(review): top_k is a sampling option, so with do_sample=False it presumably has no effect — confirm.
t5_base_scores_df = generate_scores(t5_base_summarizer, df, do_sample = False, num_beams = 4, top_k = 75, num_beam_groups = 2)

In [None]:
# Save the stock-T5 scores for the all-categories sample as CSV.
t5_base_scores_df.to_csv('t5_all_categories_base.csv')

*Proper Nouns*

In [None]:
# Score stock T5 on the `df_pronouns` sample with the same decoding settings as the all-categories run.
t5_base_pronouns_scores_df = generate_scores(t5_base_summarizer, df_pronouns, do_sample = False, num_beams = 4, top_k = 75, num_beam_groups = 2)

Your max_length is set to 256, but you input_length is only 204. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=102)
Your max_length is set to 256, but you input_length is only 192. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=96)
Your max_length is set to 256, but you input_length is only 240. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=120)


In [None]:
# Save the stock-T5 scores for the `df_pronouns` sample as CSV.
t5_base_pronouns_scores_df.to_csv('t5_pronouns_base.csv')

### Finetuned

In [None]:
# Summarization pipeline around the fine-tuned T5 checkpoint, pinned to revision 'model_0'.
t5_finetuned_summarizer = pipeline("summarization", model="arisanguyen/finetuned_T5_all_categories", revision = 'model_0')

Downloading (…)/model_0/config.json:   0%|          | 0.00/1.48k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/242M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.39k [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)del_0/tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

*All Categories*

In [None]:
# Score fine-tuned T5 on the all-categories sample with the same decoding settings as the stock-T5 run.
t5_scores_df = generate_scores(t5_finetuned_summarizer, df, do_sample = False, num_beams = 4, top_k = 75, num_beam_groups = 2)

In [None]:
# Save the fine-tuned-T5 scores for the all-categories sample as CSV.
t5_scores_df.to_csv('t5_all_categories_finetuned.csv')

*Proper Nouns*

In [None]:
# Score fine-tuned T5 on the `df_pronouns` sample with the same decoding settings as the all-categories run.
# NOTE: the variable name is spelled "prounouns"; it is left as-is because the export cell uses the same spelling.
t5_prounouns_scores_df = generate_scores(t5_finetuned_summarizer, df_pronouns, do_sample = False, num_beams = 4, top_k = 75, num_beam_groups = 2)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Your max_length is set to 256, but you input_length is only 204. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=102)
Your max_length is set to 256, but you input_length is only 192. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=96)
Your max_length is set to 256, but you input_length is only 240. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=120)


In [None]:
# Save the fine-tuned-T5 scores for the `df_pronouns` sample as CSV.
# NOTE: "prounouns" matches the (misspelled) name assigned in the scoring cell.
t5_prounouns_scores_df.to_csv('t5_pronouns_finetuned.csv')

# BART SCORES

### Base

In [None]:
def generate_bart_scores(model, data, do_sample = False, num_beams = 1, top_k = 50):
    """Summarize every text in ``data`` with ``model`` and score it against its reference.

    Same procedure as ``generate_scores``, except that no ``num_beam_groups`` option is
    accepted or forwarded.

    Parameters
    ----------
    model : callable
        Summarizer called as ``model(text, truncation=True, max_length=256, min_length=0, ...)``.
        It must return a sequence whose first element is a mapping with a
        ``'summary_text'`` entry (e.g. a Hugging Face summarization pipeline).
    data : mapping of column name -> iterable
        Must provide ``'text'`` (inputs) and ``'summary'`` (references), e.g. a DataFrame.
    do_sample, num_beams, top_k
        Forwarded unchanged to ``model`` as decoding options.

    Returns
    -------
    pandas.DataFrame
        One row per input with columns ``target``, ``candidate``, ``rouge1``, ``rouge2``,
        ``rougeL``, ``rougeLs`` and ``chrf``. ``target`` and ``candidate`` each hold a
        one-element list, i.e. exactly what was passed to the metrics.

    Notes
    -----
    Uses the module-level ``rouge`` and ``chrf`` metric objects.
    """
    columns = ['target', 'candidate', 'rouge1', 'rouge2', 'rougeL', 'rougeLs', 'chrf']
    rows = []

    # Pair inputs and references by position. Indexing with `data['text'][i]` is label-based
    # on a pandas Series and fails (or misaligns) when the index is not 0..n-1.
    for text, reference in zip(data['text'], data['summary']):
        generated = model(text,
                          truncation = True,
                          max_length = 256,
                          min_length = 0,
                          do_sample = do_sample,
                          num_beams = num_beams,
                          top_k = top_k,
                          )[0]

        # The metrics take parallel lists of predictions and references.
        candidate = [generated['summary_text']]
        ref = [reference]

        rouge_result = rouge.compute(predictions=candidate, references=ref)
        chrf_result = chrf.compute(predictions=candidate, references=ref)

        rows.append({
            'target': ref,
            'candidate': candidate,
            'rouge1': rouge_result['rouge1'],
            'rouge2': rouge_result['rouge2'],
            'rougeL': rouge_result['rougeL'],
            'rougeLs': rouge_result['rougeLsum'],
            'chrf': chrf_result['score'],
        })

    # Passing `columns` keeps the schema stable even when `data` is empty.
    return pd.DataFrame(rows, columns=columns)

In [None]:
# Summarization pipeline around the stock "facebook/bart-base" checkpoint (the non-fine-tuned reference).
bart_base_summarizer = pipeline("summarization", model="facebook/bart-base")

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/558M [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

*All Categories*

In [None]:
# Score stock BART on the all-categories sample with sampling enabled (4 beams, top_k=75).
# NOTE(review): do_sample=True makes this run stochastic and no seed is set in the visible cells —
# confirm whether the saved scores need to be reproducible.
bart_base_scores_df = generate_bart_scores(bart_base_summarizer, df, do_sample = True, num_beams = 4, top_k = 75)

Your max_length is set to 128, but you input_length is only 86. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=43)
Your max_length is set to 128, but you input_length is only 92. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=46)
Your max_length is set to 128, but you input_length is only 70. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=35)


In [None]:
# Save the stock-BART scores for the all-categories sample as CSV.
bart_base_scores_df.to_csv('bart_all_categories_base.csv')

*Proper Nouns*

In [None]:
# Score stock BART on the `df_pronouns` sample with the same decoding settings as the
# all-categories BART base run. `generate_bart_scores` has no `num_beam_groups` parameter,
# so passing that keyword raised TypeError; it is dropped here.
bart_base_pronouns_scores_df = generate_bart_scores(bart_base_summarizer, df_pronouns, do_sample = True, num_beams = 4, top_k = 75)

In [None]:
# Save the stock-BART scores for the `df_pronouns` sample as CSV.
bart_base_pronouns_scores_df.to_csv('bart_pronouns_base.csv')

### Finetuned

In [None]:
# Summarization pipeline around the fine-tuned BART checkpoint, pinned to revision 'model_2'.
bart_finetuned_summarizer = pipeline("summarization", model="arisanguyen/finetuned-BART-all-categories", revision = 'model_2')

Downloading (…)/model_2/config.json:   0%|          | 0.00/1.74k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/558M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/297 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/384 [00:00<?, ?B/s]

Downloading (…)e/model_2/vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)e/model_2/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)del_2/tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

*All Categories*

In [None]:
# Score fine-tuned BART on the all-categories sample: 4 beams in 2 beam groups, no sampling.
# transformers rejects num_beam_groups > 1 together with do_sample=True (diverse beam search
# cannot sample), so sampling is disabled, matching the `df_pronouns` run for this model.
bart_scores_df = generate_scores(bart_finetuned_summarizer, df, do_sample = False, num_beams = 4, top_k = 75, num_beam_groups = 2)

In [None]:
# Save the fine-tuned-BART scores for the all-categories sample as CSV.
bart_scores_df.to_csv('bart_all_categories_finetuned.csv')

*Proper Nouns*

In [None]:
# Score fine-tuned BART on the `df_pronouns` sample: 4 beams in 2 beam groups, no sampling.
bart_pronouns_scores_df = generate_scores(bart_finetuned_summarizer, df_pronouns, do_sample = False, num_beams = 4, top_k = 75, num_beam_groups = 2)

Your max_length is set to 256, but you input_length is only 187. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=93)
Your max_length is set to 256, but you input_length is only 175. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=87)
Your max_length is set to 256, but you input_length is only 217. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=108)
Your max_length is set to 256, but you input_length is only 244. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=122)
Your max_length is set to 256, but you input_length is only 234. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=117)


In [None]:
# Save the fine-tuned-BART scores for the `df_pronouns` sample as CSV.
bart_pronouns_scores_df.to_csv('bart_pronouns_finetuned.csv')

# BERT SCORES

In [None]:
def generate_bert_scores(model, data, do_sample = False, num_beams = 1, top_k = 50, num_beam_groups = 1):
    """Summarize every text in ``data`` with ``model`` and score it against its reference.

    Parameters
    ----------
    model : callable
        Summarizer called as ``model(text, num_sentences=2, min_length=0, max_length=256,
        ratio=None, use_first=False)``. Its return value is used directly as the candidate
        summary, so it is expected to be a string.
    data : mapping of column name -> iterable
        Must provide ``'text'`` (inputs) and ``'summary'`` (references), e.g. a DataFrame.
    do_sample, num_beams, top_k, num_beam_groups
        Accepted so the signature matches ``generate_scores``; not used by this function
        and not forwarded to ``model``.

    Returns
    -------
    pandas.DataFrame
        One row per input with columns ``target``, ``candidate``, ``rouge1``, ``rouge2``,
        ``rougeL``, ``rougeLs`` and ``chrf``. ``target`` and ``candidate`` each hold a
        one-element list, i.e. exactly what was passed to the metrics.

    Notes
    -----
    Uses the module-level ``rouge`` and ``chrf`` metric objects.
    """
    # Column names follow the other scoring functions; this function previously emitted the
    # misspelled keys 'rogueL' / 'rogueLs'.
    columns = ['target', 'candidate', 'rouge1', 'rouge2', 'rougeL', 'rougeLs', 'chrf']
    rows = []

    # Pair inputs and references by position; `data['text'][i]` is label-based on a pandas
    # Series and fails when the index is not 0..n-1.
    for text, reference in zip(data['text'], data['summary']):
        summary = model(text,
                        num_sentences = 2,
                        min_length = 0,
                        max_length = 256,
                        ratio = None,
                        use_first = False,
                        )

        # The metrics take parallel lists of predictions and references.
        candidate = [summary]
        ref = [reference]

        rouge_result = rouge.compute(predictions=candidate, references=ref)
        chrf_result = chrf.compute(predictions=candidate, references=ref)

        rows.append({
            'target': ref,
            'candidate': candidate,
            'rouge1': rouge_result['rouge1'],
            'rouge2': rouge_result['rouge2'],
            'rougeL': rouge_result['rougeL'],
            'rougeLs': rouge_result['rougeLsum'],
            'chrf': chrf_result['score'],
        })

    # Passing `columns` keeps the schema stable even when `data` is empty.
    return pd.DataFrame(rows, columns=columns)

In [None]:
# NOTE: this import sits next to its use instead of in the import cell at the top of the notebook.
# The install cell adds both `summarizer` and `bert-extractive-summarizer`; presumably the latter
# provides this class — confirm.
from summarizer import Summarizer
# Summarizer built on the "bert-base-uncased" checkpoint; called per text by generate_bert_scores.
bert_model = Summarizer('bert-base-uncased')

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

*All Categories*

In [None]:
# Score the BERT summarizer on the all-categories sample.
# NOTE: generate_bert_scores does not forward do_sample / num_beams / top_k / num_beam_groups
# to the model, so these arguments have no effect on the result.
bert_scores_df = generate_bert_scores(bert_model, df, do_sample = False, num_beams = 4, top_k = 75, num_beam_groups = 2)

In [None]:
# Save the BERT summarizer scores for the all-categories sample as CSV.
bert_scores_df.to_csv('bert_all_categories.csv')

*Proper Nouns*

In [None]:
# Score the BERT summarizer on the `df_pronouns` sample.
# NOTE: do_sample=True differs from the all-categories call, but generate_bert_scores does not
# forward any of the decoding arguments to the model, so the difference has no effect.
bert_pronouns_scores_df = generate_bert_scores(bert_model, df_pronouns, do_sample = True, num_beams = 4, top_k = 75, num_beam_groups = 2)

In [None]:
# Save the BERT summarizer scores for the `df_pronouns` sample as CSV.
bert_pronouns_scores_df.to_csv('bert_pronouns.csv')