**T5, BART, and BERT Evaluated on XL-Sum**

**Packages**

In [None]:
! pip install datasets --quiet
! pip install evaluate --quiet
! pip install rouge_score --quiet
! pip install sacrebleu --quiet
! pip install transformers --quiet
! pip install -q sentencepiece --quiet
! pip install summarizer --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m468.7/468.7 KB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.2/212.2 KB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.1/200.1 KB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 KB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.9/132.9 KB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m264.6/264.6 KB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m114.2/114.2 KB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import math

from datasets import load_dataset
import evaluate

import inspect

#let's make longer output readable without horizontal scrolling
from pprint import pprint

import warnings

import regex as re

import os, re
import time

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# These auto classes load the right type of tokenizer and model based on a model name
from transformers import AutoTokenizer, TFAutoModel
from transformers import pipeline
from transformers import AutoModel

**Necessary Functions**

In [None]:
# ROUGE metric; the scoring helpers read its rouge1/rouge2/rougeL/rougeLsum fields.
rouge = evaluate.load('rouge')

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [None]:
# chrF metric; the scoring helpers read its 'score' field.
chrf = evaluate.load("chrf")

Downloading builder script:   0%|          | 0.00/9.01k [00:00<?, ?B/s]

In [None]:
def get_default_args(func):
    """Map each parameter of `func` that declares a default to that default value.

    Parameters without a default are left out; keys follow signature order.
    """
    defaults = {}
    for name, param in inspect.signature(func).parameters.items():
        if param.default is inspect.Parameter.empty:
            continue
        defaults[name] = param.default
    return defaults

**Huggingface Transformers Training Resources**

https://github.com/huggingface/transformers/blob/main/examples/pytorch/summarization/run_summarization.py

https://www.databricks.com/blog/2023/03/20/fine-tuning-large-language-models-hugging-face-and-deepspeed.html

https://gitlab.com/nicolalandro/summarization

In [None]:
# Colab only: mount Google Drive so the /content/drive/ paths used below resolve.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:
# Evaluation sample for the "all categories" runs; the scoring helpers read its
# 'text' and 'summary' columns.
df = pd.read_csv('/content/drive/MyDrive/W266FinalProject/Datasets/xl_sum_sample_test.csv')
# df.head(5)

In [None]:
# Evaluation sample for the "Technology category" runs.
# NOTE(review): this is the same CSV that `df` is loaded from, so the technology
# runs would score identical data — confirm whether a technology-only file was intended.
df_tech = pd.read_csv('/content/drive/MyDrive/W266FinalProject/Datasets/xl_sum_sample_test.csv')

In [None]:
def generate_scores(mod, data, do_sample = False, num_beams = 1, top_k = 50, num_beam_groups = 1):
  """Summarize every row of `data` with `mod` and score each summary.

  Relies on the module-level `rouge` and `chrf` metric objects.

  Args:
    mod: Callable invoked as `mod(text, **generation_kwargs)`. Its result is
      read as `result[0]['summary_text']`.
    data: DataFrame (or mapping of sequences) with aligned 'text' and
      'summary' columns.
    do_sample, num_beams, top_k, num_beam_groups: Forwarded unchanged to `mod`.

  Returns:
    pd.DataFrame with one row per example and the columns 'target',
    'candidate', 'rouge1', 'rouge2', 'rougeL', 'rougeLs' and 'chrf'.
    'target' and 'candidate' hold one-element lists, the same form that is
    handed to the metrics.
  """
  targets = []
  candidates = []
  r1 = []
  r2 = []
  rL = []
  rLs = []
  chrfs = []

  # Walk the two columns positionally. Looking rows up with data['text'][i] is
  # label-based on a pandas Series and fails (or picks the wrong row) whenever
  # the index is not exactly 0..n-1, e.g. after filtering rows.
  for text, summary in zip(data['text'], data['summary']):

    output = mod(text,
                 truncation = True,
                 max_length = 256,
                 min_length = 0,
                 do_sample = do_sample,
                 num_beams = num_beams,
                 top_k = top_k,
                 num_beam_groups = num_beam_groups,
                 )[0]
    candidate = [output['summary_text']]
    ref = [summary]

    rouge_results = rouge.compute(predictions=candidate,
                                  references=ref)
    chrf_results = chrf.compute(predictions=candidate,
                                references=ref)

    targets.append(ref)
    candidates.append(candidate)
    r1.append(rouge_results['rouge1'])
    r2.append(rouge_results['rouge2'])
    rL.append(rouge_results['rougeL'])
    rLs.append(rouge_results['rougeLsum'])
    chrfs.append(chrf_results['score'])

  return pd.DataFrame({'target': targets, 'candidate': candidates,
                       'rouge1': r1, 'rouge2': r2, 'rougeL': rL,
                       'rougeLs': rLs, 'chrf': chrfs})

# T5 SCORES

### Base

In [None]:
# Summarization pipeline for the "t5-base" checkpoint (the "Base" baseline).
t5_base_summarizer = pipeline("summarization", model="t5-base")

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [None]:
# Score t5-base on `df`: 4 beams in 2 beam groups, sampling disabled.
t5_base_scores_df = generate_scores(t5_base_summarizer, df, do_sample = False, num_beams = 4, top_k = 75, num_beam_groups = 2)

Your max_length is set to 256, but you input_length is only 99. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=49)
Your max_length is set to 256, but you input_length is only 96. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=48)
Your max_length is set to 256, but you input_length is only 204. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=102)
Your max_length is set to 256, but you input_length is only 213. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=106)
Your max_length is set to 256, but you input_length is only 243. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=121)
Your max_length is set to 256, but you input_length is only 192. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=96)
Your max_length is set to 256, but you input_length is only 223. You might 

In [None]:
# Save the per-example t5-base scores to Drive.
t5_base_scores_df.to_csv('/content/drive/MyDrive/W266FinalProject/test_candidates/t5_all_categories_base.csv')

Technology category

In [None]:
# Technology-category run: score t5-base on `df_tech` (not the all-categories
# `df`), matching every other technology cell in this notebook.
t5_base_tech_scores_df = generate_scores(t5_base_summarizer, df_tech, do_sample = False, num_beams = 4, top_k = 75, num_beam_groups = 2)

Your max_length is set to 256, but you input_length is only 99. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=49)
Your max_length is set to 256, but you input_length is only 96. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=48)
Your max_length is set to 256, but you input_length is only 204. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=102)
Your max_length is set to 256, but you input_length is only 213. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=106)
Your max_length is set to 256, but you input_length is only 243. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=121)
Your max_length is set to 256, but you input_length is only 192. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=96)
Your max_length is set to 256, but you input_length is only 223. You might 

In [None]:
# Save the technology-category t5-base scores to Drive.
t5_base_tech_scores_df.to_csv('/content/drive/MyDrive/W266FinalProject/test_candidates/t5_tech_base.csv')

### Finetuned

In [None]:
# Summarization pipeline for revision 'model_0' of the fine-tuned T5 checkpoint.
t5_finetuned_summarizer = pipeline("summarization", model="arisanguyen/finetuned_T5_all_categories", revision = 'model_0')

In [None]:
# Score the fine-tuned T5 on `df` with the same decoding settings as the base run.
t5_scores_df = generate_scores(t5_finetuned_summarizer, df, do_sample = False, num_beams = 4, top_k = 75, num_beam_groups = 2)

In [None]:
# Save the per-example fine-tuned T5 scores to Drive.
t5_scores_df.to_csv('/content/drive/MyDrive/W266FinalProject/test_candidates/t5_all_categories_finetuned.csv')

Technology category

In [None]:
# Score the fine-tuned T5 on the technology sample `df_tech`.
t5_tech_scores_df = generate_scores(t5_finetuned_summarizer, df_tech, do_sample = False, num_beams = 4, top_k = 75, num_beam_groups = 2)

In [None]:
# Save the technology-category fine-tuned T5 scores to Drive.
t5_tech_scores_df.to_csv('/content/drive/MyDrive/W266FinalProject/test_candidates/t5_tech_finetuned.csv')

# BART SCORES

### Base

In [None]:
def generate_bart_scores(tokenizer, model, data, do_sample = False, num_beams = 1, top_k = 50, num_beam_groups = 1):
  """Summarize every row of `data` with a tokenizer/model pair and score each summary.

  Relies on the module-level `rouge` and `chrf` metric objects.

  Args:
    tokenizer: Tokenizer called with `return_tensors="pt"`; inputs are
      truncated to 512 tokens.
    model: Model exposing `generate(...)`; summaries are capped at 256 tokens.
    data: DataFrame (or mapping of sequences) with aligned 'text' and
      'summary' columns.
    do_sample, num_beams, top_k, num_beam_groups: Decoding options, all
      forwarded to `model.generate`.

  Returns:
    pd.DataFrame with one row per example and the columns 'target',
    'candidate', 'rouge1', 'rouge2', 'rougeL', 'rougeLs' and 'chrf'.
    'target' and 'candidate' hold one-element lists, the same form that is
    handed to the metrics.
  """
  targets = []
  candidates = []
  r1 = []
  r2 = []
  rL = []
  rLs = []
  chrfs = []

  # Walk the two columns positionally. Looking rows up with data["text"][i] is
  # label-based on a pandas Series and fails (or picks the wrong row) whenever
  # the index is not exactly 0..n-1, e.g. after filtering rows.
  for text, summary in zip(data["text"], data['summary']):
    inputs = tokenizer(text, return_tensors="pt", max_length=512, truncation=True)
    # Pass every decoding option the caller asked for; previously only
    # num_beams reached generate() and the rest were silently dropped.
    summary_ids = model.generate(inputs["input_ids"],
                                 attention_mask=inputs["attention_mask"],
                                 do_sample=do_sample,
                                 num_beams=num_beams,
                                 top_k=top_k,
                                 num_beam_groups=num_beam_groups,
                                 min_length=0,
                                 max_length=256)
    decoded = tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
    candidate = [decoded]
    ref = [summary]

    rouge_results = rouge.compute(predictions=candidate,
                                  references=ref)
    chrf_results = chrf.compute(predictions=candidate,
                                references=ref)

    targets.append(ref)
    candidates.append(candidate)
    r1.append(rouge_results['rouge1'])
    r2.append(rouge_results['rouge2'])
    rL.append(rouge_results['rougeL'])
    rLs.append(rouge_results['rougeLsum'])
    chrfs.append(chrf_results['score'])

  return pd.DataFrame({'target': targets, 'candidate': candidates,
                       'rouge1': r1, 'rouge2': r2, 'rougeL': rL,
                       'rougeLs': rLs, 'chrf': chrfs})

In [None]:
# Load the "facebook/bart-base" model and its tokenizer for the BART baseline.
from transformers import AutoTokenizer, BartForConditionalGeneration

bart_model = BartForConditionalGeneration.from_pretrained("facebook/bart-base")
bart_tokenizer = AutoTokenizer.from_pretrained("facebook/bart-base")

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [None]:
# Score bart-base on `df`.
bart_base_scores_df = generate_bart_scores(bart_tokenizer, bart_model, df, do_sample = False, num_beams = 4, top_k = 75, num_beam_groups = 2)

In [None]:
# Save the per-example bart-base scores to Drive.
bart_base_scores_df.to_csv('/content/drive/MyDrive/W266FinalProject/test_candidates/bart_all_categories_base.csv')

Technology category

In [None]:
# Score bart-base on the technology sample `df_tech`.
bart_base_tech_scores_df = generate_bart_scores(bart_tokenizer, bart_model, df_tech, do_sample = False, num_beams = 4, top_k = 75, num_beam_groups = 2)

In [None]:
# Save the technology-category bart-base scores to Drive.
bart_base_tech_scores_df.to_csv('/content/drive/MyDrive/W266FinalProject/test_candidates/bart_tech_base.csv')

### Finetuned

In [None]:
# Summarization pipeline for revision 'model_2' of the fine-tuned BART checkpoint.
bart_finetuned_summarizer = pipeline("summarization", model="arisanguyen/finetuned-BART-all-categories", revision = 'model_2')

In [None]:
# Decode deterministically, like the other evaluation runs in this notebook:
# sampling cannot be combined with num_beam_groups > 1 (diverse beam search),
# and unseeded sampling would make the saved scores irreproducible.
bart_scores_df = generate_scores(bart_finetuned_summarizer, df, do_sample = False, num_beams = 4, top_k = 75, num_beam_groups = 2)

In [None]:
# Save the per-example fine-tuned BART scores to Drive.
bart_scores_df.to_csv('/content/drive/MyDrive/W266FinalProject/test_candidates/bart_all_categories_finetuned.csv')

Technology category

In [None]:
# Decode deterministically, like the other evaluation runs in this notebook:
# sampling cannot be combined with num_beam_groups > 1 (diverse beam search),
# and unseeded sampling would make the saved scores irreproducible.
bart_tech_scores_df = generate_scores(bart_finetuned_summarizer, df_tech, do_sample = False, num_beams = 4, top_k = 75, num_beam_groups = 2)

In [None]:
# Save the technology-category fine-tuned BART scores to Drive.
bart_tech_scores_df.to_csv('/content/drive/MyDrive/W266FinalProject/test_candidates/bart_tech_finetuned.csv')

# BERT SCORES

In [None]:
from summarizer import Summarizer

_bert_extractive_model = Summarizer('bert-base-uncased')

def bert_model(text, **_generation_kwargs):
  """Adapt the extractive BERT summarizer to the interface generate_scores expects.

  generate_scores calls its model as `mod(text, truncation=..., max_length=...,
  do_sample=..., num_beams=..., ...)` and reads `result[0]['summary_text']`.
  The extractive summarizer is called with the text only and returns the summary
  as a plain string, so the generation options are accepted and ignored here and
  the string is wrapped in the pipeline-style `[{'summary_text': ...}]` shape.
  """
  return [{'summary_text': _bert_extractive_model(text)}]

In [None]:
# Score the BERT summarizer on `df`.
# NOTE(review): generate_scores passes the generation kwargs to `bert_model` and
# reads `[0]['summary_text']` from its result — confirm `bert_model` supports both.
bert_scores_df = generate_scores(bert_model, df, do_sample = False, num_beams = 4, top_k = 75, num_beam_groups = 2)

In [None]:
# Save the per-example BERT scores to Drive.
bert_scores_df.to_csv('/content/drive/MyDrive/W266FinalProject/test_candidates/bert_all_categories.csv')

Technology category

In [None]:
# Score the BERT summarizer on the technology sample `df_tech`.
# NOTE(review): generate_scores passes the generation kwargs to `bert_model` and
# reads `[0]['summary_text']` from its result — confirm `bert_model` supports both.
bert_tech_scores_df = generate_scores(bert_model, df_tech, do_sample = False, num_beams = 4, top_k = 75, num_beam_groups = 2)

In [None]:
# Save the technology-category BERT scores to Drive.
bert_tech_scores_df.to_csv('/content/drive/MyDrive/W266FinalProject/test_candidates/bert_tech.csv')