# Second attempt
- Here we tried to create a cleaner setup with reusable code
- We created a model and testing loop, with the idea of adding a chunking function into this setup
- We want to use the strategy of "Hierarchical summarization" and implement this into the model's prediction function for training and testing.
- The problem with this (and many tutorials) is that the generation of a hierarchical summarization can't be turned into just one function easily, and plugged into an existing setup.
- We did however find a good dataset we'd like to use to train the model.
- What we learned: Careful preparation and pre-planning are needed to implement a training loop using hierarchical summarization.

# Initialization
Install packages and load model

In [2]:
# Import all prereqs, set vars

from transformers import pipeline, set_seed

# import matplotlib.pyplot as plt
from datasets import load_dataset
import pandas as pd
from datasets import load_dataset, load_metric

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

import nltk
from nltk.tokenize import sent_tokenize

from tqdm import tqdm

import torch
nltk.download("punkt")
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Run on the GPU when CUDA is usable, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Identifier of the pretrained checkpoint used for the tokenizer and model
model_ckpt = "google/pegasus-cnn_dailymail"






[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Adam\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
# Load the tokenizer that matches the chosen checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

# Load the seq2seq weights first, then move them onto the selected device
model = AutoModelForSeq2SeqLM.from_pretrained(model_ckpt)
model = model.to(device)


Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-cnn_dailymail and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Define Helpers
Define our helper functions

In [4]:

# Batching helper: the data is consumed one slice at a time rather than
# all at once, so this is written as a generator.
def generate_batch_sized_chunks(list_of_elements, batch_size):
    """Yield consecutive slices of ``list_of_elements``.

    Every slice holds ``batch_size`` items except possibly the last one,
    which holds whatever remains. Nothing is yielded for an empty input.
    """
    offsets = range(0, len(list_of_elements), batch_size)
    yield from (
        list_of_elements[offset : offset + batch_size] for offset in offsets
    )


# This runs the model on the dataset in batches, and calculates the metric
def calculate_metric_on_test_ds(dataset, metric, model, tokenizer,
                               batch_size=16, device=device,
                               column_text="report",
                               column_summary="summary"):
    """Summarize ``dataset`` batch by batch and score the output with ``metric``.

    Args:
        dataset: Object indexable by column name; ``dataset[column_text]`` and
            ``dataset[column_summary]`` are sliced into batches.
        metric: Object providing ``add_batch(predictions=..., references=...)``
            and ``compute()``.
        model: Model whose ``generate`` method produces summary token ids.
            Presumably already placed on ``device`` by the caller.
        tokenizer: Tokenizer used both to encode the inputs and to decode the
            generated ids.
        batch_size: Number of documents summarized per ``generate`` call.
        device: Device the input tensors are moved to before generation.
        column_text: Name of the column holding the source documents.
        column_summary: Name of the column holding the reference summaries.

    Returns:
        The value returned by ``metric.compute()`` after all batches were added.
    """
    # Get the batches of text and the associated reference summaries
    text_batches = list(generate_batch_sized_chunks(dataset[column_text], batch_size))
    target_batches = list(generate_batch_sized_chunks(dataset[column_summary], batch_size))
    print("finished text batches")

    # Iterate over each batch
    for text_batch, target_batch in tqdm(zip(text_batches, target_batches), total=len(text_batches)):
        print("batch run len:", len(text_batch))

        # Tokenize the input batch; documents longer than 1024 tokens are cut off
        inputs = tokenizer(text_batch, max_length=1024, truncation=True,
                           padding="max_length", return_tensors="pt")
        print("tokenized")

        # Generate summaries for the input batch.
        # length_penalty is set (per the original author's note) so that the
        # model does not generate sequences that are too long.
        summaries = model.generate(input_ids=inputs["input_ids"].to(device),
                                   attention_mask=inputs["attention_mask"].to(device),
                                   length_penalty=0.8, num_beams=8, max_length=128)
        print("summarized")

        # Decode (de-tokenize) the summaries into real text so we can evaluate them
        decoded_summaries = [tokenizer.decode(s, skip_special_tokens=True,
                                              clean_up_tokenization_spaces=True)
                             for s in summaries]
        print("decoded")

        # Pegasus marks line breaks with the literal "<n>" token; turn it into a
        # space. The marker must be spelled out: replacing the empty string
        # would insert a space between every character of the summary.
        decoded_summaries = [d.replace("<n>", " ") for d in decoded_summaries]

        metric.add_batch(predictions=decoded_summaries, references=target_batch)
        print("added batch")

    # Finally compute and return the scores.
    score = metric.compute()
    return score


    

# Load Data

In [5]:
# Fetch the GovReport summarization dataset; the result maps each split name
# (train, validation, test) to its data.
datasetDict = load_dataset("ccdv/govreport-summarization")

# Number of rows in every split, in the dataset's own split order
split_lengths = [len(split_data) for split_data in datasetDict.values()]

# Sanity check: one descriptive line per split
print("Our dataset:")
splits = [
    f"Set '{split_name}': rows:{split_data.num_rows}, features:{split_data.column_names}"
    for split_name, split_data in datasetDict.items()
]
print("\n".join(splits))

# Print a sample report and its reference summary from the test split
print("\nReport:", datasetDict["test"][1]["report"], sep="\n")
print("\nSummary:", datasetDict["test"][1]["summary"], sep="\n")

No config specified, defaulting to: govreport-summarization/document
Found cached dataset govreport-summarization (C:/Users/Adam/.cache/huggingface/datasets/ccdv___govreport-summarization/document/1.0.0/57ca3042de9c40c218cc94084cbc80a99a161036134bfc88112c57d251443590)


  0%|          | 0/3 [00:00<?, ?it/s]

Our dataset:
Set 'train': rows:17517, features:['report', 'summary']
Set 'validation': rows:973, features:['report', 'summary']
Set 'test': rows:973, features:['report', 'summary']

Report:
A variety of federal laws, regulations, and policies establish requirements and guidance for EPA to follow when appointing members to serve on advisory committees. For example, one purpose of FACA is to ensure that uniform procedures govern the establishment and operation of advisory committees. Also under FACA, an agency establishing an advisory committee must, among other things, require the committee’s membership to be balanced in terms of the points of view represented and the functions to be performed by the committee. In addition, federal ethics regulations establish when and how federal officials should review financial disclosure forms to identify and prevent conflicts of interest prohibited by federal law for any prospective committee members required to file these forms in connection with 

# Train

In [6]:
# Trial run of the evaluation helper: score the model's summaries with ROUGE.
# Note that this passes the whole "train" split, so every row of it is summarized.
rouge_metric = load_metric("rouge")

score = calculate_metric_on_test_ds(
    dataset=datasetDict["train"],
    metric=rouge_metric,
    model=model,
    tokenizer=tokenizer,
)

  rouge_metric = load_metric('rouge')


KeyboardInterrupt: 

In [None]:
# Quick smoke test: summarize a single test report through the high-level pipeline API
pipe = pipeline('summarization', model = model_ckpt )

# The report is passed in full; ask the pipeline to truncate it to the model's
# input limit, matching the truncation the evaluation helper in this notebook
# applies, instead of feeding an over-long sequence to the model.
pipe_out = pipe(datasetDict['test'][0]['report'], truncation=True)

print(pipe_out)