# Long Document Summarization

The BART-large-CNN model has some limitations when it comes to summarizing long documents. To address this, this notebook fine-tunes the model on the launch/gov_report dataset from HuggingFace, which contains long reports and their associated summaries written by government research agencies.

In [None]:
# Report current GPU utilization (via GPUtil) before any model is loaded.
from GPUtil import showUtilization as gpu_usage
gpu_usage()  

In [None]:
import os

# Restrict CUDA to the GPU with index 2 so that later cells only see
# (and allocate memory on) that single device.
os.environ.update({"CUDA_VISIBLE_DEVICES": "2"})

In [None]:
import warnings

# Suppress all warnings (every category derives from ``Warning``) so the
# notebook output stays readable during training.
warnings.filterwarnings(action='ignore', category=Warning)

In [None]:
#install needed packages
#!pip3 install transformers

In [None]:
#import needed libraries
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
from datasets import load_dataset
import torch
from torch.utils.data import DataLoader
import numpy as np
import evaluate

In [None]:
# Base checkpoint on the Hugging Face Hub that will be fine-tuned.
checkpoint = 'facebook/bart-large-cnn'

# Dataset of long reports paired with their reference summaries.
data = load_dataset("launch/gov_report")

# Train on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device("cpu")

# The tokenizer and the seq2seq model come from the same checkpoint so
# their vocabularies match.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
model.to(device)

def preprocess_function(examples, max_input_length=1000, max_target_length=550):
    """Tokenize a batch of documents and their reference summaries.

    Written for ``data.map(preprocess_function, batched=True)``, where
    ``examples`` maps column names to lists of values.

    Args:
        examples: Batch containing a ``"document"`` column (source texts)
            and a ``"summary"`` column (target texts).
        max_input_length: Maximum number of tokens kept per document;
            longer documents are truncated.
        max_target_length: Maximum number of tokens kept per summary;
            longer summaries are truncated.

    Returns:
        The tokenizer output for the documents, with an added ``"labels"``
        entry holding the token ids of the summaries.
    """
    model_inputs = tokenizer(
        examples["document"], max_length=max_input_length, truncation=True
    )
    # ``text_target`` tokenizes the summaries as decoder targets. It replaces
    # the deprecated ``tokenizer.as_target_tokenizer()`` context manager.
    labels = tokenizer(
        text_target=examples["summary"],
        max_length=max_target_length,
        truncation=True,
    )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Apply the tokenization and text preparation to every split of the data.
# Setting batched=True passes many rows to preprocess_function per call,
# which allows this to be done faster than mapping row by row.
tokenized_data= data.map(preprocess_function, batched=True)


# Number of examples per device for both training and evaluation.
batch_size = 6
model_name = checkpoint

# Fine-tuning configuration; checkpoints are written under ``output_dir``.
args = Seq2SeqTrainingArguments(
    output_dir='gov_doc_summarization',
    # NOTE(review): recent transformers releases rename this argument to
    # ``eval_strategy`` -- confirm against the installed version.
    evaluation_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    # Keep only the two most recent checkpoints on disk.
    save_total_limit=2,
    num_train_epochs=3,
    # Evaluate with generate() so compute_metrics receives generated
    # summaries rather than teacher-forced logits.
    predict_with_generate=True,
    # fp16 mixed precision is only supported on CUDA devices. Tying it to
    # CUDA availability keeps the CPU fallback used for ``device`` working
    # instead of failing when the arguments are built.
    fp16=torch.cuda.is_available(),
)

# Pads inputs and labels dynamically per batch; the model is passed so the
# collator can prepare decoder inputs from the labels.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)



In [None]:
# Load the ROUGE metric, which scores a produced summary by comparing it
# to the actual (reference) summary.
rouge_score= evaluate.load('rouge')

def _split_sentences(text):
    """Split *text* into sentences after '.', '!' or '?' followed by whitespace.

    This is a lightweight, dependency-free approximation of a sentence
    tokenizer. It is only used to put one sentence per line, which is the
    layout ROUGE-Lsum expects.
    """
    import re  # local import: nothing else in this notebook needs ``re``

    return [part for part in re.split(r"(?<=[.!?])\s+", text.strip()) if part]


def compute_metrics(eval_pred):
    """Compute ROUGE scores for a batch of generated summaries.

    Args:
        eval_pred: Pair ``(predictions, labels)`` of token-id arrays, as
            passed by the trainer when ``predict_with_generate`` is enabled.

    Returns:
        The dictionary produced by ``rouge_score.compute`` for the decoded
        predictions and references.
    """
    predictions, labels = eval_pred
    # -100 marks padded/ignored positions and is not a valid token id, so it
    # is replaced with the pad token before decoding. This applies to the
    # predictions too, which can be padded with -100 when batches of
    # different lengths are concatenated.
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    # Decode generated and reference summaries into text.
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    # ROUGE expects a newline after each sentence.
    decoded_preds = ["\n".join(_split_sentences(pred)) for pred in decoded_preds]
    decoded_labels = ["\n".join(_split_sentences(label)) for label in decoded_labels]
    # The trainer needs the metrics dictionary back in order to log it.
    return rouge_score.compute(
        predictions=decoded_preds, references=decoded_labels, use_stemmer=True
    )

In [None]:
# Drop the columns that came with the original dataset so that only the
# fields added by preprocess_function (tokenizer outputs and labels) remain.
tokenized_data = tokenized_data.remove_columns(data["train"].column_names)

In [None]:
# Bundle the model, training configuration, data splits, collator and
# metric function into a single trainer object.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Fine-tune the model on the training split using the settings in `args`.
trainer.train()

In [None]:
# Evaluate the fine-tuned model on the validation split given to the trainer.
trainer.evaluate()

In [None]:
# Save the fine-tuned model to the ./long_doc_summarizer directory.
trainer.save_model("./long_doc_summarizer")