In [1]:
# !pip3 install transformers datasets==2.0.0 sentencepiece rouge_score sacrebleu 

In [2]:
# !pip3 install torch==1.11.0+cu113 torchvision==0.12.0+cu113 torchaudio==0.11.0+cu113 -f https://download.pytorch.org/whl/cu113/torch_stable.html

In [1]:
# Working folders, all relative to the notebook's working directory.
DATA_CACHE_PATH = "./cached_data"      # not referenced in the cells shown here
MODEL_CACHE_PATH = "./cached_models"   # passed as cache_dir to from_pretrained
PROCESSED_PATH = "./processed"         # same root the tokenized datasets are saved under

In [2]:
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

import datasets
from datasets import load_dataset, load_metric, load_from_disk
import sentencepiece

from torch.utils.data import Dataset

# NOTE(review): `tqdm` is imported twice. The second import rebinds the name,
# so the plain `tqdm.tqdm` (not the `tqdm.auto` variant) is what the later
# cells actually call. Keep only the one that is wanted.
from tqdm.auto import tqdm
from tqdm import tqdm

import pandas as pd
import numpy as np

# Use the GPU when CUDA is available, otherwise fall back to the CPU. The
# resulting string is read as a global by create_summaries.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")

Using device: cuda


## Tokenize Data
- To improve runtime, pre-tokenize the data and write it to disk
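- The whole split is tokenized as a single batch (`batch_size=None`), so every example is padded to the same length (512 tokens, via `padding="max_length"`)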
- Since each model requires a unique tokenizer, each dataset and model needs its own file (3 datasets x 3 models = 9 files)

In [2]:
def tokenize(batch, tokenizer, text_column="description", max_length=512):
    """Encode the text column of a batch as fixed-length model inputs.

    Args:
        batch: Mapping of column name -> list of values, as handed over by
            `Dataset.map(..., batched=True)`.
        tokenizer: Callable tokenizer (e.g. from `AutoTokenizer.from_pretrained`).
        text_column: Name of the column that holds the text to encode.
        max_length: Length every sequence is truncated or padded to.

    Returns:
        The tokenizer's output for `batch[text_column]`, unchanged.
    """
    return tokenizer(
        batch[text_column],
        max_length=max_length,
        # Pad to max_length (not to the longest item) so every row has the
        # same length regardless of which batch it was encoded in.
        padding="max_length",
        truncation=True,
    )

In [3]:
def write_tokenize(ckpt, ds_args, processed_dir="./processed"):
    """Tokenize a dataset with a checkpoint's tokenizer and save it to disk.

    Args:
        ckpt: Checkpoint name or local path accepted by
            `AutoTokenizer.from_pretrained`.
        ds_args: Keyword arguments forwarded to `load_dataset`. Must contain
            a 'path' key, which is also used as the output folder name.
        processed_dir: Root folder for the tokenized output.

    Returns:
        None. The encoded dataset is written to
        `{processed_dir}/{nickname}/{ds_args['path']}`.
    """
    # Filesystem-friendly checkpoint name: "/" and "_" both become "-".
    nickname = ckpt.replace("/", "-").replace("_", "-")
    ds = load_dataset(**ds_args)

    tokenizer = AutoTokenizer.from_pretrained(ckpt, cache_dir=MODEL_CACHE_PATH)

    # batch_size=None hands the whole split to `tokenize` in a single call.
    # (An alternative is to tokenize per batch and pad with a data collator.)
    df_encoded = ds.map(
        tokenize,
        fn_kwargs={'tokenizer': tokenizer},
        batched=True,
        batch_size=None,
        load_from_cache_file=False,
    )

    # add_column returns a NEW dataset instead of modifying in place; without
    # rebinding, the 'ground_truth' column would never be saved.
    df_encoded = df_encoded.add_column('ground_truth', list(ds['abstract']))

    df_encoded.save_to_disk(f"{processed_dir}/{nickname}/{ds_args['path']}")

In [3]:
# Checkpoints to process; uncomment entries to include them. Every entry ends
# with a comma so that toggling lines can never silently merge two adjacent
# string literals into a single (invalid) checkpoint name.
ckpts = [
    # "google/bigbird-pegasus-large-bigpatent",
    # "sshleifer/distill-pegasus-xsum-16-4",
    # "sshleifer/distilbart-xsum-12-1",
    # "t5-small",
    # "trained_models/t5_trained",
    # "trained_models/pegasus_trained",
    "trained_models/bart_trained",
]

# Keyword arguments forwarded to `load_dataset`; 'path' also names the output
# folder of the tokenized data.
PATENT_ARGS = {
    'path': 'big_patent',
    'name': "h",
    'split': 'test',
}

In [7]:
%%time 

# Tokenize and save the dataset once for every checkpoint listed in `ckpts`.
# To process a different dataset or split, pass a different ds_args dict here.
for ckpt in ckpts:
    write_tokenize(ckpt, PATENT_ARGS)

Reusing dataset big_patent (/home/ccmilne/.cache/huggingface/datasets/big_patent/h/1.0.0/bdefa7c0b39fba8bba1c6331b70b738e30d63c8ad4567f983ce315a5fef6131c)


  0%|          | 0/1 [00:00<?, ?ba/s]

CPU times: user 2min 29s, sys: 6.2 s, total: 2min 36s
Wall time: 2min 38s


## Summarization

In [4]:
import string
import re

def clean(s):
    """Strip punctuation and collapse whitespace in a string.

    Args:
        s: Input text.

    Returns:
        `s` with every character of `string.punctuation` removed, each run of
        whitespace (newlines included) replaced by one space, and leading and
        trailing whitespace stripped.
    """
    without_punct = s.translate(str.maketrans('', '', string.punctuation))
    # Raw string: '\s' in a normal literal is an invalid escape sequence.
    # Newlines are whitespace, so no separate '\n' removal is needed.
    return re.sub(r'\s+', ' ', without_punct).strip()

def chunks(list_of_elements, batch_size=5):
    """Lazily split a sliceable sequence into consecutive pieces.

    Each piece is `list_of_elements[start:start + batch_size]`; the last one
    may be shorter when the length is not a multiple of `batch_size`.
    """
    starts = range(0, len(list_of_elements), batch_size)
    yield from (list_of_elements[lo:lo + batch_size] for lo in starts)

def create_summaries(model_ckpt, total_articles, ds_args, batch_size=5,
                     processed_dir="./processed"):
    """Generate summaries for a pre-tokenized dataset with a seq2seq model.

    Args:
        model_ckpt: Checkpoint name or local path for
            `AutoModelForSeq2SeqLM` / `AutoTokenizer`.
        total_articles: Number of leading rows to summarize; a falsy value
            (None or 0) means the whole dataset.
        ds_args: Dataset arguments; only `ds_args['path']` is read, to locate
            the tokenized data on disk.
        batch_size: Number of rows passed to each `generate` call.
        processed_dir: Root folder that holds the tokenized datasets.

    Returns:
        dict with 'summaries' (decoded model outputs) and 'ground_truth'
        (the matching 'abstract' values), both in dataset row order.
    """
    nickname = model_ckpt.replace("/", "-").replace("_", "-")
    out_path = f"{processed_dir}/{nickname}/{ds_args['path']}"

    # The tokenized dataset must already have been written to this path.
    data = load_from_disk(out_path)

    if total_articles:
        # Clamp so that asking for more rows than exist does not raise.
        data = data.select(range(min(total_articles, len(data))))

    # Collect the references before set_format limits the returned columns.
    ground_truths = list(data['abstract'])
    data.set_format("torch", columns=["input_ids", "attention_mask"])

    torch.cuda.empty_cache()

    # Load model and tokenizer for the supplied checkpoint.
    model = AutoModelForSeq2SeqLM.from_pretrained(model_ckpt, cache_dir=MODEL_CACHE_PATH)
    tokenizer = AutoTokenizer.from_pretrained(model_ckpt, cache_dir=MODEL_CACHE_PATH)
    model.to(device)

    all_summaries = []
    # Ceiling division: lets tqdm show a total while batches are sliced lazily
    # instead of being materialized as one big list up front.
    num_batches = (len(data) + batch_size - 1) // batch_size

    try:
        for article_batch in tqdm(chunks(data, batch_size=batch_size), total=num_batches):
            with torch.no_grad():
                summaries = model.generate(
                    input_ids=article_batch["input_ids"].to(device),
                    attention_mask=article_batch["attention_mask"].to(device),
                    length_penalty=2.5,
                    num_beams=8)

            for s in summaries:
                decoded_sum = tokenizer.decode(s, skip_special_tokens=True, clean_up_tokenization_spaces=True)
                # Drop literal newlines and the '<n>' marker from the decoded text.
                decoded_sum = decoded_sum.replace('\n', '').replace('<n>', '')
                all_summaries.append(decoded_sum)
    finally:
        # Drop the model reference BEFORE emptying the cache; the other way
        # round the model's memory is still in use and cannot be released.
        del model
        torch.cuda.empty_cache()

    return {'summaries': all_summaries, 'ground_truth': ground_truths}

In [5]:
%%time

import os

# Create the output folder up front: a missing directory would otherwise make
# to_csv fail only after the long generation run, losing its results.
os.makedirs("data", exist_ok=True)

for ckpt in ckpts:
    results = create_summaries(ckpt, total_articles=None, ds_args=PATENT_ARGS, batch_size=32)
    # reset_index() turns the row number into an explicit 'index' column,
    # which is why the DataFrame index itself is not written.
    output_df = pd.DataFrame(results).reset_index()
    output_df.to_csv(f"data/output_{ckpt.replace('/','')}.csv", index=False)

100%|██████████| 447/447 [28:06<00:00,  3.77s/it]


CPU times: user 24min 10s, sys: 3min 56s, total: 28min 6s
Wall time: 28min 15s
