In [None]:
!pip install rouge_score

In [1]:
import os

# Set before torch is imported: asks PyTorch to fall back to the CPU for
# operators that are not implemented on the MPS backend.
os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1'

import torch

# Always a torch.device (previously the CPU fallback was the plain string
# 'cpu', so DEVICE had a different type depending on the machine).
DEVICE = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
DEVICE

  from .autonotebook import tqdm as notebook_tqdm


device(type='mps')

In [2]:
from datasets import load_dataset, load_metric

# Load the arXiv configuration of the scientific_papers dataset.
# The library warns that this dataset needs `trust_remote_code=True` and that
# the flag becomes mandatory in the next major `datasets` release, so opt in
# explicitly. NOTE: this executes the dataset's loading script locally.
dataset = load_dataset('scientific_papers', 'arxiv', trust_remote_code=True)

# Get the training, validation and test datasets
train_dataset = dataset['train']
val_dataset = dataset['validation']
test_dataset = dataset['test']

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


In [3]:
from transformers import AutoTokenizer, BartTokenizer

# Token-length limits for articles (inputs) and abstracts (labels), plus the
# batch size shared by the dataset mapping and the trainer.
max_input_length, max_output_length = 1024, 128
batch_size = 2

# Checkpoint name; the tokenizer is loaded from it with the BART-specific class.
model_path = "facebook/bart-large-cnn"
tokenizer = BartTokenizer.from_pretrained(model_path)

def process_data_to_model_inputs(batch):
    """Tokenize a batch of articles and abstracts into model inputs.

    Reads ``batch["article"]`` and ``batch["abstract"]``, pads/truncates them
    to ``max_input_length`` and ``max_output_length`` tokens respectively, and
    stores ``input_ids``, ``attention_mask`` and ``labels`` on ``batch``.
    Padding positions in ``labels`` are replaced with -100 so that the PAD
    token is ignored.

    Returns the same ``batch`` object with the three keys added.
    """
    encoded_articles = tokenizer(
        batch["article"],
        padding="max_length",
        truncation=True,
        max_length=max_input_length,
    )
    encoded_abstracts = tokenizer(
        batch["abstract"],
        padding="max_length",
        truncation=True,
        max_length=max_output_length,
    )

    pad_id = tokenizer.pad_token_id

    batch["input_ids"] = encoded_articles.input_ids
    batch["attention_mask"] = encoded_articles.attention_mask
    # Swap every PAD id in the targets for -100.
    batch["labels"] = [
        [-100 if token_id == pad_id else token_id for token_id in sequence]
        for sequence in encoded_abstracts.input_ids
    ]

    return batch

def _prepare_split(split, num_examples):
    """Tokenize the first ``num_examples`` rows of ``split`` and expose them as torch tensors.

    The raw text columns are dropped after tokenization; only ``input_ids``,
    ``attention_mask`` and ``labels`` are returned when indexing the result.
    """
    prepared = split.select(range(num_examples)).map(
        process_data_to_model_inputs,
        batched=True,
        batch_size=batch_size,
        remove_columns=["article", "abstract", "section_names"],
    )
    prepared.set_format(
        type="torch", columns=["input_ids", "attention_mask", "labels"]
    )
    return prepared


# Build from the untouched splits in `dataset` rather than from the previous
# value of train_dataset/val_dataset: the old cell overwrote its own inputs and
# removed their columns, so running it a second time failed.
train_dataset = _prepare_split(dataset["train"], 250)
val_dataset = _prepare_split(dataset["validation"], 25)

Map: 100%|██████████| 250/250 [00:10<00:00, 23.28 examples/s]
Map: 100%|██████████| 25/25 [00:01<00:00, 19.48 examples/s]


In [None]:
# `train_dataset` no longer has an "article" column (it is removed during
# tokenization), so read the raw text from the original split instead.
# Only the first 1000 characters are shown to keep the output readable.
dataset["train"][2]["article"][:1000]

In [6]:
import numpy as np

# Report examples whose tokenized input contains the token id below more than once.
# presumably 2 is the tokenizer's end-of-sequence id — TODO confirm against tokenizer.eos_token_id
token_id = 2

task_dataset = train_dataset
for i in range(len(task_dataset)):
    ls = task_dataset[i]["input_ids"].numpy()
    # np.where(cond) returns a tuple with one index array per dimension; take
    # the first entry so len() counts matches. Previously len() was taken on the
    # tuple itself, which is always 1, so this check could never fire.
    indices = np.where(ls == token_id)[0]
    if len(indices) > 1:
        print(i)
        print(indices)

In [7]:
from transformers import AutoModelForSeq2SeqLM, BartForConditionalGeneration

# Load the seq2seq checkpoint with gradient checkpointing on and the
# decoder cache off.
model = AutoModelForSeq2SeqLM.from_pretrained(
    model_path,
    gradient_checkpointing=True,
    use_cache=False,
)

# Generation hyperparameters, applied to the model config in this order.
generation_settings = {
    "num_beams": 2,
    "max_length": 128,
    "min_length": 80,
    "length_penalty": 2.0,
    "early_stopping": True,
    "no_repeat_ngram_size": 3,
}
for setting_name, setting_value in generation_settings.items():
    setattr(model.config, setting_name, setting_value)

In [8]:
# The library warns that `trust_remote_code=True` becomes mandatory for this
# metric in the next major `datasets` release, so pass it explicitly.
# NOTE: this executes the metric's loading script locally.
rouge = load_metric("rouge", trust_remote_code=True)

def compute_metrics(pred):
    """Compute ROUGE-2 precision, recall and F-measure for an evaluation pass.

    Args:
        pred: object exposing ``predictions`` (generated token ids) and
            ``label_ids`` (reference token ids, with -100 at ignored positions).

    Returns:
        dict with ``rouge2_precision``, ``rouge2_recall`` and
        ``rouge2_fmeasure``, each rounded to 4 decimals.
    """
    import numpy as np  # local import so this cell does not depend on another cell's imports

    pad_id = tokenizer.pad_token_id

    # -100 is not a vocabulary id, so swap it for the PAD id before decoding.
    # np.where builds new arrays, so `pred.label_ids` is no longer modified in
    # place, and predictions get the same treatment in case they were padded
    # with -100 as well.
    pred_ids = np.asarray(pred.predictions)
    pred_ids = np.where(pred_ids != -100, pred_ids, pad_id)
    labels_ids = np.asarray(pred.label_ids)
    labels_ids = np.where(labels_ids != -100, labels_ids, pad_id)

    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)

    # Use the `.mid` entry of the aggregated ROUGE-2 score.
    rouge_output = rouge.compute(
        predictions=pred_str, references=label_str, rouge_types=["rouge2"]
    )["rouge2"].mid

    return {
        "rouge2_precision": round(rouge_output.precision, 4),
        "rouge2_recall": round(rouge_output.recall, 4),
        "rouge2_fmeasure": round(rouge_output.fmeasure, 4)
    }

  rouge = load_metric("rouge")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [9]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    # Evaluate with generate() so compute_metrics receives generated token ids.
    predict_with_generate=True,
    evaluation_strategy="steps",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    #fp16=True,
    # Follow the device chosen at the top of the notebook instead of
    # hard-coding MPS, so the CPU fallback selected there is respected.
    use_mps_device=str(DEVICE).startswith("mps"),
    # Checkpoints are written into the current working directory.
    output_dir="./",
    logging_steps=5,
    eval_steps=10,
    save_steps=10,
    save_total_limit=2,
    # With per-device batch size `batch_size`, 4 accumulation steps give an
    # effective train batch of 4 * batch_size per optimization step.
    gradient_accumulation_steps=4,
    num_train_epochs=1,
)

trainer = Seq2SeqTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

In [10]:
# Run fine-tuning with the trainer configured above. Per its arguments it logs
# every 5 steps and evaluates and saves a checkpoint every 10 steps; the
# returned TrainOutput is displayed as the cell result.
trainer.train()

***** Running training *****
  Num examples = 250
  Num Epochs = 1
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 4
  Total optimization steps = 31
  Number of trainable parameters = 406290432
 16%|█▌        | 5/31 [00:29<02:20,  5.40s/it]

{'loss': 3.5405, 'learning_rate': 4.1935483870967746e-05, 'epoch': 0.16}


 32%|███▏      | 10/31 [00:54<01:49,  5.21s/it]***** Running Evaluation *****
  Num examples = 25
  Batch size = 2


{'loss': 2.9054, 'learning_rate': 3.387096774193548e-05, 'epoch': 0.32}


  next_tokens = next_tokens % vocab_size
                                               
 32%|███▏      | 10/31 [07:32<01:49,  5.21s/it]Saving model checkpoint to ./checkpoint-10
Configuration saved in ./checkpoint-10/config.json


{'eval_loss': 2.8460090160369873, 'eval_rouge2_precision': 0.1289, 'eval_rouge2_recall': 0.0981, 'eval_rouge2_fmeasure': 0.1087, 'eval_runtime': 397.5472, 'eval_samples_per_second': 0.063, 'eval_steps_per_second': 0.033, 'epoch': 0.32}


Model weights saved in ./checkpoint-10/pytorch_model.bin
tokenizer config file saved in ./checkpoint-10/tokenizer_config.json
Special tokens file saved in ./checkpoint-10/special_tokens_map.json
 48%|████▊     | 15/31 [08:28<09:41, 36.33s/it] 

{'loss': 2.8904, 'learning_rate': 2.5806451612903226e-05, 'epoch': 0.48}


 65%|██████▍   | 20/31 [08:57<01:59, 10.86s/it]***** Running Evaluation *****
  Num examples = 25
  Batch size = 2


{'loss': 2.6771, 'learning_rate': 1.774193548387097e-05, 'epoch': 0.64}


                                               
 65%|██████▍   | 20/31 [15:55<01:59, 10.86s/it]Saving model checkpoint to ./checkpoint-20
Configuration saved in ./checkpoint-20/config.json


{'eval_loss': 2.6968960762023926, 'eval_rouge2_precision': 0.1231, 'eval_rouge2_recall': 0.1244, 'eval_rouge2_fmeasure': 0.1227, 'eval_runtime': 418.3717, 'eval_samples_per_second': 0.06, 'eval_steps_per_second': 0.031, 'epoch': 0.64}


Model weights saved in ./checkpoint-20/pytorch_model.bin
tokenizer config file saved in ./checkpoint-20/tokenizer_config.json
Special tokens file saved in ./checkpoint-20/special_tokens_map.json
Deleting older checkpoint [checkpoint-30] due to args.save_total_limit
 81%|████████  | 25/31 [16:50<03:50, 38.47s/it] 

{'loss': 2.7308, 'learning_rate': 9.67741935483871e-06, 'epoch': 0.8}


 97%|█████████▋| 30/31 [17:17<00:11, 11.01s/it]***** Running Evaluation *****
  Num examples = 25
  Batch size = 2


{'loss': 2.7998, 'learning_rate': 1.6129032258064516e-06, 'epoch': 0.96}


                                               
 97%|█████████▋| 30/31 [23:26<00:11, 11.01s/it]Saving model checkpoint to ./checkpoint-30
Configuration saved in ./checkpoint-30/config.json


{'eval_loss': 2.640104055404663, 'eval_rouge2_precision': 0.1284, 'eval_rouge2_recall': 0.1252, 'eval_rouge2_fmeasure': 0.1247, 'eval_runtime': 369.0262, 'eval_samples_per_second': 0.068, 'eval_steps_per_second': 0.035, 'epoch': 0.96}


Model weights saved in ./checkpoint-30/pytorch_model.bin
tokenizer config file saved in ./checkpoint-30/tokenizer_config.json
Special tokens file saved in ./checkpoint-30/special_tokens_map.json
Deleting older checkpoint [checkpoint-10] due to args.save_total_limit
100%|██████████| 31/31 [23:58<00:00, 128.01s/it]

Training completed. Do not forget to share your model on huggingface.co/models =)


100%|██████████| 31/31 [23:58<00:00, 46.42s/it] 

{'train_runtime': 1438.9551, 'train_samples_per_second': 0.174, 'train_steps_per_second': 0.022, 'train_loss': 2.9066912820262294, 'epoch': 0.99}





TrainOutput(global_step=31, training_loss=2.9066912820262294, metrics={'train_runtime': 1438.9551, 'train_samples_per_second': 0.174, 'train_steps_per_second': 0.022, 'train_loss': 2.9066912820262294, 'epoch': 0.99})