In [1]:
!pip install rouge_score



In [2]:
from datasets import load_dataset, load_metric

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the arXiv configuration of the scientific_papers dataset.
# `datasets` warns that opting in to the dataset's remote loading code will
# become mandatory, so pass the flag explicitly. Only do this for dataset
# repositories you trust, since it allows their loading code to run locally.
dataset = load_dataset('scientific_papers', 'arxiv', trust_remote_code=True)

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


In [4]:
# Inspect the available splits together with their columns and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['article', 'abstract', 'section_names'],
        num_rows: 203037
    })
    validation: Dataset({
        features: ['article', 'abstract', 'section_names'],
        num_rows: 6436
    })
    test: Dataset({
        features: ['article', 'abstract', 'section_names'],
        num_rows: 6440
    })
})

In [5]:
# Unpack the three predefined splits into their own variables.
train_dataset, val_dataset, test_dataset = (
    dataset[split_name] for split_name in ("train", "validation", "test")
)

In [6]:
# Preview only the beginning of the first article; the full paper body is long
# and printing all of it floods the cell output.
train_dataset[0]["article"][:1000]

'additive models @xcite provide an important family of models for semiparametric regression or classification . some reasons for the success of additive models are their increased flexibility when compared to linear or generalized linear models and their increased interpretability when compared to fully nonparametric models .\nit is well - known that good estimators in additive models are in general less prone to the curse of high dimensionality than good estimators in fully nonparametric models .\nmany examples of such estimators belong to the large class of regularized kernel based methods over a reproducing kernel hilbert space @xmath0 , see e.g. @xcite . in the last years\nmany interesting results on learning rates of regularized kernel based models for additive models have been published when the focus is on sparsity and when the classical least squares loss function is used , see e.g. @xcite , @xcite , @xcite , @xcite , @xcite , @xcite and the references therein . of course , the

In [7]:
# Show the abstract stored alongside the first article (used later as the summarization target).
train_dataset[0]["abstract"]

' additive models play an important role in semiparametric statistics . \n this paper gives learning rates for regularized kernel based methods for additive models . \n these learning rates compare favourably in particular in high dimensions to recent results on optimal learning rates for purely nonparametric regularized kernel based quantile regression using the gaussian radial basis function kernel , provided the assumption of an additive model is valid . \n additionally , a concrete example is presented to show that a gaussian function depending only on one variable lies in a reproducing kernel hilbert space generated by an additive gaussian kernel , but does not belong to the reproducing kernel hilbert space generated by the multivariate gaussian kernel of the same variance .    * \n key words and phrases . * additive model , kernel , quantile regression , semiparametric , rate of convergence , support vector machine . '

In [8]:
# Number of examples in the full training split.
len(train_dataset)

203037

In [9]:
from transformers import AutoTokenizer

In [10]:
# Identifier of the pretrained checkpoint; reused below to load both the tokenizer and the model.
model_path = "facebook/bart-large-cnn"

In [11]:
# Load the tokenizer that belongs to the checkpoint named in `model_path`.
tokenizer = AutoTokenizer.from_pretrained(model_path)

In [12]:
max_input_length = 1024  # articles are truncated/padded to this many tokens
max_output_length = 128  # abstracts (the labels) are truncated/padded to this many tokens
batch_size = 2  # shared by the preprocessing map() calls and the per-device train/eval batch sizes


In [13]:
def process_data_to_model_inputs(batch):
    """Tokenize a batch of articles and abstracts into model inputs and labels.

    Args:
        batch: Mapping with "article" and "abstract" entries, each a list of strings.

    Returns:
        The same ``batch`` object with three entries added: "input_ids" and
        "attention_mask" for the articles (padded/truncated to
        ``max_input_length``) and "labels" for the abstracts (padded/truncated
        to ``max_output_length``), where every padding token id is replaced by
        -100.
    """
    encoded_articles = tokenizer(
        batch["article"],
        padding="max_length",
        truncation=True,
        max_length=max_input_length,
    )
    encoded_abstracts = tokenizer(
        batch["abstract"],
        padding="max_length",
        truncation=True,
        max_length=max_output_length,
    )
    pad_id = tokenizer.pad_token_id

    batch["input_ids"] = encoded_articles.input_ids
    batch["attention_mask"] = encoded_articles.attention_mask
    # Swap padding ids for -100 in the labels (the "ignore" marker for padded positions).
    batch["labels"] = [
        [-100 if token_id == pad_id else token_id for token_id in sequence]
        for sequence in encoded_abstracts.input_ids
    ]
    return batch

In [14]:
# Work on a small prefix of each split: the first 250 training and 25 validation examples.
train_dataset, val_dataset = (
    train_dataset.select(range(250)),
    val_dataset.select(range(25)),
)

In [15]:
# Tokenize the training subset and drop the raw text columns once they are encoded.
train_dataset = train_dataset.map(
    function=process_data_to_model_inputs,
    remove_columns=["article", "abstract", "section_names"],
    batched=True,
    batch_size=batch_size,
)

In [16]:
# Tokenize the validation subset the same way and drop its raw text columns.
val_dataset = val_dataset.map(
    function=process_data_to_model_inputs,
    remove_columns=["article", "abstract", "section_names"],
    batched=True,
    batch_size=batch_size,
)

Map: 100%|██████████| 25/25 [00:00<00:00, 39.65 examples/s]


In [17]:
# Sanity check: after preprocessing, only the encoded columns should remain.
train_dataset[0].keys()

dict_keys(['input_ids', 'attention_mask', 'labels'])

In [18]:
# Have both splits return PyTorch tensors for the encoded columns.
tensor_columns = ["input_ids", "attention_mask", "labels"]
for split in (train_dataset, val_dataset):
    split.set_format(type="torch", columns=tensor_columns)

In [19]:
from transformers import AutoModelForSeq2SeqLM

In [20]:
# Load the pretrained seq2seq model with the decoder key/value cache disabled.
model = AutoModelForSeq2SeqLM.from_pretrained(model_path, use_cache=False)
# Turn on gradient checkpointing through the model API; passing
# `gradient_checkpointing=True` as a config kwarg is the deprecated route.
model.gradient_checkpointing_enable()
# NOTE(review): use_cache=False presumably also applies to generate() during
# evaluation, which may explain the very long eval runtimes in the log — confirm.

In [21]:
# Generation-related settings stored on the model config (beam search,
# output length bounds and repetition control).
model.config.update(
    {
        "num_beams": 2,
        "max_length": 128,
        "min_length": 80,
        "length_penalty": 2.0,
        "early_stopping": True,
        "no_repeat_ngram_size": 3,
    }
)

In [22]:
# Load the ROUGE metric. `datasets` warns that opting in to the metric's remote
# loading code will become mandatory, so pass the flag explicitly.
rouge = load_metric("rouge", trust_remote_code=True)

  rouge = load_metric("rouge")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [23]:
def compute_metrics(pred):
    """Compute ROUGE-2 precision, recall and F-measure for generated summaries.

    Args:
        pred: Evaluation output exposing ``predictions`` (generated token ids,
            optionally as the first element of a tuple) and ``label_ids``
            (reference token ids in which ignored positions are marked -100).

    Returns:
        dict with the keys ``rouge2_precision``, ``rouge2_recall`` and
        ``rouge2_fmeasure``, each rounded to four decimals and taken from the
        ``mid`` aggregate of the ROUGE-2 score.
    """
    import numpy as np  # local import so this cell does not rely on a notebook-level numpy import

    pad_id = tokenizer.pad_token_id

    pred_ids = pred.predictions
    if isinstance(pred_ids, tuple):
        # Some models return extra outputs next to the generated ids; keep the ids only.
        pred_ids = pred_ids[0]

    # -100 is only an "ignore" marker and cannot be decoded, so map it to the
    # pad id in BOTH arrays. np.where builds new arrays, so the caller's
    # pred.label_ids / pred.predictions are left untouched.
    pred_ids = np.asarray(pred_ids)
    labels_ids = np.asarray(pred.label_ids)
    pred_ids = np.where(pred_ids != -100, pred_ids, pad_id)
    labels_ids = np.where(labels_ids != -100, labels_ids, pad_id)

    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)

    rouge_output = rouge.compute(
        predictions=pred_str, references=label_str, rouge_types=["rouge2"]
    )["rouge2"].mid

    return {
        "rouge2_precision": round(rouge_output.precision, 4),
        "rouge2_recall": round(rouge_output.recall, 4),
        "rouge2_fmeasure": round(rouge_output.fmeasure, 4),
    }

In [24]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

In [25]:
training_args = Seq2SeqTrainingArguments(
    # Checkpoints: written to the current directory every 10 steps, keeping at most two.
    output_dir="./",
    save_steps=10,
    save_total_limit=2,
    # Optimisation: a single epoch; gradients are accumulated over 4 batches per update.
    num_train_epochs=1,
    per_device_train_batch_size=batch_size,
    gradient_accumulation_steps=4,
    # Mixed precision (fp16=True) is left switched off.
    # Evaluation: every 10 steps, using generate() to produce the predictions.
    evaluation_strategy="steps",
    eval_steps=10,
    per_device_eval_batch_size=batch_size,
    predict_with_generate=True,
    # Logging: report the training loss every 5 steps.
    logging_steps=5,
)

In [26]:
# Wire the model, tokenizer, data subsets, metric function and training arguments together.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

In [27]:
# Run fine-tuning; the returned TrainOutput (final step count, loss, runtime metrics) is displayed below.
trainer.train()

***** Running training *****
  Num examples = 250
  Num Epochs = 1
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 8


  Gradient Accumulation steps = 4
  Total optimization steps = 31
  Number of trainable parameters = 406290432
  0%|          | 0/31 [00:00<?, ?it/s]You're using a BartTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
 16%|█▌        | 5/31 [28:37<2:28:40, 343.09s/it]

{'loss': 3.5907, 'learning_rate': 4.1935483870967746e-05, 'epoch': 0.16}


 32%|███▏      | 10/31 [57:13<2:00:15, 343.60s/it]***** Running Evaluation *****
  Num examples = 25
  Batch size = 2


{'loss': 3.028, 'learning_rate': 3.387096774193548e-05, 'epoch': 0.32}


                                                  
 32%|███▏      | 10/31 [4:21:43<2:00:15, 343.60s/it]Saving model checkpoint to ./checkpoint-10
Configuration saved in ./checkpoint-10/config.json


{'eval_loss': 2.9522244930267334, 'eval_rouge2_precision': 0.1308, 'eval_rouge2_recall': 0.0952, 'eval_rouge2_fmeasure': 0.1087, 'eval_runtime': 12269.7961, 'eval_samples_per_second': 0.002, 'eval_steps_per_second': 0.001, 'epoch': 0.32}


Model weights saved in ./checkpoint-10/pytorch_model.bin
tokenizer config file saved in ./checkpoint-10/tokenizer_config.json
Special tokens file saved in ./checkpoint-10/special_tokens_map.json
 48%|████▊     | 15/31 [4:51:09<5:30:20, 1238.79s/it] 

{'loss': 2.9611, 'learning_rate': 2.5806451612903226e-05, 'epoch': 0.48}


 65%|██████▍   | 20/31 [5:20:18<1:31:24, 498.61s/it] ***** Running Evaluation *****
  Num examples = 25
  Batch size = 2


{'loss': 2.8101, 'learning_rate': 1.774193548387097e-05, 'epoch': 0.64}


                                                    
 65%|██████▍   | 20/31 [17:57:34<1:31:24, 498.61s/it]Saving model checkpoint to ./checkpoint-20
Configuration saved in ./checkpoint-20/config.json


{'eval_loss': 2.8000071048736572, 'eval_rouge2_precision': 0.1229, 'eval_rouge2_recall': 0.1165, 'eval_rouge2_fmeasure': 0.118, 'eval_runtime': 45435.7875, 'eval_samples_per_second': 0.001, 'eval_steps_per_second': 0.0, 'epoch': 0.64}


Model weights saved in ./checkpoint-20/pytorch_model.bin
tokenizer config file saved in ./checkpoint-20/tokenizer_config.json
Special tokens file saved in ./checkpoint-20/special_tokens_map.json
 81%|████████  | 25/31 [18:27:00<6:05:04, 3650.73s/it]  

{'loss': 2.8795, 'learning_rate': 9.67741935483871e-06, 'epoch': 0.8}


 97%|█████████▋| 30/31 [18:56:01<15:02, 902.86s/it]   ***** Running Evaluation *****
  Num examples = 25
  Batch size = 2


{'loss': 2.9015, 'learning_rate': 1.6129032258064516e-06, 'epoch': 0.96}


                                                   
 97%|█████████▋| 30/31 [22:14:03<15:02, 902.86s/it]Saving model checkpoint to ./checkpoint-30
Configuration saved in ./checkpoint-30/config.json


{'eval_loss': 2.7559425830841064, 'eval_rouge2_precision': 0.1337, 'eval_rouge2_recall': 0.1168, 'eval_rouge2_fmeasure': 0.1231, 'eval_runtime': 11882.112, 'eval_samples_per_second': 0.002, 'eval_steps_per_second': 0.001, 'epoch': 0.96}


Model weights saved in ./checkpoint-30/pytorch_model.bin
tokenizer config file saved in ./checkpoint-30/tokenizer_config.json
Special tokens file saved in ./checkpoint-30/special_tokens_map.json
Deleting older checkpoint [checkpoint-10] due to args.save_total_limit
100%|██████████| 31/31 [22:20:02<00:00, 4304.45s/it]

Training completed. Do not forget to share your model on huggingface.co/models =)


100%|██████████| 31/31 [22:20:02<00:00, 2593.64s/it]

{'train_runtime': 80402.8998, 'train_samples_per_second': 0.003, 'train_steps_per_second': 0.0, 'train_loss': 3.010087836173273, 'epoch': 0.99}





TrainOutput(global_step=31, training_loss=3.010087836173273, metrics={'train_runtime': 80402.8998, 'train_samples_per_second': 0.003, 'train_steps_per_second': 0.0, 'train_loss': 3.010087836173273, 'epoch': 0.99})