Import all the necessary libraries

In [1]:
!pip install transformers
!pip install datasets
!pip install py7zr

import pandas as pd
import numpy as np
import json
import transformers
import datasets
from datasets import load_dataset
from transformers import BartTokenizer, BartForConditionalGeneration, Trainer, TrainingArguments
import torch





In [2]:
samsum = load_dataset("samsum")

In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model_ckpt = 'facebook/bart-large-cnn'
tokenizer = BartTokenizer.from_pretrained(model_ckpt)
model = BartForConditionalGeneration.from_pretrained(model_ckpt)

In [4]:
#TOKENIZATION

def get_feature(batch):
    encodings = tokenizer(batch['dialogue'], text_target=batch['summary'], max_length=1024, padding='max_length', truncation=True, return_tensors='pt')
    encodings = {'input_ids': encodings['input_ids'].to(device), 'attention_mask': encodings['attention_mask'].to(device), 'labels': encodings['labels'].to(device)}
    return encodings

In [5]:
samsum_pt = samsum.map(get_feature, batched=True)

Map:   0%|          | 0/14732 [00:00<?, ? examples/s]

Map:   0%|          | 0/819 [00:00<?, ? examples/s]

Map:   0%|          | 0/818 [00:00<?, ? examples/s]

In [6]:
samsum_pt

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 14732
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 819
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 818
    })
})

In [7]:
columns = ['input_ids', 'attention_mask', 'labels']
samsum_pt.set_format(type='torch', columns=columns)

In [8]:
from transformers import DataCollatorForSeq2Seq
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [9]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [10]:
#TRAINING
! pip install -U accelerate
! pip install -U transformers

training_args = TrainingArguments(
    output_dir='bart-on-samsum',
    num_train_epochs=1,
    warmup_steps = 500,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    weight_decay = 0.01,
    logging_steps = 10,
    evaluation_strategy = 'steps',
    eval_steps=500,
    save_steps=1e6,
    gradient_accumulation_steps=16
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=samsum_pt['train'],
    eval_dataset=samsum_pt['validation'],
    data_collator=data_collator,
    tokenizer=tokenizer
)



In [11]:
trainer.train()

Step,Training Loss,Validation Loss
500,0.0405,0.038846


TrainOutput(global_step=920, training_loss=0.5825438418466112, metrics={'train_runtime': 14589.2815, 'train_samples_per_second': 1.01, 'train_steps_per_second': 0.063, 'total_flos': 3.189977974308864e+16, 'train_loss': 0.5825438418466112, 'epoch': 1.0})

In [12]:
trainer.save_model('/content/saved_model')

In [13]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("saved_model")
model = AutoModelForSeq2SeqLM.from_pretrained("saved_model")