# Install necessary libraries

Using PyTorch here because huggingface models and facebook 😛

In [2]:
%pip install -q torch transformers datasets accelerate

# Load the dataset

In [3]:
import pandas as pd

dataset = pd.read_json("/content/data.jsonl", lines=True)

In [10]:
cols = dataset.columns
cols

Index(['title', 'input_text', 'target_text', 'compression_ratio',
       'readability_score', 'rouge-1', 'rouge-2', 'rouge-l'],
      dtype='object')

In [23]:
from datasets import load_dataset

dataset = load_dataset("json", data_files="/content/data.jsonl")["train"]

# remove columns we don't need
# for now, we just need input-output pairs
dataset = dataset.remove_columns(["title", "compression_ratio",
                                  "readability_score", "rouge-1",
                                  "rouge-2", "rouge-l"])

In [24]:
split_dataset = dataset.train_test_split(test_size=0.1)
print(split_dataset)

DatasetDict({
    train: Dataset({
        features: ['input_text', 'target_text'],
        num_rows: 7205
    })
    test: Dataset({
        features: ['input_text', 'target_text'],
        num_rows: 801
    })
})


# Tokenization

In [25]:
from transformers import BartTokenizer

tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")

In [26]:
def preprocess_function(examples):
    model_inputs = tokenizer(examples["input_text"],
                             max_length=512,
                             truncation=True,
                             padding='max_length')
    labels = tokenizer(examples["target_text"],
                       max_length=128,
                       truncation=True,
                       padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_dataset = split_dataset.map(preprocess_function,
                                batched=True,
                                remove_columns=["input_text", "target_text"])

Map:   0%|          | 0/7205 [00:00<?, ? examples/s]

Map:   0%|          | 0/801 [00:00<?, ? examples/s]

In [27]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 7205
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 801
    })
})

# BART

In [28]:
from transformers import BartForConditionalGeneration

model = BartForConditionalGeneration.from_pretrained("facebook/bart-base")

## Training

In [29]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./bart-finetuned",
    eval_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=5e-5,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    logging_dir="./logs",
    report_to="none"
)

In [30]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer
)

trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,1.4314,0.890182
2,0.8918,0.867856
3,0.8068,0.867155




TrainOutput(global_step=2703, training_loss=0.9863935294876353, metrics={'train_runtime': 1921.0564, 'train_samples_per_second': 11.252, 'train_steps_per_second': 1.407, 'total_flos': 6589726772428800.0, 'train_loss': 0.9863935294876353, 'epoch': 3.0})

In [31]:
trainer.save_model("./bart-finetuned")
tokenizer.save_pretrained("./bart-finetuned")

('./bart-finetuned/tokenizer_config.json',
 './bart-finetuned/special_tokens_map.json',
 './bart-finetuned/vocab.json',
 './bart-finetuned/merges.txt',
 './bart-finetuned/added_tokens.json')

In [43]:
!zip -r bart-finetuned.zip bart-finetuned

  adding: bart-finetuned/ (stored 0%)
  adding: bart-finetuned/vocab.json (deflated 68%)
  adding: bart-finetuned/merges.txt (deflated 53%)
  adding: bart-finetuned/checkpoint-901/ (stored 0%)
  adding: bart-finetuned/checkpoint-901/vocab.json (deflated 68%)
  adding: bart-finetuned/checkpoint-901/merges.txt (deflated 53%)
  adding: bart-finetuned/checkpoint-901/optimizer.pt (deflated 9%)
  adding: bart-finetuned/checkpoint-901/training_args.bin (deflated 51%)
  adding: bart-finetuned/checkpoint-901/special_tokens_map.json (deflated 85%)
  adding: bart-finetuned/checkpoint-901/config.json (deflated 63%)
  adding: bart-finetuned/checkpoint-901/tokenizer_config.json (deflated 75%)
  adding: bart-finetuned/checkpoint-901/trainer_state.json (deflated 56%)
  adding: bart-finetuned/checkpoint-901/rng_state.pth (deflated 25%)
  adding: bart-finetuned/checkpoint-901/scheduler.pt (deflated 55%)
  adding: bart-finetuned/checkpoint-901/model.safetensors (deflated 8%)
  adding: bart-finetuned/chec