In [1]:
! pip install -q kaggle
! pip install datasets
from datasets import Dataset, DatasetDict
!pip install rouge_score
! pip install evaluate
import evaluate
! pip install nltk
import nltk
nltk.download("punkt")
! pip install -U accelerate
! pip install -U transformers[torch]



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!




## Get the data

Import the dataset from Kaggle repository

In [2]:
from google.colab import files
!kaggle datasets download mexwell/amazon-reviews-multi

Dataset URL: https://www.kaggle.com/datasets/mexwell/amazon-reviews-multi
License(s): other
amazon-reviews-multi.zip: Skipping, found more recently modified local copy (use --force to force download)


Unzip the downloaded file

In [3]:
import zipfile
zip_ref = zipfile.ZipFile('/content/amazon-reviews-multi.zip','r')
zip_ref.extractall('/content/')
zip_ref.close()

Import it as a HuggingFace Dataset

In [4]:
from datasets import load_dataset
data_files = {'train': 'train.csv', 'validation': 'validation.csv','test': 'test.csv'}
dataset = load_dataset('csv', data_files=data_files)
dataset

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'review_id', 'product_id', 'reviewer_id', 'stars', 'review_body', 'review_title', 'language', 'product_category'],
        num_rows: 1200000
    })
    validation: Dataset({
        features: ['Unnamed: 0', 'review_id', 'product_id', 'reviewer_id', 'stars', 'review_body', 'review_title', 'language', 'product_category'],
        num_rows: 30000
    })
    test: Dataset({
        features: ['Unnamed: 0', 'review_id', 'product_id', 'reviewer_id', 'stars', 'review_body', 'review_title', 'language', 'product_category'],
        num_rows: 30000
    })
})

Filter to only get book reviews in English and Spanish

In [5]:
books_dataset = dataset
books_dataset['train'] = books_dataset['train'].filter(lambda example: (example["product_category"]=='book' or example["product_category"]=='digital_ebook_purchase') and
 (example["language"]=="en" or example["language"]=='es'))
books_dataset['validation']=books_dataset['validation'].filter(lambda example: (example["product_category"]=='book' or example["product_category"]=='digital_ebook_purchase') and
 (example["language"]=="en" or example["language"]=='es'))
books_dataset['test']=books_dataset['validation'].filter(lambda example: (example["product_category"]=='book' or example["product_category"]=='digital_ebook_purchase') and
 (example["language"]=="en" or example["language"]=='es'))
books_dataset

Filter:   0%|          | 0/1200000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/30000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/424 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'review_id', 'product_id', 'reviewer_id', 'stars', 'review_body', 'review_title', 'language', 'product_category'],
        num_rows: 17612
    })
    validation: Dataset({
        features: ['Unnamed: 0', 'review_id', 'product_id', 'reviewer_id', 'stars', 'review_body', 'review_title', 'language', 'product_category'],
        num_rows: 424
    })
    test: Dataset({
        features: ['Unnamed: 0', 'review_id', 'product_id', 'reviewer_id', 'stars', 'review_body', 'review_title', 'language', 'product_category'],
        num_rows: 424
    })
})

We delete the very short titles:

In [6]:
books_dataset = books_dataset.filter(lambda x: len(x["review_title"].split()) > 2)
books_dataset

Filter:   0%|          | 0/17612 [00:00<?, ? examples/s]

Filter:   0%|          | 0/424 [00:00<?, ? examples/s]

Filter:   0%|          | 0/424 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'review_id', 'product_id', 'reviewer_id', 'stars', 'review_body', 'review_title', 'language', 'product_category'],
        num_rows: 9672
    })
    validation: Dataset({
        features: ['Unnamed: 0', 'review_id', 'product_id', 'reviewer_id', 'stars', 'review_body', 'review_title', 'language', 'product_category'],
        num_rows: 238
    })
    test: Dataset({
        features: ['Unnamed: 0', 'review_id', 'product_id', 'reviewer_id', 'stars', 'review_body', 'review_title', 'language', 'product_category'],
        num_rows: 238
    })
})

## Data processing

Import the Tokenizer. Because we have a multilingual dataset, we need a multilingual pretrained model as a basis. We choose 'mt5-small' because it's a relatively nimble model.

In [7]:
from transformers import AutoTokenizer

model_checkpoint = "google/mt5-small"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


We define a pre-processing function:

In [8]:
max_input_length = 512
max_target_length = 30


def preprocess_function(examples):
    model_inputs = tokenizer(
        examples["review_body"],
        max_length=max_input_length,
        truncation=True,
    )
    labels = tokenizer(
        examples["review_title"], max_length=max_target_length, truncation=True
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

We tokenize the dataset:

In [9]:
tokenized_datasets = books_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/9672 [00:00<?, ? examples/s]

Map:   0%|          | 0/238 [00:00<?, ? examples/s]

Map:   0%|          | 0/238 [00:00<?, ? examples/s]

Function to extract the first three sentences

In [10]:
from nltk.tokenize import sent_tokenize


def three_sentence_summary(text):
    return "\n".join(sent_tokenize(text)[:3])


print(three_sentence_summary(books_dataset["train"][1]["review_body"]))

I ordered this book on February 11.
It never arrived.


Evaluation function

In [11]:
def evaluate_baseline(dataset, metric):
    summaries = [three_sentence_summary(text) for text in dataset["review_body"]]
    return metric.compute(predictions=summaries, references=dataset["review_title"])

Compute the ROUGE score

In [12]:
rouge_score = evaluate.load("rouge")
score = evaluate_baseline(books_dataset["validation"], rouge_score)
score

{'rouge1': 0.16765536564564187,
 'rouge2': 0.08811819814075421,
 'rougeL': 0.1560785881236159,
 'rougeLsum': 0.15960239760361764}

##Model training

Define the model

In [13]:
from transformers import AutoModelForSeq2SeqLM

model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

Define the training arguments

In [20]:
from transformers import Seq2SeqTrainingArguments

batch_size = 8
num_train_epochs = 8
# Show the training loss with every epoch
logging_steps = len(tokenized_datasets["train"]) // batch_size
model_name = model_checkpoint.split("/")[-1]

args = Seq2SeqTrainingArguments(
    output_dir=f"{model_name}-finetuned-amazon-en-es",
    evaluation_strategy="epoch",
    learning_rate=5.6e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=num_train_epochs,
    predict_with_generate=True,
    logging_steps=logging_steps,
    push_to_hub=True,
    hub_token='',
)



Function to compute the metric during training

In [15]:
import numpy as np

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    # Decode generated summaries into text
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    # Decode reference summaries into text
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    # ROUGE expects a newline after each sentence
    decoded_preds = ["\n".join(sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(sent_tokenize(label.strip())) for label in decoded_labels]
    # Compute ROUGE scores
    result = rouge_score.compute(
        predictions=decoded_preds, references=decoded_labels, use_stemmer=True
    )
    # Extract the median scores
    result = {key: value * 100 for key, value in result.items()}
    return {k: round(v, 4) for k, v in result.items()}

Data collator

In [16]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

Let's see what the data collator does with a bunch of examples:

In [17]:
tokenized_datasets = tokenized_datasets.remove_columns(
    books_dataset["train"].column_names
)

In [18]:
features = [tokenized_datasets["train"][i] for i in range(2)]
data_collator(features)

{'input_ids': tensor([[  4927,  97789,   2888,   1070,   5835,    527,    287,   2250,   2225,
         105255,    910,    269, 202133,  62799,    260,    259,   9918,    336,
            259,  68140,    287,  10945,    339,    259,    262, 123647,    260,
            298,  53799,    400,  55976,    260,      1],
        [   336,    259,  91451,    714,   3435,    351,    259,   4293,   4668,
           1385,   8103,  24938,    285,    260,      1,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'labels': tensor([[  2368,   4267,    714,   3435,    259,   2220,    272,    293,    270,
           3162,      1],
        [

## Model training

Instantiate the trainer

In [21]:
from transformers import Seq2SeqTrainer

trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

Start training

In [22]:
trainer.train()

Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,7.8333,3.337193,14.841,5.9448,14.4336,14.3245
2,3.9162,3.241267,15.5167,7.561,14.95,14.9741
3,3.5907,3.180744,16.3665,8.3144,15.7774,15.7948
4,3.4242,3.149525,16.3917,8.1872,15.9351,15.8359
5,3.3149,3.125503,16.8456,8.1961,16.4239,16.3677
6,3.247,3.115959,16.5491,7.6788,16.0112,15.9946
7,3.1998,3.102478,17.4028,8.4074,16.9673,16.9513
8,3.178,3.104658,16.5427,7.9991,16.2132,16.1175




TrainOutput(global_step=9672, training_loss=3.963025846394554, metrics={'train_runtime': 2946.3374, 'train_samples_per_second': 26.262, 'train_steps_per_second': 3.283, 'total_flos': 1.189856995786752e+16, 'train_loss': 3.963025846394554, 'epoch': 8.0})

The eval loss seems to plateau. Let's look at the evaluation metrics:

In [23]:
trainer.evaluate()

{'eval_loss': 3.1046576499938965,
 'eval_rouge1': 16.5427,
 'eval_rouge2': 7.9991,
 'eval_rougeL': 16.2132,
 'eval_rougeLsum': 16.1175,
 'eval_runtime': 9.552,
 'eval_samples_per_second': 24.916,
 'eval_steps_per_second': 3.141,
 'epoch': 8.0}

##Fine tuning the model

Set the format to PyTorch

In [24]:
tokenized_datasets.set_format("torch")

Load the model again

In [25]:
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

Instantiate the DataLoader

In [26]:
from torch.utils.data import DataLoader

batch_size = 8
train_dataloader = DataLoader(
    tokenized_datasets["train"],
    shuffle=True,
    collate_fn=data_collator,
    batch_size=batch_size,
)
eval_dataloader = DataLoader(
    tokenized_datasets["validation"], collate_fn=data_collator, batch_size=batch_size
)

Set up the optrimizer. We use the classical AdamW

In [27]:
from torch.optim import AdamW

optimizer = AdamW(model.parameters(), lr=2e-5)

Set up the accelerator (to train faster)

In [28]:
from accelerate import Accelerator

accelerator = Accelerator()
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)

Define a standard linear learning rate scheduler:

In [29]:
from transformers import get_scheduler

num_train_epochs = 10
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

Function splitting the generated summaries into the format expected by the evaluator (newsline sentence separator)

In [30]:
def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [label.strip() for label in labels]

    # ROUGE expects a newline after each sentence
    preds = ["\n".join(nltk.sent_tokenize(pred)) for pred in preds]
    labels = ["\n".join(nltk.sent_tokenize(label)) for label in labels]

    return preds, labels

Training loop

In [31]:
from tqdm.auto import tqdm
import torch
import numpy as np

progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_train_epochs):
    # Training
    model.train()
    for step, batch in enumerate(train_dataloader):
        outputs = model(**batch)
        loss = outputs.loss
        accelerator.backward(loss)

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    # Evaluation
    model.eval()
    for step, batch in enumerate(eval_dataloader):
        with torch.no_grad():
            generated_tokens = accelerator.unwrap_model(model).generate(
                batch["input_ids"],
                attention_mask=batch["attention_mask"],
            )

            generated_tokens = accelerator.pad_across_processes(
                generated_tokens, dim=1, pad_index=tokenizer.pad_token_id
            )
            labels = batch["labels"]

            # If we did not pad to max length, we need to pad the labels too
            labels = accelerator.pad_across_processes(
                batch["labels"], dim=1, pad_index=tokenizer.pad_token_id
            )

            generated_tokens = accelerator.gather(generated_tokens).cpu().numpy()
            labels = accelerator.gather(labels).cpu().numpy()

            # Replace -100 in the labels as we can't decode them
            labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
            if isinstance(generated_tokens, tuple):
                generated_tokens = generated_tokens[0]
            decoded_preds = tokenizer.batch_decode(
                generated_tokens, skip_special_tokens=True
            )
            decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

            decoded_preds, decoded_labels = postprocess_text(
                decoded_preds, decoded_labels
            )

            rouge_score.add_batch(predictions=decoded_preds, references=decoded_labels)

    # Compute metrics
    result = rouge_score.compute()
    # Extract the median ROUGE scores
    result = {key: value * 100 for key, value in result.items()}
    result = {k: round(v, 4) for k, v in result.items()}
    print(f"Epoch {epoch}:", result)

    # Save and upload
    #accelerator.wait_for_everyone()
    #unwrapped_model = accelerator.unwrap_model(model)
    #unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
    #if accelerator.is_main_process:
    #    tokenizer.save_pretrained(output_dir)
    #    repo.push_to_hub(
    #        commit_message=f"Training in progress epoch {epoch}", blocking=False
    #   )

  0%|          | 0/12090 [00:00<?, ?it/s]

  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)


Epoch 0: {'rouge1': 1.5578, 'rouge2': 0.1364, 'rougeL': 1.3645, 'rougeLsum': 1.3657}
Epoch 1: {'rouge1': 4.0603, 'rouge2': 0.6167, 'rougeL': 3.893, 'rougeLsum': 3.9186}
Epoch 2: {'rouge1': 5.5272, 'rouge2': 1.1157, 'rougeL': 5.3741, 'rougeLsum': 5.3943}
Epoch 3: {'rouge1': 8.513, 'rouge2': 1.8969, 'rougeL': 8.2118, 'rougeLsum': 8.1795}
Epoch 4: {'rouge1': 8.4333, 'rouge2': 2.3977, 'rougeL': 8.4803, 'rougeLsum': 8.4767}
Epoch 5: {'rouge1': 10.2908, 'rouge2': 3.4144, 'rougeL': 10.3443, 'rougeLsum': 10.3256}
Epoch 6: {'rouge1': 9.9806, 'rouge2': 2.956, 'rougeL': 10.0535, 'rougeLsum': 10.0098}
Epoch 7: {'rouge1': 10.6449, 'rouge2': 3.6243, 'rougeL': 10.6382, 'rougeLsum': 10.642}
Epoch 8: {'rouge1': 10.7796, 'rouge2': 3.8992, 'rougeL': 10.8161, 'rougeLsum': 10.7227}
Epoch 9: {'rouge1': 11.1083, 'rouge2': 3.8992, 'rougeL': 11.0707, 'rougeLsum': 10.9946}


Looks like it plateau'ed

## Inference

Use of a pipeline to play with the fine-tuned model

In [32]:
from transformers import pipeline

hub_model_id = "huggingface-course/mt5-small-finetuned-amazon-en-es"
summarizer = pipeline("summarization", model=hub_model_id)

config.json:   0%|          | 0.00/682 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/398 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/8.33M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

Function to show the review, the title and the generated summary.

In [33]:
def print_summary(idx):
    review = books_dataset["test"][idx]["review_body"]
    title = books_dataset["test"][idx]["review_title"]
    summary = summarizer(books_dataset["test"][idx]["review_body"])[0]["summary_text"]
    print(f"'>>> Review: {review}'")
    print(f"\n'>>> Title: {title}'")
    print(f"\n'>>> Summary: {summary}'")

In [34]:
print_summary(100)

'>>> Review: Too many dark things around what could have been a beautiful wedding. Dogs played a very small role. Others in the series have been better.'

'>>> Title: This one is darker than normal.'

'>>> Summary: Cute'


This one isn't a good summary

In [35]:
print_summary(0)

'>>> Review: Great book. Terrible delivery. Came out of the package mangled. Not cool Amazon'

'>>> Title: Terrible delivery condition'

'>>> Summary: Great book'


This one looks good.

# Test another model

We used "google/mt5-small". Let's try other models to compare their relative performance. because it's a multilingual dataset, there are fewer choice. We choose mBART-50.

We load the model, its tokenizer and we tokenize the dataset.

In [36]:
tokenizer = AutoTokenizer.from_pretrained("facebook/mbart-large-50")
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/mbart-large-50")

tokenizer_config.json:   0%|          | 0.00/531 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/649 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/261 [00:00<?, ?B/s]

In [37]:
tokenized_datasets = books_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/9672 [00:00<?, ? examples/s]

Map:   0%|          | 0/238 [00:00<?, ? examples/s]

Map:   0%|          | 0/238 [00:00<?, ? examples/s]

We define the arguments (same as for the first model to be comparable)

In [39]:
batch_size = 8
num_train_epochs = 8
# Show the training loss with every epoch
logging_steps = len(tokenized_datasets["train"]) // batch_size
model_name = model_checkpoint.split("/")[-1]

args = Seq2SeqTrainingArguments(
    output_dir=f"{model_name}-finetuned-amazon-en-es",
    evaluation_strategy="epoch",
    learning_rate=5.6e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=num_train_epochs,
    predict_with_generate=True,
    logging_steps=logging_steps,
    push_to_hub=True,
    hub_token='',
)



In [40]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [41]:
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [42]:
trainer.train()

Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,3.0161,2.729868,15.9418,7.0732,15.2939,15.2703
2,2.186,2.805462,18.8047,9.2283,18.6191,18.6423
3,1.2241,3.191668,16.0541,7.7049,15.768,15.7559
4,0.5739,3.861671,16.5905,7.9938,16.008,15.931
5,0.2288,4.445966,16.0835,7.1711,15.6128,15.5449
6,0.0919,4.73466,16.8169,7.5697,16.2395,16.2232
7,0.0392,4.833027,17.6121,7.2674,17.3863,17.3034
8,0.0188,4.918066,18.8056,9.1634,18.3342,18.1533


Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5, 'forced_eos_token_id': 2}
Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5, 'forced_eos_token_id': 2}
Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5, 'forced_eos_token_id': 2}
Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5, 'forced_eos_token_id': 2}
Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5, 'forced_eos_token_id': 2}
Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5, 'forced_eos_token_id': 2}
Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5, 'forced_eos_token_id': 2}
Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5, 'forced_eos_token_id': 2}
Non-default generation parameter

TrainOutput(global_step=9672, training_loss=0.9223300779625047, metrics={'train_runtime': 5749.6637, 'train_samples_per_second': 13.457, 'train_steps_per_second': 1.682, 'total_flos': 2.1777994959028224e+16, 'train_loss': 0.9223300779625047, 'epoch': 8.0})

The test performance is better that what was achieved with the smaller model at the start. It however decreases the more we train the model, showing that we're overfitting with a larger model.