<a href="https://colab.research.google.com/github/alturkim/nlp-notebooks/blob/main/Sentiment_Analysis_with_Parameter_Efficient_Finetuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# PEFT: Parameter Efficient Finetuning by HuggingFace.

This notebook finetune xlm-roberta-base using PEFT to train a sentiment classifier for Arabic. <br>


Author: <br>
Mustafa Alturki <br>
https://github.com/alturkim

In [1]:
!pip install -q datasets evaluate transformers[sentencepiece]
!pip install -q accelerate loralib tqdm

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m462.8/462.8 KB[0m [31m14.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 KB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m62.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.0/132.0 KB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.3/190.3 KB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m213.0/213.0 KB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m80.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m67.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
!pip install -q git+https://github.com/huggingface/peft.git

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.3/76.3 MB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for peft (pyproject.toml) ... [?25l[?25hdone


In [3]:
!pip install -q bitsandbytes

In [4]:
from transformers import AutoTokenizer, DataCollatorWithPadding
from transformers import AutoModel, AdamW, get_scheduler
from transformers import TrainingArguments, Trainer
from datasets import load_dataset, arrow_dataset
from datasets.arrow_dataset import Dataset

import evaluate
import torch
from torch import nn
from torch.optim import AdamW
from torch.utils.data import DataLoader

import numpy as np

from tqdm.auto import tqdm

from typing import Tuple, Dict


import argparse
import os

from peft import get_peft_config,get_peft_model, get_peft_model_state_dict, \
set_peft_model_state_dict, LoraConfig, PeftType, \
PrefixTuningConfig, PromptEncoderConfig

from transformers import AutoModelForSequenceClassification, \
get_linear_schedule_with_warmup, set_seed



Welcome to bitsandbytes. For bug reports, please submit your error trace to: https://github.com/TimDettmers/bitsandbytes/issues


In [5]:
peft_type = PeftType.LORA

peft_config = LoraConfig(
    task_type="SEQ_CLS",
    inference_mode=False,
    r=8,
    lora_alpha=16,
    lora_dropout=0.1
)

In [6]:
def compute_metrics(eval_preds):
    # eval_preds is an EvalPrediction object which is a named tuple
    logits, labels = eval_preds
    predictions = torch.argmax(logits, axis=-1)

    metric_names = ["precision", "recall", "f1", "accuracy"]
    results = dict()
    for m in metric_names:
        metric = evaluate.load(m)
        results = {**results, **metric.compute(predictions=predictions, references=labels)}
    return results

class Trainer:
    def __init__(self, model, 
                 train_dataloader: DataLoader, eval_dataloader: DataLoader,
                 num_train_epochs: int, lr: float, device: torch.device):
        self.model = model
        self.train_dataloader = train_dataloader
        self.eval_dataloader = eval_dataloader
        self.num_train_epochs = num_train_epochs
        self.lr = lr
        self.device = device
        self.optimizer = AdamW(self.model.parameters(), lr=self.lr)

        self.model.to(self.device)

    def train(self) -> None:
        num_training_steps = self.num_train_epochs * len(self.train_dataloader)
        progress_bar = tqdm(range(num_training_steps))
        lr_scheduler = get_linear_schedule_with_warmup(
            optimizer=self.optimizer,
            num_warmup_steps=0.06*(len(self.train_dataloader) * self.num_train_epochs),
            num_training_steps=num_training_steps,
        )

        for epoch in range(self.num_train_epochs):
            self.model.train()
            for batch in self.train_dataloader:
                batch = {k: v.to(device) for k, v in batch.items()}
                output = self.model(**batch)
                loss = output.loss
                loss.backward()
                self.optimizer.step()
                lr_scheduler.step()
                self.optimizer.zero_grad()
                progress_bar.update(1)

            # evaluate after each epoch
            print(f"evaluate ... epoch:{epoch}")
            results = self.evaluation()
            print(results)

    def evaluation(self) -> dict:
        self.model.eval()
        for batch in self.eval_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            with torch.no_grad():
                output = self.model(**batch)
                logits = output.logits
            
            labels = batch["labels"]
            return compute_metrics((logits, labels))
        
    def predict(self, dataset) -> dict:
        self.model.eval()
        dataloader = DataLoader(
            dataset,
            shuffle=False,
            collate_fn=collate_fn,
            batch_size=8)
        for batch in dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            with torch.no_grad():
                output = self.model(**batch)
                logits = output.logits
            
            labels = batch["labels"]
            return compute_metrics((logits, labels))

In [7]:
def tokenize(batch: arrow_dataset.Dataset):
    processed = tokenizer(batch["text"], truncation=True,
                          padding="max_length", return_tensors="pt")
    # processed.pop("input_ids")
    # processed.pop("attention_mask")
    return processed
    
def collate_fn(examples):
    return tokenizer.pad(examples, padding="longest", return_tensors="pt")

In [8]:
def train_eval_test_split(dataset : Dataset) -> Dict[str, Dataset]:
    split_datasets = dict()
    train_eval_test = dataset.train_test_split(test_size=0.4, stratify_by_column="labels", seed=10)
    split_datasets["train"] = train_eval_test["train"]
    eval_test = train_eval_test["test"].train_test_split(test_size=0.5, stratify_by_column="labels", seed=10)
    split_datasets["eval"] = eval_test["train"]
    split_datasets["test"] = eval_test["test"]
    return split_datasets

In [9]:
def get_stat(dataset : Dataset) -> None:
    labels = dataset["labels"]
    pos_count = sum([1 for i in labels if i==1])
    neg_count = sum([1 for i in labels if i==0])

    pos_pct = pos_count/(pos_count + neg_count)
    neg_pct = neg_count/(pos_count + neg_count)
    print(f"There are: \n{pos_count} positive reviews.\n{neg_count} negative reviews.")
    print(f"Percentage of positive reviews: {pos_pct*100:.2f}%")
    print(f"Percentage of negative reviews: {neg_pct*100:.2f}%")

In [10]:
if __name__ == "__main__":
    lm_checkpoint = "xlm-roberta-base"
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    raw_dataset = load_dataset("ar_res_reviews", split="train").rename_column("polarity", "labels")

    split_datasets = train_eval_test_split(raw_dataset)

    for split, data in split_datasets.items():
        print(f"{split} data stats")
        get_stat(data)
        print()

    tokenizer = AutoTokenizer.from_pretrained(lm_checkpoint, padding_side="right")
    if getattr(tokenizer, "pad_token_id") is None:
        tokenizer.pad_token_id = tokenizer.eos_token_id

    model = AutoModelForSequenceClassification.from_pretrained(lm_checkpoint, return_dict=True)
    model = get_peft_model(model, peft_config)
    model.print_trainable_parameters()

    tokenized_datasets = dict()
    for split in ["train", "eval", "test"]:
        tokenized_datasets[split] = split_datasets[split].map(tokenize, batched=True, batch_size=8)
        tokenized_datasets[split] = tokenized_datasets[split].remove_columns(["text", "restaurant_id", "user_id"])
        tokenized_datasets[split] = tokenized_datasets[split].with_format("torch")

    train_dataloader = DataLoader(
        tokenized_datasets["train"],
        # collate_fn=collate_fn,
        shuffle=True,
        batch_size=8)
    
    eval_dataloader = DataLoader(
        tokenized_datasets["eval"],
        # collate_fn=collate_fn,
        shuffle=False, 
        batch_size=8)


Downloading builder script:   0%|          | 0.00/3.11k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.66k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/4.79k [00:00<?, ?B/s]

Downloading and preparing dataset ar_res_reviews/default to /root/.cache/huggingface/datasets/ar_res_reviews/default/0.0.0/f303714dc96c8056d45dca8950e5b7fe6ad59b88d0c095e07724e0484824031c...


Downloading data:   0%|          | 0.00/1.11M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/8364 [00:00<?, ? examples/s]

Dataset ar_res_reviews downloaded and prepared to /root/.cache/huggingface/datasets/ar_res_reviews/default/0.0.0/f303714dc96c8056d45dca8950e5b7fe6ad59b88d0c095e07724e0484824031c. Subsequent calls will reuse this data.
train data stats
There are: 
3567 positive reviews.
1451 negative reviews.
Percentage of positive reviews: 71.08%
Percentage of negative reviews: 28.92%

eval data stats
There are: 
1190 positive reviews.
483 negative reviews.
Percentage of positive reviews: 71.13%
Percentage of negative reviews: 28.87%

test data stats
There are: 
1189 positive reviews.
484 negative reviews.
Percentage of positive reviews: 71.07%
Percentage of negative reviews: 28.93%



Downloading (…)lve/main/config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense

trainable params: 887042 || all params: 278340098 || trainable%: 0.31868997904858104


  0%|          | 0/628 [00:00<?, ?ba/s]

  0%|          | 0/210 [00:00<?, ?ba/s]

  0%|          | 0/210 [00:00<?, ?ba/s]

In [11]:
if __name__ == "__main__":
    trainer = Trainer(model=model, 
                      train_dataloader=train_dataloader, 
                      eval_dataloader=eval_dataloader,
                      num_train_epochs=4, 
                      lr=3e-4,
                      device=device)
    # print("Evaluating before training ... ")
    # result = trainer.predict(tokenized_datasets["test"])
    # print(result)
    
    print("training ... ")
    trainer.train()

    print("Evaluating after training ... ")
    result = trainer.predict(tokenized_datasets["test"])
    print(result)
    



training ... 


  0%|          | 0/2512 [00:00<?, ?it/s]

evaluate ... epoch:0


Downloading builder script:   0%|          | 0.00/7.55k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.36k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

{'precision': 0.6666666666666666, 'recall': 1.0, 'f1': 0.8, 'accuracy': 0.75}
evaluate ... epoch:1
{'precision': 0.8, 'recall': 1.0, 'f1': 0.888888888888889, 'accuracy': 0.875}
evaluate ... epoch:2
{'precision': 0.8, 'recall': 1.0, 'f1': 0.888888888888889, 'accuracy': 0.875}
evaluate ... epoch:3


You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'precision': 0.8, 'recall': 1.0, 'f1': 0.888888888888889, 'accuracy': 0.875}
Evaluating after training ... 
{'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'accuracy': 1.0}
