In [1]:
! pip install datasets transformers evaluate peft

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m7

In [4]:
! pip install torch



In [5]:
from datasets import load_dataset, DatasetDict, Dataset

from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer
)

from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
import evaluate
import torch
import numpy as np

In [7]:
model_checkpt = "distilbert/distilbert-base-uncased"

id2label = {0: "Negative", 1: "Positive"}
label2id = {"Negative": 0, "Positive": 1}

model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpt, num_labels=2, id2label=id2label, label2id=label2id
)

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Load Data

In [8]:
dataset = load_dataset("shawhin/imdb-truncated")
dataset

README.md:   0%|          | 0.00/592 [00:00<?, ?B/s]

(…)-00000-of-00001-5a744bf76a1d84b2.parquet:   0%|          | 0.00/836k [00:00<?, ?B/s]

(…)-00000-of-00001-a3a52fabb70c739f.parquet:   0%|          | 0.00/853k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['label', 'text'],
        num_rows: 1000
    })
})

Preprocess data

In [9]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpt, add_prefix_space=True)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [15]:
def tokenize_function(examples):
  text = examples['text']

  tokenizer.truncation_side='left'
  tokenized_inputs = tokenizer(text, return_tensors='np', truncation=True, max_length=512)

  return tokenized_inputs

# add pad token if none exists
if tokenizer.pad_token is None:
  tokenizer.add_special_tokens({'pad_token': '[PAD]'})
  model.resize_token_embeddings(len(tokenizer))

# tokenize training and validation datasets
tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 1000
    })
})

In [16]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Evaluation metrics

In [17]:
accuracy = evaluate.load('accuracy')

def compute_metrics(p):
  predictions, labels = p
  predictions = np.argmax(predictions, axis=1)

  return {"accuracy": accuracy.compute(predictions=predictions,
                                       references=labels)}

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Untrain model performace

In [24]:
text_list = ["It was good", "Not a fan, don't recommend.", "Better than the first one.", "This one is a pass."]

print("Untrained model predictions:")
print("--"*30)
for text in text_list:
  inputs = tokenizer.encode(text, return_tensors='pt')
  # compute logits
  logits = model(inputs).logits
  # convert logits to label
  predictions = torch.argmax(logits)

  print(text + " - " + id2label[predictions.tolist()])

Untrained model predictions:
------------------------------------------------------------
It was good - Negative
Not a fan, don't recommend. - Negative
Better than the first one. - Negative
This one is a pass. - Positive


In [26]:
model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


# Fine-tuning with LORA

In [25]:
peft_config = LoraConfig(task_type="SEQ_CLS", # sequence classification
                         r = 4, # intrinsic rank of trainable weight matrix
                         lora_alpha=32, # this is like a learning rate
                         lora_dropout=0.01, # probability of dropout
                         target_modules = ['q_lin'] # we apply lora to query layer
)

In [27]:
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 628,994 || all params: 67,584,004 || trainable%: 0.9307


In [29]:
lr = 1e-3
batch_size = 4
num_epochs = 10

training_args = TrainingArguments(
    output_dir=model_checkpt + "-lora-text-classification",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True
)



In [30]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['validation'],
    tokenizer=tokenizer,
    data_collator=data_collator, # this will dinamically pad examples in each bach
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [31]:
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.421947,{'accuracy': 0.869}
2,0.423600,0.444853,{'accuracy': 0.876}
3,0.423600,0.794445,{'accuracy': 0.867}
4,0.211600,0.645299,{'accuracy': 0.881}
5,0.211600,0.768755,{'accuracy': 0.887}
6,0.054600,0.933386,{'accuracy': 0.884}
7,0.054600,0.982845,{'accuracy': 0.879}
8,0.006900,1.054014,{'accuracy': 0.88}
9,0.006900,1.056414,{'accuracy': 0.886}
10,0.012200,1.089359,{'accuracy': 0.881}


Trainer is attempting to log a value of "{'accuracy': 0.869}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'accuracy': 0.876}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'accuracy': 0.867}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'accuracy': 0.881}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'accuracy': 0.887}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This i

TrainOutput(global_step=2500, training_loss=0.14176764373779296, metrics={'train_runtime': 492.0722, 'train_samples_per_second': 20.322, 'train_steps_per_second': 5.081, 'total_flos': 1112883852759936.0, 'train_loss': 0.14176764373779296, 'epoch': 10.0})

In [32]:
model.to('cuda')

PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): DistilBertForSequenceClassification(
      (distilbert): DistilBertModel(
        (embeddings): Embeddings(
          (word_embeddings): Embedding(30522, 768, padding_idx=0)
          (position_embeddings): Embedding(512, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (transformer): Transformer(
          (layer): ModuleList(
            (0-5): 6 x TransformerBlock(
              (attention): DistilBertSdpaAttention(
                (dropout): Dropout(p=0.1, inplace=False)
                (q_lin): lora.Linear(
                  (base_layer): Linear(in_features=768, out_features=768, bias=True)
                  (lora_dropout): ModuleDict(
                    (default): Dropout(p=0.01, inplace=False)
                  )
                  (lora_A): ModuleDict(
                    (default): Linear(in_features=7

In [36]:
print("Trained model predictions:")
print("--"*30)
for text in text_list:
  inputs = tokenizer.encode(text, return_tensors='pt').to("cuda")

  logits = model(inputs).logits
  predictions = torch.max(logits, 1).indices

  print(text + " - " + id2label[predictions.tolist()[0]])

Trained model predictions:
------------------------------------------------------------
It was good - Positive
Not a fan, don't recommend. - Negative
Better than the first one. - Positive
This one is a pass. - Positive
