<a href="https://colab.research.google.com/github/bartheart/Tuning_LLM_for_sentiment_analysis/blob/main/Tuning_LLM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
!pip install evaluate
!pip install datasets
!pip install peft

Collecting peft
  Downloading peft-0.7.1-py3-none-any.whl (168 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m168.3/168.3 kB[0m [31m776.4 kB/s[0m eta [36m0:00:00[0m
Collecting accelerate>=0.21.0 (from peft)
  Downloading accelerate-0.25.0-py3-none-any.whl (265 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m265.7/265.7 kB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate, peft
Successfully installed accelerate-0.25.0 peft-0.7.1


In [1]:
from datasets import load_dataset, DatasetDict, Dataset

In [2]:
from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer)

In [3]:
from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
import evaluate
import torch
import numpy as np

#Base model selection

In [4]:
model_checkpoint = 'distilbert-base-uncased'

Define label maps

In [5]:
id2label = {0: "Negative", 1: "Positive"}
label2id = {"Negative": 0 , "Positive": 1}

Import auto base model from transformers

In [6]:
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=2, id2label=id2label, label2id = label2id)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias', 'pre_classifier.weight', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


load dataset

In [7]:
dataset= load_dataset("shawhin/imdb-truncated")
dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['label', 'text'],
        num_rows: 1000
    })
})

#Preprocessing

Change all the data into numbers for the nueral network

In [8]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space = True)

In [9]:
def tokenize(examples):
  text = examples['text']

  tokenizer.truncate_side = "left"
  tokenized_inputs = tokenizer (
      text,
      return_tensors='np',
      truncation= True,
      max_length= 512
  )

  return tokenized_inputs

In [10]:
if tokenizer.pad is None:
  tokenizer.add_special_tokens({'pad_token': '[PAD]'})
  model.resize_token_embeddings(len(tokenizer))

In [11]:
tokenized_dataset = dataset.map(tokenize, batched=True)
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 1000
    })
})

In [12]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [13]:
accuracy = evaluate.load("accuracy")


def compute_metrics (p):
  predictions, labels = p
  predictions = np.argmax(predictions, axis=1)

  return {"accuracy": accuracy.compute(predictions=predictions, references=labels)}

In [14]:
test_list = ["This one is a pass",
             "I love this icecream",
             "what a bad day",
             "this is absoloutely devestating",
             "I like cakes"]

In [15]:
for text in test_list:
  inputs = tokenizer.encode(text, return_tensors="pt")
  logits = model(inputs).logits
  predictions = torch.argmax(logits)

  print(text + " - " + id2label[predictions.tolist()])

This one is a pass - Negative
I love this icecream - Negative
what a bad day - Negative
this is absoloutely devestating - Negative
I like cakes - Negative


Fine-tuning the model

In [16]:
peft_config = LoraConfig(
    task_type="SEQ_CLS",
    r = 4,
    lora_alpha=32,
    lora_dropout=0.01,
    target_modules = ['q_lin']
)

In [17]:
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 628,994 || all params: 67,584,004 || trainable%: 0.9306847223789819


In [18]:
lr = 0.001
batch_size = 4
num_epochs = 10

In [26]:
!pip install transformers --upgrade

Collecting transformers
  Downloading transformers-4.36.2-py3-none-any.whl (8.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.2/8.2 MB[0m [31m58.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.35.2
    Uninstalling transformers-4.35.2:
      Successfully uninstalled transformers-4.35.2
Successfully installed transformers-4.36.2


In [19]:
training_args = TrainingArguments(
    output_dir= model_checkpoint + "-lora-text-classification",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

In [20]:
trainer = Trainer(
    model= model,
    args= training_args,
    train_dataset= tokenized_dataset["train"],
    eval_dataset= tokenized_dataset["validation"],
    tokenizer= tokenizer,
    data_collator= data_collator,
    compute_metrics= compute_metrics,
)

In [21]:
trainer.train()

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.355958,{'accuracy': 0.877}
2,0.439200,0.569805,{'accuracy': 0.859}
3,0.439200,0.554527,{'accuracy': 0.879}
4,0.213400,0.711044,{'accuracy': 0.863}
5,0.213400,0.861454,{'accuracy': 0.862}
6,0.065500,0.874999,{'accuracy': 0.872}
7,0.065500,1.038491,{'accuracy': 0.86}
8,0.017100,1.080773,{'accuracy': 0.87}
9,0.017100,1.094756,{'accuracy': 0.866}
10,0.005000,1.120091,{'accuracy': 0.866}


Trainer is attempting to log a value of "{'accuracy': 0.877}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Checkpoint destination directory distilbert-base-uncased-lora-text-classification/checkpoint-250 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Trainer is attempting to log a value of "{'accuracy': 0.859}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Checkpoint destination directory distilbert-base-uncased-lora-text-classification/checkpoint-500 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Trainer is attempting to log a value of "{'accuracy': 0.879}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we drop

TrainOutput(global_step=2500, training_loss=0.1480426664352417, metrics={'train_runtime': 477.785, 'train_samples_per_second': 20.93, 'train_steps_per_second': 5.232, 'total_flos': 1113026652407424.0, 'train_loss': 0.1480426664352417, 'epoch': 10.0})

In [26]:
!pip install torch --upgrade



In [24]:
import torch
print(torch.version.cuda)

12.1


In [26]:
!pip install torch --upgrade

Collecting torch
  Downloading torch-2.1.2-cp310-cp310-manylinux1_x86_64.whl (670.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m670.2/670.2 MB[0m [31m1.0 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m66.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m823.6/823.6 kB[0m [31m69.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.1/14.1 MB[0m [31m91.3 MB/s[0m

In [24]:
text_list = {"What an awful thing",
             "I hate this place",
             "This is the best thing",
             "I have the best gift",
             "She is nice to me"}

In [25]:
print("Tuned model predictions:")
for text in text_list:
    inputs = tokenizer.encode(text, return_tensors="pt").to("cuda")
    logits = model(inputs).logits
    predictions = torch.max(logits, 1).indices

    print(text + " - " + id2label[predictions.tolist()[0]])

Tuned model predictions:
What an awful thing - Negative
I hate this place - Negative
She is nice to me - Positive
I have the best gift - Positive
This is the best thing - Positive
