- source : https://www.datacamp.com/tutorial/fine-tuning-large-language-models

## Load dataset

In [1]:
!pip install datasets evaluate -q
!pip install accelerate -U -q

In [2]:
import pandas as pd

from datasets import load_dataset

dataset = load_dataset("mteb/tweet_sentiment_extraction")
df = pd.DataFrame(dataset['train'])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [31]:
df.sample(5)

Unnamed: 0,id,text,label,label_text
1588,a7f72a928a,WOOOOOOOOOO are you coming to Nottingham at...,2,positive
23879,ef42dee96c,resting had a whole day of walking,1,neutral
6561,07d17131b1,"was in Palawan a couple of days ago, i`ll try ...",1,neutral
2602,2820205db5,I know! I`m so slow its horrible. DON`T TELL ...,0,negative
4003,7d3ce4363c,"Glad I went out, glad I didn`t leave early, an...",2,positive


In [4]:
df.shape

(27481, 4)

## Load Tokenizer

In [5]:
from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

In [6]:
def tokenize_function(examples):
   return tokenizer(examples["text"], padding="max_length", truncation=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

In [7]:
small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000))
small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000))

## Load Model

In [8]:
from transformers import GPT2ForSequenceClassification

model = GPT2ForSequenceClassification.from_pretrained("gpt2", num_labels=3)

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Evaluate Method

In [9]:
import evaluate
import numpy as np

metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
   logits, labels = eval_pred
   predictions = np.argmax(logits, axis=-1)
   return metric.compute(predictions=predictions, references=labels)

## Train

In [10]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
   output_dir="test_trainer",
   #evaluation_strategy="epoch",
   per_device_train_batch_size=1,  # Reduce batch size here
   per_device_eval_batch_size=1,    # Optionally, reduce for evaluation as well
   gradient_accumulation_steps=4
   )


trainer = Trainer(
   model=model,
   args=training_args,
   train_dataset=small_train_dataset,
   eval_dataset=small_eval_dataset,
   compute_metrics=compute_metrics,

)

trainer.train()

Step,Training Loss
500,0.8046


TrainOutput(global_step=750, training_loss=0.6970193684895833, metrics={'train_runtime': 797.5911, 'train_samples_per_second': 3.761, 'train_steps_per_second': 0.94, 'total_flos': 1567794659328000.0, 'train_loss': 0.6970193684895833, 'epoch': 3.0})

In [11]:
import evaluate

trainer.evaluate()

{'eval_loss': 1.256435751914978,
 'eval_accuracy': 0.718,
 'eval_runtime': 92.7697,
 'eval_samples_per_second': 10.779,
 'eval_steps_per_second': 10.779,
 'epoch': 3.0}

In [12]:
trainer.save_model("fine_tuned_model")

In [28]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("fine_tuned_model")

In [29]:
# Contoh menggunakan model untuk prediksi
import torch

input_text = "sad because manchester united is lose yesterday night"
encoded_input = tokenizer(input_text, return_tensors='pt')
output = model(**encoded_input)
predicted_class = torch.argmax(output.logits).item()
print("Prediksi kelas:", predicted_class)

Prediksi kelas: 0


In [30]:
output.logits

tensor([[ 6.1086,  0.4756, -1.2046]], grad_fn=<IndexBackward0>)