<a href="https://colab.research.google.com/github/Yosafat0804/Emotion_Classification_GoEmotions_Using_RoBERTa/blob/main/Emotion_Classification_GoEmotions_Using_RoBERTa.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# =========================
# 0. Imports
# =========================
from datasets import load_dataset
from transformers import (
    RobertaTokenizer,
    RobertaForSequenceClassification,
    Trainer,
    TrainingArguments
)
import torch
import numpy as np
from sklearn.metrics import f1_score

In [None]:
# =========================
# 1. Load dataset
# =========================
# Fetch GoEmotions, then shrink every split to a fixed-seed random subset.
dataset = load_dataset("go_emotions")

for split_name, subset_size in (("train", 7000), ("validation", 3000), ("test", 3000)):
    shuffled_split = dataset[split_name].shuffle(seed=42)
    dataset[split_name] = shuffled_split.select(range(subset_size))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

simplified/train-00000-of-00001.parquet:   0%|          | 0.00/2.77M [00:00<?, ?B/s]

simplified/validation-00000-of-00001.par(…):   0%|          | 0.00/350k [00:00<?, ?B/s]

simplified/test-00000-of-00001.parquet:   0%|          | 0.00/347k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/43410 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/5426 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5427 [00:00<?, ? examples/s]

In [None]:
# =========================
# 2. Tokenizer
# =========================
from transformers import RobertaTokenizerFast

# Tokenizer for the "roberta-base" checkpoint. Later cells use this object both for
# the training-time tokenization and as the tokenizer passed to the ONNX export.
tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

In [None]:
# =========================
# 3. Encode labels (multi-label)
# =========================
NUM_LABELS = 28


def encode_labels(example):
    """Turn an example's list of label ids into a multi-hot float vector.

    Args:
        example: Mapping whose "labels" entry is an iterable of integer
            class indices.

    Returns:
        The same ``example`` object, with "labels" overwritten by a list of
        ``NUM_LABELS`` floats: 1.0 at each listed index, 0.0 everywhere else.
    """
    multi_hot = [0.0 for _ in range(NUM_LABELS)]
    for class_index in example["labels"]:
        multi_hot[class_index] = 1.0
    example["labels"] = multi_hot
    return example

# Apply encode_labels to the dataset so "labels" holds a multi-hot float vector
# instead of a list of class ids.
dataset = dataset.map(encode_labels)

Map:   0%|          | 0/7000 [00:00<?, ? examples/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

In [None]:
# =========================
# 4. Tokenization (SLIDING WINDOW)
# =========================
MAX_LEN = 256
STRIDE = 128


def tokenize_function(examples):
    """Tokenize a batch of texts into fixed-length chunks and label every chunk.

    The tokenizer is asked to truncate/pad to ``MAX_LEN`` and to return the
    overflowing tokens as extra chunks that overlap by ``STRIDE`` tokens.

    Args:
        examples: Batch mapping with a "text" list and a parallel "labels" list.

    Returns:
        The tokenizer output with an added "labels" entry holding, for each
        produced chunk, the label vector of the example the chunk came from.
    """
    encoded = tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=MAX_LEN,
        stride=STRIDE,
        return_overflowing_tokens=True,
    )

    # "overflow_to_sample_mapping" gives, per chunk, the index of its source row
    # in this batch; use it to repeat the source row's labels for each chunk.
    encoded["labels"] = [
        examples["labels"][source_row]
        for source_row in encoded["overflow_to_sample_mapping"]
    ]
    return encoded

# Drop every original column, not only "text". With return_overflowing_tokens=True
# the tokenizer can return more rows than the batch it was given, and a batched map
# cannot keep an input column whose length no longer matches the output rows.
# "labels" is not lost: tokenize_function returns it again, one vector per chunk.
dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=dataset["train"].column_names
)

Map:   0%|          | 0/7000 [00:00<?, ? examples/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

In [None]:
# =========================
# 5. Torch format
# =========================
# Return rows as torch tensors and expose only the three listed columns.
dataset.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "labels"]
)

In [None]:
# =========================
# 6. Model
# =========================
# Load the "roberta-base" checkpoint into a sequence-classification model with
# NUM_LABELS outputs; problem_type marks the task as multi-label.
model = RobertaForSequenceClassification.from_pretrained(
    "roberta-base",
    num_labels=NUM_LABELS,
    problem_type="multi_label_classification"
)

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# =========================
# 7. Metrics
# =========================
def compute_metrics(eval_pred):
    """Score multi-label predictions with micro- and macro-averaged F1.

    Args:
        eval_pred: Pair ``(logits, labels)``; the logits are passed through a
            sigmoid and compared against 0.5 to obtain 0/1 predictions.

    Returns:
        Dict with the keys "micro_f1" and "macro_f1".
    """
    raw_logits, gold_labels = eval_pred
    probabilities = torch.sigmoid(torch.tensor(raw_logits)).numpy()
    # A label counts as predicted once its probability reaches 0.5.
    binary_preds = (probabilities >= 0.5).astype(int)

    return {
        f"{averaging}_f1": f1_score(
            gold_labels, binary_preds, average=averaging, zero_division=0
        )
        for averaging in ("micro", "macro")
    }

In [None]:
# =========================
# 8. Custom Trainer
# =========================
class MultiLabelTrainer(Trainer):
    """Trainer whose loss is binary cross-entropy computed on the raw logits."""

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute the multi-label loss for one batch.

        Args:
            model: Model to call with the remaining ``inputs`` as keyword arguments.
            inputs: Batch mapping; its "labels" entry is popped and cast to float
                before the forward pass.
            return_outputs: When true, return ``(loss, outputs)`` instead of
                the loss alone.
            **kwargs: Accepted and ignored.

        Returns:
            The loss, or a ``(loss, outputs)`` tuple when ``return_outputs`` is set.
        """
        targets = inputs.pop("labels").float()
        outputs = model(**inputs)

        # Functional form of BCEWithLogitsLoss with its default settings.
        loss = torch.nn.functional.binary_cross_entropy_with_logits(
            outputs.logits, targets
        )

        if return_outputs:
            return loss, outputs
        return loss

In [None]:
# =========================
# 9. Training arguments
# =========================
# Evaluate and checkpoint once per epoch. With load_best_model_at_end, the checkpoint
# selected by the "micro_f1" metric (a key returned by compute_metrics) is reloaded
# when training finishes.
training_args = TrainingArguments(
    output_dir="./goemotions_results",
    eval_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=2,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    learning_rate=2e-5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="micro_f1",
    logging_steps=100,
    report_to="none"  # no external experiment-tracking integration
)

In [None]:
# =========================
# 10. Trainer
# =========================
# Wire the model, arguments, train/validation splits and the F1 metric function
# into the custom trainer defined above.
trainer = MultiLabelTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    compute_metrics=compute_metrics
)

In [None]:
# =========================
# 11. Train
# =========================
# Run the training loop configured by training_args.
print("Starting training...")
trainer.train()

Starting training...


Epoch,Training Loss,Validation Loss,Micro F1,Macro F1
1,0.1459,0.140416,0.009561,0.003236
2,0.1331,0.129446,0.322609,0.058659


TrainOutput(global_step=876, training_loss=0.16335155865917467, metrics={'train_runtime': 696.9372, 'train_samples_per_second': 20.088, 'train_steps_per_second': 1.257, 'total_flos': 1842207338496000.0, 'train_loss': 0.16335155865917467, 'epoch': 2.0})

In [None]:
# =========================
# 12. Evaluate
# =========================
# Evaluate on the held-out test split and print the returned metrics dict.
print("Evaluating on test set...")
print(trainer.evaluate(dataset["test"]))

Evaluating on test set...


{'eval_loss': 0.128588005900383, 'eval_micro_f1': 0.3170840787119857, 'eval_macro_f1': 0.05787173460405544, 'eval_runtime': 39.3286, 'eval_samples_per_second': 76.28, 'eval_steps_per_second': 2.39, 'epoch': 2.0}


In [None]:
# =========================
# 13. Save model
# =========================
# Save the model and the tokenizer into the same directory.
trainer.save_model("./goemotions_model")
tokenizer.save_pretrained("./goemotions_model")

print("Model saved to ./goemotions_model")

Model saved to ./goemotions_model


In [None]:
!pip install -U onnx onnxscript onnxruntime

Collecting onnx
  Downloading onnx-1.20.1-cp312-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Collecting onnxscript
  Downloading onnxscript-0.6.0-py3-none-any.whl.metadata (13 kB)
Collecting onnxruntime
  Downloading onnxruntime-1.23.2-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Collecting onnx_ir<2,>=0.1.15 (from onnxscript)
  Downloading onnx_ir-0.1.15-py3-none-any.whl.metadata (3.2 kB)
Collecting coloredlogs (from onnxruntime)
  Downloading coloredlogs-15.0.1-py2.py3-none-any.whl.metadata (12 kB)
Collecting humanfriendly>=9.1 (from coloredlogs->onnxruntime)
  Downloading humanfriendly-10.0-py2.py3-none-any.whl.metadata (9.2 kB)
Downloading onnx-1.20.1-cp312-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (17.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.5/17.5 MB[0m [31m39.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading onnxscript-0.6.0-py3-none-any.whl (689 kB)
[2K   [90m━━━━━━━━━━━━━━━━

In [None]:
import json
from pathlib import Path
import torch

def export_to_onnx(model, tokenizer, output_dir="./goemotions_model_onnx", max_length=128):
    """Export the classifier to ONNX and save the tokenizer and a small config beside it.

    Side effect: ``model`` is moved to the CPU and switched to eval mode, and it
    stays that way after the call.

    Args:
        model: Model to export; it is called with ``(input_ids, attention_mask)``
            and its output is named "logits" in the graph.
        tokenizer: Tokenizer used to build the dummy export input; also saved
            into ``output_dir``.
        output_dir: Destination folder, created if missing. Receives
            ``model.onnx``, the tokenizer files and ``config.json``.
        max_length: Sequence length the dummy input is padded/truncated to.
            Only axis 0 (batch) is declared dynamic, so inference inputs should
            be padded to this same length. The value is recorded in
            ``config.json``.
    """
    Path(output_dir).mkdir(parents=True, exist_ok=True)

    device = torch.device("cpu")
    model.to(device)
    model.eval()

    dummy_text = "This is a sample text for ONNX export."
    inputs = tokenizer(
        dummy_text,
        truncation=True,
        padding="max_length",
        max_length=max_length,
        return_tensors="pt"
    )
    inputs = {k: v.to(device) for k, v in inputs.items()}

    torch.onnx.export(
        model,
        (inputs["input_ids"], inputs["attention_mask"]),
        f"{output_dir}/model.onnx",
        export_params=True,
        opset_version=18,
        do_constant_folding=True,
        input_names=["input_ids", "attention_mask"],
        output_names=["logits"],
        dynamic_axes={
            "input_ids": {0: "batch_size"},
            "attention_mask": {0: "batch_size"},
            "logits": {0: "batch_size"}
        },
        dynamo=False
    )

    tokenizer.save_pretrained(output_dir)

    # Take the label count from the model itself so the written config cannot
    # drift from what was exported; fall back to 28 if no config is attached.
    num_labels = getattr(getattr(model, "config", None), "num_labels", 28)

    config = {
        "model_type": "roberta",
        "num_labels": num_labels,
        "problem_type": "multi_label_classification",
        "max_length": max_length
    }

    with open(f"{output_dir}/config.json", "w", encoding="utf-8") as f:
        json.dump(config, f, indent=2)

    print("✅ ONNX export SUCCESS")
    print(f"📁 Saved to: {output_dir}")

In [None]:
# Export the trained model and its tokenizer into the ONNX output folder.
export_to_onnx(model, tokenizer, "./goemotions_model_onnx")

  torch.onnx.export(
  inverted_mask = torch.tensor(1.0, dtype=dtype) - expanded_mask


✅ ONNX export SUCCESS
📁 Saved to: ./goemotions_model_onnx


In [None]:
import os

# List the entries of the ONNX export folder, one per line.
print("Isi folder ONNX:")
for entry_name in os.listdir("./goemotions_model_onnx"):
    print("-", entry_name)

Isi folder ONNX:
- model.onnx
- tokenizer_config.json
- vocab.json
- special_tokens_map.json
- config.json
- tokenizer.json
- merges.txt


In [None]:
import shutil

from google.colab import files

# files.download needs an existing file, and no earlier cell creates this archive,
# so zip the contents of the export folder into ./goemotions_model_onnx.zip first.
shutil.make_archive("goemotions_model_onnx", "zip", root_dir="./goemotions_model_onnx")
files.download("goemotions_model_onnx.zip")

In [None]:
# ===============================
# Imports
# ===============================
import onnxruntime as ort
import numpy as np
import scipy.special
from transformers import RobertaTokenizer

# ===============================
# Load ONNX model & tokenizer
# ===============================
ONNX_MODEL_PATH = "./goemotions_model_onnx"

# NOTE: this rebinds the notebook-level `tokenizer` name to a tokenizer loaded
# from the files saved in the export folder.
tokenizer = RobertaTokenizer.from_pretrained(ONNX_MODEL_PATH)

# Inference session over the exported graph, restricted to the CPU provider.
session = ort.InferenceSession(
    f"{ONNX_MODEL_PATH}/model.onnx",
    providers=["CPUExecutionProvider"]
)

# ===============================
# Emotion labels (GoEmotions)
# ===============================
# Position i names the class behind logit i.
EMOTIONS = [
    "admiration", "amusement", "anger", "annoyance",
    "approval", "caring", "confusion", "curiosity",
    "desire", "disappointment", "disapproval", "disgust",
    "embarrassment", "excitement", "fear", "gratitude",
    "grief", "joy", "love", "nervousness",
    "optimism", "pride", "realization", "relief",
    "remorse", "sadness", "surprise", "neutral",
]

# ===============================
# Predict function (MULTI-LABEL, ONNX)
# ===============================
def predict_onnx(
    text,
    threshold=0.05,
    top_k=None,
    remove_neutral=False
):
    """Score ``text`` with the ONNX session and return the emotions above a threshold.

    The text is truncated/padded to 128 tokens before inference, and only the
    first row of the session's first output is read.

    Args:
        text: Input passed to the tokenizer.
        threshold: Minimum sigmoid probability for an emotion to be kept.
        top_k: When not None, keep at most this many of the highest-scoring
            emotions.
        remove_neutral: When true, "neutral" is never returned.

    Returns:
        List of ``(emotion, probability)`` tuples sorted by descending
        probability; the probabilities are Python floats.
    """
    encoded = tokenizer(
        text,
        return_tensors="np",
        truncation=True,
        padding="max_length",
        max_length=128
    )
    feed = {name: encoded[name] for name in ("input_ids", "attention_mask")}

    first_row_logits = session.run(None, feed)[0][0]
    probabilities = scipy.special.expit(first_row_logits)  # sigmoid

    # Pair each probability with its label name, then filter.
    labelled = (
        (EMOTIONS[position], score)
        for position, score in enumerate(probabilities)
    )
    kept = [
        (name, float(score))
        for name, score in labelled
        if not (remove_neutral and name == "neutral") and score >= threshold
    ]

    ranked = sorted(kept, key=lambda pair: pair[1], reverse=True)
    return ranked if top_k is None else ranked[:top_k]

# ===============================
# Test with lyric
# ===============================
if __name__ == "__main__":
    # Demo input: a full song lyric, passed to predict_onnx as one string.
    lyric = """
    I found a love for me
Darling, just dive right in and follow my lead
Well, I found a girl, beautiful and sweet
Oh, I never knew you were the someone waitin' for me
'Cause we were just kids when we fell in love
Not knowin' what it was
I will not give you up this time
Darling, just kiss me slow
Your heart is all I own
And in your eyes, you're holdin' mine
Baby, I'm dancin' in the dark
With you between my arms
Barefoot on the grass
Listenin' to our favourite song
When you said you looked a mess
I whispered underneath my breath
But you heard it
Darling, you look perfect tonight
I found a woman
Stronger than anyone I know
She shares my dreams
I hope that someday I'll share her home
I found a love
To carry more than just my secrets
Carry love, to carry children
Of our own
We are still kids and we're so in love
Fightin' against all odds
I know we'll be alright this time
Darling, just hold my hand
Be my girl, I'll be your man
I see my future in your eyes
Baby, I'm dancin' in the dark
With you between my arms
Barefoot on the grass
Listenin' to our favourite song
When I saw you in that dress
Looking so beautiful
I don't deserve this
Darling, you look perfect tonight
Mm, sing with me, yeah
Baby, I'm (sing) dancin' in the dark
With you between my arms
Barefoot on the grass
Listenin' to our favourite song
I have faith in what I see
Now I know I have met an angel in person
And she looks perfect
Oh, I don't deserve this
You look perfect tonight
    """

    # Echo the stripped lyric between divider lines.
    divider = "=" * 50
    for banner_line in (divider, "LYRIC INPUT:", divider, lyric.strip(), divider):
        print(banner_line)

    print("\nPredicted emotions (ONNX):\n")

    # Up to five non-neutral emotions with probability of at least 3%.
    predictions = predict_onnx(lyric, threshold=0.03, top_k=5, remove_neutral=True)

    for emotion, prob in predictions:
        print(f"- {emotion:<15}: {prob:.1%}")

LYRIC INPUT:
I found a love for me
Darling, just dive right in and follow my lead
Well, I found a girl, beautiful and sweet
Oh, I never knew you were the someone waitin' for me
'Cause we were just kids when we fell in love
Not knowin' what it was
I will not give you up this time
Darling, just kiss me slow
Your heart is all I own
And in your eyes, you're holdin' mine
Baby, I'm dancin' in the dark
With you between my arms
Barefoot on the grass
Listenin' to our favourite song
When you said you looked a mess
I whispered underneath my breath
But you heard it
Darling, you look perfect tonight
I found a woman
Stronger than anyone I know
She shares my dreams
I hope that someday I'll share her home
I found a love
To carry more than just my secrets
Carry love, to carry children
Of our own
We are still kids and we're so in love
Fightin' against all odds
I know we'll be alright this time
Darling, just hold my hand
Be my girl, I'll be your man
I see my future in your eyes
Baby, I'm dancin' in the d