### Reference
- https://github.com/bhadreshpsavani/ExploringSentimentalAnalysis/blob/main/SentimentalAnalysisWithDistilbert.ipynb
- https://github.com/huggingface/notebooks/blob/main/examples/text_classification.ipynb

# Libraries

In [8]:
%pip install -q -U datasets transformers accelerate sentencepiece

In [10]:
import os
import random
import numpy as np
import torch
import transformers
from pprint import pprint
from datetime import datetime
from datasets import load_dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer, DataCollatorWithPadding, Trainer, TrainingArguments
from sklearn.metrics import f1_score, accuracy_score

# Config

In [42]:
seed = 42
lang = 'sun'

hf_model_id = 'FacebookAI/xlm-roberta-base'
hf_tokenizer_id = hf_model_id
hf_data_id = 'alxxtexxr/sundanese-twitter-dataset'
hf_data_config = '2506_12_12'

project_name = f'XLM-RoBERTa-Base-Sundanese-Emotion-Classifier-v{datetime.now().strftime("%Y%m%d%H%M%S")}'
print("Project name:", project_name)

label2emotion = {
    0: 'marah',
    1: 'sedih',
    2: 'senang',
    3: 'takut',
}

num_labels = 4
num_epochs = 10
batch_size = 64

Project name: XLM-RoBERTa-Base-Sundanese-Emotion-Classifier-v20241222173652


In [12]:
def set_seed(seed):
    # Set random seed for NumPy
    np.random.seed(seed)

    # Set random seed for Torch
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # if using multi-GPU
    torch.backends.cudnn.deterministic = True  # Ensures deterministic results
    torch.backends.cudnn.benchmark = False  # Avoids non-deterministic algorithms

    # Set random seed for Transformers
    transformers.set_seed(seed)

    # Optionally set random seed for sklearn and Python's own random module
    random.seed(seed)

    # Set random seed for os
    os.environ['PYTHONHASHSEED'] = str(seed)

    print(f"Random seed set to: {seed}")

set_seed(seed)

Random seed set to: 42


# Data

## Load Data

In [15]:
datasets = load_dataset(hf_data_id, hf_data_config)
cols = list(datasets['train'].features)
splits = [*datasets.keys()]

print("Data columns:", cols)

Data columns: ['label', 'data', 'emotion']


## Preprocess Data

In [16]:
tokenizer = AutoTokenizer.from_pretrained(hf_tokenizer_id)

In [17]:
def one_hot_encode_emotion(emotion, emotion_cols):
    emotions = emotion.replace(" ", "").split(",")
    one_hot_emotion = [1.0 if emotion_col in emotions else 0.0 for emotion_col in emotion_cols] # Ensure that the label is float, not int
    return one_hot_emotion

def preprocess_function(data):
   text = data['data']
   data = tokenizer(text, padding=True, truncation=True)
   return data

tokenized_datasets = {split: datasets[split].map(preprocess_function) for split in splits}

Map:   0%|          | 0/2506 [00:00<?, ? examples/s]

Map:   0%|          | 0/12 [00:00<?, ? examples/s]

Map:   0%|          | 0/12 [00:00<?, ? examples/s]

In [21]:
# Sanity check
data = tokenized_datasets['train'][5]

print("Data keys:", list(data.keys()))
print("Text:", data['data'])
print("Emotion:", data['emotion'])
print("Label:", data['label'])

Data keys: ['label', 'data', 'emotion', 'input_ids', 'attention_mask']
Text: hate dadas jadian acan urg engges bebeakan padahal mah jelema siga chiki chuba
Emotion: sedih
Label: 1


In [20]:
data_collator = DataCollatorWithPadding(tokenizer)

# Model

In [29]:
model = AutoModelForSequenceClassification.from_pretrained(
    hf_model_id, 
    num_labels=num_labels,

    # low_cpu_mem_usage=True,
    # return_dict=True,
    # torch_dtype=torch.float16,
    device_map='auto', 
)

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Finetuning

In [30]:
def sigmoid(x):
   return 1/(1 + np.exp(-x))

def compute_metrics(eval_pred):
    y_true = eval_pred.label_ids
    y_pred = eval_pred.predictions.argmax(-1)

    # Compute F1 score for each type of averaging method
    accuracy = accuracy_score(y_true, y_pred)
    f1_micro = f1_score(y_true, y_pred, average='micro', zero_division=0.0)
    f1_macro = f1_score(y_true, y_pred, average='macro', zero_division=0.0)
    f1_weighted = f1_score(y_true, y_pred, average='weighted', zero_division=0.0)

    return {
        'accuracy': accuracy,
        'f1_micro': f1_micro,
        'f1_macro': f1_macro,
        'f1_weighted': f1_weighted,
    }

In [31]:
train_args = TrainingArguments(
    # Training config
    per_device_train_batch_size=batch_size,
    num_train_epochs=num_epochs,
    learning_rate=2e-5,
    weight_decay=0.01,

    # Logging config for training
    logging_strategy='steps',
    logging_steps=100,

    # Evaluation config during training
    per_device_eval_batch_size=2,
    eval_strategy='steps',
    eval_steps=100,

    # Model saving config
    output_dir=project_name,
    save_strategy='epoch',
    # load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=train_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['val'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [32]:
trainer.train()

Step,Training Loss,Validation Loss,Accuracy,F1 Micro,F1 Macro,F1 Weighted
100,1.0664,0.443155,0.916667,0.916667,0.914286,0.914286
200,0.4356,0.106835,0.916667,0.916667,0.914286,0.914286
300,0.19,0.010901,1.0,1.0,1.0,1.0
400,0.1069,0.008979,1.0,1.0,1.0,1.0


TrainOutput(global_step=400, training_loss=0.44973822832107546, metrics={'train_runtime': 1396.5427, 'train_samples_per_second': 17.944, 'train_steps_per_second': 0.286, 'total_flos': 1053395331370512.0, 'train_loss': 0.44973822832107546, 'epoch': 10.0})

In [36]:
trainer.push_to_hub(project_name)

events.out.tfevents.1734888772.f21f19ca844c.5859.2:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/alxxtexxr/XLM-RoBERTa-Base-Sundanese-Emotion-Classifier-v20241222170134/commit/2c874bef16524c648be43df2499b372c2c3489b7', commit_message='XLM-RoBERTa-Base-Sundanese-Emotion-Classifier-v20241222170134', commit_description='', oid='2c874bef16524c648be43df2499b372c2c3489b7', pr_url=None, repo_url=RepoUrl('https://huggingface.co/alxxtexxr/XLM-RoBERTa-Base-Sundanese-Emotion-Classifier-v20241222170134', endpoint='https://huggingface.co', repo_type='model', repo_id='alxxtexxr/XLM-RoBERTa-Base-Sundanese-Emotion-Classifier-v20241222170134'), pr_revision=None, pr_num=None)

# Evaluation

In [37]:
eval = trainer.evaluate(eval_dataset=tokenized_datasets['test'])

import pandas as pd
pd.DataFrame(eval, index=[0])

Unnamed: 0,eval_loss,eval_accuracy,eval_f1_micro,eval_f1_macro,eval_f1_weighted,eval_runtime,eval_samples_per_second,eval_steps_per_second,epoch
0,0.017732,1.0,1.0,1.0,1.0,0.087,137.892,68.946,10.0


# Inference

In [45]:
data = datasets['val'][1]
text = data['data']
emotion_true = data['emotion']

inputs = tokenizer(text, return_tensors='pt').to(model.device)

outputs = trainer.model(**inputs)
logits = outputs.logits
label_pred = logits.argmax(-1).item()
emotion_pred = label2emotion[label_pred]

print("Text:", text)
print("True emotion:", emotion_true)
print("Predicted emotion:", emotion_pred)

Text: Mau mandi asa takut kieu di rumah sendirian:]y
True emotion: takut
Predicted emotion: takut
