In [13]:
# ============================================================
# Fine-Tuning IndoBERTweet untuk Analisis Sentimen Layanan KAI Access
# ============================================================

import pandas as pd
import numpy as np
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
import torch
import evaluate
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments
)

In [14]:
# ============================================================
# FIXED DATA LOADING & LABEL NORMALIZATION
# ============================================================

# === Path file ===
DATA_FILE = "D:\\Code\\skripsi_sentimen\\02.Preprocessing\\data\\preprocessed_C_stepwise_tidak_normalisasi_STEMwithoutStopword.csv"

# === Load dataset dengan BOM handling ===
df = pd.read_csv(DATA_FILE, encoding="utf-8-sig")

# === Normalisasi nama kolom ===
df.columns = [c.strip().lower() for c in df.columns]
print("Kolom tersedia:", df.columns.tolist())

# === Tentukan kolom utama ===
text_col = "text_stemmed"
label_col = "label"

# === Bersihkan nilai kosong ===
df = df.dropna(subset=[text_col, label_col])

# === Normalisasi label ke format 3 kelas standar ===
df[label_col] = (
    df[label_col]
    .astype(str)
    .str.strip()
    .str.lower()
    .replace({
        "negatif": "neg",
        "positif": "pos",
        "netral": "neu",
        "negative": "neg",
        "positive": "pos",
        "neutral": "neu"
    })
)

# === Filter hanya 3 kelas yang valid ===
df = df[df[label_col].isin(["neg", "neu", "pos"])]

print("\nDistribusi label setelah normalisasi:")
print(df[label_col].value_counts())

Kolom tersedia: ['id_str', 'created_at', 'user_id_str', 'conversation_id_str', 'full_text', 'label', 'text_casefold', 'text_clean', 'tokens', 'tokens_stem', 'text_stemmed', 'emoji_drop_from_raw', 'emoji_map_from_raw', 'emoji_drop_from_stemmed', 'emoji_map_from_stemmed']

Distribusi label setelah normalisasi:
label
neu    721
neg    585
pos    128
Name: count, dtype: int64


In [15]:
# ============================================================
# 3. Split Train-Test (80:20, stratified)
# ============================================================
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df[text_col].astype(str),
    df[label_col],
    test_size=0.2,
    random_state=42,
    stratify=df[label_col]
)

train_df = pd.DataFrame({"text": train_texts, "label": train_labels})
test_df  = pd.DataFrame({"text": test_texts, "label": test_labels})

label2id = {"neg": 0, "neu": 1, "pos": 2}
id2label = {v: k for k, v in label2id.items()}

train_df["label"] = train_df["label"].map(label2id)
test_df["label"]  = test_df["label"].map(label2id)

print("\nDistribusi label TRAIN:\n", train_df["label"].value_counts())
print("\nDistribusi label TEST:\n", test_df["label"].value_counts())



Distribusi label TRAIN:
 label
1    577
0    468
2    102
Name: count, dtype: int64

Distribusi label TEST:
 label
1    144
0    117
2     26
Name: count, dtype: int64


In [16]:
# ============================================================
# 4. Konversi ke HuggingFace Dataset
# ============================================================
ds = DatasetDict({
    "train": Dataset.from_pandas(train_df),
    "test": Dataset.from_pandas(test_df)
})

print("\nDataset siap digunakan untuk tokenisasi:")
print(ds)


Dataset siap digunakan untuk tokenisasi:
DatasetDict({
    train: Dataset({
        features: ['text', 'label', '__index_level_0__'],
        num_rows: 1147
    })
    test: Dataset({
        features: ['text', 'label', '__index_level_0__'],
        num_rows: 287
    })
})


In [17]:
# ============================================================
# 5. Tokenisasi
# ============================================================
MODEL_NAME = "indolem/indobertweet-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize(batch):
    return tokenizer(batch["text"], truncation=True, padding="max_length", max_length=128)

ds_encoded = ds.map(tokenize, batched=True)

Map:   0%|          | 0/1147 [00:00<?, ? examples/s]

Map:   0%|          | 0/287 [00:00<?, ? examples/s]

In [18]:
# ============================================================
# 6. Load Model IndoBERTweet
# ============================================================
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=3,
    id2label=id2label,
    label2id=label2id
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indolem/indobertweet-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
# ============================================================
# 7. Definisi Metrik Evaluasi
# ============================================================
metric_accuracy = evaluate.load("accuracy")
metric_f1 = evaluate.load("f1")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    acc = metric_accuracy.compute(predictions=preds, references=labels)
    f1_macro = metric_f1.compute(predictions=preds, references=labels, average="macro")
    return {
        "accuracy": acc["accuracy"],
        "f1_macro": f1_macro["f1"]
    }

In [20]:
# ============================================================
# 8. Training Configuration (Sesuai Tabel 4.1)
# ============================================================
batch_size = 16
args = TrainingArguments(
    output_dir="indobertweet-finetuned-kai",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=10,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    report_to="none",  # nonaktifkan Weights & Biases
    logging_dir="./logs",
    logging_strategy="epoch"
)


In [21]:
# ============================================================
# 9. Trainer Setup
# ============================================================
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=ds_encoded["train"],
    eval_dataset=ds_encoded["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

  trainer = Trainer(


In [22]:

# ============================================================
# 10. Train Model
# ============================================================
trainer.train()

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None}.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro
1,0.8631,0.769583,0.630662,0.423728
2,0.6663,0.626984,0.752613,0.620017
3,0.48,0.613447,0.773519,0.683131
4,0.2949,0.551545,0.811847,0.745471
5,0.1628,0.597961,0.829268,0.783499
6,0.0982,0.714829,0.8223,0.75245
7,0.0447,0.721486,0.829268,0.755594
8,0.0245,0.770447,0.829268,0.760194
9,0.0165,0.821984,0.825784,0.749955
10,0.0097,0.803809,0.836237,0.765842




TrainOutput(global_step=720, training_loss=0.2660653519961569, metrics={'train_runtime': 6014.5887, 'train_samples_per_second': 1.907, 'train_steps_per_second': 0.12, 'total_flos': 754477725335040.0, 'train_loss': 0.2660653519961569, 'epoch': 10.0})

In [23]:
# ============================================================
# 11. Evaluate Model
# ============================================================
results = trainer.evaluate()
print("\n=== HASIL AKHIR EVALUASI MODEL ===")
for k, v in results.items():
    print(f"{k}: {v:.4f}")




=== HASIL AKHIR EVALUASI MODEL ===
eval_loss: 0.6760
eval_accuracy: 0.8118
eval_f1_macro: 0.7401
eval_runtime: 33.9497
eval_samples_per_second: 8.4540
eval_steps_per_second: 0.5300
epoch: 10.0000


In [None]:
# ============================================================
# 12. Simpan Model
# ============================================================
trainer.save_model("indobertweet-finetuned-kai/best_model")
tokenizer.save_pretrained("indobertweet-finetuned-kai/best_model")
print("\n✅ Model & tokenizer berhasil disimpan ke folder 'indobertweet-finetuned-kai/best_model'")