
# Fine-Tuning IndoBERTweet untuk Analisis Sentimen Layanan KAI

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Import & Konfigurasi Awal
import os
import numpy as np
import pandas as pd
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split

from sklearn.metrics import (
    accuracy_score,
    precision_recall_fscore_support,
    classification_report,
    confusion_matrix,
)
import shutil
import torch
import evaluate

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments
)

SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

device = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", device)

Device: cuda


In [4]:
import torch
print(torch.cuda.is_available())
print(torch.version.cuda)

True
12.6


In [5]:
# Konfigurasi File & Skenario P1â€“P4
# Path file hasil preprocessing (gabungan A + C)
DATA_FILE = "preprocessed_merged_stepwise.csv"

# Mapping skenario preprocessing ke nama kolom teks
SCENARIO_TO_TEXTCOL = {
    "P1": "text_clean",               # only cleaning
    "P2": "text_nostop_text",         # stopword removal only
    "P3": "text_stemmed_noSTOPWORD",  # stemming only
    "P4": "text_stemmed",             # stopword + stemming
}

# pilih skenario yang mau dijalankan (bisa kamu ganti-ganti)
SCENARIO = "P1"   # misal awalnya P1

label_col = "label"

print("Skenario aktif:", SCENARIO, "-> kolom teks:", SCENARIO_TO_TEXTCOL[SCENARIO])


Skenario aktif: P1 -> kolom teks: text_clean


In [6]:
# Data Loading & Normalisasi Label
df = pd.read_csv(DATA_FILE, encoding="utf-8-sig")
df.columns = [c.strip().lower() for c in df.columns]
print("Kolom tersedia:", df.columns.tolist())

text_col = SCENARIO_TO_TEXTCOL[SCENARIO]

# pastikan kolom yang dibutuhkan ada
assert text_col in df.columns, f"Kolom teks '{text_col}' tidak ditemukan."
assert label_col in df.columns, f"Kolom label '{label_col}' tidak ditemukan."

# buang baris yang teks atau labelnya kosong
df = df.dropna(subset=[text_col, label_col])

# normalisasi label ke 3 kelas
df[label_col] = (
    df[label_col]
    .astype(str)
    .str.strip()
    .str.lower()
    .replace({
        "negatif": "neg",
        "negative": "neg",
        "positif": "pos",
        "positive": "pos",
        "netral": "neu",
        "neutral": "neu",
    })
)

# keep hanya 3 label utama
df = df[df[label_col].isin(["neg", "neu", "pos"])]

print("\nDistribusi label setelah normalisasi:")
print(df[label_col].value_counts())


Kolom tersedia: ['id_str', 'created_at', 'user_id_str', 'conversation_id_str', 'full_text', 'label', 'text_casefold', 'text_clean', 'tokens', 'tokens_nostop', 'text_nostop_text', 'tokens_stem', 'text_stemmed', 'emoji_drop_from_raw', 'emoji_map_from_raw', 'emoji_drop_from_stemmed', 'emoji_map_from_stemmed', 'text_stemmed_nostopword']

Distribusi label setelah normalisasi:
label
neu    721
neg    585
pos    128
Name: count, dtype: int64


Mapping Label & Split Train / Validasi / Test (80/10/10)

In [7]:
label2id = {"neg": 0, "neu": 1, "pos": 2}
id2label = {v: k for k, v in label2id.items()}

# 1) Split train + temp (80% train, 20% temp)
train_texts, temp_texts, train_labels, temp_labels = train_test_split(
    df[text_col].astype(str),
    df[label_col],
    test_size=0.2,
    random_state=SEED,
    stratify=df[label_col]
)

# 2) Split temp jadi validation + test (10% + 10%)
val_texts, test_texts, val_labels, test_labels = train_test_split(
    temp_texts,
    temp_labels,
    test_size=0.5,
    random_state=SEED,
    stratify=temp_labels
)

train_df = pd.DataFrame({"text": train_texts, "label": train_labels})
val_df   = pd.DataFrame({"text": val_texts, "label": val_labels})
test_df  = pd.DataFrame({"text": test_texts, "label": test_labels})

# mapping label -> id
train_df["label"] = train_df["label"].map(label2id)
val_df["label"]   = val_df["label"].map(label2id)
test_df["label"]  = test_df["label"].map(label2id)

print("TRAIN:", train_df["label"].value_counts().to_dict())
print("VAL  :", val_df["label"].value_counts().to_dict())
print("TEST :", test_df["label"].value_counts().to_dict())


TRAIN: {1: 577, 0: 468, 2: 102}
VAL  : {1: 72, 0: 58, 2: 13}
TEST : {1: 72, 0: 59, 2: 13}


Konversi ke HuggingFace Dataset & Tokenisasi

In [8]:
ds = DatasetDict({
    "train": Dataset.from_pandas(train_df, preserve_index=False),
    "validation": Dataset.from_pandas(val_df, preserve_index=False),
    "test": Dataset.from_pandas(test_df, preserve_index=False),
})

print(ds)

MODEL_NAME = "indolem/indobertweet-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize(batch):
    return tokenizer(
        batch["text"],
        truncation=True,
        padding="max_length",
        max_length=128,
    )

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 1147
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 143
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 144
    })
})


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [9]:
# Definisi Metrik Evaluasi
metric_accuracy = evaluate.load("accuracy")
metric_f1 = evaluate.load("f1")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)

    acc = metric_accuracy.compute(predictions=preds, references=labels)
    f1_macro = metric_f1.compute(
        predictions=preds, references=labels, average="macro"
    )

    prec_macro, rec_macro, _, _ = precision_recall_fscore_support(
        labels, preds, average="macro", zero_division=0
    )

    return {
        "accuracy": acc["accuracy"],
        "precision_macro": prec_macro,
        "recall_macro": rec_macro,
        "f1_macro": f1_macro["f1"],
    }

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Fungsi Utility: Jalankan 1 Eksperimen (untuk H1â€“H3 dan P1â€“P4)

In [10]:
def run_experiment(
    ds_encoded,
    learning_rate=2e-5,
    batch_size=16,
    num_epochs=5,
    output_dir="indobertweet-exp",
    run_name="",
):
    print(f"\n=== Running experiment: {run_name} ===")
    print(f"lr={learning_rate}, batch_size={batch_size}, epochs={num_epochs}")

    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME,
        num_labels=3,
        id2label=id2label,
        label2id=label2id
    ).to(device)

    args = TrainingArguments(
        output_dir=output_dir,
        eval_strategy="epoch",
        save_strategy="epoch",
        learning_rate=learning_rate,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=num_epochs,
        weight_decay=0.01,
        load_best_model_at_end=True,
        metric_for_best_model="eval_f1_macro",
        report_to="none",
        logging_dir=os.path.join(output_dir, "logs"),
        logging_strategy="epoch",
        seed=SEED,
    )

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=ds_encoded["train"],
        eval_dataset=ds_encoded["validation"],
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

    trainer.train()

    # evaluasi di validation (untuk pilih hyperparameter)
    val_results = trainer.evaluate(ds_encoded["validation"])
    print("\n[VALIDATION RESULTS]")
    for k, v in val_results.items():
        if isinstance(v, float):
            print(f"{k}: {v:.4f}")

    # evaluasi akhir di test
    test_results = trainer.evaluate(ds_encoded["test"])
    print("\n[TEST RESULTS]")
    for k, v in test_results.items():
        if isinstance(v, float):
            print(f"{k}: {v:.4f}")

    # simpan model & tokenizer (opsional, bisa pakai run_name)
    save_dir = os.path.join(output_dir, "best_model")
    trainer.save_model(save_dir)
    tokenizer.save_pretrained(save_dir)
    print(f"\nâœ… Model & tokenizer disimpan di: {save_dir}")

    return val_results, test_results, trainer


In [11]:
def run_h1_lr_search(
    ds_encoded,
    learning_rates,
    batch_size,
    num_epochs,
    base_output_dir="exp_H1",
):
    """
    H1: Mencari learning rate terbaik dengan batch_size & epoch tetap.
    """
    results = []
    for lr in learning_rates:
        run_name = f"H1_lr_{lr}"
        out_dir = f"{base_output_dir}_lr_{lr}"

        val_res, test_res, _ = run_experiment(
            ds_encoded,
            learning_rate=lr,
            batch_size=batch_size,
            num_epochs=num_epochs,
            output_dir=out_dir,
            run_name=run_name,
        )

        results.append({
            "lr": lr,
            "val_accuracy": val_res["eval_accuracy"],
            "val_precision_macro": val_res["eval_precision_macro"],
            "val_recall_macro": val_res["eval_recall_macro"],
            "val_f1_macro": val_res["eval_f1_macro"],

            "test_accuracy": test_res["eval_accuracy"],
            "test_precision_macro": test_res["eval_precision_macro"],
            "test_recall_macro": test_res["eval_recall_macro"],
            "test_f1_macro": test_res["eval_f1_macro"],
        })

    return pd.DataFrame(results)


def run_h2_epoch_search(
    ds_encoded,
    epochs_list,
    learning_rate,
    batch_size,
    base_output_dir="exp_H2",
):
    """
    H2: Mencari jumlah epoch terbaik dengan lr & batch_size tetap.
    """
    results = []
    for n_ep in epochs_list:
        run_name = f"H2_ep_{n_ep}"
        out_dir = f"{base_output_dir}_ep_{n_ep}"

        val_res, test_res, _ = run_experiment(
            ds_encoded,
            learning_rate=learning_rate,
            batch_size=batch_size,
            num_epochs=n_ep,
            output_dir=out_dir,
            run_name=run_name,
        )

        results.append({
            "epochs": n_ep,
            "val_accuracy": val_res["eval_accuracy"],
            "val_precision_macro": val_res["eval_precision_macro"],
            "val_recall_macro": val_res["eval_recall_macro"],
            "val_f1_macro": val_res["eval_f1_macro"],

            "test_accuracy": test_res["eval_accuracy"],
            "test_precision_macro": test_res["eval_precision_macro"],
            "test_recall_macro": test_res["eval_recall_macro"],
            "test_f1_macro": test_res["eval_f1_macro"],
        })

    return pd.DataFrame(results)


def run_h3_batch_search(
    ds_encoded,
    batch_sizes,
    learning_rate,
    num_epochs,
    base_output_dir="exp_H3",
):
    """
    H3: Mencari batch size terbaik dengan lr & epoch tetap.
    """
    results = []
    for bs in batch_sizes:
        run_name = f"H3_bs_{bs}"
        out_dir = f"{base_output_dir}_bs_{bs}"

        val_res, test_res, _ = run_experiment(
            ds_encoded,
            learning_rate=learning_rate,
            batch_size=bs,
            num_epochs=num_epochs,
            output_dir=out_dir,
            run_name=run_name,
        )

        results.append({
            "batch_size": bs,
            "val_accuracy": val_res["eval_accuracy"],
            "val_precision_macro": val_res["eval_precision_macro"],
            "val_recall_macro": val_res["eval_recall_macro"],
            "val_f1_macro": val_res["eval_f1_macro"],

            "test_accuracy": test_res["eval_accuracy"],
            "test_precision_macro": test_res["eval_precision_macro"],
            "test_recall_macro": test_res["eval_recall_macro"],
            "test_f1_macro": test_res["eval_f1_macro"],
        })

    return pd.DataFrame(results)


Jalankan P1â€“P4 dengan Hyperparameter Terbaik

In [12]:
# %%
def load_dataset_for_scenario(
    scenario: str,
    data_file: str,
    scenario_to_textcol: dict,
    label_col: str,
    seed: int,
    tokenizer,
):
    """
    Load data, normalisasi label, split train/val/test,
    dan kembalikan DatasetDict yang sudah ditokenisasi
    untuk satu skenario preprocessing (P1â€“P4).
    """
    text_col = scenario_to_textcol[scenario]

    print("\n==============================")
    print("Skenario:", scenario, "| text col:", text_col)
    print("==============================")

    # --- Load & cleaning dasar ---
    df = pd.read_csv(data_file, encoding="utf-8-sig")
    df.columns = [c.strip().lower() for c in df.columns]

    assert text_col in df.columns, f"Kolom teks '{text_col}' tidak ditemukan."
    assert label_col in df.columns, f"Kolom label '{label_col}' tidak ditemukan."

    df = df.dropna(subset=[text_col, label_col])

    # --- Normalisasi label ke 3 kelas ---
    df[label_col] = (
        df[label_col]
        .astype(str).str.strip().str.lower()
        .replace({
            "negatif": "neg", "negative": "neg",
            "positif": "pos", "positive": "pos",
            "netral": "neu", "neutral": "neu",
        })
    )
    df = df[df[label_col].isin(["neg", "neu", "pos"])]

    # --- Split 80/10/10: train / validation / test ---
    train_texts, temp_texts, train_labels, temp_labels = train_test_split(
        df[text_col].astype(str),
        df[label_col],
        test_size=0.2,
        random_state=seed,
        stratify=df[label_col]
    )
    val_texts, test_texts, val_labels, test_labels = train_test_split(
        temp_texts,
        temp_labels,
        test_size=0.5,
        random_state=seed,
        stratify=temp_labels
    )

    train_df = pd.DataFrame({"text": train_texts, "label": train_labels})
    val_df   = pd.DataFrame({"text": val_texts, "label": val_labels})
    test_df  = pd.DataFrame({"text": test_texts, "label": test_labels})

    train_df["label"] = train_df["label"].map(label2id)
    val_df["label"]   = val_df["label"].map(label2id)
    test_df["label"]  = test_df["label"].map(label2id)

    print("TRAIN:", train_df["label"].value_counts().to_dict())
    print("VAL  :", val_df["label"].value_counts().to_dict())
    print("TEST :", test_df["label"].value_counts().to_dict())

    ds = DatasetDict({
        "train": Dataset.from_pandas(train_df, preserve_index=False),
        "validation": Dataset.from_pandas(val_df, preserve_index=False),
        "test": Dataset.from_pandas(test_df, preserve_index=False),
    })

    # --- Tokenisasi ---
    ds_encoded = ds.map(tokenize, batched=True)

    return ds_encoded


Step 0: Pastikan dataset & ds_encoded siap (misal pakai P1 dulu)

In [13]:
ds_encoded = load_dataset_for_scenario(
    scenario=SCENARIO,
    data_file=DATA_FILE,
    scenario_to_textcol=SCENARIO_TO_TEXTCOL,
    label_col=label_col,
    seed=SEED,
    tokenizer=tokenizer,
)


Skenario: P1 | text col: text_clean
TRAIN: {1: 577, 0: 468, 2: 102}
VAL  : {1: 72, 0: 58, 2: 13}
TEST : {1: 72, 0: 59, 2: 13}


Map:   0%|          | 0/1147 [00:00<?, ? examples/s]

Map:   0%|          | 0/143 [00:00<?, ? examples/s]

Map:   0%|          | 0/144 [00:00<?, ? examples/s]

In [14]:
learning_rates = [3e-5, 2e-5]
batch_size = 16
fixed_epochs = 5

df_h1 = run_h1_lr_search(
    ds_encoded,
    learning_rates=learning_rates,
    batch_size=batch_size,
    num_epochs=fixed_epochs,
)

df_h1.sort_values("val_f1_macro", ascending=False)

best_lr = df_h1.sort_values("val_f1_macro", ascending=False).iloc[0]["lr"]
best_lr


=== Running experiment: H1_lr_3e-05 ===
lr=3e-05, batch_size=16, epochs=5


pytorch_model.bin:   0%|          | 0.00/445M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indolem/indobertweet-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None}.


model.safetensors:   0%|          | 0.00/445M [00:00<?, ?B/s]

Epoch,Training Loss,Validation Loss,Accuracy,Precision Macro,Recall Macro,F1 Macro
1,0.8502,0.795533,0.65035,0.439394,0.46408,0.441692
2,0.6638,0.719089,0.678322,0.468085,0.488186,0.471218
3,0.4702,0.62844,0.748252,0.620473,0.563317,0.562263
4,0.3044,0.57558,0.797203,0.75931,0.704134,0.724976
5,0.2194,0.565015,0.818182,0.785625,0.740151,0.758729



[VALIDATION RESULTS]
eval_loss: 0.5650
eval_accuracy: 0.8182
eval_precision_macro: 0.7856
eval_recall_macro: 0.7402
eval_f1_macro: 0.7587
eval_runtime: 1.0728
eval_samples_per_second: 133.3010
eval_steps_per_second: 8.3900
epoch: 5.0000

[TEST RESULTS]
eval_loss: 0.5573
eval_accuracy: 0.7778
eval_precision_macro: 0.6861
eval_recall_macro: 0.6685
eval_f1_macro: 0.6758
eval_runtime: 1.0155
eval_samples_per_second: 141.8030
eval_steps_per_second: 8.8630
epoch: 5.0000

âœ… Model & tokenizer disimpan di: exp_H1_lr_3e-05/best_model

=== Running experiment: H1_lr_2e-05 ===
lr=2e-05, batch_size=16, epochs=5


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indolem/indobertweet-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None}.


Epoch,Training Loss,Validation Loss,Accuracy,Precision Macro,Recall Macro,F1 Macro
1,0.8713,0.811267,0.65035,0.436744,0.461845,0.437148
2,0.7483,0.790717,0.657343,0.456197,0.467593,0.445525
3,0.637,0.752519,0.671329,0.625586,0.501216,0.502115
4,0.5124,0.672671,0.713287,0.81383,0.533464,0.535078
5,0.4392,0.667586,0.72028,0.82337,0.538093,0.540293



[VALIDATION RESULTS]
eval_loss: 0.6676
eval_accuracy: 0.7203
eval_precision_macro: 0.8234
eval_recall_macro: 0.5381
eval_f1_macro: 0.5403
eval_runtime: 1.0560
eval_samples_per_second: 135.4190
eval_steps_per_second: 8.5230
epoch: 5.0000

[TEST RESULTS]
eval_loss: 0.5800
eval_accuracy: 0.7500
eval_precision_macro: 0.6966
eval_recall_macro: 0.6459
eval_f1_macro: 0.6632
eval_runtime: 1.0459
eval_samples_per_second: 137.6860
eval_steps_per_second: 8.6050
epoch: 5.0000

âœ… Model & tokenizer disimpan di: exp_H1_lr_2e-05/best_model


np.float64(3e-05)

ðŸ”¹ H2 â€“ Cari epoch terbaik (pakai best_lr dari H1)

In [15]:
epochs_list = [2, 3, 5, 10]
df_h2 = run_h2_epoch_search(
    ds_encoded,
    epochs_list=epochs_list,
    learning_rate=best_lr,
    batch_size=batch_size,
)

df_h2.sort_values("val_f1_macro", ascending=False)
best_epochs = int(df_h2.sort_values("val_f1_macro", ascending=False).iloc[0]["epochs"])
best_epochs



=== Running experiment: H2_ep_2 ===
lr=3e-05, batch_size=16, epochs=2


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indolem/indobertweet-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None}.


Epoch,Training Loss,Validation Loss,Accuracy,Precision Macro,Recall Macro,F1 Macro
1,0.8577,0.816804,0.636364,0.438637,0.450351,0.425902
2,0.7505,0.732994,0.657343,0.434471,0.476533,0.454201



[VALIDATION RESULTS]
eval_loss: 0.7330
eval_accuracy: 0.6573
eval_precision_macro: 0.4345
eval_recall_macro: 0.4765
eval_f1_macro: 0.4542
eval_runtime: 1.0689
eval_samples_per_second: 133.7840
eval_steps_per_second: 8.4200
epoch: 2.0000

[TEST RESULTS]
eval_loss: 0.6789
eval_accuracy: 0.7153
eval_precision_macro: 0.4806
eval_recall_macro: 0.5289
eval_f1_macro: 0.4994
eval_runtime: 1.0340
eval_samples_per_second: 139.2670
eval_steps_per_second: 8.7040
epoch: 2.0000

âœ… Model & tokenizer disimpan di: exp_H2_ep_2/best_model

=== Running experiment: H2_ep_3 ===
lr=3e-05, batch_size=16, epochs=3


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indolem/indobertweet-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None}.


Epoch,Training Loss,Validation Loss,Accuracy,Precision Macro,Recall Macro,F1 Macro
1,0.8497,0.802323,0.643357,0.441655,0.458333,0.436666
2,0.686,0.776543,0.615385,0.429595,0.435345,0.41326
3,0.5785,0.714054,0.671329,0.536525,0.502333,0.501719



[VALIDATION RESULTS]
eval_loss: 0.7141
eval_accuracy: 0.6713
eval_precision_macro: 0.5365
eval_recall_macro: 0.5023
eval_f1_macro: 0.5017
eval_runtime: 1.0504
eval_samples_per_second: 136.1380
eval_steps_per_second: 8.5680
epoch: 3.0000

[TEST RESULTS]
eval_loss: 0.5982
eval_accuracy: 0.7500
eval_precision_macro: 0.6839
eval_recall_macro: 0.6479
eval_f1_macro: 0.6612
eval_runtime: 0.9872
eval_samples_per_second: 145.8670
eval_steps_per_second: 9.1170
epoch: 3.0000

âœ… Model & tokenizer disimpan di: exp_H2_ep_3/best_model

=== Running experiment: H2_ep_5 ===
lr=3e-05, batch_size=16, epochs=5


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indolem/indobertweet-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None}.


Epoch,Training Loss,Validation Loss,Accuracy,Precision Macro,Recall Macro,F1 Macro
1,0.8457,0.794684,0.671329,0.455625,0.481322,0.460059
2,0.7138,0.648859,0.713287,0.594459,0.537933,0.537146
3,0.5079,0.668407,0.741259,0.718353,0.594005,0.616984
4,0.3234,0.557265,0.762238,0.683715,0.616834,0.632726
5,0.2327,0.568854,0.79021,0.758333,0.699504,0.721605



[VALIDATION RESULTS]
eval_loss: 0.5689
eval_accuracy: 0.7902
eval_precision_macro: 0.7583
eval_recall_macro: 0.6995
eval_f1_macro: 0.7216
eval_runtime: 1.0697
eval_samples_per_second: 133.6790
eval_steps_per_second: 8.4130
epoch: 5.0000

[TEST RESULTS]
eval_loss: 0.6580
eval_accuracy: 0.7847
eval_precision_macro: 0.7453
eval_recall_macro: 0.7151
eval_f1_macro: 0.7278
eval_runtime: 1.0618
eval_samples_per_second: 135.6140
eval_steps_per_second: 8.4760
epoch: 5.0000

âœ… Model & tokenizer disimpan di: exp_H2_ep_5/best_model

=== Running experiment: H2_ep_10 ===
lr=3e-05, batch_size=16, epochs=10


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indolem/indobertweet-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None}.


Epoch,Training Loss,Validation Loss,Accuracy,Precision Macro,Recall Macro,F1 Macro
1,0.8747,0.814181,0.622378,0.432323,0.437739,0.410256
2,0.7551,0.730433,0.692308,0.462334,0.499681,0.477366
3,0.5726,0.791821,0.692308,0.612992,0.515105,0.517657
4,0.3592,0.506204,0.825175,0.778323,0.729357,0.746892
5,0.1883,0.568075,0.825175,0.769781,0.765792,0.767226
6,0.0802,0.756176,0.804196,0.774933,0.747433,0.757054
7,0.0298,0.778839,0.818182,0.75789,0.779939,0.764855
8,0.0111,0.801274,0.811189,0.751562,0.754298,0.751088
9,0.0064,0.816124,0.797203,0.739136,0.745039,0.741074
10,0.0023,0.835293,0.811189,0.760874,0.755416,0.757278



[VALIDATION RESULTS]
eval_loss: 0.5681
eval_accuracy: 0.8252
eval_precision_macro: 0.7698
eval_recall_macro: 0.7658
eval_f1_macro: 0.7672
eval_runtime: 1.0720
eval_samples_per_second: 133.3990
eval_steps_per_second: 8.3960
epoch: 10.0000

[TEST RESULTS]
eval_loss: 0.5855
eval_accuracy: 0.7917
eval_precision_macro: 0.7432
eval_recall_macro: 0.7438
eval_f1_macro: 0.7435
eval_runtime: 1.0185
eval_samples_per_second: 141.3820
eval_steps_per_second: 8.8360
epoch: 10.0000

âœ… Model & tokenizer disimpan di: exp_H2_ep_10/best_model


10

H3 â€“ Cari batch size terbaik (pakai lr & epoch terbaik)

In [16]:
batch_sizes = [16, 32]
df_h3 = run_h3_batch_search(
    ds_encoded,
    batch_sizes=batch_sizes,
    learning_rate=best_lr,
    num_epochs=best_epochs,
)

df_h3.sort_values("val_f1_macro", ascending=False)
best_batch = int(df_h3.sort_values("val_f1_macro", ascending=False).iloc[0]["batch_size"])
best_batch



=== Running experiment: H3_bs_16 ===
lr=3e-05, batch_size=16, epochs=10


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indolem/indobertweet-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None}.


Epoch,Training Loss,Validation Loss,Accuracy,Precision Macro,Recall Macro,F1 Macro
1,0.8598,0.815957,0.636364,0.429705,0.454821,0.43339
2,0.7051,0.723669,0.657343,0.456032,0.469828,0.450853
3,0.5485,0.718609,0.692308,0.607684,0.516222,0.518952
4,0.3809,0.590041,0.811189,0.7384,0.678075,0.694138
5,0.2345,0.612474,0.804196,0.768673,0.709881,0.731943
6,0.1308,0.630683,0.811189,0.73641,0.697969,0.7117
7,0.0823,0.718422,0.818182,0.761555,0.804303,0.778766
8,0.0433,0.735484,0.818182,0.742021,0.701481,0.716432
9,0.0145,0.776661,0.79021,0.71024,0.703974,0.706163
10,0.0126,0.806877,0.811189,0.730794,0.736639,0.733231



[VALIDATION RESULTS]
eval_loss: 0.7184
eval_accuracy: 0.8182
eval_precision_macro: 0.7616
eval_recall_macro: 0.8043
eval_f1_macro: 0.7788
eval_runtime: 1.1112
eval_samples_per_second: 128.6910
eval_steps_per_second: 8.0990
epoch: 10.0000

[TEST RESULTS]
eval_loss: 0.8425
eval_accuracy: 0.7986
eval_precision_macro: 0.7137
eval_recall_macro: 0.7515
eval_f1_macro: 0.7272
eval_runtime: 1.0705
eval_samples_per_second: 134.5170
eval_steps_per_second: 8.4070
epoch: 10.0000

âœ… Model & tokenizer disimpan di: exp_H3_bs_16/best_model

=== Running experiment: H3_bs_32 ===
lr=3e-05, batch_size=32, epochs=10


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indolem/indobertweet-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None}.


Epoch,Training Loss,Validation Loss,Accuracy,Precision Macro,Recall Macro,F1 Macro
1,0.8785,0.800482,0.643357,0.434806,0.458333,0.435605
2,0.732,0.843672,0.629371,0.476581,0.440134,0.412061
3,0.6064,0.748262,0.678322,0.486067,0.481481,0.459737
4,0.4182,0.579413,0.769231,0.716954,0.622581,0.641662
5,0.2617,0.576977,0.811189,0.747723,0.759886,0.753327
6,0.1541,0.599892,0.776224,0.746273,0.689127,0.710721
7,0.0882,0.659359,0.804196,0.765537,0.753021,0.758917
8,0.0479,0.768708,0.776224,0.736986,0.666998,0.690304
9,0.0279,0.81405,0.797203,0.782729,0.743921,0.759729
10,0.0208,0.794043,0.804196,0.75913,0.729775,0.742302



[VALIDATION RESULTS]
eval_loss: 0.8141
eval_accuracy: 0.7972
eval_precision_macro: 0.7827
eval_recall_macro: 0.7439
eval_f1_macro: 0.7597
eval_runtime: 0.9653
eval_samples_per_second: 148.1340
eval_steps_per_second: 5.1790
epoch: 10.0000

[TEST RESULTS]
eval_loss: 0.7244
eval_accuracy: 0.8194
eval_precision_macro: 0.7574
eval_recall_macro: 0.7383
eval_f1_macro: 0.7425
eval_runtime: 0.9024
eval_samples_per_second: 159.5770
eval_steps_per_second: 5.5410
epoch: 10.0000

âœ… Model & tokenizer disimpan di: exp_H3_bs_32/best_model


16

In [17]:
print("Best hyperparameters:")
print("lr     :", best_lr)
print("epochs :", best_epochs)
print("batch  :", best_batch)

Best hyperparameters:
lr     : 3e-05
epochs : 10
batch  : 16
