
# Fine-Tuning IndoBERTweet untuk Analisis Sentimen Layanan KAI

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Import & Konfigurasi Awal
import os
import numpy as np
import pandas as pd
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split

from sklearn.metrics import (
    accuracy_score,
    precision_recall_fscore_support,
    classification_report,
    confusion_matrix,
)
import shutil
import torch
import evaluate

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments
)

SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

device = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", device)

Device: cuda


In [3]:
import torch
print(torch.cuda.is_available())
print(torch.version.cuda)

True
12.6


In [15]:
# Konfigurasi File & Skenario P1–P4
# Path file hasil preprocessing (gabungan A + C)
DATA_FILE = "preprocessed_merged_stepwise.csv"

# Mapping skenario preprocessing ke nama kolom teks
SCENARIO_TO_TEXTCOL = {
    "P1": "text_clean",               # only cleaning
    "P2": "text_nostop_text",         # stopword removal only
    "P3": "text_stemmed_nostopword",  # stemming only
    "P4": "text_stemmed",             # stopword + stemming
}

# pilih skenario yang mau dijalankan (bisa kamu ganti-ganti)
SCENARIO = "P1"   # misal awalnya P1

label_col = "label"

print("Skenario aktif:", SCENARIO, "-> kolom teks:", SCENARIO_TO_TEXTCOL[SCENARIO])


Skenario aktif: P1 -> kolom teks: text_clean


In [None]:
# Import & Konfigurasi Awal
import os
import numpy as np
import pandas as pd
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split

from sklearn.metrics import (
    accuracy_score,
    precision_recall_fscore_support,
    classification_report,
    confusion_matrix,
)
import shutil
import torch
import evaluate

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments
)

SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

device = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", device)

# Konfigurasi File & Skenario P1–P4
DATA_FILE = "preprocessed_merged_stepwise.csv"

SCENARIO_TO_TEXTCOL = {
    "P1": "text_clean",               
    "P2": "text_nostop_text",         
    "P3": "text_stemmed_nostopword",  
    "P4": "text_stemmed",             
}

SCENARIO = "P1"

label_col = "label"

print("Skenario aktif:", SCENARIO, "-> kolom teks:", SCENARIO_TO_TEXTCOL[SCENARIO])

In [5]:
# Data Loading & Normalisasi Label
df = pd.read_csv(DATA_FILE, encoding="utf-8-sig")
df.columns = [c.strip().lower() for c in df.columns]
print("Kolom tersedia:", df.columns.tolist())

text_col = SCENARIO_TO_TEXTCOL[SCENARIO]

# pastikan kolom yang dibutuhkan ada
assert text_col in df.columns, f"Kolom teks '{text_col}' tidak ditemukan."
assert label_col in df.columns, f"Kolom label '{label_col}' tidak ditemukan."

# buang baris yang teks atau labelnya kosong
df = df.dropna(subset=[text_col, label_col])

# normalisasi label ke 3 kelas
df[label_col] = (
    df[label_col]
    .astype(str)
    .str.strip()
    .str.lower()
    .replace({
        "negatif": "neg",
        "negative": "neg",
        "positif": "pos",
        "positive": "pos",
        "netral": "neu",
        "neutral": "neu",
    })
)

# keep hanya 3 label utama
df = df[df[label_col].isin(["neg", "neu", "pos"])]

print("\nDistribusi label setelah normalisasi:")
print(df[label_col].value_counts())


Kolom tersedia: ['id_str', 'created_at', 'user_id_str', 'conversation_id_str', 'full_text', 'label', 'text_casefold', 'text_clean', 'tokens', 'tokens_nostop', 'text_nostop_text', 'tokens_stem', 'text_stemmed', 'emoji_drop_from_raw', 'emoji_map_from_raw', 'emoji_drop_from_stemmed', 'emoji_map_from_stemmed', 'text_stemmed_nostopword']

Distribusi label setelah normalisasi:
label
neu    721
neg    585
pos    128
Name: count, dtype: int64


Mapping Label & Split Train / Validasi / Test (80/10/10)

In [6]:
label2id = {"neg": 0, "neu": 1, "pos": 2}
id2label = {v: k for k, v in label2id.items()}

# 1) Split train + temp (80% train, 20% temp)
train_texts, temp_texts, train_labels, temp_labels = train_test_split(
    df[text_col].astype(str),
    df[label_col],
    test_size=0.2,
    random_state=SEED,
    stratify=df[label_col]
)

# 2) Split temp jadi validation + test (10% + 10%)
val_texts, test_texts, val_labels, test_labels = train_test_split(
    temp_texts,
    temp_labels,
    test_size=0.5,
    random_state=SEED,
    stratify=temp_labels
)

train_df = pd.DataFrame({"text": train_texts, "label": train_labels})
val_df   = pd.DataFrame({"text": val_texts, "label": val_labels})
test_df  = pd.DataFrame({"text": test_texts, "label": test_labels})

# mapping label -> id
train_df["label"] = train_df["label"].map(label2id)
val_df["label"]   = val_df["label"].map(label2id)
test_df["label"]  = test_df["label"].map(label2id)

print("TRAIN:", train_df["label"].value_counts().to_dict())
print("VAL  :", val_df["label"].value_counts().to_dict())
print("TEST :", test_df["label"].value_counts().to_dict())


TRAIN: {1: 577, 0: 468, 2: 102}
VAL  : {1: 72, 0: 58, 2: 13}
TEST : {1: 72, 0: 59, 2: 13}


Konversi ke HuggingFace Dataset & Tokenisasi

In [7]:
ds = DatasetDict({
    "train": Dataset.from_pandas(train_df, preserve_index=False),
    "validation": Dataset.from_pandas(val_df, preserve_index=False),
    "test": Dataset.from_pandas(test_df, preserve_index=False),
})

print(ds)

MODEL_NAME = "indolem/indobertweet-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize(batch):
    return tokenizer(
        batch["text"],
        truncation=True,
        padding="max_length",
        max_length=128,
    )

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 1147
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 143
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 144
    })
})


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [8]:
# Definisi Metrik Evaluasi
metric_accuracy = evaluate.load("accuracy")
metric_f1 = evaluate.load("f1")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    acc = metric_accuracy.compute(predictions=preds, references=labels)
    f1_macro = metric_f1.compute(predictions=preds, references=labels, average="macro")
    return {
        "accuracy": acc["accuracy"],
        "f1_macro": f1_macro["f1"],
    }


Fungsi Utility: Jalankan 1 Eksperimen (untuk H1–H3 dan P1–P4)

In [9]:
def run_experiment(
    ds_encoded,
    learning_rate=2e-5,
    batch_size=16,
    num_epochs=5,
    output_dir="indobertweet-exp",
    run_name="",
):
    print(f"\n=== Running experiment: {run_name} ===")
    print(f"lr={learning_rate}, batch_size={batch_size}, epochs={num_epochs}")

    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME,
        num_labels=3,
        id2label=id2label,
        label2id=label2id
    ).to(device)

    args = TrainingArguments(
        output_dir=output_dir,
        eval_strategy="epoch",
        save_strategy="epoch",
        learning_rate=learning_rate,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=num_epochs,
        weight_decay=0.01,
        load_best_model_at_end=True,
        metric_for_best_model="eval_f1_macro",
        report_to="none",
        logging_dir=os.path.join(output_dir, "logs"),
        logging_strategy="epoch",
        seed=SEED,
    )

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=ds_encoded["train"],
        eval_dataset=ds_encoded["validation"],
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

    trainer.train()

    # evaluasi di validation (untuk pilih hyperparameter)
    val_results = trainer.evaluate(ds_encoded["validation"])
    print("\n[VALIDATION RESULTS]")
    for k, v in val_results.items():
        if isinstance(v, float):
            print(f"{k}: {v:.4f}")

    # evaluasi akhir di test
    test_results = trainer.evaluate(ds_encoded["test"])
    print("\n[TEST RESULTS]")
    for k, v in test_results.items():
        if isinstance(v, float):
            print(f"{k}: {v:.4f}")

    # simpan model & tokenizer (opsional, bisa pakai run_name)
    save_dir = os.path.join(output_dir, "best_model")
    trainer.save_model(save_dir)
    tokenizer.save_pretrained(save_dir)
    print(f"\n✅ Model & tokenizer disimpan di: {save_dir}")

    return val_results, test_results, trainer


In [10]:
def run_h1_lr_search(
    ds_encoded,
    learning_rates,
    batch_size,
    num_epochs,
    base_output_dir="exp_H1",
):
    """
    H1: Mencari learning rate terbaik dengan batch_size & epoch tetap.
    """
    results = []
    for lr in learning_rates:
        run_name = f"H1_lr_{lr}"
        out_dir = f"{base_output_dir}_lr_{lr}"

        val_res, test_res, _ = run_experiment(
            ds_encoded,
            learning_rate=lr,
            batch_size=batch_size,
            num_epochs=num_epochs,
            output_dir=out_dir,
            run_name=run_name,
        )

        results.append({
            "lr": lr,
            "val_f1_macro": val_res["eval_f1_macro"],
            "val_accuracy": val_res["eval_accuracy"],
            "test_f1_macro": test_res["eval_f1_macro"],
            "test_accuracy": test_res["eval_accuracy"],
        })

    return pd.DataFrame(results)


def run_h2_epoch_search(
    ds_encoded,
    epochs_list,
    learning_rate,
    batch_size,
    base_output_dir="exp_H2",
):
    """
    H2: Mencari jumlah epoch terbaik dengan lr & batch_size tetap.
    """
    results = []
    for n_ep in epochs_list:
        run_name = f"H2_ep_{n_ep}"
        out_dir = f"{base_output_dir}_ep_{n_ep}"

        val_res, test_res, _ = run_experiment(
            ds_encoded,
            learning_rate=learning_rate,
            batch_size=batch_size,
            num_epochs=n_ep,
            output_dir=out_dir,
            run_name=run_name,
        )

        results.append({
            "epochs": n_ep,
            "val_f1_macro": val_res["eval_f1_macro"],
            "val_accuracy": val_res["eval_accuracy"],
            "test_f1_macro": test_res["eval_f1_macro"],
            "test_accuracy": test_res["eval_accuracy"],
        })

    return pd.DataFrame(results)


def run_h3_batch_search(
    ds_encoded,
    batch_sizes,
    learning_rate,
    num_epochs,
    base_output_dir="exp_H3",
):
    """
    H3: Mencari batch size terbaik dengan lr & epoch tetap.
    """
    results = []
    for bs in batch_sizes:
        run_name = f"H3_bs_{bs}"
        out_dir = f"{base_output_dir}_bs_{bs}"

        val_res, test_res, _ = run_experiment(
            ds_encoded,
            learning_rate=learning_rate,
            batch_size=bs,
            num_epochs=num_epochs,
            output_dir=out_dir,
            run_name=run_name,
        )

        results.append({
            "batch_size": bs,
            "val_f1_macro": val_res["eval_f1_macro"],
            "val_accuracy": val_res["eval_accuracy"],
            "test_f1_macro": test_res["eval_f1_macro"],
            "test_accuracy": test_res["eval_accuracy"],
        })

    return pd.DataFrame(results)


Jalankan P1–P4 dengan Hyperparameter Terbaik

In [11]:
# %%
def load_dataset_for_scenario(
    scenario: str,
    data_file: str,
    scenario_to_textcol: dict,
    label_col: str,
    seed: int,
    tokenizer,
):
    """
    Load data, normalisasi label, split train/val/test,
    dan kembalikan DatasetDict yang sudah ditokenisasi
    untuk satu skenario preprocessing (P1–P4).
    """
    text_col = scenario_to_textcol[scenario]

    print("\n==============================")
    print("Skenario:", scenario, "| text col:", text_col)
    print("==============================")

    # --- Load & cleaning dasar ---
    df = pd.read_csv(data_file, encoding="utf-8-sig")
    df.columns = [c.strip().lower() for c in df.columns]

    assert text_col in df.columns, f"Kolom teks '{text_col}' tidak ditemukan."
    assert label_col in df.columns, f"Kolom label '{label_col}' tidak ditemukan."

    df = df.dropna(subset=[text_col, label_col])

    # --- Normalisasi label ke 3 kelas ---
    df[label_col] = (
        df[label_col]
        .astype(str).str.strip().str.lower()
        .replace({
            "negatif": "neg", "negative": "neg",
            "positif": "pos", "positive": "pos",
            "netral": "neu", "neutral": "neu",
        })
    )
    df = df[df[label_col].isin(["neg", "neu", "pos"])]

    # --- Split 80/10/10: train / validation / test ---
    train_texts, temp_texts, train_labels, temp_labels = train_test_split(
        df[text_col].astype(str),
        df[label_col],
        test_size=0.2,
        random_state=seed,
        stratify=df[label_col]
    )
    val_texts, test_texts, val_labels, test_labels = train_test_split(
        temp_texts,
        temp_labels,
        test_size=0.5,
        random_state=seed,
        stratify=temp_labels
    )

    train_df = pd.DataFrame({"text": train_texts, "label": train_labels})
    val_df   = pd.DataFrame({"text": val_texts, "label": val_labels})
    test_df  = pd.DataFrame({"text": test_texts, "label": test_labels})

    train_df["label"] = train_df["label"].map(label2id)
    val_df["label"]   = val_df["label"].map(label2id)
    test_df["label"]  = test_df["label"].map(label2id)

    print("TRAIN:", train_df["label"].value_counts().to_dict())
    print("VAL  :", val_df["label"].value_counts().to_dict())
    print("TEST :", test_df["label"].value_counts().to_dict())

    ds = DatasetDict({
        "train": Dataset.from_pandas(train_df, preserve_index=False),
        "validation": Dataset.from_pandas(val_df, preserve_index=False),
        "test": Dataset.from_pandas(test_df, preserve_index=False),
    })

    # --- Tokenisasi ---
    ds_encoded = ds.map(tokenize, batched=True)

    return ds_encoded


Step 0: Pastikan dataset & ds_encoded siap (misal pakai P1 dulu)

P1

In [15]:
SCENARIO = "P1"

ds_encoded = load_dataset_for_scenario(
    scenario=SCENARIO,
    data_file=DATA_FILE,
    scenario_to_textcol=SCENARIO_TO_TEXTCOL,
    label_col=label_col,
    seed=SEED,
    tokenizer=tokenizer,
)

best_lr = 3e-05
best_epochs = 10
best_batch = 16

print("Best hyperparameters:")
print("lr     :", best_lr)
print("epochs :", best_epochs)
print("batch  :", best_batch)

final_val_res_p1, final_test_res_p1, final_trainer_p1 = run_experiment(
    ds_encoded,
    learning_rate=best_lr,
    batch_size=best_batch,
    num_epochs=best_epochs,
    output_dir="final_indobertweet_model_p1",
    run_name="final_run_p1",
)


Skenario: P1 | text col: text_clean
TRAIN: {1: 577, 0: 468, 2: 102}
VAL  : {1: 72, 0: 58, 2: 13}
TEST : {1: 72, 0: 59, 2: 13}


Map:   0%|          | 0/1147 [00:00<?, ? examples/s]

Map:   0%|          | 0/143 [00:00<?, ? examples/s]

Map:   0%|          | 0/144 [00:00<?, ? examples/s]

Best hyperparameters:
lr     : 3e-05
epochs : 10
batch  : 16

=== Running experiment: final_run_p1 ===
lr=3e-05, batch_size=16, epochs=10


pytorch_model.bin:   0%|          | 0.00/445M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indolem/indobertweet-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model.safetensors:   0%|          | 0.00/445M [00:00<?, ?B/s]

  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None}.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro
1,0.8487,0.78687,0.65035,0.445759
2,0.6395,0.657889,0.72028,0.504006
3,0.4093,0.666819,0.769231,0.661349
4,0.2206,0.654176,0.79021,0.692141
5,0.1016,0.738711,0.804196,0.716738
6,0.0543,0.775575,0.818182,0.731161
7,0.0288,0.774428,0.804196,0.737702
8,0.0066,0.789879,0.832168,0.763725
9,0.0028,0.828983,0.818182,0.75282
10,0.0031,0.875804,0.818182,0.75282



[VALIDATION RESULTS]
eval_loss: 0.7899
eval_accuracy: 0.8322
eval_f1_macro: 0.7637
eval_runtime: 1.0767
eval_samples_per_second: 132.8120
eval_steps_per_second: 8.3590
epoch: 10.0000

[TEST RESULTS]
eval_loss: 0.8404
eval_accuracy: 0.8333
eval_f1_macro: 0.7702
eval_runtime: 1.0131
eval_samples_per_second: 142.1340
eval_steps_per_second: 8.8830
epoch: 10.0000

✅ Model & tokenizer disimpan di: final_indobertweet_model_p1/best_model


In [None]:
ds_encoded = load_dataset_for_scenario(
    scenario=SCENARIO,
    data_file=DATA_FILE,
    scenario_to_textcol=SCENARIO_TO_TEXTCOL,
    label_col=label_col,
    seed=SEED,
    tokenizer=tokenizer,
)

print("Best hyperparameters:")
print("lr     :", best_lr)
print("epochs :", best_epochs)
print("batch  :", best_batch)

final_val_res, final_test_res, final_trainer = run_experiment(
    ds_encoded,
    learning_rate=best_lr,
    batch_size=best_batch,
    num_epochs=best_epochs,
    output_dir="final_indobertweet_model",
    run_name="final_run",
)

# ============================================================
# Evaluasi Lengkap di Test Set: akurasi, precision, recall, F1, Macro F1 + Confusion Matrix
# ============================================================

# Prediksi di data test
pred_output= final_trainer.predict(ds_encoded["test"])
y_true= pred_output.label_ids
y_pred= np.argmax(pred_output.predictions, axis=-1)

# Akurasi
test_acc = accuracy_score(y_true, y_pred)

# Precision, Recall, F1 per kelas + Macro
prec_macro, rec_macro, f1_macro, _ = precision_recall_fscore_support(
    y_true, y_pred, average="macro"
)

print("\n=== EVALUASI LENGKAP TEST SET ===")
print(f"Akurasi (sklearn)         : {test_acc:.4f}")
print(f"Precision Macro           : {prec_macro:.4f}")
print(f"Recall Macro              : {rec_macro:.4f}")
print(f"F1-Score Macro (sklearn)  : {f1_macro:.4f}")
print(f"F1-Score Macro (Trainer)  : {final_test_res['eval_f1_macro']:.4f}")
print(f"Akurasi (Trainer)         : {final_test_res['eval_accuracy']:.4f}")

# Laporan per kelas (neg, neu, pos)
target_names = ["neg", "neu", "pos"]
print("\n--- Classification Report per Kelas ---")
print(classification_report(y_true, y_pred, target_names=target_names, digits=4))

# Confusion Matrix
cm = confusion_matrix(y_true, y_pred)
cm_df = pd.DataFrame(cm, index=target_names, columns=target_names)
print("\n--- Confusion Matrix (rows = true, cols = pred) ---")
print(cm_df)

save_dir = "final_indobertweet_model/best_model"
final_trainer.save_model(save_dir)
tokenizer.save_pretrained(save_dir)
print(f"\n✅ Final model & tokenizer disimpan di: {save_dir}")

# ============================================================
# Menyimpan Model ke Google Drive
# ============================================================

drive_save_dir = "/content/drive/MyDrive/ModelSkripsi/skripsi_indobertweet_final_model"

# Kalau folder tujuan sudah ada, hapus dulu (opsional, supaya bersih)
if os.path.exists(drive_save_dir):
    shutil.rmtree(drive_save_dir)

shutil.copytree(save_dir, drive_save_dir)
print(f"✅ Model & tokenizer juga disalin ke Google Drive: {drive_save_dir}")

In [19]:

# ============================================================
# Evaluasi Lengkap di Test Set: akurasi, precision, recall, F1, Macro F1 + Confusion Matrix
# ============================================================

# Prediksi di data test
pred_output_p1= final_trainer_p1.predict(ds_encoded["test"])
y_true_p1= pred_output_p1.label_ids
y_pred_p1= np.argmax(pred_output_p1.predictions, axis=-1)

# Akurasi
test_acc_p1 = accuracy_score(y_true_p1, y_pred_p1)

# Precision, Recall, F1 per kelas + Macro
prec_macro_p1, rec_macro_p1, f1_macro_p1, _ = precision_recall_fscore_support(
    y_true_p1, y_pred_p1, average="macro"
)

print("\n=== EVALUASI LENGKAP TEST SET - P1 ===")
print(f"Akurasi (sklearn)         : {test_acc_p1:.4f}")
print(f"Precision Macro           : {prec_macro_p1:.4f}")
print(f"Recall Macro              : {rec_macro_p1:.4f}")
print(f"F1-Score Macro (sklearn)  : {f1_macro_p1:.4f}")
print(f"F1-Score Macro (Trainer)  : {final_test_res_p1['eval_f1_macro']:.4f}")
print(f"Akurasi (Trainer)         : {final_test_res_p1['eval_accuracy']:.4f}")

# Laporan per kelas (neg, neu, pos)
target_names = ["neg", "neu", "pos"]
print("\n--- Classification Report per Kelas - P1 ---")
print(classification_report(y_true_p1, y_pred_p1, target_names=target_names, digits=4))

# Confusion Matrix
cm_p1 = confusion_matrix(y_true_p1, y_pred_p1)
cm_p1_df = pd.DataFrame(cm_p1, index=target_names, columns=target_names)
print("\n--- Confusion Matrix - P1 (rows = true, cols = pred) ---")
print(cm_p1_df)

save_dir_p1 = "final_indobertweet_model/best_model"
final_trainer_p1.save_model(save_dir_p1)
tokenizer.save_pretrained(save_dir_p1)
print(f"\n✅ Final model & tokenizer disimpan di: {save_dir_p1}")

# ============================================================
# Menyimpan Model ke Google Drive
# ============================================================

drive_save_dir_p1 = "/content/drive/MyDrive/ModelSkripsi/skripsi_indobertweet_final_model_p1"

# Kalau folder tujuan sudah ada, hapus dulu (opsional, supaya bersih)
if os.path.exists(drive_save_dir_p1):
    shutil.rmtree(drive_save_dir_p1)

shutil.copytree(save_dir_p1, drive_save_dir_p1)
print(f"✅ Model & tokenizer juga disalin ke Google Drive: {drive_save_dir_p1}")


=== EVALUASI LENGKAP TEST SET - P1 ===
Akurasi (sklearn)         : 0.8333
Precision Macro           : 0.8005
Recall Macro              : 0.7506
F1-Score Macro (sklearn)  : 0.7702
F1-Score Macro (Trainer)  : 0.7702
Akurasi (Trainer)         : 0.8333

--- Classification Report per Kelas - P1 ---
              precision    recall  f1-score   support

         neg     0.8868    0.7966    0.8393        59
         neu     0.8148    0.9167    0.8627        72
         pos     0.7000    0.5385    0.6087        13

    accuracy                         0.8333       144
   macro avg     0.8005    0.7506    0.7702       144
weighted avg     0.8339    0.8333    0.8302       144


--- Confusion Matrix - P1 (rows = true, cols = pred) ---
     neg  neu  pos
neg   47   10    2
neu    5   66    1
pos    1    5    7

✅ Final model & tokenizer disimpan di: final_indobertweet_model/best_model
✅ Model & tokenizer juga disalin ke Google Drive: /content/drive/MyDrive/ModelSkripsi/skripsi_indobertweet_final_

In [12]:
SCENARIO = "P2"

ds_encoded = load_dataset_for_scenario(
    scenario=SCENARIO,
    data_file=DATA_FILE,
    scenario_to_textcol=SCENARIO_TO_TEXTCOL,
    label_col=label_col,
    seed=SEED,
    tokenizer=tokenizer,
)

best_lr = 3e-05
best_epochs = 10
best_batch = 16

print("Best hyperparameters:")
print("lr     :", best_lr)
print("epochs :", best_epochs)
print("batch  :", best_batch)

final_val_res_p2, final_test_res_p2, final_trainer_p2 = run_experiment(
    ds_encoded,
    learning_rate=best_lr,
    batch_size=best_batch,
    num_epochs=best_epochs,
    output_dir="final_indobertweet_model_p2",
    run_name="final_run_p2",
)


Skenario: P2 | text col: text_nostop_text
TRAIN: {1: 577, 0: 468, 2: 102}
VAL  : {1: 72, 0: 58, 2: 13}
TEST : {1: 72, 0: 59, 2: 13}


Map:   0%|          | 0/1147 [00:00<?, ? examples/s]

Map:   0%|          | 0/143 [00:00<?, ? examples/s]

Map:   0%|          | 0/144 [00:00<?, ? examples/s]

Best hyperparameters:
lr     : 3e-05
epochs : 10
batch  : 16

=== Running experiment: final_run_p2 ===
lr=3e-05, batch_size=16, epochs=10


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indolem/indobertweet-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None}.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro
1,0.8529,0.780534,0.657343,0.444328
2,0.6288,0.700311,0.72028,0.497707
3,0.3874,0.841136,0.692308,0.508333
4,0.2168,0.798646,0.748252,0.637398
5,0.1196,0.990632,0.741259,0.643778
6,0.0517,1.114508,0.741259,0.656277
7,0.0137,1.199809,0.755245,0.660937
8,0.0066,1.396575,0.762238,0.67679
9,0.0029,1.388679,0.755245,0.675718
10,0.0033,1.381183,0.755245,0.675718



[VALIDATION RESULTS]
eval_loss: 1.3966
eval_accuracy: 0.7622
eval_f1_macro: 0.6768
eval_runtime: 1.0668
eval_samples_per_second: 134.0410
eval_steps_per_second: 8.4360
epoch: 10.0000

[TEST RESULTS]
eval_loss: 1.0482
eval_accuracy: 0.8125
eval_f1_macro: 0.7100
eval_runtime: 1.0248
eval_samples_per_second: 140.5110
eval_steps_per_second: 8.7820
epoch: 10.0000

✅ Model & tokenizer disimpan di: final_indobertweet_model_p2/best_model


In [13]:
# ============================================================
# Evaluasi Lengkap di Test Set: akurasi, precision, recall, F1, Macro F1 + Confusion Matrix
# ============================================================

# Prediksi di data test
pred_output_p2= final_trainer_p2.predict(ds_encoded["test"])
y_true_p2= pred_output_p2.label_ids
y_pred_p2= np.argmax(pred_output_p2.predictions, axis=-1)

# Akurasi
test_acc_p2 = accuracy_score(y_true_p2, y_pred_p2)

# Precision, Recall, F1 per kelas + Macro
prec_macro_p2, rec_macro_p2, f1_macro_p2, _ = precision_recall_fscore_support(
    y_true_p2, y_pred_p2, average="macro"
)

print("\n=== EVALUASI LENGKAP TEST SET - P2 ===")
print(f"Akurasi (sklearn)         : {test_acc_p2:.4f}")
print(f"Precision Macro           : {prec_macro_p2:.4f}")
print(f"Recall Macro              : {rec_macro_p2:.4f}")
print(f"F1-Score Macro (sklearn)  : {f1_macro_p2:.4f}")
print(f"F1-Score Macro (Trainer)  : {final_test_res_p2['eval_f1_macro']:.4f}")
print(f"Akurasi (Trainer)         : {final_test_res_p2['eval_accuracy']:.4f}")

# Laporan per kelas (neg, neu, pos)
target_names = ["neg", "neu", "pos"]
print("\n--- Classification Report per Kelas - P2 ---")
print(classification_report(y_true_p2, y_pred_p2, target_names=target_names, digits=4))

# Confusion Matrix
cm_p2 = confusion_matrix(y_true_p2, y_pred_p2)
cm_p2_df = pd.DataFrame(cm_p2, index=target_names, columns=target_names)
print("\n--- Confusion Matrix - P2 (rows = true, cols = pred) ---")
print(cm_p2_df)

save_dir_p2 = "final_indobertweet_model/best_model"
final_trainer_p2.save_model(save_dir_p2)
tokenizer.save_pretrained(save_dir_p2)
print(f"\n✅ Final model & tokenizer disimpan di: {save_dir_p2}")

# ============================================================
# Menyimpan Model ke Google Drive
# ============================================================

drive_save_dir_p2 = "/content/drive/MyDrive/ModelSkripsi/skripsi_indobertweet_final_model_p2"

# Kalau folder tujuan sudah ada, hapus dulu (opsional, supaya bersih)
if os.path.exists(drive_save_dir_p2):
    shutil.rmtree(drive_save_dir_p2)

shutil.copytree(save_dir_p2, drive_save_dir_p2)
print(f"✅ Model & tokenizer juga disalin ke Google Drive: {drive_save_dir_p2}")


=== EVALUASI LENGKAP TEST SET - P2 ===
Akurasi (sklearn)         : 0.8125
Precision Macro           : 0.7481
Recall Macro              : 0.6916
F1-Score Macro (sklearn)  : 0.7100
F1-Score Macro (Trainer)  : 0.7100
Akurasi (Trainer)         : 0.8125

--- Classification Report per Kelas - P2 ---
              precision    recall  f1-score   support

         neg     0.8980    0.7458    0.8148        59
         neu     0.7907    0.9444    0.8608        72
         pos     0.5556    0.3846    0.4545        13

    accuracy                         0.8125       144
   macro avg     0.7481    0.6916    0.7100       144
weighted avg     0.8134    0.8125    0.8053       144


--- Confusion Matrix - P2 (rows = true, cols = pred) ---
     neg  neu  pos
neg   44   12    3
neu    3   68    1
pos    2    6    5

✅ Final model & tokenizer disimpan di: final_indobertweet_model/best_model
✅ Model & tokenizer juga disalin ke Google Drive: /content/drive/MyDrive/ModelSkripsi/skripsi_indobertweet_final_

In [16]:
SCENARIO = "P3"

ds_encoded = load_dataset_for_scenario(
    scenario=SCENARIO,
    data_file=DATA_FILE,
    scenario_to_textcol=SCENARIO_TO_TEXTCOL,
    label_col=label_col,
    seed=SEED,
    tokenizer=tokenizer,
)

best_lr = 3e-05
best_epochs = 10
best_batch = 16

print("Best hyperparameters:")
print("lr     :", best_lr)
print("epochs :", best_epochs)
print("batch  :", best_batch)

final_val_res_p3, final_test_res_p3, final_trainer_p3 = run_experiment(
    ds_encoded,
    learning_rate=best_lr,
    batch_size=best_batch,
    num_epochs=best_epochs,
    output_dir="final_indobertweet_model_p3",
    run_name="final_run_p3",
)


Skenario: P3 | text col: text_stemmed_nostopword
TRAIN: {1: 577, 0: 468, 2: 102}
VAL  : {1: 72, 0: 58, 2: 13}
TEST : {1: 72, 0: 59, 2: 13}


Map:   0%|          | 0/1147 [00:00<?, ? examples/s]

Map:   0%|          | 0/143 [00:00<?, ? examples/s]

Map:   0%|          | 0/144 [00:00<?, ? examples/s]

Best hyperparameters:
lr     : 3e-05
epochs : 10
batch  : 16

=== Running experiment: final_run_p3 ===
lr=3e-05, batch_size=16, epochs=10


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indolem/indobertweet-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None}.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro
1,0.8427,0.73268,0.664336,0.460021
2,0.5761,0.59651,0.769231,0.615941
3,0.3214,0.604218,0.783217,0.671194
4,0.1548,0.625597,0.853147,0.818231
5,0.07,0.674678,0.853147,0.816567
6,0.0301,0.723161,0.853147,0.802136
7,0.0073,0.847427,0.846154,0.789875
8,0.0032,0.905932,0.825175,0.75243
9,0.0015,0.892316,0.853147,0.802136
10,0.0012,0.887387,0.853147,0.802136



[VALIDATION RESULTS]
eval_loss: 0.6256
eval_accuracy: 0.8531
eval_f1_macro: 0.8182
eval_runtime: 1.0770
eval_samples_per_second: 132.7790
eval_steps_per_second: 8.3570
epoch: 10.0000

[TEST RESULTS]
eval_loss: 0.5533
eval_accuracy: 0.8194
eval_f1_macro: 0.7611
eval_runtime: 1.0173
eval_samples_per_second: 141.5470
eval_steps_per_second: 8.8470
epoch: 10.0000

✅ Model & tokenizer disimpan di: final_indobertweet_model_p3/best_model


In [17]:

# ============================================================
# Evaluasi Lengkap di Test Set: akurasi, precision, recall, F1, Macro F1 + Confusion Matrix
# ============================================================

# Prediksi di data test
pred_output_p3= final_trainer_p3.predict(ds_encoded["test"])
y_true_p3= pred_output_p3.label_ids
y_pred_p3= np.argmax(pred_output_p3.predictions, axis=-1)

# Akurasi
test_acc_p3 = accuracy_score(y_true_p3, y_pred_p3)

# Precision, Recall, F1 per kelas + Macro
prec_macro_p3, rec_macro_p3, f1_macro_p3, _ = precision_recall_fscore_support(
    y_true_p3, y_pred_p3, average="macro"
)

print("\n=== EVALUASI LENGKAP TEST SET - P3 ===")
print(f"Akurasi (sklearn)         : {test_acc_p3:.4f}")
print(f"Precision Macro           : {prec_macro_p3:.4f}")
print(f"Recall Macro              : {rec_macro_p3:.4f}")
print(f"F1-Score Macro (sklearn)  : {f1_macro_p3:.4f}")
print(f"F1-Score Macro (Trainer)  : {final_test_res_p3['eval_f1_macro']:.4f}")
print(f"Akurasi (Trainer)         : {final_test_res_p3['eval_accuracy']:.4f}")

# Laporan per kelas (neg, neu, pos)
target_names = ["neg", "neu", "pos"]
print("\n--- Classification Report per Kelas - P3 ---")
print(classification_report(y_true_p3, y_pred_p3, target_names=target_names, digits=4))

# Confusion Matrix
cm_p3 = confusion_matrix(y_true_p3, y_pred_p3)
cm_p3_df = pd.DataFrame(cm_p3, index=target_names, columns=target_names)
print("\n--- Confusion Matrix - P3 (rows = true, cols = pred) ---")
print(cm_p3_df)

save_dir_p3 = "final_indobertweet_model/best_model"
final_trainer_p3.save_model(save_dir_p3)
tokenizer.save_pretrained(save_dir_p3)
print(f"\n✅ Final model & tokenizer disimpan di: {save_dir_p3}")

# ============================================================
# Menyimpan Model ke Google Drive
# ============================================================

drive_save_dir_p3 = "/content/drive/MyDrive/ModelSkripsi/skripsi_indobertweet_final_model_p3"

# Kalau folder tujuan sudah ada, hapus dulu (opsional, supaya bersih)
if os.path.exists(drive_save_dir_p3):
    shutil.rmtree(drive_save_dir_p3)

shutil.copytree(save_dir_p3, drive_save_dir_p3)
print(f"✅ Model & tokenizer juga disalin ke Google Drive: {drive_save_dir_p3}")


=== EVALUASI LENGKAP TEST SET - P3 ===
Akurasi (sklearn)         : 0.8194
Precision Macro           : 0.7851
Recall Macro              : 0.7444
F1-Score Macro (sklearn)  : 0.7611
F1-Score Macro (Trainer)  : 0.7611
Akurasi (Trainer)         : 0.8194

--- Classification Report per Kelas - P3 ---
              precision    recall  f1-score   support

         neg     0.8197    0.8475    0.8333        59
         neu     0.8356    0.8472    0.8414        72
         pos     0.7000    0.5385    0.6087        13

    accuracy                         0.8194       144
   macro avg     0.7851    0.7444    0.7611       144
weighted avg     0.8168    0.8194    0.8171       144


--- Confusion Matrix - P3 (rows = true, cols = pred) ---
     neg  neu  pos
neg   50    6    3
neu   11   61    0
pos    0    6    7

✅ Final model & tokenizer disimpan di: final_indobertweet_model/best_model
✅ Model & tokenizer juga disalin ke Google Drive: /content/drive/MyDrive/ModelSkripsi/skripsi_indobertweet_final_

In [18]:
SCENARIO = "P4"

ds_encoded = load_dataset_for_scenario(
    scenario=SCENARIO,
    data_file=DATA_FILE,
    scenario_to_textcol=SCENARIO_TO_TEXTCOL,
    label_col=label_col,
    seed=SEED,
    tokenizer=tokenizer,
)

best_lr = 3e-05
best_epochs = 10
best_batch = 16

print("Best hyperparameters:")
print("lr     :", best_lr)
print("epochs :", best_epochs)
print("batch  :", best_batch)

final_val_res_p4, final_test_res_p4, final_trainer_p4 = run_experiment(
    ds_encoded,
    learning_rate=best_lr,
    batch_size=best_batch,
    num_epochs=best_epochs,
    output_dir="final_indobertweet_model_p4",
    run_name="final_run_p4",
)


Skenario: P4 | text col: text_stemmed
TRAIN: {1: 577, 0: 468, 2: 102}
VAL  : {1: 72, 0: 58, 2: 13}
TEST : {1: 72, 0: 59, 2: 13}


Map:   0%|          | 0/1147 [00:00<?, ? examples/s]

Map:   0%|          | 0/143 [00:00<?, ? examples/s]

Map:   0%|          | 0/144 [00:00<?, ? examples/s]

Best hyperparameters:
lr     : 3e-05
epochs : 10
batch  : 16

=== Running experiment: final_run_p4 ===
lr=3e-05, batch_size=16, epochs=10


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indolem/indobertweet-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None}.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro
1,0.8637,0.768095,0.657343,0.452802
2,0.6429,0.710469,0.699301,0.478304
3,0.4105,0.874724,0.678322,0.499565
4,0.2232,0.865004,0.741259,0.596171
5,0.1086,0.967311,0.734266,0.644274
6,0.0406,1.194399,0.762238,0.664096
7,0.0194,1.21161,0.734266,0.63907
8,0.011,1.310606,0.748252,0.647469
9,0.0043,1.339022,0.755245,0.652191
10,0.0033,1.335746,0.755245,0.652191



[VALIDATION RESULTS]
eval_loss: 1.1944
eval_accuracy: 0.7622
eval_f1_macro: 0.6641
eval_runtime: 1.0731
eval_samples_per_second: 133.2540
eval_steps_per_second: 8.3870
epoch: 10.0000

[TEST RESULTS]
eval_loss: 0.9325
eval_accuracy: 0.8264
eval_f1_macro: 0.7328
eval_runtime: 1.0300
eval_samples_per_second: 139.8070
eval_steps_per_second: 8.7380
epoch: 10.0000

✅ Model & tokenizer disimpan di: final_indobertweet_model_p4/best_model


In [19]:

# ============================================================
# Evaluasi Lengkap di Test Set: akurasi, precision, recall, F1, Macro F1 + Confusion Matrix
# ============================================================

# Prediksi di data test
pred_output_p4= final_trainer_p4.predict(ds_encoded["test"])
y_true_p4= pred_output_p4.label_ids
y_pred_p4= np.argmax(pred_output_p4.predictions, axis=-1)

# Akurasi
test_acc_p4 = accuracy_score(y_true_p4, y_pred_p4)

# Precision, Recall, F1 per kelas + Macro
prec_macro_p4, rec_macro_p4, f1_macro_p4, _ = precision_recall_fscore_support(
    y_true_p4, y_pred_p4, average="macro"
)

print("\n=== EVALUASI LENGKAP TEST SET - P4 ===")
print(f"Akurasi (sklearn)         : {test_acc_p4:.4f}")
print(f"Precision Macro           : {prec_macro_p4:.4f}")
print(f"Recall Macro              : {rec_macro_p4:.4f}")
print(f"F1-Score Macro (sklearn)  : {f1_macro_p4:.4f}")
print(f"F1-Score Macro (Trainer)  : {final_test_res_p4['eval_f1_macro']:.4f}")
print(f"Akurasi (Trainer)         : {final_test_res_p4['eval_accuracy']:.4f}")

# Laporan per kelas (neg, neu, pos)
target_names = ["neg", "neu", "pos"]
print("\n--- Classification Report per Kelas - P4 ---")
print(classification_report(y_true_p4, y_pred_p4, target_names=target_names, digits=4))

# Confusion Matrix
cm_p4 = confusion_matrix(y_true_p4, y_pred_p4)
cm_p4_df = pd.DataFrame(cm_p4, index=target_names, columns=target_names)
print("\n--- Confusion Matrix - P4 (rows = true, cols = pred) ---")
print(cm_p4_df)

save_dir_p4 = "final_indobertweet_model/best_model"
final_trainer_p4.save_model(save_dir_p4)
tokenizer.save_pretrained(save_dir_p4)
print(f"\n✅ Final model & tokenizer disimpan di: {save_dir_p4}")

# ============================================================
# Menyimpan Model ke Google Drive
# ============================================================

drive_save_dir_p4 = "/content/drive/MyDrive/ModelSkripsi/skripsi_indobertweet_final_model_p4"

# Kalau folder tujuan sudah ada, hapus dulu (opsional, supaya bersih)
if os.path.exists(drive_save_dir_p4):
    shutil.rmtree(drive_save_dir_p4)

shutil.copytree(save_dir_p4, drive_save_dir_p4)
print(f"✅ Model & tokenizer juga disalin ke Google Drive: {drive_save_dir_p4}")


=== EVALUASI LENGKAP TEST SET - P4 ===
Akurasi (sklearn)         : 0.8264
Precision Macro           : 0.7996
Recall Macro              : 0.7039
F1-Score Macro (sklearn)  : 0.7328
F1-Score Macro (Trainer)  : 0.7328
Akurasi (Trainer)         : 0.8264

--- Classification Report per Kelas - P4 ---
              precision    recall  f1-score   support

         neg     0.8868    0.7966    0.8393        59
         neu     0.7976    0.9306    0.8590        72
         pos     0.7143    0.3846    0.5000        13

    accuracy                         0.8264       144
   macro avg     0.7996    0.7039    0.7328       144
weighted avg     0.8266    0.8264    0.8185       144


--- Confusion Matrix - P4 (rows = true, cols = pred) ---
     neg  neu  pos
neg   47   10    2
neu    5   67    0
pos    1    7    5

✅ Final model & tokenizer disimpan di: final_indobertweet_model/best_model
✅ Model & tokenizer juga disalin ke Google Drive: /content/drive/MyDrive/ModelSkripsi/skripsi_indobertweet_final_