
# Fine-Tuning IndoBERTweet untuk Analisis Sentimen Layanan KAI

In [1]:
# Mount Google Drive in the Colab runtime; the final cell of this notebook
# copies the trained model to a folder under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Import & Konfigurasi Awal
import os
import random
import shutil
from collections import Counter

import evaluate
import numpy as np
import pandas as pd
import torch
from datasets import Dataset, DatasetDict
from sklearn.metrics import (
    accuracy_score,
    precision_recall_fscore_support,
    classification_report,
    confusion_matrix,
)
from sklearn.model_selection import train_test_split
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments
)

# Seed every RNG this notebook draws from so a fresh "Restart & Run All"
# reproduces the same results: Python's `random` (used by the random-deletion
# augmentation), NumPy, and PyTorch.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Prefer the GPU when the runtime exposes one.
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", device)

Device: cuda


In [4]:
# Runtime sanity check: CUDA availability, the CUDA version reported by
# PyTorch, and the PyTorch version itself (one value per line).
import torch

for runtime_fact in (torch.cuda.is_available(), torch.version.cuda, torch.__version__):
    print(runtime_fact)


True
12.6
2.9.0+cu126


In [5]:
# File configuration and preprocessing scenarios P1–P4.
# Path to the preprocessed CSV (merged A + C).
DATA_FILE = "preprocessed_merged_stepwise.csv"

# Each preprocessing scenario reads its text from a different CSV column.
SCENARIO_TO_TEXTCOL = dict(
    P1="text_clean",               # cleaning only
    P2="text_nostop_text",         # stopword removal only
    P3="text_stemmed_nostopword",  # stemming only
    P4="text_stemmed",             # stopword removal + stemming
)

# Scenario to run; change this value to switch between P1–P4.
SCENARIO = "P1"

# Name of the CSV column holding the sentiment label.
label_col = "label"

active_text_col = SCENARIO_TO_TEXTCOL[SCENARIO]
print("Skenario aktif:", SCENARIO, "-> kolom teks:", active_text_col)


Skenario aktif: P1 -> kolom teks: text_clean


In [6]:
# Data loading & label normalisation.
df = pd.read_csv(DATA_FILE, encoding="utf-8-sig")
# Column headers are compared case-insensitively and without padding.
df = df.rename(columns=lambda column_name: column_name.strip().lower())
print("Kolom tersedia:", df.columns.tolist())

text_col = SCENARIO_TO_TEXTCOL[SCENARIO]

# Fail early when the columns this scenario needs are missing.
assert text_col in df.columns, f"Kolom teks '{text_col}' tidak ditemukan."
assert label_col in df.columns, f"Kolom label '{label_col}' tidak ditemukan."

# Drop rows whose text or label is missing.
df = df.dropna(subset=[text_col, label_col])

# Collapse the Indonesian / English label spellings onto three short codes.
label_aliases = {
    "negatif": "neg",
    "negative": "neg",
    "positif": "pos",
    "positive": "pos",
    "netral": "neu",
    "neutral": "neu",
}
normalised_labels = (
    df[label_col].astype(str).str.strip().str.lower().replace(label_aliases)
)
df = df.assign(**{label_col: normalised_labels})

# Keep only the three target classes.
is_known_label = df[label_col].isin(["neg", "neu", "pos"])
df = df[is_known_label]

print("\nDistribusi label setelah normalisasi:")
print(df[label_col].value_counts())


Kolom tersedia: ['id_str', 'created_at', 'user_id_str', 'conversation_id_str', 'full_text', 'label', 'text_casefold', 'text_clean', 'tokens', 'tokens_nostop', 'text_nostop_text', 'tokens_stem', 'text_stemmed', 'emoji_drop_from_raw', 'emoji_map_from_raw', 'emoji_drop_from_stemmed', 'emoji_map_from_stemmed', 'text_stemmed_nostopword']

Distribusi label setelah normalisasi:
label
neu    721
neg    585
pos    128
Name: count, dtype: int64


Mapping Label & Split Train / Validasi / Test (80/10/10)

In [7]:
label2id = {"neg": 0, "neu": 1, "pos": 2}
id2label = {class_id: class_name for class_name, class_id in label2id.items()}

all_texts = df[text_col].astype(str)
all_labels = df[label_col]

# 1) Stratified split: 80% train, 20% held out.
train_texts, temp_texts, train_labels, temp_labels = train_test_split(
    all_texts,
    all_labels,
    test_size=0.2,
    random_state=SEED,
    stratify=all_labels,
)

# 2) Halve the held-out part into validation and test (10% + 10%).
val_texts, test_texts, val_labels, test_labels = train_test_split(
    temp_texts,
    temp_labels,
    test_size=0.5,
    random_state=SEED,
    stratify=temp_labels,
)

# Build one frame per split with the string labels already mapped to ids.
train_df, val_df, test_df = (
    pd.DataFrame({"text": split_texts, "label": split_labels.map(label2id)})
    for split_texts, split_labels in (
        (train_texts, train_labels),
        (val_texts, val_labels),
        (test_texts, test_labels),
    )
)

for split_caption, split_frame in (
    ("TRAIN:", train_df),
    ("VAL  :", val_df),
    ("TEST :", test_df),
):
    print(split_caption, split_frame["label"].value_counts().to_dict())


TRAIN: {1: 577, 0: 468, 2: 102}
VAL  : {1: 72, 0: 58, 2: 13}
TEST : {1: 72, 0: 59, 2: 13}


Konversi ke HuggingFace Dataset & Tokenisasi

In [8]:
# Wrap the three pandas splits into a Hugging Face DatasetDict (index dropped).
split_frames = {"train": train_df, "validation": val_df, "test": test_df}
ds = DatasetDict({
    split_name: Dataset.from_pandas(split_frame, preserve_index=False)
    for split_name, split_frame in split_frames.items()
})

print(ds)

# Pretrained checkpoint used for both the tokenizer and the classifier.
MODEL_NAME = "indolem/indobertweet-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize(batch):
    """Encode the ``"text"`` field of ``batch`` with the notebook-level ``tokenizer``.

    The tokenizer is called with truncation enabled and ``max_length`` padding
    at a length of 128. Whatever the tokenizer returns is returned as-is, so
    the function can be passed to ``Dataset.map(..., batched=True)``.
    """
    encode_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 128,
    }
    return tokenizer(batch["text"], **encode_options)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 1147
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 143
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 144
    })
})


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [9]:
# Evaluation metric definitions: accuracy and F1, loaded through `evaluate`.
metric_accuracy, metric_f1 = (
    evaluate.load(metric_id) for metric_id in ("accuracy", "f1")
)

def compute_metrics(eval_pred):
    """Turn a ``(logits, labels)`` pair into accuracy and macro-F1.

    The predicted class is the arg-max over the last axis of ``logits``.
    Returns a dict with the keys ``"accuracy"`` and ``"f1_macro"``.
    """
    logits, references = eval_pred
    predicted_ids = np.argmax(logits, axis=-1)

    accuracy = metric_accuracy.compute(
        predictions=predicted_ids, references=references
    )["accuracy"]
    macro_f1 = metric_f1.compute(
        predictions=predicted_ids, references=references, average="macro"
    )["f1"]

    return {"accuracy": accuracy, "f1_macro": macro_f1}


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Fungsi Utility: Jalankan 1 Eksperimen (untuk H1–H3 dan P1–P4)

In [10]:
def _print_float_metrics(title, results):
    """Print ``title`` followed by every float-valued entry of ``results`` (4 decimals)."""
    print(title)
    for key, value in results.items():
        if isinstance(value, float):
            print(f"{key}: {value:.4f}")


def run_experiment(
    ds_encoded,
    learning_rate=2e-5,
    batch_size=16,
    num_epochs=5,
    output_dir="indobertweet-exp",
    run_name="",
    save_total_limit=None,
):
    """Fine-tune the sequence classifier once and evaluate it on validation and test.

    Args:
        ds_encoded: Mapping with tokenized ``"train"``, ``"validation"`` and
            ``"test"`` splits.
        learning_rate: Learning rate passed to ``TrainingArguments``.
        batch_size: Per-device batch size for both training and evaluation.
        num_epochs: Number of training epochs.
        output_dir: Directory for checkpoints and logs; the final model is
            written to ``<output_dir>/best_model``.
        run_name: Label used only in the printed banner.
        save_total_limit: Optional cap on the number of checkpoints kept in
            ``output_dir``. ``None`` (default) keeps one checkpoint per epoch,
            exactly as before.

    Returns:
        ``(val_results, test_results, trainer)``: the two metric dicts from
        ``trainer.evaluate`` and the ``Trainer`` instance itself.
    """
    # Local import so this cell does not depend on edits to the import cell.
    from transformers import set_seed

    print(f"\n=== Running experiment: {run_name} ===")
    print(f"lr={learning_rate}, batch_size={batch_size}, epochs={num_epochs}")

    # The classification head is freshly initialised by `from_pretrained`.
    # Re-seed *before* building the model so every run (e.g. each step of the
    # H1–H3 searches in one kernel) starts from the same head; otherwise the
    # head depends on how much RNG state earlier runs consumed.
    set_seed(SEED)

    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME,
        num_labels=3,
        id2label=id2label,
        label2id=label2id
    ).to(device)

    args = TrainingArguments(
        output_dir=output_dir,
        eval_strategy="epoch",
        save_strategy="epoch",
        learning_rate=learning_rate,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=num_epochs,
        weight_decay=0.01,
        # Reload the checkpoint with the best validation macro-F1 after training.
        load_best_model_at_end=True,
        metric_for_best_model="eval_f1_macro",
        save_total_limit=save_total_limit,
        report_to="none",
        logging_dir=os.path.join(output_dir, "logs"),
        logging_strategy="epoch",
        seed=SEED,
    )

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=ds_encoded["train"],
        eval_dataset=ds_encoded["validation"],
        # `tokenizer=` is deprecated on Trainer; `processing_class` replaces it.
        processing_class=tokenizer,
        compute_metrics=compute_metrics,
    )

    trainer.train()

    # Validation metrics: the ones hyperparameter selection should use.
    val_results = trainer.evaluate(ds_encoded["validation"])
    _print_float_metrics("\n[VALIDATION RESULTS]", val_results)

    # Final metrics on the held-out test split.
    test_results = trainer.evaluate(ds_encoded["test"])
    _print_float_metrics("\n[TEST RESULTS]", test_results)

    # Persist model and tokenizer side by side.
    save_dir = os.path.join(output_dir, "best_model")
    trainer.save_model(save_dir)
    tokenizer.save_pretrained(save_dir)
    print(f"\n✅ Model & tokenizer disimpan di: {save_dir}")

    return val_results, test_results, trainer


In [11]:
# Metric columns shared by every hyperparameter-search result table.
_SEARCH_METRIC_COLUMNS = ["val_f1_macro", "val_accuracy", "test_f1_macro", "test_accuracy"]


def _run_single_param_search(
    ds_encoded,
    values,
    *,
    prefix,
    tag,
    result_key,
    varied_kwarg,
    fixed_kwargs,
    base_output_dir,
):
    """Run ``run_experiment`` once per value of a single varied hyperparameter.

    Args:
        ds_encoded: Tokenized dataset splits forwarded to ``run_experiment``.
        values: Candidate values of the varied hyperparameter.
        prefix: Run-name prefix (``"H1"``, ``"H2"`` or ``"H3"``).
        tag: Short tag used in run names and output directories.
        result_key: Column name under which the varied value is recorded.
        varied_kwarg: ``run_experiment`` keyword that receives the value.
        fixed_kwargs: Keywords held constant across all runs.
        base_output_dir: Prefix of each run's output directory.

    Returns:
        A DataFrame with one row per value and the columns
        ``[result_key, *_SEARCH_METRIC_COLUMNS]``. The columns are present even
        when ``values`` is empty.
    """
    records = []
    for value in values:
        experiment_kwargs = dict(fixed_kwargs)
        experiment_kwargs[varied_kwarg] = value

        val_res, test_res, _ = run_experiment(
            ds_encoded,
            output_dir=f"{base_output_dir}_{tag}_{value}",
            run_name=f"{prefix}_{tag}_{value}",
            **experiment_kwargs,
        )

        records.append({
            result_key: value,
            "val_f1_macro": val_res["eval_f1_macro"],
            "val_accuracy": val_res["eval_accuracy"],
            "test_f1_macro": test_res["eval_f1_macro"],
            "test_accuracy": test_res["eval_accuracy"],
        })

    return pd.DataFrame(records, columns=[result_key, *_SEARCH_METRIC_COLUMNS])


def run_h1_lr_search(
    ds_encoded,
    learning_rates,
    batch_size,
    num_epochs,
    base_output_dir="exp_H1",
):
    """
    H1: search over learning rates with batch size and epoch count held fixed.

    Returns a DataFrame with one row per learning rate (column ``lr``) plus the
    validation/test macro-F1 and accuracy of that run.
    """
    return _run_single_param_search(
        ds_encoded,
        learning_rates,
        prefix="H1",
        tag="lr",
        result_key="lr",
        varied_kwarg="learning_rate",
        fixed_kwargs={"batch_size": batch_size, "num_epochs": num_epochs},
        base_output_dir=base_output_dir,
    )


def run_h2_epoch_search(
    ds_encoded,
    epochs_list,
    learning_rate,
    batch_size,
    base_output_dir="exp_H2",
):
    """
    H2: search over epoch counts with learning rate and batch size held fixed.

    Returns a DataFrame with one row per epoch count (column ``epochs``) plus
    the validation/test macro-F1 and accuracy of that run.
    """
    return _run_single_param_search(
        ds_encoded,
        epochs_list,
        prefix="H2",
        tag="ep",
        result_key="epochs",
        varied_kwarg="num_epochs",
        fixed_kwargs={"learning_rate": learning_rate, "batch_size": batch_size},
        base_output_dir=base_output_dir,
    )


def run_h3_batch_search(
    ds_encoded,
    batch_sizes,
    learning_rate,
    num_epochs,
    base_output_dir="exp_H3",
):
    """
    H3: search over batch sizes with learning rate and epoch count held fixed.

    Returns a DataFrame with one row per batch size (column ``batch_size``)
    plus the validation/test macro-F1 and accuracy of that run.
    """
    return _run_single_param_search(
        ds_encoded,
        batch_sizes,
        prefix="H3",
        tag="bs",
        result_key="batch_size",
        varied_kwarg="batch_size",
        fixed_kwargs={"learning_rate": learning_rate, "num_epochs": num_epochs},
        base_output_dir=base_output_dir,
    )


Jalankan P1–P4 dengan Hyperparameter Terbaik

In [12]:
import random

# Negations and polar words that must survive random deletion, because
# removing them can flip or erase the sentiment of a sentence.
_PROTECTED_TOKENS = frozenset({
    "tidak", "tak", "gak", "ga", "nggak", "enggak", "bukan",
    "kurang", "belum",
    "bagus", "buruk", "jelek", "parah", "mantap", "baik",
    "telat", "lambat", "error", "gagal",
})

# Punctuation ignored at token edges when checking for protected words, so
# that "tidak," or "parah!" are still recognised.
_TOKEN_EDGE_PUNCTUATION = ".,!?;:\"'()[]{}"


def augment_text_random_deletion(text, p_del=0.1):
    """
    Random-deletion augmentation that never removes sentiment-bearing tokens.

    The text is split on whitespace. Each token is dropped with probability
    ``p_del`` (drawn from the global ``random`` module) unless its lower-cased
    form, with surrounding punctuation stripped, is a protected negation or
    polar word.

    Args:
        text: Input sentence.
        p_del: Per-token deletion probability.

    Returns:
        The surviving tokens joined by single spaces. ``text`` is returned
        unchanged when it has three or fewer tokens, or when every token would
        have been deleted.
    """
    tokens = text.split()
    if len(tokens) <= 3:
        return text

    # Short-circuit order matters: protected tokens must not consume a random
    # draw, only unprotected tokens do.
    kept_tokens = [
        tok
        for tok in tokens
        if tok.lower().strip(_TOKEN_EDGE_PUNCTUATION) in _PROTECTED_TOKENS
        or random.random() >= p_del
    ]

    return " ".join(kept_tokens) if kept_tokens else text

def undersample_then_augment(
    train_df,
    seed=42,
    p_del=0.1,
):
    """
    Balance a training frame in two stages.

    1) Undersampling: every class is reduced to the size of the smallest
       class in the *original* frame.
    2) Augmentation (random deletion): every class is then grown back up to
       the size of the largest class in the *original* frame.

    Args:
        train_df: DataFrame with ``"text"`` and ``"label"`` columns.
        seed: Seed for the global ``random`` / NumPy RNGs and for every
            ``DataFrame.sample`` call.
        p_del: Deletion probability forwarded to
            ``augment_text_random_deletion``.

    Returns:
        A new, shuffled DataFrame with a fresh 0..n-1 index.
    """
    # The augmentation helper draws from the global `random` module.
    random.seed(seed)
    np.random.seed(seed)

    # Class sizes BEFORE any resampling define both targets.
    original_counts = train_df["label"].value_counts()
    target_under = int(original_counts.min())
    target_aug = int(original_counts.max())

    print(f"[HYBRID] target_under(min_count)={target_under}, target_aug(max_count)={target_aug}")

    def _shrink(class_rows):
        # Oversized classes are cut to the minority size; the rest are only shuffled.
        if len(class_rows) > target_under:
            return class_rows.sample(n=target_under, random_state=seed)
        return class_rows.sample(frac=1, random_state=seed)

    # 1) UNDERSAMPLING
    undersampled_df = pd.concat(
        [
            _shrink(train_df[train_df["label"] == label_id])
            for label_id in original_counts.index
        ],
        ignore_index=True,
    )

    print("[HYBRID] Distribusi setelah UNDERSAMPLING:")
    print(undersampled_df["label"].value_counts())

    # 2) AUGMENTATION
    pieces = [undersampled_df]
    for label_id, current_size in undersampled_df["label"].value_counts().items():
        missing = target_aug - current_size
        if missing <= 0:
            continue

        class_rows = undersampled_df[undersampled_df["label"] == label_id]

        # Repeat the class often enough to cover the gap, then trim to size.
        copies = -(-missing // len(class_rows))
        synthetic = pd.concat([class_rows] * copies, ignore_index=True).iloc[:missing].copy()
        synthetic["text"] = synthetic["text"].apply(
            augment_text_random_deletion, p_del=p_del
        )
        pieces.append(synthetic)

    final_df = (
        pd.concat(pieces, ignore_index=True)
        .sample(frac=1, random_state=seed)
        .reset_index(drop=True)
    )

    print("[HYBRID] Distribusi setelah AUGMENTATION:")
    print(final_df["label"].value_counts())

    return final_df

In [13]:
def load_dataset_for_scenario(
    scenario: str,
    data_file: str,
    scenario_to_textcol: dict,
    label_col: str,
    seed: int,
    tokenizer,
    max_length: int = 128,
):
    """
    Build the tokenized train/validation/test splits for one preprocessing scenario.

    Steps: read the CSV, normalise the labels to ``neg`` / ``neu`` / ``pos``,
    make a stratified 80/10/10 split, map labels to ids with the notebook-level
    ``label2id``, balance the training split (scenario ``"P1"`` only) and
    tokenize every split.

    Args:
        scenario: Scenario key (``"P1"``–``"P4"``) looked up in
            ``scenario_to_textcol``.
        data_file: Path of the preprocessed CSV file.
        scenario_to_textcol: Mapping from scenario key to text column name.
        label_col: Name of the label column.
        seed: Random state for both splits and for the training-set balancing.
        tokenizer: Tokenizer used to encode the ``"text"`` column of every split.
        max_length: Fixed length used for truncation and padding.

    Returns:
        A tokenized ``DatasetDict`` with ``"train"``, ``"validation"`` and
        ``"test"`` splits.

    Raises:
        KeyError: If ``scenario`` is not a key of ``scenario_to_textcol``.
        AssertionError: If the text or label column is missing from the CSV.
    """
    text_col = scenario_to_textcol[scenario]

    print("\n==============================")
    print("Skenario:", scenario, "| text col:", text_col)
    print("==============================")

    # --- Load & basic cleaning ---
    df = pd.read_csv(data_file, encoding="utf-8-sig")
    df.columns = [c.strip().lower() for c in df.columns]

    assert text_col in df.columns, f"Kolom teks '{text_col}' tidak ditemukan."
    assert label_col in df.columns, f"Kolom label '{label_col}' tidak ditemukan."

    df = df.dropna(subset=[text_col, label_col])

    # --- Normalise labels to the three classes ---
    df[label_col] = (
        df[label_col]
        .astype(str).str.strip().str.lower()
        .replace({
            "negatif": "neg", "negative": "neg",
            "positif": "pos", "positive": "pos",
            "netral": "neu", "neutral": "neu",
        })
    )
    df = df[df[label_col].isin(["neg", "neu", "pos"])]

    # --- Stratified 80/10/10 split: train / validation / test ---
    train_texts, temp_texts, train_labels, temp_labels = train_test_split(
        df[text_col].astype(str),
        df[label_col],
        test_size=0.2,
        random_state=seed,
        stratify=df[label_col]
    )
    val_texts, test_texts, val_labels, test_labels = train_test_split(
        temp_texts,
        temp_labels,
        test_size=0.5,
        random_state=seed,
        stratify=temp_labels
    )

    train_df = pd.DataFrame({"text": train_texts, "label": train_labels})
    val_df   = pd.DataFrame({"text": val_texts, "label": val_labels})
    test_df  = pd.DataFrame({"text": test_texts, "label": test_labels})

    train_df["label"] = train_df["label"].map(label2id)
    val_df["label"]   = val_df["label"].map(label2id)
    test_df["label"]  = test_df["label"].map(label2id)

    print("TRAIN sebelum augmentation:", train_df["label"].value_counts().to_dict())
    print("VAL  :", val_df["label"].value_counts().to_dict())
    print("TEST :", test_df["label"].value_counts().to_dict())

    if scenario == "P1":
        # TRAIN ONLY: undersample to the minority size, then augment back up
        # to the original majority size. Validation and test stay untouched.
        train_df = undersample_then_augment(
            train_df,
            seed=seed,
            p_del=0.1,
        )

    ds = DatasetDict({
        "train": Dataset.from_pandas(train_df, preserve_index=False),
        "validation": Dataset.from_pandas(val_df, preserve_index=False),
        "test": Dataset.from_pandas(test_df, preserve_index=False),
    })

    # --- Tokenization ---
    # Encode with the tokenizer that was passed in; previously this argument
    # was ignored in favour of the notebook-level `tokenize` helper.
    def _encode(batch):
        return tokenizer(
            batch["text"],
            truncation=True,
            padding="max_length",
            max_length=max_length,
        )

    ds_encoded = ds.map(_encode, batched=True)

    return ds_encoded


Step 0: Pastikan dataset & ds_encoded siap (misal pakai P1 dulu)

P1

In [14]:
# Scenario P1: build the balanced, tokenized dataset and run the final training.
SCENARIO = "P1"

ds_encoded = load_dataset_for_scenario(
    scenario=SCENARIO,
    data_file=DATA_FILE,
    scenario_to_textcol=SCENARIO_TO_TEXTCOL,
    label_col=label_col,
    seed=SEED,
    tokenizer=tokenizer,
)

# Hyperparameters used for the final run.
best_lr, best_epochs, best_batch = 3e-05, 10, 16

print("Best hyperparameters:")
for caption, chosen_value in (
    ("lr     :", best_lr),
    ("epochs :", best_epochs),
    ("batch  :", best_batch),
):
    print(caption, chosen_value)

final_run_settings = {
    "learning_rate": best_lr,
    "batch_size": best_batch,
    "num_epochs": best_epochs,
    "output_dir": "final_indobertweet_model_p1_textaug",
    "run_name": "final_run_p1_textaug",
}
final_val_res_p1, final_test_res_p1, final_trainer_p1 = run_experiment(
    ds_encoded, **final_run_settings
)


Skenario: P1 | text col: text_clean
TRAIN sebelum augmentation: {1: 577, 0: 468, 2: 102}
VAL  : {1: 72, 0: 58, 2: 13}
TEST : {1: 72, 0: 59, 2: 13}
[HYBRID] target_under(min_count)=102, target_aug(max_count)=577
[HYBRID] Distribusi setelah UNDERSAMPLING:
label
1    102
0    102
2    102
Name: count, dtype: int64
[HYBRID] Distribusi setelah AUGMENTATION:
label
1    577
0    577
2    577
Name: count, dtype: int64


Map:   0%|          | 0/1731 [00:00<?, ? examples/s]

Map:   0%|          | 0/143 [00:00<?, ? examples/s]

Map:   0%|          | 0/144 [00:00<?, ? examples/s]

Best hyperparameters:
lr     : 3e-05
epochs : 10
batch  : 16

=== Running experiment: final_run_p1_textaug ===
lr=3e-05, batch_size=16, epochs=10


pytorch_model.bin:   0%|          | 0.00/445M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/445M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indolem/indobertweet-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None}.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro
1,0.7354,0.816243,0.678322,0.633514
2,0.0738,1.290455,0.678322,0.636509
3,0.002,1.679723,0.664336,0.622058
4,0.0006,1.748377,0.671329,0.620901
5,0.0005,1.801017,0.671329,0.621906
6,0.0004,1.681819,0.713287,0.649957
7,0.0003,1.946848,0.657343,0.609548
8,0.0002,2.030064,0.657343,0.607584
9,0.0003,2.0313,0.657343,0.617781
10,0.0002,2.015905,0.65035,0.605818



[VALIDATION RESULTS]
eval_loss: 1.6818
eval_accuracy: 0.7133
eval_f1_macro: 0.6500
eval_runtime: 1.0890
eval_samples_per_second: 131.3150
eval_steps_per_second: 8.2650
epoch: 10.0000

[TEST RESULTS]
eval_loss: 1.5789
eval_accuracy: 0.7083
eval_f1_macro: 0.6475
eval_runtime: 1.0524
eval_samples_per_second: 136.8270
eval_steps_per_second: 8.5520
epoch: 10.0000

✅ Model & tokenizer disimpan di: final_indobertweet_model_p1_textaug/best_model


In [15]:

# ============================================================
# Full test-set evaluation: accuracy, precision, recall, F1, macro F1
# and the confusion matrix
# ============================================================

# Predict on the test split.
pred_output_p1= final_trainer_p1.predict(ds_encoded["test"])
y_true_p1= pred_output_p1.label_ids
y_pred_p1= np.argmax(pred_output_p1.predictions, axis=-1)

# Accuracy
test_acc_p1 = accuracy_score(y_true_p1, y_pred_p1)

# Macro-averaged precision, recall and F1. `zero_division=0` scores a class
# that is never predicted as 0 without emitting a warning.
prec_macro_p1, rec_macro_p1, f1_macro_p1, _ = precision_recall_fscore_support(
    y_true_p1, y_pred_p1, average="macro", zero_division=0
)

print("\n=== EVALUASI LENGKAP TEST SET - P1 + Undersampling + Text aug ===")
print(f"Akurasi (sklearn)         : {test_acc_p1:.4f}")
print(f"Precision Macro           : {prec_macro_p1:.4f}")
print(f"Recall Macro              : {rec_macro_p1:.4f}")
print(f"F1-Score Macro (sklearn)  : {f1_macro_p1:.4f}")
print(f"F1-Score Macro (Trainer)  : {final_test_res_p1['eval_f1_macro']:.4f}")
print(f"Akurasi (Trainer)         : {final_test_res_p1['eval_accuracy']:.4f}")

# Per-class report (neg, neu, pos). The class ids are passed explicitly so the
# report and the confusion matrix always have three rows/columns in this
# order, even if a class is absent from both the labels and the predictions.
label_ids = [0, 1, 2]
target_names = ["neg", "neu", "pos"]
print("\n--- Classification Report per Kelas - P1 + Undersampling + Text aug ---")
print(classification_report(
    y_true_p1,
    y_pred_p1,
    labels=label_ids,
    target_names=target_names,
    digits=4,
    zero_division=0,
))

# Confusion matrix
cm_p1 = confusion_matrix(y_true_p1, y_pred_p1, labels=label_ids)
cm_p1_df = pd.DataFrame(cm_p1, index=target_names, columns=target_names)
print("\n--- Confusion Matrix - P1 +  Undersampling + Text aug(rows = true, cols = pred) ---")
print(cm_p1_df)

save_dir_p1 = "final_indobertweet_model/best_model"
final_trainer_p1.save_model(save_dir_p1)
tokenizer.save_pretrained(save_dir_p1)
print(f"\n✅ Final model & tokenizer disimpan di: {save_dir_p1}")

# ============================================================
# Copy the model to Google Drive
# ============================================================

drive_save_dir_p1 = "/content/drive/MyDrive/ModelSkripsi/skripsi_indobertweet_final_model_p1_undersampling_min_textaug_protectedtoken"

# Remove an existing destination first so the copy starts from a clean folder
# (copytree would otherwise fail on an existing directory).
if os.path.exists(drive_save_dir_p1):
    shutil.rmtree(drive_save_dir_p1)

shutil.copytree(save_dir_p1, drive_save_dir_p1)
print(f"✅ Model & tokenizer juga disalin ke Google Drive: {drive_save_dir_p1}")


=== EVALUASI LENGKAP TEST SET - P1 + Undersampling + Text aug ===
Akurasi (sklearn)         : 0.7083
Precision Macro           : 0.6352
Recall Macro              : 0.6801
F1-Score Macro (sklearn)  : 0.6475
F1-Score Macro (Trainer)  : 0.6475
Akurasi (Trainer)         : 0.7083

--- Classification Report per Kelas - P1 + Undersampling + Text aug ---
              precision    recall  f1-score   support

         neg     0.7500    0.6610    0.7027        59
         neu     0.7746    0.7639    0.7692        72
         pos     0.3810    0.6154    0.4706        13

    accuracy                         0.7083       144
   macro avg     0.6352    0.6801    0.6475       144
weighted avg     0.7290    0.7083    0.7150       144


--- Confusion Matrix - P1 +  Undersampling + Text aug(rows = true, cols = pred) ---
     neg  neu  pos
neg   39   12    8
neu   12   55    5
pos    1    4    8

✅ Final model & tokenizer disimpan di: final_indobertweet_model/best_model
✅ Model & tokenizer juga disalin