In [1]:
import os
import numpy as np
import pandas as pd
import random
import copy

from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim

from PIL import Image
import torchvision.models as models
import torchvision.transforms as transforms

import re
from collections import Counter
from transformers import (
    AutoConfig,
    AutoTokenizer, 
    AutoModel,
    AutoModelForSequenceClassification, 
    TrainingArguments, 
    Trainer, 
    DataCollatorWithPadding,
    get_scheduler
)

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from tqdm import tqdm
import time
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

2025-08-03 05:54:45.451564: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1754200485.626155      18 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1754200485.678875      18 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
seed = 0
def seed_everything(seed):
    os.environ["PYTHONHASHSEED"] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

seed_everything(seed)

In [4]:
# Generator untuk DataLoader
g = torch.Generator()
g.manual_seed(seed)

def seed_worker(worker_id):
    """Fungsi untuk memastikan setiap worker memiliki seed yang sama"""
    worker_seed = torch.initial_seed() % 2**32
    np.random.seed(worker_seed)
    random.seed(worker_seed)

In [5]:
# Paths
train_csv_path = '/kaggle/input/train-test/train_data.csv'
test_csv_path = '/kaggle/input/train-test/test_data.csv'

In [6]:
# Load data
train_df = pd.read_csv(train_csv_path)
test_df = pd.read_csv(test_csv_path)

In [7]:
# Drop missing values
print('Missing values in Train:\n', train_df.isna().sum())
train_df = train_df.dropna()

Missing values in Train:
 File Name         0
Extracted Text    0
Class             0
dtype: int64


In [8]:
import re

def clean_texts(texts):
    cleaned_texts = []

    # Kata 1–2 huruf yang penting dan tidak boleh dihapus
    exceptions = {"di", "ke", "ya", "jl"}

    for text in texts:
        # ----- BASIC CLEANING -----
        text = re.sub(r"http\S+", "", text)                    # Hapus URL
        text = re.sub(r"\n", " ", text)                        # Ganti newline dengan spasi
        text = re.sub(r"[^a-zA-Z]", " ", text)             # Sisakan huruf, angka, apostrof
        text = re.sub(r"\s{2,}", " ", text).strip().lower()    # Hapus spasi ganda & ke lowercase

        # ----- FILTERING -----
        words = text.split()
        filtered_words = [
            w for w in words
            if (len(w) > 2 or w in exceptions)
            and not re.fullmatch(r"[aeiou]{3,}", w)            # Hindari vokal berulang ("aaa")
            and not re.fullmatch(r"[bcdfghjklmnpqrstvwxyz]{3,}", w)  # Hindari konsonan berulang ("kkk")
            and len(w) <= 20                                   # Batasi kata terlalu panjang
        ]

        cleaned_texts.append(" ".join(filtered_words))

    return cleaned_texts

In [9]:
# # Clean text
train_df['cleaned_text'] = clean_texts(train_df['Extracted Text'])
test_df['cleaned_text'] = clean_texts(test_df['Extracted Text'])
# Gabungkan teks dari train dan test sementara
# combined_texts = pd.concat([train_df['Extracted Text'], test_df['Extracted Text']], ignore_index=True)

# # Bersihkan semua teks gabungan
# cleaned_all = clean_texts(combined_texts)

# # Bagi kembali hasil cleaned text ke dalam train dan test
# train_df['cleaned_text'] = cleaned_all[:len(train_df)]
# test_df['cleaned_text'] = cleaned_all[len(train_df):]

In [10]:
# Drop rows with less than 5 words
train_df = train_df[train_df['cleaned_text'].apply(lambda x: len(str(x).split()) >= 5)]
test_df = test_df[test_df['cleaned_text'].apply(lambda x: len(str(x).split()) >= 5)]
train_df

Unnamed: 0,File Name,Extracted Text,Class,cleaned_text
0,ahoyamigo.com_home.png,KASKUSTOTO LOGIN KASKUSTOTO oiocom KASKUSTOTO ...,judi,kaskustoto login kaskustoto oiocom kaskustoto ...
1,asgard789.bet.png,SUCKBET kunusn Tusluju Unajv ]acipısı aüns I...,judi,suckbet kunusn tusluju unajv acip augn
2,bola442.monster.png,AsF User Name Password Code 6636 LOGIN Lela442...,judi,asf user name password code login lela live ch...
3,maniac-ihokibet.com_page1.png,Download APK Live Chat Hubungı Kamı Lupa Kata ...,judi,download apk live chat hubung kam lupa kata sa...
4,ketohour.com_page4.png,ketohour Beranda 7 @ Q Subscribe Beranda Situs...,judi,ketohour beranda subscribe beranda situs togel...
...,...,...,...,...
3340,bankmandiri.co.id_home.png,Ketik untuk mencari Log In mandırı Perseoranga...,non-judi,ketik untuk mencari log mand perseorangan bisn...
3341,bengkaliskab.go.id_page3.png,"JL Ahmad Yani, Bengkalis Kota Selamat datang d...",non-judi,jl ahmad yani bengkalis kota selamat datang di...
3342,bsmentertainment.com_page9.png,021-22792541 CallWva 08111306600 (Jakarta) | C...,non-judi,callwva jakarta call wva bandurg office hours ...
3343,coinmarketcap.com_page5.png,CoinMarketCap Cryptocurrencies DexScan Exchang...,non-judi,coinmarketcap cryptocurrencies dexscan exchang...


In [11]:
# Cek jumlah duplikasi sebelum dihapus
print("Duplikasi di train:", train_df.duplicated(subset='cleaned_text').sum())
print("Duplikasi di test :", test_df.duplicated(subset='cleaned_text').sum())

# Hapus duplikasi berdasarkan cleaned_text
train_df = train_df.drop_duplicates(subset='cleaned_text').reset_index(drop=True)
test_df = test_df.drop_duplicates(subset='cleaned_text').reset_index(drop=True)

# Cek ulang setelah pembersihan
print("Setelah dihapus:")
print("Train:", len(train_df), "baris")
print("Test :", len(test_df), "baris")

Duplikasi di train: 8
Duplikasi di test : 0
Setelah dihapus:
Train: 3335 baris
Test : 700 baris


In [12]:
# Print jumlah per kelas
print("Distribusi label di Train set:")
print(train_df['Class'].value_counts(), '\n')

print("Distribusi label di Test set:")
print(test_df['Class'].value_counts())

Distribusi label di Train set:
Class
non-judi    1740
judi        1595
Name: count, dtype: int64 

Distribusi label di Test set:
Class
judi        350
non-judi    350
Name: count, dtype: int64


In [13]:
from sklearn.utils import resample

# Pisahkan data berdasarkan kelas
train_judi = train_df[train_df['Class'] == 'judi']
train_nonjudi = train_df[train_df['Class'] == 'non-judi']

# Undersampling kelas mayoritas (non-judi) agar jumlahnya sama dengan kelas judi
train_nonjudi_undersampled = resample(train_nonjudi,
                                      replace=False,      # tanpa duplikasi
                                      n_samples=len(train_judi),  # samakan jumlahnya dengan kelas minoritas
                                      random_state=seed)    # untuk replikasi hasil

# Gabungkan kembali data yang sudah diundersample
train_df_balanced = pd.concat([train_judi, train_nonjudi_undersampled])

# Cek distribusi baru
print("Distribusi label setelah undersampling:")
print(train_df_balanced['Class'].value_counts())

Distribusi label setelah undersampling:
Class
judi        1595
non-judi    1595
Name: count, dtype: int64


In [14]:
label_map = {
    "non-judi": 0,
    "judi": 1
}

train_df_balanced['label'] = train_df_balanced['Class'].map(label_map)
test_df['label'] = test_df['Class'].map(label_map)

In [15]:
traindf, validdf = train_test_split(
    train_df_balanced, test_size=0.2, stratify=train_df_balanced['label'], random_state=seed
)

print(f"Jumlah data train: {len(traindf)}")
print(f"Jumlah data valid: {len(validdf)}")
print(f"Jumlah data test: {len(test_df)}")

Jumlah data train: 2552
Jumlah data valid: 638
Jumlah data test: 700


In [16]:
# Print jumlah per kelas
print("Distribusi label di Train set:")
print(traindf['label'].value_counts(), '\n')

print("Distribusi label di Validation set:")
print(validdf['label'].value_counts(), '\n')

print("Distribusi label di Test set:")
print(test_df['label'].value_counts())

Distribusi label di Train set:
label
1    1276
0    1276
Name: count, dtype: int64 

Distribusi label di Validation set:
label
0    319
1    319
Name: count, dtype: int64 

Distribusi label di Test set:
label
1    350
0    350
Name: count, dtype: int64


In [17]:
class CustomDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length):
        self.dataframe = dataframe.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, index):
        row = self.dataframe.iloc[index]
        label = row['label']
        text = str(row['cleaned_text'])

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            truncation=True,
            return_tensors='pt',
            padding='max_length'
        )

        input_ids = encoding['input_ids'].flatten()
        attention_mask = encoding['attention_mask'].flatten()

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'label': torch.tensor(label, dtype=torch.long)
        }


In [18]:
# Tokenizer
tokenizer = AutoTokenizer.from_pretrained('indobenchmark/indobert-base-p1')
max_length = 128

# Dataset
train_dataset = CustomDataset(traindf, tokenizer, max_length)
valid_dataset = CustomDataset(validdf, tokenizer, max_length)
test_dataset = CustomDataset(test_df, tokenizer, max_length)

tokenizer_config.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [19]:
def run_experiment(batch_size, learning_rate, num_epochs, device):
    experiment_name = f"bs{batch_size}_lr{learning_rate}_ep{num_epochs}"
    print(f"\n===== STARTING EXPERIMENT: {experiment_name} =====\n")

    # Inisialisasi DataLoader
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, 
                              num_workers=4, worker_init_fn=seed_worker, generator=g)
    valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False, 
                              num_workers=4, worker_init_fn=seed_worker, generator=g)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, 
                             num_workers=4, worker_init_fn=seed_worker, generator=g)

    # Model
    text_model = AutoModelForSequenceClassification.from_pretrained('indobenchmark/indobert-base-p1', num_labels=1)
    text_model.to(device)
    is_parallel = False
    if torch.cuda.device_count() > 1:
        print(f"Using {torch.cuda.device_count()} GPUs!")
        text_model = nn.DataParallel(text_model)
        is_parallel = True

    optimizer = optim.Adam(text_model.parameters(), lr=learning_rate)
    criterion = nn.BCEWithLogitsLoss()

    train_losses, train_accuracies = [], []
    valid_losses, valid_accuracies = [], []

    best_val_loss = float('inf')
    best_model_path = f'best_text_model_{experiment_name}_state_dict.pt'

    # Time tracking
    training_start_time = time.time()

    for epoch in range(num_epochs):
        # TRAINING
        epoch_train_start = time.time()
        text_model.train()
        total_loss = 0.0
        correct = 0
        total_samples = 0

        for batch_data in tqdm(train_loader, desc=f"Epoch {epoch+1} Training"):
            input_ids = batch_data['input_ids'].to(device)
            attention_mask = batch_data['attention_mask'].to(device)
            labels = batch_data['label'].float().to(device)

            optimizer.zero_grad()
            outputs = text_model(input_ids=input_ids, attention_mask=attention_mask).logits.squeeze(1)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            predicted = (torch.sigmoid(outputs) > 0.5).long()
            correct += (predicted == labels).sum().item()
            total_samples += labels.size(0)

        epoch_loss = total_loss / len(train_loader)
        epoch_accuracy = correct / total_samples
        train_losses.append(epoch_loss)
        train_accuracies.append(epoch_accuracy)
        epoch_train_duration = time.time() - epoch_train_start

        # VALIDATION
        epoch_val_start = time.time()
        text_model.eval()
        valid_loss = 0.0
        valid_correct = 0
        valid_total_samples = 0

        with torch.no_grad():
            for batch_data in valid_loader:
                input_ids = batch_data['input_ids'].to(device)
                attention_mask = batch_data['attention_mask'].to(device)
                labels = batch_data['label'].float().to(device)

                outputs = text_model(input_ids=input_ids, attention_mask=attention_mask).logits.squeeze(1)
                loss = criterion(outputs, labels)
                valid_loss += loss.item()

                predicted = (torch.sigmoid(outputs) > 0.5).long()
                valid_correct += (predicted == labels).sum().item()
                valid_total_samples += labels.size(0)

        valid_epoch_loss = valid_loss / len(valid_loader)
        valid_epoch_accuracy = valid_correct / valid_total_samples
        valid_losses.append(valid_epoch_loss)
        valid_accuracies.append(valid_epoch_accuracy)
        epoch_val_duration = time.time() - epoch_val_start

        print(f"Epoch [{epoch+1}/{num_epochs}], Train Loss: {epoch_loss:.4f}, Acc: {epoch_accuracy:.4f}, Time: {epoch_train_duration:.2f}s")
        print(f"Epoch [{epoch+1}/{num_epochs}], Val   Loss: {valid_epoch_loss:.4f}, Acc: {valid_epoch_accuracy:.4f}, Time: {epoch_val_duration:.2f}s")

        if valid_epoch_loss < best_val_loss:
            best_val_loss = valid_epoch_loss
            torch.save(text_model.module.state_dict() if is_parallel else text_model.state_dict(), best_model_path)
            print(f"New best model saved at epoch {epoch+1} with val loss: {valid_epoch_loss:.4f}")

    total_training_duration = time.time() - training_start_time
    print(f"\n✅ Total training + validation time: {total_training_duration:.2f} seconds\n")

    # Load best model
    best_model = AutoModelForSequenceClassification.from_pretrained('indobenchmark/indobert-base-p1', num_labels=1)
    best_model.load_state_dict(torch.load(best_model_path))
    best_model.to(device)

    # EVALUATION
    test_start_time = time.time()
    metrics = evaluate_text_model(best_model, test_loader, device)
    test_duration = time.time() - test_start_time
    print(f"🧪 Test time: {test_duration:.2f} seconds")

    return {
        'experiment_name': experiment_name,
        'metrics': metrics,
        'model_path': best_model_path,
        'train_losses': train_losses,
        'train_accuracies': train_accuracies,
        'valid_losses': valid_losses,
        'valid_accuracies': valid_accuracies,
        'best_val_loss': best_val_loss,
        'best_epoch': np.argmin(valid_losses) + 1,
        'train_time': total_training_duration,
        'test_time': test_duration,
    }

In [20]:
# Define function to evaluate text model
def evaluate_text_model(model, data_loader, device): 
    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch_data in tqdm(data_loader, desc="Evaluating Text Model"):
            input_ids = batch_data['input_ids'].to(device)
            attention_mask = batch_data['attention_mask'].to(device)
            labels = batch_data['label'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask).logits.squeeze(1)
            preds = (torch.sigmoid(outputs) > 0.5).cpu().numpy()

            all_preds.extend(preds)
            all_labels.extend(labels.cpu().numpy())
    
    # Calculate metrics
    metrics = {
        'Accuracy': accuracy_score(all_labels, all_preds),
        'Precision': precision_score(all_labels, all_preds),
        'Recall': recall_score(all_labels, all_preds),
        'F1 Score': f1_score(all_labels, all_preds)
    }
    
    return metrics

In [21]:
# Define experiments based on the specified hyperparameters
experiments = [
    {'batch_size': bs, 'learning_rate': lr, 'num_epochs': ep}
    for bs in [16, 32]
    for lr in [5e-5, 3e-5, 2e-5]
    for ep in [2, 3, 4]
]

In [22]:
# Run all experiments and collect results
results = []
for i, exp in enumerate(experiments):
    print(f"\n\n===== RUNNING EXPERIMENT {i+1}/{len(experiments)} =====")
    print(f"Batch Size: {exp['batch_size']}, Learning Rate: {exp['learning_rate']}, Epochs: {exp['num_epochs']}")
    
    experiment_result = run_experiment(
        batch_size=exp['batch_size'],
        learning_rate=exp['learning_rate'],
        num_epochs=exp['num_epochs'],
        device=device
    )
    
    results.append(experiment_result)
    
    # Print current experiment metrics
    print(f"\n----- Results for Experiment {i+1}: {experiment_result['experiment_name']} -----")
    print(f"Best epoch: {experiment_result['best_epoch']} with validation loss: {experiment_result['best_val_loss']:.4f}")
    print("Test Metrics:")
    for metric_name, value in experiment_result['metrics'].items():
        print(f"  {metric_name}: {value:.4f}")



===== RUNNING EXPERIMENT 1/18 =====
Batch Size: 16, Learning Rate: 5e-05, Epochs: 2

===== STARTING EXPERIMENT: bs16_lr5e-05_ep2 =====



pytorch_model.bin:   0%|          | 0.00/498M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]


Epoch 1 Training:   0%|          | 0/160 [00:00<?, ?it/s][A
Epoch 1 Training:   1%|          | 1/160 [00:01<03:22,  1.27s/it][A
Epoch 1 Training:   1%|▏         | 2/160 [00:01<01:42,  1.55it/s][A
Epoch 1 Training:   2%|▏         | 3/160 [00:01<01:09,  2.25it/s][A
Epoch 1 Training:   2%|▎         | 4/160 [00:01<00:54,  2.86it/s][A
Epoch 1 Training:   3%|▎         | 5/160 [00:02<00:46,  3.34it/s][A
Epoch 1 Training:   4%|▍         | 6/160 [00:02<00:41,  3.75it/s][A
Epoch 1 Training:   4%|▍         | 7/160 [00:02<00:37,  4.05it/s][A
Epoch 1 Training:   5%|▌         | 8/160 [00:02<00:35,  4.29it/s][A
Epoch 1 Training:   6%|▌         | 9/160 [00:02<00:33,  4.46it/s][A
Epoch 1 Training:   6%|▋         | 10/160 [00:03<00:32,  4.58it/s][A
Epoch 1 Training:   7%|▋         | 11/160 [00:03<00:31,  4.67it/s][A
Epoch 1 Training:   8%|▊         | 12/160 [00:03<00:31,  4.73it/s][A
Epoch 1 Training:   8%|▊         | 13/160 [00:03<00:30,  4.76it/s][A
Epoch 1 Training:   9%|▉         | 14

Epoch [1/2], Train Loss: 0.1062, Acc: 0.9596, Time: 33.88s
Epoch [1/2], Val   Loss: 0.0599, Acc: 0.9859, Time: 2.42s
New best model saved at epoch 1 with val loss: 0.0599


Epoch 2 Training: 100%|██████████| 160/160 [00:32<00:00,  4.89it/s]


Epoch [2/2], Train Loss: 0.0466, Acc: 0.9882, Time: 32.75s
Epoch [2/2], Val   Loss: 0.0944, Acc: 0.9624, Time: 2.43s

✅ Total training + validation time: 72.26 seconds



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating Text Model: 100%|██████████| 44/44 [00:02<00:00, 16.25it/s]


🧪 Test time: 2.72 seconds

----- Results for Experiment 1: bs16_lr5e-05_ep2 -----
Best epoch: 1 with validation loss: 0.0599
Test Metrics:
  Accuracy: 0.9643
  Precision: 0.9380
  Recall: 0.9943
  F1 Score: 0.9653


===== RUNNING EXPERIMENT 2/18 =====
Batch Size: 16, Learning Rate: 5e-05, Epochs: 3

===== STARTING EXPERIMENT: bs16_lr5e-05_ep3 =====



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1 Training: 100%|██████████| 160/160 [00:32<00:00,  4.88it/s]


Epoch [1/3], Train Loss: 0.0865, Acc: 0.9687, Time: 32.80s
Epoch [1/3], Val   Loss: 0.0497, Acc: 0.9859, Time: 2.45s
New best model saved at epoch 1 with val loss: 0.0497


Epoch 2 Training: 100%|██████████| 160/160 [00:32<00:00,  4.87it/s]


Epoch [2/3], Train Loss: 0.0237, Acc: 0.9953, Time: 32.83s
Epoch [2/3], Val   Loss: 0.0464, Acc: 0.9906, Time: 2.45s
New best model saved at epoch 2 with val loss: 0.0464


Epoch 3 Training: 100%|██████████| 160/160 [00:32<00:00,  4.88it/s]


Epoch [3/3], Train Loss: 0.0237, Acc: 0.9933, Time: 32.78s
Epoch [3/3], Val   Loss: 0.0533, Acc: 0.9922, Time: 2.46s

✅ Total training + validation time: 107.77 seconds



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating Text Model: 100%|██████████| 44/44 [00:02<00:00, 16.36it/s]


🧪 Test time: 2.70 seconds

----- Results for Experiment 2: bs16_lr5e-05_ep3 -----
Best epoch: 2 with validation loss: 0.0464
Test Metrics:
  Accuracy: 0.9843
  Precision: 0.9942
  Recall: 0.9743
  F1 Score: 0.9841


===== RUNNING EXPERIMENT 3/18 =====
Batch Size: 16, Learning Rate: 5e-05, Epochs: 4

===== STARTING EXPERIMENT: bs16_lr5e-05_ep4 =====



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1 Training: 100%|██████████| 160/160 [00:32<00:00,  4.88it/s]


Epoch [1/4], Train Loss: 0.0943, Acc: 0.9698, Time: 32.82s
Epoch [1/4], Val   Loss: 0.0609, Acc: 0.9843, Time: 2.46s
New best model saved at epoch 1 with val loss: 0.0609


Epoch 2 Training: 100%|██████████| 160/160 [00:32<00:00,  4.88it/s]


Epoch [2/4], Train Loss: 0.0287, Acc: 0.9929, Time: 32.79s
Epoch [2/4], Val   Loss: 0.0535, Acc: 0.9828, Time: 2.45s
New best model saved at epoch 2 with val loss: 0.0535


Epoch 3 Training: 100%|██████████| 160/160 [00:32<00:00,  4.88it/s]


Epoch [3/4], Train Loss: 0.0473, Acc: 0.9902, Time: 32.77s
Epoch [3/4], Val   Loss: 0.0729, Acc: 0.9828, Time: 2.45s


Epoch 4 Training: 100%|██████████| 160/160 [00:32<00:00,  4.88it/s]


Epoch [4/4], Train Loss: 0.0099, Acc: 0.9976, Time: 32.78s
Epoch [4/4], Val   Loss: 0.0482, Acc: 0.9922, Time: 2.45s
New best model saved at epoch 4 with val loss: 0.0482

✅ Total training + validation time: 144.25 seconds



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating Text Model: 100%|██████████| 44/44 [00:02<00:00, 16.23it/s]


🧪 Test time: 2.72 seconds

----- Results for Experiment 3: bs16_lr5e-05_ep4 -----
Best epoch: 4 with validation loss: 0.0482
Test Metrics:
  Accuracy: 0.9814
  Precision: 0.9828
  Recall: 0.9800
  F1 Score: 0.9814


===== RUNNING EXPERIMENT 4/18 =====
Batch Size: 16, Learning Rate: 3e-05, Epochs: 2

===== STARTING EXPERIMENT: bs16_lr3e-05_ep2 =====



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1 Training: 100%|██████████| 160/160 [00:32<00:00,  4.87it/s]


Epoch [1/2], Train Loss: 0.0906, Acc: 0.9589, Time: 32.84s
Epoch [1/2], Val   Loss: 0.0478, Acc: 0.9859, Time: 2.45s
New best model saved at epoch 1 with val loss: 0.0478


Epoch 2 Training: 100%|██████████| 160/160 [00:32<00:00,  4.88it/s]


Epoch [2/2], Train Loss: 0.0199, Acc: 0.9945, Time: 32.78s
Epoch [2/2], Val   Loss: 0.0737, Acc: 0.9796, Time: 2.45s

✅ Total training + validation time: 71.29 seconds



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating Text Model: 100%|██████████| 44/44 [00:02<00:00, 16.26it/s]


🧪 Test time: 2.72 seconds

----- Results for Experiment 4: bs16_lr3e-05_ep2 -----
Best epoch: 1 with validation loss: 0.0478
Test Metrics:
  Accuracy: 0.9843
  Precision: 0.9802
  Recall: 0.9886
  F1 Score: 0.9844


===== RUNNING EXPERIMENT 5/18 =====
Batch Size: 16, Learning Rate: 3e-05, Epochs: 3

===== STARTING EXPERIMENT: bs16_lr3e-05_ep3 =====



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1 Training: 100%|██████████| 160/160 [00:32<00:00,  4.87it/s]


Epoch [1/3], Train Loss: 0.1065, Acc: 0.9632, Time: 32.85s
Epoch [1/3], Val   Loss: 0.0476, Acc: 0.9875, Time: 2.46s
New best model saved at epoch 1 with val loss: 0.0476


Epoch 2 Training: 100%|██████████| 160/160 [00:32<00:00,  4.88it/s]


Epoch [2/3], Train Loss: 0.0195, Acc: 0.9965, Time: 32.82s
Epoch [2/3], Val   Loss: 0.0542, Acc: 0.9875, Time: 2.46s


Epoch 3 Training: 100%|██████████| 160/160 [00:32<00:00,  4.88it/s]


Epoch [3/3], Train Loss: 0.0095, Acc: 0.9973, Time: 32.82s
Epoch [3/3], Val   Loss: 0.0660, Acc: 0.9828, Time: 2.48s

✅ Total training + validation time: 106.66 seconds



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating Text Model: 100%|██████████| 44/44 [00:02<00:00, 16.28it/s]


🧪 Test time: 2.71 seconds

----- Results for Experiment 5: bs16_lr3e-05_ep3 -----
Best epoch: 1 with validation loss: 0.0476
Test Metrics:
  Accuracy: 0.9843
  Precision: 0.9802
  Recall: 0.9886
  F1 Score: 0.9844


===== RUNNING EXPERIMENT 6/18 =====
Batch Size: 16, Learning Rate: 3e-05, Epochs: 4

===== STARTING EXPERIMENT: bs16_lr3e-05_ep4 =====



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1 Training: 100%|██████████| 160/160 [00:32<00:00,  4.87it/s]


Epoch [1/4], Train Loss: 0.0980, Acc: 0.9687, Time: 32.85s
Epoch [1/4], Val   Loss: 0.0397, Acc: 0.9875, Time: 2.46s
New best model saved at epoch 1 with val loss: 0.0397


Epoch 2 Training: 100%|██████████| 160/160 [00:32<00:00,  4.87it/s]


Epoch [2/4], Train Loss: 0.0183, Acc: 0.9953, Time: 32.86s
Epoch [2/4], Val   Loss: 0.0651, Acc: 0.9828, Time: 2.46s


Epoch 3 Training: 100%|██████████| 160/160 [00:32<00:00,  4.87it/s]


Epoch [3/4], Train Loss: 0.0151, Acc: 0.9957, Time: 32.83s
Epoch [3/4], Val   Loss: 0.0513, Acc: 0.9875, Time: 2.46s


Epoch 4 Training: 100%|██████████| 160/160 [00:32<00:00,  4.87it/s]


Epoch [4/4], Train Loss: 0.0063, Acc: 0.9984, Time: 32.85s
Epoch [4/4], Val   Loss: 0.0634, Acc: 0.9843, Time: 2.46s

✅ Total training + validation time: 141.98 seconds



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating Text Model: 100%|██████████| 44/44 [00:02<00:00, 16.22it/s]


🧪 Test time: 2.72 seconds

----- Results for Experiment 6: bs16_lr3e-05_ep4 -----
Best epoch: 1 with validation loss: 0.0397
Test Metrics:
  Accuracy: 0.9857
  Precision: 0.9913
  Recall: 0.9800
  F1 Score: 0.9856


===== RUNNING EXPERIMENT 7/18 =====
Batch Size: 16, Learning Rate: 2e-05, Epochs: 2

===== STARTING EXPERIMENT: bs16_lr2e-05_ep2 =====



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1 Training: 100%|██████████| 160/160 [00:32<00:00,  4.87it/s]


Epoch [1/2], Train Loss: 0.0940, Acc: 0.9659, Time: 32.85s
Epoch [1/2], Val   Loss: 0.0451, Acc: 0.9906, Time: 2.47s
New best model saved at epoch 1 with val loss: 0.0451


Epoch 2 Training: 100%|██████████| 160/160 [00:32<00:00,  4.88it/s]


Epoch [2/2], Train Loss: 0.0280, Acc: 0.9914, Time: 32.81s
Epoch [2/2], Val   Loss: 0.0530, Acc: 0.9875, Time: 2.45s

✅ Total training + validation time: 71.35 seconds



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating Text Model: 100%|██████████| 44/44 [00:02<00:00, 16.27it/s]


🧪 Test time: 2.71 seconds

----- Results for Experiment 7: bs16_lr2e-05_ep2 -----
Best epoch: 1 with validation loss: 0.0451
Test Metrics:
  Accuracy: 0.9771
  Precision: 0.9827
  Recall: 0.9714
  F1 Score: 0.9770


===== RUNNING EXPERIMENT 8/18 =====
Batch Size: 16, Learning Rate: 2e-05, Epochs: 3

===== STARTING EXPERIMENT: bs16_lr2e-05_ep3 =====



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1 Training: 100%|██████████| 160/160 [00:32<00:00,  4.88it/s]


Epoch [1/3], Train Loss: 0.0968, Acc: 0.9581, Time: 32.81s
Epoch [1/3], Val   Loss: 0.0587, Acc: 0.9843, Time: 2.47s
New best model saved at epoch 1 with val loss: 0.0587


Epoch 2 Training: 100%|██████████| 160/160 [00:32<00:00,  4.87it/s]


Epoch [2/3], Train Loss: 0.0289, Acc: 0.9918, Time: 32.86s
Epoch [2/3], Val   Loss: 0.0512, Acc: 0.9843, Time: 2.47s
New best model saved at epoch 2 with val loss: 0.0512


Epoch 3 Training: 100%|██████████| 160/160 [00:32<00:00,  4.87it/s]


Epoch [3/3], Train Loss: 0.0098, Acc: 0.9984, Time: 32.83s
Epoch [3/3], Val   Loss: 0.0471, Acc: 0.9922, Time: 2.46s
New best model saved at epoch 3 with val loss: 0.0471

✅ Total training + validation time: 108.98 seconds



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating Text Model: 100%|██████████| 44/44 [00:02<00:00, 16.29it/s]


🧪 Test time: 2.71 seconds

----- Results for Experiment 8: bs16_lr2e-05_ep3 -----
Best epoch: 3 with validation loss: 0.0471
Test Metrics:
  Accuracy: 0.9857
  Precision: 0.9913
  Recall: 0.9800
  F1 Score: 0.9856


===== RUNNING EXPERIMENT 9/18 =====
Batch Size: 16, Learning Rate: 2e-05, Epochs: 4

===== STARTING EXPERIMENT: bs16_lr2e-05_ep4 =====



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1 Training: 100%|██████████| 160/160 [00:32<00:00,  4.87it/s]


Epoch [1/4], Train Loss: 0.0889, Acc: 0.9659, Time: 32.86s
Epoch [1/4], Val   Loss: 0.0556, Acc: 0.9812, Time: 2.46s
New best model saved at epoch 1 with val loss: 0.0556


Epoch 2 Training: 100%|██████████| 160/160 [00:32<00:00,  4.87it/s]


Epoch [2/4], Train Loss: 0.0200, Acc: 0.9957, Time: 32.84s
Epoch [2/4], Val   Loss: 0.0478, Acc: 0.9875, Time: 2.45s
New best model saved at epoch 2 with val loss: 0.0478


Epoch 3 Training: 100%|██████████| 160/160 [00:32<00:00,  4.88it/s]


Epoch [3/4], Train Loss: 0.0154, Acc: 0.9965, Time: 32.81s
Epoch [3/4], Val   Loss: 0.0850, Acc: 0.9812, Time: 2.45s


Epoch 4 Training: 100%|██████████| 160/160 [00:32<00:00,  4.87it/s]


Epoch [4/4], Train Loss: 0.0091, Acc: 0.9973, Time: 32.83s
Epoch [4/4], Val   Loss: 0.0632, Acc: 0.9843, Time: 2.46s

✅ Total training + validation time: 143.16 seconds



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating Text Model: 100%|██████████| 44/44 [00:02<00:00, 16.31it/s]


🧪 Test time: 2.71 seconds

----- Results for Experiment 9: bs16_lr2e-05_ep4 -----
Best epoch: 2 with validation loss: 0.0478
Test Metrics:
  Accuracy: 0.9800
  Precision: 0.9773
  Recall: 0.9829
  F1 Score: 0.9801


===== RUNNING EXPERIMENT 10/18 =====
Batch Size: 32, Learning Rate: 5e-05, Epochs: 2

===== STARTING EXPERIMENT: bs32_lr5e-05_ep2 =====



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1 Training: 100%|██████████| 80/80 [00:31<00:00,  2.57it/s]


Epoch [1/2], Train Loss: 0.1039, Acc: 0.9596, Time: 31.14s
Epoch [1/2], Val   Loss: 0.0731, Acc: 0.9765, Time: 2.40s
New best model saved at epoch 1 with val loss: 0.0731


Epoch 2 Training: 100%|██████████| 80/80 [00:31<00:00,  2.58it/s]


Epoch [2/2], Train Loss: 0.0345, Acc: 0.9894, Time: 31.05s
Epoch [2/2], Val   Loss: 0.0456, Acc: 0.9890, Time: 2.41s
New best model saved at epoch 2 with val loss: 0.0456

✅ Total training + validation time: 68.94 seconds



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating Text Model: 100%|██████████| 22/22 [00:02<00:00,  8.30it/s]


🧪 Test time: 2.66 seconds

----- Results for Experiment 10: bs32_lr5e-05_ep2 -----
Best epoch: 2 with validation loss: 0.0456
Test Metrics:
  Accuracy: 0.9829
  Precision: 0.9856
  Recall: 0.9800
  F1 Score: 0.9828


===== RUNNING EXPERIMENT 11/18 =====
Batch Size: 32, Learning Rate: 5e-05, Epochs: 3

===== STARTING EXPERIMENT: bs32_lr5e-05_ep3 =====



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1 Training: 100%|██████████| 80/80 [00:31<00:00,  2.58it/s]


Epoch [1/3], Train Loss: 0.1126, Acc: 0.9592, Time: 31.03s
Epoch [1/3], Val   Loss: 0.0667, Acc: 0.9812, Time: 2.40s
New best model saved at epoch 1 with val loss: 0.0667


Epoch 2 Training: 100%|██████████| 80/80 [00:31<00:00,  2.57it/s]


Epoch [2/3], Train Loss: 0.0344, Acc: 0.9902, Time: 31.09s
Epoch [2/3], Val   Loss: 0.0526, Acc: 0.9875, Time: 2.46s
New best model saved at epoch 2 with val loss: 0.0526


Epoch 3 Training: 100%|██████████| 80/80 [00:31<00:00,  2.57it/s]


Epoch [3/3], Train Loss: 0.0176, Acc: 0.9957, Time: 31.08s
Epoch [3/3], Val   Loss: 0.0535, Acc: 0.9796, Time: 2.39s

✅ Total training + validation time: 102.43 seconds



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating Text Model: 100%|██████████| 22/22 [00:02<00:00,  8.36it/s]


🧪 Test time: 2.64 seconds

----- Results for Experiment 11: bs32_lr5e-05_ep3 -----
Best epoch: 2 with validation loss: 0.0526
Test Metrics:
  Accuracy: 0.9857
  Precision: 0.9913
  Recall: 0.9800
  F1 Score: 0.9856


===== RUNNING EXPERIMENT 12/18 =====
Batch Size: 32, Learning Rate: 5e-05, Epochs: 4

===== STARTING EXPERIMENT: bs32_lr5e-05_ep4 =====



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1 Training: 100%|██████████| 80/80 [00:31<00:00,  2.58it/s]


Epoch [1/4], Train Loss: 0.1137, Acc: 0.9569, Time: 31.07s
Epoch [1/4], Val   Loss: 0.0755, Acc: 0.9749, Time: 2.39s
New best model saved at epoch 1 with val loss: 0.0755


Epoch 2 Training: 100%|██████████| 80/80 [00:31<00:00,  2.58it/s]


Epoch [2/4], Train Loss: 0.0367, Acc: 0.9898, Time: 31.05s
Epoch [2/4], Val   Loss: 0.0560, Acc: 0.9828, Time: 2.40s
New best model saved at epoch 2 with val loss: 0.0560


Epoch 3 Training: 100%|██████████| 80/80 [00:31<00:00,  2.58it/s]


Epoch [3/4], Train Loss: 0.0106, Acc: 0.9984, Time: 31.06s
Epoch [3/4], Val   Loss: 0.0711, Acc: 0.9828, Time: 2.39s


Epoch 4 Training: 100%|██████████| 80/80 [00:30<00:00,  2.58it/s]


Epoch [4/4], Train Loss: 0.0069, Acc: 0.9992, Time: 31.00s
Epoch [4/4], Val   Loss: 0.0912, Acc: 0.9828, Time: 2.39s

✅ Total training + validation time: 135.78 seconds



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating Text Model: 100%|██████████| 22/22 [00:02<00:00,  8.38it/s]


🧪 Test time: 2.64 seconds

----- Results for Experiment 12: bs32_lr5e-05_ep4 -----
Best epoch: 2 with validation loss: 0.0560
Test Metrics:
  Accuracy: 0.9729
  Precision: 0.9911
  Recall: 0.9543
  F1 Score: 0.9723


===== RUNNING EXPERIMENT 13/18 =====
Batch Size: 32, Learning Rate: 3e-05, Epochs: 2

===== STARTING EXPERIMENT: bs32_lr3e-05_ep2 =====



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1 Training: 100%|██████████| 80/80 [00:31<00:00,  2.58it/s]


Epoch [1/2], Train Loss: 0.0986, Acc: 0.9596, Time: 31.07s
Epoch [1/2], Val   Loss: 0.0474, Acc: 0.9890, Time: 2.38s
New best model saved at epoch 1 with val loss: 0.0474


Epoch 2 Training: 100%|██████████| 80/80 [00:31<00:00,  2.58it/s]


Epoch [2/2], Train Loss: 0.0135, Acc: 0.9973, Time: 31.03s
Epoch [2/2], Val   Loss: 0.0478, Acc: 0.9859, Time: 2.39s

✅ Total training + validation time: 67.61 seconds



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating Text Model: 100%|██████████| 22/22 [00:02<00:00,  8.41it/s]


🧪 Test time: 2.63 seconds

----- Results for Experiment 13: bs32_lr3e-05_ep2 -----
Best epoch: 1 with validation loss: 0.0474
Test Metrics:
  Accuracy: 0.9771
  Precision: 0.9665
  Recall: 0.9886
  F1 Score: 0.9774


===== RUNNING EXPERIMENT 14/18 =====
Batch Size: 32, Learning Rate: 3e-05, Epochs: 3

===== STARTING EXPERIMENT: bs32_lr3e-05_ep3 =====



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1 Training: 100%|██████████| 80/80 [00:31<00:00,  2.58it/s]


Epoch [1/3], Train Loss: 0.1060, Acc: 0.9573, Time: 31.05s
Epoch [1/3], Val   Loss: 0.0541, Acc: 0.9859, Time: 2.38s
New best model saved at epoch 1 with val loss: 0.0541


Epoch 2 Training: 100%|██████████| 80/80 [00:30<00:00,  2.58it/s]


Epoch [2/3], Train Loss: 0.0240, Acc: 0.9929, Time: 31.00s
Epoch [2/3], Val   Loss: 0.0489, Acc: 0.9890, Time: 2.38s
New best model saved at epoch 2 with val loss: 0.0489


Epoch 3 Training: 100%|██████████| 80/80 [00:31<00:00,  2.58it/s]


Epoch [3/3], Train Loss: 0.0060, Acc: 0.9988, Time: 31.06s
Epoch [3/3], Val   Loss: 0.0680, Acc: 0.9828, Time: 2.37s

✅ Total training + validation time: 102.31 seconds



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating Text Model: 100%|██████████| 22/22 [00:02<00:00,  8.45it/s]


🧪 Test time: 2.61 seconds

----- Results for Experiment 14: bs32_lr3e-05_ep3 -----
Best epoch: 2 with validation loss: 0.0489
Test Metrics:
  Accuracy: 0.9857
  Precision: 0.9885
  Recall: 0.9829
  F1 Score: 0.9857


===== RUNNING EXPERIMENT 15/18 =====
Batch Size: 32, Learning Rate: 3e-05, Epochs: 4

===== STARTING EXPERIMENT: bs32_lr3e-05_ep4 =====



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1 Training: 100%|██████████| 80/80 [00:31<00:00,  2.58it/s]


Epoch [1/4], Train Loss: 0.1086, Acc: 0.9600, Time: 31.06s
Epoch [1/4], Val   Loss: 0.0353, Acc: 0.9922, Time: 2.38s
New best model saved at epoch 1 with val loss: 0.0353


Epoch 2 Training: 100%|██████████| 80/80 [00:30<00:00,  2.58it/s]


Epoch [2/4], Train Loss: 0.0327, Acc: 0.9910, Time: 31.00s
Epoch [2/4], Val   Loss: 0.0395, Acc: 0.9890, Time: 2.39s


Epoch 3 Training: 100%|██████████| 80/80 [00:31<00:00,  2.58it/s]


Epoch [3/4], Train Loss: 0.0107, Acc: 0.9980, Time: 31.03s
Epoch [3/4], Val   Loss: 0.0366, Acc: 0.9922, Time: 2.40s


Epoch 4 Training: 100%|██████████| 80/80 [00:31<00:00,  2.58it/s]


Epoch [4/4], Train Loss: 0.0039, Acc: 0.9996, Time: 31.03s
Epoch [4/4], Val   Loss: 0.0499, Acc: 0.9890, Time: 2.38s

✅ Total training + validation time: 134.42 seconds



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating Text Model: 100%|██████████| 22/22 [00:02<00:00,  8.40it/s]


🧪 Test time: 2.63 seconds

----- Results for Experiment 15: bs32_lr3e-05_ep4 -----
Best epoch: 1 with validation loss: 0.0353
Test Metrics:
  Accuracy: 0.9843
  Precision: 0.9942
  Recall: 0.9743
  F1 Score: 0.9841


===== RUNNING EXPERIMENT 16/18 =====
Batch Size: 32, Learning Rate: 2e-05, Epochs: 2

===== STARTING EXPERIMENT: bs32_lr2e-05_ep2 =====



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1 Training: 100%|██████████| 80/80 [00:31<00:00,  2.57it/s]


Epoch [1/2], Train Loss: 0.1437, Acc: 0.9408, Time: 31.07s
Epoch [1/2], Val   Loss: 0.0700, Acc: 0.9734, Time: 2.38s
New best model saved at epoch 1 with val loss: 0.0700


Epoch 2 Training: 100%|██████████| 80/80 [00:31<00:00,  2.57it/s]


Epoch [2/2], Train Loss: 0.0243, Acc: 0.9929, Time: 31.15s
Epoch [2/2], Val   Loss: 0.0407, Acc: 0.9890, Time: 2.38s
New best model saved at epoch 2 with val loss: 0.0407

✅ Total training + validation time: 69.08 seconds



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating Text Model: 100%|██████████| 22/22 [00:02<00:00,  8.33it/s]


🧪 Test time: 2.65 seconds

----- Results for Experiment 16: bs32_lr2e-05_ep2 -----
Best epoch: 2 with validation loss: 0.0407
Test Metrics:
  Accuracy: 0.9843
  Precision: 0.9885
  Recall: 0.9800
  F1 Score: 0.9842


===== RUNNING EXPERIMENT 17/18 =====
Batch Size: 32, Learning Rate: 2e-05, Epochs: 3

===== STARTING EXPERIMENT: bs32_lr2e-05_ep3 =====



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1 Training: 100%|██████████| 80/80 [00:31<00:00,  2.57it/s]


Epoch [1/3], Train Loss: 0.1060, Acc: 0.9643, Time: 31.08s
Epoch [1/3], Val   Loss: 0.0500, Acc: 0.9875, Time: 2.38s
New best model saved at epoch 1 with val loss: 0.0500


Epoch 2 Training: 100%|██████████| 80/80 [00:31<00:00,  2.57it/s]


Epoch [2/3], Train Loss: 0.0283, Acc: 0.9910, Time: 31.14s
Epoch [2/3], Val   Loss: 0.0369, Acc: 0.9922, Time: 2.41s
New best model saved at epoch 2 with val loss: 0.0369


Epoch 3 Training: 100%|██████████| 80/80 [00:31<00:00,  2.58it/s]


Epoch [3/3], Train Loss: 0.0140, Acc: 0.9957, Time: 31.06s
Epoch [3/3], Val   Loss: 0.0448, Acc: 0.9906, Time: 2.38s

✅ Total training + validation time: 102.51 seconds



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating Text Model: 100%|██████████| 22/22 [00:02<00:00,  8.38it/s]


🧪 Test time: 2.64 seconds

----- Results for Experiment 17: bs32_lr2e-05_ep3 -----
Best epoch: 2 with validation loss: 0.0369
Test Metrics:
  Accuracy: 0.9814
  Precision: 0.9884
  Recall: 0.9743
  F1 Score: 0.9813


===== RUNNING EXPERIMENT 18/18 =====
Batch Size: 32, Learning Rate: 2e-05, Epochs: 4

===== STARTING EXPERIMENT: bs32_lr2e-05_ep4 =====



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1 Training: 100%|██████████| 80/80 [00:31<00:00,  2.57it/s]


Epoch [1/4], Train Loss: 0.1134, Acc: 0.9612, Time: 31.12s
Epoch [1/4], Val   Loss: 0.0402, Acc: 0.9875, Time: 2.38s
New best model saved at epoch 1 with val loss: 0.0402


Epoch 2 Training: 100%|██████████| 80/80 [00:31<00:00,  2.58it/s]


Epoch [2/4], Train Loss: 0.0208, Acc: 0.9949, Time: 31.05s
Epoch [2/4], Val   Loss: 0.0473, Acc: 0.9890, Time: 2.39s


Epoch 3 Training: 100%|██████████| 80/80 [00:31<00:00,  2.58it/s]


Epoch [3/4], Train Loss: 0.0175, Acc: 0.9957, Time: 31.06s
Epoch [3/4], Val   Loss: 0.0702, Acc: 0.9796, Time: 2.38s


Epoch 4 Training: 100%|██████████| 80/80 [00:31<00:00,  2.58it/s]


Epoch [4/4], Train Loss: 0.0116, Acc: 0.9976, Time: 31.03s
Epoch [4/4], Val   Loss: 0.0418, Acc: 0.9890, Time: 2.39s

✅ Total training + validation time: 134.53 seconds



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating Text Model: 100%|██████████| 22/22 [00:02<00:00,  8.33it/s]

🧪 Test time: 2.65 seconds

----- Results for Experiment 18: bs32_lr2e-05_ep4 -----
Best epoch: 1 with validation loss: 0.0402
Test Metrics:
  Accuracy: 0.9857
  Precision: 0.9913
  Recall: 0.9800
  F1 Score: 0.9856





In [23]:
# Create a DataFrame for easier analysis
results_df = pd.DataFrame([
    {
        'Experiment': r['experiment_name'],
        'Batch Size': int(r['experiment_name'].split('_')[0][2:]),
        'Learning Rate': float(r['experiment_name'].split('_')[1][2:]),
        'Epochs': int(r['experiment_name'].split('_')[2][2:]),
        'Best Epoch': r['best_epoch'],
        'Best Val Loss': r['best_val_loss'],
        'Accuracy': r['metrics']['Accuracy'],
        'Precision': r['metrics']['Precision'],
        'Recall': r['metrics']['Recall'],
        'F1 Score': r['metrics']['F1 Score'],
        'Model Path': r['model_path']
    }
    for r in results
])

In [24]:
from datetime import datetime
# Save results to CSV
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
results_csv_path = f'text_model_experiments_results_{timestamp}.csv'
results_df.to_csv(results_csv_path, index=False)
print(f"\nResults saved to {results_csv_path}")


Results saved to text_model_experiments_results_20250803_062758.csv


In [25]:
# Print summary of all experiment results sorted by F1 Score
print("\n===== EXPERIMENT RESULTS SUMMARY (Sorted by F1 Score) =====")
sorted_results = results_df.sort_values('F1 Score', ascending=False)
print(results_df[['Experiment', 'Accuracy', 'Precision', 'Recall','F1 Score']])


===== EXPERIMENT RESULTS SUMMARY (Sorted by F1 Score) =====
          Experiment  Accuracy  Precision    Recall  F1 Score
0   bs16_lr5e-05_ep2  0.964286   0.938005  0.994286  0.965326
1   bs16_lr5e-05_ep3  0.984286   0.994169  0.974286  0.984127
2   bs16_lr5e-05_ep4  0.981429   0.982808  0.980000  0.981402
3   bs16_lr3e-05_ep2  0.984286   0.980170  0.988571  0.984353
4   bs16_lr3e-05_ep3  0.984286   0.980170  0.988571  0.984353
5   bs16_lr3e-05_ep4  0.985714   0.991329  0.980000  0.985632
6   bs16_lr2e-05_ep2  0.977143   0.982659  0.971429  0.977011
7   bs16_lr2e-05_ep3  0.985714   0.991329  0.980000  0.985632
8   bs16_lr2e-05_ep4  0.980000   0.977273  0.982857  0.980057
9   bs32_lr5e-05_ep2  0.982857   0.985632  0.980000  0.982808
10  bs32_lr5e-05_ep3  0.985714   0.991329  0.980000  0.985632
11  bs32_lr5e-05_ep4  0.972857   0.991098  0.954286  0.972344
12  bs32_lr3e-05_ep2  0.977143   0.966480  0.988571  0.977401
13  bs32_lr3e-05_ep3  0.985714   0.988506  0.982857  0.985673
14  bs32_

In [26]:
# Find the best model based on F1 score
best_exp = sorted_results.iloc[0]
print("\n===== BEST MODEL =====")
print(f"Configuration: {best_exp['Experiment']}")
print(f"Batch Size: {best_exp['Batch Size']}")
print(f"Learning Rate: {best_exp['Learning Rate']}")
print(f"Total Epochs: {best_exp['Epochs']}")
print(f"Best Epoch: {best_exp['Best Epoch']}")
print(f"F1 Score: {best_exp['F1 Score']:.4f}")
print(f"Accuracy: {best_exp['Accuracy']:.4f}")
print(f"Precision: {best_exp['Precision']:.4f}")
print(f"Recall: {best_exp['Recall']:.4f}")
print(f"Model Path: {best_exp['Model Path']}")


===== BEST MODEL =====
Configuration: bs32_lr3e-05_ep3
Batch Size: 32
Learning Rate: 3e-05
Total Epochs: 3
Best Epoch: 2
F1 Score: 0.9857
Accuracy: 0.9857
Precision: 0.9885
Recall: 0.9829
Model Path: best_text_model_bs32_lr3e-05_ep3_state_dict.pt


# -----------------