In [1]:
import os
import numpy as np
import pandas as pd
import random
import copy

from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim

from PIL import Image
import torchvision.models as models
import torchvision.transforms as transforms

import re
from collections import Counter
from transformers import (
    AutoConfig,
    AutoTokenizer, 
    AutoModel,
    AutoModelForSequenceClassification, 
    TrainingArguments, 
    Trainer, 
    DataCollatorWithPadding,
    get_scheduler
)

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from tqdm import tqdm
import time
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

2025-05-09 04:15:34.932585: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746764135.114722      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746764135.166805      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
seed = 42
def seed_everything(seed):
    os.environ["PYTHONHASHSEED"] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

seed_everything(42)

In [4]:
# Generator untuk DataLoader
g = torch.Generator()
g.manual_seed(seed)

def seed_worker(worker_id):
    """Fungsi untuk memastikan setiap worker memiliki seed yang sama"""
    worker_seed = torch.initial_seed() % 2**32
    np.random.seed(worker_seed)
    random.seed(worker_seed)

In [5]:
# Paths
train_csv_path = '/kaggle/input/situsjudiid-txt4-2/text/train_data.csv'
test_csv_path = '/kaggle/input/situsjudiid-txt4-2/text/test_data.csv'

In [6]:
# Load data
train_df = pd.read_csv(train_csv_path)
test_df = pd.read_csv(test_csv_path)

In [7]:
# Drop missing values
print('Missing values in Train:\n', train_df.isna().sum())
train_df = train_df.dropna()

Missing values in Train:
 File Name           0
Extracted Text    132
Class               0
dtype: int64


In [8]:
import re
import pandas as pd
from collections import Counter

def clean_texts(texts):
    temp_text = []
    all_words = []

    # Kata 1–2 huruf yang penting dan tidak boleh dihapus
    exceptions = {
        "di", "ke", "ya"
    }

    for text in texts:

        # ----- BASIC CLEANING -----
        text = re.sub(r"http\S+", "", text)  # Hapus URL
        text = re.sub(r"\n", " ", text)  # Ganti newline dengan spasi
        text = re.sub(r"[^a-zA-Z']", " ", text)  # Hanya sisakan huruf dan apostrof
        text = re.sub(r"\s{2,}", " ", text).strip().lower()  # Hapus spasi ganda, ubah ke lowercase

        # ----- FILTERING -----
        words = text.split()
        filtered_words = [
            w for w in words
            if (len(w) > 2 or w in exceptions)  # Simpan kata >2 huruf atau ada di exceptions
        ]
        text = ' '.join(filtered_words)

        # ----- REMOVE UNWANTED PATTERNS -----
        text = re.sub(r'\b[aeiou]+\b', '', text)  # Hapus kata semua vokal (panjang berapa pun)
        text = re.sub(r'\b[^aeiou\s]+\b', '', text)  # Hapus kata semua konsonan (panjang berapa pun)
        text = re.sub(r'\b\w{20,}\b', '', text)  # Hapus kata sangat panjang (≥20 huruf)
        text = re.sub(r'\s+', ' ', text).strip()  # Bersihkan spasi ekstra

        temp_text.append(text)  # Simpan teks yang sudah dibersihkan
        all_words.extend(text.split())  # Simpan semua kata dari semua teks untuk hitung frekuensi global

    # Hitung frekuensi kata
    word_counts = Counter(all_words)
    rare_words = {word for word, count in word_counts.items() if count == 1}  # Kata yang muncul 1x

    # Hapus kata yang jarang muncul
    final_texts = []
    for text in temp_text:
        words = text.split()
        cleaned_words = [word for word in words if word not in rare_words]
        final_texts.append(" ".join(cleaned_words))

    return final_texts

In [9]:
# import re
# import pandas as pd
# from collections import Counter

# def clean_texts(texts):
#     temp_text = []
#     all_words = []

#     for text in texts:

#         # ----- BASIC CLEANING -----
#         text = re.sub(r"http\S+", "", text)  # Hapus URL
#         text = re.sub(r"\n", " ", text)  # Ganti newline dengan spasi
#         text = re.sub(r"[^a-zA-Z']", " ", text)  # Hanya sisakan huruf dan apostrof
#         text = re.sub(r"\s{2,}", " ", text).strip().lower()  # Hapus spasi ganda, ubah ke lowercase

#         # ----- REMOVE UNWANTED WORD PATTERNS -----
#         text = re.sub(r'\b\w{1}\b', '', text)  # Hapus kata 1 huruf
#         text = re.sub(r'\b[aeiou]{2,4}\b', '', text)  # Hapus kata semua vokal 2-4 huruf
#         text = re.sub(r'\b[^aeiou\s]{2,4}\b', '', text)  # Hapus kata semua konsonan 2-4 huruf
#         text = re.sub(r'\b\w{20,}\b', '', text)  # Hapus kata sangat panjang (≥20 huruf)
#         text = re.sub(r'\s+', ' ', text).strip()  # Bersihkan spasi ekstra

#         temp_text.append(text)  # Simpan teks yang sudah dibersihkan
#         all_words.extend(text.split())  # Simpan semua kata dari semua teks untuk hitung frekuensi global

#     # Hitung frekuensi kata
#     word_counts = Counter(all_words)
#     rare_words = {word for word, count in word_counts.items() if count == 1}  # Kata yang muncul 1x

#     # Hapus kata yang jarang muncul
#     final_texts = []
#     for text in temp_text:
#         words = text.split()
#         cleaned_words = [word for word in words if word not in rare_words]
#         final_texts.append(" ".join(cleaned_words))

#     return final_texts

In [10]:
# # Clean text
# train_df['cleaned_text'] = clean_texts(train_df['Extracted Text'])
# test_df['cleaned_text'] = clean_texts(test_df['Extracted Text'])
# Gabungkan teks dari train dan test sementara
combined_texts = pd.concat([train_df['Extracted Text'], test_df['Extracted Text']], ignore_index=True)

# Bersihkan semua teks gabungan
cleaned_all = clean_texts(combined_texts)

# Bagi kembali hasil cleaned text ke dalam train dan test
train_df['cleaned_text'] = cleaned_all[:len(train_df)]
test_df['cleaned_text'] = cleaned_all[len(train_df):]

In [11]:
# Drop rows with less than 5 words
train_df = train_df[train_df['cleaned_text'].apply(lambda x: len(str(x).split()) >= 5)]
test_df = test_df[test_df['cleaned_text'].apply(lambda x: len(str(x).split()) >= 5)]
train_df

Unnamed: 0,File Name,Extracted Text,Class,cleaned_text
0,0156726.slotslou.sbs.png,HOME jackpotpartycasinoslotsonline — login pr...,judi,home login bigwin jackpot codes
1,015eaglegaze.online.png,r #TOGrt AFATC Situs Online Gaming Terbaik Un...,judi,situs online gaming terbaik untuk gamers langs...
2,016-bar.pro.png,"""€! INFO TERKINI: (TOGEL ONLINE TERBESAR DAN T...",judi,info terkini togel online terbesar dan terpeca...
3,016-good.pro.png,Sa TIPE Nomor Ab sernncan 2 Sa HADIAHTOGEL —— ...,judi,tipe nomor mimpi imei referral hubungi kami
4,0164999.slotslou.sbs.png,bet 365 keluaran togel hongkong kemarin HOME ...,judi,bet keluaran togel hongkong kemarin home apk t...
...,...,...,...,...
7058,zoom.us.png,O cari Dukungan 1.888.799.9666 Minta Demo Berg...,non-judi,cari dukungan minta demo bergabung host masuk ...
7059,zoom.us_home.png,(O Cari Dukungan 1888.7990.9666 Minta Demo ZO...,non-judi,cari dukungan minta demo zoom rapat anda denga...
7060,zoom.us_page2.png,Dukungan 666 Minta Demo Bergabung stv Masuk ...,non-judi,dukungan minta demo bergabung masuk hubungi pe...
7061,zoom.us_page4.png,ZOOM z00m Work Transformation Summit APAC F...,non-judi,zoom work transformation summit from efficienc...


In [12]:
# Cek jumlah duplikasi sebelum dihapus
print("Duplikasi di train:", train_df.duplicated(subset='cleaned_text').sum())
print("Duplikasi di test :", test_df.duplicated(subset='cleaned_text').sum())

# Hapus duplikasi berdasarkan cleaned_text
train_df = train_df.drop_duplicates(subset='cleaned_text').reset_index(drop=True)
test_df = test_df.drop_duplicates(subset='cleaned_text').reset_index(drop=True)

# Cek ulang setelah pembersihan
print("Setelah dihapus:")
print("Train:", len(train_df), "baris")
print("Test :", len(test_df), "baris")

Duplikasi di train: 37
Duplikasi di test : 0
Setelah dihapus:
Train: 6541 baris
Test : 700 baris


In [13]:
# Print jumlah per kelas
print("Distribusi label di Train set:")
print(train_df['Class'].value_counts(), '\n')

print("Distribusi label di Test set:")
print(test_df['Class'].value_counts())

Distribusi label di Train set:
Class
non-judi    3969
judi        2572
Name: count, dtype: int64 

Distribusi label di Test set:
Class
judi        350
non-judi    350
Name: count, dtype: int64


In [14]:
from sklearn.utils import resample

# Pisahkan data berdasarkan kelas
train_judi = train_df[train_df['Class'] == 'judi']
train_nonjudi = train_df[train_df['Class'] == 'non-judi']

# Undersampling kelas mayoritas (non-judi) agar jumlahnya sama dengan kelas judi
train_nonjudi_undersampled = resample(train_nonjudi,
                                      replace=False,      # tanpa duplikasi
                                      n_samples=len(train_judi),  # samakan jumlahnya dengan kelas minoritas
                                      random_state=42)    # untuk replikasi hasil

# Gabungkan kembali data yang sudah diundersample
train_df_balanced = pd.concat([train_judi, train_nonjudi_undersampled])

# Cek distribusi baru
print("Distribusi label setelah undersampling:")
print(train_df_balanced['Class'].value_counts())

Distribusi label setelah undersampling:
Class
judi        2572
non-judi    2572
Name: count, dtype: int64


In [15]:
label_map = {
    "non-judi": 0,
    "judi": 1
}

train_df_balanced['label'] = train_df_balanced['Class'].map(label_map)
test_df['label'] = test_df['Class'].map(label_map)

In [16]:
traindf, validdf = train_test_split(
    train_df_balanced, test_size=0.2, stratify=train_df_balanced['label'], random_state=42
)

print(f"Jumlah data train: {len(traindf)}")
print(f"Jumlah data valid: {len(validdf)}")
print(f"Jumlah data test: {len(test_df)}")

Jumlah data train: 4115
Jumlah data valid: 1029
Jumlah data test: 700


In [17]:
# Print jumlah per kelas
print("Distribusi label di Train set:")
print(traindf['label'].value_counts(), '\n')

print("Distribusi label di Validation set:")
print(validdf['label'].value_counts(), '\n')

print("Distribusi label di Test set:")
print(test_df['label'].value_counts())

Distribusi label di Train set:
label
1    2058
0    2057
Name: count, dtype: int64 

Distribusi label di Validation set:
label
0    515
1    514
Name: count, dtype: int64 

Distribusi label di Test set:
label
1    350
0    350
Name: count, dtype: int64


In [18]:
class CustomDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length):
        self.dataframe = dataframe.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, index):
        row = self.dataframe.iloc[index]
        label = row['label']
        text = str(row['cleaned_text'])

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            truncation=True,
            return_tensors='pt',
            padding='max_length'
        )

        input_ids = encoding['input_ids'].flatten()
        attention_mask = encoding['attention_mask'].flatten()

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'label': torch.tensor(label, dtype=torch.long)
        }


In [19]:
# Tokenizer
tokenizer = AutoTokenizer.from_pretrained('indobenchmark/indobert-base-p1')
max_length = 128

# Dataset
train_dataset = CustomDataset(traindf, tokenizer, max_length)
valid_dataset = CustomDataset(validdf, tokenizer, max_length)
test_dataset = CustomDataset(test_df, tokenizer, max_length)

tokenizer_config.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.53k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/229k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [20]:
def run_experiment(batch_size, learning_rate, num_epochs, device):
    experiment_name = f"bs{batch_size}_lr{learning_rate}_ep{num_epochs}"
    print(f"\n===== STARTING EXPERIMENT: {experiment_name} =====\n")

    # Inisialisasi DataLoader
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, 
                              num_workers=0, worker_init_fn=seed_worker, generator=g)
    valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False, 
                              num_workers=0, worker_init_fn=seed_worker, generator=g)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, 
                             num_workers=0, worker_init_fn=seed_worker, generator=g)

    # Model
    text_model = AutoModelForSequenceClassification.from_pretrained('indobenchmark/indobert-base-p1', num_labels=1)
    text_model.to(device)
    is_parallel = False
    if torch.cuda.device_count() > 1:
        print(f"Using {torch.cuda.device_count()} GPUs!")
        text_model = nn.DataParallel(text_model)
        is_parallel = True

    optimizer = optim.Adam(text_model.parameters(), lr=learning_rate)
    criterion = nn.BCEWithLogitsLoss()

    train_losses, train_accuracies = [], []
    valid_losses, valid_accuracies = [], []

    best_val_loss = float('inf')
    best_model_path = f'best_text_model_{experiment_name}_state_dict.pt'

    # Time tracking
    training_start_time = time.time()

    for epoch in range(num_epochs):
        # TRAINING
        epoch_train_start = time.time()
        text_model.train()
        total_loss = 0.0
        correct = 0
        total_samples = 0

        for batch_data in tqdm(train_loader, desc=f"Epoch {epoch+1} Training"):
            input_ids = batch_data['input_ids'].to(device)
            attention_mask = batch_data['attention_mask'].to(device)
            labels = batch_data['label'].float().to(device)

            optimizer.zero_grad()
            outputs = text_model(input_ids=input_ids, attention_mask=attention_mask).logits.squeeze(1)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            predicted = (torch.sigmoid(outputs) > 0.5).long()
            correct += (predicted == labels).sum().item()
            total_samples += labels.size(0)

        epoch_loss = total_loss / len(train_loader)
        epoch_accuracy = correct / total_samples
        train_losses.append(epoch_loss)
        train_accuracies.append(epoch_accuracy)
        epoch_train_duration = time.time() - epoch_train_start

        # VALIDATION
        epoch_val_start = time.time()
        text_model.eval()
        valid_loss = 0.0
        valid_correct = 0
        valid_total_samples = 0

        with torch.no_grad():
            for batch_data in valid_loader:
                input_ids = batch_data['input_ids'].to(device)
                attention_mask = batch_data['attention_mask'].to(device)
                labels = batch_data['label'].float().to(device)

                outputs = text_model(input_ids=input_ids, attention_mask=attention_mask).logits.squeeze(1)
                loss = criterion(outputs, labels)
                valid_loss += loss.item()

                predicted = (torch.sigmoid(outputs) > 0.5).long()
                valid_correct += (predicted == labels).sum().item()
                valid_total_samples += labels.size(0)

        valid_epoch_loss = valid_loss / len(valid_loader)
        valid_epoch_accuracy = valid_correct / valid_total_samples
        valid_losses.append(valid_epoch_loss)
        valid_accuracies.append(valid_epoch_accuracy)
        epoch_val_duration = time.time() - epoch_val_start

        print(f"Epoch [{epoch+1}/{num_epochs}], Train Loss: {epoch_loss:.4f}, Acc: {epoch_accuracy:.4f}, Time: {epoch_train_duration:.2f}s")
        print(f"Epoch [{epoch+1}/{num_epochs}], Val   Loss: {valid_epoch_loss:.4f}, Acc: {valid_epoch_accuracy:.4f}, Time: {epoch_val_duration:.2f}s")

        if valid_epoch_loss < best_val_loss:
            best_val_loss = valid_epoch_loss
            torch.save(text_model.module.state_dict() if is_parallel else text_model.state_dict(), best_model_path)
            print(f"New best model saved at epoch {epoch+1} with val loss: {valid_epoch_loss:.4f}")

    total_training_duration = time.time() - training_start_time
    print(f"\n✅ Total training + validation time: {total_training_duration:.2f} seconds\n")

    # Load best model
    best_model = AutoModelForSequenceClassification.from_pretrained('indobenchmark/indobert-base-p1', num_labels=1)
    best_model.load_state_dict(torch.load(best_model_path))
    best_model.to(device)

    # EVALUATION
    test_start_time = time.time()
    metrics = evaluate_text_model(best_model, test_loader, device)
    test_duration = time.time() - test_start_time
    print(f"🧪 Test time: {test_duration:.2f} seconds")

    return {
        'experiment_name': experiment_name,
        'metrics': metrics,
        'model_path': best_model_path,
        'train_losses': train_losses,
        'train_accuracies': train_accuracies,
        'valid_losses': valid_losses,
        'valid_accuracies': valid_accuracies,
        'best_val_loss': best_val_loss,
        'best_epoch': np.argmin(valid_losses) + 1,
        'train_time': total_training_duration,
        'test_time': test_duration,
    }

In [21]:
# Define function to evaluate text model
def evaluate_text_model(model, data_loader, device): 
    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch_data in tqdm(data_loader, desc="Evaluating Text Model"):
            input_ids = batch_data['input_ids'].to(device)
            attention_mask = batch_data['attention_mask'].to(device)
            labels = batch_data['label'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask).logits.squeeze(1)
            preds = (torch.sigmoid(outputs) > 0.5).cpu().numpy()

            all_preds.extend(preds)
            all_labels.extend(labels.cpu().numpy())
    
    # Calculate metrics
    metrics = {
        'Accuracy': accuracy_score(all_labels, all_preds),
        'Precision': precision_score(all_labels, all_preds),
        'Recall': recall_score(all_labels, all_preds),
        'F1 Score': f1_score(all_labels, all_preds)
    }
    
    return metrics

In [22]:
# Define experiments based on the specified hyperparameters
experiments = [
    {'batch_size': bs, 'learning_rate': lr, 'num_epochs': ep}
    for bs in [16, 32]
    for lr in [5e-5, 3e-5, 2e-5]
    for ep in [2, 3, 4]
]

In [23]:
# Run all experiments and collect results
results = []
for i, exp in enumerate(experiments):
    print(f"\n\n===== RUNNING EXPERIMENT {i+1}/{len(experiments)} =====")
    print(f"Batch Size: {exp['batch_size']}, Learning Rate: {exp['learning_rate']}, Epochs: {exp['num_epochs']}")
    
    experiment_result = run_experiment(
        batch_size=exp['batch_size'],
        learning_rate=exp['learning_rate'],
        num_epochs=exp['num_epochs'],
        device=device
    )
    
    results.append(experiment_result)
    
    # Print current experiment metrics
    print(f"\n----- Results for Experiment {i+1}: {experiment_result['experiment_name']} -----")
    print(f"Best epoch: {experiment_result['best_epoch']} with validation loss: {experiment_result['best_val_loss']:.4f}")
    print("Test Metrics:")
    for metric_name, value in experiment_result['metrics'].items():
        print(f"  {metric_name}: {value:.4f}")



===== RUNNING EXPERIMENT 1/18 =====
Batch Size: 16, Learning Rate: 5e-05, Epochs: 2

===== STARTING EXPERIMENT: bs16_lr5e-05_ep2 =====



pytorch_model.bin:   0%|          | 0.00/498M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]


Epoch 1 Training:   0%|          | 0/258 [00:00<?, ?it/s][A
Epoch 1 Training:   0%|          | 1/258 [00:00<03:25,  1.25it/s][A
Epoch 1 Training:   1%|          | 2/258 [00:01<01:56,  2.19it/s][A
Epoch 1 Training:   1%|          | 3/258 [00:01<01:29,  2.84it/s][A
Epoch 1 Training:   2%|▏         | 4/258 [00:01<01:16,  3.32it/s][A
Epoch 1 Training:   2%|▏         | 5/258 [00:01<01:09,  3.67it/s][A
Epoch 1 Training:   2%|▏         | 6/258 [00:01<01:04,  3.91it/s][A
Epoch 1 Training:   3%|▎         | 7/258 [00:02<01:00,  4.14it/s][A
Epoch 1 Training:   3%|▎         | 8/258 [00:02<00:58,  4.29it/s][A
Epoch 1 Training:   3%|▎         | 9/258 [00:02<00:56,  4.41it/s][A
Epoch 1 Training:   4%|▍         | 10/258 [00:02<00:55,  4.51it/s][A
Epoch 1 Training:   4%|▍         | 11/258 [00:02<00:54,  4.57it/s][A
Epoch 1 Training:   5%|▍         | 12/258 [00:03<00:53,  4.58it/s][A
Epoch 1 Training:   5%|▌         | 13/258 [00:03<00:52,  4.63it/s][A
Epoch 1 Training:   5%|▌         | 14

Epoch [1/2], Train Loss: 0.1055, Acc: 0.9655, Time: 55.07s
Epoch [1/2], Val   Loss: 0.0988, Acc: 0.9699, Time: 4.01s
New best model saved at epoch 1 with val loss: 0.0988


Epoch 2 Training: 100%|██████████| 258/258 [00:54<00:00,  4.77it/s]


Epoch [2/2], Train Loss: 0.0565, Acc: 0.9847, Time: 54.04s
Epoch [2/2], Val   Loss: 0.0936, Acc: 0.9776, Time: 3.99s
New best model saved at epoch 2 with val loss: 0.0936

✅ Total training + validation time: 119.07 seconds



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating Text Model: 100%|██████████| 44/44 [00:02<00:00, 16.15it/s]


🧪 Test time: 2.73 seconds

----- Results for Experiment 1: bs16_lr5e-05_ep2 -----
Best epoch: 2 with validation loss: 0.0936
Test Metrics:
  Accuracy: 0.9857
  Precision: 0.9775
  Recall: 0.9943
  F1 Score: 0.9858


===== RUNNING EXPERIMENT 2/18 =====
Batch Size: 16, Learning Rate: 5e-05, Epochs: 3

===== STARTING EXPERIMENT: bs16_lr5e-05_ep3 =====



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1 Training: 100%|██████████| 258/258 [00:54<00:00,  4.77it/s]


Epoch [1/3], Train Loss: 0.1197, Acc: 0.9626, Time: 54.08s
Epoch [1/3], Val   Loss: 0.2134, Acc: 0.9329, Time: 4.00s
New best model saved at epoch 1 with val loss: 0.2134


Epoch 2 Training: 100%|██████████| 258/258 [00:54<00:00,  4.77it/s]


Epoch [2/3], Train Loss: 0.0449, Acc: 0.9866, Time: 54.06s
Epoch [2/3], Val   Loss: 0.0652, Acc: 0.9776, Time: 4.01s
New best model saved at epoch 2 with val loss: 0.0652


Epoch 3 Training: 100%|██████████| 258/258 [00:54<00:00,  4.77it/s]


Epoch [3/3], Train Loss: 0.0321, Acc: 0.9910, Time: 54.05s
Epoch [3/3], Val   Loss: 0.1962, Acc: 0.9534, Time: 4.01s

✅ Total training + validation time: 176.14 seconds



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating Text Model: 100%|██████████| 44/44 [00:02<00:00, 16.11it/s]


🧪 Test time: 2.74 seconds

----- Results for Experiment 2: bs16_lr5e-05_ep3 -----
Best epoch: 2 with validation loss: 0.0652
Test Metrics:
  Accuracy: 0.9943
  Precision: 0.9943
  Recall: 0.9943
  F1 Score: 0.9943


===== RUNNING EXPERIMENT 3/18 =====
Batch Size: 16, Learning Rate: 5e-05, Epochs: 4

===== STARTING EXPERIMENT: bs16_lr5e-05_ep4 =====



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1 Training: 100%|██████████| 258/258 [00:54<00:00,  4.77it/s]


Epoch [1/4], Train Loss: 0.1113, Acc: 0.9628, Time: 54.08s
Epoch [1/4], Val   Loss: 0.0629, Acc: 0.9796, Time: 4.05s
New best model saved at epoch 1 with val loss: 0.0629


Epoch 2 Training: 100%|██████████| 258/258 [00:54<00:00,  4.77it/s]


Epoch [2/4], Train Loss: 0.0322, Acc: 0.9910, Time: 54.08s
Epoch [2/4], Val   Loss: 0.0604, Acc: 0.9786, Time: 4.03s
New best model saved at epoch 2 with val loss: 0.0604


Epoch 3 Training: 100%|██████████| 258/258 [00:54<00:00,  4.77it/s]


Epoch [3/4], Train Loss: 0.0307, Acc: 0.9913, Time: 54.14s
Epoch [3/4], Val   Loss: 0.0987, Acc: 0.9660, Time: 4.03s


Epoch 4 Training: 100%|██████████| 258/258 [00:54<00:00,  4.77it/s]


Epoch [4/4], Train Loss: 0.0386, Acc: 0.9913, Time: 54.09s
Epoch [4/4], Val   Loss: 0.1239, Acc: 0.9747, Time: 4.02s

✅ Total training + validation time: 234.49 seconds



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating Text Model: 100%|██████████| 44/44 [00:02<00:00, 16.03it/s]


🧪 Test time: 2.75 seconds

----- Results for Experiment 3: bs16_lr5e-05_ep4 -----
Best epoch: 2 with validation loss: 0.0604
Test Metrics:
  Accuracy: 0.9871
  Precision: 0.9776
  Recall: 0.9971
  F1 Score: 0.9873


===== RUNNING EXPERIMENT 4/18 =====
Batch Size: 16, Learning Rate: 3e-05, Epochs: 2

===== STARTING EXPERIMENT: bs16_lr3e-05_ep2 =====



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1 Training: 100%|██████████| 258/258 [00:54<00:00,  4.77it/s]


Epoch [1/2], Train Loss: 0.1160, Acc: 0.9587, Time: 54.12s
Epoch [1/2], Val   Loss: 0.0745, Acc: 0.9776, Time: 4.01s
New best model saved at epoch 1 with val loss: 0.0745


Epoch 2 Training: 100%|██████████| 258/258 [00:54<00:00,  4.77it/s]


Epoch [2/2], Train Loss: 0.0432, Acc: 0.9878, Time: 54.06s
Epoch [2/2], Val   Loss: 0.0529, Acc: 0.9825, Time: 4.03s
New best model saved at epoch 2 with val loss: 0.0529

✅ Total training + validation time: 118.23 seconds



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating Text Model: 100%|██████████| 44/44 [00:02<00:00, 16.01it/s]


🧪 Test time: 2.76 seconds

----- Results for Experiment 4: bs16_lr3e-05_ep2 -----
Best epoch: 2 with validation loss: 0.0529
Test Metrics:
  Accuracy: 0.9786
  Precision: 0.9614
  Recall: 0.9971
  F1 Score: 0.9790


===== RUNNING EXPERIMENT 5/18 =====
Batch Size: 16, Learning Rate: 3e-05, Epochs: 3

===== STARTING EXPERIMENT: bs16_lr3e-05_ep3 =====



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1 Training: 100%|██████████| 258/258 [00:54<00:00,  4.77it/s]


Epoch [1/3], Train Loss: 0.1020, Acc: 0.9621, Time: 54.05s
Epoch [1/3], Val   Loss: 0.0600, Acc: 0.9825, Time: 4.01s
New best model saved at epoch 1 with val loss: 0.0600


Epoch 2 Training: 100%|██████████| 258/258 [00:54<00:00,  4.77it/s]


Epoch [2/3], Train Loss: 0.0392, Acc: 0.9903, Time: 54.06s
Epoch [2/3], Val   Loss: 0.0522, Acc: 0.9825, Time: 4.00s
New best model saved at epoch 2 with val loss: 0.0522


Epoch 3 Training: 100%|██████████| 258/258 [00:54<00:00,  4.78it/s]


Epoch [3/3], Train Loss: 0.0285, Acc: 0.9939, Time: 54.02s
Epoch [3/3], Val   Loss: 0.0770, Acc: 0.9825, Time: 4.01s

✅ Total training + validation time: 176.08 seconds



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating Text Model: 100%|██████████| 44/44 [00:02<00:00, 16.11it/s]


🧪 Test time: 2.74 seconds

----- Results for Experiment 5: bs16_lr3e-05_ep3 -----
Best epoch: 2 with validation loss: 0.0522
Test Metrics:
  Accuracy: 0.9886
  Precision: 0.9803
  Recall: 0.9971
  F1 Score: 0.9887


===== RUNNING EXPERIMENT 6/18 =====
Batch Size: 16, Learning Rate: 3e-05, Epochs: 4

===== STARTING EXPERIMENT: bs16_lr3e-05_ep4 =====



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1 Training: 100%|██████████| 258/258 [00:54<00:00,  4.77it/s]


Epoch [1/4], Train Loss: 0.1021, Acc: 0.9638, Time: 54.04s
Epoch [1/4], Val   Loss: 0.0982, Acc: 0.9660, Time: 4.02s
New best model saved at epoch 1 with val loss: 0.0982


Epoch 2 Training: 100%|██████████| 258/258 [00:54<00:00,  4.77it/s]


Epoch [2/4], Train Loss: 0.0386, Acc: 0.9874, Time: 54.05s
Epoch [2/4], Val   Loss: 0.1271, Acc: 0.9728, Time: 4.00s


Epoch 3 Training: 100%|██████████| 258/258 [00:54<00:00,  4.77it/s]


Epoch [3/4], Train Loss: 0.0190, Acc: 0.9942, Time: 54.07s
Epoch [3/4], Val   Loss: 0.0572, Acc: 0.9835, Time: 4.03s
New best model saved at epoch 3 with val loss: 0.0572


Epoch 4 Training: 100%|██████████| 258/258 [00:54<00:00,  4.77it/s]


Epoch [4/4], Train Loss: 0.0171, Acc: 0.9939, Time: 54.11s
Epoch [4/4], Val   Loss: 0.0903, Acc: 0.9786, Time: 4.02s

✅ Total training + validation time: 234.31 seconds



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating Text Model: 100%|██████████| 44/44 [00:02<00:00, 16.03it/s]


🧪 Test time: 2.75 seconds

----- Results for Experiment 6: bs16_lr3e-05_ep4 -----
Best epoch: 3 with validation loss: 0.0572
Test Metrics:
  Accuracy: 0.9871
  Precision: 0.9749
  Recall: 1.0000
  F1 Score: 0.9873


===== RUNNING EXPERIMENT 7/18 =====
Batch Size: 16, Learning Rate: 2e-05, Epochs: 2

===== STARTING EXPERIMENT: bs16_lr2e-05_ep2 =====



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1 Training: 100%|██████████| 258/258 [00:54<00:00,  4.77it/s]


Epoch [1/2], Train Loss: 0.1035, Acc: 0.9621, Time: 54.10s
Epoch [1/2], Val   Loss: 0.0660, Acc: 0.9825, Time: 4.03s
New best model saved at epoch 1 with val loss: 0.0660


Epoch 2 Training: 100%|██████████| 258/258 [00:54<00:00,  4.77it/s]


Epoch [2/2], Train Loss: 0.0332, Acc: 0.9898, Time: 54.11s
Epoch [2/2], Val   Loss: 0.0566, Acc: 0.9806, Time: 4.03s
New best model saved at epoch 2 with val loss: 0.0566

✅ Total training + validation time: 118.21 seconds



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating Text Model: 100%|██████████| 44/44 [00:02<00:00, 16.02it/s]


🧪 Test time: 2.76 seconds

----- Results for Experiment 7: bs16_lr2e-05_ep2 -----
Best epoch: 2 with validation loss: 0.0566
Test Metrics:
  Accuracy: 0.9814
  Precision: 0.9668
  Recall: 0.9971
  F1 Score: 0.9817


===== RUNNING EXPERIMENT 8/18 =====
Batch Size: 16, Learning Rate: 2e-05, Epochs: 3

===== STARTING EXPERIMENT: bs16_lr2e-05_ep3 =====



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1 Training: 100%|██████████| 258/258 [00:54<00:00,  4.77it/s]


Epoch [1/3], Train Loss: 0.1075, Acc: 0.9667, Time: 54.08s
Epoch [1/3], Val   Loss: 0.0585, Acc: 0.9825, Time: 4.03s
New best model saved at epoch 1 with val loss: 0.0585


Epoch 2 Training: 100%|██████████| 258/258 [00:54<00:00,  4.77it/s]


Epoch [2/3], Train Loss: 0.0380, Acc: 0.9893, Time: 54.11s
Epoch [2/3], Val   Loss: 0.0692, Acc: 0.9767, Time: 4.02s


Epoch 3 Training: 100%|██████████| 258/258 [00:54<00:00,  4.77it/s]


Epoch [3/3], Train Loss: 0.0162, Acc: 0.9956, Time: 54.06s
Epoch [3/3], Val   Loss: 0.1215, Acc: 0.9679, Time: 4.01s

✅ Total training + validation time: 175.04 seconds



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating Text Model: 100%|██████████| 44/44 [00:02<00:00, 16.08it/s]


🧪 Test time: 2.75 seconds

----- Results for Experiment 8: bs16_lr2e-05_ep3 -----
Best epoch: 1 with validation loss: 0.0585
Test Metrics:
  Accuracy: 0.9814
  Precision: 0.9642
  Recall: 1.0000
  F1 Score: 0.9818


===== RUNNING EXPERIMENT 9/18 =====
Batch Size: 16, Learning Rate: 2e-05, Epochs: 4

===== STARTING EXPERIMENT: bs16_lr2e-05_ep4 =====



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1 Training: 100%|██████████| 258/258 [00:54<00:00,  4.77it/s]


Epoch [1/4], Train Loss: 0.1089, Acc: 0.9611, Time: 54.06s
Epoch [1/4], Val   Loss: 0.0640, Acc: 0.9767, Time: 4.02s
New best model saved at epoch 1 with val loss: 0.0640


Epoch 2 Training: 100%|██████████| 258/258 [00:54<00:00,  4.77it/s]


Epoch [2/4], Train Loss: 0.0359, Acc: 0.9893, Time: 54.04s
Epoch [2/4], Val   Loss: 0.0609, Acc: 0.9815, Time: 4.02s
New best model saved at epoch 2 with val loss: 0.0609


Epoch 3 Training: 100%|██████████| 258/258 [00:54<00:00,  4.77it/s]


Epoch [3/4], Train Loss: 0.0145, Acc: 0.9949, Time: 54.09s
Epoch [3/4], Val   Loss: 0.1524, Acc: 0.9670, Time: 4.02s


Epoch 4 Training: 100%|██████████| 258/258 [00:54<00:00,  4.77it/s]


Epoch [4/4], Train Loss: 0.0143, Acc: 0.9959, Time: 54.10s
Epoch [4/4], Val   Loss: 0.1164, Acc: 0.9718, Time: 4.02s

✅ Total training + validation time: 234.34 seconds



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating Text Model: 100%|██████████| 44/44 [00:02<00:00, 16.01it/s]


🧪 Test time: 2.76 seconds

----- Results for Experiment 9: bs16_lr2e-05_ep4 -----
Best epoch: 2 with validation loss: 0.0609
Test Metrics:
  Accuracy: 0.9886
  Precision: 0.9831
  Recall: 0.9943
  F1 Score: 0.9886


===== RUNNING EXPERIMENT 10/18 =====
Batch Size: 32, Learning Rate: 5e-05, Epochs: 2

===== STARTING EXPERIMENT: bs32_lr5e-05_ep2 =====



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1 Training: 100%|██████████| 129/129 [00:51<00:00,  2.49it/s]


Epoch [1/2], Train Loss: 0.1117, Acc: 0.9623, Time: 51.80s
Epoch [1/2], Val   Loss: 0.0972, Acc: 0.9708, Time: 3.95s
New best model saved at epoch 1 with val loss: 0.0972


Epoch 2 Training: 100%|██████████| 129/129 [00:51<00:00,  2.50it/s]


Epoch [2/2], Train Loss: 0.0501, Acc: 0.9864, Time: 51.69s
Epoch [2/2], Val   Loss: 0.0584, Acc: 0.9815, Time: 3.94s
New best model saved at epoch 2 with val loss: 0.0584

✅ Total training + validation time: 113.33 seconds



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating Text Model: 100%|██████████| 22/22 [00:02<00:00,  8.11it/s]


🧪 Test time: 2.72 seconds

----- Results for Experiment 10: bs32_lr5e-05_ep2 -----
Best epoch: 2 with validation loss: 0.0584
Test Metrics:
  Accuracy: 0.9829
  Precision: 0.9829
  Recall: 0.9829
  F1 Score: 0.9829


===== RUNNING EXPERIMENT 11/18 =====
Batch Size: 32, Learning Rate: 5e-05, Epochs: 3

===== STARTING EXPERIMENT: bs32_lr5e-05_ep3 =====



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1 Training: 100%|██████████| 129/129 [00:51<00:00,  2.50it/s]


Epoch [1/3], Train Loss: 0.1145, Acc: 0.9623, Time: 51.63s
Epoch [1/3], Val   Loss: 0.0921, Acc: 0.9699, Time: 3.96s
New best model saved at epoch 1 with val loss: 0.0921


Epoch 2 Training: 100%|██████████| 129/129 [00:51<00:00,  2.50it/s]


Epoch [2/3], Train Loss: 0.0423, Acc: 0.9874, Time: 51.70s
Epoch [2/3], Val   Loss: 0.0623, Acc: 0.9796, Time: 3.97s
New best model saved at epoch 2 with val loss: 0.0623


Epoch 3 Training: 100%|██████████| 129/129 [00:51<00:00,  2.50it/s]


Epoch [3/3], Train Loss: 0.0219, Acc: 0.9942, Time: 51.68s
Epoch [3/3], Val   Loss: 0.0655, Acc: 0.9796, Time: 3.96s

✅ Total training + validation time: 168.82 seconds



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating Text Model: 100%|██████████| 22/22 [00:02<00:00,  8.10it/s]


🧪 Test time: 2.73 seconds

----- Results for Experiment 11: bs32_lr5e-05_ep3 -----
Best epoch: 2 with validation loss: 0.0623
Test Metrics:
  Accuracy: 0.9857
  Precision: 0.9749
  Recall: 0.9971
  F1 Score: 0.9859


===== RUNNING EXPERIMENT 12/18 =====
Batch Size: 32, Learning Rate: 5e-05, Epochs: 4

===== STARTING EXPERIMENT: bs32_lr5e-05_ep4 =====



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1 Training: 100%|██████████| 129/129 [00:51<00:00,  2.49it/s]


Epoch [1/4], Train Loss: 0.1188, Acc: 0.9560, Time: 51.73s
Epoch [1/4], Val   Loss: 0.0622, Acc: 0.9825, Time: 3.94s
New best model saved at epoch 1 with val loss: 0.0622


Epoch 2 Training: 100%|██████████| 129/129 [00:51<00:00,  2.50it/s]


Epoch [2/4], Train Loss: 0.0419, Acc: 0.9878, Time: 51.63s
Epoch [2/4], Val   Loss: 0.0545, Acc: 0.9796, Time: 3.93s
New best model saved at epoch 2 with val loss: 0.0545


Epoch 3 Training: 100%|██████████| 129/129 [00:51<00:00,  2.50it/s]


Epoch [3/4], Train Loss: 0.0159, Acc: 0.9959, Time: 51.69s
Epoch [3/4], Val   Loss: 0.0677, Acc: 0.9796, Time: 3.93s


Epoch 4 Training: 100%|██████████| 129/129 [00:51<00:00,  2.50it/s]


Epoch [4/4], Train Loss: 0.0123, Acc: 0.9971, Time: 51.64s
Epoch [4/4], Val   Loss: 0.1138, Acc: 0.9728, Time: 3.96s

✅ Total training + validation time: 224.41 seconds



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating Text Model: 100%|██████████| 22/22 [00:02<00:00,  8.12it/s]


🧪 Test time: 2.72 seconds

----- Results for Experiment 12: bs32_lr5e-05_ep4 -----
Best epoch: 2 with validation loss: 0.0545
Test Metrics:
  Accuracy: 0.9900
  Precision: 0.9886
  Recall: 0.9914
  F1 Score: 0.9900


===== RUNNING EXPERIMENT 13/18 =====
Batch Size: 32, Learning Rate: 3e-05, Epochs: 2

===== STARTING EXPERIMENT: bs32_lr3e-05_ep2 =====



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1 Training: 100%|██████████| 129/129 [00:51<00:00,  2.49it/s]


Epoch [1/2], Train Loss: 0.1187, Acc: 0.9614, Time: 51.71s
Epoch [1/2], Val   Loss: 0.0706, Acc: 0.9776, Time: 3.96s
New best model saved at epoch 1 with val loss: 0.0706


Epoch 2 Training: 100%|██████████| 129/129 [00:51<00:00,  2.50it/s]


Epoch [2/2], Train Loss: 0.0409, Acc: 0.9898, Time: 51.62s
Epoch [2/2], Val   Loss: 0.0642, Acc: 0.9767, Time: 3.94s
New best model saved at epoch 2 with val loss: 0.0642

✅ Total training + validation time: 113.19 seconds



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating Text Model: 100%|██████████| 22/22 [00:02<00:00,  8.07it/s]


🧪 Test time: 2.74 seconds

----- Results for Experiment 13: bs32_lr3e-05_ep2 -----
Best epoch: 2 with validation loss: 0.0642
Test Metrics:
  Accuracy: 0.9814
  Precision: 0.9642
  Recall: 1.0000
  F1 Score: 0.9818


===== RUNNING EXPERIMENT 14/18 =====
Batch Size: 32, Learning Rate: 3e-05, Epochs: 3

===== STARTING EXPERIMENT: bs32_lr3e-05_ep3 =====



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1 Training: 100%|██████████| 129/129 [00:51<00:00,  2.50it/s]


Epoch [1/3], Train Loss: 0.1088, Acc: 0.9626, Time: 51.70s
Epoch [1/3], Val   Loss: 0.0573, Acc: 0.9815, Time: 3.94s
New best model saved at epoch 1 with val loss: 0.0573


Epoch 2 Training: 100%|██████████| 129/129 [00:51<00:00,  2.49it/s]


Epoch [2/3], Train Loss: 0.0308, Acc: 0.9927, Time: 51.74s
Epoch [2/3], Val   Loss: 0.0561, Acc: 0.9825, Time: 3.94s
New best model saved at epoch 2 with val loss: 0.0561


Epoch 3 Training: 100%|██████████| 129/129 [00:51<00:00,  2.49it/s]


Epoch [3/3], Train Loss: 0.0125, Acc: 0.9978, Time: 51.72s
Epoch [3/3], Val   Loss: 0.1097, Acc: 0.9767, Time: 3.94s

✅ Total training + validation time: 169.01 seconds



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating Text Model: 100%|██████████| 22/22 [00:02<00:00,  8.13it/s]


🧪 Test time: 2.72 seconds

----- Results for Experiment 14: bs32_lr3e-05_ep3 -----
Best epoch: 2 with validation loss: 0.0561
Test Metrics:
  Accuracy: 0.9871
  Precision: 0.9858
  Recall: 0.9886
  F1 Score: 0.9872


===== RUNNING EXPERIMENT 15/18 =====
Batch Size: 32, Learning Rate: 3e-05, Epochs: 4

===== STARTING EXPERIMENT: bs32_lr3e-05_ep4 =====



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1 Training: 100%|██████████| 129/129 [00:51<00:00,  2.49it/s]


Epoch [1/4], Train Loss: 0.1037, Acc: 0.9670, Time: 51.73s
Epoch [1/4], Val   Loss: 0.0669, Acc: 0.9815, Time: 3.96s
New best model saved at epoch 1 with val loss: 0.0669


Epoch 2 Training: 100%|██████████| 129/129 [00:51<00:00,  2.49it/s]


Epoch [2/4], Train Loss: 0.0474, Acc: 0.9871, Time: 51.74s
Epoch [2/4], Val   Loss: 0.0617, Acc: 0.9845, Time: 3.95s
New best model saved at epoch 2 with val loss: 0.0617


Epoch 3 Training: 100%|██████████| 129/129 [00:51<00:00,  2.50it/s]


Epoch [3/4], Train Loss: 0.0234, Acc: 0.9949, Time: 51.67s
Epoch [3/4], Val   Loss: 0.0885, Acc: 0.9806, Time: 3.95s


Epoch 4 Training: 100%|██████████| 129/129 [00:51<00:00,  2.50it/s]


Epoch [4/4], Train Loss: 0.0132, Acc: 0.9973, Time: 51.69s
Epoch [4/4], Val   Loss: 0.0686, Acc: 0.9806, Time: 3.95s

✅ Total training + validation time: 224.66 seconds



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating Text Model: 100%|██████████| 22/22 [00:02<00:00,  8.14it/s]


🧪 Test time: 2.71 seconds

----- Results for Experiment 15: bs32_lr3e-05_ep4 -----
Best epoch: 2 with validation loss: 0.0617
Test Metrics:
  Accuracy: 0.9900
  Precision: 0.9914
  Recall: 0.9886
  F1 Score: 0.9900


===== RUNNING EXPERIMENT 16/18 =====
Batch Size: 32, Learning Rate: 2e-05, Epochs: 2

===== STARTING EXPERIMENT: bs32_lr2e-05_ep2 =====



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1 Training: 100%|██████████| 129/129 [00:51<00:00,  2.50it/s]


Epoch [1/2], Train Loss: 0.1099, Acc: 0.9643, Time: 51.66s
Epoch [1/2], Val   Loss: 0.0726, Acc: 0.9767, Time: 3.95s
New best model saved at epoch 1 with val loss: 0.0726


Epoch 2 Training: 100%|██████████| 129/129 [00:51<00:00,  2.49it/s]


Epoch [2/2], Train Loss: 0.0418, Acc: 0.9898, Time: 51.71s
Epoch [2/2], Val   Loss: 0.0566, Acc: 0.9845, Time: 3.94s
New best model saved at epoch 2 with val loss: 0.0566

✅ Total training + validation time: 113.26 seconds



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating Text Model: 100%|██████████| 22/22 [00:02<00:00,  8.11it/s]


🧪 Test time: 2.72 seconds

----- Results for Experiment 16: bs32_lr2e-05_ep2 -----
Best epoch: 2 with validation loss: 0.0566
Test Metrics:
  Accuracy: 0.9857
  Precision: 0.9802
  Recall: 0.9914
  F1 Score: 0.9858


===== RUNNING EXPERIMENT 17/18 =====
Batch Size: 32, Learning Rate: 2e-05, Epochs: 3

===== STARTING EXPERIMENT: bs32_lr2e-05_ep3 =====



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1 Training: 100%|██████████| 129/129 [00:51<00:00,  2.49it/s]


Epoch [1/3], Train Loss: 0.1202, Acc: 0.9550, Time: 51.75s
Epoch [1/3], Val   Loss: 0.0603, Acc: 0.9825, Time: 3.95s
New best model saved at epoch 1 with val loss: 0.0603


Epoch 2 Training: 100%|██████████| 129/129 [00:51<00:00,  2.49it/s]


Epoch [2/3], Train Loss: 0.0369, Acc: 0.9898, Time: 51.76s
Epoch [2/3], Val   Loss: 0.0601, Acc: 0.9776, Time: 3.94s
New best model saved at epoch 2 with val loss: 0.0601


Epoch 3 Training: 100%|██████████| 129/129 [00:51<00:00,  2.50it/s]


Epoch [3/3], Train Loss: 0.0178, Acc: 0.9949, Time: 51.69s
Epoch [3/3], Val   Loss: 0.0678, Acc: 0.9825, Time: 3.94s

✅ Total training + validation time: 169.01 seconds



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating Text Model: 100%|██████████| 22/22 [00:02<00:00,  8.13it/s]


🧪 Test time: 2.71 seconds

----- Results for Experiment 17: bs32_lr2e-05_ep3 -----
Best epoch: 2 with validation loss: 0.0601
Test Metrics:
  Accuracy: 0.9914
  Precision: 0.9886
  Recall: 0.9943
  F1 Score: 0.9915


===== RUNNING EXPERIMENT 18/18 =====
Batch Size: 32, Learning Rate: 2e-05, Epochs: 4

===== STARTING EXPERIMENT: bs32_lr2e-05_ep4 =====



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1 Training: 100%|██████████| 129/129 [00:51<00:00,  2.50it/s]


Epoch [1/4], Train Loss: 0.1243, Acc: 0.9565, Time: 51.69s
Epoch [1/4], Val   Loss: 0.0621, Acc: 0.9776, Time: 3.95s
New best model saved at epoch 1 with val loss: 0.0621


Epoch 2 Training: 100%|██████████| 129/129 [00:51<00:00,  2.49it/s]


Epoch [2/4], Train Loss: 0.0390, Acc: 0.9891, Time: 51.74s
Epoch [2/4], Val   Loss: 0.0563, Acc: 0.9806, Time: 3.95s
New best model saved at epoch 2 with val loss: 0.0563


Epoch 3 Training: 100%|██████████| 129/129 [00:51<00:00,  2.50it/s]


Epoch [3/4], Train Loss: 0.0197, Acc: 0.9954, Time: 51.67s
Epoch [3/4], Val   Loss: 0.0488, Acc: 0.9874, Time: 3.95s
New best model saved at epoch 3 with val loss: 0.0488


Epoch 4 Training: 100%|██████████| 129/129 [00:51<00:00,  2.49it/s]


Epoch [4/4], Train Loss: 0.0180, Acc: 0.9959, Time: 51.76s
Epoch [4/4], Val   Loss: 0.0652, Acc: 0.9815, Time: 3.94s

✅ Total training + validation time: 225.92 seconds



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating Text Model: 100%|██████████| 22/22 [00:02<00:00,  8.15it/s]

🧪 Test time: 2.71 seconds

----- Results for Experiment 18: bs32_lr2e-05_ep4 -----
Best epoch: 3 with validation loss: 0.0488
Test Metrics:
  Accuracy: 0.9886
  Precision: 0.9831
  Recall: 0.9943
  F1 Score: 0.9886





In [24]:
# Create a DataFrame for easier analysis
results_df = pd.DataFrame([
    {
        'Experiment': r['experiment_name'],
        'Batch Size': int(r['experiment_name'].split('_')[0][2:]),
        'Learning Rate': float(r['experiment_name'].split('_')[1][2:]),
        'Epochs': int(r['experiment_name'].split('_')[2][2:]),
        'Best Epoch': r['best_epoch'],
        'Best Val Loss': r['best_val_loss'],
        'Accuracy': r['metrics']['Accuracy'],
        'Precision': r['metrics']['Precision'],
        'Recall': r['metrics']['Recall'],
        'F1 Score': r['metrics']['F1 Score'],
        'Model Path': r['model_path']
    }
    for r in results
])

In [25]:
from datetime import datetime
# Save results to CSV
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
results_csv_path = f'text_model_experiments_results_{timestamp}.csv'
results_df.to_csv(results_csv_path, index=False)
print(f"\nResults saved to {results_csv_path}")


Results saved to text_model_experiments_results_20250509_050910.csv


In [26]:
# Print summary of all experiment results sorted by F1 Score
print("\n===== EXPERIMENT RESULTS SUMMARY (Sorted by F1 Score) =====")
sorted_results = results_df.sort_values('F1 Score', ascending=False)
print(results_df[['Experiment', 'Accuracy', 'Precision', 'Recall','F1 Score']])


===== EXPERIMENT RESULTS SUMMARY (Sorted by F1 Score) =====
          Experiment  Accuracy  Precision    Recall  F1 Score
0   bs16_lr5e-05_ep2  0.985714   0.977528  0.994286  0.985836
1   bs16_lr5e-05_ep3  0.994286   0.994286  0.994286  0.994286
2   bs16_lr5e-05_ep4  0.987143   0.977591  0.997143  0.987270
3   bs16_lr3e-05_ep2  0.978571   0.961433  0.997143  0.978962
4   bs16_lr3e-05_ep3  0.988571   0.980337  0.997143  0.988669
5   bs16_lr3e-05_ep4  0.987143   0.974930  1.000000  0.987306
6   bs16_lr2e-05_ep2  0.981429   0.966759  0.997143  0.981716
7   bs16_lr2e-05_ep3  0.981429   0.964187  1.000000  0.981767
8   bs16_lr2e-05_ep4  0.988571   0.983051  0.994286  0.988636
9   bs32_lr5e-05_ep2  0.982857   0.982857  0.982857  0.982857
10  bs32_lr5e-05_ep3  0.985714   0.974860  0.997143  0.985876
11  bs32_lr5e-05_ep4  0.990000   0.988604  0.991429  0.990014
12  bs32_lr3e-05_ep2  0.981429   0.964187  1.000000  0.981767
13  bs32_lr3e-05_ep3  0.987143   0.985755  0.988571  0.987161
14  bs32_

In [27]:
# Find the best model based on F1 score
best_exp = sorted_results.iloc[0]
print("\n===== BEST MODEL =====")
print(f"Configuration: {best_exp['Experiment']}")
print(f"Batch Size: {best_exp['Batch Size']}")
print(f"Learning Rate: {best_exp['Learning Rate']}")
print(f"Total Epochs: {best_exp['Epochs']}")
print(f"Best Epoch: {best_exp['Best Epoch']}")
print(f"F1 Score: {best_exp['F1 Score']:.4f}")
print(f"Accuracy: {best_exp['Accuracy']:.4f}")
print(f"Precision: {best_exp['Precision']:.4f}")
print(f"Recall: {best_exp['Recall']:.4f}")
print(f"Model Path: {best_exp['Model Path']}")


===== BEST MODEL =====
Configuration: bs16_lr5e-05_ep3
Batch Size: 16
Learning Rate: 5e-05
Total Epochs: 3
Best Epoch: 2
F1 Score: 0.9943
Accuracy: 0.9943
Precision: 0.9943
Recall: 0.9943
Model Path: best_text_model_bs16_lr5e-05_ep3_state_dict.pt


# -----------------