In [1]:
import os
import json
import time
import random
from pathlib import Path
from collections import Counter

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
    EvalPrediction,
    pipeline,
    EarlyStoppingCallback
)

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_fscore_support, matthews_corrcoef, f1_score

SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", DEVICE)


Device: cuda


In [2]:
DATA_CSV = "result/label/df_all_labeled_clean.csv"  
OUT_DIR = "model/indobert-base-p2"
os.makedirs(OUT_DIR, exist_ok=True)

exp_meta = {
    "created_at": time.strftime("%Y-%m-%d %H:%M:%S"),
    "seed": SEED,
    "data_csv": DATA_CSV,
    "model_name": "indobenchmark/indobert-base-p2",
    "notes": "HF datasets workflow, sliding-window inference"
}
with open(Path(OUT_DIR) / "exp_meta.json", "w") as f:
    json.dump(exp_meta, f, indent=2)
print("Experiment metadata saved to", OUT_DIR)

Experiment metadata saved to model/indobert-base-p2


In [3]:
# Load CSV into HF Dataset and define label map
df = pd.read_csv(DATA_CSV)
label_map = {"Neutral": 0, "Inflation": 1, "Deflation": 2}
df["label_id"] = df["label"].map(label_map)

text_col = "clean_text"
label_col = "label_id"

print("Total rows:", len(df))
print(df[label_col].value_counts().to_dict())

Total rows: 8992
{0: 4443, 1: 3221, 2: 1328}


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8992 entries, 0 to 8991
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   url           8992 non-null   object
 1   domain        8992 non-null   object
 2   title         8992 non-null   object
 3   date          8992 non-null   object
 4   clean_text    8992 non-null   object
 5   label         8992 non-null   object
 6   label_reason  8992 non-null   object
 7   source        8992 non-null   object
 8   year_month    8992 non-null   object
 9   year          8992 non-null   int64 
 10  month         8992 non-null   int64 
 11  tokens        8992 non-null   int64 
 12  text_len      8992 non-null   int64 
 13  label_id      8992 non-null   int64 
dtypes: int64(5), object(9)
memory usage: 983.6+ KB


In [5]:
df['label'].value_counts()

label
Neutral      4443
Inflation    3221
Deflation    1328
Name: count, dtype: int64

# Stratified train / val / test split

In [6]:
train_idx, temp_idx = train_test_split(
    df.index.tolist(), test_size=0.30, random_state=SEED, stratify=df[label_col]
)
val_idx, test_idx = train_test_split(
    temp_idx, test_size=0.50, random_state=SEED, stratify=df.loc[temp_idx, label_col]
)

train_df = df.loc[train_idx].reset_index(drop=True)
val_df = df.loc[val_idx].reset_index(drop=True)
test_df = df.loc[test_idx].reset_index(drop=True)

print("Sizes\ntrain, val, test:\n", len(train_df), len(val_df), len(test_df))

Sizes
train, val, test:
 6294 1349 1349


# Tokenizer and tokenization (batched, remove original columns)

In [7]:
hf_dset = DatasetDict({
    "train": Dataset.from_pandas(train_df[[text_col, label_col]]),
    "validation": Dataset.from_pandas(val_df[[text_col, label_col]]),
    "test": Dataset.from_pandas(test_df[[text_col, label_col]])
})
hf_dset = hf_dset.rename_column("label_id", "labels")
print(hf_dset)

DatasetDict({
    train: Dataset({
        features: ['clean_text', 'labels'],
        num_rows: 6294
    })
    validation: Dataset({
        features: ['clean_text', 'labels'],
        num_rows: 1349
    })
    test: Dataset({
        features: ['clean_text', 'labels'],
        num_rows: 1349
    })
})


In [8]:
# Tokenizer and tokenization (batched, remove original columns)
MODEL_NAME = "indobenchmark/indobert-base-p2"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

max_length = 512  # model limit for base bert
def tokenize_fn(batch):
    return tokenizer(batch[text_col], truncation=True, max_length=max_length)

hf_dset = hf_dset.map(tokenize_fn, batched=True, remove_columns=[text_col])
hf_dset.set_format(type="torch")
print(hf_dset["train"][0])



Map:   0%|          | 0/6294 [00:00<?, ? examples/s]

Map:   0%|          | 0/1349 [00:00<?, ? examples/s]

Map:   0%|          | 0/1349 [00:00<?, ? examples/s]

{'labels': tensor(2), 'input_ids': tensor([    2,  1117,   300, 12143,  9499,  2193, 18355,  1835, 30469,   502,
        30468,  6114,  2569, 30464,  6924, 30363, 30465,  4330,  2876,  5810,
        30470,   485,  7081,  9499,  2193, 18355,   339,   262,  8820,  1835,
          502, 30468,  4622,  2569, 30464,    31, 30358, 30364, 30465, 30468,
           41,   339,  6075,  1835,   111, 30468, 10036,  2569, 30464,  1951,
        30371, 30465, 30470, 30458,  1218,  7648, 12265,   405,   126,  1736,
         4393,  2193, 18355, 30468,  9499,  2193, 18355,  5429,  1835, 30469,
          502, 30468,  6114,  2569, 30464,  6924, 30363, 30465, 30468,   216,
         2029,    98,   823,  1131, 30468, 30458,   661,  1179,  4795,  2242,
         1117,   300, 30468,   490,   847,  4844,  5680, 30359,  6359,   112,
         8472,   460,    57, 30468,   678, 30468,  3253, 30464,  2585, 30471,
          606, 30465, 30470, 18583,  3278,    16,  1256,  2876,  5810,   126,
         3098,  2490,   644, 

# Data Collector (dynamic padding)

In [9]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Compute Class Weight

To handle the class imbalance without causing data leakage, we calculate weights using **only the training set distribution** (excluding validation and test sets).

The formula used is:

$$
Weight = \frac{\text{Total Training Samples}}{\text{Number of Classes} \times \text{Class Frequency}}
$$

#### Applied to Training Data:
* **Total Training Samples:** $6,294$ ($3110 + 2255 + 929$)
* **Number of Classes:** $3$

**1. Neutral (Majority Class)**
$$
W = \frac{6294}{3 \times 3110} = \mathbf{0.67}
$$

**2. Inflation**
$$
W = \frac{6294}{3 \times 2255} = \mathbf{0.93}
$$

**3. Deflation (Minority Class)**
$$
W = \frac{6294}{3 \times 929} = \mathbf{2.26}
$$
*(Weight > 1: The model pays 2.26x more attention to this class)*

In [10]:
train_labels = train_df[label_col].to_list()
counts = Counter(train_labels)
total=sum(counts.values())
num_classes = len(label_map)
class_weights_list = [total / (num_classes*counts.get(i,1)) for i in range (num_classes)]
class_weights = torch.tensor(class_weights_list, dtype = torch.float)
print("Class counts:", counts)
print("Class weights: ", class_weights_list)

Class counts: Counter({0: 3110, 1: 2255, 2: 929})
Class weights:  [0.6745980707395498, 0.9303769401330377, 2.2583423035522068]


# Model 

# Compute Metrics

In [11]:
def compute_metrics(pred: EvalPrediction):
    logits = pred.predictions
    y_true = pred.label_ids
    y_pred = np.argmax(logits, axis=-1)

    precision, recall, f1, _ = precision_recall_fscore_support(
        y_true, y_pred, average="macro", zero_division=0
    )

    mcc = matthews_corrcoef(y_true, y_pred)

    report = {
        "accuracy": (y_true == y_pred).mean(),
        "f1_macro": f1,
        "precision_macro": precision,
        "recall_macro": recall,
        "mcc": mcc
    }
    return report


# Pre-training Evaluation

This code is for sanitiy check to ensure that:
- The model is successfully loaded into memory (GPU/CPU).
- The test data was successfully entered into the model without error (tensor dimensions matched).
- The evaluation function (compute_metrics) runs without bugs.

In [14]:
baseline_model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=num_classes,
    local_files_only=True
).to(DEVICE)

  return self.fget.__get__(instance, owner)()
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
baseline_trainer = Trainer(
    model=baseline_model,
    eval_dataset=hf_dset["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)
# 3. Evaluate without training
print("Baseline Performance (PRE-TRAIN):")
baseline_metrics = baseline_trainer.evaluate()

Baseline Performance (PRE-TRAIN):


  0%|          | 0/169 [00:00<?, ?it/s]

In [19]:
print(baseline_metrics)

{'eval_loss': 1.0758129358291626, 'eval_model_preparation_time': 0.0, 'eval_accuracy': 0.3535952557449963, 'eval_f1_macro': 0.17754604695962142, 'eval_precision_macro': 0.20161290322580647, 'eval_recall_macro': 0.32843102258394613, 'eval_mcc': -0.050910245767553146, 'eval_runtime': 290.6467, 'eval_samples_per_second': 4.641, 'eval_steps_per_second': 0.581}


In [16]:
base_output = baseline_trainer.predict(hf_dset["test"])
base_y_pred = np.argmax(base_output.predictions, axis=1)
base_y_true = base_output.label_ids

target_names = ["Neutral", "Inflation", "Deflation"]
print("\nBaseline Classification Report")
print(classification_report(base_y_true, base_y_pred, target_names=target_names))

  0%|          | 0/169 [00:00<?, ?it/s]


Baseline Classification Report
              precision    recall  f1-score   support

     Neutral       0.25      0.01      0.01       667
   Inflation       0.35      0.98      0.52       483
   Deflation       0.00      0.00      0.00       199

    accuracy                           0.35      1349
   macro avg       0.20      0.33      0.18      1349
weighted avg       0.25      0.35      0.19      1349



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


It also shows the data integrity. The evaluation results on the log show an accuracy of 0.35 (35%). Since there are 3 classes (Neutral, Inflation, Deflation), the probability of a random guess is 1/3â‰ˆ33%. A yield of 35% is very close to 33%.

Also with the f1-macro of 0.18 indicating the model needed fine tuning.

# Fine Tune

In [None]:
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME, 
    num_labels = num_classes,
    local_files_only=True
)

model.to(DEVICE)

  return self.fget.__get__(instance, owner)()
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(50000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

# Create Class WeightedTrainer 

In [12]:
class WeightedTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.get("labels")
        outputs = model(**{k: v for k, v in inputs.items() if k != "labels"})
        logits = outputs.logits

        weight = class_weights.to(model.device)
        loss_fct = torch.nn.CrossEntropyLoss(weight=weight, label_smoothing=0.1)
        loss = loss_fct(logits.view(-1, model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

# Training Arguments 

In [14]:
# Training arguments and trainer creation
output_dir = Path(OUT_DIR) / "model"
training_args = TrainingArguments(
    output_dir=str(output_dir),
    evaluation_strategy="steps",
    eval_steps=100,
    save_strategy="steps",
    save_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    greater_is_better=True,

    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,

    learning_rate=2e-5,
    num_train_epochs=10,
    weight_decay=0.01,
    warmup_ratio=0.1,

    optim="adamw_torch",
    lr_scheduler_type="cosine",
    max_grad_norm=1.0,
    group_by_length=True,

    fp16=torch.cuda.is_available(),
    logging_steps=50,
    seed=SEED,
    save_total_limit=3
)



# Weighted Trainer Usage

In [15]:
trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=hf_dset["train"],
    eval_dataset=hf_dset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

# Train

In [16]:
train_result = trainer.train()

  0%|          | 0/1960 [00:00<?, ?it/s]

{'loss': 1.1612, 'grad_norm': 5.37812614440918, 'learning_rate': 5.1020408163265315e-06, 'epoch': 0.25}
{'loss': 1.0713, 'grad_norm': 4.552521228790283, 'learning_rate': 1.0204081632653063e-05, 'epoch': 0.51}


  0%|          | 0/85 [00:00<?, ?it/s]

{'eval_loss': 1.0142443180084229, 'eval_accuracy': 0.4951816160118606, 'eval_f1_macro': 0.4644816661407761, 'eval_precision_macro': 0.5865261784021548, 'eval_recall_macro': 0.5478007510616206, 'eval_mcc': 0.33778971082721265, 'eval_runtime': 30.2463, 'eval_samples_per_second': 44.601, 'eval_steps_per_second': 2.81, 'epoch': 0.51}
{'loss': 1.0215, 'grad_norm': 4.11823034286499, 'learning_rate': 1.530612244897959e-05, 'epoch': 0.76}
{'loss': 0.9487, 'grad_norm': 4.099294662475586, 'learning_rate': 1.9999746258949146e-05, 'epoch': 1.02}


  0%|          | 0/85 [00:00<?, ?it/s]

{'eval_loss': 0.9053704142570496, 'eval_accuracy': 0.7071905114899926, 'eval_f1_macro': 0.6633026893245205, 'eval_precision_macro': 0.6637798227965924, 'eval_recall_macro': 0.6647218336348771, 'eval_mcc': 0.52127234471594, 'eval_runtime': 35.3637, 'eval_samples_per_second': 38.147, 'eval_steps_per_second': 2.404, 'epoch': 1.02}
{'loss': 0.8719, 'grad_norm': 4.808755397796631, 'learning_rate': 1.9953791129491985e-05, 'epoch': 1.27}
{'loss': 0.8861, 'grad_norm': 5.3804497718811035, 'learning_rate': 1.9828960137631927e-05, 'epoch': 1.52}


  0%|          | 0/85 [00:00<?, ?it/s]

{'eval_loss': 0.8623097538948059, 'eval_accuracy': 0.7316530763528539, 'eval_f1_macro': 0.6998681823446246, 'eval_precision_macro': 0.6935704543936891, 'eval_recall_macro': 0.710686618295314, 'eval_mcc': 0.5679121162475496, 'eval_runtime': 34.1804, 'eval_samples_per_second': 39.467, 'eval_steps_per_second': 2.487, 'epoch': 1.52}
{'loss': 0.8608, 'grad_norm': 6.842891693115234, 'learning_rate': 1.962624246950012e-05, 'epoch': 1.78}
{'loss': 0.8461, 'grad_norm': 7.770502090454102, 'learning_rate': 1.934724450106831e-05, 'epoch': 2.03}


  0%|          | 0/85 [00:00<?, ?it/s]

{'eval_loss': 0.8772342801094055, 'eval_accuracy': 0.7449962935507783, 'eval_f1_macro': 0.7085693888830938, 'eval_precision_macro': 0.7106444601742746, 'eval_recall_macro': 0.7087731520340216, 'eval_mcc': 0.5787986937413553, 'eval_runtime': 34.5468, 'eval_samples_per_second': 39.049, 'eval_steps_per_second': 2.46, 'epoch': 2.03}
{'loss': 0.7911, 'grad_norm': 8.671582221984863, 'learning_rate': 1.8994177068899414e-05, 'epoch': 2.29}
{'loss': 0.743, 'grad_norm': 7.371335029602051, 'learning_rate': 1.8569837951029597e-05, 'epoch': 2.54}


  0%|          | 0/85 [00:00<?, ?it/s]

{'eval_loss': 0.8445148468017578, 'eval_accuracy': 0.7501853224610823, 'eval_f1_macro': 0.7226010401432541, 'eval_precision_macro': 0.7186973328845156, 'eval_recall_macro': 0.7332960289482028, 'eval_mcc': 0.592805001084145, 'eval_runtime': 30.9393, 'eval_samples_per_second': 43.602, 'eval_steps_per_second': 2.747, 'epoch': 2.54}
{'loss': 0.7836, 'grad_norm': 3.0469863414764404, 'learning_rate': 1.8077589696806925e-05, 'epoch': 2.8}
{'loss': 0.7227, 'grad_norm': 5.77237606048584, 'learning_rate': 1.752133298136744e-05, 'epoch': 3.05}


  0%|          | 0/85 [00:00<?, ?it/s]

{'eval_loss': 0.831396222114563, 'eval_accuracy': 0.7160859896219421, 'eval_f1_macro': 0.6883551444681651, 'eval_precision_macro': 0.6787691289936082, 'eval_recall_macro': 0.7147673760717238, 'eval_mcc': 0.5550979527192643, 'eval_runtime': 33.0706, 'eval_samples_per_second': 40.791, 'eval_steps_per_second': 2.57, 'epoch': 3.05}
{'loss': 0.6675, 'grad_norm': 9.59798812866211, 'learning_rate': 1.6905475695893193e-05, 'epoch': 3.3}
{'loss': 0.6782, 'grad_norm': 5.1205525398254395, 'learning_rate': 1.6234898018587336e-05, 'epoch': 3.56}


  0%|          | 0/85 [00:00<?, ?it/s]

{'eval_loss': 0.8862666487693787, 'eval_accuracy': 0.748702742772424, 'eval_f1_macro': 0.7161389079418589, 'eval_precision_macro': 0.7246721047290317, 'eval_recall_macro': 0.712688216166477, 'eval_mcc': 0.5836752081230322, 'eval_runtime': 28.7434, 'eval_samples_per_second': 46.932, 'eval_steps_per_second': 2.957, 'epoch': 3.56}
{'loss': 0.6543, 'grad_norm': 6.635543346405029, 'learning_rate': 1.551491374315094e-05, 'epoch': 3.81}
{'loss': 0.6615, 'grad_norm': 7.628347873687744, 'learning_rate': 1.475122817120253e-05, 'epoch': 4.07}


  0%|          | 0/85 [00:00<?, ?it/s]

{'eval_loss': 0.8591805100440979, 'eval_accuracy': 0.7316530763528539, 'eval_f1_macro': 0.7050706533966898, 'eval_precision_macro': 0.6950774039481179, 'eval_recall_macro': 0.7296961557831123, 'eval_mcc': 0.5783393334461833, 'eval_runtime': 27.6076, 'eval_samples_per_second': 48.863, 'eval_steps_per_second': 3.079, 'epoch': 4.07}
{'train_runtime': 2122.1313, 'train_samples_per_second': 29.659, 'train_steps_per_second': 0.924, 'train_loss': 0.8356060314178467, 'epoch': 4.07}


# Save Model & Metrics

In [17]:
trainer.save_model()
tokenizer.save_pretrained(output_dir)
with open(Path(OUT_DIR) / "train_result.json", "w") as f:
    json.dump(train_result.metrics, f, indent=2)

# Thresholding Minority Class

In [20]:
val_output = trainer.predict(hf_dset["validation"])
val_probs = torch.nn.functional.softmax(torch.tensor(val_output.predictions), dim=1).numpy()
val_labels = val_output.label_ids

TARGET_CLASS_ID = 2  # Deflation
best_threshold = 0.5
best_f1 = 0.0

# Search for best threshold
for thr in np.arange(0.1, 0.95, 0.05):
    preds = []
    for i in range(len(val_probs)):
        if val_probs[i, TARGET_CLASS_ID] >= thr:
            preds.append(TARGET_CLASS_ID)
        else:
            preds.append(np.argmax(val_probs[i]))
            
    # Calculate F1 for the target class only
    # We use the validation set to pick the winner
    current_f1 = f1_score(val_labels, preds, labels=[TARGET_CLASS_ID], average='micro')
    
    if current_f1 > best_f1:
        best_f1 = current_f1
        best_threshold = thr

print(f"Optimal Threshold: {best_threshold:.2f} (Val F1: {best_f1:.4f})")

  0%|          | 0/85 [00:00<?, ?it/s]

Optimal Threshold: 0.50 (Val F1: 0.6306)


In [21]:
config_data = {
    "target_class": "Deflation",
    "target_id": TARGET_CLASS_ID,
    "threshold": float(best_threshold)
}

with open(Path(OUT_DIR) / "threshold_config.json", "w") as f:
    json.dump(config_data, f, indent=2)

# Comparison Threshold vs No Threshold

In [22]:
# Get Test Predictions
test_output = trainer.predict(hf_dset["test"])
test_probs = torch.nn.functional.softmax(torch.tensor(test_output.predictions), dim=1).numpy()
test_labels = test_output.label_ids

# SCENARIO A: Standard Prediction, what the model does by default
test_preds_standard = np.argmax(test_probs, axis=1)

# SCENARIO B: Thresholded Prediction with the threshold
test_preds_tuned = []
for i in range(len(test_probs)):
    if test_probs[i, TARGET_CLASS_ID] >= best_threshold:
        test_preds_tuned.append(TARGET_CLASS_ID)
    else:
        test_preds_tuned.append(np.argmax(test_probs[i]))

  0%|          | 0/85 [00:00<?, ?it/s]

In [23]:
target_names = ["Neutral", "Inflation", "Deflation"] 

print("SCENARIO A: Standard Model (No Thresholding)")
print(classification_report(test_labels, test_preds_standard, target_names=target_names))
print(confusion_matrix(test_labels, test_preds_standard))
print(f"\n\nSCENARIO B: Optimized Model (Threshold > {best_threshold:.2f})")
print(classification_report(test_labels, test_preds_tuned, target_names=target_names))
print(confusion_matrix(test_labels, test_preds_tuned))

SCENARIO A: Standard Model (No Thresholding)
              precision    recall  f1-score   support

     Neutral       0.78      0.77      0.78       667
   Inflation       0.78      0.69      0.73       483
   Deflation       0.52      0.68      0.59       199

    accuracy                           0.73      1349
   macro avg       0.69      0.72      0.70      1349
weighted avg       0.74      0.73      0.73      1349

[[515  71  81]
 [104 335  44]
 [ 39  24 136]]


SCENARIO B: Optimized Model (Threshold > 0.50)
              precision    recall  f1-score   support

     Neutral       0.78      0.77      0.78       667
   Inflation       0.78      0.69      0.73       483
   Deflation       0.52      0.68      0.59       199

    accuracy                           0.73      1349
   macro avg       0.69      0.72      0.70      1349
weighted avg       0.74      0.73      0.73      1349

[[515  71  81]
 [104 335  44]
 [ 39  24 136]]


No imporvement rather use the Standard Model

In [26]:
# save metrics
metrics = {
    "test_classification_report": classification_report(test_labels, test_preds_standard, target_names=["Neutral", "Inflation", "Deflation"], 
    output_dict=True, zero_division=0),
    "test_confusion_matrix": confusion_matrix(test_labels, test_preds_standard).tolist(),
    "thresholds": best_threshold
}
with open(Path(OUT_DIR) / "test_metrics.json", "w") as f:
    json.dump(metrics, f, indent=2)
print("Test metrics saved to", OUT_DIR)

Test metrics saved to model/indobert-base-p2


# Sliding Window Inference

Since IndoBERT has a hard limit of 512 tokens, it crashes if you feed it a long news report (1000-word analysis of the Indonesian economy). This function creates a work-around called Sliding Window Inference.


How it works: overlapping slightly so it doesn't miss the context

In [None]:
def predict_sliding_window_clean(text, model, tokenizer, device, chunk_size=512, stride=256):
    model.eval()
    
    # Tokenize the whole text
    tokens = tokenizer.encode(text, add_special_tokens=True)
    
    # IF Short Text (Fits in one block)
    if len(tokens) <= chunk_size:
        inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=chunk_size)
        inputs = {k: v.to(device) for k, v in inputs.items()}
        
        with torch.no_grad():
            outputs = model(**inputs)
            # Standard Softmax (Temperature=1.0 is default behavior)
            probs = F.softmax(outputs.logits, dim=1).cpu().numpy()[0]
        
        pred_label = np.argmax(probs)
        return pred_label, probs

    # IF Long Text (Sliding Window)
    logits_list = []
    
    # Loop through tokens with overlap (stride)
    for start in range(0, len(tokens), stride):
        # Slice the tokens
        window = tokens[start : start + chunk_size]
        
        # Stop if we have a tiny leftover chunk (optional safety)
        if len(window) < 10: 
            break
            
        # Decode back to text and Re-encode
        # This ensures every chunk gets its own [CLS] and [SEP] tokens
        window_text = tokenizer.decode(window, skip_special_tokens=True)
        inputs = tokenizer(window_text, return_tensors="pt", truncation=True, max_length=chunk_size)
        inputs = {k: v.to(device) for k, v in inputs.items()}
        
        with torch.no_grad():
            out = model(**inputs)
            logits_list.append(out.logits.cpu().numpy())
            
        # Stop if we reached the end
        if start + chunk_size >= len(tokens):
            break
    
    # 1. Stack all logits (shape: [num_chunks, num_classes])
    all_logits = np.vstack(logits_list)
    
    # 2. Average the logits (Consensus mechanism)
    avg_logits = np.mean(all_logits, axis=0)
    
    # 3. Convert Average Logits to Probability
    probs = F.softmax(torch.tensor(avg_logits), dim=0).numpy()
    
    # 4. Pick Winner
    pred_label = np.argmax(probs)
    
    return pred_label, probs

# Testing

In [None]:
text_example ='Dewan Pengupahan Jawa Timur dari Unsur Buruh mengungkap keinginan para buruh di Jatim agar upah minimum kabupaten/kota (UMK) naik 8-10%. Inflasi menjadi salah satu alasan buruh agar upah naik 8-10%.Harapannya para buruh untuk UMK 38 kabupaten/kota di Jatim naik 8 sampai 10%, kata Ketua Dewan Pengupahan Jatim Unsur Buruh, Ahmad Fauzi di Surabaya, Rabu (12/11/2025). Fauzi menyebut Dewan Pengupahan Jatim akan segera membahas UMK dan Upah Minimum Provinsi (UMP) di mana UMP di-deadline akan diumumkan pada awal Desember, sedangkan UMK pada pertengahan Desember.Rencananya penetapan UMP dijadwalkan pada 8 Desember 2025, sementara UMK akan ditetapkan pada 15 Desember 2025Lebih lanjut kata Fauzi, para buruh ingin kenaikan upah di angka 8-10% sebab biaya hidup semakin meningkat diiringi dengan adanya inflasi.'

pred_id, probabilities = predict_sliding_window_clean(
    text_example, 
    model, 
    tokenizer, 
    device=DEVICE
)

labels = ["Neutral", "Inflation", "Deflation"]
print(f"Prediction: {labels[pred_id]}")
print(f"Confidence: {probabilities[pred_id]:.4f}")
print(f"Full Probs: {probabilities}")

Prediction: Inflation
Confidence: 0.5718
Full Probs: [0.2970798  0.571798   0.13112219]


# Jawaban AI Gemini 3.0 terhadap text tersebut

ECONOMIC ASSESSMENT


PROBABILITIES
* Inflationary: 75%
* Neutral (Stable): 20%
* Deflationary: 5%


REASONING


Wage-Push Pressure (Inflation Driver)
The core signal in this text is the demand for an 8-10% increase in the minimum wage (UMK). In economic theory, a significant rise in the wage floor often triggers "cost-push inflation." Businesses faced with higher labor costs typically raise prices on goods and services to maintain their profit margins, directly contributing to general price increases.


Erosion of Purchasing Power (Inflation Driver)
The text explicitly cites "biaya hidup semakin meningkat" (rising cost of living) as the primary justification for the wage demand. This confirms that inflation is currently present and felt by the consumer base. The labor union is reacting to existing price instability, attempting to restore real income levels that have been eroded by rising prices.


Inflation Expectations (Inflation Driver)
The specific target of 8-10% suggests that economic actors (workers) expect inflation to continue or accelerate. When a large segment of the workforce anticipates higher prices and negotiates wages based on that expectation, it can create a self-fulfilling cycle known as a wage-price spiral, where higher wages fuel further demand and price hikes.


Policy Uncertainty (Neutral Driver)
It is important to note that the 8-10% figure is currently a "keinginan" (desire) or proposal from the labor union, not the finalized government decree. If the final decision (scheduled for mid-December) settles on a lower percentage closer to the national GDP growth rate, the inflationary impact would be contained, resulting in a neutral economic outcome.

In [None]:
long_text = "KBRN, Tarakan: Perekonomian Indonesia masih tumbuh ditengah kondisi ekonomi global yang kurang baik. Meski melambat, ekonomi Indoensia tumbuh 5,4â„…. Hal itu disampaikan Kepala Perwakilan Bank Indonesia Kaltara, Hasiando Ginsar Manik saat menjadi narasumber pada Benuanta Investment and Economic Summit di Kayan Multifunction Hall Hotel Tarakan Plaza, Jumat (21/2011/2025). 'Ekonomi kita di triwulan triwulan III tumbuh 5,4% melambat dibandingkan triwulan II tahun, 2025 yang mana ditopang oleh kinerja ekspor dan konsumsi pemerintah yang meningkat di triwulan III,' ungkap Hasiando. Beberapa sumber-sumber pertumbuhan ekonomi Indonesia di antaranya di Kalimantan yang mampu menyumbangkan 8,02%. Sedangkan porsi ekonomi utama masih berasal dari Jawa dan Sumatera. Menurutnya, tantangan ekonomi nasional saat ini adalah bagaimana menjaga pertumbuhan ekonomi Indonesia. Di sisi lain stabilitas inflasi sebenarnya terkendali. Akan tetapi menurut Hasiando, ada satu tantangan untuk investasi. Yaitu inflasi volitale food. Di mana ada beberapa daerah yang inflasi volitale foodnya di atas 5%, terutama wilayah Sumatera. Sementara wilayah Kalimantan stabilitas harganya relatif terkendali. 'Itu menjadi tantangan tersendiri bagi kita semua,' tutur Hasiando. Menurutnya, berbagai upaya telah dilakukan pemerintah pusat melalui kementerian terkait maupun pemerintah daerah agar pertumbuhan ekonomi Indonesia pada 2025 bisa bisa meningkat. Terlebih target Presiden Prabowo Subianto untuk perekonomian Indonesia tumbuh 8% pada 2028. Bank Indonesia sendiri melalui Dewan Gubernur telah memutuskan BI Ret tetap 4,65â„…. Tugas BI sendiri ada dua. Yaitu menjaga stabilitas, baiknya nilai tukar, harga dan sistem keuangan. Sedangkan kedua adalah upaya mendukung pertumbuhan ekonomi. Adapun kondisi konomi global, menurut Hasiando, sebenarnya tidak terlalu baik. Di mana pertumbuhan ekonomi di dunia di 2025 diperkirakan turun dari 3,3% menjadi 3,1%. Sebagian besar negara merevisi melambat pertumbuhannya. Kondisi ini tentu ada sebabnya. Mulai dari kondisi Amerika Serikat di mana banyak fasilitas pemerintahan yang tidak bekerja optimal. Ditambah lagi perang dagang antara Amerika Serikat dan Cina menyebabkan potensi demand, termasuk Indonesia, mengalami perlambatan. Kepastian ekonomi global juga tercermin dari beberapa indeks ketidakpastian dan polabilitas global. Menurut Hasiando, memang ada tren menurun. Akan tetapi jika melihat history dari tahun 2021 sampai sekarang masih di level atas. Benuanta Investment and Economic Summit merupakan acara yang digelar Kantor Perwakilan Bank Indonesia (KPwBI) Provinsi Kalimantan Utara (Kaltara). Acara yang dirangkai dengan diskusi panel ini dibuka Wakil Gubernur Kaltara, Ingkong Ala, dengan menghadirkan berbagai narasumber. Di antaranya Kepala Perwakilan Bank Indonesia Kaltara, Hasiando Ginsar Manik, Staf Ahli Menteri Keuangan Bidang akepatuhan Pajak, Yon Arsal, Procipal Adviser, Revenue (Police and Administration) At Proses, Rubino Sugana dan Direktur Politeknik Bisnis Kaltara, Dr. Ana Sriekaningsih, S.E., S.Th., M.M. Acara ini mengusung tema 'Epicentrum Pertumbuhan Baru Menakar Peran Hilirisasi, Industrialisasi dan Konektivitas Global terhadap Perekonomian Daerah'. (Rajab)"

pred_id, probabilities = predict_sliding_window_clean(
    long_text, 
    model, 
    tokenizer, 
    device=DEVICE
)   

print(f"Prediction: {labels[pred_id]}")
print(f"Confidence: {probabilities[pred_id]:.4f}")
print(f"Full Probs: {probabilities}")

Prediction: Neutral
Confidence: 0.6048
Full Probs: [0.604755   0.21125492 0.1839901 ]


# Jawaban AI Gemini 3.0 terhadap text tersebut

ECONOMIC ASSESSMENT


PROBABILITIES
* Neutral (Stable): 60%
* Inflationary: 35%
* Deflationary: 5%


REASONING


Explicit Stability (Neutral Driver)
The text explicitly states that inflation stability is controlled. Bank Indonesia decided to hold the BI Rate at 4.65%. Central banks typically hold rates steady when they believe the economy is balanced. They are not raising rates to fight high inflation. They are not lowering rates to fight deflation.


Volatile Food Prices (Inflation Driver)
The primary risk mentioned is "volatile food." Inflation in this sector exceeds 5% in specific regions like Sumatra. This serves as a strong counter-argument to deflation. Prices for essentials are rising in some areas. This creates localized cost-push inflation even if the national average is stable.


Slowing Global Demand (Deflation Driver)
The global economy is slowing down. Growth dropped from 3.3% to 3.1%. The text mentions trade wars and reduced demand from China and the US. Lower global demand typically lowers commodity prices. This external factor helps keep domestic inflation from getting too high.


Positive GDP Growth (General Health)
Indonesia's economy grew by 5.4%. Deflation is usually associated with economic contraction or recession. Since the economy is expanding, general deflation is highly unlikely. The slowdown from Q2 to Q3 suggests cooling, not freezing.

In [None]:
text_another_one = 'Kota Pematangsiantar mencatat deflasi sebesar -0,31 persen (mtm) pada Oktober 2025. Penurunan indeks harga ini terutama dipicu turunnya harga sejumlah komoditas pangan, terutama bawang merah dan cabai hijau.Kepala BPS Kota Pematangsiantar, Ratnauli Naibaho melalui Staf Statistik Harga, Wahyu Andamari, menyampaikan bahwa bawang merah menjadi penyumbang deflasi terbesar pada Oktober dengan andil -0,15 persen, disusul cabai hijau -0,09 persen, serta beras -0,07 persen."Penurunan harga komoditas hortikultura ini terjadi karena pasokan meningkat di wilayah Sumatera Utara, seiring masuknya masa panen pada bulan Oktober," ujar Wahyu kepada Mistar, Kamis (20/11/2025).Ia menjelaskan, deflasi pada bulan tersebut tidak lepas dari membaiknya pasokan bahan pangan, dipengaruhi kondisi cuaca, biaya produksi, dan keseimbangan antara permintaan serta penawaran di pasar.Meski demikian, sejumlah komoditas masih memberikan andil terhadap inflasi, seperti emas perhiasan 0,29 persen, cabai merah 0,04 persen, dan wortel 0,03 persen.Wahyu menambahkan, tekanan inflasi diperkirakan terus menurun pada November 2025. Prediksi curah hujan yang meningkat di Sumatera Utara dinilai akan mendorong hasil panen lebih baik, sehingga harga sejumlah komoditas pangan strategis berpotensi kembali turun."Dengan hasil panen yang meningkat, potensi penurunan tekanan inflasi pada periode mendatang semakin besar," ujarnya. (hm25)'

pred_id, probabilities = predict_sliding_window_clean(
    text_another_one, 
    model, 
    tokenizer, 
    device=DEVICE
)   

print(f"Prediction: {labels[pred_id]}")
print(f"Confidence: {probabilities[pred_id]:.4f}")
print(f"Full Probs: {probabilities}")

Prediction: Deflation
Confidence: 0.9744
Full Probs: [0.01147786 0.01411801 0.9744042 ]


# Jawaban AI Gemini 3.0 terhadap text tersebut

ECONOMIC ASSESSMENT


PROBABILITIES
* Deflationary: 85%
* Neutral (Stable): 10%
* Inflationary: 5%


REASONING


Realized Deflationary Data (Deflation Driver)
The text provides concrete statistical evidence of deflation, stating explicitly that Kota Pematangsiantar recorded a deflation of -0.31% (month-to-month) in October 2025. Unlike previous texts that relied on forecasts or demands, this is realized economic data confirming that the general price level has already decreased.


Supply-Side Surplus (Deflation Driver)
The primary mechanism driving this trend is a positive supply shock in the food sector. The text attributes the price drops to increased supply ("pasokan meningkat") caused by the harvest season ("masa panen"). In economics, when supply exceeds demand due to seasonal factors like a harvest, equilibrium prices naturally fall. Major contributors like shallots and green chilies drove the index down.


Forward-Looking Price Pressure (Deflation Driver)
The outlook for the immediate future remains deflationary or low-pressure. The BPS official predicts that inflationary pressure will continue to decline in November 2025 due to weather conditions (increased rainfall) favoring agricultural output. This expectation of continued abundance suggests that prices for strategic foods will likely remain low or drop further.


Commodity Divergence (Inflation Driver)
While the aggregate index is deflationary, specific assets show inflationary resilience. Gold jewelry rose by 0.29%. This indicates that while food costs are dropping (lowering the cost of living), store-of-value assets or non-perishables are still seeing price appreciation. However, the weight of food commodities in the consumer basket is currently overpowering these increases, resulting in net deflation.