# Import & Setup

In [2]:
import numpy as np
import torch

from datasets import load_dataset
from transformers import (
    BertTokenizerFast,
    BertForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding
)

from sklearn.metrics import accuracy_score, f1_score


In [3]:
print("CUDA available:", torch.cuda.is_available())


CUDA available: True


# Load Dataset (AG News)

In [4]:
dataset = load_dataset("sh0416/ag_news")
dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'title', 'description'],
        num_rows: 120000
    })
    test: Dataset({
        features: ['label', 'title', 'description'],
        num_rows: 7600
    })
})

# Label Names (untuk evaluasi)

In [5]:
label_names = ["World", "Sports", "Business", "Sci/Tech"]


# Tokenizer

In [6]:
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")


# Tokenization Function

In [7]:
def tokenize_function(example):
    texts = [
        t + " " + d
        for t, d in zip(example["title"], example["description"])
    ]
    return tokenizer(
        texts,
        truncation=True,
        max_length=128
    )


# Apply Tokenization

In [8]:
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["title", "description"]
)


Map:   0%|          | 0/120000 [00:00<?, ? examples/s]

Map:   0%|          | 0/7600 [00:00<?, ? examples/s]

In [9]:
def shift_labels(example):
    example["label"] = example["label"] - 1
    return example

tokenized_dataset = tokenized_dataset.map(shift_labels)


Map:   0%|          | 0/120000 [00:00<?, ? examples/s]

Map:   0%|          | 0/7600 [00:00<?, ? examples/s]

In [10]:
print(set(tokenized_dataset["train"]["label"]))
# HARUS: {0, 1, 2, 3}


{0, 1, 2, 3}


# Set Format PyTorch

In [11]:
tokenized_dataset.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "label"]
)


# Data Collator (Dynamic Padding)

In [12]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


# Model

In [13]:
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=4
)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Metrics

In [14]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)

    return {
        "accuracy": accuracy_score(labels, preds),
        "f1": f1_score(labels, preds, average="weighted")
    }


# TrainingArguments

In [32]:
training_args = TrainingArguments(
    output_dir="./results",

    eval_strategy="epoch",
    save_strategy="epoch",

    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,

    num_train_epochs=2,     # cukup untuk AG News
    weight_decay=0.01,

    logging_steps=200,

    load_best_model_at_end=True,
    metric_for_best_model="accuracy",

    fp16=False,             # MATIKAN (hindari CUDA assert)
    report_to="none"
)


# Trainer

In [33]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train,
    eval_dataset=small_eval,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)


  trainer = Trainer(


In [28]:
from transformers import DistilBertForSequenceClassification

model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=4
)


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Sanity Check (WAJIB)

In [22]:
training_args_debug = TrainingArguments(
    output_dir="./results_debug",

    eval_strategy="no",
    save_strategy="no",

    learning_rate=2e-5,
    per_device_train_batch_size=16,

    max_steps=100,          # ⬅️ PINDAH KE SINI
    logging_steps=1,

    fp16=False,
    report_to="none"
)


In [23]:
trainer_debug = Trainer(
    model=model,
    args=training_args_debug,
    train_dataset=tokenized_dataset["train"],
    tokenizer=tokenizer,
    data_collator=data_collator
)


  trainer_debug = Trainer(


In [24]:
trainer_debug.train()


Step,Training Loss
1,1.1112
2,1.108
3,1.2404
4,1.2046
5,1.1372
6,1.1922
7,1.1952
8,1.2406
9,1.1972
10,1.2004


TrainOutput(global_step=100, training_loss=0.7367938411235809, metrics={'train_runtime': 354.4973, 'train_samples_per_second': 4.513, 'train_steps_per_second': 0.282, 'total_flos': 76007570975232.0, 'train_loss': 0.7367938411235809, 'epoch': 0.013333333333333334})

# Training

In [34]:
small_train = tokenized_dataset["train"].shuffle(seed=42).select(range(2000))
small_eval  = tokenized_dataset["test"].shuffle(seed=42).select(range(500))


In [35]:
trainer.train()                                                         

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.486642,0.866,0.865707
2,No log,0.421784,0.876,0.875653


TrainOutput(global_step=126, training_loss=0.6231227450900607, metrics={'train_runtime': 2210.019, 'train_samples_per_second': 1.81, 'train_steps_per_second': 0.057, 'total_flos': 106975379392896.0, 'train_loss': 0.6231227450900607, 'epoch': 2.0})

# Final Evaluation

In [36]:
trainer.evaluate()

{'eval_loss': 0.4217838644981384,
 'eval_accuracy': 0.876,
 'eval_f1': 0.8756527765108846,
 'eval_runtime': 40.9105,
 'eval_samples_per_second': 12.222,
 'eval_steps_per_second': 0.391,
 'epoch': 2.0}

# Confusion Matrix (Nilai Tambah)

In [37]:
from sklearn.metrics import confusion_matrix, classification_report

preds = trainer.predict(tokenized_dataset["test"])
y_pred = np.argmax(preds.predictions, axis=1)
y_true = preds.label_ids

print(confusion_matrix(y_true, y_pred))
print(classification_report(y_true, y_pred, target_names=label_names))


[[1672   66   94   68]
 [  20 1862   10    8]
 [  84   20 1538  258]
 [  71    6  124 1699]]
              precision    recall  f1-score   support

       World       0.91      0.88      0.89      1900
      Sports       0.95      0.98      0.97      1900
    Business       0.87      0.81      0.84      1900
    Sci/Tech       0.84      0.89      0.86      1900

    accuracy                           0.89      7600
   macro avg       0.89      0.89      0.89      7600
weighted avg       0.89      0.89      0.89      7600



# Save Model

In [38]:
trainer.save_model("./bert-ag-news")
tokenizer.save_pretrained("./bert-ag-news")


('./bert-ag-news\\tokenizer_config.json',
 './bert-ag-news\\special_tokens_map.json',
 './bert-ag-news\\vocab.txt',
 './bert-ag-news\\added_tokens.json',
 './bert-ag-news\\tokenizer.json')