In [1]:
%pip install -q evaluate
%pip install -U "transformers>=4.35" "datasets>=2.19" "evaluate>=0.4" "accelerate>=0.26"

Note: you may need to restart the kernel to use updated packages.



In [3]:
import os
import numpy as np
import torch
from datasets import load_dataset, ClassLabel
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification, 
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
    set_seed,
    IntervalStrategy,
)
import evaluate

In [4]:
# 1) Load dataset
# Fetch every split published under this dataset id.
dataset_id = "sh0416/ag_news"
ds = load_dataset(dataset_id)

In [5]:
# 2) Splits
# Use the dataset's own validation split when it has one; otherwise hold out
# 10% of the training rows (seeded, so the split is repeatable).
if "validation" not in ds:
    holdout = ds["train"].train_test_split(test_size=0.1, seed=42)
    train_ds = holdout["train"]
    val_ds = holdout["test"]
else:
    train_ds = ds["train"]
    val_ds = ds["validation"]

# The test split is optional: `test_ds` is None without it, while `eval_ds`
# falls back to the validation rows.
has_test = "test" in ds
test_ds = ds["test"] if has_test else None
eval_ds = ds["test"] if has_test else val_ds


In [7]:
# 3) Label column + names
# Priority: the first column typed as ClassLabel, then a column literally named
# "label", then "labels". None means nothing matched.
features = train_ds.features
class_label_cols = [name for name, feat in features.items() if isinstance(feat, ClassLabel)]
fallback_cols = [name for name in ("label", "labels") if name in features]
label_col = (class_label_cols + fallback_cols + [None])[0]
assert label_col is not None, f"No label column found. Features: {features}"

# Rename label column to 'labels' for consistency
def rename_labels_column(ds, old_label_col):
    """Return `ds` with `old_label_col` renamed to "labels".

    `ds` is handed back unchanged when the column is already called "labels"
    or is not listed in `ds.column_names`.
    """
    if old_label_col == "labels" or old_label_col not in ds.column_names:
        return ds
    return ds.rename_column(old_label_col, "labels")

# Apply the rename to each split; a missing test split stays None.
train_ds, val_ds = (rename_labels_column(part, label_col) for part in (train_ds, val_ds))
test_ds = rename_labels_column(test_ds, label_col) if test_ds is not None else None


In [8]:
# 4) Text columns auto-detect
def guess_text_cols(cols):
    """Pick the column(s) that hold the input text.

    Matching is case-insensitive; the returned names keep their original
    spelling from `cols`.

    Args:
        cols: iterable of column names.

    Returns:
        The first candidate group, in priority order, whose columns are all present.

    Raises:
        ValueError: if no candidate group is fully present.
    """
    lmap = {c.lower(): c for c in cols}
    candidates = [
        # A multi-column group must be tried before a group that is a subset of
        # it; with ["text"] listed first, ["title", "text"] could never match.
        ["title", "text"],
        ["text"],                    # standard
        ["title", "description"],    # common variant
        ["headline"],
    ]
    for cand in candidates:
        if all(k in lmap for k in cand):
            return [lmap[k] for k in cand]
    raise ValueError(f"Could not find suitable text columns in: {cols}")

text_cols = guess_text_cols(train_ds.column_names)

def build_text_batch(examples):
    """Turn a batch into one text per row.

    With a single text column its values are returned as-is; with several,
    the per-row values are joined with a single space, in `text_cols` order.
    """
    columns = [examples[name] for name in text_cols]
    if len(columns) == 1:
        return columns[0]
    return [" ".join(row) for row in zip(*columns)]

In [9]:
# 5) Tokenizer + preprocessing
max_length = 128  # passed to the tokenizer together with truncation=True
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

def preprocess(examples):
    """Tokenize one batch: build its text rows, then encode them truncated to `max_length`."""
    return tokenizer(
        build_text_batch(examples),
        truncation=True,
        max_length=max_length,
    )

def keep_cols(ds):
    """List every column of `ds` except "labels".

    Despite the name, the result is what gets passed as `remove_columns` when
    tokenizing, so "labels" is the one original column that survives.
    """
    droppable = []
    for column in ds.column_names:
        if column != "labels":
            droppable.append(column)
    return droppable

def _tokenize_split(split):
    """Run `preprocess` over `split` in batches, dropping every original column but "labels"."""
    return split.map(preprocess, batched=True, remove_columns=keep_cols(split))

train_ds = _tokenize_split(train_ds)
val_ds = _tokenize_split(val_ds)
test_ds = _tokenize_split(test_ds) if test_ds is not None else None


In [10]:
# 6) Fix labels: convert 1-based labels to 0-based
def to_zero_based(ds):
    """Shift labels numbered 1..K down to 0..K-1; leave any other numbering alone.

    Args:
        ds: dataset exposing `unique("labels")` and `map`.

    Returns:
        A re-mapped dataset when the smallest distinct label is 1 and the largest
        equals the number of distinct labels; otherwise `ds` itself (including
        when it has no labels at all).
    """
    uniq = ds.unique("labels")
    if not uniq:
        # Empty split: nothing to shift, and min()/max() would raise on an empty list.
        return ds
    if min(uniq) == 1 and max(uniq) == len(uniq):
        # Batched map shifts a whole chunk per call instead of one Python call per row.
        return ds.map(
            lambda batch: {"labels": [label - 1 for label in batch["labels"]]},
            batched=True,
        )
    return ds

def _label_values(split):
    """Sorted distinct values of the "labels" column of `split`."""
    return sorted(set(split["labels"]))

train_ds = to_zero_based(train_ds)
val_ds = to_zero_based(val_ds)
test_ds = to_zero_based(test_ds) if test_ds is not None else None

# Sanity check: show which label ids each split contains after the shift.
print("train unique labels:", _label_values(train_ds))
print("val unique labels:", _label_values(val_ds))
if test_ds is not None:
    print("test unique labels:", _label_values(test_ds))

train unique labels: [0, 1, 2, 3]
val unique labels: [0, 1, 2, 3]
test unique labels: [0, 1, 2, 3]


In [11]:
# 7) Cast labels to ClassLabel with names (AG News labels)
label_names = ["World", "Sports", "Business", "Sci/Tech"]

def _as_class_label(split):
    """Cast the "labels" column of `split` to a ClassLabel built from `label_names`."""
    return split.cast_column("labels", ClassLabel(names=label_names))

train_ds = _as_class_label(train_ds)
val_ds = _as_class_label(val_ds)
test_ds = _as_class_label(test_ds) if test_ds is not None else None


In [12]:
# 8) Prepare label mappings and num_labels
# id2label maps position -> name; label2id is the reverse lookup.
id2label = dict(enumerate(label_names))
label2id = {name: idx for idx, name in id2label.items()}
num_labels = len(label_names)


In [13]:
# 9) Load model with correct num_labels and label mappings
# The classification head is newly (randomly) initialised when the checkpoint is
# loaded, and this cell runs before the Trainer is created, so seed the RNGs here
# to make the initial weights, and therefore the run, repeatable.
set_seed(42)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
# 10) Metrics
acc_metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")

def compute_metrics(eval_pred):
    """Score one evaluation pass.

    `eval_pred` unpacks into (logits, labels); the arg-max over the last logits
    axis is taken as the predicted class. Returns a dict with "accuracy" and
    macro-averaged "f1_macro".
    """
    logits, labels = eval_pred
    predicted = np.argmax(logits, axis=-1)
    shared = {"predictions": predicted, "references": labels}
    accuracy = acc_metric.compute(**shared)
    macro_f1 = f1_metric.compute(average="macro", **shared)
    return {"accuracy": accuracy["accuracy"], "f1_macro": macro_f1["f1"]}

# --- Select smaller subsets for faster training ---
def _take_subset(split, size, seed=42):
    """Shuffle `split` and keep at most `size` rows.

    The size is clamped to the split length so a split smaller than the
    requested subset is used whole instead of raising on out-of-range indices.
    """
    return split.shuffle(seed=seed).select(range(min(size, len(split))))

small_train_ds = _take_subset(train_ds, 2500)
small_val_ds = _take_subset(val_ds, 500)
small_test_ds = _take_subset(test_ds, 1000) if test_ds is not None else None


In [16]:
# 11) Precision flags - disable fp16/bf16 for CPU
# Both mixed-precision switches stay off; they are forwarded to TrainingArguments.
bf16 = fp16 = False


In [17]:
# 12) Training args
training_args = TrainingArguments(
    output_dir="outputs/bert-ag-news-sh0416",
    report_to="none",
    # Schedule and optimisation
    num_train_epochs=3,
    learning_rate=2e-5,
    warmup_ratio=0.1,
    weight_decay=0.01,
    # Batching
    per_device_train_batch_size=8,  # CPU friendly batch size
    per_device_eval_batch_size=16,
    dataloader_pin_memory=False,
    # Precision (flags defined in the previous cell)
    fp16=fp16,
    bf16=bf16,
    # Evaluate and checkpoint once per epoch, then reload the checkpoint
    # that scored best on "accuracy".
    eval_strategy=IntervalStrategy.EPOCH,
    save_strategy=IntervalStrategy.EPOCH,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    logging_steps=50,
)

In [18]:
# 13) Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_ds,
    eval_dataset=small_val_ds,
    # `tokenizer=` is the deprecated spelling of this argument and emits a
    # FutureWarning on current transformers; `processing_class` replaces it.
    processing_class=tokenizer,
    # Pads each batch at collation time, using the same tokenizer.
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [19]:
# 14) Train + Evaluate
trainer.train()

# Score the validation subset, then the test subset if one exists.
val_metrics = trainer.evaluate(small_val_ds)
print("Validation metrics:", val_metrics)

if small_test_ds is not None:
    test_metrics = trainer.evaluate(small_test_ds)
    print("Test metrics:", test_metrics)

Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro
1,0.3096,0.320552,0.912,0.909377
2,0.2121,0.39402,0.908,0.904073
3,0.1296,0.417194,0.91,0.906513


Validation metrics: {'eval_loss': 0.3205519914627075, 'eval_accuracy': 0.912, 'eval_f1_macro': 0.9093769457564722, 'eval_runtime': 117.9185, 'eval_samples_per_second': 4.24, 'eval_steps_per_second': 0.271, 'epoch': 3.0}
Test metrics: {'eval_loss': 0.35751891136169434, 'eval_accuracy': 0.899, 'eval_f1_macro': 0.89913142996581, 'eval_runtime': 200.007, 'eval_samples_per_second': 5.0, 'eval_steps_per_second': 0.315, 'epoch': 3.0}
