# 📘 AG News Topic Classification using BERT

This notebook fine-tunes a BERT model on the AG News dataset for topic classification.
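We will:
- Load the AG News dataset
- Preprocess and tokenize the text
- Fine-tune BERT for sequence classification
- Evaluate the model (accuracy and weighted F1)
- Save the fine-tuned model and tokenizer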

In [1]:
# ====================================
# 1. Install & Import Libraries
# ====================================
!pip install transformers datasets scikit-learn

import inspect

import numpy as np
from datasets import load_dataset
from sklearn.metrics import accuracy_score, f1_score
from transformers import (
    BertTokenizer, 
    BertForSequenceClassification, 
    Trainer, 
    TrainingArguments, 
    DataCollatorWithPadding
)

Collecting datasets
  Obtaining dependency information for datasets from https://files.pythonhosted.org/packages/eb/62/eb8157afb21bd229c864521c1ab4fa8e9b4f1b06bafdd8c4668a7a31b5dd/datasets-4.0.0-py3-none-any.whl.metadata
  Using cached datasets-4.0.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Obtaining dependency information for pyarrow>=15.0.0 from https://files.pythonhosted.org/packages/6e/0b/77ea0600009842b30ceebc3337639a7380cd946061b620ac1a2f3cb541e2/pyarrow-21.0.0-cp311-cp311-win_amd64.whl.metadata
  Using cached pyarrow-21.0.0-cp311-cp311-win_amd64.whl.metadata (3.4 kB)
Collecting xxhash (from datasets)
  Obtaining dependency information for xxhash from https://files.pythonhosted.org/packages/52/1c/fa3b61c0cf03e1da4767213672efe186b1dfa4fc901a4a694fb184a513d1/xxhash-3.5.0-cp311-cp311-win_amd64.whl.metadata
  Using cached xxhash-3.5.0-cp311-cp311-win_amd64.whl.metadata (13 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Obtaining dependency

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
s3fs 2023.3.0 requires fsspec==2023.3.0, but you have fsspec 2025.3.0 which is incompatible.
Neither PyTorch nor TensorFlow >= 2.0 have been found.Models won't be available and only tokenizers, configurationand file/data utilities can be used.


ImportError: cannot import name 'BertForSequenceClassification' from 'transformers' (D:\anacondaaa\Lib\site-packages\transformers\__init__.py)

In [2]:
# ====================================
# 2. Dataset Load
# ====================================
# Human-readable class names; the list index is used as the class id
# (assumed to match the AG News label ids 0..3 -- confirm against the dataset features).
labels = [
    "World",
    "Sports",
    "Business",
    "Sci/Tech",
]

# Load the AG News dataset through the `datasets` library.
dataset = load_dataset("ag_news")

def select_cols(batch):
    """Build the ``headline`` and ``label`` columns for one batch of examples.

    For each example the headline is the ``title`` value when the batch has a
    ``title`` column and that value is neither ``None`` nor an empty string;
    otherwise the example's ``text`` value is used.

    Args:
        batch: Mapping of column name to a list of per-example values.
            Must contain ``label`` and ``text``; ``title`` is optional.

    Returns:
        dict with ``"headline"`` (one entry per example) and ``"label"``
        (the batch's labels, passed through unchanged).
    """
    num_examples = len(batch['label'])
    # Resolve the optional column once per batch instead of once per example
    # (the per-example lookup also rebuilt a throwaway default list each time).
    titles = batch['title'] if 'title' in batch else None

    headlines = []
    for idx in range(num_examples):
        title = titles[idx] if titles is not None else None
        text = batch['text'][idx]
        has_title = title is not None and title != ''
        headlines.append(title if has_title else text)
    return {"headline": headlines, "label": batch["label"]}

# Run `select_cols` over every split in batches, dropping the columns the
# train split started with; the columns returned by `select_cols` are what
# the resulting dataset is expected to contain.
dataset = dataset.map(
    select_cols,
    remove_columns=dataset['train'].column_names,
    batched=True,
)

AttributeError: module 'transformers' has no attribute 'PreTrainedTokenizerBase'

In [None]:
# ====================================
# 3. Tokenization
# ====================================
# Load the pretrained tokenizer for the `bert-base-uncased` checkpoint.
tokenizer = BertTokenizer.from_pretrained(
    "bert-base-uncased"
)

def tokenize(batch):
    """Tokenize the ``headline`` column of a batch with the notebook's tokenizer.

    Truncation is enabled with a maximum length of 64; padding is not
    requested here. Returns whatever the tokenizer call returns for the batch.
    """
    headlines = batch['headline']
    return tokenizer(headlines, max_length=64, truncation=True)

# Tokenize every split, processing the examples in batches.
encoded = dataset.map(
    tokenize,
    batched=True,
)

# Collator built on the same tokenizer; it is handed to the Trainer to
# assemble batches (presumably padding each batch as its name suggests).
data_collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
)

In [None]:
# ====================================
# 4. Model Load
# ====================================
# Record the id <-> name mapping in the model config so that the saved model
# reports readable class names (e.g. "Sports") rather than generic "LABEL_1".
id2label = dict(enumerate(labels))
label2id = {name: idx for idx, name in id2label.items()}

# Pretrained BERT encoder with a classification head sized to the label set.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id,
)

In [None]:
# ====================================
# 5. Metrics
# ====================================
def compute_metrics(eval_pred):
    """Score a set of predictions with accuracy and weighted F1.

    Args:
        eval_pred: Two-item iterable unpacked as ``(logits, labels)``. The
            predicted class is the argmax of ``logits`` over the last axis.

    Returns:
        dict with keys ``"accuracy"`` and ``"f1"`` (F1 uses
        ``average="weighted"``).
    """
    scores, targets = eval_pred
    predicted = np.argmax(scores, axis=-1)

    accuracy = accuracy_score(targets, predicted)
    weighted_f1 = f1_score(targets, predicted, average="weighted")
    return {"accuracy": accuracy, "f1": weighted_f1}

In [None]:
# ====================================
# 6. Training Arguments
# ====================================
# `evaluation_strategy` was renamed to `eval_strategy` in newer transformers
# releases and the old keyword was later removed. Pick whichever name the
# installed version accepts so this cell runs on both old and new releases.
_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
_eval_strategy_key = (
    "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"
)

args = TrainingArguments(
    output_dir="./models",
    # Evaluation and checkpointing both happen once per epoch; they are kept
    # in step because `load_best_model_at_end` compares saved checkpoints
    # using the evaluation metric.
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",  # key returned by compute_metrics
    report_to="none",
    **{_eval_strategy_key: "epoch"},
)

In [None]:
# ====================================
# 7. Trainer
# ====================================
# Newer transformers releases take the tokenizer via `processing_class` and
# deprecate the `tokenizer` keyword; older releases only know `tokenizer`.
# Use whichever keyword the installed Trainer signature declares.
_trainer_arg_names = inspect.signature(Trainer.__init__).parameters
_tokenizer_key = (
    "processing_class" if "processing_class" in _trainer_arg_names else "tokenizer"
)

# NOTE(review): the test split is used as the evaluation set, and `args`
# enables load_best_model_at_end, so the reported test metrics also drive
# checkpoint selection. Consider holding out a validation split from train.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=encoded['train'],
    eval_dataset=encoded['test'],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    **{_tokenizer_key: tokenizer},
)

In [None]:
# ====================================
# 8. Train Model
# ====================================
# Run fine-tuning with the Trainer configured above. As the last expression
# in the cell, the value returned by `train()` is shown as the cell output.
trainer.train()

In [None]:
# ====================================
# 9. Evaluate Model
# ====================================
# Evaluate on the Trainer's evaluation dataset and keep the result for later use.
metrics = trainer.evaluate()
# Ending the cell with the value lets the notebook render it as the cell
# output instead of printing a single-line string.
metrics

In [None]:
# ====================================
# 10. Save Model
# ====================================
# Single definition of the output directory so the model and the tokenizer
# are always written to the same place.
save_dir = "./models/news_topic_classifier"

trainer.save_model(save_dir)
tokenizer.save_pretrained(save_dir)