**Step 0: Install Required Libraries**

In [6]:
!pip install transformers==5.0.0 datasets torch scikit-learn gradio accelerate




**Step 1: Import Libraries**

In [7]:
import torch
import numpy as np
import os
from datasets import load_dataset
from transformers import (
    BertTokenizerFast,
    BertForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding
)

from sklearn.metrics import accuracy_score, f1_score


**Step 2: Load AG News Dataset**

In [8]:
dataset = load_dataset("ag_news")

print(dataset)


DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 120000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 7600
    })
})


**Step 3: Load Tokenizer**

In [9]:
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")


**Step 4: Tokenization & Preprocessing**

In [10]:
def tokenize_function(examples):
    return tokenizer(
        examples["text"],
        truncation=True,
        padding=False,
        max_length=128
    )

tokenized_datasets = dataset.map(tokenize_function, batched=True)


Map:   0%|          | 0/120000 [00:00<?, ? examples/s]

Map:   0%|          | 0/7600 [00:00<?, ? examples/s]

**Step 5: Data Collator (Dynamic Padding)**

In [11]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


**Step 6: Load BERT Model**

In [12]:
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=4
)


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

BertForSequenceClassification LOAD REPORT from: bert-base-uncased
Key                                        | Status     | 
-------------------------------------------+------------+-
cls.predictions.bias                       | UNEXPECTED | 
cls.seq_relationship.weight                | UNEXPECTED | 
cls.seq_relationship.bias                  | UNEXPECTED | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED | 
cls.predictions.transform.dense.weight     | UNEXPECTED | 
cls.predictions.transform.dense.bias       | UNEXPECTED | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED | 
classifier.weight                          | MISSING    | 
classifier.bias                            | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.


**Step 7: Define Evaluation Metrics**

In [8]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=1)

    accuracy = accuracy_score(labels, predictions)
    f1 = f1_score(labels, predictions, average="weighted")

    return {
        "accuracy": accuracy,
        "f1": f1
    }


In [10]:
os.environ["TENSORBOARD_LOGGING_DIR"] = "./logs"


**Step 8: Training Arguments (Transformers v5 Compatible)**

In [12]:
training_args = TrainingArguments(
    output_dir="./bert-news",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model="f1"
)


**Step 9: Trainer Setup**

In [14]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics
)


**Step 10: Train the Model**

In [15]:
trainer.train()


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.192353,0.180933,0.944605,0.944622
2,0.112947,0.182403,0.948684,0.948724
3,0.087829,0.224082,0.949474,0.949519


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

There were missing keys in the checkpoint model loaded: ['bert.embeddings.LayerNorm.weight', 'bert.embeddings.LayerNorm.bias', 'bert.encoder.layer.0.attention.output.LayerNorm.weight', 'bert.encoder.layer.0.attention.output.LayerNorm.bias', 'bert.encoder.layer.0.output.LayerNorm.weight', 'bert.encoder.layer.0.output.LayerNorm.bias', 'bert.encoder.layer.1.attention.output.LayerNorm.weight', 'bert.encoder.layer.1.attention.output.LayerNorm.bias', 'bert.encoder.layer.1.output.LayerNorm.weight', 'bert.encoder.layer.1.output.LayerNorm.bias', 'bert.encoder.layer.2.attention.output.LayerNorm.weight', 'bert.encoder.layer.2.attention.output.LayerNorm.bias', 'bert.encoder.layer.2.output.LayerNorm.weight', 'bert.encoder.layer.2.output.LayerNorm.bias', 'bert.encoder.layer.3.attention.output.LayerNorm.weight', 'bert.encoder.layer.3.attention.output.LayerNorm.bias', 'bert.encoder.layer.3.output.LayerNorm.weight', 'bert.encoder.layer.3.output.LayerNorm.bias', 'bert.encoder.layer.4.attention.output.La

TrainOutput(global_step=22500, training_loss=0.1484587651570638, metrics={'train_runtime': 7045.603, 'train_samples_per_second': 51.096, 'train_steps_per_second': 3.193, 'total_flos': 1.6999491211228032e+16, 'train_loss': 0.1484587651570638, 'epoch': 3.0})

**Step 11: Final Evaluation**

In [16]:
results = trainer.evaluate()
print(results)


{'eval_loss': 0.224082350730896, 'eval_accuracy': 0.9494736842105264, 'eval_f1': 0.9495185994133315, 'eval_runtime': 50.3151, 'eval_samples_per_second': 151.048, 'eval_steps_per_second': 9.441, 'epoch': 3.0}


**Step 12: Save Model & Tokenizer**

In [17]:
trainer.save_model("bert-ag-news-model")
tokenizer.save_pretrained("bert-ag-news-model")


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

('bert-ag-news-model/tokenizer_config.json',
 'bert-ag-news-model/tokenizer.json')

**Step 13: Deploy with Gradio**

In [18]:
import torch
import gradio as gr
from transformers import BertTokenizerFast, BertForSequenceClassification

# Load model & tokenizer
model_path = "bert-ag-news-model"

tokenizer = BertTokenizerFast.from_pretrained(model_path)
model = BertForSequenceClassification.from_pretrained(model_path)
model.eval()

labels = ["World", "Sports", "Business", "Sci/Tech"]

def predict_news(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)

    with torch.no_grad():
        outputs = model(**inputs)

    prediction = torch.argmax(outputs.logits, dim=1).item()
    return labels[prediction]

interface = gr.Interface(
    fn=predict_news,
    inputs=gr.Textbox(lines=2, placeholder="Enter a news headline..."),
    outputs="text",
    title="📰 AG News Topic Classifier (BERT)",
    description="Classifies news headlines into World, Sports, Business, or Sci/Tech"
)

interface.launch()


Loading weights:   0%|          | 0/201 [00:00<?, ?it/s]

It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://f684c97fe383114231.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [19]:
import shutil

shutil.make_archive(
    "bert-ag-news-model",  # zip file name
    "zip",                 # format
    "bert-ag-news-model"   # folder to zip
)


'/content/bert-ag-news-model.zip'

In [20]:
from google.colab import drive
drive.mount("/content/drive")


Mounted at /content/drive


In [21]:
import shutil

drive_zip_path = "/content/drive/MyDrive/bert-models/bert-ag-news-model"

shutil.make_archive(
    drive_zip_path,        # full path WITHOUT .zip
    "zip",
    "bert-ag-news-model"   # folder to zip
)


'/content/drive/MyDrive/bert-models/bert-ag-news-model.zip'