In [3]:
# STEP 1: Install and Import Required Libraries
!pip install transformers datasets evaluate torch --quiet
!pip install gradio --quiet

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
import evaluate
import numpy as np
import gradio as gr

# STEP 2: Load Dataset (AG News from Hugging Face)
dataset = load_dataset("ag_news")
print(dataset)

# STEP 3: Tokenize Dataset (for BERT input)
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def tokenize(batch):
    return tokenizer(batch['text'], truncation=True, padding="max_length", max_length=128)

tokenized_datasets = dataset.map(tokenize, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(["text"])
tokenized_datasets.set_format("torch")
tokenized_datasets["train"].column_names

# STEP 4: Load Pretrained BERT Model for Classification
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=4)

# STEP 5: Prepare Data for Training
train_dataset = tokenized_datasets["train"]
test_dataset = tokenized_datasets["test"]

metric_acc = evaluate.load("accuracy")
metric_f1 = evaluate.load("f1")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {
        "accuracy": metric_acc.compute(predictions=preds, references=labels)["accuracy"],
        "f1": metric_f1.compute(predictions=preds, references=labels, average="weighted")["f1"],
    }
# STEP 6: Training Arguments & Trainer
!pip install --upgrade transformers --quiet  # Run this at the start once

from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",   # Evaluate at the end of each epoch
    save_strategy="epoch",         # Save model after each epoch
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=100,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# STEP 7: Train the Model
trainer.train()

# STEP 8: Evaluate Model Performance
results = trainer.evaluate()
print("Evaluation Results:", results)

# STEP 9: Deploy Simple Gradio Interface
labels = ["World", "Sports", "Business", "Sci/Tech"]

def predict_news(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    outputs = model(**inputs)
    preds = torch.softmax(outputs.logits, dim=1)
    return {labels[i]: float(preds[0][i]) for i in range(len(labels))}

interface = gr.Interface(fn=predict_news, inputs="text", outputs="label", title="News Topic Classifier (BERT)")
interface.launch(share=True)


DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 120000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 7600
    })
})


Map:   0%|          | 0/120000 [00:00<?, ? examples/s]

Map:   0%|          | 0/7600 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.0/42.0 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.3/11.3 MB[0m [31m66.6 MB/s[0m eta [36m0:00:00[0m
[?25h

TypeError: TrainingArguments.__init__() got an unexpected keyword argument 'evaluation_strategy'