<a href="https://colab.research.google.com/github/adnan1404-ds/News-Classification-using-BERT/blob/main/AG_News_Classification_using_BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup Environment

In [27]:
!pip install transformers datasets evaluate torch scikit-learn gradio streamlit
!pip install --upgrade transformers




# Tokenize and Preprocess Data

#### Load Dataset and Tokenizer

In [28]:
from datasets import load_dataset
from transformers import AutoTokenizer

# Load the AG News dataset; its "train" and "test" splits are used below
# for fine-tuning and evaluation respectively.
dataset = load_dataset("ag_news")

# Load the tokenizer that belongs to the same "bert-base-uncased" checkpoint
# the classification model is initialised from.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

#### Create a Tokenize Function

In [29]:
def tokenize_function(examples):
    """Tokenize the "text" field of a batch of dataset examples.

    Sequences are truncated and padded to the tokenizer's maximum length, so
    every encoded row has the same length.

    Args:
        examples: A batch of dataset rows; only its "text" entry is read.

    Returns:
        The tokenizer output for the batch.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True, padding="max_length")
    return encoded


# Run the tokenizer over every split, passing rows in batches
tokenized_datasets = dataset.map(tokenize_function, batched=True)

#### Prepare the Data for Training

In [30]:
# Rename 'label' to 'labels'. The rename is guarded so this cell can be re-run:
# after the first run the 'label' column no longer exists, and an
# unconditional rename would raise.
if "label" in tokenized_datasets["train"].column_names:
    tokenized_datasets = tokenized_datasets.rename_column("label", "labels")

# Expose only the columns needed for training, formatted as PyTorch tensors
tokenized_datasets.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

# Get the training and test splits (the test split is used for evaluation)
train_dataset = tokenized_datasets["train"]
eval_dataset = tokenized_datasets["test"]

# Fine-Tune the BERT Model

#### Load the Model

In [31]:
from transformers import AutoModelForSequenceClassification

# Load the BERT model with a classification head.
# num_labels=4 matches the four topics in the id2label mapping defined later
# in this notebook.
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=4)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


#### Define Training Arguments

In [32]:
import os

from transformers import TrainingArguments

# Logging integrations (such as W&B) are switched off through `report_to`.
# The WANDB_DISABLED environment variable used previously is deprecated and
# will be removed in transformers v5.
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=2e-5,
    eval_strategy="epoch",  # evaluate at the end of every epoch
    save_strategy="epoch",  # checkpoint on the same schedule as evaluation
    load_best_model_at_end=True,
    report_to="none",
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


#### Initialize and Run the Trainer

In [33]:
import numpy as np
import evaluate
from transformers import Trainer

# The metrics function must exist before the Trainer is constructed, so it is
# defined in this cell. Otherwise a fresh-kernel "Run All" fails with a
# NameError on `compute_metrics`.
accuracy_metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")


def compute_metrics(eval_pred):
    """Compute accuracy and weighted F1 from a (predictions, labels) pair.

    Args:
        eval_pred: A pair of model scores and reference labels; the predicted
            class is the argmax of the scores along axis 1.

    Returns:
        A dict with the keys "accuracy" and "f1_score".
    """
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)

    accuracy = accuracy_metric.compute(predictions=predictions, references=labels)["accuracy"]
    f1 = f1_metric.compute(predictions=predictions, references=labels, average="weighted")["f1"]

    return {"accuracy": accuracy, "f1_score": f1}


# `processing_class` replaces the deprecated `tokenizer` argument of Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

# Baseline: evaluate the model before any fine-tuning
eval_results = trainer.evaluate()
print(eval_results)

  trainer = Trainer(


{'eval_loss': 1.407562017440796, 'eval_model_preparation_time': 0.007, 'eval_accuracy': 0.2556578947368421, 'eval_f1_score': 0.1438142987124965, 'eval_runtime': 227.0621, 'eval_samples_per_second': 33.471, 'eval_steps_per_second': 2.092}


In [None]:
# Fine-tune the model with the settings configured in `training_args`
trainer.train()

Epoch,Training Loss,Validation Loss,Model Preparation Time,Accuracy,F1 Score
1,0.1988,0.175931,0.007,0.943947,0.943898


# Evaluate the Model

#### Define the Metrics

In [None]:
import numpy as np
import evaluate

accuracy_metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")


def compute_metrics(eval_pred):
    """Turn model scores and reference labels into accuracy and weighted F1.

    Args:
        eval_pred: A (scores, labels) pair; the predicted class for each row is
            the index of its highest score along axis 1.

    Returns:
        A dict with the keys "accuracy" and "f1_score".
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)

    accuracy_result = accuracy_metric.compute(
        predictions=predicted_ids,
        references=references,
    )
    f1_result = f1_metric.compute(
        predictions=predicted_ids,
        references=references,
        average="weighted",
    )

    return {
        "accuracy": accuracy_result["accuracy"],
        "f1_score": f1_result["f1"],
    }

# Deploy the Model

#### Map Labels to Categories

In [None]:
# Class index -> human-readable topic name, one entry per model output class.
# NOTE(review): assumes this order matches the dataset's label encoding — confirm.
id2label = {0: "World", 1: "Sports", 2: "Business", 3: "Sci/Tech"}

#### Create a Prediction Function

In [None]:
def predict_topic(headline):
    """Predict the topic of a news headline with the fine-tuned model.

    Args:
        headline: The headline text to classify.

    Returns:
        The topic name looked up in `id2label` for the highest-scoring class.
    """
    import torch  # local import: torch is not imported at notebook level

    # Tokenize the input headline; truncation keeps long inputs within the
    # model's maximum sequence length instead of failing in the forward pass.
    inputs = tokenizer(headline, return_tensors="pt", truncation=True)

    # Move inputs to the same device as the model
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    # Inference only: switch off dropout and skip gradient tracking so that
    # predictions are deterministic and no autograd graph is built.
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)

    # Get the predicted class ID
    predicted_class_id = outputs.logits.argmax(dim=-1).item()

    # Map the ID to the label and return it
    return id2label[predicted_class_id]

#### Create a Gradio Interface

In [None]:
import gradio as gr

# Sample headlines offered as one-click examples in the UI
example_headlines = [
    "U.S. economy adds 350,000 jobs in January",
    "LeBron James scores 40 points in Lakers win",
    "Astronomers discover new exoplanet",
]

# Multi-line text box the user types the headline into
headline_box = gr.Textbox(lines=5, label="News Headline")

# Wire the prediction function to a simple text-in / text-out interface
interface = gr.Interface(
    fn=predict_topic,
    inputs=headline_box,
    outputs="text",
    title="News Topic Classifier",
    description="Enter a news headline to classify its topic using a fine-tuned BERT model.",
    examples=example_headlines,
)

# Start the app and request a public share link
interface.launch(share=True)

In [None]:
# Save the model and tokenizer to the same local directory so that both can be
# reloaded later from a single path.
save_directory = "./fine-tuned-model"
trainer.save_model(save_directory)
tokenizer.save_pretrained(save_directory)

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Load the saved model and tokenizer back from `save_directory`.
# This rebinds the global names `model` and `tokenizer`, so `predict_topic`
# uses the reloaded objects from here on.
model = AutoModelForSequenceClassification.from_pretrained(save_directory)
tokenizer = AutoTokenizer.from_pretrained(save_directory)

In [None]:
# Save the tokenized training and test datasets, each to its own directory
tokenized_datasets["train"].save_to_disk("./ag_news_train_dataset")
tokenized_datasets["test"].save_to_disk("./ag_news_test_dataset")

# Load the dataset later (only the training split is read back here)
from datasets import load_from_disk
loaded_train_dataset = load_from_disk("./ag_news_train_dataset")

In [None]:
import gradio as gr
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Load the saved model and tokenizer so this cell can run as a standalone app,
# independent of the training cells above.
save_directory = "./fine-tuned-model" # Replace with the path the model was saved to
model = AutoModelForSequenceClassification.from_pretrained(save_directory)
tokenizer = AutoTokenizer.from_pretrained(save_directory)

# Map label IDs to categories (one entry per model output class)
id2label = {0: "World", 1: "Sports", 2: "Business", 3: "Sci/Tech"}

def predict_topic(headline):
    """Predict the topic of a news headline with the reloaded model.

    Args:
        headline: The headline text to classify.

    Returns:
        The topic name looked up in `id2label` for the highest-scoring class.
    """
    import torch  # local import: torch is not imported at notebook level

    # Truncation keeps long inputs within the model's maximum sequence length.
    inputs = tokenizer(headline, return_tensors="pt", truncation=True)
    # Keep the inputs on the same device as the model.
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    # Inference only: switch off dropout and skip gradient tracking.
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)

    predicted_class_id = outputs.logits.argmax(dim=-1).item()
    return id2label[predicted_class_id]

# Custom styles for the app: `.container` is applied to the result panel via
# `elem_classes`, and `.title-text` to the <h1> heading rendered with gr.HTML.
css = """
.container {
    padding: 20px;
    border-radius: 10px;
    border: 1px solid #e0e0e0;
    box-shadow: 0 4px 8px rgba(0,0,0,0.1);
    background-color: #f8f9fa;
}
.title-text {
    text-align: center;
    color: #4CAF50;
    font-size: 2.5em;
    font-weight: bold;
    margin-bottom: 20px;
}
"""

# Two-column layout: headline input on the left, prediction panel on the right.
with gr.Blocks(theme=gr.themes.Soft(), css=css) as demo:
    gr.HTML("<h1 class='title-text'>News Topic Classifier</h1>")
    with gr.Row():
        with gr.Column(scale=2):
            headline_input = gr.Textbox(
                lines=5,
                label="Enter News Headline",
                placeholder="Example: 'LeBron James leads Lakers to a stunning victory...'"
            )
        with gr.Column(scale=1):
            # gr.Box was removed in Gradio 4; gr.Group is its replacement and
            # still accepts `elem_classes` for the custom `.container` style.
            with gr.Group(elem_classes="container"):
                output = gr.Label(label="Predicted Topic")
                classify_button = gr.Button("Classify Topic")

    # Run the classifier when the button is clicked and show the result
    classify_button.click(
        fn=predict_topic,
        inputs=headline_input,
        outputs=output
    )

    # Clickable sample headlines that fill the input box
    gr.Examples(
        examples=[
            "Apple's stock price soars after a positive earnings report.",
            "Astronomers discover a new galaxy at the edge of the universe.",
            "The national team won the World Cup in a dramatic penalty shootout."
        ],
        inputs=headline_input,
    )

# Start the app and request a public share link
demo.launch(share=True)