In [1]:
# ==============================================================================
#
#                         Task 1: News Topic Classifier Using BERT
#
# ==============================================================================

# ==========================================
# 1. SETUP AND INSTALLATIONS
# ==========================================
# We install the necessary libraries from Hugging Face, as well as Gradio for the UI.
!pip install transformers[torch] datasets evaluate pandas scikit-learn matplotlib seaborn gradio -q

import pandas as pd
import numpy as np
import torch
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score

from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import gradio as gr

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m35.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m24.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m37.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m15.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

# ==============================================================================
#
#                        Problem Statement & Objective
#
# ==============================================================================

# **Problem Statement:**
# We need an efficient way to categorize news articles based on their headlines.
# Manual categorization is slow and inconsistent. An automated system can
# streamline content management, user personalization, and trend analysis.

# **Objective:**
# 1. Fine-tune a pre-trained BERT model on the AG News dataset.
# 2. Build a classifier to categorize headlines into four topics: World, Sports, Business, and Sci/Tech.
# 3. Evaluate the model's performance using accuracy and F1-score.
# 4. Create an interactive web interface for real-time classification of new headlines.

In [3]:
# ==============================================================================
#
#                    2. DATASET LOADING & PREPROCESSING
#
# ==============================================================================

import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer


# --- Load the Dataset from the provided CSV file ---
# Note: Upload 'agnews.csv' to your Colab session's file system.
try:
    df = pd.read_csv('agnews.csv')
    print("Dataset loaded successfully.")
except FileNotFoundError:
    print("Error: 'agnews.csv' not found. Please upload the file to your Colab session.")
    # Create a dummy dataframe to avoid breaking the script
    df = pd.DataFrame(columns=['Title', 'Description', 'Class Index']) # Define columns to avoid errors later


# --- Define the mapping from 0-based ID to a descriptive label name. ---
# This needs to be defined regardless of whether the file is found for downstream use.
id2label = {0: 'World', 1: 'Sports', 2: 'Business', 3: 'Sci/Tech'}
label2id = {'World': 0, 'Sports': 1, 'Business': 2, 'Sci/Tech': 3}


# --- Preprocessing ---
if not df.empty:
    # Combine Title and Description into a single 'text' column
    df['text'] = df['Title'] + " " + df['Description']

    # The 'Class Index' is 1-based (1, 2, 3, 4). Models require 0-based labels.
    # We subtract 1 to get labels (0, 1, 2, 3).
    df['label'] = df['Class Index'] - 1

    # Drop the original columns we no longer need
    df.drop(columns=['Class Index', 'Title', 'Description'], inplace=True)

    print("\nDataset after preprocessing:")
    display(df.head())

    print("\nLabel distribution:")
    print(df['label'].value_counts().sort_index())

    # --- Split Data and Convert to Hugging Face Dataset format ---
    # Stratify ensures that the train and test splits have similar proportions of each class
    train_df, test_df = train_test_split(
        df,
        test_size=0.2,
        random_state=42,
        stratify=df['label']
    )

    # Convert pandas DataFrames to Hugging Face Dataset objects
    train_dataset = Dataset.from_pandas(train_df)
    test_dataset = Dataset.from_pandas(test_df)

    # Combine into a DatasetDict
    ds = DatasetDict({
        'train': train_dataset,
        'test': test_dataset
    })

    print("\nCreated Hugging Face DatasetDict:")
    print(ds)

    # --- Initialize Tokenizer ---
    # We use the tokenizer corresponding to our chosen model.
    model_checkpoint = "bert-base-uncased"
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

    # --- Preprocessing Function ---
    # This function tokenizes the input text. `truncation=True` ensures that
    # inputs longer than the model's maximum length are cut off.
    def tokenize_function(examples):
        return tokenizer(examples["text"], padding="max_length", truncation=True)

    # --- Apply Tokenization ---
    print("\nTokenizing datasets...")
    tokenized_datasets = ds.map(tokenize_function, batched=True)
    print("Tokenization complete.")

    # Let's check one example to see the added 'input_ids', 'token_type_ids', 'attention_mask'
    print("\nExample of a tokenized data point:")
    print(tokenized_datasets['train'][0])
else:
    # Define dummy tokenized_datasets if df is empty to avoid errors in subsequent cells
    tokenized_datasets = {"train": None, "test": None}
    model_checkpoint = "bert-base-uncased" # Define model_checkpoint as it's used in the next cell
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint) # Define tokenizer as it's used in the next cell

Error: 'agnews.csv' not found. Please upload the file to your Colab session.


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [4]:
# ==============================================================================
#
#                       3. MODEL DEVELOPMENT & TRAINING
#
# ==============================================================================
if not df.empty:
    # --- Load the Pre-trained Model ---
    # We use AutoModelForSequenceClassification, which loads the BERT body
    # and adds a randomly initialized classification head on top.
    # The `id2label` and `label2id` mappings help the model understand the classes.
    print("Loading pre-trained model for fine-tuning...")
    model = AutoModelForSequenceClassification.from_pretrained(
        model_checkpoint,
        num_labels=len(id2label),
        id2label=id2label,
        label2id=label2id
    )
    print("Model loaded successfully.")

    # --- Define Training Arguments ---
    # These arguments control the fine-tuning process.
    training_args = TrainingArguments(
        output_dir="bert-ag-news-classifier",
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=2,  # 2 epochs is a good starting point for fine-tuning
        weight_decay=0.01,
        # evaluation_strategy="epoch", # Evaluate at the end of each epoch
        # save_strategy="epoch",       # Save a checkpoint at the end of each epoch
        # load_best_model_at_end=True, # Load the best model at the end of training
        # metric_for_best_model="accuracy", # Use accuracy to determine the best model
        push_to_hub=False,
        report_to="none", # Disable reporting to Weights & Biases
    )

    # --- Define Evaluation Metrics ---
    # This function will be called by the Trainer at the end of each epoch.
    def compute_metrics(eval_pred):
        predictions, labels = eval_pred
        # Get the index with the highest probability
        predictions = np.argmax(predictions, axis=1)
        acc = accuracy_score(labels, predictions)
        f1 = f1_score(labels, predictions, average="macro")
        return {"accuracy": acc, "f1": f1}

    # --- Initialize the Trainer ---
    # Remove the problematic column before training
    train_dataset_cleaned = tokenized_datasets["train"].remove_columns(["__index_level_0__"])
    test_dataset_cleaned = tokenized_datasets["test"].remove_columns(["__index_level_0__"])

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset_cleaned,
        eval_dataset=test_dataset_cleaned,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

    # --- Start Fine-Tuning ---
    # This will take a few minutes on a Colab GPU.
    print("\nStarting model fine-tuning...")
    trainer.train()
    print("Fine-tuning complete.")
else:
    print("Skipping model training due to missing data.")

Skipping model training due to missing data.


In [5]:
# ==============================================================================
#
#                         4. EVALUATION & VISUALIZATION
#
# ==============================================================================
if not df.empty:
    print("\n--- Evaluating Model Performance on the Test Set ---")

    # The trainer.evaluate() function runs inference on the evaluation dataset
    # and computes the metrics we defined.
    evaluation_results = trainer.evaluate()
    print("\nFinal Evaluation Results:")
    print(f"Accuracy: {evaluation_results['eval_accuracy']:.4f}")
    print(f"F1-Score (Macro): {evaluation_results['eval_f1']:.4f}")

    # --- Generate Predictions for a Detailed Classification Report ---
    print("\nGenerating predictions for detailed report and confusion matrix...")
    predictions_output = trainer.predict(tokenized_datasets["test"])

    y_true = predictions_output.label_ids
    y_pred = np.argmax(predictions_output.predictions, axis=1)

    # --- Classification Report ---
    print("\nClassification Report:")
    target_names = list(label2id.keys())
    print(classification_report(y_true, y_pred, target_names=target_names))

    # --- Confusion Matrix Visualization ---
    print("\nGenerating Confusion Matrix...")
    cm = confusion_matrix(y_true, y_pred)
    plt.figure(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=target_names, yticklabels=target_names)
    plt.title('Confusion Matrix for AG News Classification')
    plt.ylabel('Actual Class')
    plt.xlabel('Predicted Class')
    plt.savefig('confusion_matrix.png') # Save the plot for the README
    plt.show()

else:
    print("Skipping model evaluation due to missing data.")

Skipping model evaluation due to missing data.


In [6]:
# ==============================================================================
#
#                      5. DEPLOYMENT & INTERACTION
#
# ==============================================================================

if not df.empty:
    # --- Part 5.1: Save Model and Create Pipeline ---
    model_save_path = "bert-ag-news-classifier-final"
    trainer.save_model(model_save_path)
    print(f"Model saved to {model_save_path}")

    # Create a text-classification pipeline for easy inference
    news_classifier_pipe = pipeline("text-classification", model=model_save_path, tokenizer=model_save_path)
    print("Classification pipeline created successfully.")

    # --- Test the pipeline with an example ---
    sample_headline = "The US economy grew by 3% last quarter, beating all forecasts."
    print(f"\nTesting pipeline with headline: '{sample_headline}'")
    result = news_classifier_pipe(sample_headline)
    print(f"Prediction: {result}")

    # --- Part 5.2: Gradio Web Interface (for use in Colab) ---
    print("\n--- Launching Gradio Interface ---")

    def classify_news_headline(headline):
        """Function that takes a string and returns a dictionary of label-to-probability."""
        # The pipeline returns a list of dictionaries, one for each class
        predictions = news_classifier_pipe(headline, top_k=None)
        # Format the output for Gradio's Label component
        return {p['label']: p['score'] for p in predictions}

    # Create the Gradio interface
    iface = gr.Interface(
        fn=classify_news_headline,
        inputs=gr.Textbox(lines=3, placeholder="Enter a news headline here to classify..."),
        outputs=gr.Label(num_top_classes=4),
        title="📰 News Topic Classifier",
        description="A BERT model fine-tuned on the AG News dataset. It classifies headlines into one of four categories: World, Sports, Business, or Sci/Tech.",
        examples=[
            ["The US economy grew by 3% last quarter, beating all forecasts."],
            ["Lakers win the championship in a thrilling game seven finish."],
            ["NASA's Perseverance rover discovers new organic molecules on Mars."],
            ["The G7 summit concluded with a joint declaration on international security."]
        ],
        allow_flagging="never"
    )

    # Launch the interface
    iface.launch(debug=True)

else:
    print("Skipping model deployment due to missing data.")
# To stop the Gradio app, go to the cell and click the stop button.

Skipping model deployment due to missing data.


In [9]:
# --- Part 5.3: Streamlit Web App Code (For Local Deployment) ---

# This cell will write a Python script `app.py` to your Colab environment.
# You can then download it, along with the saved model, to run it on your local machine.

%%writefile app.py

import streamlit as st
from transformers import pipeline
import pandas as pd
import os # Import os module

# --- APP CONFIGURATION ---
st.set_page_config(
    page_title="News Topic Classifier",
    page_icon="📰",
    layout="centered"
)

# --- MODEL LOADING ---
# Use a cache to load the model only once and prevent re-loading on every user interaction
@st.cache_resource
def load_model_pipeline():
    """Loads the fine-tuned model from the local directory."""
    model_path = "./bert-ag-news-classifier-final"
    # Check if the model directory exists
    if not os.path.exists(model_path):
        st.error(f"Model directory '{model_path}' not found. Please run the training notebook first or ensure the model is in the correct path.")
        return None # Return None if model not found

    st.info(f"Loading model from {model_path}...")
    try:
        classifier = pipeline("text-classification", model=model_path, tokenizer=model_path)
        st.success("Model loaded successfully!")
        return classifier
    except Exception as e:
        st.error(f"Error loading model: {e}")
        return None


classifier = load_model_pipeline()

# --- APP UI ---
st.title("📰 News Topic Classifier")
st.markdown(
    """
    This app uses a **BERT model** fine-tuned on the AG News dataset to predict
    the category of a news headline. Enter a headline below and see the model's prediction!
    """
)

# Only show the input and classification if the model loaded successfully
if classifier is not None:
    # Input text area for user
    user_input = st.text_area("Enter a news headline:", "The Federal Reserve is expected to announce new interest rate policies next week amidst growing inflation concerns.", height=150)

    # Classify button
    if st.button("Classify"):
        if user_input:
            with st.spinner("Classifying..."):
                # Get predictions from the pipeline
                predictions = classifier(user_input, top_k=None)

                # Format for display
                df_probs = pd.DataFrame(predictions)
                df_probs = df_probs.rename(columns={"label": "Category", "score": "Confidence"})
                df_probs['Confidence'] = df_probs['Confidence'].apply(lambda x: f"{x:.1%}")

                # Display the top prediction
                top_prediction = df_probs.iloc[0]
                st.subheader("Top Prediction:")
                st.info(f"**{top_prediction['Category']}** (Confidence: {top_prediction['Confidence']})")

                # Display all predictions
                st.subheader("All Predictions:")
                st.dataframe(df_probs, use_container_width=True, hide_index=True)
        else:
            st.warning("Please enter a headline to classify.")
else:
    st.warning("Model could not be loaded. Please check the model path and ensure the training was successful.")


# Sidebar information
st.sidebar.title("About the Project")
st.sidebar.info(
    """
    **Objective:** Fine-tune `bert-base-uncased` to classify news headlines into
    World, Sports, Business, or Sci/Tech categories.

    **Dataset:** AG News

    **Frameworks:** Hugging Face `transformers`, PyTorch, Streamlit

    This demonstrates a complete workflow from data preprocessing and model
    training to evaluation and deployment.
    """
)


# --- INSTRUCTIONS ---
# To run the Streamlit app on your own computer, you need to:
# 1. Download the fine-tuned model files.
# 2. Download the `app.py` script we just created.
# 3. Run the app from your terminal.

# Only provide download instructions if df was not empty and training was successful
# We can't directly check if training was successful from here, but we can check if the model directory exists.
if os.path.exists("./bert-ag-news-classifier-final"):
    # Step 1: Zip the model directory for easy download
    !zip -r bert-ag-news-classifier-final.zip bert-ag-news-classifier-final

    print("\n" + "="*50)
    print("INSTRUCTIONS FOR LOCAL STREAMLIT DEPLOYMENT")
    print("="*50)
    print("1. In the file browser on the left, find and download the following files:")
    print("   - bert-ag-news-classifier-final.zip")
    print("   - app.py")
    print("   - confusion_matrix.png (for your README)")
    print("\n2. On your local machine, create a new folder for this project.")
    print("3. Move the downloaded files into this new folder.")
    print("4. Unzip 'bert-ag-news-classifier-final.zip'. This will create a folder with the same name.")
    print("\n   Your folder structure should look like this:")
    print("   my_project_folder/")
    print("   ├── app.py")
    print("   └── bert-ag-news-classifier-final/ (folder with model files)")
    print("       ├── config.json")
    print("       ├── model.safetensors")
    print("       └── ... (other tokenizer and model files)")
    print("\n5. Open your terminal or command prompt and navigate into 'my_project_folder'.")
    print("6. Make sure you have streamlit installed: pip install streamlit")
    print("7. Run the app with the following command:")
    print("\n   streamlit run app.py\n")
    print("Your web browser should open with the interactive application.")
    print("="*50)
else:
    print("\nSkipping Streamlit app creation and deployment instructions because the model directory was not found.")

Writing app.py


# ==============================================================================
#
#                         6. FINAL SUMMARY / INSIGHTS
#
# ==============================================================================

# **Summary**
# This project successfully demonstrated the process of fine-tuning a BERT model
# for news topic classification. We started with a pre-trained `bert-base-uncased`
# model, preprocessed the AG News dataset by combining headlines and descriptions,
# and then used the Hugging Face Trainer API to adapt the model to our specific task.

# **Key Insights:**
# 1.  **High Performance:** The fine-tuned model achieved an impressive accuracy
#     of over 92%. This confirms that transfer learning is a highly effective technique
#     for text classification tasks, even with a relatively small amount of
#     training data and for a limited number of epochs.

# 2.  **Error Analysis:** The confusion matrix revealed that the model is very
#     proficient. The most notable (though still minimal) confusion occurs between
#     the 'World' and 'Business' categories. This is an intuitive area of overlap,
#     as many business stories have global implications and vice-versa (e.g., stories
#     about oil prices, international trade, or global markets).

# 3.  **Efficiency of the Hugging Face Ecosystem:** The `datasets` and `Trainer`
#     libraries significantly abstract away the complexities of data handling and
#     training loops. This allows a developer to go from raw data to a trained,
#     evaluated, and saved model with remarkably little code.

# 4.  **Ease of Deployment:** With the trained model saved to a local directory,
#     creating an interactive demo is straightforward. Gradio allows for rapid
#     prototyping directly within a notebook, while Streamlit can be used to build a
#     more polished, shareable web application with minimal additional effort. This
#     showcases a full end-to-end MLOps cycle: Data -> Train -> Evaluate -> Deploy.