In [1]:
!pip install transformers datasets torch scikit-learn pandas matplotlib seaborn tqdm




In [2]:
from google.colab import drive
# Mount Google Drive at /content/drive; the dataset is read from this mount point further down.
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import pandas as pd
# Load the dataset spreadsheet (bbc_data.xlsx) from the mounted Drive into a DataFrame.
# NOTE(review): the absolute Drive path is hardcoded, so the notebook only runs for an
# account that has this exact file in MyDrive.
df = pd.read_excel('/content/drive/MyDrive/bbc_data.xlsx')


In [4]:
# Quick sanity check of the loaded frame: dimensions, column names, then the first rows.
print(f"Shape: {df.shape}")
print(f"Columns: {df.columns.tolist()}")
df.head()

Shape: (2225, 2)
Columns: ['category', 'text']


Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...
4,entertainment,ocean s twelve raids box office ocean s twelve...


In [5]:
import re
from sklearn.preprocessing import LabelEncoder

def clean_text(text):
    """Normalize one raw article string.

    Removes URLs (``http`` up to the next whitespace), strips anything that
    looks like an HTML/XML tag, collapses every run of whitespace into a
    single space and trims both ends.

    Args:
        text: The raw cell value. Non-string values are converted with
            ``str()``.

    Returns:
        The cleaned string. ``None`` and float NaN (i.e. missing cells) yield
        ``""`` rather than the literal strings ``"None"`` / ``"nan"``, so a
        missing value can still be recognized as empty afterwards.
    """
    # str(float("nan")) is "nan": without this guard a missing cell would be
    # turned into a real-looking one-word document. NaN is the only float
    # that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    text = str(text)
    text = re.sub(r"http\S+", "", text)   # drop URLs
    text = re.sub(r"<.*?>", "", text)     # drop tag-like markup (non-greedy, single line)
    text = re.sub(r"\s+", " ", text)      # collapse whitespace runs
    return text.strip()

# Drop rows with a missing text or category BEFORE cleaning: clean_text()
# stringifies its input, so cleaning first could turn a missing value into
# ordinary text that dropna would no longer recognize.
df = df.dropna(subset=['text', 'category'])

# Normalize the article text, then discard rows that ended up empty.
# .copy() makes the filtered frame independent, so the column assignment
# below does not write into a slice of another frame.
df = df.assign(text=df['text'].apply(clean_text))
df = df[df['text'].str.strip() != ""].copy()

# Encode labels: map each category name to an integer id (alphabetical class order).
le = LabelEncoder()
df['label_id'] = le.fit_transform(df['category'])
num_classes = len(le.classes_)
print("Classes:", list(le.classes_))


Classes: ['business', 'entertainment', 'politics', 'sport', 'tech']


In [6]:
from sklearn.model_selection import train_test_split

# Stratified 80/10/10 split: hold out 20% first, then halve that hold-out
# into validation and test. Both steps stratify on the label and use a fixed seed.
train_df, temp_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df['label_id'],
    random_state=42,
)
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    stratify=temp_df['label_id'],
    random_state=42,
)

print(f"Train: {len(train_df)} Validation: {len(val_df)} Test: {len(test_df)}")


Train: 1780 Validation: 222 Test: 223


In [7]:
from transformers import AutoTokenizer
from datasets import Dataset

# Checkpoint identifier; it is reused later when the classification model is loaded,
# so tokenizer and model come from the same checkpoint.
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
    """Tokenize the ``text`` field of a batch of examples.

    Every sequence is truncated and padded to exactly 256 tokens, using the
    module-level ``tokenizer``.

    Args:
        examples: A batch mapping that provides a ``'text'`` entry.

    Returns:
        Whatever the tokenizer returns for that batch of texts.
    """
    encoded = tokenizer(
        examples['text'],
        max_length=256,
        padding='max_length',
        truncation=True,
    )
    return encoded

def _prepare_split(split_df):
    """Turn one pandas split into a tokenized, torch-formatted ``Dataset``.

    Args:
        split_df: DataFrame with at least the columns ``text`` and ``label_id``.

    Returns:
        A ``Dataset`` whose torch-formatted columns are ``input_ids``,
        ``attention_mask`` and ``labels``.
    """
    # preserve_index=False: the splits carry a shuffled, non-contiguous pandas
    # index that from_pandas would otherwise store as an extra
    # ``__index_level_0__`` column.
    dataset = Dataset.from_pandas(split_df[['text', 'label_id']], preserve_index=False)
    dataset = dataset.map(tokenize_function, batched=True)
    # The label column is renamed to "labels", the name used for it from here on.
    dataset = dataset.rename_column("label_id", "labels")
    dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
    return dataset


# Same pipeline for all three splits, processed in the order train, validation, test.
train_dataset = _prepare_split(train_df)
val_dataset = _prepare_split(val_df)
test_dataset = _prepare_split(test_df)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/1780 [00:00<?, ? examples/s]

Map:   0%|          | 0/222 [00:00<?, ? examples/s]

Map:   0%|          | 0/223 [00:00<?, ? examples/s]

In [13]:
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

# Load the pretrained checkpoint with a sequence-classification head that has one output per category.
# The classifier weights are not part of the checkpoint and are newly initialized
# (see the warning in this cell's output); they are learned during fine-tuning.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_classes)

# Define metrics
def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged F1 for one evaluation pass.

    Args:
        eval_pred: A pair ``(logits, labels)``; the predicted class is the
            argmax of the logits over the last axis.

    Returns:
        A dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    raw_scores, gold = eval_pred
    predicted = np.argmax(raw_scores, axis=-1)
    return {
        "accuracy": accuracy_score(gold, predicted),
        "f1": f1_score(gold, predicted, average='macro'),
    }

# Training arguments
training_args = TrainingArguments(
    output_dir="./bbc_results",
    # Evaluate and checkpoint once per epoch; load_best_model_at_end needs
    # the evaluation and save strategies to line up.
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    # After training, reload the checkpoint with the highest validation macro-F1
    # (the "f1" key returned by compute_metrics).
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    logging_dir='./bbc_logs',
    # No intermediate training-loss logging; this is why the results table
    # shows "No log" in the Training Loss column.
    logging_strategy="no",
    # The STRING "none" is what switches off every reporting integration
    # (wandb, tensorboard, ...). Python None is not equivalent: depending on
    # the transformers version it falls back to all installed integrations.
    report_to="none",
)

# Trainer: ties together the model, the training configuration, the train/validation
# splits and the metric function.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    # `processing_class` is the current name of the former `tokenizer` argument,
    # which is deprecated and scheduled for removal in transformers v5.
    # NOTE(review): requires a transformers release that already knows
    # `processing_class` — confirm against the installed version.
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


In [11]:
import os
# NOTE(review): this cell is placed after the cell that builds TrainingArguments and the
# Trainer, and its execution count (11) is lower than that cell's (13). On a fresh
# top-to-bottom run the variable is therefore set only after the Trainer already exists.
# The recorded Trainer output also reports WANDB_DISABLED as deprecated in favor of `report_to`.
os.environ["WANDB_DISABLED"] = "true"
print("WANDB logging is disabled.")

WANDB logging is disabled.


In [14]:
# Fine-tune the model; validation metrics are computed and a checkpoint is saved
# once per epoch, as configured in training_args.
trainer.train()


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.06898,0.981982,0.981683
2,No log,0.057663,0.986486,0.986877
3,No log,0.050429,0.990991,0.991194


TrainOutput(global_step=669, training_loss=0.1701623890966578, metrics={'train_runtime': 383.8826, 'train_samples_per_second': 13.911, 'train_steps_per_second': 1.743, 'total_flos': 702525440378880.0, 'train_loss': 0.1701623890966578, 'epoch': 3.0})

In [15]:
# Score the trained model on the held-out test split and show the raw metric dict.
metrics = trainer.evaluate(eval_dataset=test_dataset)
print("Test set performance:", metrics, sep="\n")


Test set performance:
{'eval_loss': 0.12743814289569855, 'eval_accuracy': 0.9641255605381166, 'eval_f1': 0.9638138919903627, 'eval_runtime': 2.8009, 'eval_samples_per_second': 79.618, 'eval_steps_per_second': 4.998, 'epoch': 3.0}


In [16]:
import torch

def predict_category(text, model, tokenizer, label_encoder):
    """Predict the category label for one text or for a batch of texts.

    Args:
        text: A single string, or an iterable of strings to classify together.
        model: Callable model exposing ``eval()``, ``device`` and returning an
            object with a ``logits`` attribute when called with the tokenizer
            output as keyword arguments.
        tokenizer: Callable that encodes a list of strings into tensors
            (``return_tensors="pt"``) and whose result supports ``.to(device)``.
        label_encoder: Object whose ``inverse_transform`` maps class ids back
            to category labels.

    Returns:
        The predicted label when ``text`` is a string; a list of labels (one
        per input, in order) when ``text`` is an iterable of strings. An empty
        iterable yields an empty list.
    """
    is_single = isinstance(text, str)
    texts = [text] if is_single else list(text)
    if not texts:
        # Nothing to classify; avoid calling the tokenizer with an empty batch.
        return []

    model.eval()  # inference mode for layers such as dropout
    inputs = tokenizer(
        texts,
        return_tensors="pt",
        truncation=True,
        padding=True,  # pad to the longest text in this batch
        max_length=256
    ).to(model.device)
    with torch.no_grad():
        logits = model(**inputs).logits
    # .tolist() works for any batch size, whereas .item() only accepts a
    # single-element tensor and would raise for more than one text.
    predicted_class_ids = logits.argmax(dim=-1).tolist()
    predicted_labels = label_encoder.inverse_transform(predicted_class_ids)
    if is_single:
        return predicted_labels[0]
    return list(predicted_labels)

# Example predictions: one short headline-style sentence per expected topic.
examples = [
    "The government has announced new tax reforms for middle-class families.",
    "Manchester United defeated Chelsea 2-1 in last night's thrilling match.",
    "Apple launched its new iPhone model with improved camera features."
]

for text in examples:
    category = predict_category(text, model, tokenizer, le)
    # The emoji are written as proper Unicode characters; the previous literal
    # contained their mis-decoded byte sequences.
    print(f"\n📰 Text: {text}\n➡️ Predicted Category: {category}")



📰 Text: The government has announced new tax reforms for middle-class families.
➡️ Predicted Category: business

📰 Text: Manchester United defeated Chelsea 2-1 in last night's thrilling match.
➡️ Predicted Category: sport

📰 Text: Apple launched its new iPhone model with improved camera features.
➡️ Predicted Category: tech
