<a href="https://colab.research.google.com/github/amitrege/notebooks/blob/master/hf_text_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install datasets transformers evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [None]:
#!/usr/bin/env python
# text_classification_pipeline_part1.py

"""
Part 1:
 - Loads a small text dataset (IMDB) using 'datasets'
 - Splits into a smaller subset for quick experimentation
 - Sets up multiple Hugging Face classification pipelines
"""

import random
import numpy as np
from datasets import load_dataset
from transformers import pipeline

def load_and_prepare_data(num_samples=200, seed=42):
    """
    Loads the IMDB dataset from Hugging Face Datasets and carves out two
    small, non-overlapping subsets for quick experimentation.

    Both subsets are drawn from IMDB's 'train' split (to keep things small
    and quick): after shuffling, rows [0, num_samples) become the 'train'
    subset and rows [num_samples, 2 * num_samples) become the 'test' subset.

    Args:
    - num_samples: number of examples in EACH subset; the 'train' split must
      therefore contain at least 2 * num_samples rows.
    - seed: seed forwarded to `Dataset.shuffle` so the same rows are picked
      on every run (the default reproduces the original fixed seed of 42).

    Returns:
    - train_texts, train_labels
    - test_texts, test_labels
    Each is a plain Python list.
    """
    dataset = load_dataset("imdb")

    # Shuffle so we don't just pick the first rows of the split.
    shuffled = dataset["train"].shuffle(seed=seed)

    train_data = shuffled.select(range(num_samples))
    test_data = shuffled.select(range(num_samples, 2 * num_samples))

    # Materialise the columns as real lists. Depending on the installed
    # `datasets` version, column access may return a lazy column object
    # instead of a list, and downstream code (e.g. a pipeline call) decides
    # how to batch its input based on it being a list.
    train_texts = list(train_data["text"])
    train_labels = list(train_data["label"])

    test_texts = list(test_data["text"])
    test_labels = list(test_data["label"])

    return train_texts, train_labels, test_texts, test_labels

def create_pipelines(max_length=512):
    """
    Creates two text-classification pipelines for comparison.

    Both pipelines are configured to:
    - return the scores of every class (`top_k=None`, the replacement for
      the deprecated `return_all_scores=True`), and
    - truncate inputs to `max_length` tokens, because full-length reviews can
      be longer than the models' position-embedding limit and otherwise fail
      with a tensor size mismatch during the forward pass.

    Args:
    - max_length: maximum number of tokens kept per input text.

    Returns a dict of pipeline_name -> pipeline_object
    """
    # Keyword arguments shared by both pipelines. An explicit max_length is
    # given (rather than truncation=True alone) so truncation does not depend
    # on each tokenizer declaring its own maximum length.
    shared_kwargs = {
        "top_k": None,
        "truncation": True,
        "max_length": max_length,
    }

    # DistilBERT sentiment model
    distilbert_pipeline = pipeline(
        "text-classification",
        model="distilbert-base-uncased-finetuned-sst-2-english",
        **shared_kwargs,
    )

    # Another model (a RoBERTa-based sentiment model)
    roberta_pipeline = pipeline(
        "text-classification",
        model="cardiffnlp/twitter-roberta-base-sentiment-latest",
        **shared_kwargs,
    )

    return {
        "DistilBERT": distilbert_pipeline,
        "RoBERTa": roberta_pipeline,
    }


if __name__ == "__main__":
    # Demo usage: build the default-sized train/test subsets and both
    # sentiment pipelines, then report what was loaded.
    # NOTE: these assignments happen at module level, so in a notebook the
    # names below remain available as globals to later cells.
    train_texts, train_labels, test_texts, test_labels = load_and_prepare_data()
    sentiment_pipelines = create_pipelines()

    # Just print out how many samples we have for sanity check
    print(f"Train samples: {len(train_texts)} | Test samples: {len(test_texts)}")
    # Names of the pipelines returned by create_pipelines()
    print("Pipelines loaded:", list(sentiment_pipelines.keys()))


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

unsupervised-00000-of-00001.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Device set to use cuda:0


config.json:   0%|          | 0.00/929 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Device set to use cuda:0


Train samples: 200 | Test samples: 200
Pipelines loaded: ['DistilBERT', 'RoBERTa']


In [None]:
#!/usr/bin/env python
# text_classification_pipeline_part2.py

"""
Part 2:
 - Loads data & pipelines from Part 1
 - Runs inference on the test set
 - Computes accuracy (F1 is listed as a goal but is not computed in this cell)
 - Compares the two models
 - Performs a brief error analysis
"""

import numpy as np
import evaluate
from evaluate import load

def predict_labels(pipeline_obj, texts, max_length=512):
    """
    Run a text-classification pipeline over `texts` and return one binary
    label per text. For IMDB, label 0 = negative, 1 = positive.

    Args:
    - pipeline_obj: callable invoked as
      `pipeline_obj(texts, truncation=True, max_length=max_length)`. It must
      return one entry per text; each entry is either a list of
      {'label', 'score'} dicts (all class scores) or a single such dict.
    - texts: a single string or an iterable of strings.
    - max_length: token limit forwarded to the pipeline so over-long inputs
      are truncated instead of overflowing the model's maximum sequence
      length.

    Returns:
    - list of ints (0 or 1), in the same order as `texts`. An empty input
      yields an empty list without calling the pipeline.
    """
    # Accept a bare string the same way the pipeline itself would, and turn
    # any other iterable into a concrete list.
    texts = [texts] if isinstance(texts, str) else list(texts)
    if not texts:
        return []

    preds = pipeline_obj(texts, truncation=True, max_length=max_length)

    negative_names = ("NEGATIVE", "LABEL_0")
    predicted_labels = []
    for item in preds:
        # A pipeline that only returns its top class yields a single dict per
        # text; normalise to the list-of-dicts shape.
        if isinstance(item, dict):
            item = [item]

        # The target task is binary. If the model also scores a 'neutral'
        # class, leave it out of the comparison so a neutral winner is not
        # silently counted as positive; fall back to all scores if nothing
        # else is available.
        candidates = [s for s in item if s['label'].upper() != "NEUTRAL"] or item

        # Example item:
        # [{'label': 'NEGATIVE', 'score': 0.99}, {'label': 'POSITIVE', 'score': 0.01}]
        best = max(candidates, key=lambda s: s['score'])
        predicted_labels.append(0 if best['label'].upper() in negative_names else 1)
    return predicted_labels

def compute_accuracy(pred_labels, true_labels):
    """
    Simple accuracy calculation: the fraction of positions where the
    predicted label equals the true label.

    Args:
    - pred_labels: iterable of predicted labels.
    - true_labels: iterable of reference labels, same length as pred_labels.

    Returns:
    - float in [0, 1]; 0.0 when both inputs are empty (instead of dividing
      by zero).

    Raises:
    - ValueError: if the two inputs differ in length. `zip` would otherwise
      silently ignore the surplus entries and report a misleading score.
    """
    pred_labels = list(pred_labels)
    true_labels = list(true_labels)

    if len(pred_labels) != len(true_labels):
        raise ValueError(
            f"Length mismatch: {len(pred_labels)} predictions vs "
            f"{len(true_labels)} true labels"
        )
    if not true_labels:
        return 0.0

    correct = sum(p == t for p, t in zip(pred_labels, true_labels))
    return correct / len(true_labels)

def main_evaluation():
    """
    Evaluate every pipeline on the test subset and print a short report.

    Steps:
    1. Load the data (only the test texts/labels are used) and the pipelines.
    2. For each pipeline, predict labels for the test texts and print the
       accuracy.
    3. Print a side-by-side accuracy comparison.
    4. Print up to five misclassified test samples for the DistilBERT model.
    """
    # The train subset is returned as well but is not needed for evaluation.
    _, _, test_texts, test_labels = load_and_prepare_data()
    classifiers = create_pipelines()

    # model name -> (accuracy, predictions)
    results = {}
    for model_name, classifier in classifiers.items():
        print(f"\nEvaluating {model_name} on test set...")
        predictions = predict_labels(classifier, test_texts)
        accuracy = compute_accuracy(predictions, test_labels)
        print(f"Accuracy = {accuracy:.3f}")
        results[model_name] = (accuracy, predictions)

    # Side-by-side comparison
    print("\n=== Model Comparison ===")
    for model_name, (acc, _) in results.items():
        print(f"{model_name} Accuracy: {acc:.3f}")

    # Brief error analysis for a single model
    chosen_model = "DistilBERT"
    print(f"\nError Analysis for {chosen_model}...")
    chosen_preds = results[chosen_model][1]

    # Indices of the test samples where prediction and truth disagree
    mistakes = [
        idx
        for idx, (pred, true) in enumerate(zip(chosen_preds, test_labels))
        if pred != true
    ]
    print(f"Found {len(mistakes)} mistakes out of {len(test_labels)}.")

    # Show at most the first five, with the text cut to 200 characters
    for i in mistakes[:5]:
        print(f"\n--- Test Sample {i} ---")
        print(f"Text: {test_texts[i][:200]}...")
        print(f"True label: {test_labels[i]} (0=neg, 1=pos)")
        print(f"Pred label: {chosen_preds[i]}")

if __name__ == "__main__":
    # Entry point: run the full evaluation (loads data and pipelines,
    # prints per-model accuracy, the comparison, and the error analysis).
    main_evaluation()

Device set to use cuda:0
Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0



Evaluating DistilBERT on test set...


Token indices sequence length is longer than the specified maximum sequence length for this model (816 > 512). Running this sequence through the model will result in indexing errors


RuntimeError: The size of tensor a (816) must match the size of tensor b (512) at non-singleton dimension 1