# Text classification approaches on news-like data -- Continued

Extending to fine-tuning!  Here we will do simple LLM fine-tuning with DistilBERT.

In [None]:
import numpy as np
import random
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import precision_recall_fscore_support
import torch

# For LLM fine-tuning
from datasets import Dataset as HFDataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
)

In [None]:
# Reproducibility: seed every RNG the notebook touches with one shared value.

RANDOM_SEED = 42
for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    # Python's stdlib RNG, NumPy's legacy global RNG, then PyTorch.
    seed_fn(RANDOM_SEED)

### Load and prepare data

* Load a subset of 20 Newsgroups as a stand-in for 'news' articles.
* We pick a few categories to make it multi-class.

In [None]:
# Newsgroups used as stand-ins for news topics (one class per entry).
categories = [
    "rec.autos",
    "sci.space",
    "comp.graphics",
    "talk.politics.misc",
]
# Derived from the list so the classifier head size cannot drift out of sync
# when categories are added or removed.
num_classes = len(categories)

In [None]:
# Load the train and test portions together (subset="all") for the chosen
# categories only; the notebook makes its own train/test split later.
# `remove=` asks scikit-learn to strip headers, footers and quoted replies
# from each post, leaving the body text.
dataset = fetch_20newsgroups(
    subset="all",
    categories=categories,
    remove=("headers", "footers", "quotes")
)

In [None]:
# Pull out the raw documents, their integer class ids, and the id -> name lookup.
texts, labels, target_names = dataset.data, dataset.target, dataset.target_names

In [None]:
# Peek at one raw document (headers, footers and quotes were stripped at load time).
print(texts[0])

In [None]:
# Show the first document's class id next to its class name.
# target_names must be indexed by the label itself; indexing with 0 would
# always print the first class name regardless of the document's label.
print(labels[0], ':', target_names[labels[0]])

In [None]:
# Hold out 20% of the documents for testing. Stratifying on the labels keeps
# the class proportions the same in both splits; the fixed seed makes the
# split repeatable.
split_options = dict(test_size=0.2, random_state=RANDOM_SEED, stratify=labels)
X_train, X_test, y_train, y_test = train_test_split(texts, labels, **split_options)

In [None]:
# Summarise the corpus: total size, then one line per class id and name.
n_documents, n_classes = len(texts), len(target_names)
print(f"Loaded {n_documents} documents, {n_classes} classes:")
for class_id, class_name in enumerate(target_names):
    print(f"  {class_id}: {class_name}")

### 4. Simple LLM Fine-Tuning (DistilBERT)

In [None]:
def prepare_hf_dataset(X_train, X_test, y_train, y_test):
    """Wrap the train/test texts and labels in a Hugging Face ``DatasetDict``.

    Args:
        X_train: Training documents.
        X_test: Test documents.
        y_train: Labels aligned with ``X_train``.
        y_test: Labels aligned with ``X_test``.

    Returns:
        A ``DatasetDict`` with ``"train"`` and ``"test"`` splits, each having
        a ``"text"`` and a ``"label"`` column.
    """
    splits = {"train": (X_train, y_train), "test": (X_test, y_test)}
    return DatasetDict(
        {
            split_name: HFDataset.from_dict({"text": documents, "label": targets})
            for split_name, (documents, targets) in splits.items()
        }
    )

In [None]:
def tokenize_function(examples, tokenizer, max_length=256):
    """Tokenize the ``"text"`` field of a batch to a fixed length.

    Args:
        examples: Mapping with a ``"text"`` entry holding the input text(s).
        tokenizer: Callable tokenizer; invoked with the texts plus padding,
            truncation and max_length keyword arguments.
        max_length: Length every sequence is padded or truncated to.

    Returns:
        Whatever ``tokenizer`` returns for the batch.
    """
    # Fixed-length output: longer inputs are cut, shorter ones are padded.
    tokenizer_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": max_length,
    }
    return tokenizer(examples["text"], **tokenizer_options)

In [None]:
def compute_metrics(eval_pred):
    """Compute accuracy and weighted precision/recall/F1 for an evaluation pass.

    Args:
        eval_pred: A ``(logits, labels)`` pair; the predicted class is the
            argmax of the logits over the last axis.

    Returns:
        Dict with ``"accuracy"``, ``"precision"``, ``"recall"`` and ``"f1"``.
    """
    logits, labels = eval_pred
    predicted_ids = np.argmax(logits, axis=-1)

    # "weighted" averages the per-class scores by each class's support.
    precision, recall, f1, _support = precision_recall_fscore_support(
        labels, predicted_ids, average="weighted"
    )

    return dict(
        accuracy=accuracy_score(labels, predicted_ids),
        precision=precision,
        recall=recall,
        f1=f1,
    )

In [None]:
# Checkpoint identifier shared by the tokenizer and the models loaded below,
# so the vocabulary matches the pretrained weights.
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
# Convert the scikit-learn style splits into a Hugging Face DatasetDict.
hf_datasets = prepare_hf_dataset(X_train, X_test, y_train, y_test)

In [None]:
# Inspect the DatasetDict: split names, columns and row counts.
hf_datasets

In [None]:
# Tokenize both splits in batches; the tokenizer is handed to
# tokenize_function through fn_kwargs rather than a wrapping lambda.
tokenized_datasets = hf_datasets.map(
    tokenize_function,
    batched=True,
    fn_kwargs={"tokenizer": tokenizer},
)

In [None]:
# After mapping, the tokenizer's outputs appear as extra columns next to "text" and "label".
tokenized_datasets

In [None]:
# Integer class id of the first training example.
tokenized_datasets['train'][0]['label']

In [None]:
# Token ids of the first training example (padded/truncated to max_length by tokenize_function).
tokenized_datasets['train'][0]['input_ids']

In [None]:
# Map the ids back to text to see what the model is actually fed.
tokenizer.decode(tokenized_datasets['train'][0]['input_ids'])

In [None]:
# Original text of the same example, for comparison with the decoded tokens.
print(X_train[0])

In [None]:
# Attention mask of the same example (by Hugging Face convention, 1 = real token, 0 = padding).
tokenized_datasets['train'][0]['attention_mask']

In [None]:
# Drop the raw text now that it has been tokenized; the model only needs the
# tensors. The guard keeps this cell safe to re-run: without it, a second
# execution fails because the "text" column is already gone.
if "text" in tokenized_datasets["train"].column_names:
    tokenized_datasets = tokenized_datasets.remove_columns(["text"])

In [None]:
# Make indexing return torch tensors, restricted to the listed columns.
# set_format changes the existing DatasetDict in place (nothing is reassigned).
tokenized_datasets.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "label"],
)

In [None]:
# Same example after set_format: the selected columns now come back as torch tensors.
tokenized_datasets['train'][0]

In [None]:
# Load the pretrained checkpoint with a sequence-classification head sized
# for our classes (num_labels sets the number of output logits).
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_classes,
)

In [None]:
# Hyperparameters for a short fine-tuning run: evaluate and log once per
# epoch, and write no checkpoints to disk.
training_args = TrainingArguments(
    output_dir="./distilbert-base-uncased-news",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    num_train_epochs=2,  # small here for quick running
    learning_rate=5e-5,
    eval_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="no",
    # The Trainer seeds its RNGs from this argument, so pass the notebook's
    # RANDOM_SEED explicitly instead of relying on the library default.
    # Changing RANDOM_SEED then also changes the training run.
    seed=RANDOM_SEED,
)

In [None]:
# Wire model, hyperparameters, data and metrics together.
# Note: the held-out test split doubles as the per-epoch evaluation set here.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Fine-tune all parameters; this is the slow cell of the notebook.
trainer.train()

In [None]:
# Final evaluation on the eval_dataset given to the Trainer (the test split),
# scored by compute_metrics.
metrics = trainer.evaluate()
print("DistilBERT eval metrics:", metrics)

In [None]:
# Per-class precision/recall/F1 on the test split for the fully fine-tuned model.
preds_output = trainer.predict(tokenized_datasets["test"])
# Predicted class = index of the largest logit for each example.
preds = np.argmax(preds_output.predictions, axis=-1)

report = classification_report(y_test, preds, target_names=target_names)
print("\nClassification report (DistilBERT fine-tuned):")
print(report)

# Freezing

Following the book, we can check which parameters in our DistilBERT model are trainable:

In [None]:
# Reload from the pretrained checkpoint so the freezing experiment does not
# start from the model that was already fine-tuned above.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_classes,
)

In [None]:
# List every parameter tensor with its position, name and trainable flag.
# The index and name are separated (and the index right-aligned) so the
# positions used for freezing below can be read off directly.
for index, (name, param) in enumerate(model.named_parameters()):
    print(f"Parameter: {index:3d} {name} ----- {param.requires_grad}")

Given our small dataset, maybe we shouldn't try to train the entire model.

Let's try:
1. only training the final classifier parameters
2. only training the final transformer block + classifier head

In [None]:
# Freeze every parameter whose position comes before the chosen index and
# leave the rest trainable:
#   * 84  -> the final transformer block (#5) and everything after it
#   * 102 -> only the classifier
# NOTE(review): these positions come from the parameter listing printed above;
# re-check them if the checkpoint changes. The sequence-classification head
# appears to also contain a `pre_classifier` layer just before `classifier` --
# confirm in the listing whether starting at 102 leaves it frozen.

model_ix_to_unfreeze = 84

for index, (name, param) in enumerate(model.named_parameters()):
    # Trainable only from the cut-off position onwards.
    param.requires_grad = index >= model_ix_to_unfreeze

In [None]:
# Re-list the parameters to confirm which ones are now frozen.
# The index and name are separated (and the index right-aligned) so each
# position can be matched to its trainable flag.
for index, (name, param) in enumerate(model.named_parameters()):
    print(f"Parameter: {index:3d} {name} ----- {param.requires_grad}")

In [None]:
# Train the partially frozen model with the same hyperparameters and data as
# the full fine-tuning run, so the two results are directly comparable.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()
# Last expression of the cell, so the metrics dict is displayed as the cell output.
trainer.evaluate()

In [None]:
# Per-class precision/recall/F1 on the test split for the partially frozen model.
preds_output = trainer.predict(tokenized_datasets["test"])
# Predicted class = index of the largest logit for each example.
preds = np.argmax(preds_output.predictions, axis=-1)

# Title distinguishes this report from the full fine-tuning one above; the
# original copy-pasted heading made the two outputs indistinguishable.
print("\nClassification report (DistilBERT, partially frozen):")
print(classification_report(y_test, preds, target_names=target_names))