# Classification using HF



In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments
import torch
from datasets import Dataset

  from .autonotebook import tqdm as notebook_tqdm


## Setup

1. load the data
2. encode the labels
3. Create train, validation, and test sets
4. tokenize the data

In [None]:
# load the data
classification_df = pd.read_csv("./data/classification_data.csv")

# Encode Labels
label_encoder = LabelEncoder()
classification_df['label_encoded'] = label_encoder.fit_transform(classification_df['label'])

# Create Test Set
tv_texts, test_texts, tv_labels, test_labels = train_test_split(
    classification_df['text'].tolist(),
    classification_df['label_encoded'].tolist(),
    test_size=0.2,
    random_state=42
)

# Create train and validation set
train_texts, val_texts, train_labels, val_labels = train_test_split(
    tv_texts,
    tv_labels,
    test_size=0.1,
    random_state=42
)

# Tokenize the data
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)
test_encodings = tokenizer(test_texts, truncation=True, padding=True)

torch.backends.cuda.matmul.allow_tf32 = True

## Define data class

1. define the data class - add any data augmentations here
2. transform to dataloaders

In [3]:
class ClassificationDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = ClassificationDataset(train_encodings, train_labels)
val_dataset = ClassificationDataset(val_encodings, val_labels)
test_dataset = ClassificationDataset(test_encodings, test_labels)

## Hyperparameter tuning

1. define metrics and objectives
2. initialize the model, training arguments, and trainer
3. define and run hpt

In [4]:
import evaluate
import numpy as np

# Define metrics and objectives
metric = evaluate.combine(["f1"])
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return metric.compute(predictions=predictions, references=labels, average="macro")

def compute_objective(metric):
    return metric["eval_loss"], metric["eval_f1"]

# define model
def model_init(trial):
    return DistilBertForSequenceClassification.from_pretrained(
        "distilbert-base-uncased", 
        num_labels=len(label_encoder.classes_)
    )

# define training args
training_args = TrainingArguments(
    output_dir="/tmp/results",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=15,
    weight_decay=0.01,
    logging_dir="/tmp/logs",
    logging_steps=10,
    save_strategy="no",
    eval_strategy="epoch",
)

# define trainer
hpt_trainer = Trainer(
    model=None,
    args=training_args,
    train_dataset=val_dataset,
    eval_dataset=train_dataset,
    model_init=model_init,
    compute_metrics=compute_metrics
)

# define and run hyperparameter training
def optuna_hp_space(trial):
    return {
        "learning_rate": trial.suggest_float("learning_rate", 1e-6, 1e-4, log=True),
        "per_device_train_batch_size": trial.suggest_categorical("per_device_train_batch_size", [16, 32, 64, 128]),
        "num_train_epochs": trial.suggest_categorical("num_train_epochs", [5, 10, 15]),
        "weight_decay": trial.suggest_float("weight_decay", 1e-3, 3e-2)
    }

best_trials = hpt_trainer.hyperparameter_search(
    direction=["minimize", "maximize"],
    backend="optuna",
    hp_space=optuna_hp_space,
    n_trials=5, # this can be increased
    compute_objective=compute_objective
)

print(best_trials)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[I 2025-05-09 15:59:33,931] A new study created in memory with name: no-name-f3322aa9-e09a-493f-9ac7-5707adea13aa
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,1.0188,0.889461,0.912156
2,0.7763,0.593663,1.0
3,0.4951,0.345062,1.0
4,0.2996,0.194786,1.0
5,0.1752,0.113124,1.0
6,0.1079,0.070366,1.0
7,0.0706,0.04881,1.0
8,0.0514,0.037562,1.0
9,0.0431,0.031127,1.0
10,0.0357,0.027147,1.0


[I 2025-05-09 15:59:39,352] Trial 0 finished with values: [0.020879525691270828, 1.0] and parameters: {'learning_rate': 1.5149715323249793e-05, 'per_device_train_batch_size': 16, 'num_train_epochs': 15, 'weight_decay': 0.006038666974230823}.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,No log,1.085744,0.243107
2,1.087100,1.077743,0.4693
3,1.087100,1.070061,0.646386
4,1.076500,1.062522,0.857596
5,1.076500,1.055166,0.857596
6,1.057100,1.048038,0.857596
7,1.057100,1.041442,0.857596
8,1.048200,1.035372,0.857596
9,1.048200,1.029739,0.848054
10,1.034700,1.024778,0.912156


[I 2025-05-09 15:59:43,141] Trial 1 finished with values: [1.012343168258667, 0.91215559925138] and parameters: {'learning_rate': 1.5046201713508165e-06, 'per_device_train_batch_size': 32, 'num_train_epochs': 15, 'weight_decay': 0.012162159035407318}.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,No log,1.061231,0.857596
2,No log,1.029002,0.912156
3,No log,0.996673,0.912156
4,No log,0.965109,0.912156
5,1.024900,0.934735,0.912156
6,1.024900,0.906861,0.912156
7,1.024900,0.882748,0.912156
8,1.024900,0.863873,0.912156
9,1.024900,0.851438,0.912156
10,0.884700,0.845991,0.912156


[I 2025-05-09 15:59:45,354] Trial 2 finished with values: [0.845991313457489, 0.91215559925138] and parameters: {'learning_rate': 1.217099314629715e-05, 'per_device_train_batch_size': 128, 'num_train_epochs': 10, 'weight_decay': 0.018658280236505335}.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,No log,1.091556,0.174988
2,No log,1.089054,0.243107
3,No log,1.086783,0.243107
4,No log,1.084654,0.243107
5,1.088000,1.082685,0.243107
6,1.088000,1.08086,0.360723
7,1.088000,1.079187,0.429496
8,1.088000,1.077698,0.429496
9,1.088000,1.076361,0.534871
10,1.077000,1.075199,0.61531


[I 2025-05-09 15:59:48,527] Trial 3 finished with values: [1.072414517402649, 0.6153099830421596] and parameters: {'learning_rate': 1.0505442838742028e-06, 'per_device_train_batch_size': 128, 'num_train_epochs': 15, 'weight_decay': 0.006636830916584974}.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,No log,0.725104,0.912156
2,0.776400,0.355478,1.0
3,0.776400,0.19022,1.0
4,0.241300,0.126173,1.0
5,0.241300,0.106691,1.0


[I 2025-05-09 15:59:50,163] Trial 4 finished with values: [0.10669145733118057, 1.0] and parameters: {'learning_rate': 5.560205083581748e-05, 'per_device_train_batch_size': 32, 'num_train_epochs': 5, 'weight_decay': 0.011448069009677628}.


[BestRun(run_id='0', objective=[0.020879525691270828, 1.0], hyperparameters={'learning_rate': 1.5149715323249793e-05, 'per_device_train_batch_size': 16, 'num_train_epochs': 15, 'weight_decay': 0.006038666974230823}, run_summary=None)]


## Fine-tune the model with best HPs

1. define validation metrics
2. define training args with optimized HPs
3. fine-tune the model with validation on overfitting
4. evaluate the model on test set (note that we achieve 100% accuracy during training, validation, and test)
5. save the model for inference

In [5]:
# define validation metrics 
accuracy = evaluate.load("accuracy")
precision = evaluate.load("precision")
recall = evaluate.load("recall")
f1 = evaluate.load("f1")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)

    metrics = {}
    metrics.update(accuracy.compute(predictions=predictions, references=labels))
    metrics.update(precision.compute(predictions=predictions, references=labels, average='weighted'))
    metrics.update(recall.compute(predictions=predictions, references=labels, average='weighted'))
    metrics.update(f1.compute(predictions=predictions, references=labels, average='weighted'))

    return metrics

# setup training args with HPs
training_args = TrainingArguments(
    output_dir="/tmp/results",
    learning_rate=best_trials[0].hyperparameters['learning_rate'],
    per_device_train_batch_size=best_trials[0].hyperparameters['per_device_train_batch_size'],
    per_device_eval_batch_size=16,
    num_train_epochs=best_trials[0].hyperparameters['num_train_epochs'],
    weight_decay=best_trials[0].hyperparameters['weight_decay'],
    logging_dir="/tmp/logs",
    logging_steps=10,
    save_strategy="no",
    eval_strategy="epoch",
)

model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=len(label_encoder.classes_))
model.to('cuda')

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

# fine-tune and evaluate on the test dataset
losses = trainer.train()

trainer.evaluate(test_dataset)

# save the model
model.save_pretrained("./model/bioclassification-distilbert-base-uncased", from_pt=True)

# dump the training args
import pickle
with open('./model/training_args.pkl', 'wb') as file: 
    pickle.dump(training_args, file) 

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3264,0.193874,1.0,1.0,1.0,1.0
2,0.0378,0.026333,1.0,1.0,1.0,1.0
3,0.0174,0.012565,1.0,1.0,1.0,1.0
4,0.011,0.008117,1.0,1.0,1.0,1.0
5,0.008,0.005929,1.0,1.0,1.0,1.0
6,0.0063,0.004642,1.0,1.0,1.0,1.0
7,0.0051,0.003807,1.0,1.0,1.0,1.0
8,0.0045,0.003239,1.0,1.0,1.0,1.0
9,0.004,0.002829,1.0,1.0,1.0,1.0
10,0.0035,0.002539,1.0,1.0,1.0,1.0


## Online Model Inference

1. load fine-tuned model and hyperparameter tuned arguments
2. define batch examples
3. tokenize and create dataset
4. feed into inference 
5. collect and map preds to human readable output

In [6]:
import pickle

with open('./model/training_args.pkl', 'rb') as file: 
    training_args = pickle.load(file) 

model = DistilBertForSequenceClassification.from_pretrained("./model/bioclassification-distilbert-base-uncased", num_labels=len(label_encoder.classes_))
model.to('cuda')

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

# Batch examples - labels are optional
examples = {
    "DrugZ caused severe rashes in some participants.": 0,
    "Increased liver enzymes were noted post-treatment with DrugA.": 0,
    "The study excluded patients with pre-existing conditions.": 1,
    "No significant side effects were observed during the trial." : 2,
    "The treatment resulted in full remission for the majority of patients.": 2
}

# tokenize and create dataset
example_encoding = tokenizer([k for k,_ in examples.items()], truncation=True, padding=True)
example_labels = [v for _,v in examples.items()]

example_dataset = ClassificationDataset(example_encoding, example_labels)

# feed into inference
preds = trainer.predict(example_dataset)

# collect and map preds to human readable output
preds = np.argmax(preds.predictions, axis=1)
mapping = {
    0: "Adverse Effect",
    1: "Neutral Observation",
    2: "Positive Outcome"
}
print({k:mapping[v] for k, v in zip(examples.keys(), preds)})

{'DrugZ caused severe rashes in some participants.': 'Adverse Effect', 'Increased liver enzymes were noted post-treatment with DrugA.': 'Adverse Effect', 'The study excluded patients with pre-existing conditions.': 'Neutral Observation', 'No significant side effects were observed during the trial.': 'Positive Outcome', 'The treatment resulted in full remission for the majority of patients.': 'Positive Outcome'}
