In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments
import torch
from datasets import Dataset

  from .autonotebook import tqdm as notebook_tqdm


# Setup

1. load the data
2. encode the labels
3. Create train, validation, and test sets
4. tokenize the data

In [2]:
# load the data
classification_df = pd.read_csv("./data/classification_data.csv")

# Encode Labels
label_encoder = LabelEncoder()
classification_df['label_encoded'] = label_encoder.fit_transform(classification_df['label'])

# Create Test Set
tv_texts, test_texts, tv_labels, test_labels = train_test_split(
    classification_df['text'].tolist(),
    classification_df['label_encoded'].tolist(),
    test_size=0.2,
    random_state=42
)

# Create train and validation set
train_texts, val_texts, train_labels, val_labels = train_test_split(
    tv_texts,
    tv_labels,
    test_size=0.2,
    random_state=42
)

# Tokenize the data
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)
test_encodings = tokenizer(test_texts, truncation=True, padding=True)

torch.backends.cuda.matmul.allow_tf32 = True

# Define data class

1. define the data class - add any data augmentations here
2. transform to dataloaders

In [3]:
class ClassificationDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = ClassificationDataset(train_encodings, train_labels)
val_dataset = ClassificationDataset(val_encodings, val_labels)
test_dataset = ClassificationDataset(test_encodings, test_labels)

# Hyperparameter tuning

1. define metrics and objectives
2. initialize the model, training arguments, and trainer
3. define and run hpt

In [4]:
import evaluate
import numpy as np

# Define metrics and objectives
metric = evaluate.combine(["f1"])
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return metric.compute(predictions=predictions, references=labels, average="macro")

def compute_objective(metric):
    return metric["eval_loss"], metric["eval_f1"]

# define model
def model_init(trial):
    return DistilBertForSequenceClassification.from_pretrained(
        "distilbert-base-uncased", 
        num_labels=len(label_encoder.classes_)
    )

# define training args
training_args = TrainingArguments(
    output_dir="/tmp/results",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=15,
    weight_decay=0.01,
    logging_dir="/tmp/logs",
    logging_steps=10,
    save_strategy="no",
    eval_strategy="epoch",
)

# define trainer
hpt_trainer = Trainer(
    model=None,
    args=training_args,
    train_dataset=val_dataset,
    eval_dataset=train_dataset,
    model_init=model_init,
    compute_metrics=compute_metrics
)

# define and run hyperparameter training
def optuna_hp_space(trial):
    return {
        "learning_rate": trial.suggest_float("learning_rate", 1e-6, 1e-4, log=True),
        "per_device_train_batch_size": trial.suggest_categorical("per_device_train_batch_size", [16, 32, 64, 128]),
        "num_train_epochs": trial.suggest_categorical("num_train_epochs", [5, 10, 15]),
        "weight_decay": trial.suggest_float("weight_decay", 1e-3, 3e-2)
    }

best_trials = hpt_trainer.hyperparameter_search(
    direction=["minimize", "maximize"],
    backend="optuna",
    hp_space=optuna_hp_space,
    n_trials=5,
    compute_objective=compute_objective
)

print(best_trials)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[I 2025-05-09 10:32:59,607] A new study created in memory with name: no-name-ab72c096-8350-4b62-a6da-d4e18e223119
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,1.0673,1.023134,0.788441
2,0.9959,0.942992,0.912156
3,0.9162,0.869004,0.912156
4,0.8452,0.81815,0.912156
5,0.8101,0.798494,0.912156


[I 2025-05-09 10:33:02,096] Trial 0 finished with values: [0.7984938621520996, 0.91215559925138] and parameters: {'learning_rate': 6.710901574953645e-06, 'per_device_train_batch_size': 16, 'num_train_epochs': 5, 'weight_decay': 0.02427021108221012}.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,No log,1.076956,0.374879
2,1.080000,1.059428,0.857596
3,1.080000,1.042115,0.857596
4,1.050100,1.024676,0.912156
5,1.050100,1.006104,0.912156
6,1.008600,0.986866,0.912156
7,1.008600,0.968363,0.912156
8,0.976400,0.950932,0.912156
9,0.976400,0.93409,0.912156
10,0.939600,0.918572,0.912156


[I 2025-05-09 10:33:06,100] Trial 1 finished with values: [0.8796542882919312, 0.91215559925138] and parameters: {'learning_rate': 3.0463861314729984e-06, 'per_device_train_batch_size': 32, 'num_train_epochs': 15, 'weight_decay': 0.027378892334551277}.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,1.0759,1.044932,0.718109
2,1.0274,0.986632,0.912156
3,0.9675,0.921472,0.912156
4,0.8906,0.850396,0.912156
5,0.8233,0.783146,0.912156
6,0.7657,0.72368,1.0
7,0.7135,0.67825,1.0
8,0.6709,0.645373,1.0
9,0.6566,0.624603,1.0
10,0.6321,0.616997,1.0


[I 2025-05-09 10:33:09,736] Trial 2 finished with values: [0.6169969439506531, 1.0] and parameters: {'learning_rate': 4.59965584961873e-06, 'per_device_train_batch_size': 16, 'num_train_epochs': 10, 'weight_decay': 0.0031407009974995714}.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,No log,1.09101,0.243107
2,No log,1.088496,0.243107
3,No log,1.086659,0.243107
4,No log,1.085507,0.243107
5,1.088000,1.084991,0.243107


[I 2025-05-09 10:33:11,152] Trial 3 finished with values: [1.0849907398223877, 0.2431067431067431] and parameters: {'learning_rate': 1.295339521918524e-06, 'per_device_train_batch_size': 128, 'num_train_epochs': 5, 'weight_decay': 0.0012290607859959113}.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,No log,0.819031,0.912156
2,No log,0.510768,1.0
3,No log,0.293046,1.0
4,0.693800,0.174852,1.0
5,0.693800,0.11148,1.0
6,0.693800,0.076835,1.0
7,0.149300,0.058232,1.0
8,0.149300,0.048021,1.0
9,0.149300,0.042822,1.0
10,0.056800,0.04099,1.0


[I 2025-05-09 10:33:13,883] Trial 4 finished with values: [0.040989574044942856, 1.0] and parameters: {'learning_rate': 6.0241314473928826e-05, 'per_device_train_batch_size': 64, 'num_train_epochs': 10, 'weight_decay': 0.02320569314066657}.


[BestRun(run_id='4', objective=[0.040989574044942856, 1.0], hyperparameters={'learning_rate': 6.0241314473928826e-05, 'per_device_train_batch_size': 64, 'num_train_epochs': 10, 'weight_decay': 0.02320569314066657}, run_summary=None)]


# Fine-tune the model with best HPs

1. define validation metrics
2. define training args with optimized HPs
3. fine-tune the model with validation on overfitting
4. evaluate the model on test set
5. save the model for inference

In [None]:
# define validation metrics 
accuracy = evaluate.load("accuracy")
precision = evaluate.load("precision")
recall = evaluate.load("recall")
f1 = evaluate.load("f1")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)

    metrics = {}
    metrics.update(accuracy.compute(predictions=predictions, references=labels))
    metrics.update(precision.compute(predictions=predictions, references=labels, average='weighted'))
    metrics.update(recall.compute(predictions=predictions, references=labels, average='weighted'))
    metrics.update(f1.compute(predictions=predictions, references=labels, average='weighted'))

    return metrics

# setup training args with HPs
training_args = TrainingArguments(
    output_dir="/tmp/results",
    learning_rate=best_trials[0].hyperparameters['learning_rate'],
    per_device_train_batch_size=best_trials[0].hyperparameters['per_device_train_batch_size'],
    per_device_eval_batch_size=16,
    num_train_epochs=best_trials[0].hyperparameters['num_train_epochs'],
    weight_decay=best_trials[0].hyperparameters['weight_decay'],
    logging_dir="/tmp/logs",
    logging_steps=10,
    save_strategy="no",
    eval_strategy="epoch",
)

model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=len(label_encoder.classes_))
model.to('cuda')

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

# fine-tune and evaluate on the test dataset
losses = trainer.train()

trainer.evaluate(test_dataset)

# save the model
model.save_pretrained("./model/bioclassification-distilbert-base-uncased", from_pt=True)

# dump the training args
import pickle
with open('./model/training_args.pkl', 'wb') as file: 
    pickle.dump(training_args, file) 

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.8076,0.386372,1.0,1.0,1.0,1.0
2,0.2024,0.055676,1.0,1.0,1.0,1.0
3,0.0373,0.016093,1.0,1.0,1.0,1.0
4,0.014,0.008256,1.0,1.0,1.0,1.0
5,0.0084,0.005632,1.0,1.0,1.0,1.0
6,0.0062,0.004461,1.0,1.0,1.0,1.0
7,0.0052,0.003847,1.0,1.0,1.0,1.0
8,0.0045,0.003506,1.0,1.0,1.0,1.0
9,0.0043,0.003328,1.0,1.0,1.0,1.0
10,0.0041,0.003268,1.0,1.0,1.0,1.0


# Online Model Inference

1. load fine-tuned model and hyperparameter tuned arguments
2. define batch examples
3. tokenize and create dataset
4. feed into inference 
5. collect and map preds to human readable output

In [6]:
import pickle

with open('./model/training_args.pkl', 'rb') as file: 
    training_args = pickle.load(file) 

model = DistilBertForSequenceClassification.from_pretrained("./model/bioclassification-distilbert-base-uncased", num_labels=len(label_encoder.classes_))
model.to('cuda')

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

In [7]:
# Batch examples - labels are optional
examples = {
    "DrugZ caused severe rashes in some participants.": 0
}

# tokenize and create dataset
example_encoding = tokenizer([k for k,_ in examples.items()], truncation=True, padding=True)
example_labels = [v for _,v in examples.items()]

example_dataset = ClassificationDataset(example_encoding, example_labels)

# feed into inference
preds = trainer.predict(example_dataset)

# collect and map preds to human readable output
preds = np.argmax(preds.predictions, axis=1)
mapping = {
    0: "Adverse Effect",
    1: "Neutral Observation",
    2: "Positive Outcome"
}
print({k:mapping[v] for k, v in zip(examples.keys(), preds)})

{'DrugZ caused severe rashes in some participants.': 'Adverse Effect'}
