In [1]:
import numpy as np
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, EarlyStoppingCallback
import mlflow
from hyperopt import hp
from hyperopt import Trials, fmin, tpe, STATUS_OK
from hyperopt import space_eval
from matplotlib import pyplot as plt
from sklearn.metrics import (f1_score, precision_score, recall_score,
                             accuracy_score, hamming_loss, jaccard_score,multilabel_confusion_matrix,ConfusionMatrixDisplay)
from torch import nn

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
import logging
logging.getLogger("mlflow").setLevel(logging.ERROR)


  import pkg_resources


In [2]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
# The resulting string is what tune_model passes to `.to(...)` when it builds a model.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"device : {device}")

device : cuda


In [3]:
# Select the MLflow experiment for this notebook; the call is the cell's last
# expression, so the Experiment object is echoed as the cell output.
experiment_name = "Multi-label classification"
mlflow.set_experiment(experiment_name)

2026/02/07 16:25:04 INFO alembic.runtime.plugins: setup plugin alembic.autogenerate.schemas
2026/02/07 16:25:04 INFO alembic.runtime.plugins: setup plugin alembic.autogenerate.tables
2026/02/07 16:25:04 INFO alembic.runtime.plugins: setup plugin alembic.autogenerate.types
2026/02/07 16:25:04 INFO alembic.runtime.plugins: setup plugin alembic.autogenerate.constraints
2026/02/07 16:25:04 INFO alembic.runtime.plugins: setup plugin alembic.autogenerate.defaults
2026/02/07 16:25:04 INFO alembic.runtime.plugins: setup plugin alembic.autogenerate.comments
2026/02/07 16:25:05 INFO alembic.runtime.migration: Context impl SQLiteImpl.
2026/02/07 16:25:05 INFO alembic.runtime.migration: Will assume non-transactional DDL.


<Experiment: artifact_location='file:///D:/aiML/NLP/llms/multi_label_classification/multi_label/mlruns/1', creation_time=1770319170928, experiment_id='1', last_update_time=1770319170928, lifecycle_stage='active', name='Multi-label classification', tags={}>

In [4]:
# Checkpoint identifier shared by the tokenizer here and by every model instantiation below.
chkpt = "distilbert/distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(chkpt)

In [5]:
# Each CSV is loaded on its own; the result is indexed with "train" to obtain the Dataset for that file.
train_dataset = load_dataset("csv",data_files="data/train_cleaned.csv")["train"]
val_dataset = load_dataset("csv",data_files="data/val_cleaned.csv")["train"]
# Label column names. Their order here fixes the position of each class in the
# combined "labels" vector and in the per-class positive weights computed later.
categories = ["Computer Science", "Physics", "Mathematics", "Statistics", "Quantitative Biology", "Quantitative Finance"]


In [6]:
def combine_labels(batch):
    """Merge the per-category indicator columns of a batch into one vector per example.

    Args:
        batch: Column-oriented mapping holding, for every name in the module-level
            `categories` list, a sequence with one value per example.

    Returns:
        A dict with a single key, "labels", whose value is a list with one entry
        per example; each entry lists that example's values in `categories` order.
    """
    # One row per category column: shape (num_categories, batch_size).
    per_category = np.array([batch[category] for category in categories])
    # Transposing gives one row per example, which is the layout we want to store.
    return {"labels": per_category.T.tolist()}

In [7]:
# Add the combined "labels" column to the training split (batches of 64 rows),
# then display the dataset summary to confirm the new column is present.
train_dataset = train_dataset.map(combine_labels, batched=True, batch_size=64)
train_dataset

Dataset({
    features: ['text', 'Computer Science', 'Physics', 'Mathematics', 'Statistics', 'Quantitative Biology', 'Quantitative Finance', 'word_count', 'labels'],
    num_rows: 16771
})

In [8]:
# Same label-combining step for the validation split.
val_dataset = val_dataset.map(combine_labels, batched=True, batch_size=64)
val_dataset

Dataset({
    features: ['text', 'Computer Science', 'Physics', 'Mathematics', 'Statistics', 'Quantitative Biology', 'Quantitative Finance', 'labels'],
    num_rows: 4201
})

In [9]:
from datasets import Sequence, Value

# Re-declare "labels" as a sequence of float32 and cast the training split to that schema.
# The loss used later (nn.BCEWithLogitsLoss) is given these labels as its targets.
new_features = train_dataset.features.copy()
new_features["labels"] = Sequence(Value("float32"))
train_dataset = train_dataset.cast(new_features)

In [10]:
# Spot-check the first record: "labels" should now hold floats, one per category.
train_dataset[0]

{'text': 'Title: Rotation Invariance Neural Network Abstract: Rotation invariance and translation invariance have great values in image recognition tasks. In this paper, we bring a new architecture in convolutional neural network (CNN) named cyclic convolutional layer to achieve rotation invariance in 2-D symbol recognition. We can also get the position and orientation of the 2-D symbol by the network to achieve detection purpose for multiple non-overlap target. Last but not least, this architecture can achieve one-shot learning in some cases using those invariance.',
 'Computer Science': 1,
 'Physics': 0,
 'Mathematics': 0,
 'Statistics': 0,
 'Quantitative Biology': 0,
 'Quantitative Finance': 0,
 'word_count': 82,
 'labels': [1.0, 0.0, 0.0, 0.0, 0.0, 0.0]}

In [11]:
# Apply the same float32 "labels" schema to the validation split.
# `new_features` is rebuilt from the validation features because the two splits
# do not have identical columns (the training split also carries "word_count").
new_features = val_dataset.features.copy()
new_features["labels"] = Sequence(Value("float32"))
val_dataset = val_dataset.cast(new_features)

In [12]:
def _tokenize_batch(batch):
    """Tokenize the "text" column of a batch, padding/truncating every example to 512 tokens."""
    return tokenizer(
        batch["text"],
        padding="max_length",
        truncation=True,
        max_length=512,
    )


# Both splits go through the same tokenization so their model inputs are laid out identically.
train_tokenized = train_dataset.map(_tokenize_batch, batched=True, batch_size=64)
print(train_tokenized.features)
val_tokenized = val_dataset.map(_tokenize_batch, batched=True, batch_size=64)
print(val_tokenized.features)

{'text': Value('large_string'), 'Computer Science': Value('int64'), 'Physics': Value('int64'), 'Mathematics': Value('int64'), 'Statistics': Value('int64'), 'Quantitative Biology': Value('int64'), 'Quantitative Finance': Value('int64'), 'word_count': Value('int64'), 'labels': List(Value('float32')), 'input_ids': List(Value('int32')), 'token_type_ids': List(Value('int8')), 'attention_mask': List(Value('int8'))}
{'text': Value('large_string'), 'Computer Science': Value('int64'), 'Physics': Value('int64'), 'Mathematics': Value('int64'), 'Statistics': Value('int64'), 'Quantitative Biology': Value('int64'), 'Quantitative Finance': Value('int64'), 'labels': List(Value('float32')), 'input_ids': List(Value('int32')), 'token_type_ids': List(Value('int8')), 'attention_mask': List(Value('int8'))}


In [13]:
# Inspection only: load the checkpoint once and list every parameter name together with
# its position in named_parameters(). tune_model freezes parameters by that position.
# NOTE: despite its name, `encoder_layer` is the running parameter index produced by
# enumerate(), not a transformer layer number.
distill_bert_model = AutoModelForSequenceClassification.from_pretrained(chkpt)
for encoder_layer, (name,param) in enumerate(distill_bert_model.named_parameters()):
    print(encoder_layer,name)

Loading weights:   0%|          | 0/100 [00:00<?, ?it/s]

[1mDistilBertForSequenceClassification LOAD REPORT[0m from: distilbert/distilbert-base-uncased
Key                     | Status     | 
------------------------+------------+-
vocab_projector.bias    | UNEXPECTED | 
vocab_transform.bias    | UNEXPECTED | 
vocab_layer_norm.bias   | UNEXPECTED | 
vocab_transform.weight  | UNEXPECTED | 
vocab_layer_norm.weight | UNEXPECTED | 
pre_classifier.weight   | MISSING    | 
pre_classifier.bias     | MISSING    | 
classifier.bias         | MISSING    | 
classifier.weight       | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING[3m	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.[0m


0 distilbert.embeddings.word_embeddings.weight
1 distilbert.embeddings.position_embeddings.weight
2 distilbert.embeddings.LayerNorm.weight
3 distilbert.embeddings.LayerNorm.bias
4 distilbert.transformer.layer.0.attention.q_lin.weight
5 distilbert.transformer.layer.0.attention.q_lin.bias
6 distilbert.transformer.layer.0.attention.k_lin.weight
7 distilbert.transformer.layer.0.attention.k_lin.bias
8 distilbert.transformer.layer.0.attention.v_lin.weight
9 distilbert.transformer.layer.0.attention.v_lin.bias
10 distilbert.transformer.layer.0.attention.out_lin.weight
11 distilbert.transformer.layer.0.attention.out_lin.bias
12 distilbert.transformer.layer.0.sa_layer_norm.weight
13 distilbert.transformer.layer.0.sa_layer_norm.bias
14 distilbert.transformer.layer.0.ffn.lin1.weight
15 distilbert.transformer.layer.0.ffn.lin1.bias
16 distilbert.transformer.layer.0.ffn.lin2.weight
17 distilbert.transformer.layer.0.ffn.lin2.bias
18 distilbert.transformer.layer.0.output_layer_norm.weight
19 distilbert

In [13]:
import pandas as pd

# Re-read the training CSV with pandas to count the positive examples per category.
train_df = pd.read_csv("data/train_cleaned.csv")
total_samples = len(train_df)
# Column-wise sum of the indicator columns = number of positives for each category.
categories_distribution = train_df[categories].sum()
categories_distribution

Computer Science        6875
Physics                 4810
Mathematics             4494
Statistics              4165
Quantitative Biology     470
Quantitative Finance     199
dtype: int64

In [14]:
# Number of training rows; this is the `total_samples` used for the class weights.
len(train_df)

16771

In [15]:
# Positive-class weight per category = (#negatives / #positives), in `categories` order.
pos_weights = [
    (total_samples - positives) / positives
    for positives in categories_distribution
]
pos_weights

[1.4394181818181817,
 2.4866943866943867,
 2.7318647085002223,
 3.0266506602641057,
 34.682978723404254,
 83.27638190954774]

In [16]:
# Take the square root of the raw negative/positive ratios so the weights of the rare
# categories are far less extreme (the largest drops from about 83.3 to about 9.1).
pos_weights_smoothed = np.sqrt(pos_weights)
pos_weights_smoothed

array([1.19975755, 1.57692561, 1.65283535, 1.73972718, 5.88922565,
       9.1255894 ])

In [17]:
# Smoothed per-class positive weights as a float32 tensor; the loss function moves it
# to the logits' device before use.
pos_weights_tensor = torch.tensor(pos_weights_smoothed, dtype=torch.float32)
def multi_label_loss_func(outputs, labels, num_items_in_batch=None, pos_weight=None):
    """Class-weighted binary cross-entropy for multi-label logits, for `Trainer(compute_loss_func=...)`.

    Args:
        outputs: Model output mapping; its "logits" entry is used.
        labels: Multi-hot targets with the same shape as the logits. They are cast
            to the logits' dtype, so integer labels are accepted as well.
        num_items_in_batch: Per the Trainer documentation, the number of items in the
            entire accumulated batch. When given, the loss is the *sum* over this
            micro-batch divided by that count, so that adding up the micro-batch
            losses of one accumulation window yields the mean over the whole window.
            When None (e.g. during evaluation) the plain mean is returned.
        pos_weight: Optional per-class positive weights. Defaults to the module-level
            `pos_weights_tensor`.

    Returns:
        A scalar loss tensor.
    """
    logits = outputs.get("logits")
    if pos_weight is None:
        pos_weight = pos_weights_tensor
    class_weights = pos_weight.to(logits.device)
    targets = labels.to(logits.dtype)

    if num_items_in_batch is None:
        return nn.functional.binary_cross_entropy_with_logits(
            logits, targets, pos_weight=class_weights
        )

    # A mean-reduced loss here would be accumulated once per micro-batch without being
    # rescaled, inflating the gradient by the number of accumulation steps.
    summed = nn.functional.binary_cross_entropy_with_logits(
        logits, targets, pos_weight=class_weights, reduction="sum"
    )
    return summed / num_items_in_batch

def compute_metrics(preds):
    """Convert evaluation logits into multi-label classification metrics.

    Args:
        preds: Object exposing `predictions` (raw logits) and `label_ids`
            (multi-hot ground truth), as handed over by the Trainer.

    Returns:
        Dict with macro/weighted F1, precision and recall, subset accuracy,
        Hamming loss and macro Jaccard score.
    """
    raw_logits = preds.predictions
    y_true = preds.label_ids
    # Each class is decided independently: predicted when its sigmoid probability exceeds 0.5.
    probabilities = torch.sigmoid(torch.tensor(raw_logits))
    y_pred = (probabilities > 0.5).int().numpy()

    # (result key, scorer, averaging mode) for every averaged metric.
    averaged_metrics = (
        ("f1_macro", f1_score, "macro"),
        ("f1_weighted", f1_score, "weighted"),
        ("precision_macro", precision_score, "macro"),
        ("precision_weighted", precision_score, "weighted"),
        ("recall_macro", recall_score, "macro"),
        ("recall_weighted", recall_score, "weighted"),
    )
    results = {
        key: scorer(y_true=y_true, y_pred=y_pred, average=averaging)
        for key, scorer, averaging in averaged_metrics
    }

    # Subset accuracy: an example counts only when every one of its labels is correct.
    results["accuracy_subset"] = accuracy_score(y_true=y_true, y_pred=y_pred, normalize=True)
    results["hamming_loss"] = hamming_loss(y_true, y_pred)
    results["jaccard_macro"] = jaccard_score(y_true, y_pred, average='macro')
    return results

In [18]:
# Artifact name under which each trial's model is logged to MLflow.
# NOTE(review): the value says "layer_4", while the freeze threshold currently active in
# tune_model (84) is annotated "encoder layer 5" — confirm which one is intended.
artificat_path = "distill_bert_layer_4"
def tune_model(training_params):
    """Hyperopt objective: fine-tune a partially frozen DistilBERT and report validation metrics.

    Args:
        training_params: Sampled hyperparameters with the keys "learning_rate",
            "num_train_epochs", "weight_decay", "lr_scheduler" and "warmup_ratio".

    Returns:
        The evaluation metrics of the trained model, with "eval_loss" renamed to
        "loss" (the value Hyperopt minimises) and "status" set to STATUS_OK.

    Side effects:
        Opens a nested MLflow run and logs the trial's hyperparameters, its metrics
        and the trained model plus tokenizer to it.
    """
    # Parameters are addressed by their position in named_parameters(). Known thresholds:
    #   68  -> train from encoder layer 4 onwards
    #   84  -> train from encoder layer 5 onwards
    #   100 -> train the classification head only
    first_trainable_index = 84  # encoder layer 5

    model = AutoModelForSequenceClassification.from_pretrained(
        chkpt, num_labels=6, problem_type="multi_label_classification"
    ).to(device)
    for param_index, (_, param) in enumerate(model.named_parameters()):
        param.requires_grad = param_index >= first_trainable_index

    training_args = TrainingArguments(
        "distil_bert_freeze",
        eval_strategy="epoch",
        learning_rate=training_params["learning_rate"],
        num_train_epochs=training_params["num_train_epochs"],
        remove_unused_columns=True,
        weight_decay=training_params["weight_decay"],
        lr_scheduler_type=training_params["lr_scheduler"],
        warmup_ratio=training_params["warmup_ratio"],
        # Keep the checkpoint with the lowest validation loss and reload it after training.
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        greater_is_better=False,
        save_strategy="epoch",
        save_total_limit=2,
        # disable_tqdm=True,
        logging_steps=50,
        gradient_accumulation_steps=8,
    )
    trainer = Trainer(
        model=model,
        train_dataset=train_tokenized,
        eval_dataset=val_tokenized,
        args=training_args,
        compute_loss_func=multi_label_loss_func,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
    )
    with mlflow.start_run(nested=True):
        print(f"training_params ==== {training_params}")
        # Record the sampled hyperparameters on the trial's own run so trials can be
        # compared afterwards. The "trial_" prefix keeps these keys distinct from any
        # training-argument parameters an MLflow integration may log to the same run.
        mlflow.log_params(
            {f"trial_{key}": value for key, value in training_params.items()}
        )
        trainer.train()
        model_tokenizer = {
            "model": model,
            "tokenizer": tokenizer,
        }
        metrics = trainer.evaluate()
        # Hyperopt looks for the objective value under the key "loss".
        metrics["loss"] = metrics.pop("eval_loss")
        print(f"metrics ==== {metrics}")
        mlflow.log_metrics(metrics)
        mlflow.transformers.log_model(transformers_model=model_tokenizer, name=artificat_path, task="text-classification")
        metrics["status"] = STATUS_OK
        return metrics

In [19]:
def fine_tune_model(ml_flow_run_name, search_space, max_evals=8):
    """Run a TPE hyperparameter search over `tune_model` inside one parent MLflow run.

    Args:
        ml_flow_run_name: Name of the parent MLflow run; every trial is logged as a
            nested run beneath it by `tune_model`.
        search_space: Hyperopt search space handed to `fmin`.
        max_evals: Number of trials to evaluate (default 8).

    Side effects:
        Logs the best hyperparameters and a summary of the best trial's metrics to
        the parent run and prints the best hyperparameters.
    """
    with mlflow.start_run(run_name=ml_flow_run_name):
        # Run optimization
        trials = Trials()
        best_assignment = fmin(
            fn=tune_model,
            space=search_space,
            algo=tpe.suggest,
            max_evals=max_evals,
            trials=trials,
            verbose=True,
        )
        # fmin reports hp.choice dimensions as the *index* of the chosen option;
        # space_eval maps the assignment back to the actual hyperparameter values.
        best_params = space_eval(search_space, best_assignment)

        # Find the trial with the lowest validation loss.
        best_trial = min(trials.results, key=lambda result: result["loss"])

        summary = {
            "best_val_log_loss": best_trial["loss"],
            "total_trials": len(trials.trials),
            "optimization_completed": 1,
        }
        # Trial results carry the evaluation metrics under their "eval_"-prefixed names.
        # The macro-averaged variants (and subset accuracy) feed the summary; a metric
        # that is absent from the trial is skipped instead of aborting the logging.
        summary_sources = {
            "best_accuracy": "eval_accuracy_subset",
            "best_precision": "eval_precision_macro",
            "best_recall": "eval_recall_macro",
            "best_f1": "eval_f1_macro",
        }
        for summary_name, trial_key in summary_sources.items():
            if trial_key in best_trial:
                summary[summary_name] = best_trial[trial_key]

        # Log optimization results
        mlflow.log_params(best_params)
        mlflow.log_metrics(summary)
        print(f"best_params ==== {best_params}")

In [None]:
# Hyperopt search space; the dictionary keys are the names tune_model reads from
# its `training_params` argument.
search_space = {
    "learning_rate": hp.uniform("learning_rate", 3e-5, 5e-5),
    "num_train_epochs": hp.uniformint("num_train_epochs", 2, 3),
    "weight_decay": hp.uniform("weight_decay", 0.05, 0.1),
    # Categorical dimensions: one option from each list is sampled per trial.
    "lr_scheduler": hp.choice("lr_scheduler", ["cosine", "linear"]),
    "warmup_ratio": hp.choice("warmup_ratio", [0.0, 0.05, 0.1]),
}
# Starts the search: each evaluation trains and logs a full model, so this cell is long-running.
fine_tune_model("hyper_parameter_searchs", search_space)