## Multi-label classification with DistilBERT
CSCI 6380 Tutorial

Agustin Lorenzo


### Background
This notebook outlines the process behind training a multilabel classifier. This uses HuggingFace's openly available DistilBERT transformer model, which is a lightweight version of BERT. With a multilabel classifier, instances can be classified under more than one label. This is ideal if you want to describe instances with concepts that aren't mutually exclusive. 

In [None]:
# import modules
import pandas as pd
import numpy as np
import torch
import pickle
from transformers import Trainer
from transformers import DistilBertTokenizer
from transformers import DistilBertForSequenceClassification
from torch.utils.data import Dataset
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import roc_auc_score, f1_score, hamming_loss
from transformers import EvalPrediction
from transformers import TrainingArguments, Trainer
#import wandb

  from .autonotebook import tqdm as notebook_tqdm


### Loading data
Reading the .csv file and converting to proper datatypes

In [2]:
# load data
train_df = pd.read_csv('sample_train_split.csv') # splits are saved outside of the program so specific instances can be analyzed later
test_df = pd.read_csv('sample_test_split.csv')

# Convert labels from string back to list
train_df['labels'] = train_df['labels'].str.strip('"').str.split(', ')
test_df['labels'] = test_df['labels'].str.strip('"').str.split(', ')

# Extract texts and labels
train_texts = train_df['entry'].tolist()
test_texts = test_df['entry'].tolist()
train_labels = train_df['labels'].tolist()
test_labels = test_df['labels'].tolist()

### Encoding data
Getting it in the correct format for the model

In [3]:
# Label encoder
multilabel = MultiLabelBinarizer()
train_labels = multilabel.fit_transform(train_labels).astype('float32')
test_labels = multilabel.transform(test_labels).astype('float32')

checkpoint = "distilbert/distilbert-base-uncased"
tokenizer = DistilBertTokenizer.from_pretrained(checkpoint)
model = DistilBertForSequenceClassification.from_pretrained(checkpoint, num_labels=len(train_labels[0]), problem_type="multi_label_classification")

# encode data
class CaseNoteDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=250):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = torch.tensor(self.labels[idx])
        
        encoding = self.tokenizer(text, truncation=True, padding="max_length", max_length=self.max_len, return_tensors='pt')
        
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': label
        }

# create final dataset objects with encoded data
train_dataset = CaseNoteDataset(train_texts, train_labels, tokenizer)
test_dataset = CaseNoteDataset(test_texts, test_labels, tokenizer)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Defining metrics

In [4]:
def multi_label_metrics(predictions, labels, threshold=0.3):
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))
    
    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= threshold)] = 1
    y_true = labels
    
    f1 = f1_score(y_true, y_pred, average='macro')
    roc_auc = roc_auc_score(y_true, y_pred, average='macro') 
    hamming = hamming_loss(y_true, y_pred)

    metrics = {
        "roc_auc": roc_auc,
        "hamming_loss": hamming,
        "f1": f1
    }
    
    return metrics


def compute_metrics(p:EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    
    result = multi_label_metrics(predictions=preds, labels=p.label_ids)
    
    return result

### Defining parameters for model training

In [6]:
batch_size = 8
metric_name = "f1"

args = TrainingArguments(
    f"bert-finetuned-sem_eval-english",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=1e-5,  # Set an initial learning rate
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=10,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    #report_to="wandb",
    report_to=[],
    logging_strategy="steps",
    logging_steps=1,
    eval_steps=1
)

#wandb.init(project="case-notes-classification", name="distilbert-base")
# wandb is a good resource for recording/presenting stats from model training



### Train the model

In [None]:
#
# wandb.init(project="case-notes-classification", name="distilbert-base")

trainer = Trainer(model=model, 
                  args=args,
                  train_dataset=train_dataset, 
                  eval_dataset=test_dataset,
                  compute_metrics=compute_metrics,)

trainer.train()
trainer.evaluate()
trainer.save_model("distilbert-finetuned")
with open("binarizer.bin", "wb") as f:
    pickle.dump(multilabel, f)

Epoch,Training Loss,Validation Loss,Roc Auc,Hamming Loss,F1,Runtime,Samples Per Second,Steps Per Second
1,0.6997,0.693376,0.5,0.5,0.666667,0.0202,197.959,49.49
2,0.6973,0.685188,0.5,0.5,0.666667,0.0202,198.073,49.518
3,0.6914,0.676985,0.5,0.5,0.666667,0.0242,165.252,41.313
4,0.6859,0.667953,0.5,0.5,0.666667,0.0196,204.401,51.1
5,0.6726,0.660572,0.5,0.5,0.666667,0.0251,159.323,39.831
6,0.6688,0.654467,0.5,0.5,0.666667,0.0202,197.915,49.479
7,0.6545,0.648852,0.5,0.5,0.666667,0.0252,159.021,39.755
8,0.6434,0.644534,0.5,0.5,0.666667,0.0214,187.029,46.757
9,0.6702,0.64163,0.5,0.5,0.666667,0.0206,194.246,48.561
10,0.6326,0.640435,0.5,0.5,0.666667,0.0166,240.292,60.073


### Running the model
Now that the model has been trained, we can run it and give it a new instance to classify. 

In [None]:
with open("binarizer.bin", "rb") as f:
    multilabel_binarizer = pickle.load(f)

device = torch.device("cpu")
model.to(device)

new_instance = "black chair"
inputs = tokenizer(new_instance, return_tensors="pt") # tokenize string for proper input

# model outputs
outputs = model(**inputs)
predicted_probabilities = outputs.logits.sigmoid().detach().numpy()
binary_predictions = (predicted_probabilities > 0.1).astype(int) # threshold = 0.5
predicted_labels = multilabel.inverse_transform(binary_predictions)[0]

print("Predicted labels:", predicted_labels)


Predicted labels: (' plural', ' singular')


16 instances is far from enough for the model to learn relationships between labels and entries, but with enough data this would be effective. 