# Fine Tuning DistilBERT for MultiLabel Text Classification - multilingual

In [1]:
# Importing stock ml libraries
import warnings
warnings.simplefilter('ignore')
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn import metrics
import transformers
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import DistilBertTokenizer, DistilBertModel
import logging
logging.basicConfig(level=logging.ERROR)

In [2]:
# # Setting up the device for GPU usage

from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
device

'cuda'

In [None]:
def hamming_score(y_true, y_pred, normalize=True, sample_weight=None):
    acc_list = []
    for i in range(y_true.shape[0]):
        set_true = set( np.where(y_true[i])[0] )
        set_pred = set( np.where(y_pred[i])[0] )
        tmp_a = None
        if len(set_true) == 0 and len(set_pred) == 0:
            tmp_a = 1
        else:
            tmp_a = len(set_true.intersection(set_pred))/\
                    float( len(set_true.union(set_pred)) )
        acc_list.append(tmp_a)
    return np.mean(acc_list)

In [17]:
"""load dataset

name: Bhuvaneshwari/intent_classification
from: https://huggingface.co/datasets/Bhuvaneshwari/intent_classification
"""
from datasets import load_dataset
dataset = load_dataset("Bhuvaneshwari/intent_classification")
dataset

Repo card metadata block was not found. Setting CardData to empty.


DatasetDict({
    train: Dataset({
        features: ['text', 'intent'],
        num_rows: 13808
    })
    validation: Dataset({
        features: ['text', 'intent'],
        num_rows: 13808
    })
    test: Dataset({
        features: ['text', 'intent'],
        num_rows: 13808
    })
})

In [18]:
dataset['validation'][:5]

{'text': ['listen to westbam alumb allergic on google music',
  'add step to me to the 50 clásicos playlist',
  'i give this current textbook a rating value of 1 and a best rating of 6',
  'play the song little robin redbreast',
  'please add iris dement to my playlist this is selena'],
 'intent': ['PlayMusic',
  'AddToPlaylist',
  'RateBook',
  'PlayMusic',
  'AddToPlaylist']}

In [19]:
train_filepath = "dataset/dataset.csv"
dataset['train'].to_csv(train_filepath)

Creating CSV from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

818072

In [20]:
df = pd.read_csv(train_filepath)
df

Unnamed: 0,text,intent
0,listen to westbam alumb allergic on google music,PlayMusic
1,add step to me to the 50 clásicos playlist,AddToPlaylist
2,i give this current textbook a rating value of...,RateBook
3,play the song little robin redbreast,PlayMusic
4,please add iris dement to my playlist this is ...,AddToPlaylist
...,...,...
13803,indeed,Affirmation
13804,indeed,Affirmation
13805,indeed,Affirmation
13806,indeed,Affirmation


In [21]:
df.columns = ["text", "labels"]
df

Unnamed: 0,text,labels
0,listen to westbam alumb allergic on google music,PlayMusic
1,add step to me to the 50 clásicos playlist,AddToPlaylist
2,i give this current textbook a rating value of...,RateBook
3,play the song little robin redbreast,PlayMusic
4,please add iris dement to my playlist this is ...,AddToPlaylist
...,...,...
13803,indeed,Affirmation
13804,indeed,Affirmation
13805,indeed,Affirmation
13806,indeed,Affirmation


In [22]:
df["labels"].values.tolist()
labels = set(df["labels"].values.tolist())
labels_dict = dict()
for i, label in enumerate(labels):
    labels_dict[label] = str(i)
labels_dict

{'Greetings': '0',
 'GetWeather': '1',
 'BookRestaurant': '2',
 'Cancellation': '3',
 'Affirmation': '4',
 'Book Meeting': '5',
 'SearchCreativeWork': '6',
 'PlayMusic': '7',
 'AddToPlaylist': '8',
 'SearchScreeningEvent': '9',
 'RateBook': '10',
 'excitment': '11'}

In [23]:
df['labels'] = df['labels'].apply(lambda x: labels_dict[x])
df

Unnamed: 0,text,labels
0,listen to westbam alumb allergic on google music,7
1,add step to me to the 50 clásicos playlist,8
2,i give this current textbook a rating value of...,10
3,play the song little robin redbreast,7
4,please add iris dement to my playlist this is ...,8
...,...,...
13803,indeed,4
13804,indeed,4
13805,indeed,4
13806,indeed,4


In [24]:
# label_df = pd.get_dummies(df['labels'], dtype=int)
onehot_df = pd.get_dummies(df['labels'], prefix='label', dtype=int)
sorted_colums = sorted(onehot_df.columns, key=lambda x: int(x.split('_')[1]))
onehot_df = onehot_df[sorted_colums]
onehot_df

Unnamed: 0,label_0,label_1,label_2,label_3,label_4,label_5,label_6,label_7,label_8,label_9,label_10,label_11
0,0,0,0,0,0,0,0,1,0,0,0,0
1,0,0,0,0,0,0,0,0,1,0,0,0
2,0,0,0,0,0,0,0,0,0,0,1,0
3,0,0,0,0,0,0,0,1,0,0,0,0
4,0,0,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
13803,0,0,0,0,1,0,0,0,0,0,0,0
13804,0,0,0,0,1,0,0,0,0,0,0,0
13805,0,0,0,0,1,0,0,0,0,0,0,0
13806,0,0,0,0,1,0,0,0,0,0,0,0


In [25]:
df['labels'] = onehot_df.values.tolist()
df

Unnamed: 0,text,labels
0,listen to westbam alumb allergic on google music,"[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0]"
1,add step to me to the 50 clásicos playlist,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0]"
2,i give this current textbook a rating value of...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0]"
3,play the song little robin redbreast,"[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0]"
4,please add iris dement to my playlist this is ...,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0]"
...,...,...
13803,indeed,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]"
13804,indeed,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]"
13805,indeed,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]"
13806,indeed,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]"


In [26]:
new_df = df

In [None]:
# data = pd.read_csv('train.csv')
# data

In [27]:
# Sections of config

# Defining some key variables that will be used later on in the training
MAX_LEN = 128
TRAIN_BATCH_SIZE = 4
VALID_BATCH_SIZE = 4
EPOCHS = 20
LEARNING_RATE = 1e-05

In [34]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert/distilbert-base-multilingual-cased', truncation=True, do_lower_case=True)

In [None]:
class MultiLabelDataset(Dataset):

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.text
        self.targets = self.data.labels
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = str(self.text[index])
        text = " ".join(text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }

In [None]:
# Creating the dataset and dataloader for the neural network
train_size = 0.8
train_data=new_df.sample(frac=train_size,random_state=200)
test_data=new_df.drop(train_data.index).reset_index(drop=True)
train_data = train_data.reset_index(drop=True)

print("FULL Dataset: {}".format(new_df.shape))
print("TRAIN Dataset: {}".format(train_data.shape))
print("TEST Dataset: {}".format(test_data.shape))

training_set = MultiLabelDataset(train_data, tokenizer, MAX_LEN)
testing_set = MultiLabelDataset(test_data, tokenizer, MAX_LEN)
# training_set = MultiLabelDataset(train_data, tokenizer)
# testing_set = MultiLabelDataset(test_data, tokenizer)

In [None]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

# Creating the Neural Network for Fine Tuning

In [3]:
# Creating the customized model, by adding a drop out and a dense layer on top of distil bert to get the final output for the model.
class DistilBERTClass(torch.nn.Module):
    def __init__(self):
        super(DistilBERTClass, self).__init__()
        # self.l1 = DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.l1 = DistilBertModel.from_pretrained("distilbert/distilbert-base-multilingual-cased")
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.1)
        self.classifier = torch.nn.Linear(768, 12)

    def forward(self, input_ids, attention_mask, token_type_ids):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = output_1[0]
        pooler = hidden_state[:, 0]
        pooler = self.pre_classifier(pooler)
        pooler = torch.nn.Tanh()(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        return output

In [None]:
model = DistilBERTClass()
model.to(device)

In [None]:
def loss_fn(outputs, targets):
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)

In [None]:
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

In [None]:
def train(epoch):
    model.train()
    for _,data in tqdm(enumerate(training_loader, 0)):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float)

        outputs = model(ids, mask, token_type_ids)

        optimizer.zero_grad()
        loss = loss_fn(outputs, targets)
        if _%5000==0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')

        loss.backward()
        optimizer.step()

In [None]:
for epoch in range(EPOCHS):
    train(epoch)

# Validating Model

In [None]:
def validation(testing_loader):
    model.eval()
    fin_targets=[]
    fin_outputs=[]
    with torch.no_grad():
        for _, data in tqdm(enumerate(testing_loader, 0)):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.float)
            outputs = model(ids, mask, token_type_ids)
            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
    return fin_outputs, fin_targets

In [None]:
output, targets = validation(testing_loader)
print(f"outputs:\n{output}")
print(f"targets:\n{targets}")

final_outputs = np.array(output) >=0.5
print(f"final outputs:\n{final_outputs}")


In [None]:
import numpy as np
list(map(lambda x: np.where(x == True)[0].item(), final_outputs[:10]))

In [None]:
val_hamming_loss = metrics.hamming_loss(targets, final_outputs)
val_hamming_score = hamming_score(np.array(targets), np.array(final_outputs))

print(f"Hamming Score = {val_hamming_score}")
print(f"Hamming Loss = {val_hamming_loss}")

# Saving the files for inference

In [None]:
from datetime import datetime
today = datetime.now().strftime("%Y%m%d")
output_model_file = './output/pytorch_distilbert_{}.bin'.format(today)
output_vocab_file = './output/vocab_distilbert_{}.bin'.format(today)

torch.save(model, output_model_file)
tokenizer.save_vocabulary(output_vocab_file)

print('Saved')

# Load model for inference

In [15]:
import os

output_model_filepath = "output/pytorch_distilbert_20240903.bin"
output_vocab_filepath = "output/vocab_distilbert_20240903.bin"

assert os.path.isfile(output_model_filepath)
assert os.path.isfile(output_vocab_filepath)

In [32]:
model = torch.load(output_model_filepath).to(device)
model.eval()
model

DistilBERTClass(
  (l1): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Linear(i

# Predict

In [10]:
def predict(model, tokenizer, texts:list):
    # model.eval()
    
    outputs = list()
    for text in texts:
        text = " ".join(text.split())
        print(f"text:{text}")
        
        inputs = tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=MAX_LEN,
            pad_to_max_length=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]
        print("=================== ids ====================")
        print(ids)
        
        print("=================== mask ====================")
        print(mask)
        
        print("=================== token_type_ids ====================")
        print(token_type_ids)
        
        data = {
                'ids': torch.tensor(ids, dtype=torch.long),
                'mask': torch.tensor(mask, dtype=torch.long),
                'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
        }
        print(f"data:\n{data}")
        
        with torch.no_grad():
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            
            output = None
            try:
                output = model(ids, mask, token_type_ids)
            except Exception as e:
                print(e)
            else:
                print(f"outputs:\n{output}")
                outputs.append(output)
            
            # result = torch.sigmoid(outputs).cpu().detach().numpy().tolist()
            # print(f"result:{result}")
            # result = np.array(result) >= 0.5
            # print(f"result:{result}")
            
            # label = result.index(True)
            # print(f"label: {label}")
        return outputs

In [11]:
# 0	listen to westbam alumb allergic on google music	2
# 1	add step to me to the 50 clásicos playlist	6
# 2	i give this current textbook a rating value of...	0
# 3	play the song little robin redbreast	2
# 4	please add iris dement to my playlist this is ...	6

predict(model, tokenizer, "listen to westbam alumb allergic on google music")

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


text:listen to westbam alumb allergic on google music
[101, 55129, 10114, 13735, 10537, 10147, 10164, 10465, 10457, 17851, 42153, 10135, 41181, 11839, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [12]:
model

DistilBERTClass(
  (l1): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Linear(i

In [13]:
tokenizer

DistilBertTokenizer(name_or_path='distilbert/distilbert-base-multilingual-cased', vocab_size=119547, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [35]:
tokenizer = DistilBertTokenizer.from_pretrained(output_vocab_filepath)
tokenizer

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertTokenizer'. 
The class this function is called from is 'DistilBertTokenizer'.


DistilBertTokenizer(name_or_path='output/vocab_distilbert_20240903.bin', vocab_size=119547, model_max_length=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [36]:
# 0	listen to westbam alumb allergic on google music	[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0]
# 1	add step to me to the 50 clásicos playlist	[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0]
# 2	i give this current textbook a rating value of...	[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0]
# 3	play the song little robin redbreast	[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0]
# 4	please add iris dement to my playlist this is ...	[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0]

input_text = "listen to westbam alumb allergic on google music"
inputs = tokenizer(input_text, return_tensors="pt").to(device)

with torch.no_grad():
    outputs = model(
        input_ids=inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        token_type_ids=None
    )
    
    

In [37]:
outputs

tensor([[-12.0543, -13.0372,   9.5634, -13.8833, -11.8166, -14.1044, -10.2269,
         -14.5835, -11.0993,  -9.9678, -10.4530, -14.6204]], device='cuda:0')

In [45]:
result = torch.sigmoid(outputs).cpu().detach().numpy().tolist()
print(f"result:{result}")

result = np.array(result) >= 0.5
print(f"result:{result}")

np.where(result == True)

result:[[5.8193809309159406e-06, 2.177785290768952e-06, 0.9999297857284546, 9.34483239234396e-07, 7.3807182161544915e-06, 7.491183282581915e-07, 3.618096889113076e-05, 4.6393546426770627e-07, 1.5122902368602809e-05, 4.688165063271299e-05, 2.886036782001611e-05, 4.4715145008922264e-07]]
result:[[False False  True False False False False False False False False False]]


(array([0]), array([2]))

In [46]:
true_indices = np.where(result == True)[1]
true_indices

array([2])

In [51]:
label_index = true_indices.item()
label_index

2