## Fine-tune a DistilBERT model on a custom dataset

### 1. install libraries

In [22]:
! pip install transformers

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


### 2. load/define data set

In [172]:
import pandas as pd

In [173]:
# Read the labelled examples; the cells below use its `text` and `label` columns.
df = pd.read_csv('data/dummydata.csv')

In [174]:
# Turn the string labels into integer class ids: dense-rank the label values in
# descending order, then shift the 1-based ranks so the ids start at 0.
label_rank = df['label'].rank(method='dense', ascending=False)
df['intlabel'] = label_rank.astype(int) - 1

In [175]:
# Reverse lookup: integer class id -> the string label it was derived from.
labelmapping = {
    code: df.loc[df['intlabel'] == code, 'label'].unique()[0]
    for code in df.intlabel.unique()
}
print(labelmapping)

{2: 'HELP', 0: 'YES', 1: 'NO'}


In [176]:
# Number of distinct classes = number of entries in the id -> label mapping.
n_labels = len(labelmapping)
print(n_labels)

3


In [177]:
# Inspect the frame, including the derived `intlabel` column.
print(df)

                        text label  intlabel
0   would you please help me  HELP         2
1         give me some hints  HELP         2
2       how should I proceed  HELP         2
3         I don't understand  HELP         2
4                       yeap   YES         0
5                         ok   YES         0
6                       fine   YES         0
7                       sure   YES         0
8                    perfect   YES         0
9                         no    NO         1
10                    cancel    NO         1
11                  disagree    NO         1
12               please help  HELP         2
13                     hints  HELP         2
14                      help  HELP         2
15            I don't get it  HELP         2
16               let's do it   YES         0
17                     right   YES         0
18                  well, ok   YES         0
19                  ok, fine   YES         0
20                       nah    NO         1
21        

In [178]:
# Plain Python lists of inputs and integer targets, in matching row order.
texts, labels = df['text'].tolist(), df['intlabel'].tolist()

In [179]:
from sklearn.model_selection import train_test_split

In [180]:
# Hold out 10% of the examples for evaluation. A fixed `random_state` makes the split
# (and therefore every result that follows) reproducible across kernel restarts.
trntxt, tsttxt, trnlbl, tstlbl = train_test_split(
    texts, labels, test_size=0.1, random_state=42
)

In [181]:
# Show the held-out texts with their integer ids and the corresponding string labels.
tst_label_names = [labelmapping[label_id] for label_id in tstlbl]
print(tsttxt, tstlbl, tst_label_names)

["I don't understand", 'yeap', 'no'] [2, 0, 1] ['HELP', 'YES', 'NO']


### 3. preprocess text

In [182]:
import tensorflow as tf
from transformers import DistilBertTokenizerFast

In [183]:
# Load the tokenizer that belongs to the pretrained checkpoint, so the text is
# encoded the same way as when that model was trained.
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

In [184]:
# Encode both splits with identical tokenizer options (truncation and padding enabled).
encode_options = {"truncation": True, "padding": True}
trnencodings = tokenizer(trntxt, **encode_options)
tstencodings = tokenizer(tsttxt, **encode_options)

In [185]:
def _as_tf_dataset(encodings, targets):
    """Pair tokenizer output (converted to a plain dict) with its labels as a tf.data.Dataset."""
    return tf.data.Dataset.from_tensor_slices((dict(encodings), targets))


trn_dataset = _as_tf_dataset(trnencodings, trnlbl)
tst_dataset = _as_tf_dataset(tstencodings, tstlbl)

### 4. load pretrained model

In [186]:
from transformers import TFDistilBertForSequenceClassification

In [187]:
# Load the pretrained checkpoint for sequence classification, with `num_labels` set to
# the number of classes found in the data (`n_labels`).
model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=n_labels)

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForSequenceClassification: ['vocab_layer_norm', 'vocab_transform', 'vocab_projector', 'activation_13']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier', 'dropout_59', 'pre_classifier']
You should probably TRAIN this model on a down-stream task to be able to use i

In [188]:
# Training hyper-parameters, named so they are easy to find and tune.
LEARNING_RATE = 5e-5
BATCH_SIZE = 16
EPOCHS = 10

optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE)
model.compile(optimizer=optimizer, loss=model.compute_loss, metrics=['accuracy'])

# The tf.data pipelines are batched explicitly, so `batch_size` is NOT passed to `fit`:
# Keras documents that it must not be specified for dataset inputs.
# Only the training data is shuffled; sample order does not affect validation metrics.
# NOTE: validation here uses the held-out test split (`tst_dataset`).
model.fit(trn_dataset.shuffle(100).batch(BATCH_SIZE),
          epochs=EPOCHS,
          validation_data=tst_dataset.batch(BATCH_SIZE))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fa9454b7950>

In [189]:
# ! mkdir 'output/custom_intent_csf'
# model.save_pretrained("output/custom_intent_csf")

### 5. Test trained model

In [190]:
import numpy as np

In [191]:
def predict(model, tokenizer, text) -> "tuple[int, np.ndarray]":
    """
    Classify a single piece of text with a trained sequence classifier.

    :param model: trained model; the result of its ``predict`` is indexed as ``[0][0]``
        to get the scores for the one encoded input
    :param tokenizer: with which model was trained
    :param text: input to be classified
    :return: ``(label, scores)`` - the index of the highest score as a plain ``int``,
        and the raw per-class scores the model produced for ``text``
    """
    tokenized = tokenizer.encode(text,
                                 truncation=True,
                                 padding=True,
                                 return_tensors="tf")
    # First [0] picks the first element of the model output, second [0] the row
    # for the single text that was encoded.
    tf_output = model.predict(tokenized)[0][0]
    # Convert to a built-in int so callers do not receive a NumPy integer scalar.
    return int(np.argmax(tf_output)), tf_output
    

In [216]:
# Ad-hoc input used to try out the trained classifier.
test_sentence = "hell yes"

In [217]:
# `predict` returns both the winning class index and the raw per-class model output.
pred_label, model_output = predict(model, tokenizer, test_sentence)

In [218]:
# Translate the predicted class index back to its string label and show the raw output next to it.
print(labelmapping[pred_label], model_output)

YES [ 0.90307343  0.1757419  -0.7780264 ]
