## fine tune bert model for custom dataset

### 1. install libraries

In [22]:
! pip install transformers

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


### 2. load/define data set

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('data/dummycsdata.csv')

In [3]:
# str labels to int
df['intlabel'] = df['label'].rank(method='dense', ascending=False).astype(int) - 1

In [4]:
# prepare mapping from int labels back to str
labelmapping = {}
for key in df.intlabel.unique():
    value = df.loc[df['intlabel'] == key,'label'].unique()[0]
    labelmapping[key] = value
print(labelmapping)

{3: 'HELP', 4: 'CONFIRMATION_YES', 5: 'CONFIRMATION_NO', 1: 'NEXT', 0: 'RESTART', 2: 'IRRELEVANT'}


In [5]:
n_labels = len(labelmapping.values())
print(n_labels)

6


In [6]:
print(df)

                                 text             label  intlabel
0                               pomoc              HELP         3
1                     potřebuju pomoc              HELP         3
2                            pomoz mi              HELP         3
3                     zobraz nápovědu              HELP         3
4                            nápověda              HELP         3
5                        nevím co dál              HELP         3
6                    co mám dělat dál              HELP         3
7                            poraď mi              HELP         3
8                                 ano  CONFIRMATION_YES         4
9                                  jo  CONFIRMATION_YES         4
10                                 ok  CONFIRMATION_YES         4
11                              jasně  CONFIRMATION_YES         4
12                                 ne   CONFIRMATION_NO         5
13                        ani náhodou   CONFIRMATION_NO         5
14        

In [7]:
texts = df.text.tolist()
labels = df.intlabel.tolist()

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
trntxt, tsttxt, trnlbl, tstlbl = train_test_split(texts, labels, test_size=0.1)

In [10]:
print(tsttxt, tstlbl, [labelmapping[key] for key in tstlbl])

['odznova', 'zvuky či videa', 'nápověda'] [0, 2, 3] ['RESTART', 'IRRELEVANT', 'HELP']


### 3. preprocess text

In [20]:
import tensorflow as tf
from transformers import AutoTokenizer

In [21]:
# load the same tokenizer a model was trained with
tokenizer = AutoTokenizer.from_pretrained("Seznam/small-e-czech")

In [22]:
trnencodings = tokenizer(trntxt, truncation=True, padding=True)
tstencodings = tokenizer(tsttxt, truncation=True, padding=True)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [23]:
trn_dataset = tf.data.Dataset.from_tensor_slices((
    dict(trnencodings),
    trnlbl
))
tst_dataset = tf.data.Dataset.from_tensor_slices((
    dict(tstencodings),
    tstlbl
))

### 4. load pretrained model

In [24]:
from transformers import AutoModelForSequenceClassification

In [25]:
model = AutoModelForSequenceClassification.from_pretrained("Seznam/small-e-czech", num_labels=n_labels)

Some weights of the model checkpoint at Seznam/small-e-czech were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense.weight']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at Seznam/small-e-czech and are newly initialized: ['classifier.out_proj.bias', 'classifier

### 5. Fit model on a custom dataset

In [26]:
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
model.compile(optimizer=optimizer, loss=model.compute_loss, metrics=['accuracy'])
model.fit(trn_dataset.shuffle(100).batch(16),
          epochs=10,
          batch_size=16,
          validation_data=tst_dataset.shuffle(100).batch(16))

AttributeError: 'ElectraForSequenceClassification' object has no attribute 'compile'

In [189]:
# ! mkdir 'output/custom_intent_csf'
# model.save_pretrained("output/custom_intent_csf")

In [27]:
from transformers import TrainingArguments, Trainer

In [32]:
training_args = TrainingArguments(
    output_dir="./tunedbert",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=trn_dataset,
    eval_dataset=tst_dataset,
    tokenizer=tokenizer,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [33]:
trainer.train()

***** Running training *****
  Num examples = 27
  Num Epochs = 2
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 4


TypeError: 'TensorSliceDataset' object is not subscriptable

### 6. Test trained model

In [190]:
import numpy as np

In [191]:
def predict(model, tokenizer, text) -> int:
    """
    :param model: trained model
    :param tokenizer: with which model was trained
    :param text: input to be classified
    :return: predicted label
    """
    tokenized = tokenizer.encode(text,
                                 truncation=True,
                                 padding=True,
                                 return_tensors="tf")
    tf_output = model.predict(tokenized)[0][0]
    return np.argmax(tf_output), tf_output 
    

In [299]:
test_sentence = "I need some help from you"

In [300]:
pred_label, model_output = predict(model, tokenizer, test_sentence)

In [301]:
print(labelmapping[pred_label], model_output)

HELP [-0.2925185 -0.6602275  0.7674151]
