## fine tune bert model for custom dataset

### 1. install libraries

In [None]:
! pip install transformers

In [78]:
! pip install imbalanced-learn



### 2. load/define data set

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('data/banking.csv')

In [3]:
# str labels to int
df['intlabel'] = df['label'].rank(method='dense', ascending=False).astype(int) - 1

In [4]:
# prepare mapping from int labels back to str
labelmapping = {}
for key in df.intlabel.unique():
    value = df.loc[df['intlabel'] == key,'label'].unique()[0]
    labelmapping[key] = value
print(labelmapping)

{29: 'all_in_one', 28: 'bez_predcisli', 27: 'blokace_karty', 26: 'dekuji', 25: 'go_back', 24: 'hotovo', 23: 'jednorazova_platba', 22: 'jiny_email', 21: 'jiny_ucet', 20: 'konec', 19: 'odeslani_platby', 18: 'opravit', 17: 'pokracovat', 16: 'pomoc', 15: 'pomoc_platby', 14: 'pozdrav', 13: 'say_again', 12: 'validace', 11: 'verifikace_platby_vse_spatne', 10: 'zacit_znovu', 9: 'zadat_inkaso', 8: 'zadat_trvaly_prikaz', 7: 'zadna_operace', 6: 'zadost_o_strpeni', 5: 'zmenit_inkaso', 4: 'zmenit_trvaly_prikaz', 3: 'zruseni_platby', 2: 'zrusit_inkaso', 1: 'zrusit_trvaly_prikaz', 0: 'zustatek_uctu'}


In [5]:
n_labels = len(labelmapping.values())
print(n_labels)

30


In [6]:
print(df.shape)
df.sample(frac=1).head()

(173, 3)


Unnamed: 0,text,label,intlabel
120,Chtěla bych posílat dceři měsíčně peníze na je...,zadat_trvaly_prikaz,8
67,pokračovat,pokracovat,17
119,zadat inkaso,zadat_inkaso,9
23,dál už nic,hotovo,24
152,změnit trvalý příkaz,zmenit_trvaly_prikaz,4


In [7]:
# count how many samples per label in the dataset
dfcounts = df.groupby('label').size().reset_index(name='counts')
dfcounts

Unnamed: 0,label,counts
0,all_in_one,2
1,bez_predcisli,3
2,blokace_karty,9
3,dekuji,4
4,go_back,5
5,hotovo,9
6,jednorazova_platba,17
7,jiny_email,4
8,jiny_ucet,2
9,konec,6


In [8]:
# retrieve labels that have minimum number of samples
dfmincounts = dfcounts[dfcounts.counts > 2]

In [9]:
# remove data with labels that have less than minimum number of samples
df = df[df.label.isin(dfmincounts.label.tolist())]
df.groupby('label').size().reset_index(name='counts')

Unnamed: 0,label,counts
0,bez_predcisli,3
1,blokace_karty,9
2,dekuji,4
3,go_back,5
4,hotovo,9
5,jednorazova_platba,17
6,jiny_email,4
7,konec,6
8,odeslani_platby,5
9,pomoc,20


In [35]:
texts = df.text.tolist()
labels = df.intlabel.tolist()

In [36]:
trntxt, tsttxt, trnlbl, tstlbl = train_test_split(texts, labels, test_size=0.2)

### 3. preprocess text

#### 3.1 balance training dataset

In [37]:
# take a look into training data distribution to decide if balancing is necessary
dftrn = pd.DataFrame({'text': trntxt, 'label': [labelmapping[key] for key in trnlbl], 'intlabel': trnlbl})
dftrn.groupby('label').size().reset_index(name='counts')

Unnamed: 0,label,counts
0,bez_predcisli,3
1,blokace_karty,9
2,dekuji,4
3,go_back,4
4,hotovo,8
5,jednorazova_platba,12
6,jiny_email,2
7,konec,5
8,odeslani_platby,4
9,pomoc,14


In [38]:
from imblearn.over_sampling import RandomOverSampler
import numpy as np

In [39]:
sampler = RandomOverSampler(random_state=42)

In [40]:
trntxt = np.asarray(trntxt)
trntxt = trntxt[:, np.newaxis]

In [41]:
trntxt, trnlbl = sampler.fit_resample(trntxt, trnlbl)
trntxt = trntxt.flatten().tolist()

In [42]:
# chech the result of balancing
dftrn = pd.DataFrame({'text': trntxt, 'label': [labelmapping[key] for key in trnlbl], 'intlabel': trnlbl})
dftrn.groupby('label').size().reset_index(name='counts')

Unnamed: 0,label,counts
0,bez_predcisli,14
1,blokace_karty,14
2,dekuji,14
3,go_back,14
4,hotovo,14
5,jednorazova_platba,14
6,jiny_email,14
7,konec,14
8,odeslani_platby,14
9,pomoc,14


#### 3.2 encode string data with tokenizer

In [43]:
from transformers import AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [44]:
# load the same tokenizer a model was trained with
tokenizer = AutoTokenizer.from_pretrained("Seznam/small-e-czech")

In [45]:
trnencodings = tokenizer(trntxt, truncation=True, padding=True)
tstencodings = tokenizer(tsttxt, truncation=True, padding=True)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [48]:
trndata = [{'label': label, 'input_ids': inid, 'attention_mask': atmask} for label, inid, atmask in zip(trnlbl, trnencodings['input_ids'], trnencodings['attention_mask'])]
tstdata = [{'label': label, 'input_ids': inid, 'attention_mask': atmask} for label, inid, atmask in zip(tstlbl, tstencodings['input_ids'], tstencodings['attention_mask'])]

### 4. load pretrained model

In [49]:
from transformers import AutoModelForSequenceClassification

In [50]:
model = AutoModelForSequenceClassification.from_pretrained("Seznam/small-e-czech", num_labels=n_labels)

Some weights of the model checkpoint at Seznam/small-e-czech were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at Seznam/small-e-czech and are newly initialized: ['classifier.out_proj.weight', 'classifi

### 5. Fit model on a custom dataset

In [51]:
from transformers import TrainingArguments, Trainer

In [65]:
training_args = TrainingArguments(
    output_dir="./tunedbert",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    logging_steps=100,
    num_train_epochs=100,
    weight_decay=0.01,
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=trndata,
    eval_dataset=tstdata,
    tokenizer=tokenizer,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [66]:
trainer.train()

***** Running training *****
  Num examples = 322
  Num Epochs = 100
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 2100


Step,Training Loss
100,2.3679
200,2.2112
300,2.075
400,1.9462
500,1.8295
600,1.7217
700,1.6177
800,1.5229
900,1.447
1000,1.3536


Saving model checkpoint to ./tunedbert/checkpoint-500
Configuration saved in ./tunedbert/checkpoint-500/config.json
Model weights saved in ./tunedbert/checkpoint-500/pytorch_model.bin
tokenizer config file saved in ./tunedbert/checkpoint-500/tokenizer_config.json
Special tokens file saved in ./tunedbert/checkpoint-500/special_tokens_map.json
Saving model checkpoint to ./tunedbert/checkpoint-1000
Configuration saved in ./tunedbert/checkpoint-1000/config.json
Model weights saved in ./tunedbert/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in ./tunedbert/checkpoint-1000/tokenizer_config.json
Special tokens file saved in ./tunedbert/checkpoint-1000/special_tokens_map.json
Saving model checkpoint to ./tunedbert/checkpoint-1500
Configuration saved in ./tunedbert/checkpoint-1500/config.json
Model weights saved in ./tunedbert/checkpoint-1500/pytorch_model.bin
tokenizer config file saved in ./tunedbert/checkpoint-1500/tokenizer_config.json
Special tokens file saved in ./tunedber

TrainOutput(global_step=2100, training_loss=1.4196265048072452, metrics={'train_runtime': 891.5059, 'train_samples_per_second': 36.119, 'train_steps_per_second': 2.356, 'total_flos': 44438636793600.0, 'train_loss': 1.4196265048072452, 'epoch': 100.0})

### 6. Test trained model

In [74]:
rawpredictions = trainer.predict(tstdata)
pred_intlabels = np.argmax(rawpredictions.predictions, axis=1)
pred_labels = [labelmapping[lbl] for lbl in pred_intlabels]
gt_intlabels = np.array([entry['label'] for entry in tstdata])
gt_labels = [labelmapping[lbl] for lbl in gt_intlabels]

***** Running Prediction *****
  Num examples = 33
  Batch size = 16


In [75]:
correct = np.sum(pred_intlabels == gt_intlabels)
accuracy = correct / gt_intlabels.shape[0]
print("ACCURACY", accuracy, f" {correct} correct out of {gt_intlabels.shape[0]}")

ACCURACY 0.7575757575757576  25 correct out of 33


In [76]:
result = [{'text': txt, 'prediction': pred, 'ground truth': gt} for txt, pred, gt in zip(tsttxt, pred_labels, gt_labels)]
print(result)

[{'text': 'Chtěl bych si přeposlat peníze ze svého druhého účtu.', 'prediction': 'jednorazova_platba', 'ground truth': 'jednorazova_platba'}, {'text': 'platba na účet', 'prediction': 'jednorazova_platba', 'ground truth': 'jednorazova_platba'}, {'text': 'co mohu dělat', 'prediction': 'pomoc', 'ground truth': 'pomoc'}, {'text': 'Kolik na účtu zbývá', 'prediction': 'zustatek_uctu', 'ground truth': 'zustatek_uctu'}, {'text': 'už nic nepotřebuju', 'prediction': 'konec', 'ground truth': 'konec'}, {'text': 'vše', 'prediction': 'pomoc', 'ground truth': 'hotovo'}, {'text': 'Chtěl bych měsíčně posílat peníze na charitu.', 'prediction': 'zadat_trvaly_prikaz', 'ground truth': 'zadat_trvaly_prikaz'}, {'text': 'řekni to znova', 'prediction': 'say_again', 'ground truth': 'say_again'}, {'text': 'Jen si dojdu pro číslo účtu.', 'prediction': 'zrusit_trvaly_prikaz', 'ground truth': 'zadost_o_strpeni'}, {'text': 'Chci zrušit souhlas s inkasem, ale nemůžu to to tam nikde najít.', 'prediction': 'zrusit_inka