## fine tune bert model for custom dataset

### 1. load/define data set

In [151]:
# define dummy data to test api
data = {"train": [{'text': "would you please help me", 'label': "HELP"}, {'text': "give me some hints", 'label': "HELP"}, 
                  {'text': "how should I proceed", 'label': "HELP"}, {'text': "I don't understand", 'label': "HELP"},
                  {'text': "yeap", 'label': "YES"}, {'text': "ok", 'label': "YES"}, {'text': "fine", 'label': "YES"}, {'text': "sure", 'label': "YES"}, {'text': "perfect", 'label': "YES"},
                  {'text': "no", 'label': "NO"}, {'text': "cancel", 'label': "NO"}, {'text': "disagree", 'label': "NO"}],
        "valid": [{'text': "please help", 'label': "HELP"}, {'text': "hints", 'label': "HELP"}, {'text': "help", 'label': "HELP"}, {'text': "I don't get it", 'label': "HELP"},
                  {'text': "let's do it", 'label': "YES"}, {'text': "right", 'label': "YES"}, {'text': "well, ok", 'label': "YES"}, {'text': "ok, fine", 'label': "YES"},
                  {'text': "nah", 'label': "NO"}, {'text': "I don't want it", 'label': "NO"}, {'text': "no no no", 'label': "NO"}, {'text': "stop it", 'label': "NO"}]}
labels = ["HELP", "YES", "NO"]

### 2. install transofrmers library

In [3]:
! pip install transformers



### 3. preprocess text

In [152]:
from transformers import AutoTokenizer

In [153]:
# load the same tokenizer a model was trained with
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

loading configuration file https://huggingface.co/distilbert-base-uncased/resolve/main/config.json from cache at /home/barinale/.cache/huggingface/transformers/23454919702d26495337f3da04d1655c7ee010d5ec9d77bdb9e399e00302c0a1.91b885ab15d631bf9cee9dc9d25ece0afd932f2f5130eba28f2055b2220c0333
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.14.1",
  "vocab_size": 30522
}

loading file https://huggingface.co/distilbert-base-uncased/resolve/main/vocab.txt from cache at /home/barinale/.cache/huggingface/transformers/0e1bbfda7f6

In [132]:
def preprocess(sample: dict, tokenizer: AutoTokenizer) -> dict:
    """
    :param sample: dict in a format {'text': .., 'label': ..}
    :param tokenizer: same tokenizer a model was trained with
    :return: {'text': tokenized truncated text, 'label': ..}
    """
    sample.update(tokenizer(sample['text'], truncation=True, padding="max_length"))
    sample.pop('text')
    return sample

In [133]:
tokenized = {key: [preprocess(sample, tokenizer) for sample in dataset] for key, dataset in data.items()}

In [138]:
print(tokenized['train'][0])

{'label': 'HELP', 'input_ids': [101, 2052, 2017, 3531, 2393, 2033, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [139]:
from transformers import DataCollatorWithPadding

In [140]:
collator = DataCollatorWithPadding(tokenizer=tokenizer)

### 4. load pretrained model

In [141]:
from transformers import AutoModelForSequenceClassification

In [142]:
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=len(labels))

loading configuration file https://huggingface.co/distilbert-base-uncased/resolve/main/config.json from cache at /home/barinale/.cache/huggingface/transformers/23454919702d26495337f3da04d1655c7ee010d5ec9d77bdb9e399e00302c0a1.91b885ab15d631bf9cee9dc9d25ece0afd932f2f5130eba28f2055b2220c0333
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.14.1",
  "vocab_size": 30522
}

l

In [143]:
from transformers import TrainingArguments, Trainer

#### a. Define your training hyperparameters in TrainingArguments.

In [144]:
! mkdir './tunedbert'

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
mkdir: cannot create directory ‘./tunedbert’: File exists


In [145]:
training_args = TrainingArguments(
    output_dir="./tunedbert",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


#### b. Pass the training arguments to a Trainer along with the model, dataset, tokenizer, and data collator.

In [149]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["valid"],
    data_collator=collator,
    tokenizer=tokenizer,
)


#### c. Call Trainer.train() to fine-tune your model.

In [150]:
trainer.train()

***** Running training *****
  Num examples = 12
  Num Epochs = 2
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 2


ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length.