Fine-tune the HerBERT model.

In [1]:
from transformers import AutoTokenizer, BertForSequenceClassification
from transformers import TrainingArguments, Trainer
from datasets import load_dataset
import torch
import random
import numpy as np
import evaluate

In [2]:
# Read the train and dev files with the 'json' loader; the resulting
# DatasetDict exposes them as the 'train' and 'eval' splits.
dataset = load_dataset(
    'json',
    data_files=dict(
        train='dataset-train-10-morfeusz.jl',
        eval='dataset-dev-10-morfeusz.jl',
    ),
)

Using custom data configuration default-3b89f3670fe5d35f
Found cached dataset json (/home/i306412/.cache/huggingface/datasets/json/default-3b89f3670fe5d35f/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab)


  0%|          | 0/2 [00:00<?, ?it/s]

In [3]:
# Optional smoke-test switch: uncomment to train on only the first 100 rows.
# dataset['train'] = dataset['train'].select(range(100))

# Keep at most the first 200 dev rows for evaluation (presumably to keep the
# periodic evaluation during training cheap). The min() clamp prevents an
# out-of-range selection when the dev split holds fewer than 200 rows.
dataset['eval'] = dataset['eval'].select(range(min(200, len(dataset['eval']))))

In [4]:
# Show the DatasetDict to check the split names, columns and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['question_id', 'question_text', 'passages_id', 'passage_text', 'label'],
        num_rows: 44010
    })
    eval: Dataset({
        features: ['question_id', 'question_text', 'passages_id', 'passage_text', 'label'],
        num_rows: 200
    })
})

In [5]:
# Load the tokenizer published with the "allegro/herbert-base-cased" checkpoint,
# i.e. the same checkpoint name the classifier is later initialized from.
tokenizer = AutoTokenizer.from_pretrained("allegro/herbert-base-cased")

In [6]:
def tokenize_function(row):
    """Tokenize question/passage pairs for sequence-pair classification.

    Called through ``dataset.map(..., batched=True)``, so ``row`` is a batch:
    ``row['question_text']`` and ``row['passage_text']`` are parallel lists.

    Args:
        row: Batch of examples containing the 'question_text' and
            'passage_text' columns.

    Returns:
        The tokenizer output for the batch (adds the 'input_ids',
        'token_type_ids' and 'attention_mask' columns to the dataset).
    """
    return tokenizer(
        text=row['question_text'],
        text_pair=row['passage_text'],
        padding='max_length',
        # Padding alone never shortens a sequence: without truncation an
        # over-long question/passage pair would exceed the model's maximum
        # length and produce rows of unequal size.
        truncation=True,
    )
tokenized_dataset = dataset.map(tokenize_function, batched=True)

Loading cached processed dataset at /home/i306412/.cache/huggingface/datasets/json/default-3b89f3670fe5d35f/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab/cache-823d1cb18a0f4951.arrow
Loading cached processed dataset at /home/i306412/.cache/huggingface/datasets/json/default-3b89f3670fe5d35f/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab/cache-561f5ba535896b9d.arrow


In [7]:
# Pull the two tokenized splits out of the DatasetDict for the Trainer.
train_dataset, eval_dataset = tokenized_dataset['train'], tokenized_dataset['eval']

In [8]:
# Inspect the tokenized training split (columns and row count).
train_dataset

Dataset({
    features: ['question_id', 'question_text', 'passages_id', 'passage_text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 44010
})

In [9]:
# Inspect the tokenized evaluation split (columns and row count).
eval_dataset

Dataset({
    features: ['question_id', 'question_text', 'passages_id', 'passage_text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 200
})

In [10]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [11]:
# Two-class classifier initialized from the HerBERT checkpoint and placed on
# the selected device. Label 0 = irrelevant, label 1 = relevant.
model = BertForSequenceClassification.from_pretrained(
    "allegro/herbert-base-cased",
    num_labels=2,
).to(device)

Some weights of the model checkpoint at allegro/herbert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.sso.sso_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.sso.sso_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification 

In [12]:
metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Score a batch of evaluation predictions with the accuracy metric.

    Args:
        eval_pred: A pair ``(logits, labels)``; the predicted class for each
            example is the argmax of the logits over the last axis.

    Returns:
        Whatever ``metric.compute`` returns for the predicted classes versus
        the reference labels.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=-1)
    result = metric.compute(predictions=predicted_classes, references=references)
    return result

In [13]:
# One-epoch fine-tuning run; checkpoints go under the output directory below.
training_args = TrainingArguments(
    output_dir="model/morfeusz-10-1epochs",
    num_train_epochs=1,
    # Logging, evaluation and checkpointing all share one 500-step cadence.
    logging_strategy="steps",
    evaluation_strategy="steps",
    logging_steps=500,
    eval_steps=500,
    save_steps=500,
    # Retain a single checkpoint on disk and ask the Trainer to load the best
    # model once training finishes.
    save_total_limit=1,
    load_best_model_at_end=True,
)

# Wire the model, both splits and the accuracy callback into the Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=eval_dataset,
    train_dataset=train_dataset,
)

# Run training, then persist the resulting model.
trainer.train()
trainer.save_model()

The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: question_id, question_text, passage_text, passages_id. If question_id, question_text, passage_text, passages_id are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 44010
  Num Epochs = 1
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 2751
  Number of trainable parameters = 124444418


Step,Training Loss,Validation Loss,Accuracy
500,0.5436,0.49813,0.77
1000,0.4656,0.434011,0.78
1500,0.4233,0.391742,0.82
2000,0.3901,0.361114,0.85
2500,0.3452,0.332435,0.855


The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: question_id, question_text, passage_text, passages_id. If question_id, question_text, passage_text, passages_id are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 200
  Batch size = 16
Saving model checkpoint to model/morfeusz-10-2epochs/checkpoint-500
Configuration saved in model/morfeusz-10-2epochs/checkpoint-500/config.json
Model weights saved in model/morfeusz-10-2epochs/checkpoint-500/pytorch_model.bin
The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: question_id, question_text, passage_text, passages_id. If question_id, question_text, passage_text, passages_id are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this m

In [None]:
# Persist the trainer's current model. NOTE(review): the training cell already
# calls trainer.save_model() right after trainer.train(), so this cell repeats
# that save; it is only needed if training was run or resumed separately.
trainer.save_model()