# Clasificación de tokens con HuggingFace

In [1]:
import sys
sys.path.append('../datos/procesamiento')
from corpus import Corpus

corpus_train = Corpus()
corpus_test = Corpus()

---

## Cargando el corpus como dataset de HuggingFace

In [2]:
ner_dict = {'O': 0,
            'B-ACTIVE_PRINCIPLE': 1,
            'I-ACTIVE_PRINCIPLE': 2,
            'B-FORMA_FARMA':3,
            'I-FORMA_FARMA':4,
            'B-ADMIN': 5,
            'I-ADMIN': 6,
            'B-PERIODICITY': 7,
            'I-PERIODICITY': 8,
            'B-DURATION': 9,
            'I-DURATION': 10
            }

corpus_train.entidades = ner_dict
corpus_test.entidades = ner_dict

In [3]:
corpus_train.load_conll('../datos/Etiquetado/corpus_ER_train_v2.txt')

Agregadas 82481 secuencias de token-entidad al corpus


In [4]:
corpus_test.load_conll('../datos/Etiquetado/corpus_ER_test_v2.txt')

Agregadas 20621 secuencias de token-entidad al corpus


In [5]:
from datasets import DatasetDict

HF_dataset = DatasetDict()

HF_dataset['train'] = corpus_train.to_HF_dataset()
HF_dataset['test'] = corpus_test.to_HF_dataset()

In [6]:
HF_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 82481
    })
    test: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 20621
    })
})

## Cargamos un modelo basado en Transformers de HuggingFace

Nuestro modelo será: plncmm/bert-clinical-scratch-wl-es. Ha sido fine-tuneado con texto médico (aunque probablemente no con prescripciones)

[Fuente de esta sección](https://huggingface.co/docs/transformers/tasks/token_classification)

### Tokenizer

In [7]:
MODEL = "plncmm/bert-clinical-scratch-wl-es"

In [8]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(MODEL)

Usaremos el tokenizador para codificar nuestro input.

In [9]:
example = HF_dataset["train"][0]

tokenized_input = tokenizer(example["tokens"], is_split_into_words=True)
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
tokens

['[CLS]',
 'ácido',
 'val',
 '##pro',
 '##ico',
 '250',
 'mg',
 'comprimido',
 'recu',
 '##bier',
 '##to',
 '1',
 'comprimido',
 'oral',
 'diaria',
 '0',
 '-',
 '0',
 '-',
 '250',
 '##m',
 '##g',
 '[SEP]']

Debemos corregir el _mismatch_ entre input tokenizado y la lista de tokens

In [10]:
# from auxfunctions import tokenize_and_align_labels

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples[f"ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:  # Set the special tokens to -100.
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:  # Only label the first token of a given word.
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [11]:
tokenized_data = HF_dataset.map(tokenize_and_align_labels, batched=True)
tokenized_data = tokenized_data.remove_columns(['id','tokens','ner_tags'])

  0%|          | 0/83 [00:00<?, ?ba/s]

  0%|          | 0/21 [00:00<?, ?ba/s]

In [12]:
tokenized_data.data

{'train': InMemoryTable
 input_ids: list<item: int32>
   child 0, item: int32
 token_type_ids: list<item: int8>
   child 0, item: int8
 attention_mask: list<item: int8>
   child 0, item: int8
 labels: list<item: int64>
   child 0, item: int64
 ----
 input_ids: [[[4,11044,1590,4769,1248,...,1139,9392,30967,30972,5],[4,27576,3356,1087,30968,...,29258,1751,5631,15359,5],...,[4,23565,1820,27604,5880,...,1748,1129,1003,2596,5],[4,4017,5026,7736,1316,...,5631,1748,997,2596,5]],[[4,15772,13955,3284,16056,...,2596,1672,2286,12873,5],[4,13695,1820,1204,1791,...,1748,1129,1003,2596,5],...,[4,20482,6180,30957,4665,...,2596,1672,2286,12873,5],[4,30305,7409,1108,19333,...,1411,2242,1074,1707,5]],...,[[4,1097,23070,14921,4665,...,1139,1444,1139,1129,5],[4,1651,4714,3886,14008,...,2596,1672,1002,12873,5],...,[4,1097,23070,14921,3284,...,1013,989,1001,30974,5],[4,19769,2470,19624,16057,...,2596,1672,1129,12873,5]],[[4,24236,21335,1575,1444,...,15359,1672,1098,12873,5],[4,14053,4021,18229,1405,...,1098

### Modelo

In [13]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

model = AutoModelForTokenClassification.from_pretrained(MODEL, num_labels=len(ner_dict))

Some weights of the model checkpoint at plncmm/bert-clinical-scratch-wl-es were not used when initializing BertForTokenClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at plncmm/bert-clinical-s

## Entrenamiento

In [14]:
training_args = TrainingArguments(
    output_dir = "./results",
    evaluation_strategy = "epoch",
    learning_rate = 2e-5,
    per_device_train_batch_size = 16,
    per_device_eval_batch_size = 16,
    num_train_epochs = 5,
    weight_decay = 0.01,
)

In [15]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset = tokenized_data["train"],
    eval_dataset = tokenized_data["test"],
    tokenizer=tokenizer,
    data_collator = data_collator,
)

In [16]:
trainer.train()

***** Running training *****
  Num examples = 82481
  Num Epochs = 5
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 25780


  0%|          | 0/25780 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
Saving model checkpoint to ./results/checkpoint-500
Configuration saved in ./results/checkpoint-500/config.json


{'loss': 0.3016, 'learning_rate': 1.961210240496509e-05, 'epoch': 0.1}


Model weights saved in ./results/checkpoint-500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-500/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-1000
Configuration saved in ./results/checkpoint-1000/config.json


{'loss': 0.0723, 'learning_rate': 1.922420480993018e-05, 'epoch': 0.19}


Model weights saved in ./results/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-1000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-1000/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-1500
Configuration saved in ./results/checkpoint-1500/config.json


{'loss': 0.0391, 'learning_rate': 1.8836307214895268e-05, 'epoch': 0.29}


Model weights saved in ./results/checkpoint-1500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-1500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-1500/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-2000
Configuration saved in ./results/checkpoint-2000/config.json


{'loss': 0.0253, 'learning_rate': 1.844840961986036e-05, 'epoch': 0.39}


Model weights saved in ./results/checkpoint-2000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-2000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-2000/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-2500
Configuration saved in ./results/checkpoint-2500/config.json


{'loss': 0.0194, 'learning_rate': 1.806051202482545e-05, 'epoch': 0.48}


Model weights saved in ./results/checkpoint-2500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-2500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-2500/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-3000
Configuration saved in ./results/checkpoint-3000/config.json


{'loss': 0.0158, 'learning_rate': 1.7672614429790537e-05, 'epoch': 0.58}


Model weights saved in ./results/checkpoint-3000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-3000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-3000/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-3500
Configuration saved in ./results/checkpoint-3500/config.json


{'loss': 0.0106, 'learning_rate': 1.7284716834755626e-05, 'epoch': 0.68}


Model weights saved in ./results/checkpoint-3500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-3500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-3500/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-4000
Configuration saved in ./results/checkpoint-4000/config.json


{'loss': 0.0096, 'learning_rate': 1.6896819239720715e-05, 'epoch': 0.78}


Model weights saved in ./results/checkpoint-4000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-4000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-4000/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-4500
Configuration saved in ./results/checkpoint-4500/config.json


{'loss': 0.0096, 'learning_rate': 1.6508921644685803e-05, 'epoch': 0.87}


Model weights saved in ./results/checkpoint-4500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-4500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-4500/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-5000
Configuration saved in ./results/checkpoint-5000/config.json


{'loss': 0.0085, 'learning_rate': 1.6121024049650892e-05, 'epoch': 0.97}


Model weights saved in ./results/checkpoint-5000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-5000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-5000/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 20621
  Batch size = 16


  0%|          | 0/1289 [00:00<?, ?it/s]

{'eval_loss': 0.004957306664437056, 'eval_runtime': 66.1481, 'eval_samples_per_second': 311.74, 'eval_steps_per_second': 19.487, 'epoch': 1.0}


Saving model checkpoint to ./results/checkpoint-5500
Configuration saved in ./results/checkpoint-5500/config.json


{'loss': 0.0054, 'learning_rate': 1.5733126454615984e-05, 'epoch': 1.07}


Model weights saved in ./results/checkpoint-5500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-5500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-5500/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-6000
Configuration saved in ./results/checkpoint-6000/config.json


{'loss': 0.0039, 'learning_rate': 1.5345228859581073e-05, 'epoch': 1.16}


Model weights saved in ./results/checkpoint-6000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-6000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-6000/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-6500
Configuration saved in ./results/checkpoint-6500/config.json


{'loss': 0.0059, 'learning_rate': 1.4957331264546162e-05, 'epoch': 1.26}


Model weights saved in ./results/checkpoint-6500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-6500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-6500/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-7000
Configuration saved in ./results/checkpoint-7000/config.json


{'loss': 0.0036, 'learning_rate': 1.456943366951125e-05, 'epoch': 1.36}


Model weights saved in ./results/checkpoint-7000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-7000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-7000/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-7500
Configuration saved in ./results/checkpoint-7500/config.json


{'loss': 0.0056, 'learning_rate': 1.4181536074476339e-05, 'epoch': 1.45}


Model weights saved in ./results/checkpoint-7500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-7500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-7500/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-8000
Configuration saved in ./results/checkpoint-8000/config.json


{'loss': 0.0038, 'learning_rate': 1.379363847944143e-05, 'epoch': 1.55}


Model weights saved in ./results/checkpoint-8000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-8000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-8000/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-8500
Configuration saved in ./results/checkpoint-8500/config.json


{'loss': 0.004, 'learning_rate': 1.3405740884406518e-05, 'epoch': 1.65}


Model weights saved in ./results/checkpoint-8500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-8500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-8500/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-9000
Configuration saved in ./results/checkpoint-9000/config.json


{'loss': 0.0037, 'learning_rate': 1.3017843289371609e-05, 'epoch': 1.75}


Model weights saved in ./results/checkpoint-9000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-9000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-9000/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-9500
Configuration saved in ./results/checkpoint-9500/config.json


{'loss': 0.0032, 'learning_rate': 1.2629945694336696e-05, 'epoch': 1.84}


Model weights saved in ./results/checkpoint-9500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-9500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-9500/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-10000
Configuration saved in ./results/checkpoint-10000/config.json


{'loss': 0.0039, 'learning_rate': 1.2242048099301784e-05, 'epoch': 1.94}


Model weights saved in ./results/checkpoint-10000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-10000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-10000/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 20621
  Batch size = 16


  0%|          | 0/1289 [00:00<?, ?it/s]

{'eval_loss': 0.002052752999588847, 'eval_runtime': 66.979, 'eval_samples_per_second': 307.873, 'eval_steps_per_second': 19.245, 'epoch': 2.0}


Saving model checkpoint to ./results/checkpoint-10500
Configuration saved in ./results/checkpoint-10500/config.json


{'loss': 0.0029, 'learning_rate': 1.1854150504266875e-05, 'epoch': 2.04}


Model weights saved in ./results/checkpoint-10500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-10500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-10500/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-11000
Configuration saved in ./results/checkpoint-11000/config.json


{'loss': 0.0024, 'learning_rate': 1.1466252909231963e-05, 'epoch': 2.13}


Model weights saved in ./results/checkpoint-11000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-11000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-11000/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-11500
Configuration saved in ./results/checkpoint-11500/config.json


{'loss': 0.002, 'learning_rate': 1.1078355314197054e-05, 'epoch': 2.23}


Model weights saved in ./results/checkpoint-11500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-11500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-11500/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-12000
Configuration saved in ./results/checkpoint-12000/config.json


{'loss': 0.0021, 'learning_rate': 1.0690457719162142e-05, 'epoch': 2.33}


Model weights saved in ./results/checkpoint-12000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-12000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-12000/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-12500
Configuration saved in ./results/checkpoint-12500/config.json


{'loss': 0.002, 'learning_rate': 1.0302560124127233e-05, 'epoch': 2.42}


Model weights saved in ./results/checkpoint-12500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-12500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-12500/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-13000
Configuration saved in ./results/checkpoint-13000/config.json


{'loss': 0.0019, 'learning_rate': 9.91466252909232e-06, 'epoch': 2.52}


Model weights saved in ./results/checkpoint-13000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-13000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-13000/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-13500
Configuration saved in ./results/checkpoint-13500/config.json


{'loss': 0.0014, 'learning_rate': 9.52676493405741e-06, 'epoch': 2.62}


Model weights saved in ./results/checkpoint-13500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-13500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-13500/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-14000
Configuration saved in ./results/checkpoint-14000/config.json


{'loss': 0.0018, 'learning_rate': 9.138867339022499e-06, 'epoch': 2.72}


Model weights saved in ./results/checkpoint-14000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-14000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-14000/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-14500
Configuration saved in ./results/checkpoint-14500/config.json


{'loss': 0.0017, 'learning_rate': 8.750969743987588e-06, 'epoch': 2.81}


Model weights saved in ./results/checkpoint-14500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-14500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-14500/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-15000
Configuration saved in ./results/checkpoint-15000/config.json


{'loss': 0.0013, 'learning_rate': 8.363072148952676e-06, 'epoch': 2.91}


Model weights saved in ./results/checkpoint-15000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-15000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-15000/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 20621
  Batch size = 16


  0%|          | 0/1289 [00:00<?, ?it/s]

{'eval_loss': 0.0012002384755760431, 'eval_runtime': 66.9878, 'eval_samples_per_second': 307.832, 'eval_steps_per_second': 19.242, 'epoch': 3.0}


Saving model checkpoint to ./results/checkpoint-15500
Configuration saved in ./results/checkpoint-15500/config.json


{'loss': 0.0029, 'learning_rate': 7.975174553917767e-06, 'epoch': 3.01}


Model weights saved in ./results/checkpoint-15500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-15500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-15500/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-16000
Configuration saved in ./results/checkpoint-16000/config.json


{'loss': 0.0018, 'learning_rate': 7.5872769588828555e-06, 'epoch': 3.1}


Model weights saved in ./results/checkpoint-16000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-16000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-16000/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-16500
Configuration saved in ./results/checkpoint-16500/config.json


{'loss': 0.0011, 'learning_rate': 7.199379363847945e-06, 'epoch': 3.2}


Model weights saved in ./results/checkpoint-16500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-16500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-16500/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-17000
Configuration saved in ./results/checkpoint-17000/config.json


{'loss': 0.001, 'learning_rate': 6.811481768813034e-06, 'epoch': 3.3}


Model weights saved in ./results/checkpoint-17000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-17000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-17000/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-17500
Configuration saved in ./results/checkpoint-17500/config.json


{'loss': 0.0009, 'learning_rate': 6.423584173778123e-06, 'epoch': 3.39}


Model weights saved in ./results/checkpoint-17500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-17500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-17500/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-18000
Configuration saved in ./results/checkpoint-18000/config.json


{'loss': 0.0003, 'learning_rate': 6.035686578743212e-06, 'epoch': 3.49}


Model weights saved in ./results/checkpoint-18000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-18000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-18000/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-18500
Configuration saved in ./results/checkpoint-18500/config.json


{'loss': 0.0008, 'learning_rate': 5.647788983708301e-06, 'epoch': 3.59}


Model weights saved in ./results/checkpoint-18500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-18500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-18500/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-19000
Configuration saved in ./results/checkpoint-19000/config.json


{'loss': 0.0006, 'learning_rate': 5.25989138867339e-06, 'epoch': 3.69}


Model weights saved in ./results/checkpoint-19000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-19000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-19000/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-19500
Configuration saved in ./results/checkpoint-19500/config.json


{'loss': 0.0015, 'learning_rate': 4.87199379363848e-06, 'epoch': 3.78}


Model weights saved in ./results/checkpoint-19500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-19500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-19500/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-20000
Configuration saved in ./results/checkpoint-20000/config.json


{'loss': 0.0007, 'learning_rate': 4.484096198603569e-06, 'epoch': 3.88}


Model weights saved in ./results/checkpoint-20000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-20000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-20000/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-20500
Configuration saved in ./results/checkpoint-20500/config.json


{'loss': 0.0008, 'learning_rate': 4.096198603568658e-06, 'epoch': 3.98}


Model weights saved in ./results/checkpoint-20500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-20500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-20500/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 20621
  Batch size = 16


  0%|          | 0/1289 [00:00<?, ?it/s]

{'eval_loss': 0.0007637494127266109, 'eval_runtime': 67.1112, 'eval_samples_per_second': 307.266, 'eval_steps_per_second': 19.207, 'epoch': 4.0}


Saving model checkpoint to ./results/checkpoint-21000
Configuration saved in ./results/checkpoint-21000/config.json


{'loss': 0.0006, 'learning_rate': 3.708301008533747e-06, 'epoch': 4.07}


Model weights saved in ./results/checkpoint-21000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-21000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-21000/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-21500
Configuration saved in ./results/checkpoint-21500/config.json


{'loss': 0.0004, 'learning_rate': 3.3204034134988368e-06, 'epoch': 4.17}


Model weights saved in ./results/checkpoint-21500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-21500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-21500/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-22000
Configuration saved in ./results/checkpoint-22000/config.json


{'loss': 0.0004, 'learning_rate': 2.932505818463926e-06, 'epoch': 4.27}


Model weights saved in ./results/checkpoint-22000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-22000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-22000/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-22500
Configuration saved in ./results/checkpoint-22500/config.json


{'loss': 0.0005, 'learning_rate': 2.544608223429015e-06, 'epoch': 4.36}


Model weights saved in ./results/checkpoint-22500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-22500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-22500/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-23000
Configuration saved in ./results/checkpoint-23000/config.json


{'loss': 0.0005, 'learning_rate': 2.156710628394104e-06, 'epoch': 4.46}


Model weights saved in ./results/checkpoint-23000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-23000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-23000/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-23500
Configuration saved in ./results/checkpoint-23500/config.json


{'loss': 0.0002, 'learning_rate': 1.7688130333591933e-06, 'epoch': 4.56}


Model weights saved in ./results/checkpoint-23500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-23500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-23500/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-24000
Configuration saved in ./results/checkpoint-24000/config.json


{'loss': 0.0004, 'learning_rate': 1.3809154383242826e-06, 'epoch': 4.65}


Model weights saved in ./results/checkpoint-24000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-24000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-24000/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-24500
Configuration saved in ./results/checkpoint-24500/config.json


{'loss': 0.0007, 'learning_rate': 9.930178432893718e-07, 'epoch': 4.75}


Model weights saved in ./results/checkpoint-24500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-24500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-24500/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-25000
Configuration saved in ./results/checkpoint-25000/config.json


{'loss': 0.0004, 'learning_rate': 6.051202482544609e-07, 'epoch': 4.85}


Model weights saved in ./results/checkpoint-25000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-25000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-25000/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-25500
Configuration saved in ./results/checkpoint-25500/config.json


{'loss': 0.0004, 'learning_rate': 2.1722265321955006e-07, 'epoch': 4.95}


Model weights saved in ./results/checkpoint-25500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-25500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-25500/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 20621
  Batch size = 16


  0%|          | 0/1289 [00:00<?, ?it/s]



Training completed. Do not forget to share your model on huggingface.co/models =)




{'eval_loss': 0.0007221808773465455, 'eval_runtime': 67.0398, 'eval_samples_per_second': 307.594, 'eval_steps_per_second': 19.227, 'epoch': 5.0}
{'train_runtime': 5315.1258, 'train_samples_per_second': 77.591, 'train_steps_per_second': 4.85, 'train_loss': 0.011468237978344837, 'epoch': 5.0}


TrainOutput(global_step=25780, training_loss=0.011468237978344837, metrics={'train_runtime': 5315.1258, 'train_samples_per_second': 77.591, 'train_steps_per_second': 4.85, 'train_loss': 0.011468237978344837, 'epoch': 5.0})

In [17]:
trainer.save_model("bert-clinical-scratch-wl-es-NER-prescription")

Saving model checkpoint to bert-clinical-scratch-wl-es-NER-prescription
Configuration saved in bert-clinical-scratch-wl-es-NER-prescription/config.json
Model weights saved in bert-clinical-scratch-wl-es-NER-prescription/pytorch_model.bin
tokenizer config file saved in bert-clinical-scratch-wl-es-NER-prescription/tokenizer_config.json
Special tokens file saved in bert-clinical-scratch-wl-es-NER-prescription/special_tokens_map.json


## Evaluación de resultados

In [18]:
from transformers import AutoTokenizer
from auxfunctions import tokenize_and_align_labels

MODEL = "bert-clinical-scratch-wl-es-NER-prescription"

tokenizer = AutoTokenizer.from_pretrained(MODEL)

loading file vocab.txt
loading file tokenizer.json
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json


In [19]:
from transformers import AutoModelForTokenClassification

model = AutoModelForTokenClassification.from_pretrained(MODEL)

loading configuration file bert-clinical-scratch-wl-es-NER-prescription/config.json
Model config BertConfig {
  "_name_or_path": "bert-clinical-scratch-wl-es-NER-prescription",
  "architectures": [
    "BertForTokenClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_10": 10,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5,
    "LABEL_6": 6,
    "LABEL_7": 7,
    "LABEL_8": 8,
    "LABEL_9": 9
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "

In [20]:
from auxfunctions import eval_text, map_entities, calculate_metrics

In [21]:
text = "PARACETAMOL 500 MG COMPRIMIDO 1 COMPRIMIDO ORAL cada 6 horas durante 3 dias"

eval_text(text,tokenizer,model)

array([ 1,  2,  2,  3,  0,  0,  5,  7,  8,  8,  9, 10, 10])

In [22]:
ner_dict = {'O': 0,
            'B-ACTIVE_PRINCIPLE': 1,
            'I-ACTIVE_PRINCIPLE': 2,
            'B-FORMA_FARMA':3,
            'I-FORMA_FARMA':4,
            'B-ADMIN': 5,
            'I-ADMIN': 6,
            'B-PERIODICITY': 7,
            'I-PERIODICITY': 8,
            'B-DURATION': 9,
            'I-DURATION': 10
            }

In [23]:
print(text)
map_entities(eval_text(text,tokenizer,model),ner_dict)

PARACETAMOL 500 MG COMPRIMIDO 1 COMPRIMIDO ORAL cada 6 horas durante 3 dias


['B-ACTIVE_PRINCIPLE',
 'I-ACTIVE_PRINCIPLE',
 'I-ACTIVE_PRINCIPLE',
 'B-FORMA_FARMA',
 'O',
 'O',
 'B-ADMIN',
 'B-PERIODICITY',
 'I-PERIODICITY',
 'I-PERIODICITY',
 'B-DURATION',
 'I-DURATION',
 'I-DURATION']

Evaluación con otras métricas

In [24]:
y_test = [row['ner_tags'] for row in HF_dataset['test']]

In [28]:
y_preds = []
length = len(HF_dataset['test'])
for i, row in enumerate(HF_dataset['test']):
    if 100*(i+1)/length % 10 == 0:
        print("progreso = {}%".format(100*(i+1)/length))
    y_preds.append(list(eval_text(row['tokens'],tokenizer,model)))

progreso = 100.0%


In [29]:
print("Métricas en test-etiquetado con ER")
calculate_metrics(y_preds,y_test,ner_dict=ner_dict)

Métricas en test-etiquetado con ER
Resultados de evaluación
	 f1: 1.00 | precision: 1.00 | recall: 1.00


(0.9995104807330237, 0.9997614563716258, 0.9996359527993973)

In [26]:
datos_conll = Corpus()
datos_conll.entidades = ner_dict

for i in range(4):
    datos_conll.load_conll('../datos/Etiquetado/corpus_s{}_etiquetados.conll'.format(i+1))

HF_data_mini = datos_conll.to_HF_dataset()
HF_dataset_mini = HF_data_mini.train_test_split(test_size=0.2,seed=0)

Agregadas 250 secuencias de token-entidad al corpus
Agregadas 251 secuencias de token-entidad al corpus
Agregadas 251 secuencias de token-entidad al corpus
Agregadas 251 secuencias de token-entidad al corpus


In [27]:
from auxfunctions import eval_text, map_entities, calculate_metrics

y_test_em = [row['ner_tags'] for row in HF_dataset_mini['test']]
y_preds_em = [list(eval_text(row['tokens'],tokenizer,model)) for row in HF_dataset_mini['test']]

print("Métricas en test etiquetado manual")
calculate_metrics(y_preds_em,y_test_em,ner_dict=ner_dict)

Métricas en test etiquetado manual
Resultados de evaluación
	 f1: 0.30 | precision: 0.32 | recall: 0.29


(0.32114882506527415, 0.28771929824561404, 0.3035163479333745)

## Versión mini

A continuación definimos un modelo como el anterior, con la diferencia de que este será entrenado (fine-tunning) en 80% de los datos etiquetados a mano y testeado con 20 % de estos.

In [31]:
import sys
sys.path.append('../datos/procesamiento')
from corpus import Corpus

datos_conll = Corpus()

In [32]:
ner_dict = {'O': 0,
            'B-ACTIVE_PRINCIPLE': 1,
            'I-ACTIVE_PRINCIPLE': 2,
            'B-FORMA_FARMA':3,
            'I-FORMA_FARMA':4,
            'B-ADMIN': 5,
            'I-ADMIN': 6,
            'B-PERIODICITY': 7,
            'I-PERIODICITY': 8,
            'B-DURATION': 9,
            'I-DURATION': 10
            }

datos_conll.entidades = ner_dict

In [33]:
for i in range(4):
    datos_conll.load_conll('../datos/Etiquetado/corpus_s{}_etiquetados.conll'.format(i+1))

Agregadas 250 secuencias de token-entidad al corpus
Agregadas 251 secuencias de token-entidad al corpus
Agregadas 251 secuencias de token-entidad al corpus
Agregadas 251 secuencias de token-entidad al corpus


In [34]:
HF_data_mini = datos_conll.to_HF_dataset()

HF_dataset = HF_data_mini.train_test_split(test_size=0.2,seed=0)
HF_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 802
    })
    test: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 201
    })
})

In [44]:
from transformers import AutoTokenizer
from auxfunctions import tokenize_and_align_labels

MODEL = "plncmm/bert-clinical-scratch-wl-es"

tokenizer = AutoTokenizer.from_pretrained(MODEL)

process = lambda examples: tokenize_and_align_labels(examples,tokenizer)

tokenized_data_mini = HF_dataset.map(process, batched=True)
tokenized_data_mini = tokenized_data_mini.remove_columns(['id','tokens','ner_tags'])

loading file vocab.txt from cache at /home/camilo/.cache/huggingface/hub/models--plncmm--bert-clinical-scratch-wl-es/snapshots/e9314ad921431b0c0b69c84149ce3dfe5810b324/vocab.txt
loading file tokenizer.json from cache at /home/camilo/.cache/huggingface/hub/models--plncmm--bert-clinical-scratch-wl-es/snapshots/e9314ad921431b0c0b69c84149ce3dfe5810b324/tokenizer.json
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at /home/camilo/.cache/huggingface/hub/models--plncmm--bert-clinical-scratch-wl-es/snapshots/e9314ad921431b0c0b69c84149ce3dfe5810b324/special_tokens_map.json
loading file tokenizer_config.json from cache at /home/camilo/.cache/huggingface/hub/models--plncmm--bert-clinical-scratch-wl-es/snapshots/e9314ad921431b0c0b69c84149ce3dfe5810b324/tokenizer_config.json


  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [45]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

model_mini = AutoModelForTokenClassification.from_pretrained(MODEL, num_labels=len(ner_dict))

loading configuration file config.json from cache at /home/camilo/.cache/huggingface/hub/models--plncmm--bert-clinical-scratch-wl-es/snapshots/e9314ad921431b0c0b69c84149ce3dfe5810b324/config.json
Model config BertConfig {
  "_name_or_path": "plncmm/bert-clinical-scratch-wl-es",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_10": 10,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5,
    "LABEL_6": 6,
    "LABEL_7": 7,
    "LABEL_8"

In [46]:
training_args = TrainingArguments(
    output_dir = "./results",
    evaluation_strategy = "epoch",
    learning_rate = 2e-5,
    per_device_train_batch_size = 16,
    per_device_eval_batch_size = 16,
    num_train_epochs = 20,
    weight_decay = 0.01,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [47]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

trainer = Trainer(
    model=model_mini,
    args=training_args,
    train_dataset = tokenized_data_mini["train"],
    eval_dataset = tokenized_data_mini["test"],
    tokenizer=tokenizer,
    data_collator = data_collator,
)

In [48]:
trainer.train()

***** Running training *****
  Num examples = 802
  Num Epochs = 20
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 1020


  0%|          | 0/1020 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
***** Running Evaluation *****
  Num examples = 201
  Batch size = 16


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.5831220746040344, 'eval_runtime': 0.5841, 'eval_samples_per_second': 344.145, 'eval_steps_per_second': 22.258, 'epoch': 1.0}


***** Running Evaluation *****
  Num examples = 201
  Batch size = 16


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.42039012908935547, 'eval_runtime': 0.5968, 'eval_samples_per_second': 336.817, 'eval_steps_per_second': 21.784, 'epoch': 2.0}


***** Running Evaluation *****
  Num examples = 201
  Batch size = 16


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.384698748588562, 'eval_runtime': 0.596, 'eval_samples_per_second': 337.243, 'eval_steps_per_second': 21.812, 'epoch': 3.0}


***** Running Evaluation *****
  Num examples = 201
  Batch size = 16


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.3523677885532379, 'eval_runtime': 0.5796, 'eval_samples_per_second': 346.783, 'eval_steps_per_second': 22.429, 'epoch': 4.0}


***** Running Evaluation *****
  Num examples = 201
  Batch size = 16


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.32841044664382935, 'eval_runtime': 0.5911, 'eval_samples_per_second': 340.042, 'eval_steps_per_second': 21.993, 'epoch': 5.0}


***** Running Evaluation *****
  Num examples = 201
  Batch size = 16


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.3156881630420685, 'eval_runtime': 0.6252, 'eval_samples_per_second': 321.495, 'eval_steps_per_second': 20.793, 'epoch': 6.0}


***** Running Evaluation *****
  Num examples = 201
  Batch size = 16


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.32430407404899597, 'eval_runtime': 0.6249, 'eval_samples_per_second': 321.671, 'eval_steps_per_second': 20.805, 'epoch': 7.0}


***** Running Evaluation *****
  Num examples = 201
  Batch size = 16


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.3156029284000397, 'eval_runtime': 0.6059, 'eval_samples_per_second': 331.724, 'eval_steps_per_second': 21.455, 'epoch': 8.0}


***** Running Evaluation *****
  Num examples = 201
  Batch size = 16


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.3170432150363922, 'eval_runtime': 0.5995, 'eval_samples_per_second': 335.294, 'eval_steps_per_second': 21.686, 'epoch': 9.0}


Saving model checkpoint to ./results/checkpoint-500
Configuration saved in ./results/checkpoint-500/config.json


{'loss': 0.3258, 'learning_rate': 1.0196078431372549e-05, 'epoch': 9.8}


Model weights saved in ./results/checkpoint-500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-500/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 201
  Batch size = 16


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.3161526918411255, 'eval_runtime': 0.6586, 'eval_samples_per_second': 305.212, 'eval_steps_per_second': 19.74, 'epoch': 10.0}


***** Running Evaluation *****
  Num examples = 201
  Batch size = 16


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.33779987692832947, 'eval_runtime': 0.6436, 'eval_samples_per_second': 312.303, 'eval_steps_per_second': 20.199, 'epoch': 11.0}


***** Running Evaluation *****
  Num examples = 201
  Batch size = 16


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.34254539012908936, 'eval_runtime': 0.645, 'eval_samples_per_second': 311.611, 'eval_steps_per_second': 20.154, 'epoch': 12.0}


***** Running Evaluation *****
  Num examples = 201
  Batch size = 16


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.3367880880832672, 'eval_runtime': 0.6175, 'eval_samples_per_second': 325.488, 'eval_steps_per_second': 21.051, 'epoch': 13.0}


***** Running Evaluation *****
  Num examples = 201
  Batch size = 16


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.34026771783828735, 'eval_runtime': 0.6365, 'eval_samples_per_second': 315.813, 'eval_steps_per_second': 20.426, 'epoch': 14.0}


***** Running Evaluation *****
  Num examples = 201
  Batch size = 16


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.3516131341457367, 'eval_runtime': 0.6438, 'eval_samples_per_second': 312.217, 'eval_steps_per_second': 20.193, 'epoch': 15.0}


***** Running Evaluation *****
  Num examples = 201
  Batch size = 16


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.3546003997325897, 'eval_runtime': 0.6061, 'eval_samples_per_second': 331.639, 'eval_steps_per_second': 21.449, 'epoch': 16.0}


***** Running Evaluation *****
  Num examples = 201
  Batch size = 16


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.34490272402763367, 'eval_runtime': 0.6138, 'eval_samples_per_second': 327.448, 'eval_steps_per_second': 21.178, 'epoch': 17.0}


***** Running Evaluation *****
  Num examples = 201
  Batch size = 16


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.3546936511993408, 'eval_runtime': 0.6192, 'eval_samples_per_second': 324.602, 'eval_steps_per_second': 20.994, 'epoch': 18.0}


***** Running Evaluation *****
  Num examples = 201
  Batch size = 16


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.3440197706222534, 'eval_runtime': 0.6194, 'eval_samples_per_second': 324.505, 'eval_steps_per_second': 20.988, 'epoch': 19.0}


Saving model checkpoint to ./results/checkpoint-1000
Configuration saved in ./results/checkpoint-1000/config.json


{'loss': 0.0749, 'learning_rate': 3.921568627450981e-07, 'epoch': 19.61}


Model weights saved in ./results/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-1000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-1000/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 201
  Batch size = 16


  0%|          | 0/13 [00:00<?, ?it/s]



Training completed. Do not forget to share your model on huggingface.co/models =)




{'eval_loss': 0.34232205152511597, 'eval_runtime': 0.6261, 'eval_samples_per_second': 321.058, 'eval_steps_per_second': 20.765, 'epoch': 20.0}
{'train_runtime': 199.0388, 'train_samples_per_second': 80.587, 'train_steps_per_second': 5.125, 'train_loss': 0.1973839650551478, 'epoch': 20.0}


TrainOutput(global_step=1020, training_loss=0.1973839650551478, metrics={'train_runtime': 199.0388, 'train_samples_per_second': 80.587, 'train_steps_per_second': 5.125, 'train_loss': 0.1973839650551478, 'epoch': 20.0})

In [49]:
trainer.save_model("bert-clinical-scratch-wl-es-NER-prescription-mini")

Saving model checkpoint to bert-clinical-scratch-wl-es-NER-prescription-mini
Configuration saved in bert-clinical-scratch-wl-es-NER-prescription-mini/config.json
Model weights saved in bert-clinical-scratch-wl-es-NER-prescription-mini/pytorch_model.bin
tokenizer config file saved in bert-clinical-scratch-wl-es-NER-prescription-mini/tokenizer_config.json
Special tokens file saved in bert-clinical-scratch-wl-es-NER-prescription-mini/special_tokens_map.json


In [50]:
model_mini = AutoModelForTokenClassification.from_pretrained("bert-clinical-scratch-wl-es-NER-prescription-mini")

loading configuration file bert-clinical-scratch-wl-es-NER-prescription-mini/config.json
Model config BertConfig {
  "_name_or_path": "bert-clinical-scratch-wl-es-NER-prescription-mini",
  "architectures": [
    "BertForTokenClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_10": 10,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5,
    "LABEL_6": 6,
    "LABEL_7": 7,
    "LABEL_8": 8,
    "LABEL_9": 9
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings"

In [51]:
from auxfunctions import eval_text, map_entities, calculate_metrics

y_test = [row['ner_tags'] for row in HF_dataset['test']]
y_preds = [list(eval_text(row['tokens'],tokenizer,model_mini)) for row in HF_dataset['test']]

calculate_metrics(y_preds,y_test,ner_dict=ner_dict)

Resultados de evaluación
	 f1: 0.86 | precision: 0.83 | recall: 0.90


(0.8270676691729323, 0.9005847953216374, 0.8622620380739081)

## Versión fine-tunning

In [52]:
MODEL = "bert-clinical-scratch-wl-es-NER-prescription"

tokenizer = AutoTokenizer.from_pretrained(MODEL)

process = lambda examples: tokenize_and_align_labels(examples,tokenizer)

tokenized_data_mini = HF_dataset.map(process, batched=True)
tokenized_data_mini = tokenized_data_mini.remove_columns(['id','tokens','ner_tags'])

model_mini = AutoModelForTokenClassification.from_pretrained(MODEL, num_labels=len(ner_dict))

loading file vocab.txt
loading file tokenizer.json
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json


  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

loading configuration file bert-clinical-scratch-wl-es-NER-prescription/config.json
Model config BertConfig {
  "_name_or_path": "bert-clinical-scratch-wl-es-NER-prescription",
  "architectures": [
    "BertForTokenClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_10": 10,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5,
    "LABEL_6": 6,
    "LABEL_7": 7,
    "LABEL_8": 8,
    "LABEL_9": 9
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "

In [53]:
training_args = TrainingArguments(
    output_dir = "./results",
    evaluation_strategy = "epoch",
    learning_rate = 2e-5,
    per_device_train_batch_size = 16,
    per_device_eval_batch_size = 16,
    num_train_epochs = 20,
    weight_decay = 0.01,
)

from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

trainer = Trainer(
    model=model_mini,
    args=training_args,
    train_dataset = tokenized_data_mini["train"],
    eval_dataset = tokenized_data_mini["test"],
    tokenizer=tokenizer,
    data_collator = data_collator,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [54]:
trainer.train()

***** Running training *****
  Num examples = 802
  Num Epochs = 20
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 1020


  0%|          | 0/1020 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
***** Running Evaluation *****
  Num examples = 201
  Batch size = 16


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.3356422781944275, 'eval_runtime': 0.5573, 'eval_samples_per_second': 360.646, 'eval_steps_per_second': 23.325, 'epoch': 1.0}


***** Running Evaluation *****
  Num examples = 201
  Batch size = 16


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.28154703974723816, 'eval_runtime': 0.5674, 'eval_samples_per_second': 354.252, 'eval_steps_per_second': 22.912, 'epoch': 2.0}


***** Running Evaluation *****
  Num examples = 201
  Batch size = 16


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.28420379757881165, 'eval_runtime': 0.5723, 'eval_samples_per_second': 351.208, 'eval_steps_per_second': 22.715, 'epoch': 3.0}


***** Running Evaluation *****
  Num examples = 201
  Batch size = 16


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.26701831817626953, 'eval_runtime': 0.579, 'eval_samples_per_second': 347.155, 'eval_steps_per_second': 22.453, 'epoch': 4.0}


***** Running Evaluation *****
  Num examples = 201
  Batch size = 16


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.2742534577846527, 'eval_runtime': 0.5846, 'eval_samples_per_second': 343.846, 'eval_steps_per_second': 22.239, 'epoch': 5.0}


***** Running Evaluation *****
  Num examples = 201
  Batch size = 16


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.28834789991378784, 'eval_runtime': 0.5871, 'eval_samples_per_second': 342.349, 'eval_steps_per_second': 22.142, 'epoch': 6.0}


***** Running Evaluation *****
  Num examples = 201
  Batch size = 16


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.30232951045036316, 'eval_runtime': 0.5916, 'eval_samples_per_second': 339.731, 'eval_steps_per_second': 21.973, 'epoch': 7.0}


***** Running Evaluation *****
  Num examples = 201
  Batch size = 16


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.2897992730140686, 'eval_runtime': 0.5946, 'eval_samples_per_second': 338.023, 'eval_steps_per_second': 21.862, 'epoch': 8.0}


***** Running Evaluation *****
  Num examples = 201
  Batch size = 16


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.3276868164539337, 'eval_runtime': 0.5951, 'eval_samples_per_second': 337.761, 'eval_steps_per_second': 21.845, 'epoch': 9.0}


Saving model checkpoint to ./results/checkpoint-500
Configuration saved in ./results/checkpoint-500/config.json


{'loss': 0.2139, 'learning_rate': 1.0196078431372549e-05, 'epoch': 9.8}


Model weights saved in ./results/checkpoint-500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-500/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 201
  Batch size = 16


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.34667789936065674, 'eval_runtime': 0.5975, 'eval_samples_per_second': 336.397, 'eval_steps_per_second': 21.757, 'epoch': 10.0}


***** Running Evaluation *****
  Num examples = 201
  Batch size = 16


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.3635108172893524, 'eval_runtime': 0.5981, 'eval_samples_per_second': 336.077, 'eval_steps_per_second': 21.736, 'epoch': 11.0}


***** Running Evaluation *****
  Num examples = 201
  Batch size = 16


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.3986039161682129, 'eval_runtime': 0.598, 'eval_samples_per_second': 336.14, 'eval_steps_per_second': 21.74, 'epoch': 12.0}


***** Running Evaluation *****
  Num examples = 201
  Batch size = 16


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.398256778717041, 'eval_runtime': 0.6004, 'eval_samples_per_second': 334.789, 'eval_steps_per_second': 21.653, 'epoch': 13.0}


***** Running Evaluation *****
  Num examples = 201
  Batch size = 16


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.3979812264442444, 'eval_runtime': 0.6061, 'eval_samples_per_second': 331.654, 'eval_steps_per_second': 21.45, 'epoch': 14.0}


***** Running Evaluation *****
  Num examples = 201
  Batch size = 16


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.41253241896629333, 'eval_runtime': 0.6064, 'eval_samples_per_second': 331.44, 'eval_steps_per_second': 21.436, 'epoch': 15.0}


***** Running Evaluation *****
  Num examples = 201
  Batch size = 16


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.41532662510871887, 'eval_runtime': 0.6085, 'eval_samples_per_second': 330.303, 'eval_steps_per_second': 21.363, 'epoch': 16.0}


***** Running Evaluation *****
  Num examples = 201
  Batch size = 16


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.4188380539417267, 'eval_runtime': 0.6114, 'eval_samples_per_second': 328.734, 'eval_steps_per_second': 21.261, 'epoch': 17.0}


***** Running Evaluation *****
  Num examples = 201
  Batch size = 16


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.4327171742916107, 'eval_runtime': 0.6103, 'eval_samples_per_second': 329.339, 'eval_steps_per_second': 21.301, 'epoch': 18.0}


***** Running Evaluation *****
  Num examples = 201
  Batch size = 16


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.4252334535121918, 'eval_runtime': 0.6135, 'eval_samples_per_second': 327.603, 'eval_steps_per_second': 21.188, 'epoch': 19.0}


Saving model checkpoint to ./results/checkpoint-1000
Configuration saved in ./results/checkpoint-1000/config.json


{'loss': 0.0189, 'learning_rate': 3.921568627450981e-07, 'epoch': 19.61}


Model weights saved in ./results/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-1000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-1000/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 201
  Batch size = 16


  0%|          | 0/13 [00:00<?, ?it/s]



Training completed. Do not forget to share your model on huggingface.co/models =)




{'eval_loss': 0.42849618196487427, 'eval_runtime': 0.6127, 'eval_samples_per_second': 328.075, 'eval_steps_per_second': 21.219, 'epoch': 20.0}
{'train_runtime': 193.1858, 'train_samples_per_second': 83.029, 'train_steps_per_second': 5.28, 'train_loss': 0.11428771498156529, 'epoch': 20.0}


TrainOutput(global_step=1020, training_loss=0.11428771498156529, metrics={'train_runtime': 193.1858, 'train_samples_per_second': 83.029, 'train_steps_per_second': 5.28, 'train_loss': 0.11428771498156529, 'epoch': 20.0})

In [55]:
trainer.save_model("bert-clinical-scratch-wl-es-NER-prescription-mini")

model_mini = AutoModelForTokenClassification.from_pretrained("bert-clinical-scratch-wl-es-NER-prescription-mini")

from auxfunctions import eval_text, map_entities, calculate_metrics

y_test = [row['ner_tags'] for row in HF_dataset['test']]
y_preds = [list(eval_text(row['tokens'],tokenizer,model_mini)) for row in HF_dataset['test']]

calculate_metrics(y_preds,y_test,ner_dict=ner_dict)

Saving model checkpoint to bert-clinical-scratch-wl-es-NER-prescription-mini
Configuration saved in bert-clinical-scratch-wl-es-NER-prescription-mini/config.json
Model weights saved in bert-clinical-scratch-wl-es-NER-prescription-mini/pytorch_model.bin
tokenizer config file saved in bert-clinical-scratch-wl-es-NER-prescription-mini/tokenizer_config.json
Special tokens file saved in bert-clinical-scratch-wl-es-NER-prescription-mini/special_tokens_map.json
loading configuration file bert-clinical-scratch-wl-es-NER-prescription-mini/config.json
Model config BertConfig {
  "_name_or_path": "bert-clinical-scratch-wl-es-NER-prescription-mini",
  "architectures": [
    "BertForTokenClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",

Resultados de evaluación
	 f1: 0.93 | precision: 0.92 | recall: 0.94


(0.9220183486238532, 0.9403508771929825, 0.9310943833236828)