# Clasificación de tokens con HuggingFace

In [1]:
import sys
sys.path.append('../datos/procesamiento')
from corpus import Corpus

corpus_train = Corpus()
corpus_test = Corpus()

---

## Cargando el corpus como dataset de HuggingFace

In [2]:
ner_dict = {'O': 0,
            'B-ACTIVE_PRINCIPLE': 1,
            'I-ACTIVE_PRINCIPLE': 2,
            'B-FORMA_FARMA':3,
            'I-FORMA_FARMA':4,
            'B-ADMIN': 5,
            'I-ADMIN': 6,
            'B-PERIODICITY': 7,
            'I-PERIODICITY': 8,
            'B-DURATION': 9,
            'I-DURATION': 10
            }

corpus_train.entidades = ner_dict
corpus_test.entidades = ner_dict

In [3]:
corpus_train.load_conll('../datos/Etiquetado/corpus_ER_train.txt')

Agregadas 85246 secuencias de token-entidad al corpus


In [4]:
corpus_train.load_conll('../datos/Etiquetado/corpus_ER_test.txt')

Agregadas 51183 secuencias de token-entidad al corpus


In [5]:
from datasets import DatasetDict

HF_dataset = DatasetDict()

HF_dataset['train'] = corpus_train.to_HF_dataset()
HF_dataset['test'] = corpus_test.to_HF_dataset()

In [6]:
HF_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 136429
    })
    test: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 136429
    })
})

## Cargamos un modelo basado en Transformers de HuggingFace

Nuestro modelo será: plncmm/bert-clinical-scratch-wl-es. Ha sido fine-tuneado con texto médico (aunque probablemente no con prescripciones)

[Fuente de esta sección](https://huggingface.co/docs/transformers/tasks/token_classification)

### Tokenizer

In [9]:
MODEL = "plncmm/bert-clinical-scratch-wl-es"

In [10]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(MODEL)

Usaremos el tokenizador para codificar nuestro input.

In [11]:
example = HF_dataset["train"][0]

tokenized_input = tokenizer(example["tokens"], is_split_into_words=True)
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
tokens

['[CLS]',
 'flu',
 '##cona',
 '##zo',
 '##l',
 '150',
 'mg',
 'cápsula',
 '1',
 'cápsula',
 'oral',
 'cada',
 '3',
 'dias',
 '[SEP]']

Debemos corregir el _mismatch_ entre input tokenizado y la lista de tokens

In [7]:
# from auxfunctions import tokenize_and_align_labels

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples[f"ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:  # Set the special tokens to -100.
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:  # Only label the first token of a given word.
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [12]:
tokenized_data = HF_dataset.map(tokenize_and_align_labels, batched=True)
tokenized_data = tokenized_data.remove_columns(['id','tokens','ner_tags'])

  0%|          | 0/137 [00:00<?, ?ba/s]

  0%|          | 0/137 [00:00<?, ?ba/s]

In [13]:
tokenized_data.data

{'train': InMemoryTable
 input_ids: list<item: int32>
   child 0, item: int32
 token_type_ids: list<item: int8>
   child 0, item: int8
 attention_mask: list<item: int8>
   child 0, item: int8
 labels: list<item: int64>
   child 0, item: int64
 ----
 input_ids: [[[4,5121,30534,1600,30962,...,12791,1748,1306,12873,5],[4,11236,1348,29117,3284,...,1139,28376,30967,30972,5],...,[4,11247,13436,15275,30956,...,2581,29258,12791,15359,5],[4,8753,3283,2861,22396,...,1411,2242,1074,1707,5]],[[4,15023,13394,1177,1114,...,2596,1672,1129,12873,5],[4,5970,4358,1167,1600,...,1707,1672,1129,12873,5],...,[4,11236,1348,29117,3284,...,29258,12791,1074,23938,5],[4,27625,11425,30960,6821,...,1139,1992,1482,30961,5]],...,[[4,4667,27921,19257,15023,...,1411,2242,1074,1707,5],[4,15721,13067,10791,997,...,2596,1672,1413,12873,5],...,[4,1254,1092,1104,10110,...,2596,1672,1306,2817,5],[4,16768,1307,1275,1316,...,989,1129,1139,1098,5]],[[4,9969,13908,1519,30967,...,15126,2155,9934,2940,5],[4,13695,13456,16387,1527

### Modelo

In [15]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

model = AutoModelForTokenClassification.from_pretrained(MODEL, num_labels=len(ner_dict))

Some weights of the model checkpoint at plncmm/bert-clinical-scratch-wl-es were not used when initializing BertForTokenClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at plncmm/bert-clinical-s

## Entrenamiento

In [16]:
training_args = TrainingArguments(
    output_dir = "./results",
    evaluation_strategy = "epoch",
    learning_rate = 2e-5,
    per_device_train_batch_size = 16,
    per_device_eval_batch_size = 16,
    num_train_epochs = 5,
    weight_decay = 0.01,
)

In [17]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset = tokenized_data["train"],
    eval_dataset = tokenized_data["test"],
    tokenizer=tokenizer,
    data_collator = data_collator,
)

In [18]:
trainer.train()

***** Running training *****
  Num examples = 136429
  Num Epochs = 5
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 42635


  0%|          | 0/42635 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
Saving model checkpoint to ./results/checkpoint-500
Configuration saved in ./results/checkpoint-500/config.json


{'loss': 0.2023, 'learning_rate': 1.9765450920605137e-05, 'epoch': 0.06}


Model weights saved in ./results/checkpoint-500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-500/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-1000
Configuration saved in ./results/checkpoint-1000/config.json


{'loss': 0.0551, 'learning_rate': 1.9530901841210276e-05, 'epoch': 0.12}


Model weights saved in ./results/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-1000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-1000/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-1500
Configuration saved in ./results/checkpoint-1500/config.json


{'loss': 0.0347, 'learning_rate': 1.929635276181541e-05, 'epoch': 0.18}


Model weights saved in ./results/checkpoint-1500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-1500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-1500/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-2000
Configuration saved in ./results/checkpoint-2000/config.json


{'loss': 0.0257, 'learning_rate': 1.9061803682420547e-05, 'epoch': 0.23}


Model weights saved in ./results/checkpoint-2000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-2000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-2000/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-2500
Configuration saved in ./results/checkpoint-2500/config.json


{'loss': 0.0176, 'learning_rate': 1.8827254603025685e-05, 'epoch': 0.29}


Model weights saved in ./results/checkpoint-2500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-2500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-2500/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-3000
Configuration saved in ./results/checkpoint-3000/config.json


{'loss': 0.0135, 'learning_rate': 1.859270552363082e-05, 'epoch': 0.35}


Model weights saved in ./results/checkpoint-3000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-3000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-3000/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-3500
Configuration saved in ./results/checkpoint-3500/config.json


{'loss': 0.0133, 'learning_rate': 1.8358156444235956e-05, 'epoch': 0.41}


Model weights saved in ./results/checkpoint-3500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-3500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-3500/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-4000
Configuration saved in ./results/checkpoint-4000/config.json


{'loss': 0.0097, 'learning_rate': 1.8123607364841095e-05, 'epoch': 0.47}


Model weights saved in ./results/checkpoint-4000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-4000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-4000/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-4500
Configuration saved in ./results/checkpoint-4500/config.json


{'loss': 0.0099, 'learning_rate': 1.788905828544623e-05, 'epoch': 0.53}


Model weights saved in ./results/checkpoint-4500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-4500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-4500/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-5000
Configuration saved in ./results/checkpoint-5000/config.json


{'loss': 0.0082, 'learning_rate': 1.7654509206051366e-05, 'epoch': 0.59}


Model weights saved in ./results/checkpoint-5000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-5000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-5000/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-5500
Configuration saved in ./results/checkpoint-5500/config.json


{'loss': 0.0091, 'learning_rate': 1.7419960126656504e-05, 'epoch': 0.65}


Model weights saved in ./results/checkpoint-5500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-5500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-5500/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-6000
Configuration saved in ./results/checkpoint-6000/config.json


{'loss': 0.0066, 'learning_rate': 1.7185411047261643e-05, 'epoch': 0.7}


Model weights saved in ./results/checkpoint-6000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-6000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-6000/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-6500
Configuration saved in ./results/checkpoint-6500/config.json


{'loss': 0.006, 'learning_rate': 1.695086196786678e-05, 'epoch': 0.76}


Model weights saved in ./results/checkpoint-6500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-6500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-6500/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-7000
Configuration saved in ./results/checkpoint-7000/config.json


{'loss': 0.0059, 'learning_rate': 1.6716312888471914e-05, 'epoch': 0.82}


Model weights saved in ./results/checkpoint-7000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-7000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-7000/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-7500
Configuration saved in ./results/checkpoint-7500/config.json


{'loss': 0.0049, 'learning_rate': 1.6481763809077053e-05, 'epoch': 0.88}


Model weights saved in ./results/checkpoint-7500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-7500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-7500/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-8000
Configuration saved in ./results/checkpoint-8000/config.json


{'loss': 0.0048, 'learning_rate': 1.6247214729682188e-05, 'epoch': 0.94}


Model weights saved in ./results/checkpoint-8000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-8000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-8000/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-8500
Configuration saved in ./results/checkpoint-8500/config.json


{'loss': 0.0052, 'learning_rate': 1.6012665650287323e-05, 'epoch': 1.0}


Model weights saved in ./results/checkpoint-8500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-8500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-8500/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 136429
  Batch size = 16


  0%|          | 0/8527 [00:00<?, ?it/s]

{'eval_loss': 0.0027356892824172974, 'eval_runtime': 439.9435, 'eval_samples_per_second': 310.106, 'eval_steps_per_second': 19.382, 'epoch': 1.0}


Saving model checkpoint to ./results/checkpoint-9000
Configuration saved in ./results/checkpoint-9000/config.json


{'loss': 0.0037, 'learning_rate': 1.5778116570892462e-05, 'epoch': 1.06}


Model weights saved in ./results/checkpoint-9000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-9000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-9000/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-9500
Configuration saved in ./results/checkpoint-9500/config.json


{'loss': 0.0036, 'learning_rate': 1.5543567491497598e-05, 'epoch': 1.11}


Model weights saved in ./results/checkpoint-9500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-9500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-9500/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-10000
Configuration saved in ./results/checkpoint-10000/config.json


{'loss': 0.0032, 'learning_rate': 1.5309018412102733e-05, 'epoch': 1.17}


Model weights saved in ./results/checkpoint-10000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-10000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-10000/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-10500
Configuration saved in ./results/checkpoint-10500/config.json


{'loss': 0.005, 'learning_rate': 1.507446933270787e-05, 'epoch': 1.23}


Model weights saved in ./results/checkpoint-10500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-10500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-10500/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-11000
Configuration saved in ./results/checkpoint-11000/config.json


{'loss': 0.0032, 'learning_rate': 1.4839920253313007e-05, 'epoch': 1.29}


Model weights saved in ./results/checkpoint-11000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-11000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-11000/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-11500
Configuration saved in ./results/checkpoint-11500/config.json


{'loss': 0.003, 'learning_rate': 1.4605371173918144e-05, 'epoch': 1.35}


Model weights saved in ./results/checkpoint-11500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-11500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-11500/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-12000
Configuration saved in ./results/checkpoint-12000/config.json


{'loss': 0.0034, 'learning_rate': 1.437082209452328e-05, 'epoch': 1.41}


Model weights saved in ./results/checkpoint-12000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-12000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-12000/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-12500
Configuration saved in ./results/checkpoint-12500/config.json


{'loss': 0.0028, 'learning_rate': 1.4136273015128417e-05, 'epoch': 1.47}


Model weights saved in ./results/checkpoint-12500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-12500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-12500/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-13000
Configuration saved in ./results/checkpoint-13000/config.json


{'loss': 0.0026, 'learning_rate': 1.3901723935733554e-05, 'epoch': 1.52}


Model weights saved in ./results/checkpoint-13000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-13000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-13000/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-13500
Configuration saved in ./results/checkpoint-13500/config.json


{'loss': 0.0033, 'learning_rate': 1.3667174856338689e-05, 'epoch': 1.58}


Model weights saved in ./results/checkpoint-13500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-13500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-13500/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-14000
Configuration saved in ./results/checkpoint-14000/config.json


{'loss': 0.0025, 'learning_rate': 1.3432625776943826e-05, 'epoch': 1.64}


Model weights saved in ./results/checkpoint-14000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-14000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-14000/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-14500
Configuration saved in ./results/checkpoint-14500/config.json


{'loss': 0.0032, 'learning_rate': 1.3198076697548963e-05, 'epoch': 1.7}


Model weights saved in ./results/checkpoint-14500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-14500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-14500/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-15000
Configuration saved in ./results/checkpoint-15000/config.json


{'loss': 0.0025, 'learning_rate': 1.2963527618154099e-05, 'epoch': 1.76}


Model weights saved in ./results/checkpoint-15000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-15000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-15000/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-15500
Configuration saved in ./results/checkpoint-15500/config.json


{'loss': 0.0028, 'learning_rate': 1.2728978538759236e-05, 'epoch': 1.82}


Model weights saved in ./results/checkpoint-15500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-15500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-15500/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-16000
Configuration saved in ./results/checkpoint-16000/config.json


{'loss': 0.0029, 'learning_rate': 1.2494429459364373e-05, 'epoch': 1.88}


Model weights saved in ./results/checkpoint-16000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-16000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-16000/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-16500
Configuration saved in ./results/checkpoint-16500/config.json


{'loss': 0.0035, 'learning_rate': 1.2259880379969508e-05, 'epoch': 1.94}


Model weights saved in ./results/checkpoint-16500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-16500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-16500/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-17000
Configuration saved in ./results/checkpoint-17000/config.json


{'loss': 0.0012, 'learning_rate': 1.2025331300574645e-05, 'epoch': 1.99}


Model weights saved in ./results/checkpoint-17000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-17000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-17000/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 136429
  Batch size = 16


  0%|          | 0/8527 [00:00<?, ?it/s]

{'eval_loss': 0.0011624422622844577, 'eval_runtime': 440.8558, 'eval_samples_per_second': 309.464, 'eval_steps_per_second': 19.342, 'epoch': 2.0}


Saving model checkpoint to ./results/checkpoint-17500
Configuration saved in ./results/checkpoint-17500/config.json


{'loss': 0.0015, 'learning_rate': 1.1790782221179782e-05, 'epoch': 2.05}


Model weights saved in ./results/checkpoint-17500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-17500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-17500/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-18000
Configuration saved in ./results/checkpoint-18000/config.json


{'loss': 0.0014, 'learning_rate': 1.1556233141784921e-05, 'epoch': 2.11}


Model weights saved in ./results/checkpoint-18000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-18000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-18000/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-18500
Configuration saved in ./results/checkpoint-18500/config.json


{'loss': 0.0011, 'learning_rate': 1.1321684062390057e-05, 'epoch': 2.17}


Model weights saved in ./results/checkpoint-18500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-18500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-18500/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-19000
Configuration saved in ./results/checkpoint-19000/config.json


{'loss': 0.0026, 'learning_rate': 1.1087134982995194e-05, 'epoch': 2.23}


Model weights saved in ./results/checkpoint-19000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-19000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-19000/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-19500
Configuration saved in ./results/checkpoint-19500/config.json


{'loss': 0.0029, 'learning_rate': 1.085258590360033e-05, 'epoch': 2.29}


Model weights saved in ./results/checkpoint-19500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-19500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-19500/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-20000
Configuration saved in ./results/checkpoint-20000/config.json


{'loss': 0.0023, 'learning_rate': 1.0618036824205466e-05, 'epoch': 2.35}


Model weights saved in ./results/checkpoint-20000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-20000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-20000/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-20500
Configuration saved in ./results/checkpoint-20500/config.json


{'loss': 0.0013, 'learning_rate': 1.0383487744810603e-05, 'epoch': 2.4}


Model weights saved in ./results/checkpoint-20500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-20500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-20500/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-21000
Configuration saved in ./results/checkpoint-21000/config.json


{'loss': 0.0018, 'learning_rate': 1.014893866541574e-05, 'epoch': 2.46}


Model weights saved in ./results/checkpoint-21000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-21000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-21000/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-21500
Configuration saved in ./results/checkpoint-21500/config.json


{'loss': 0.0018, 'learning_rate': 9.914389586020876e-06, 'epoch': 2.52}


Model weights saved in ./results/checkpoint-21500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-21500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-21500/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-22000
Configuration saved in ./results/checkpoint-22000/config.json


{'loss': 0.0011, 'learning_rate': 9.679840506626013e-06, 'epoch': 2.58}


Model weights saved in ./results/checkpoint-22000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-22000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-22000/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-22500
Configuration saved in ./results/checkpoint-22500/config.json


{'loss': 0.0026, 'learning_rate': 9.44529142723115e-06, 'epoch': 2.64}


Model weights saved in ./results/checkpoint-22500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-22500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-22500/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-23000
Configuration saved in ./results/checkpoint-23000/config.json


{'loss': 0.0009, 'learning_rate': 9.210742347836285e-06, 'epoch': 2.7}


Model weights saved in ./results/checkpoint-23000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-23000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-23000/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-23500
Configuration saved in ./results/checkpoint-23500/config.json


{'loss': 0.0015, 'learning_rate': 8.976193268441422e-06, 'epoch': 2.76}


Model weights saved in ./results/checkpoint-23500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-23500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-23500/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-24000
Configuration saved in ./results/checkpoint-24000/config.json


{'loss': 0.0013, 'learning_rate': 8.74164418904656e-06, 'epoch': 2.81}


Model weights saved in ./results/checkpoint-24000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-24000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-24000/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-24500
Configuration saved in ./results/checkpoint-24500/config.json


{'loss': 0.0011, 'learning_rate': 8.507095109651695e-06, 'epoch': 2.87}


Model weights saved in ./results/checkpoint-24500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-24500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-24500/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-25000
Configuration saved in ./results/checkpoint-25000/config.json


{'loss': 0.0011, 'learning_rate': 8.272546030256832e-06, 'epoch': 2.93}


Model weights saved in ./results/checkpoint-25000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-25000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-25000/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-25500
Configuration saved in ./results/checkpoint-25500/config.json


{'loss': 0.0008, 'learning_rate': 8.037996950861969e-06, 'epoch': 2.99}


Model weights saved in ./results/checkpoint-25500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-25500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-25500/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 136429
  Batch size = 16


  0%|          | 0/8527 [00:00<?, ?it/s]

{'eval_loss': 0.0005563591257669032, 'eval_runtime': 439.4956, 'eval_samples_per_second': 310.422, 'eval_steps_per_second': 19.402, 'epoch': 3.0}


Saving model checkpoint to ./results/checkpoint-26000
Configuration saved in ./results/checkpoint-26000/config.json


{'loss': 0.0007, 'learning_rate': 7.803447871467104e-06, 'epoch': 3.05}


Model weights saved in ./results/checkpoint-26000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-26000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-26000/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-26500
Configuration saved in ./results/checkpoint-26500/config.json


{'loss': 0.0014, 'learning_rate': 7.568898792072241e-06, 'epoch': 3.11}


Model weights saved in ./results/checkpoint-26500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-26500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-26500/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-27000
Configuration saved in ./results/checkpoint-27000/config.json


{'loss': 0.0009, 'learning_rate': 7.334349712677379e-06, 'epoch': 3.17}


Model weights saved in ./results/checkpoint-27000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-27000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-27000/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-27500
Configuration saved in ./results/checkpoint-27500/config.json


{'loss': 0.0006, 'learning_rate': 7.099800633282515e-06, 'epoch': 3.23}


Model weights saved in ./results/checkpoint-27500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-27500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-27500/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-28000
Configuration saved in ./results/checkpoint-28000/config.json


{'loss': 0.0009, 'learning_rate': 6.865251553887652e-06, 'epoch': 3.28}


Model weights saved in ./results/checkpoint-28000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-28000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-28000/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-28500
Configuration saved in ./results/checkpoint-28500/config.json


{'loss': 0.001, 'learning_rate': 6.630702474492789e-06, 'epoch': 3.34}


Model weights saved in ./results/checkpoint-28500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-28500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-28500/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-29000
Configuration saved in ./results/checkpoint-29000/config.json


{'loss': 0.0007, 'learning_rate': 6.396153395097925e-06, 'epoch': 3.4}


Model weights saved in ./results/checkpoint-29000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-29000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-29000/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-29500
Configuration saved in ./results/checkpoint-29500/config.json


{'loss': 0.0007, 'learning_rate': 6.161604315703061e-06, 'epoch': 3.46}


Model weights saved in ./results/checkpoint-29500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-29500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-29500/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-30000
Configuration saved in ./results/checkpoint-30000/config.json


{'loss': 0.0011, 'learning_rate': 5.927055236308198e-06, 'epoch': 3.52}


Model weights saved in ./results/checkpoint-30000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-30000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-30000/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-30500
Configuration saved in ./results/checkpoint-30500/config.json


{'loss': 0.0008, 'learning_rate': 5.6925061569133345e-06, 'epoch': 3.58}


Model weights saved in ./results/checkpoint-30500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-30500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-30500/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-31000
Configuration saved in ./results/checkpoint-31000/config.json


{'loss': 0.0005, 'learning_rate': 5.457957077518471e-06, 'epoch': 3.64}


Model weights saved in ./results/checkpoint-31000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-31000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-31000/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-31500
Configuration saved in ./results/checkpoint-31500/config.json


{'loss': 0.0011, 'learning_rate': 5.223407998123608e-06, 'epoch': 3.69}


Model weights saved in ./results/checkpoint-31500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-31500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-31500/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-32000
Configuration saved in ./results/checkpoint-32000/config.json


{'loss': 0.0005, 'learning_rate': 4.988858918728745e-06, 'epoch': 3.75}


Model weights saved in ./results/checkpoint-32000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-32000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-32000/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-32500
Configuration saved in ./results/checkpoint-32500/config.json


{'loss': 0.0002, 'learning_rate': 4.754309839333881e-06, 'epoch': 3.81}


Model weights saved in ./results/checkpoint-32500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-32500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-32500/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-33000
Configuration saved in ./results/checkpoint-33000/config.json


{'loss': 0.0013, 'learning_rate': 4.519760759939017e-06, 'epoch': 3.87}


Model weights saved in ./results/checkpoint-33000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-33000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-33000/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-33500
Configuration saved in ./results/checkpoint-33500/config.json


{'loss': 0.0007, 'learning_rate': 4.285211680544154e-06, 'epoch': 3.93}


Model weights saved in ./results/checkpoint-33500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-33500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-33500/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-34000
Configuration saved in ./results/checkpoint-34000/config.json


{'loss': 0.0007, 'learning_rate': 4.050662601149291e-06, 'epoch': 3.99}


Model weights saved in ./results/checkpoint-34000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-34000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-34000/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 136429
  Batch size = 16


  0%|          | 0/8527 [00:00<?, ?it/s]

{'eval_loss': 0.00036962440935894847, 'eval_runtime': 438.1945, 'eval_samples_per_second': 311.343, 'eval_steps_per_second': 19.459, 'epoch': 4.0}


Saving model checkpoint to ./results/checkpoint-34500
Configuration saved in ./results/checkpoint-34500/config.json


{'loss': 0.0002, 'learning_rate': 3.816113521754428e-06, 'epoch': 4.05}


Model weights saved in ./results/checkpoint-34500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-34500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-34500/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-35000
Configuration saved in ./results/checkpoint-35000/config.json


{'loss': 0.0007, 'learning_rate': 3.5815644423595643e-06, 'epoch': 4.1}


Model weights saved in ./results/checkpoint-35000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-35000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-35000/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-35500
Configuration saved in ./results/checkpoint-35500/config.json


{'loss': 0.0001, 'learning_rate': 3.3470153629647006e-06, 'epoch': 4.16}


Model weights saved in ./results/checkpoint-35500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-35500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-35500/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-36000
Configuration saved in ./results/checkpoint-36000/config.json


{'loss': 0.0004, 'learning_rate': 3.1124662835698372e-06, 'epoch': 4.22}


Model weights saved in ./results/checkpoint-36000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-36000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-36000/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-36500
Configuration saved in ./results/checkpoint-36500/config.json


{'loss': 0.0003, 'learning_rate': 2.877917204174974e-06, 'epoch': 4.28}


Model weights saved in ./results/checkpoint-36500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-36500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-36500/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-37000
Configuration saved in ./results/checkpoint-37000/config.json


{'loss': 0.0002, 'learning_rate': 2.64336812478011e-06, 'epoch': 4.34}


Model weights saved in ./results/checkpoint-37000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-37000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-37000/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-37500
Configuration saved in ./results/checkpoint-37500/config.json


{'loss': 0.0004, 'learning_rate': 2.408819045385247e-06, 'epoch': 4.4}


Model weights saved in ./results/checkpoint-37500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-37500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-37500/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-38000
Configuration saved in ./results/checkpoint-38000/config.json


{'loss': 0.0002, 'learning_rate': 2.174269965990384e-06, 'epoch': 4.46}


Model weights saved in ./results/checkpoint-38000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-38000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-38000/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-38500
Configuration saved in ./results/checkpoint-38500/config.json


{'loss': 0.0002, 'learning_rate': 1.93972088659552e-06, 'epoch': 4.52}


Model weights saved in ./results/checkpoint-38500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-38500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-38500/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-39000
Configuration saved in ./results/checkpoint-39000/config.json


{'loss': 0.0004, 'learning_rate': 1.7051718072006567e-06, 'epoch': 4.57}


Model weights saved in ./results/checkpoint-39000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-39000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-39000/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-39500
Configuration saved in ./results/checkpoint-39500/config.json


{'loss': 0.0004, 'learning_rate': 1.4706227278057936e-06, 'epoch': 4.63}


Model weights saved in ./results/checkpoint-39500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-39500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-39500/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-40000
Configuration saved in ./results/checkpoint-40000/config.json


{'loss': 0.0003, 'learning_rate': 1.23607364841093e-06, 'epoch': 4.69}


Model weights saved in ./results/checkpoint-40000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-40000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-40000/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-40500
Configuration saved in ./results/checkpoint-40500/config.json


{'loss': 0.0014, 'learning_rate': 1.0015245690160667e-06, 'epoch': 4.75}


Model weights saved in ./results/checkpoint-40500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-40500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-40500/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-41000
Configuration saved in ./results/checkpoint-41000/config.json


{'loss': 0.0002, 'learning_rate': 7.669754896212032e-07, 'epoch': 4.81}


Model weights saved in ./results/checkpoint-41000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-41000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-41000/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-41500
Configuration saved in ./results/checkpoint-41500/config.json


{'loss': 0.0004, 'learning_rate': 5.324264102263399e-07, 'epoch': 4.87}


Model weights saved in ./results/checkpoint-41500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-41500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-41500/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-42000
Configuration saved in ./results/checkpoint-42000/config.json


{'loss': 0.0004, 'learning_rate': 2.978773308314765e-07, 'epoch': 4.93}


Model weights saved in ./results/checkpoint-42000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-42000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-42000/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-42500
Configuration saved in ./results/checkpoint-42500/config.json


{'loss': 0.0002, 'learning_rate': 6.332825143661312e-08, 'epoch': 4.98}


Model weights saved in ./results/checkpoint-42500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-42500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-42500/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 136429
  Batch size = 16


  0%|          | 0/8527 [00:00<?, ?it/s]



Training completed. Do not forget to share your model on huggingface.co/models =)




{'eval_loss': 0.00019751596846617758, 'eval_runtime': 437.8101, 'eval_samples_per_second': 311.617, 'eval_steps_per_second': 19.476, 'epoch': 5.0}
{'train_runtime': 10561.2495, 'train_samples_per_second': 64.589, 'train_steps_per_second': 4.037, 'train_loss': 0.006245762737205974, 'epoch': 5.0}


TrainOutput(global_step=42635, training_loss=0.006245762737205974, metrics={'train_runtime': 10561.2495, 'train_samples_per_second': 64.589, 'train_steps_per_second': 4.037, 'train_loss': 0.006245762737205974, 'epoch': 5.0})

In [19]:
trainer.save_model("bert-clinical-scratch-wl-es-NER-prescription")

Saving model checkpoint to bert-clinical-scratch-wl-es-NER-prescription
Configuration saved in bert-clinical-scratch-wl-es-NER-prescription/config.json
Model weights saved in bert-clinical-scratch-wl-es-NER-prescription/pytorch_model.bin
tokenizer config file saved in bert-clinical-scratch-wl-es-NER-prescription/tokenizer_config.json
Special tokens file saved in bert-clinical-scratch-wl-es-NER-prescription/special_tokens_map.json


## Evaluación de resultados

In [14]:
from transformers import AutoTokenizer
from auxfunctions import tokenize_and_align_labels

MODEL = "plncmm/bert-clinical-scratch-wl-es"

tokenizer = AutoTokenizer.from_pretrained(MODEL)

In [15]:
from transformers import AutoModelForTokenClassification

model = AutoModelForTokenClassification.from_pretrained("bert-clinical-scratch-wl-es-NER-prescription")

In [16]:
from auxfunctions import eval_text, map_entities, calculate_metrics

In [17]:
text = "PARACETAMOL 500 MG COMPRIMIDO 1 COMPRIMIDO ORAL cada 6 horas durante 3 dias"

eval_text(text,tokenizer,model)

array([ 1,  0,  0,  3,  0,  0,  5,  7,  8,  8,  9, 10, 10])

In [18]:
ner_dict = {'O': 0,
            'B-ACTIVE_PRINCIPLE': 1,
            'I-ACTIVE_PRINCIPLE': 2,
            'B-FORMA_FARMA':3,
            'I-FORMA_FARMA':4,
            'B-ADMIN': 5,
            'I-ADMIN': 6,
            'B-PERIODICITY': 7,
            'I-PERIODICITY': 8,
            'B-DURATION': 9,
            'I-DURATION': 10
            }

In [19]:
print(text)
map_entities(eval_text(text,tokenizer,model),ner_dict)

PARACETAMOL 500 MG COMPRIMIDO 1 COMPRIMIDO ORAL cada 6 horas durante 3 dias


['B-ACTIVE_PRINCIPLE',
 'O',
 'O',
 'B-FORMA_FARMA',
 'O',
 'O',
 'B-ADMIN',
 'B-PERIODICITY',
 'I-PERIODICITY',
 'I-PERIODICITY',
 'B-DURATION',
 'I-DURATION',
 'I-DURATION']

Evaluación con otras métricas

In [20]:
y_test = [row['ner_tags'] for row in HF_dataset['test']]

In [24]:
y_preds = []
length = len(HF_dataset['test'])
for i, row in enumerate(HF_dataset['test']):
    if 100*(i+1)/length % 10 == 0:
        print("progreso = {}%".format(100*(i+1)/length))
    y_preds.append(list(eval_text(row['tokens'],tokenizer,model)))

KeyboardInterrupt: 

In [None]:
print("Métricas en test-etiquetado con ER")
calculate_metrics(y_preds,y_test,ner_dict=ner_dict)

In [26]:
datos_conll = Corpus()
datos_conll.entidades = ner_dict

for i in range(4):
    datos_conll.load_conll('../datos/Etiquetado/corpus_s{}_etiquetados.conll'.format(i+1))

HF_data_mini = datos_conll.to_HF_dataset()
HF_dataset_mini = HF_data_mini.train_test_split(test_size=0.5,seed=0)

Agregadas 250 secuencias de token-entidad al corpus
Agregadas 251 secuencias de token-entidad al corpus
Agregadas 251 secuencias de token-entidad al corpus
Agregadas 251 secuencias de token-entidad al corpus


In [27]:
from auxfunctions import eval_text, map_entities, calculate_metrics

y_test = [row['ner_tags'] for row in HF_dataset_mini['test']]
y_preds = [list(eval_text(row['tokens'],tokenizer,model)) for row in HF_dataset_mini['test']]

print("Métricas en test etiquetado manual")
calculate_metrics(y_preds,y_test,ner_dict=ner_dict)

## Versión mini

A continuación definimos un modelo como el anterior, con la diferencia de que este será entrenado (fine-tunning) en 80% de los datos etiquetados a mano y testeado con 20 % de estos.

In [1]:
import sys
sys.path.append('../datos/procesamiento')
from corpus import Corpus

datos_conll = Corpus()

In [2]:
ner_dict = {'O': 0,
            'B-ACTIVE_PRINCIPLE': 1,
            'I-ACTIVE_PRINCIPLE': 2,
            'B-FORMA_FARMA':3,
            'I-FORMA_FARMA':4,
            'B-ADMIN': 5,
            'I-ADMIN': 6,
            'B-PERIODICITY': 7,
            'I-PERIODICITY': 8,
            'B-DURATION': 9,
            'I-DURATION': 10
            }

datos_conll.entidades = ner_dict

In [3]:
for i in range(4):
    datos_conll.load_conll('../datos/Etiquetado/corpus_s{}_etiquetados.conll'.format(i+1))

Agregadas 250 secuencias de token-entidad al corpus
Agregadas 251 secuencias de token-entidad al corpus
Agregadas 251 secuencias de token-entidad al corpus
Agregadas 251 secuencias de token-entidad al corpus


In [4]:
HF_data_mini = datos_conll.to_HF_dataset()

HF_dataset = HF_data_mini.train_test_split(test_size=0.5,seed=0)
HF_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 501
    })
    test: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 502
    })
})

In [5]:
from transformers import AutoTokenizer
from auxfunctions import tokenize_and_align_labels

MODEL = "plncmm/bert-clinical-scratch-wl-es"

tokenizer = AutoTokenizer.from_pretrained(MODEL)

process = lambda examples: tokenize_and_align_labels(examples,tokenizer)

tokenized_data_mini = HF_dataset.map(process, batched=True)
tokenized_data_mini = tokenized_data_mini.remove_columns(['id','tokens','ner_tags'])

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [6]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

model_mini = AutoModelForTokenClassification.from_pretrained(MODEL, num_labels=len(ner_dict))

Some weights of the model checkpoint at plncmm/bert-clinical-scratch-wl-es were not used when initializing BertForTokenClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at plncmm/bert-clinical-s

In [7]:
training_args = TrainingArguments(
    output_dir = "./results",
    evaluation_strategy = "epoch",
    learning_rate = 2e-5,
    per_device_train_batch_size = 16,
    per_device_eval_batch_size = 16,
    num_train_epochs = 5,
    weight_decay = 0.01,
)

In [8]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

trainer = Trainer(
    model=model_mini,
    args=training_args,
    train_dataset = tokenized_data_mini["train"],
    eval_dataset = tokenized_data_mini["test"],
    tokenizer=tokenizer,
    data_collator = data_collator,
)

In [9]:
trainer.train()

***** Running training *****
  Num examples = 501
  Num Epochs = 5
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 160


  0%|          | 0/160 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
***** Running Evaluation *****
  Num examples = 502
  Batch size = 16


  0%|          | 0/32 [00:00<?, ?it/s]

{'eval_loss': 0.7695652842521667, 'eval_runtime': 5.6407, 'eval_samples_per_second': 88.996, 'eval_steps_per_second': 5.673, 'epoch': 1.0}


***** Running Evaluation *****
  Num examples = 502
  Batch size = 16


  0%|          | 0/32 [00:00<?, ?it/s]

{'eval_loss': 0.5680007338523865, 'eval_runtime': 14.4643, 'eval_samples_per_second': 34.706, 'eval_steps_per_second': 2.212, 'epoch': 2.0}


***** Running Evaluation *****
  Num examples = 502
  Batch size = 16


  0%|          | 0/32 [00:00<?, ?it/s]

{'eval_loss': 0.4883681833744049, 'eval_runtime': 4.6701, 'eval_samples_per_second': 107.492, 'eval_steps_per_second': 6.852, 'epoch': 3.0}


***** Running Evaluation *****
  Num examples = 502
  Batch size = 16


  0%|          | 0/32 [00:00<?, ?it/s]

{'eval_loss': 0.4615930914878845, 'eval_runtime': 1.3913, 'eval_samples_per_second': 360.802, 'eval_steps_per_second': 22.999, 'epoch': 4.0}


***** Running Evaluation *****
  Num examples = 502
  Batch size = 16


  0%|          | 0/32 [00:00<?, ?it/s]



Training completed. Do not forget to share your model on huggingface.co/models =)




{'eval_loss': 0.44520142674446106, 'eval_runtime': 1.4058, 'eval_samples_per_second': 357.08, 'eval_steps_per_second': 22.762, 'epoch': 5.0}
{'train_runtime': 210.5847, 'train_samples_per_second': 11.895, 'train_steps_per_second': 0.76, 'train_loss': 0.6563064575195312, 'epoch': 5.0}


TrainOutput(global_step=160, training_loss=0.6563064575195312, metrics={'train_runtime': 210.5847, 'train_samples_per_second': 11.895, 'train_steps_per_second': 0.76, 'train_loss': 0.6563064575195312, 'epoch': 5.0})

In [10]:
trainer.save_model("bert-clinical-scratch-wl-es-NER-prescription-mini")

Saving model checkpoint to bert-clinical-scratch-wl-es-NER-prescription-mini
Configuration saved in bert-clinical-scratch-wl-es-NER-prescription-mini/config.json
Model weights saved in bert-clinical-scratch-wl-es-NER-prescription-mini/pytorch_model.bin
tokenizer config file saved in bert-clinical-scratch-wl-es-NER-prescription-mini/tokenizer_config.json
Special tokens file saved in bert-clinical-scratch-wl-es-NER-prescription-mini/special_tokens_map.json


In [11]:
model_mini = AutoModelForTokenClassification.from_pretrained("bert-clinical-scratch-wl-es-NER-prescription-mini")

loading configuration file bert-clinical-scratch-wl-es-NER-prescription-mini/config.json
Model config BertConfig {
  "_name_or_path": "bert-clinical-scratch-wl-es-NER-prescription-mini",
  "architectures": [
    "BertForTokenClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_10": 10,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5,
    "LABEL_6": 6,
    "LABEL_7": 7,
    "LABEL_8": 8,
    "LABEL_9": 9
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings"

In [12]:
from auxfunctions import eval_text, map_entities, calculate_metrics

y_test = [row['ner_tags'] for row in HF_dataset['test']]
y_preds = [list(eval_text(row['tokens'],tokenizer,model_mini)) for row in HF_dataset['test']]

calculate_metrics(y_preds,y_test,ner_dict=ner_dict)

Resultados de evaluación
	 f1: 0.62 | precision: 0.53 | recall: 0.74


(0.5306397306397307, 0.7367928938756428, 0.6169504795458994)