# Clasificación de tokens con HuggingFace

In [1]:
import pandas as pd
import numpy as np
import sys
sys.path.append('../datos/procesamiento')
from corpus import Corpus
from etiquetado_entidades import codigo_hfl, split_rule, process_column

Cargando datos

In [2]:
main_db = pd.read_csv('../datos/DATA_HLF_MDS_2.csv',sep=',')

code_db = pd.read_excel('../datos/PRINCIPIOS_ACTIVOS_MDS.xlsx')
HLF = code_db.loc[:,['PRINCIPIO_ACTIVO','CODIGO_HLF']]
HLF_df = codigo_hfl(HLF)

df = main_db.join(HLF_df.set_index('CODIGO_MEDICAMENTO'), on='CODIGO_MEDICAMENTO')

Cargar principios activos, forma farma y juntar columnas

In [3]:
#PARA DETECTAR LA POSICION DE ENTIDADES TIPO PRINCIPIO ACTIVO, FORMA FARMACO SE COMPARARAN LOS ELEMENTOS DEL CORPUS CON LAS LISTAS CORRESPONDIENTES.
PA = np.unique(split_rule(df['PRINCIPIO_ACTIVO'].dropna().unique()))
FF = np.unique(split_rule(df['FORMA_FARMA'].dropna().unique()))

rows = (df['PRES_DENOMINACION'] + ' ' + df['RESUMEN']).dropna().unique()

Ejecutar procesamiento de datos de Martin

In [4]:
corpus = Corpus()

for i in range(len(rows)):
    row = rows[i].split()
    

    # begin patch
    err_count = 0
    try:
        tagged_seq = process_column(row,PA,FF)
        corpus.append(tagged_seq)
    except ValueError:
        print("Advertencia: error en etiquetado")
        print(tagged_seq.tokens)
        print(tagged_seq.tags)
        err_count += 1

    if i > 100000 + err_count:  # end patch
        # if i > 100:
        # mismo tamaño que baseline  martin
        break

Advertencia: error en etiquetado
['PREDNISONA', '20', 'MG', 'COMPRIMIDO', '40', 'MG', 'ORAL', 'cada', '24', 'horas', 'durante', '4', 'dias']
['B-ACTVPRNCP', 'O', 'O', 'B-ADMIN', 'O', 'O', 'B-ADMIN', 'B-PERIODICITY', 'I-PERIODICITY', 'I-PERIODICITY', 'B-DURATION', 'I-DURATION', 'I-DURATION']


---

## Cargando el corpus como dataset de HuggingFace

In [5]:
ner_dict = {'O': 0,
            'B-ACTVPRNCP': 1,
            'I-ACTVPRNCP': 2,
            'B-ADMIN': 3,
            'I-ADMIN': 4,
            'B-PERIODICITY': 5,
            'I-PERIODICITY': 6,
            'B-DURATION': 7,
            'I-DURATION': 8
            }

corpus.entidades = ner_dict

In [6]:
HF_dataset = corpus.to_HF_dataset()

In [7]:
HF_dataset.data

InMemoryTable
id: int64
tokens: list<item: string>
  child 0, item: string
ner_tags: list<item: int64>
  child 0, item: int64
----
id: [[0,1,2,3,4,...,99996,99997,99998,99999,100000]]
tokens: [[["PARACETAMOL","500","MG","COMPRIMIDO","1",...,"6","horas","durante","3","dias"],["KETOROLACO","10","MG","COMPRIMIDO","1",...,"8","horas","durante","3","dias"],...,["DEXAMETASONA","4","MG/ML","SOLUCIÓN","INYECTABLE",...,"UNIDAD","PARENTERAL","cada","8","horas"],["KETOROLACO","TROMETAMOL","30","MG/ML","SOLUCIÓN",...,"MG","INTRAVENOSA","EN","BOLO","diaria"]]]
ner_tags: [[[1,0,0,3,0,...,6,6,7,8,8],[1,0,0,3,0,...,6,6,7,8,8],...,[1,0,0,1,2,...,0,0,5,6,6],[1,0,0,0,1,...,0,0,3,0,0]]]

In [8]:
HF_dataset.features[f"ner_tags"].feature.names = [key for key in ner_dict.keys()]
HF_dataset.features[f"ner_tags"].feature.names

['O',
 'B-ACTVPRNCP',
 'I-ACTVPRNCP',
 'B-ADMIN',
 'I-ADMIN',
 'B-PERIODICITY',
 'I-PERIODICITY',
 'B-DURATION',
 'I-DURATION']

In [9]:
HF_dataset = HF_dataset.train_test_split(test_size=0.2,seed=0)
HF_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 80000
    })
    test: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 20001
    })
})

## Cargamos un modelo basado en Transformers de HuggingFace

Nuestro modelo será: plncmm/bert-clinical-scratch-wl-es. Ha sido fine-tuneado con texto médico (aunque probablemente no con prescripciones)

[Fuente de esta sección](https://huggingface.co/docs/transformers/tasks/token_classification)

### Tokenizer

In [10]:
MODEL = "plncmm/bert-clinical-scratch-wl-es"

In [11]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(MODEL)

Usaremos el tokenizador para codificar nuestro input.

In [12]:
example = HF_dataset["train"][0]

tokenized_input = tokenizer(example["tokens"], is_split_into_words=True)
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
tokens

['[CLS]',
 'aci',
 '##do',
 'tran',
 '##ex',
 '##ami',
 '##co',
 '1',
 '##g',
 '/',
 '10',
 '##ml',
 'inyec',
 '##y',
 '##table',
 '2',
 'unidad',
 'intraven',
 '##osa',
 'cada',
 '2',
 '##4',
 'horas',
 '[SEP]']

Debemos corregir el _mismatch_ entre input tokenizado y la lista de tokens

In [13]:
# from auxfunctions import tokenize_and_align_labels

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples[f"ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:  # Set the special tokens to -100.
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:  # Only label the first token of a given word.
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [14]:
tokenized_data = HF_dataset.map(tokenize_and_align_labels, batched=True)
tokenized_data = tokenized_data.remove_columns(['id','tokens','ner_tags'])

  0%|          | 0/80 [00:00<?, ?ba/s]

  0%|          | 0/21 [00:00<?, ?ba/s]

In [15]:
tokenized_data.data

{'train': InMemoryTable
 input_ids: list<item: int32>
   child 0, item: int32
 token_type_ids: list<item: int8>
   child 0, item: int8
 attention_mask: list<item: int8>
   child 0, item: int8
 labels: list<item: int64>
   child 0, item: int64
 ----
 input_ids: [[[4,15721,1050,1572,2483,...,1748,1129,1003,2596,5],[4,15772,13955,3284,16056,...,1976,1748,997,2596,5],...,[4,1097,23070,14921,3284,...,2427,2242,1074,1707,5],[4,19051,25612,1492,25579,...,1411,2242,1074,1707,5]],[[4,15772,13955,3284,16056,...,15359,1672,1972,12873,5],[4,19769,2470,19624,16057,...,1748,1129,1003,2596,5],...,[4,5536,8413,1213,30962,...,1748,1129,1003,2596,5],[4,6752,5899,30957,4129,...,28982,1976,9934,2940,5]],...,[[4,5647,1699,23565,15709,...,2596,1672,999,12873,5],[4,19890,1577,1343,2633,...,12791,1748,1992,2596,5],...,[4,1067,7785,1197,3092,...,1707,1672,2286,12873,5],[4,6265,1446,1600,30962,...,1748,1129,1003,2596,5]],[[4,5536,8413,1213,30962,...,1976,1748,1413,2885,5],[4,6265,1446,1600,30962,...,1976,1748,9

### Modelo

In [16]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

model = AutoModelForTokenClassification.from_pretrained(MODEL, num_labels=len(ner_dict))

Some weights of the model checkpoint at plncmm/bert-clinical-scratch-wl-es were not used when initializing BertForTokenClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at plncmm/bert-clinical-s

## Entrenamiento

In [17]:
training_args = TrainingArguments(
    output_dir = "./results",
    evaluation_strategy = "epoch",
    learning_rate = 2e-5,
    per_device_train_batch_size = 16,
    per_device_eval_batch_size = 16,
    num_train_epochs = 5,
    weight_decay = 0.01,
)

In [18]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset = tokenized_data["train"],
    eval_dataset = tokenized_data["test"],
    tokenizer=tokenizer,
    data_collator = data_collator,
)

In [19]:
trainer.train()

***** Running training *****
  Num examples = 80000
  Num Epochs = 5
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 25000


  0%|          | 0/25000 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
Saving model checkpoint to ./results/checkpoint-500
Configuration saved in ./results/checkpoint-500/config.json


{'loss': 0.164, 'learning_rate': 1.9600000000000002e-05, 'epoch': 0.1}


Model weights saved in ./results/checkpoint-500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-500/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-1000
Configuration saved in ./results/checkpoint-1000/config.json


{'loss': 0.043, 'learning_rate': 1.9200000000000003e-05, 'epoch': 0.2}


Model weights saved in ./results/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-1000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-1000/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-1500
Configuration saved in ./results/checkpoint-1500/config.json


{'loss': 0.0262, 'learning_rate': 1.88e-05, 'epoch': 0.3}


Model weights saved in ./results/checkpoint-1500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-1500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-1500/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-2000
Configuration saved in ./results/checkpoint-2000/config.json


{'loss': 0.0196, 'learning_rate': 1.8400000000000003e-05, 'epoch': 0.4}


Model weights saved in ./results/checkpoint-2000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-2000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-2000/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-2500
Configuration saved in ./results/checkpoint-2500/config.json


{'loss': 0.0155, 'learning_rate': 1.8e-05, 'epoch': 0.5}


Model weights saved in ./results/checkpoint-2500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-2500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-2500/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-3000
Configuration saved in ./results/checkpoint-3000/config.json


{'loss': 0.0128, 'learning_rate': 1.76e-05, 'epoch': 0.6}


Model weights saved in ./results/checkpoint-3000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-3000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-3000/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-3500
Configuration saved in ./results/checkpoint-3500/config.json


{'loss': 0.0112, 'learning_rate': 1.72e-05, 'epoch': 0.7}


Model weights saved in ./results/checkpoint-3500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-3500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-3500/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-4000
Configuration saved in ./results/checkpoint-4000/config.json


{'loss': 0.0095, 'learning_rate': 1.6800000000000002e-05, 'epoch': 0.8}


Model weights saved in ./results/checkpoint-4000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-4000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-4000/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-4500
Configuration saved in ./results/checkpoint-4500/config.json


{'loss': 0.0095, 'learning_rate': 1.64e-05, 'epoch': 0.9}


Model weights saved in ./results/checkpoint-4500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-4500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-4500/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-5000
Configuration saved in ./results/checkpoint-5000/config.json


{'loss': 0.0085, 'learning_rate': 1.6000000000000003e-05, 'epoch': 1.0}


Model weights saved in ./results/checkpoint-5000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-5000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-5000/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 20001
  Batch size = 16


  0%|          | 0/1251 [00:00<?, ?it/s]

{'eval_loss': 0.006247437559068203, 'eval_runtime': 58.9982, 'eval_samples_per_second': 339.01, 'eval_steps_per_second': 21.204, 'epoch': 1.0}


Saving model checkpoint to ./results/checkpoint-5500
Configuration saved in ./results/checkpoint-5500/config.json


{'loss': 0.0072, 'learning_rate': 1.5600000000000003e-05, 'epoch': 1.1}


Model weights saved in ./results/checkpoint-5500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-5500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-5500/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-6000
Configuration saved in ./results/checkpoint-6000/config.json


{'loss': 0.0063, 'learning_rate': 1.5200000000000002e-05, 'epoch': 1.2}


Model weights saved in ./results/checkpoint-6000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-6000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-6000/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-6500
Configuration saved in ./results/checkpoint-6500/config.json


{'loss': 0.0068, 'learning_rate': 1.48e-05, 'epoch': 1.3}


Model weights saved in ./results/checkpoint-6500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-6500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-6500/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-7000
Configuration saved in ./results/checkpoint-7000/config.json


{'loss': 0.0062, 'learning_rate': 1.4400000000000001e-05, 'epoch': 1.4}


Model weights saved in ./results/checkpoint-7000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-7000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-7000/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-7500
Configuration saved in ./results/checkpoint-7500/config.json


{'loss': 0.0058, 'learning_rate': 1.4e-05, 'epoch': 1.5}


Model weights saved in ./results/checkpoint-7500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-7500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-7500/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-8000
Configuration saved in ./results/checkpoint-8000/config.json


{'loss': 0.0051, 'learning_rate': 1.3600000000000002e-05, 'epoch': 1.6}


Model weights saved in ./results/checkpoint-8000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-8000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-8000/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-8500
Configuration saved in ./results/checkpoint-8500/config.json


{'loss': 0.0053, 'learning_rate': 1.3200000000000002e-05, 'epoch': 1.7}


Model weights saved in ./results/checkpoint-8500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-8500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-8500/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-9000
Configuration saved in ./results/checkpoint-9000/config.json


{'loss': 0.0044, 'learning_rate': 1.2800000000000001e-05, 'epoch': 1.8}


Model weights saved in ./results/checkpoint-9000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-9000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-9000/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-9500
Configuration saved in ./results/checkpoint-9500/config.json


{'loss': 0.004, 'learning_rate': 1.2400000000000002e-05, 'epoch': 1.9}


Model weights saved in ./results/checkpoint-9500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-9500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-9500/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-10000
Configuration saved in ./results/checkpoint-10000/config.json


{'loss': 0.0049, 'learning_rate': 1.2e-05, 'epoch': 2.0}


Model weights saved in ./results/checkpoint-10000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-10000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-10000/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 20001
  Batch size = 16


  0%|          | 0/1251 [00:00<?, ?it/s]

{'eval_loss': 0.0041688005439937115, 'eval_runtime': 59.3497, 'eval_samples_per_second': 337.003, 'eval_steps_per_second': 21.078, 'epoch': 2.0}


Saving model checkpoint to ./results/checkpoint-10500
Configuration saved in ./results/checkpoint-10500/config.json


{'loss': 0.003, 'learning_rate': 1.16e-05, 'epoch': 2.1}


Model weights saved in ./results/checkpoint-10500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-10500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-10500/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-11000
Configuration saved in ./results/checkpoint-11000/config.json


{'loss': 0.0036, 'learning_rate': 1.1200000000000001e-05, 'epoch': 2.2}


Model weights saved in ./results/checkpoint-11000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-11000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-11000/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-11500
Configuration saved in ./results/checkpoint-11500/config.json


{'loss': 0.0038, 'learning_rate': 1.0800000000000002e-05, 'epoch': 2.3}


Model weights saved in ./results/checkpoint-11500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-11500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-11500/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-12000
Configuration saved in ./results/checkpoint-12000/config.json


{'loss': 0.0033, 'learning_rate': 1.04e-05, 'epoch': 2.4}


Model weights saved in ./results/checkpoint-12000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-12000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-12000/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-12500
Configuration saved in ./results/checkpoint-12500/config.json


{'loss': 0.0032, 'learning_rate': 1e-05, 'epoch': 2.5}


Model weights saved in ./results/checkpoint-12500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-12500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-12500/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-13000
Configuration saved in ./results/checkpoint-13000/config.json


{'loss': 0.0038, 'learning_rate': 9.600000000000001e-06, 'epoch': 2.6}


Model weights saved in ./results/checkpoint-13000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-13000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-13000/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-13500
Configuration saved in ./results/checkpoint-13500/config.json


{'loss': 0.003, 'learning_rate': 9.200000000000002e-06, 'epoch': 2.7}


Model weights saved in ./results/checkpoint-13500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-13500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-13500/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-14000
Configuration saved in ./results/checkpoint-14000/config.json


{'loss': 0.0036, 'learning_rate': 8.8e-06, 'epoch': 2.8}


Model weights saved in ./results/checkpoint-14000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-14000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-14000/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-14500
Configuration saved in ./results/checkpoint-14500/config.json


{'loss': 0.0033, 'learning_rate': 8.400000000000001e-06, 'epoch': 2.9}


Model weights saved in ./results/checkpoint-14500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-14500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-14500/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-15000
Configuration saved in ./results/checkpoint-15000/config.json


{'loss': 0.0029, 'learning_rate': 8.000000000000001e-06, 'epoch': 3.0}


Model weights saved in ./results/checkpoint-15000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-15000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-15000/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 20001
  Batch size = 16


  0%|          | 0/1251 [00:00<?, ?it/s]

{'eval_loss': 0.0035770474933087826, 'eval_runtime': 59.2839, 'eval_samples_per_second': 337.377, 'eval_steps_per_second': 21.102, 'epoch': 3.0}


Saving model checkpoint to ./results/checkpoint-15500
Configuration saved in ./results/checkpoint-15500/config.json


{'loss': 0.0023, 'learning_rate': 7.600000000000001e-06, 'epoch': 3.1}


Model weights saved in ./results/checkpoint-15500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-15500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-15500/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-16000
Configuration saved in ./results/checkpoint-16000/config.json


{'loss': 0.0018, 'learning_rate': 7.2000000000000005e-06, 'epoch': 3.2}


Model weights saved in ./results/checkpoint-16000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-16000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-16000/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-16500
Configuration saved in ./results/checkpoint-16500/config.json


{'loss': 0.0024, 'learning_rate': 6.800000000000001e-06, 'epoch': 3.3}


Model weights saved in ./results/checkpoint-16500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-16500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-16500/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-17000
Configuration saved in ./results/checkpoint-17000/config.json


{'loss': 0.0025, 'learning_rate': 6.4000000000000006e-06, 'epoch': 3.4}


Model weights saved in ./results/checkpoint-17000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-17000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-17000/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-17500
Configuration saved in ./results/checkpoint-17500/config.json


{'loss': 0.0022, 'learning_rate': 6e-06, 'epoch': 3.5}


Model weights saved in ./results/checkpoint-17500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-17500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-17500/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-18000
Configuration saved in ./results/checkpoint-18000/config.json


{'loss': 0.002, 'learning_rate': 5.600000000000001e-06, 'epoch': 3.6}


Model weights saved in ./results/checkpoint-18000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-18000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-18000/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-18500
Configuration saved in ./results/checkpoint-18500/config.json


{'loss': 0.0016, 'learning_rate': 5.2e-06, 'epoch': 3.7}


Model weights saved in ./results/checkpoint-18500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-18500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-18500/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-19000
Configuration saved in ./results/checkpoint-19000/config.json


{'loss': 0.0022, 'learning_rate': 4.800000000000001e-06, 'epoch': 3.8}


Model weights saved in ./results/checkpoint-19000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-19000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-19000/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-19500
Configuration saved in ./results/checkpoint-19500/config.json


{'loss': 0.0022, 'learning_rate': 4.4e-06, 'epoch': 3.9}


Model weights saved in ./results/checkpoint-19500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-19500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-19500/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-20000
Configuration saved in ./results/checkpoint-20000/config.json


{'loss': 0.002, 'learning_rate': 4.000000000000001e-06, 'epoch': 4.0}


Model weights saved in ./results/checkpoint-20000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-20000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-20000/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 20001
  Batch size = 16


  0%|          | 0/1251 [00:00<?, ?it/s]

{'eval_loss': 0.003310434753075242, 'eval_runtime': 59.0835, 'eval_samples_per_second': 338.521, 'eval_steps_per_second': 21.173, 'epoch': 4.0}


Saving model checkpoint to ./results/checkpoint-20500
Configuration saved in ./results/checkpoint-20500/config.json


{'loss': 0.0012, 'learning_rate': 3.6000000000000003e-06, 'epoch': 4.1}


Model weights saved in ./results/checkpoint-20500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-20500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-20500/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-21000
Configuration saved in ./results/checkpoint-21000/config.json


{'loss': 0.0018, 'learning_rate': 3.2000000000000003e-06, 'epoch': 4.2}


Model weights saved in ./results/checkpoint-21000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-21000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-21000/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-21500
Configuration saved in ./results/checkpoint-21500/config.json


{'loss': 0.0017, 'learning_rate': 2.8000000000000003e-06, 'epoch': 4.3}


Model weights saved in ./results/checkpoint-21500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-21500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-21500/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-22000
Configuration saved in ./results/checkpoint-22000/config.json


{'loss': 0.0013, 'learning_rate': 2.4000000000000003e-06, 'epoch': 4.4}


Model weights saved in ./results/checkpoint-22000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-22000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-22000/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-22500
Configuration saved in ./results/checkpoint-22500/config.json


{'loss': 0.0014, 'learning_rate': 2.0000000000000003e-06, 'epoch': 4.5}


Model weights saved in ./results/checkpoint-22500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-22500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-22500/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-23000
Configuration saved in ./results/checkpoint-23000/config.json


{'loss': 0.0015, 'learning_rate': 1.6000000000000001e-06, 'epoch': 4.6}


Model weights saved in ./results/checkpoint-23000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-23000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-23000/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-23500
Configuration saved in ./results/checkpoint-23500/config.json


{'loss': 0.0014, 'learning_rate': 1.2000000000000002e-06, 'epoch': 4.7}


Model weights saved in ./results/checkpoint-23500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-23500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-23500/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-24000
Configuration saved in ./results/checkpoint-24000/config.json


{'loss': 0.0013, 'learning_rate': 8.000000000000001e-07, 'epoch': 4.8}


Model weights saved in ./results/checkpoint-24000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-24000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-24000/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-24500
Configuration saved in ./results/checkpoint-24500/config.json


{'loss': 0.0013, 'learning_rate': 4.0000000000000003e-07, 'epoch': 4.9}


Model weights saved in ./results/checkpoint-24500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-24500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-24500/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-25000
Configuration saved in ./results/checkpoint-25000/config.json


{'loss': 0.0014, 'learning_rate': 0.0, 'epoch': 5.0}


Model weights saved in ./results/checkpoint-25000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-25000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-25000/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 20001
  Batch size = 16


  0%|          | 0/1251 [00:00<?, ?it/s]



Training completed. Do not forget to share your model on huggingface.co/models =)




{'eval_loss': 0.003244617022573948, 'eval_runtime': 58.6449, 'eval_samples_per_second': 341.053, 'eval_steps_per_second': 21.332, 'epoch': 5.0}
{'train_runtime': 4891.2761, 'train_samples_per_second': 81.778, 'train_steps_per_second': 5.111, 'train_loss': 0.008898326160907745, 'epoch': 5.0}


TrainOutput(global_step=25000, training_loss=0.008898326160907745, metrics={'train_runtime': 4891.2761, 'train_samples_per_second': 81.778, 'train_steps_per_second': 5.111, 'train_loss': 0.008898326160907745, 'epoch': 5.0})

In [20]:
trainer.save_model("bert-clinical-scratch-wl-es-NER-prescription")

Saving model checkpoint to bert-clinical-scratch-wl-es-NER-prescription
Configuration saved in bert-clinical-scratch-wl-es-NER-prescription/config.json
Model weights saved in bert-clinical-scratch-wl-es-NER-prescription/pytorch_model.bin
tokenizer config file saved in bert-clinical-scratch-wl-es-NER-prescription/tokenizer_config.json
Special tokens file saved in bert-clinical-scratch-wl-es-NER-prescription/special_tokens_map.json


## Evaluación de resultados

In [21]:
model = AutoModelForTokenClassification.from_pretrained("bert-clinical-scratch-wl-es-NER-prescription")

loading configuration file bert-clinical-scratch-wl-es-NER-prescription/config.json
Model config BertConfig {
  "_name_or_path": "bert-clinical-scratch-wl-es-NER-prescription",
  "architectures": [
    "BertForTokenClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5,
    "LABEL_6": 6,
    "LABEL_7": 7,
    "LABEL_8": 8
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "

In [22]:
from auxfunctions import eval_text, map_entities, calculate_metrics

In [23]:
text = "PARACETAMOL 500 MG COMPRIMIDO 1 COMPRIMIDO ORAL cada 6 horas durante 3 dias"

eval_text(text,tokenizer,model)

array([1, 0, 0, 3, 0, 3, 4, 5, 6, 6, 7, 8, 8])

In [24]:
print(text)
map_entities(eval_text(text,tokenizer,model),ner_dict)

PARACETAMOL 500 MG COMPRIMIDO 1 COMPRIMIDO ORAL cada 6 horas durante 3 dias


array(['B-ACTVPRNCP', 'O', 'O', 'B-ADMIN', 'O', 'B-ADMIN', 'I-ADMIN',
       'B-PERIODICITY', 'I-PERIODICITY', 'I-PERIODICITY', 'B-DURATION',
       'I-DURATION', 'I-DURATION'], dtype='<U13')

Evaluación con otras métricas

In [25]:
y_test = [row['ner_tags'] for row in HF_dataset['test']]

In [26]:
y_preds = [list(eval_text(row['tokens'],tokenizer,model)) for row in HF_dataset['test']]

In [56]:
calculate_metrics(y_preds,y_test,ner_dict=ner_dict)

Resultados de evaluación
	 f1: 1.00 | precision: 1.00 | recall: 1.00


(0.9975186409031165, 0.9989052082564519, 0.9982114430765637)

## Versión mini

A continuación definimos un modelo como el anterior, con la diferencia de que este será entrenado (fine-tunning) en 80% de los datos etiquetados a mano y testeado con 20 % de estos.

In [1]:
import sys
sys.path.append('../datos/procesamiento')
from corpus import Corpus

datos_conll = Corpus()

In [2]:
ner_dict = {'O': 0,
            'B-ACTIVE_PRINCIPLE': 1,
            'I-ACTIVE_PRINCIPLE': 2,
            'B-FORMA_FARMA':3,
            'I-FORMA_FARMA':4,
            'B-ADMIN': 5,
            'I-ADMIN': 6,
            'B-PERIODICITY': 7,
            'I-PERIODICITY': 8,
            'B-DURATION': 9,
            'I-DURATION': 10
            }

datos_conll.entidades = ner_dict

In [3]:
for i in range(4):
    datos_conll.load_conll('../datos/Etiquetado/corpus_s{}_etiquetados.conll'.format(i+1))

Agregadas 250 secuencias de token-entidad al corpus
Agregadas 251 secuencias de token-entidad al corpus
Agregadas 251 secuencias de token-entidad al corpus
Agregadas 251 secuencias de token-entidad al corpus


In [4]:
HF_data_mini = datos_conll.to_HF_dataset()

HF_dataset = HF_data_mini.train_test_split(test_size=0.5,seed=0)
HF_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 501
    })
    test: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 502
    })
})

In [5]:
from transformers import AutoTokenizer
from auxfunctions import tokenize_and_align_labels

MODEL = "plncmm/bert-clinical-scratch-wl-es"

tokenizer = AutoTokenizer.from_pretrained(MODEL)

process = lambda examples: tokenize_and_align_labels(examples,tokenizer)

tokenized_data_mini = HF_dataset.map(process, batched=True)
tokenized_data_mini = tokenized_data_mini.remove_columns(['id','tokens','ner_tags'])

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [6]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

model_mini = AutoModelForTokenClassification.from_pretrained(MODEL, num_labels=len(ner_dict))

Some weights of the model checkpoint at plncmm/bert-clinical-scratch-wl-es were not used when initializing BertForTokenClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at plncmm/bert-clinical-s

In [7]:
training_args = TrainingArguments(
    output_dir = "./results",
    evaluation_strategy = "epoch",
    learning_rate = 2e-5,
    per_device_train_batch_size = 16,
    per_device_eval_batch_size = 16,
    num_train_epochs = 5,
    weight_decay = 0.01,
)

In [8]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

trainer = Trainer(
    model=model_mini,
    args=training_args,
    train_dataset = tokenized_data_mini["train"],
    eval_dataset = tokenized_data_mini["test"],
    tokenizer=tokenizer,
    data_collator = data_collator,
)

In [9]:
trainer.train()

***** Running training *****
  Num examples = 501
  Num Epochs = 5
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 160


  0%|          | 0/160 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
***** Running Evaluation *****
  Num examples = 502
  Batch size = 16


  0%|          | 0/32 [00:00<?, ?it/s]

{'eval_loss': 0.7695652842521667, 'eval_runtime': 5.6407, 'eval_samples_per_second': 88.996, 'eval_steps_per_second': 5.673, 'epoch': 1.0}


***** Running Evaluation *****
  Num examples = 502
  Batch size = 16


  0%|          | 0/32 [00:00<?, ?it/s]

{'eval_loss': 0.5680007338523865, 'eval_runtime': 14.4643, 'eval_samples_per_second': 34.706, 'eval_steps_per_second': 2.212, 'epoch': 2.0}


***** Running Evaluation *****
  Num examples = 502
  Batch size = 16


  0%|          | 0/32 [00:00<?, ?it/s]

{'eval_loss': 0.4883681833744049, 'eval_runtime': 4.6701, 'eval_samples_per_second': 107.492, 'eval_steps_per_second': 6.852, 'epoch': 3.0}


***** Running Evaluation *****
  Num examples = 502
  Batch size = 16


  0%|          | 0/32 [00:00<?, ?it/s]

{'eval_loss': 0.4615930914878845, 'eval_runtime': 1.3913, 'eval_samples_per_second': 360.802, 'eval_steps_per_second': 22.999, 'epoch': 4.0}


***** Running Evaluation *****
  Num examples = 502
  Batch size = 16


  0%|          | 0/32 [00:00<?, ?it/s]



Training completed. Do not forget to share your model on huggingface.co/models =)




{'eval_loss': 0.44520142674446106, 'eval_runtime': 1.4058, 'eval_samples_per_second': 357.08, 'eval_steps_per_second': 22.762, 'epoch': 5.0}
{'train_runtime': 210.5847, 'train_samples_per_second': 11.895, 'train_steps_per_second': 0.76, 'train_loss': 0.6563064575195312, 'epoch': 5.0}


TrainOutput(global_step=160, training_loss=0.6563064575195312, metrics={'train_runtime': 210.5847, 'train_samples_per_second': 11.895, 'train_steps_per_second': 0.76, 'train_loss': 0.6563064575195312, 'epoch': 5.0})

In [10]:
trainer.save_model("bert-clinical-scratch-wl-es-NER-prescription-mini")

Saving model checkpoint to bert-clinical-scratch-wl-es-NER-prescription-mini
Configuration saved in bert-clinical-scratch-wl-es-NER-prescription-mini/config.json
Model weights saved in bert-clinical-scratch-wl-es-NER-prescription-mini/pytorch_model.bin
tokenizer config file saved in bert-clinical-scratch-wl-es-NER-prescription-mini/tokenizer_config.json
Special tokens file saved in bert-clinical-scratch-wl-es-NER-prescription-mini/special_tokens_map.json


In [11]:
model_mini = AutoModelForTokenClassification.from_pretrained("bert-clinical-scratch-wl-es-NER-prescription-mini")

loading configuration file bert-clinical-scratch-wl-es-NER-prescription-mini/config.json
Model config BertConfig {
  "_name_or_path": "bert-clinical-scratch-wl-es-NER-prescription-mini",
  "architectures": [
    "BertForTokenClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_10": 10,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5,
    "LABEL_6": 6,
    "LABEL_7": 7,
    "LABEL_8": 8,
    "LABEL_9": 9
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings"

In [12]:
from auxfunctions import eval_text, map_entities, calculate_metrics

y_test = [row['ner_tags'] for row in HF_dataset['test']]
y_preds = [list(eval_text(row['tokens'],tokenizer,model_mini)) for row in HF_dataset['test']]

calculate_metrics(y_preds,y_test,ner_dict=ner_dict)

Resultados de evaluación
	 f1: 0.62 | precision: 0.53 | recall: 0.74


(0.5306397306397307, 0.7367928938756428, 0.6169504795458994)