# Clasificación de tokens con HuggingFace

In [1]:
import pandas as pd
import numpy as np
import sys
sys.path.append('../datos/procesamiento')
from corpus import Corpus
from etiquetado_entidades import codigo_hfl, split_rule, process_column

Cargando datos

In [2]:
# Main table, read from CSV.
main_db = pd.read_csv('../datos/DATA_HLF_MDS_2.csv',sep=',')

# Active-principle catalogue; only the active-principle and HLF-code columns are kept.
code_db = pd.read_excel('../datos/PRINCIPIOS_ACTIVOS_MDS.xlsx')
HLF = code_db.loc[:,['PRINCIPIO_ACTIVO','CODIGO_HLF']]
# NOTE(review): codigo_hfl is a project helper; its result must expose a
# 'CODIGO_MEDICAMENTO' column because the join below indexes on it — confirm what else it returns.
HLF_df = codigo_hfl(HLF)

# Attach the catalogue columns to every row of the main table, matching on CODIGO_MEDICAMENTO.
df = main_db.join(HLF_df.set_index('CODIGO_MEDICAMENTO'), on='CODIGO_MEDICAMENTO')

Cargar principios activos y formas farmacéuticas, y juntar columnas

In [3]:
# To locate entities of type "active principle" and "pharmaceutical form", the
# corpus elements are compared against the corresponding reference lists built here.
PA, FF = (
    np.unique(split_rule(df[column].dropna().unique()))
    for column in ('PRINCIPIO_ACTIVO', 'FORMA_FARMA')
)

# One text per row: denomination and summary joined by a space; rows where either
# part is missing are dropped and duplicates are removed.
rows = (
    (df['PRES_DENOMINACION'] + ' ' + df['RESUMEN'])
    .dropna()
    .unique()
)

Ejecutar procesamiento de datos de Martin

In [4]:
corpus = Corpus()

# Just testing for now: only the first 102 rows (indices 0-101) are tagged and
# added to the corpus. Each row is split on whitespace before tagging.
for row_text in rows[:102]:
    corpus.append(process_column(row_text.split(), PA, FF))

---

## Cargando el corpus como dataset de HuggingFace

In [5]:
# Tag vocabulary (B-/I- prefixed entity tags plus 'O'); a tag's position in the
# tuple below is its integer id.
ner_dict = {
    tag: tag_id
    for tag_id, tag in enumerate((
        'O',
        'B-ACTVPRNCP',
        'I-ACTVPRNCP',
        'B-ADMIN',
        'I-ADMIN',
        'B-PERIODICITY',
        'I-PERIODICITY',
        'B-DURATION',
        'I-DURATION',
    ))
}

# Hand the tag -> id mapping to the corpus object.
# NOTE(review): presumably read by Corpus.to_HF_dataset() to encode tags as integers — confirm in corpus.py.
corpus.entidades = ner_dict

In [6]:
# Export the tagged corpus as a HuggingFace dataset (project helper on Corpus).
HF_dataset = corpus.to_HF_dataset()

In [7]:
# Inspect the underlying table: schema (id, tokens, ner_tags) and a truncated preview.
HF_dataset.data

InMemoryTable
id: int64
tokens: list<item: string>
  child 0, item: string
ner_tags: list<item: int64>
  child 0, item: int64
----
id: [[0,1,2,3,4,...,97,98,99,100,101]]
tokens: [[["PARACETAMOL","500","MG","COMPRIMIDO","1",...,"6","horas","durante","3","dias"],["KETOROLACO","10","MG","COMPRIMIDO","1",...,"8","horas","durante","3","dias"],...,["KETOPROFENO","50","MG","CÁPSULA","1",...,"ORAL","tres","veces","al","día"],["DICLOFENACO","75","MG/3","ML","SOLUCIÓN",...,"1","UNIDAD","INTRAMUSCULAR","dosis","simple"]]]
ner_tags: [[[1,0,0,3,0,...,6,6,7,8,8],[1,0,0,3,0,...,6,6,7,8,8],...,[1,0,0,3,0,...,3,0,0,0,0],[1,0,0,0,1,...,0,0,0,0,0]]]

In [8]:
# Record the tag names (in id order) on the ner_tags feature, then display them.
# NOTE(review): the table schema shows ner_tags as plain int64 lists; this only sets a
# `names` attribute on the existing feature — confirm whether a ClassLabel cast is wanted.
HF_dataset.features["ner_tags"].feature.names = list(ner_dict)
HF_dataset.features["ner_tags"].feature.names

['O',
 'B-ACTVPRNCP',
 'I-ACTVPRNCP',
 'B-ADMIN',
 'I-ADMIN',
 'B-PERIODICITY',
 'I-PERIODICITY',
 'B-DURATION',
 'I-DURATION']

In [9]:
# Hold out 20% of the examples for evaluation; the fixed seed makes the split repeatable.
# The name HF_dataset is rebound from the single dataset to the resulting
# train/test DatasetDict, so this cell is not idempotent.
# NOTE(review): re-running it without re-creating HF_dataset would call
# train_test_split on the DatasetDict — confirm and consider a separate name.
HF_dataset = HF_dataset.train_test_split(test_size=0.2,seed=0)
HF_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 81
    })
    test: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 21
    })
})

## Cargamos un modelo basado en Transformers de HuggingFace

Nuestro modelo será: plncmm/bert-clinical-scratch-wl-es. Ha sido fine-tuneado con texto médico (aunque probablemente no con prescripciones)

[Fuente de esta sección](https://huggingface.co/docs/transformers/tasks/token_classification)

### Tokenizer

In [10]:
# Hub identifier of the pretrained checkpoint; used for both the tokenizer and the model.
MODEL = "plncmm/bert-clinical-scratch-wl-es"

In [11]:
from transformers import AutoTokenizer

# Load the tokenizer that matches the checkpoint named in MODEL.
tokenizer = AutoTokenizer.from_pretrained(MODEL)

Usaremos el tokenizador para codificar nuestro input.

In [12]:
# Take the first training example to see what the tokenizer does to it.
example = HF_dataset["train"][0]

# The example is already a list of words, hence is_split_into_words=True.
tokenized_input = tokenizer(example["tokens"], is_split_into_words=True)
# Map the ids back to token strings: words get broken into '##' pieces and
# [CLS]/[SEP] are added, so there are more tokens than word-level tags.
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
tokens

['[CLS]',
 'meta',
 '##mi',
 '##zo',
 '##l',
 'só',
 '##dico',
 '1',
 'g',
 '/',
 '2',
 'ml',
 'solución',
 'inyecta',
 '##ble',
 'amp',
 '##olla',
 '2',
 'ml',
 '4',
 'unidad',
 'paren',
 '##tera',
 '##l',
 'diaria',
 '[SEP]']

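Debemos corregir el _mismatch_ entre el input tokenizado (sub-palabras y tokens especiales) y la lista de etiquetas, que está definida por palabra: solo el primer sub-token de cada palabra conserva su etiqueta y el resto se marca con -100.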

In [14]:
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and align word-level tags to sub-tokens.

    Args:
        examples: batch mapping with a "tokens" entry (one list of words per
            example) and a "ner_tags" entry (one list of integer tags per
            example, one tag per word).

    Returns:
        The tokenizer output for the batch with an added "labels" entry holding,
        per example, one label per produced token: the word's tag on the first
        token of each word, and -100 on every other position (tokens with no
        word index, i.e. special tokens, and the continuation pieces of a word).
        -100 is the index PyTorch's cross-entropy loss ignores by default.

    Relies on the module-level ``tokenizer``.
    """
    encoded = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    aligned = []
    for batch_pos, word_labels in enumerate(examples["ner_tags"]):
        # Word index of every produced token (None where there is no source word).
        word_ids = list(encoded.word_ids(batch_index=batch_pos))
        # Same sequence shifted right by one, so each token can be compared with
        # its predecessor; the first token has no predecessor (None).
        preceding = [None] + word_ids[:-1]
        aligned.append([
            -100 if current is None or current == previous else word_labels[current]
            for previous, current in zip(preceding, word_ids)
        ])

    encoded["labels"] = aligned
    return encoded

In [15]:
# Apply tokenization + label alignment to both splits; batched=True passes
# groups of examples to the function at once rather than one example per call.
tokenized_data = HF_dataset.map(tokenize_and_align_labels, batched=True)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [23]:
# Inspect the tables after mapping: input_ids, token_type_ids, attention_mask and labels were added.
tokenized_data.data

{'train': InMemoryTable
 id: int64
 tokens: list<item: string>
   child 0, item: string
 ner_tags: list<item: int64>
   child 0, item: int64
 input_ids: list<item: int32>
   child 0, item: int32
 token_type_ids: list<item: int8>
   child 0, item: int8
 attention_mask: list<item: int8>
   child 0, item: int8
 labels: list<item: int64>
   child 0, item: int64
 ----
 id: [[23,22,19,101,92,...,41,56,33,79,95]]
 tokens: [[["METAMIZOL","SÓDICO","1","G/2","ML",...,"ML","4","UNIDAD","PARENTERAL","diaria"],["CEFTRIAXONA","1","G","POLVO","LIOFILIZADO",...,"24","horas","durante","21","dias"],...,["ISOSORBIDE","DINITRATO","10","MG","COMPRIMIDO",...,"COMPRIMIDO","ORAL","cada","8","horas"],["SULPIRIDA","50","MG","ORAL","1",...,"DURA","ORAL","cada","8","horas"]]]
 ner_tags: [[[1,2,0,0,0,...,0,0,0,0,0],[1,0,0,3,4,...,6,6,7,8,8],...,[1,2,0,0,3,...,3,4,5,6,6],[1,0,0,3,0,...,4,3,5,6,6]]]
 input_ids: [[[4,6265,1446,1600,30962,...,15843,2432,30962,15359,5],[4,1651,4714,3886,14008,...,2596,1672,2744,12873,5

### Modelo

In [16]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

# Load the pretrained encoder with a fresh token-classification head sized to
# the tag vocabulary. The id <-> tag mappings are stored in the model config so
# predictions and saved checkpoints carry the real tag names instead of the
# generic LABEL_0, LABEL_1, ... placeholders.
model = AutoModelForTokenClassification.from_pretrained(
    MODEL,
    num_labels=len(ner_dict),
    id2label={tag_id: tag for tag, tag_id in ner_dict.items()},
    label2id=dict(ner_dict),
)

Some weights of the model checkpoint at plncmm/bert-clinical-scratch-wl-es were not used when initializing BertForTokenClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at plncmm/bert-clinical-s

## Entrenamiento

In [20]:
# Hyper-parameters for the fine-tuning run.
training_args = TrainingArguments(
    output_dir = "./results",  # directory the Trainer writes its outputs to
    # Evaluate at the end of every epoch (see the per-epoch eval logs below).
    # NOTE(review): newer transformers releases rename this argument to
    # `eval_strategy` — confirm against the installed version.
    evaluation_strategy = "epoch",
    learning_rate = 2e-5,
    per_device_train_batch_size = 16,
    per_device_eval_batch_size = 16,
    num_train_epochs = 3,
    weight_decay = 0.01,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [21]:
from transformers import DataCollatorForTokenClassification

# Collator that assembles examples into batches for token classification
# (pads inputs and labels of differing lengths to a common length per batch).
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# No compute_metrics is passed, so evaluation reports only the loss
# (eval_loss in the logs), not per-entity precision/recall.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset = tokenized_data["train"],
    eval_dataset = tokenized_data["test"],
    tokenizer=tokenizer,
    data_collator = data_collator,
)

In [22]:
# Run fine-tuning; with the arguments above this is 3 epochs with an evaluation pass after each.
trainer.train()

The following columns in the training set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: ner_tags, tokens, id. If ner_tags, tokens, id are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 81
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 18


  0%|          | 0/18 [00:00<?, ?it/s]

The following columns in the evaluation set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: ner_tags, tokens, id. If ner_tags, tokens, id are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 21
  Batch size = 16


  0%|          | 0/2 [00:00<?, ?it/s]

{'eval_loss': 0.7249085307121277, 'eval_runtime': 0.1981, 'eval_samples_per_second': 106.016, 'eval_steps_per_second': 10.097, 'epoch': 1.0}


The following columns in the evaluation set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: ner_tags, tokens, id. If ner_tags, tokens, id are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 21
  Batch size = 16


  0%|          | 0/2 [00:00<?, ?it/s]

{'eval_loss': 0.562258780002594, 'eval_runtime': 0.1992, 'eval_samples_per_second': 105.432, 'eval_steps_per_second': 10.041, 'epoch': 2.0}


The following columns in the evaluation set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: ner_tags, tokens, id. If ner_tags, tokens, id are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 21
  Batch size = 16


  0%|          | 0/2 [00:00<?, ?it/s]



Training completed. Do not forget to share your model on huggingface.co/models =)




{'eval_loss': 0.48975545167922974, 'eval_runtime': 0.207, 'eval_samples_per_second': 101.441, 'eval_steps_per_second': 9.661, 'epoch': 3.0}
{'train_runtime': 13.4633, 'train_samples_per_second': 18.049, 'train_steps_per_second': 1.337, 'train_loss': 0.7343078189425998, 'epoch': 3.0}


TrainOutput(global_step=18, training_loss=0.7343078189425998, metrics={'train_runtime': 13.4633, 'train_samples_per_second': 18.049, 'train_steps_per_second': 1.337, 'train_loss': 0.7343078189425998, 'epoch': 3.0})