In [1]:
import os
from datetime import datetime
from pprint import PrettyPrinter

# Project settings (kept ahead of the third-party imports, as in the original order)
from globals import TRAINING_DIR,MODELS_DIR, DATA_DIR, id2label,label2id

import mlflow
from datasets import load_dataset
import requests
from transformers import (
    AutoModelForTokenClassification,
    AutoTokenizer,
    DataCollatorForTokenClassification,
    TrainingArguments,
    Trainer)
import numpy as np
import evaluate

Definición de variables globales, parámetros de entrenamiento y MLflow

In [2]:
# GLOBAL SETTINGS
# Maximum number of examples used per split (for quick smoke tests); None runs the full training.
train_max = None
training_output_dir = os.path.join(TRAINING_DIR,"NER")
# Hyper-parameters that are also logged to MLflow for every training run.
ml_params = {
    'num_epochs': 5,
    'lr' : 1e-5,
    'eval_steps' : 0.05,
    'eval_batch_size' : 32,
    'model_name': os.path.join(MODELS_DIR,'PlanTL-GOB-ES','roberta-base-bne-capitel-ner-plus')
}
# Expose every parameter as an explicitly assigned module-level name.
# Plain indexing replaces the previous globals() injection: a missing key now
# fails right here with a KeyError naming it, and the names are visible to
# readers and linters instead of being created dynamically.
num_epochs = ml_params['num_epochs']
lr = ml_params['lr']
eval_steps = ml_params['eval_steps']
eval_batch_size = ml_params['eval_batch_size']
model_name = ml_params['model_name']
# Not part of ml_params (so it is not logged separately): the TrainingArguments
# cell saves checkpoints at the evaluation interval, so mirror that value here
# instead of leaving the former placeholder 0.
save_steps = eval_steps

pp = PrettyPrinter(width=150)

Carga del conjunto de datos 

In [3]:
# Load the 'NER' configuration of the local 'Escrituras' dataset.
# NOTE: trust_remote_code=True executes the dataset's own loading script;
# keep it only for dataset directories you control.
main_dataset = load_dataset(os.path.join(DATA_DIR,'Escrituras'), 'NER',trust_remote_code=True)
train_dataset = main_dataset['train']
val_dataset = main_dataset['validation']
if train_max:
    # Cap each split for quick test runs. Clamp to the split size so that a
    # train_max larger than a split (typically the validation one) does not
    # make select() fail with out-of-range indices.
    train_dataset = train_dataset.select(range(min(train_max, len(train_dataset))))
    val_dataset = val_dataset.select(range(min(train_max, len(val_dataset))))
# Only the two splits are needed from here on.
del main_dataset

Comprobación de que el servidor MLflow está funcionando para las pruebas

In [4]:
SERVIDOR_MLFLOW = 'http://localhost:5000'


def mlflow_en_ejecucion(url, timeout=5):
    """Return True when an HTTP GET to ``url`` answers with status 200.

    The check runs before any MLflow call because, with the tracking server
    down, the MLflow client stalls for a long time and only then errors out.

    Args:
        url: Base URL of the MLflow tracking server.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout an unresponsive host would block this check indefinitely.

    Returns:
        bool: True if the server replied with HTTP 200; False on any other
        status or on any request failure (connection refused, timeout,
        malformed URL, ...).
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.exceptions.RequestException:
        # Base class of every requests error (ConnectionError, Timeout, ...):
        # whatever went wrong, the server is not usable.
        return False
    return response.status_code == 200


assert mlflow_en_ejecucion(SERVIDOR_MLFLOW), f"El servidor MLflow ({SERVIDOR_MLFLOW}) no está en ejecución. Lance 'mlflow ui' desde el terminal."

### Iniciamos el entrenamiento

In [5]:
# Point the MLflow client at the tracking server checked in the previous cell.
mlflow.set_tracking_uri(uri=SERVIDOR_MLFLOW)
# Turn on automatic logging for the supported libraries.
mlflow.autolog()
# Select the experiment that groups the NER training runs; as the last
# expression of the cell, the returned experiment is displayed.
mlflow.set_experiment(experiment_name="ENTRENAMIENTO Named Entity Recognition")

2024/02/17 20:56:30 INFO mlflow.tracking.fluent: Autologging successfully enabled for transformers.
2024/02/17 20:56:30 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.


<Experiment: artifact_location='mlflow-artifacts:/607812590786935156', creation_time=1708174761333, experiment_id='607812590786935156', last_update_time=1708174761333, lifecycle_stage='active', name='ENTRENAMIENTO Named Entity Recognition', tags={}>

In [6]:
# Load the pretrained checkpoint with a fresh classification head for this
# project's label set. The head size is derived from id2label rather than
# hard-coded, so it cannot drift from the label maps passed alongside it.
# ignore_mismatched_sizes=True lets the checkpoint's original classifier
# (a different number of labels) be dropped and re-initialised.
model  = AutoModelForTokenClassification.from_pretrained(model_name,
                                                         num_labels=len(id2label), ignore_mismatched_sizes=True,
                                                         id2label=id2label,label2id=label2id)
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Pads inputs and labels dynamically, batch by batch, at training time.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at ..\Models\PlanTL-GOB-ES\roberta-base-bne-capitel-ner-plus and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([17, 768]) in the checkpoint and torch.Size([9, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([17]) in the checkpoint and torch.Size([9]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
# Pretty-print the metadata (description, features, ...) of the raw training split.
pp.pprint(train_dataset.info)

DatasetInfo(description='Dataset para entrenamiento de modelos NER en extracción de datos de escrituras.\n'
                        '                                    Las etiquetas utilizadas se corresponden con los siguientes elementos:\n'
                        "                                    'B-PROTO','I-PROTO':    Número de PROTOCOLO.\n"
                        "                                    'B-FDOC','I-FDOC':      FECHA de firma del DOCUMENTO.\n"
                        "                                    'B-NOT','I-NOT':        NOTARIO, nombre y apellidos.\n"
                        "                                    'B-TDOC','I-TDOC':      TIPO de DOCUMENTO.\n"
                        '                                ',
            citation='',
            homepage='',
            license='',
            features={'id': Value(dtype='string', id=None),
                      'ner_tags': Sequence(feature=ClassLabel(names=['O',
                                      

In [8]:
def f_preproceso(examples, label_all_tokens=False):
    """Tokenize a batch of pre-split sentences and align the NER tags to the tokens.

    The tokenizer may split one word into several sub-word tokens, so the
    per-word ``ner_tags`` have to be re-aligned to produce one label per token.

    No padding is applied here: the ``DataCollatorForTokenClassification``
    handed to the Trainer pads inputs and labels per training batch, which
    avoids storing (and training on) sequences padded to the longest example
    of each ``map`` batch.

    Args:
        examples: Batch with a ``"tokens"`` entry (list of word lists) and a
            ``"ner_tags"`` entry (list of per-word tag id lists).
        label_all_tokens: If False (default) only the first token of every
            word keeps the word's tag and the remaining sub-tokens get -100.
            If True every sub-token receives the tag of its word (the tag is
            copied as is, so a B- tag is repeated on continuation tokens).

    Returns:
        The tokenizer output with an added ``"labels"`` entry holding, for
        each example, one tag id (or -100) per token.
    """
    tokenized_inputs = tokenizer(examples["tokens"], is_split_into_words=True, truncation=True)

    ner_tags_ids = []
    for i, tags in enumerate(examples["ner_tags"]):
        # word_ids tells, for every token, the index of the word it comes from
        # (None for special tokens).
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        tag_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                # Special tokens get -100 so the loss function ignores them.
                tag_ids.append(-100)
            elif word_idx != previous_word_idx or label_all_tokens:
                # First token of a word, or any token when labelling them all.
                tag_ids.append(tags[word_idx])
            else:
                # Continuation sub-token of an already labelled word.
                tag_ids.append(-100)
            previous_word_idx = word_idx
        ner_tags_ids.append(tag_ids)

    tokenized_inputs["labels"] = ner_tags_ids
    return tokenized_inputs

In [9]:
# Run the tokenization / label alignment over both splits, in batches
# (training split first, then validation).
tokenized_train_dataset, tokenized_val_dataset = (
    split.map(f_preproceso, batched=True) for split in (train_dataset, val_dataset)
)
# The raw splits are no longer needed once their tokenized versions exist.
del train_dataset, val_dataset

In [10]:
# Pretty-print the metadata of the tokenized training split to inspect the features added by the preprocessing.
pp.pprint(tokenized_train_dataset.info)

DatasetInfo(description='Dataset para entrenamiento de modelos NER en extracción de datos de escrituras.\n'
                        '                                    Las etiquetas utilizadas se corresponden con los siguientes elementos:\n'
                        "                                    'B-PROTO','I-PROTO':    Número de PROTOCOLO.\n"
                        "                                    'B-FDOC','I-FDOC':      FECHA de firma del DOCUMENTO.\n"
                        "                                    'B-NOT','I-NOT':        NOTARIO, nombre y apellidos.\n"
                        "                                    'B-TDOC','I-TDOC':      TIPO de DOCUMENTO.\n"
                        '                                ',
            citation='',
            homepage='',
            license='',
            features={'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None),
                      'id': Value(dtype='string', id=None),
    

In [11]:
# Metric used by compute_metrics to score the predicted tag sequences.
seqeval = evaluate.load("seqeval")
# Tag names as declared by the dataset's 'ner_tags' feature; a tag id is the
# position of its name in this list.
lista_etiquetas = tokenized_train_dataset.features["ner_tags"].feature.names

# Evaluation function handed to the Trainer
def compute_metrics(eval_pred):
    """Score the model predictions with seqeval.

    Args:
        eval_pred: Pair ``(predictions, labels)``. The predictions are reduced
            with ``argmax`` over axis 2 to get one tag id per token; label
            positions equal to -100 are excluded from the comparison.

    Returns:
        dict: ``precision``, ``recall``, ``f1_score`` and ``accuracy`` taken
        from the overall seqeval results.
    """
    scores_per_tag, gold_ids = eval_pred
    predicted_ids = np.argmax(scores_per_tag, axis=2)

    predicted_tags = []
    gold_tags = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        # Keep only the positions that carry a real label.
        kept = [(p, g) for p, g in zip(predicted_row, gold_row) if g != -100]
        predicted_tags.append([lista_etiquetas[p] for p, _ in kept])
        gold_tags.append([lista_etiquetas[g] for _, g in kept])

    overall = seqeval.compute(predictions=predicted_tags, references=gold_tags)
    return {
        "precision": overall["overall_precision"],
        "recall": overall["overall_recall"],
        "f1_score": overall["overall_f1"],
        "accuracy": overall["overall_accuracy"],
    }

Definición de los parámetros y el trainer

In [12]:
# Training configuration, grouped by concern.
training_args = TrainingArguments(
    # Where checkpoints are written (an existing directory is overwritten).
    output_dir=training_output_dir,
    overwrite_output_dir=True,
    # Optimisation settings.
    do_train=True,
    num_train_epochs=num_epochs,
    learning_rate=lr,
    per_device_eval_batch_size=eval_batch_size,
    # Evaluate and checkpoint on the same step interval.
    evaluation_strategy="steps",
    eval_steps=eval_steps,
    save_strategy="steps",
    save_steps=eval_steps,
    # Reload the best checkpoint when training ends; 'f1_score' is the key
    # returned by compute_metrics.
    metric_for_best_model='f1_score',
    load_best_model_at_end=True,
)

# Trainer wiring the model, the tokenized splits, the dynamic-padding collator
# and the seqeval-based metrics together.
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    compute_metrics=compute_metrics,
)

In [13]:
# The run is named after its mode: a capped test run or a full training.
with mlflow.start_run(run_name=f"Prueba con {train_max}" if train_max else "Entrenamiento"):
    # Record the hyper-parameters before training starts, so they are kept
    # even when the training fails or is interrupted. Autologging was already
    # enabled in the MLflow setup cell and is not re-enabled here.
    mlflow.log_params(ml_params)
    trainer.train()

2024/02/17 20:56:36 INFO mlflow.tracking.fluent: Autologging successfully enabled for transformers.
2024/02/17 20:56:36 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.


  0%|          | 0/3765 [00:00<?, ?it/s]

  0%|          | 0/48 [00:00<?, ?it/s]

{'eval_loss': 0.015202583745121956, 'eval_precision': 0.9407024479245857, 'eval_recall': 0.9572953736654805, 'eval_f1_score': 0.9489263803680982, 'eval_accuracy': 0.9972696518883112, 'eval_runtime': 17.986, 'eval_samples_per_second': 83.787, 'eval_steps_per_second': 2.669, 'epoch': 0.25}


  0%|          | 0/48 [00:00<?, ?it/s]

{'eval_loss': 0.007345446385443211, 'eval_precision': 0.979108635097493, 'eval_recall': 0.9789571406467584, 'eval_f1_score': 0.9790328820116054, 'eval_accuracy': 0.9987441014322784, 'eval_runtime': 17.684, 'eval_samples_per_second': 85.218, 'eval_steps_per_second': 2.714, 'epoch': 0.5}
{'loss': 0.0817, 'learning_rate': 8.671978751660027e-06, 'epoch': 0.66}


  0%|          | 0/48 [00:00<?, ?it/s]

{'eval_loss': 0.005181695334613323, 'eval_precision': 0.9764470443349754, 'eval_recall': 0.9814327711589045, 'eval_f1_score': 0.9789335596882476, 'eval_accuracy': 0.9990119033327485, 'eval_runtime': 17.754, 'eval_samples_per_second': 84.882, 'eval_steps_per_second': 2.704, 'epoch': 0.75}


  0%|          | 0/48 [00:00<?, ?it/s]

{'eval_loss': 0.003898278810083866, 'eval_precision': 0.9839926119747576, 'eval_recall': 0.989169116509361, 'eval_f1_score': 0.9865740740740742, 'eval_accuracy': 0.9992827834159825, 'eval_runtime': 17.807, 'eval_samples_per_second': 84.63, 'eval_steps_per_second': 2.696, 'epoch': 1.0}


  0%|          | 0/48 [00:00<?, ?it/s]

{'eval_loss': 0.0034599360078573227, 'eval_precision': 0.9841635916359164, 'eval_recall': 0.990406931765434, 'eval_f1_score': 0.9872753913781137, 'eval_accuracy': 0.9993135652436228, 'eval_runtime': 17.834, 'eval_samples_per_second': 84.502, 'eval_steps_per_second': 2.691, 'epoch': 1.25}
{'loss': 0.0035, 'learning_rate': 7.343957503320054e-06, 'epoch': 1.33}


  0%|          | 0/48 [00:00<?, ?it/s]

{'eval_loss': 0.0030460883863270283, 'eval_precision': 0.9869230769230769, 'eval_recall': 0.9925731084635618, 'eval_f1_score': 0.989740029314202, 'eval_accuracy': 0.9994059107265435, 'eval_runtime': 18.367, 'eval_samples_per_second': 82.049, 'eval_steps_per_second': 2.613, 'epoch': 1.51}


  0%|          | 0/48 [00:00<?, ?it/s]

{'eval_loss': 0.00292876735329628, 'eval_precision': 0.9873495834618945, 'eval_recall': 0.9902522048584249, 'eval_f1_score': 0.9887987640015451, 'eval_accuracy': 0.9993997543610155, 'eval_runtime': 17.744, 'eval_samples_per_second': 84.93, 'eval_steps_per_second': 2.705, 'epoch': 1.76}
{'loss': 0.0019, 'learning_rate': 6.0159362549800805e-06, 'epoch': 1.99}


  0%|          | 0/48 [00:00<?, ?it/s]

{'eval_loss': 0.0030608715023845434, 'eval_precision': 0.9858570330514989, 'eval_recall': 0.9922636546495436, 'eval_f1_score': 0.9890499691548427, 'eval_accuracy': 0.9993566598023191, 'eval_runtime': 17.748, 'eval_samples_per_second': 84.911, 'eval_steps_per_second': 2.705, 'epoch': 2.01}


  0%|          | 0/48 [00:00<?, ?it/s]

{'eval_loss': 0.0027839569374918938, 'eval_precision': 0.9858308948097951, 'eval_recall': 0.990406931765434, 'eval_f1_score': 0.9881136153133684, 'eval_accuracy': 0.9994920998439362, 'eval_runtime': 17.739, 'eval_samples_per_second': 84.954, 'eval_steps_per_second': 2.706, 'epoch': 2.26}


  0%|          | 0/48 [00:00<?, ?it/s]

{'eval_loss': 0.0025701303966343403, 'eval_precision': 0.9907163855794523, 'eval_recall': 0.9907163855794523, 'eval_f1_score': 0.9907163855794523, 'eval_accuracy': 0.9995013343922282, 'eval_runtime': 17.798, 'eval_samples_per_second': 84.672, 'eval_steps_per_second': 2.697, 'epoch': 2.51}
{'loss': 0.0011, 'learning_rate': 4.687915006640107e-06, 'epoch': 2.66}


  0%|          | 0/48 [00:00<?, ?it/s]

{'eval_loss': 0.0030539745930582285, 'eval_precision': 0.9906947890818859, 'eval_recall': 0.9883954819743154, 'eval_f1_score': 0.989543799860584, 'eval_accuracy': 0.9993812852644313, 'eval_runtime': 17.736, 'eval_samples_per_second': 84.968, 'eval_steps_per_second': 2.706, 'epoch': 2.76}


  0%|          | 0/48 [00:00<?, ?it/s]

{'eval_loss': 0.004369289148598909, 'eval_precision': 0.9834634818557648, 'eval_recall': 0.9938109237196349, 'eval_f1_score': 0.9886101277512699, 'eval_accuracy': 0.9990765451707929, 'eval_runtime': 18.567, 'eval_samples_per_second': 81.166, 'eval_steps_per_second': 2.585, 'epoch': 3.01}


  0%|          | 0/48 [00:00<?, ?it/s]

{'eval_loss': 0.0024113820400089025, 'eval_precision': 0.9879944589810682, 'eval_recall': 0.9931920160915984, 'eval_f1_score': 0.9905864197530864, 'eval_accuracy': 0.9995167253060483, 'eval_runtime': 18.2729, 'eval_samples_per_second': 82.472, 'eval_steps_per_second': 2.627, 'epoch': 3.26}
{'loss': 0.001, 'learning_rate': 3.359893758300133e-06, 'epoch': 3.32}


  0%|          | 0/48 [00:00<?, ?it/s]

{'eval_loss': 0.0024876196403056383, 'eval_precision': 0.9896604938271605, 'eval_recall': 0.9922636546495436, 'eval_f1_score': 0.9909603646758867, 'eval_accuracy': 0.9994982562094642, 'eval_runtime': 17.832, 'eval_samples_per_second': 84.511, 'eval_steps_per_second': 2.692, 'epoch': 3.51}


  0%|          | 0/48 [00:00<?, ?it/s]

{'eval_loss': 0.002331054536625743, 'eval_precision': 0.9924230709757229, 'eval_recall': 0.9930372891845892, 'eval_f1_score': 0.9927300850734726, 'eval_accuracy': 0.9995936798751489, 'eval_runtime': 18.17, 'eval_samples_per_second': 82.939, 'eval_steps_per_second': 2.642, 'epoch': 3.76}
{'loss': 0.0009, 'learning_rate': 2.03187250996016e-06, 'epoch': 3.98}


  0%|          | 0/48 [00:00<?, ?it/s]

{'eval_loss': 0.0023382992949336767, 'eval_precision': 0.9919703520691785, 'eval_recall': 0.993965650626644, 'eval_f1_score': 0.9929669989952856, 'eval_accuracy': 0.9995782889613288, 'eval_runtime': 17.789, 'eval_samples_per_second': 84.715, 'eval_steps_per_second': 2.698, 'epoch': 4.02}


  0%|          | 0/48 [00:00<?, ?it/s]

{'eval_loss': 0.002372560789808631, 'eval_precision': 0.9913526868437307, 'eval_recall': 0.9933467429986075, 'eval_f1_score': 0.9923487131926734, 'eval_accuracy': 0.9995598198647446, 'eval_runtime': 17.79, 'eval_samples_per_second': 84.711, 'eval_steps_per_second': 2.698, 'epoch': 4.27}


  0%|          | 0/48 [00:00<?, ?it/s]

{'eval_loss': 0.0023339290637522936, 'eval_precision': 0.990587872241938, 'eval_recall': 0.9933467429986075, 'eval_f1_score': 0.9919653893695921, 'eval_accuracy': 0.9995413507681605, 'eval_runtime': 17.743, 'eval_samples_per_second': 84.935, 'eval_steps_per_second': 2.705, 'epoch': 4.52}
{'loss': 0.0005, 'learning_rate': 7.03851261620186e-07, 'epoch': 4.65}


  0%|          | 0/48 [00:00<?, ?it/s]

{'eval_loss': 0.0023359658662229776, 'eval_precision': 0.9919629057187017, 'eval_recall': 0.9930372891845892, 'eval_f1_score': 0.9924998066960489, 'eval_accuracy': 0.9995721325958007, 'eval_runtime': 17.752, 'eval_samples_per_second': 84.892, 'eval_steps_per_second': 2.704, 'epoch': 4.77}
{'train_runtime': 1108.3384, 'train_samples_per_second': 27.176, 'train_steps_per_second': 3.397, 'train_loss': 0.012100038264852122, 'epoch': 5.0}


In [14]:
# Save the fine-tuned model and its tokenizer in a timestamped directory
# under MODELS_DIR (datetime is imported in the notebook's import cell).
n_epochs = trainer.args.num_train_epochs
# NOTE(review): global_step is the final step count of the run; with
# load_best_model_at_end=True the saved weights presumably come from the best
# checkpoint, which may be an earlier step - confirm if the name must reflect it.
g_steps = trainer.state.global_step
fecha_hora = datetime.now().strftime("%Y%m%d-%H%M")
ruta_modelo_ajustado = os.path.join(MODELS_DIR,f"{fecha_hora}_escrituras_NER_{n_epochs}-epoch_{g_steps}-steps")
trainer.save_model(ruta_modelo_ajustado)
# Last expression of the cell: the paths of the written tokenizer files are displayed.
tokenizer.save_pretrained(ruta_modelo_ajustado)

('..\\Models\\20240217-2115_escrituras_NER_5-epoch_3765-steps\\tokenizer_config.json',
 '..\\Models\\20240217-2115_escrituras_NER_5-epoch_3765-steps\\special_tokens_map.json',
 '..\\Models\\20240217-2115_escrituras_NER_5-epoch_3765-steps\\vocab.json',
 '..\\Models\\20240217-2115_escrituras_NER_5-epoch_3765-steps\\merges.txt',
 '..\\Models\\20240217-2115_escrituras_NER_5-epoch_3765-steps\\added_tokens.json',
 '..\\Models\\20240217-2115_escrituras_NER_5-epoch_3765-steps\\tokenizer.json')