In [1]:
import os
from globals import TRAINING_DIR,MODELS_DIR, DATA_DIR, id2label,label2id
import mlflow
from datasets import load_dataset
import requests
from transformers import (
    AutoModelForTokenClassification,
    AutoTokenizer,
    DataCollatorForTokenClassification,
    TrainingArguments,
    Trainer,
    pipeline)
from pprint import PrettyPrinter
from numpy import argmax
from torch.cuda import is_available as cuda_is_available
from metrics.evaluar_metricas import evaluar_metricas_NER
from seqeval.metrics import f1_score, precision_score, recall_score, accuracy_score
from datetime import datetime
from globals import ner_predicted_labels, group_by_labels

Definición de variables globales, parámetros de entrenamiento y MLflow

In [2]:
# GLOBAL SETTINGS
# Maximum number of examples kept per split for quick trial runs; None uses the full data.
train_max = None
training_output_dir = os.path.join(TRAINING_DIR,"NER")
# Hyper-parameters that are also logged to MLflow for every training run.
ml_params = {
    'num_epochs': 3,
    'lr' : 1e-5,
    'eval_steps' : 0.05, 
    'eval_batch_size' : 64,
    'label_all_tokens': True,
    'model_name': os.path.join(MODELS_DIR,'PlanTL-GOB-ES','roberta-base-bne-capitel-ner-plus')
}
# Placeholders: every key of ml_params must be pre-declared here as a module-level
# variable; the loop below overwrites each placeholder with the configured value.
num_epochs = lr = eval_steps = eval_batch_size = label_all_tokens = model_name = 0
for key, value in ml_params.items():
    # Membership test instead of "is not None": the placeholders are 0, so the old
    # check could never fail, and an undeclared key raised a bare KeyError instead
    # of this message.
    assert key in globals(), f'La variable global {key} debe estar definida'
    globals()[key] = value

pp = PrettyPrinter(width=150)

Carga del conjunto de datos 

In [3]:
# Load the NER configuration of the local 'Escrituras' dataset and keep the two
# splits used for training.
main_dataset = load_dataset(os.path.join(DATA_DIR,'Escrituras'), 'NER',trust_remote_code=True)
train_dataset = main_dataset['train']
val_dataset = main_dataset['validation']
if train_max:
    # Trial run: keep only the first train_max examples of each split. The count is
    # clamped to the split size because select() fails on out-of-range indices and
    # the validation split is smaller than the training split.
    train_dataset = train_dataset.select(range(min(train_max, len(train_dataset))))
    val_dataset = val_dataset.select(range(min(train_max, len(val_dataset))))
del main_dataset

Comprobación de que el servidor MLflow está funcionando para las pruebas

In [4]:
SERVIDOR_MLFLOW = 'http://localhost:5000'


def mlflow_en_ejecucion(url, timeout=5):
    """Tell whether an HTTP server answers with status 200 at ``url``.

    Used to fail fast when the MLflow tracking server is not running, instead of
    letting later MLflow calls stall and eventually error out.

    Args:
        url: Address to probe with a GET request.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        True if the request succeeded with HTTP status 200; False if the status
        differs or the request failed (connection refused, timeout, invalid URL
        or any other ``requests`` error).
    """
    try:
        # Without a timeout requests may wait indefinitely on an unresponsive host.
        response = requests.get(url, timeout=timeout)
    except requests.exceptions.RequestException:
        # Base class of ConnectionError, Timeout, etc.: any failed request means
        # the server is not usable.
        return False
    return response.status_code == 200
    
# Stop the notebook here, with an actionable message, when the tracking server is down.
assert mlflow_en_ejecucion(SERVIDOR_MLFLOW), (
    f"El servidor MLflow ({SERVIDOR_MLFLOW}) no está en ejecución. Lance 'mlflow ui' desde el terminal."
)

### Iniciamos el entrenamiento

Configuración del cliente MLflow (servidor de seguimiento, autolog y experimento) para registrar los entrenamientos

In [5]:
# Point the MLflow client at the tracking server checked above.
mlflow.set_tracking_uri(SERVIDOR_MLFLOW)
# Turn on MLflow autologging for the supported libraries.
mlflow.autolog()
# Select the experiment that groups the NER training runs. Kept as the last
# expression so the cell displays the resulting Experiment object.
mlflow.set_experiment("03 ENTRENAMIENTO Named Entity Recognition")

2024/02/18 19:49:08 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2024/02/18 19:49:08 INFO mlflow.tracking.fluent: Autologging successfully enabled for transformers.


<Experiment: artifact_location='mlflow-artifacts:/826943604288671220', creation_time=1708271990580, experiment_id='826943604288671220', last_update_time=1708271990580, lifecycle_stage='active', name='03 ENTRENAMIENTO Named Entity Recognition', tags={}>

Carga de la lista de etiquetas, el modelo, el tokenizador y el data collator

In [6]:
# Label names exactly as declared by the dataset's 'ner_tags' feature.
lista_etiquetas = train_dataset.features['ner_tags'].feature.names

# The checkpoint's classification head has a different number of labels than this
# dataset, so ignore_mismatched_sizes=True lets it be re-initialised with the new size.
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(lista_etiquetas),
    ignore_mismatched_sizes=True,
    id2label=id2label,
    label2id=label2id,
)
tokenizer = AutoTokenizer.from_pretrained(model_name)
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at ..\Models\PlanTL-GOB-ES\roberta-base-bne-capitel-ner-plus and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([17, 768]) in the checkpoint and torch.Size([9, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([17]) in the checkpoint and torch.Size([9]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
# Pretty-print the DatasetInfo of the training split (description, features, ...).
pp.pprint(train_dataset.info)

DatasetInfo(description='Dataset para entrenamiento de modelos NER en extracción de datos de escrituras.\n'
                        '                                    Las etiquetas utilizadas se corresponden con los siguientes elementos:\n'
                        "                                    'B-PROTO','I-PROTO':    Número de PROTOCOLO.\n"
                        "                                    'B-FDOC','I-FDOC':      FECHA de firma del DOCUMENTO.\n"
                        "                                    'B-NOT','I-NOT':        NOTARIO, nombre y apellidos.\n"
                        "                                    'B-TDOC','I-TDOC':      TIPO de DOCUMENTO.\n"
                        '                                ',
            citation='',
            homepage='',
            license='',
            features={'id': Value(dtype='string', id=None),
                      'ner_tags': Sequence(feature=ClassLabel(names=['O',
                                      

Funciones de preprocesado y evaluación de métricas

In [8]:
def f_preproceso(examples):
    """Tokenise a batch of pre-split sentences and align the NER tags to the tokens.

    The tokenizer may split one word into several tokens, so the word-level
    'ner_tags' are re-aligned to token level and stored under 'labels'.

    Args:
        examples: Batch with a "tokens" entry (one list of words per example) and
            a "ner_tags" entry (one list of tag ids per example, one id per word).

    Returns:
        The tokenizer output for the batch with an added "labels" entry holding,
        per example, one label id per token. Tokens that belong to no word get
        -100; the first token of a word gets the word's tag; further tokens of
        the same word get the word's tag when ml_params['label_all_tokens'] is
        truthy and -100 otherwise.
    """
    encoding = tokenizer(examples["tokens"], is_split_into_words=True, truncation=True, padding=True)

    aligned_labels = []
    for batch_pos, word_tags in enumerate(examples["ner_tags"]):
        example_labels = []
        last_word = None
        # word_ids() maps every token to the index of the word it comes from
        # (None for tokens that do not belong to any word).
        for word_pos in encoding.word_ids(batch_index=batch_pos):
            if word_pos is None:
                # -100 marks positions that must not contribute to the loss.
                label = -100
            elif word_pos != last_word or ml_params['label_all_tokens']:
                # First token of a word, or a continuation token when every
                # token of a word is to be labelled.
                label = word_tags[word_pos]
            else:
                # Continuation token while only the first token is labelled.
                label = -100
            example_labels.append(label)
            last_word = word_pos
        aligned_labels.append(example_labels)

    # Token-level labels to be used for training instead of the word-level tags.
    encoding["labels"] = aligned_labels
    return encoding

In [9]:
# Apply the tokenisation / label alignment to both splits, batch by batch.
tokenized_train_dataset = train_dataset.map(f_preproceso, batched=True)
tokenized_val_dataset = val_dataset.map(f_preproceso, batched=True)
# Only the tokenised versions are used from here on, so drop the raw splits.
del train_dataset, val_dataset

Map:   0%|          | 0/6024 [00:00<?, ? examples/s]

Map:   0%|          | 0/1507 [00:00<?, ? examples/s]

In [10]:
# Pretty-print the DatasetInfo of the tokenised training split to inspect its features.
pp.pprint(tokenized_train_dataset.info)

DatasetInfo(description='Dataset para entrenamiento de modelos NER en extracción de datos de escrituras.\n'
                        '                                    Las etiquetas utilizadas se corresponden con los siguientes elementos:\n'
                        "                                    'B-PROTO','I-PROTO':    Número de PROTOCOLO.\n"
                        "                                    'B-FDOC','I-FDOC':      FECHA de firma del DOCUMENTO.\n"
                        "                                    'B-NOT','I-NOT':        NOTARIO, nombre y apellidos.\n"
                        "                                    'B-TDOC','I-TDOC':      TIPO de DOCUMENTO.\n"
                        '                                ',
            citation='',
            homepage='',
            license='',
            features={'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None),
                      'id': Value(dtype='string', id=None),
    

In [11]:
# Evaluation callback
def compute_metrics(eval_pred):
    """Compute precision, recall, F1 and accuracy for a batch of predictions.

    Args:
        eval_pred: Pair ``(predictions, labels)``. ``predictions`` holds per-token
            class scores (the class is taken with argmax over axis 2) and
            ``labels`` the reference label ids, where -100 marks positions to skip.

    Returns:
        Dict with the keys "precision", "recall", "f1_score" and "accuracy",
        computed on label names after dropping the -100 positions.
    """
    logits, gold_ids = eval_pred
    predicted_ids = argmax(logits, axis=2)

    predictions_list = []
    true_labels = []
    for pred_row, gold_row in zip(predicted_ids, gold_ids):
        # Keep only the positions that carry a real label.
        kept = [(p, g) for p, g in zip(pred_row, gold_row) if g != -100]
        predictions_list.append([lista_etiquetas[p] for p, _ in kept])
        true_labels.append([lista_etiquetas[g] for _, g in kept])

    return {
        "precision": precision_score(true_labels, predictions_list),
        "recall": recall_score(true_labels, predictions_list),
        "f1_score": f1_score(true_labels, predictions_list),
        "accuracy": accuracy_score(true_labels, predictions_list),
    }

Definición de los parámetros y el trainer

In [12]:
training_args = TrainingArguments(
    # Where checkpoints are written (an existing directory is overwritten)
    output_dir=training_output_dir,
    overwrite_output_dir=True,
    # Optimisation
    do_train=True,
    num_train_epochs=num_epochs,
    learning_rate=lr,
    # Evaluation and checkpointing share the same step interval, so each
    # evaluation has a matching checkpoint for load_best_model_at_end
    evaluation_strategy="steps",
    eval_steps=eval_steps,
    per_device_eval_batch_size=eval_batch_size,
    save_strategy="steps",
    save_steps=eval_steps,
    # Keep the checkpoint with the best 'f1_score' reported by compute_metrics
    load_best_model_at_end=True,
    metric_for_best_model='f1_score',
)

trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    compute_metrics=compute_metrics,
)

In [13]:
# One MLflow run per training session; the run name tells trial runs (limited by
# train_max) apart from full trainings.
with mlflow.start_run(run_name=f"Prueba con {train_max}" if train_max else "Entrenamiento"):
    mlflow.autolog()
    # Log the notebook's own hyper-parameters before training, in a single call,
    # so they are recorded even if training fails or is interrupted.
    mlflow.log_params(ml_params)
    trainer.train()
    # Save the fine-tuned model and its tokenizer in a directory named after the
    # timestamp, the number of epochs and the number of optimisation steps.
    n_epochs = trainer.args.num_train_epochs
    g_steps = trainer.state.global_step
    fecha_hora = datetime.now().strftime("%Y%m%d-%H%M")
    ruta_modelo_ajustado = os.path.join(MODELS_DIR,f"{fecha_hora}_escrituras_NER_{n_epochs}-epoch_{g_steps}-steps")
    trainer.save_model(ruta_modelo_ajustado)
    tokenizer.save_pretrained(ruta_modelo_ajustado)
    

2024/02/18 19:49:22 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2024/02/18 19:49:22 INFO mlflow.tracking.fluent: Autologging successfully enabled for transformers.


  0%|          | 0/2259 [00:00<?, ?it/s]

  0%|          | 0/24 [00:00<?, ?it/s]

{'eval_loss': 0.036558691412210464, 'eval_precision': 0.8889960614098545, 'eval_recall': 0.7965430320489737, 'eval_f1_score': 0.8402339892121857, 'eval_accuracy': 0.9917805069049958, 'eval_runtime': 17.236, 'eval_samples_per_second': 87.433, 'eval_steps_per_second': 1.392, 'epoch': 0.15}


  0%|          | 0/24 [00:00<?, ?it/s]

{'eval_loss': 0.013875742442905903, 'eval_precision': 0.9355610321832415, 'eval_recall': 0.9295642779978394, 'eval_f1_score': 0.932553014703226, 'eval_accuracy': 0.9969048714092158, 'eval_runtime': 17.177, 'eval_samples_per_second': 87.734, 'eval_steps_per_second': 1.397, 'epoch': 0.3}


  0%|          | 0/24 [00:00<?, ?it/s]

{'eval_loss': 0.008487021550536156, 'eval_precision': 0.9635219226092554, 'eval_recall': 0.9701836514223983, 'eval_f1_score': 0.9668413119931099, 'eval_accuracy': 0.9982162612338251, 'eval_runtime': 17.039, 'eval_samples_per_second': 88.444, 'eval_steps_per_second': 1.409, 'epoch': 0.45}


  0%|          | 0/24 [00:00<?, ?it/s]

{'eval_loss': 0.00683123804628849, 'eval_precision': 0.9711723254324152, 'eval_recall': 0.9826431400792222, 'eval_f1_score': 0.9768740602849574, 'eval_accuracy': 0.998527017116434, 'eval_runtime': 17.139, 'eval_samples_per_second': 87.928, 'eval_steps_per_second': 1.4, 'epoch': 0.6}
{'loss': 0.0884, 'learning_rate': 7.786631252766713e-06, 'epoch': 0.66}


  0%|          | 0/24 [00:00<?, ?it/s]

{'eval_loss': 0.007308057509362698, 'eval_precision': 0.9818529130850048, 'eval_recall': 0.9624774936982355, 'eval_f1_score': 0.972068664533023, 'eval_accuracy': 0.9981649865131947, 'eval_runtime': 17.107, 'eval_samples_per_second': 88.093, 'eval_steps_per_second': 1.403, 'epoch': 0.75}


  0%|          | 0/24 [00:00<?, ?it/s]

{'eval_loss': 0.004757819697260857, 'eval_precision': 0.9816658311251164, 'eval_recall': 0.9871804105149442, 'eval_f1_score': 0.9844153978741742, 'eval_accuracy': 0.998982274484456, 'eval_runtime': 17.06, 'eval_samples_per_second': 88.335, 'eval_steps_per_second': 1.407, 'epoch': 0.9}


  0%|          | 0/24 [00:00<?, ?it/s]

{'eval_loss': 0.004728749394416809, 'eval_precision': 0.9802156988786516, 'eval_recall': 0.9884047533309327, 'eval_f1_score': 0.9842931937172775, 'eval_accuracy': 0.9989434299991299, 'eval_runtime': 17.071, 'eval_samples_per_second': 88.278, 'eval_steps_per_second': 1.406, 'epoch': 1.05}


  0%|          | 0/24 [00:00<?, ?it/s]

{'eval_loss': 0.0038392606656998396, 'eval_precision': 0.9864592336502449, 'eval_recall': 0.986388188692834, 'eval_f1_score': 0.986423709892326, 'eval_accuracy': 0.9991438675434126, 'eval_runtime': 17.169, 'eval_samples_per_second': 87.774, 'eval_steps_per_second': 1.398, 'epoch': 1.2}
{'loss': 0.0043, 'learning_rate': 5.573262505533422e-06, 'epoch': 1.33}


  0%|          | 0/24 [00:00<?, ?it/s]

{'eval_loss': 0.0035473830066621304, 'eval_precision': 0.9868847733660013, 'eval_recall': 0.9863161685271876, 'eval_f1_score': 0.9866003890209638, 'eval_accuracy': 0.99926506233763, 'eval_runtime': 17.293, 'eval_samples_per_second': 87.145, 'eval_steps_per_second': 1.388, 'epoch': 1.35}


  0%|          | 0/24 [00:00<?, ?it/s]

{'eval_loss': 0.003044114913791418, 'eval_precision': 0.989986312225344, 'eval_recall': 0.9897011163125675, 'eval_f1_score': 0.9898436937261398, 'eval_accuracy': 0.9993738268965432, 'eval_runtime': 17.014, 'eval_samples_per_second': 88.574, 'eval_steps_per_second': 1.411, 'epoch': 1.5}


  0%|          | 0/24 [00:00<?, ?it/s]

{'eval_loss': 0.0028446216601878405, 'eval_precision': 0.9896566585260739, 'eval_recall': 0.9922938422758373, 'eval_f1_score': 0.9909734958823319, 'eval_accuracy': 0.9994468545289562, 'eval_runtime': 17.183, 'eval_samples_per_second': 87.703, 'eval_steps_per_second': 1.397, 'epoch': 1.65}


  0%|          | 0/24 [00:00<?, ?it/s]

{'eval_loss': 0.0027299632783979177, 'eval_precision': 0.9919250180245134, 'eval_recall': 0.9908534389629096, 'eval_f1_score': 0.9913889389299225, 'eval_accuracy': 0.9994934679113475, 'eval_runtime': 17.148, 'eval_samples_per_second': 87.882, 'eval_steps_per_second': 1.4, 'epoch': 1.8}


  0%|          | 0/24 [00:00<?, ?it/s]

{'eval_loss': 0.0031106192618608475, 'eval_precision': 0.9894842984730625, 'eval_recall': 0.989413035649982, 'eval_f1_score': 0.989448665778386, 'eval_accuracy': 0.9993847033524345, 'eval_runtime': 17.111, 'eval_samples_per_second': 88.072, 'eval_steps_per_second': 1.403, 'epoch': 1.95}
{'loss': 0.0025, 'learning_rate': 3.359893758300133e-06, 'epoch': 1.99}


  0%|          | 0/24 [00:00<?, ?it/s]

{'eval_loss': 0.002760678995400667, 'eval_precision': 0.9903555491579099, 'eval_recall': 0.9909974792942023, 'eval_f1_score': 0.9906764102379496, 'eval_accuracy': 0.9994468545289562, 'eval_runtime': 17.107, 'eval_samples_per_second': 88.092, 'eval_steps_per_second': 1.403, 'epoch': 2.1}


  0%|          | 0/24 [00:00<?, ?it/s]

{'eval_loss': 0.002778151072561741, 'eval_precision': 0.9883287985106688, 'eval_recall': 0.9940943464169968, 'eval_f1_score': 0.9912031883953898, 'eval_accuracy': 0.9994515158671954, 'eval_runtime': 16.713, 'eval_samples_per_second': 90.169, 'eval_steps_per_second': 1.436, 'epoch': 2.25}


  0%|          | 0/24 [00:00<?, ?it/s]

{'eval_loss': 0.002621967811137438, 'eval_precision': 0.9912192313228732, 'eval_recall': 0.9918617212819589, 'eval_f1_score': 0.9915403722236221, 'eval_accuracy': 0.9994856990142823, 'eval_runtime': 16.638, 'eval_samples_per_second': 90.576, 'eval_steps_per_second': 1.442, 'epoch': 2.4}


  0%|          | 0/24 [00:00<?, ?it/s]

{'eval_loss': 0.002713553374633193, 'eval_precision': 0.9897292250233427, 'eval_recall': 0.99243788260713, 'eval_f1_score': 0.9910817031070197, 'eval_accuracy': 0.9994142251612823, 'eval_runtime': 16.74, 'eval_samples_per_second': 90.024, 'eval_steps_per_second': 1.434, 'epoch': 2.55}
{'loss': 0.0015, 'learning_rate': 1.1465250110668438e-06, 'epoch': 2.66}


  0%|          | 0/24 [00:00<?, ?it/s]

{'eval_loss': 0.0026107176672667265, 'eval_precision': 0.9911555331847272, 'eval_recall': 0.9927259632697155, 'eval_f1_score': 0.9919401266551525, 'eval_accuracy': 0.9994763763378041, 'eval_runtime': 16.773, 'eval_samples_per_second': 89.847, 'eval_steps_per_second': 1.431, 'epoch': 2.7}


  0%|          | 0/24 [00:00<?, ?it/s]

{'eval_loss': 0.0025513186119496822, 'eval_precision': 0.9910078411625063, 'eval_recall': 0.9921498019445445, 'eval_f1_score': 0.9915784927661412, 'eval_accuracy': 0.9994763763378041, 'eval_runtime': 16.754, 'eval_samples_per_second': 89.948, 'eval_steps_per_second': 1.432, 'epoch': 2.85}
{'train_runtime': 815.0988, 'train_samples_per_second': 22.172, 'train_steps_per_second': 2.771, 'train_loss': 0.0215389627094235, 'epoch': 3.0}


In [14]:
# NOTE(review): the training cell above already saves the model and tokenizer with
# this same naming scheme; because the timestamp is recomputed here, running this
# cell in a later minute writes a second copy. Consider keeping only one of the two.
fecha_hora = datetime.now().strftime("%Y%m%d-%H%M")
n_epochs, g_steps = trainer.args.num_train_epochs, trainer.state.global_step
ruta_modelo_ajustado = os.path.join(
    MODELS_DIR,
    f"{fecha_hora}_escrituras_NER_{n_epochs}-epoch_{g_steps}-steps",
)
trainer.save_model(ruta_modelo_ajustado)
# Last expression: the cell displays the paths of the tokenizer files written.
tokenizer.save_pretrained(ruta_modelo_ajustado)

('..\\Models\\20240218-2002_escrituras_NER_3-epoch_2259-steps\\tokenizer_config.json',
 '..\\Models\\20240218-2002_escrituras_NER_3-epoch_2259-steps\\special_tokens_map.json',
 '..\\Models\\20240218-2002_escrituras_NER_3-epoch_2259-steps\\vocab.json',
 '..\\Models\\20240218-2002_escrituras_NER_3-epoch_2259-steps\\merges.txt',
 '..\\Models\\20240218-2002_escrituras_NER_3-epoch_2259-steps\\added_tokens.json',
 '..\\Models\\20240218-2002_escrituras_NER_3-epoch_2259-steps\\tokenizer.json')

Prueba de resultados para ver cómo predice en comparación con un ejemplo del conjunto de pruebas

In [4]:
# Fine-tuned checkpoint to inspect. The path is built from MODELS_DIR, as in the
# cells that save the model, instead of a hard-coded Windows-style literal, so it
# is written once and works on any OS.
ruta_modelo_prueba = os.path.join(MODELS_DIR, '20240218-2002_escrituras_NER_3-epoch_2259-steps')
model = AutoModelForTokenClassification.from_pretrained(ruta_modelo_prueba)
tokenizer = AutoTokenizer.from_pretrained(ruta_modelo_prueba)

In [5]:
# Index of the test example used for this manual check.
id_test = 2
model.eval()
test_dataset = load_dataset(os.path.join(DATA_DIR,'Escrituras'), 'NER',trust_remote_code=True, split="test")

# Token-classification pipeline on the first GPU when CUDA is available.
consulta =  pipeline("ner", model=model, tokenizer=tokenizer, 
                device=0 if cuda_is_available() else None, batch_size=32)        
# Index the row first: test_dataset['tokens'][id_test] would read the whole
# 'tokens' column just to keep a single example.
text = " ".join(test_dataset[id_test]['tokens'])
predicciones = consulta(text)
# Show the predicted entities grouped by label.
pp.pprint(group_by_labels(ner_predicted_labels(predicciones,text, tokenizer)))

[{'label': 'PROTO', 'matches': [{'start': 7, 'text': 'siete mil doscientos siete'}]},
 {'label': 'TDOC', 'matches': [{'start': 76, 'text': 'HERENCIA'}]},
 {'label': 'FDOC', 'matches': [{'start': 125, 'text': 'veinte de junio de dos mil veintiuno'}]},
 {'label': 'NOT', 'matches': [{'start': 174, 'text': 'TARU ELINA BENTANCUR'}]}]


## Validación final con el conjunto de test

In [16]:
# Configure the MLflow client again so this cell also works after a kernel restart.
mlflow.set_tracking_uri(SERVIDOR_MLFLOW)
mlflow.autolog()
mlflow.set_experiment("03 ENTRENAMIENTO Named Entity Recognition")

try:
    # NER training needs less memory, so a restart may not have been necessary and
    # the path of the model that was just trained is still defined.
    print(ruta_modelo_ajustado)
except NameError:
    # Only a missing variable (kernel restarted) triggers the fallback; any other
    # error is no longer swallowed. Copy here the path of the latest training.
    ruta_modelo_ajustado = os.path.join(MODELS_DIR,"20240218-1730_escrituras_NER_5-epoch_3765-steps")
    
test_dataset = load_dataset(os.path.join(DATA_DIR,'Escrituras'), 'NER',trust_remote_code=True, split="test")
# Evaluate the fine-tuned model on the held-out test split inside its own MLflow run.
with mlflow.start_run(run_name="VALIDACIÓN",description="Validación del modelo NER ajsutado."):
    evaluar_metricas_NER(ruta_modelo_ajustado,test_dataset)
    

2024/02/18 20:03:01 INFO mlflow.tracking.fluent: Autologging successfully enabled for transformers.


..\Models\20240218-2002_escrituras_NER_3-epoch_2259-steps
..\Models\20240218-2002_escrituras_NER_3-epoch_2259-steps
	f1: 0.956537186167693
	precision: 0.9869257086999023
	recall: 0.9279641544117647
	accuracy: 0.99427895048333
