In [1]:
from datasets import load_dataset
from metrics.evaluate import compute_exact, compute_f1
from metrics.evaluar_metricas import evaluar_metricas_QA
import mlflow
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, Trainer, TrainingArguments, pipeline
import requests
import torch
import numpy as np
import os
from datetime import datetime
import pprint
# Impresión elegante de datos en la terminal
pp = pprint.PrettyPrinter(width=150)
from tqdm import tqdm
from globals import TRAINING_DIR,MODELS_DIR, DATA_DIR

Definición de variables globales, parámetros de entrenamiento y MLflow

In [2]:
# GLOBAL SETTINGS
# Maximum number of examples kept per split (for quick trial runs); None means a full run.
train_max = None
training_output_dir = os.path.join(TRAINING_DIR, "QA")

# Hyperparameters that are also recorded in the MLflow run.
ml_params = {
    'num_epochs': 2,
    'lr': 1e-5,
    'eval_steps': 0.05,
    'eval_batch_size': 64,
    'model_name': os.path.join(MODELS_DIR, 'PlanTL-GOB-ES', 'roberta-large-bne-sqac'),
}

# Placeholders so that every ml_params key already exists as a module-level name;
# the loop below overwrites each of them with the configured value.
num_epochs = lr = eval_steps = eval_batch_size = model_name = 0
for key, value in ml_params.items():
    # Membership test instead of a None comparison: a key without a placeholder
    # now fails with the intended message rather than with a bare KeyError.
    assert key in globals(), f'La variable global {key} debe estar definida'
    globals()[key] = value

Carga del conjunto de datos 

In [3]:
# Load the 'QA' configuration of the local 'Escrituras' dataset script.
main_dataset = load_dataset(os.path.join(DATA_DIR, 'Escrituras'), 'QA', trust_remote_code=True)
train_dataset = main_dataset['train']
val_dataset = main_dataset['validation']
if train_max:
    # Trial run: keep only the first train_max rows of each split. Clamp to the
    # split length so a validation split smaller than train_max does not make
    # select() fail on out-of-range indices.
    train_dataset = train_dataset.select(range(min(train_max, len(train_dataset))))
    val_dataset = val_dataset.select(range(min(train_max, len(val_dataset))))
# Only the two splits above are used from here on.
del main_dataset

Comprobación de que el servidor MLflow está funcionando para las pruebas

In [4]:
SERVIDOR_MLFLOW = 'http://localhost:5000'


def mlflow_en_ejecucion(url, timeout=5):
    """Tell whether an HTTP server answers at ``url`` with status 200.

    The check is done up front because, without a running MLflow server, the
    later MLflow calls stall for a long time before failing.

    Args:
        url: Base URL of the MLflow tracking server.
        timeout: Seconds to wait for the connection/response before giving up.

    Returns:
        True if the GET request returns HTTP 200; False if the connection
        cannot be established, the request times out, or another status code
        is returned.
    """
    try:
        # Without a timeout a host that accepts the connection but never
        # answers would block this cell indefinitely.
        response = requests.get(url, timeout=timeout)
    except (requests.exceptions.ConnectionError, requests.exceptions.Timeout):
        # No connection or no answer in time: treat the server as not running.
        return False
    return response.status_code == 200


assert mlflow_en_ejecucion(SERVIDOR_MLFLOW), f"El servidor MLflow ({SERVIDOR_MLFLOW}) no está en ejecución. Lance 'mlflow ui' desde el terminal."

### Iniciamos el entrenamiento

In [5]:
# Tracking server: point the MLflow client at the server checked above.
mlflow.set_tracking_uri(SERVIDOR_MLFLOW)
# Turn on automatic logging for the supported libraries.
mlflow.autolog()
# Runs started later in this notebook are filed under this experiment.
mlflow.set_experiment("02 ENTRENAMIENTO Question-Answering")

2024/02/19 00:20:37 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2024/02/19 00:20:37 INFO mlflow.tracking.fluent: Autologging successfully enabled for transformers.


<Experiment: artifact_location='mlflow-artifacts:/469736907013971428', creation_time=1708261517045, experiment_id='469736907013971428', last_update_time=1708261517045, lifecycle_stage='active', name='02 ENTRENAMIENTO Question-Answering', tags={}>

In [6]:
# Load the question-answering model and its tokenizer from the same path (model_name comes from ml_params).
model  = AutoModelForQuestionAnswering.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [7]:
## Extraído del tutorial de HF sobre Question-Answering
def _localizar_span(offset, sequence_ids, start_char, end_char):
    """Translate a character span of the context into token indices.

    Args:
        offset: Per-token ``(char_start, char_end)`` pairs for one example.
        sequence_ids: Per-token sequence id for the same example; context
            tokens are the ones marked with 1.
        start_char: First character of the answer inside the context.
        end_char: Character position just past the end of the answer.

    Returns:
        ``(start_token, end_token)``, or ``(0, 0)`` when the answer is not
        entirely covered by the context tokens.
    """
    # Find the first and last context tokens.
    idx = 0
    while sequence_ids[idx] != 1:
        idx += 1
    context_start = idx
    # Bounded scan: do not run past the end if the context reaches the last token.
    while idx < len(sequence_ids) and sequence_ids[idx] == 1:
        idx += 1
    context_end = idx - 1

    # The answer must lie completely inside the kept context. An answer that
    # is cut by truncation would otherwise get an end index pointing at the
    # token that follows the context.
    if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
        return 0, 0

    # Last context token that starts at or before the answer start.
    idx = context_start
    while idx <= context_end and offset[idx][0] <= start_char:
        idx += 1
    start = idx - 1

    # First context token (from the right) that ends at or after the answer end.
    idx = context_end
    while idx >= context_start and offset[idx][1] >= end_char:
        idx -= 1
    end = idx + 1
    return start, end


def f_preproceso(examples):
    """Tokenize a batch of question/context pairs and add answer-span labels.

    Args:
        examples: Batch with the columns ``"question"``, ``"context"`` and
            ``"answers"``; each answer holds the lists ``"answer_start"`` and
            ``"text"``, of which only the first element is used.

    Returns:
        The tokenizer output (without ``offset_mapping``) extended with
        ``start_positions``, ``end_positions`` and ``labels``
        (``[start, end]`` per example). Examples whose answer is missing or
        not fully inside the kept context are labelled ``(0, 0)``.
    """
    questions = [q.strip() for q in examples["question"]]
    # NOTE(review): stride looks like a no-op here because
    # return_overflowing_tokens is not requested, so contexts longer than
    # max_length are simply cut — confirm this is intended.
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=512,
        stride=128,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []
    labels = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]
        if not answer["answer_start"] or not answer["text"]:
            # No annotated answer for this example.
            start, end = 0, 0
        else:
            start_char = answer["answer_start"][0]
            end_char = start_char + len(answer["text"][0])
            # sequence_ids tells which tokens belong to the question and which to the context.
            start, end = _localizar_span(offset, inputs.sequence_ids(i), start_char, end_char)

        start_positions.append(start)
        end_positions.append(end)
        labels.append([start, end])

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    inputs["labels"] = labels
    return inputs

In [8]:
# Run the preprocessing over both splits in batches, dropping the original
# columns so that only the fields produced by f_preproceso remain.
train_tokenized = train_dataset.map(
    f_preproceso,
    batched=True,
    remove_columns=train_dataset.column_names,
)
eval_tokenized = val_dataset.map(
    f_preproceso,
    batched=True,
    remove_columns=val_dataset.column_names,
)
# The untokenized splits are no longer needed.
del train_dataset, val_dataset

In [9]:
# Pretty-print the metadata of the tokenized training set to check the generated features.
pp.pprint(train_tokenized.info)

DatasetInfo(description='',
            citation='',
            homepage='',
            license='',
            features={'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None),
                      'end_positions': Value(dtype='int64', id=None),
                      'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
                      'labels': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None),
                      'start_positions': Value(dtype='int64', id=None)},
            post_processed=None,
            supervised_keys=None,
            task_templates=None,
            builder_name='escrituras',
            dataset_name='escrituras',
            config_name='QA',
            version=0.0.0,
            splits={'test': SplitInfo(name='test', num_bytes=11002385, num_examples=7532, shard_lengths=None, dataset_name='escrituras'),
                    'train': SplitInfo(name='train', num_bytes=350137

In [10]:
def compute_metrics(eval_pred):
    """Score predicted answer spans against the labelled spans.

    The predicted span of each example goes from the argmax of
    ``eval_pred.predictions[0]`` to the argmax of ``eval_pred.predictions[1]``
    (both taken along axis 1); the reference span comes from
    ``eval_pred.label_ids[0]`` and ``eval_pred.label_ids[1]``. Both spans are
    cut (end inclusive) out of ``eval_pred.inputs``, decoded with the
    tokenizer and stripped before being compared as text.

    Returns:
        Dict with the mean ``f1_score`` and mean ``exact_score``.
    """

    def _decode_spans(first_tokens, last_tokens):
        # One decoded, stripped string per example; the end index is inclusive.
        return [
            tokenizer.decode(ids[first:last + 1]).strip()
            for ids, first, last in zip(eval_pred.inputs, first_tokens, last_tokens)
        ]

    predicted_start = np.argmax(eval_pred.predictions[0], axis=1)
    predicted_end = np.argmax(eval_pred.predictions[1], axis=1)
    predicted_text = _decode_spans(predicted_start, predicted_end)

    reference_text = _decode_spans(eval_pred.label_ids[0], eval_pred.label_ids[1])

    pairs = list(zip(reference_text, predicted_text))
    f1_values = [compute_f1(gold, pred) for gold, pred in pairs]
    exact_values = [compute_exact(gold, pred) for gold, pred in pairs]

    return {'f1_score': np.mean(f1_values), 'exact_score': np.mean(exact_values)}

In [11]:
training_arg = TrainingArguments(
    # Output location
    output_dir=training_output_dir,
    overwrite_output_dir=True,
    do_train=True,
    # Optimisation
    num_train_epochs=num_epochs,
    learning_rate=lr,
    warmup_ratio=0.2,
    auto_find_batch_size=True,
    # Evaluation, checkpointing and logging all share the eval_steps interval
    evaluation_strategy='steps',
    save_strategy='steps',
    eval_steps=eval_steps,
    save_steps=eval_steps,
    logging_steps=eval_steps,
    per_device_eval_batch_size=eval_batch_size,
    # compute_metrics reads eval_pred.inputs to decode the spans
    include_inputs_for_metrics=True,
    # Model selection, based on the 'f1_score' returned by compute_metrics
    load_best_model_at_end=True,
    metric_for_best_model='f1_score',
)

trainer = Trainer(
    args=training_arg,
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_tokenized,
    eval_dataset=eval_tokenized,
    compute_metrics=compute_metrics,
)

In [12]:
with mlflow.start_run(run_name=f"{'Prueba con ' + str(train_max) if train_max else 'Entrenamiento'}"):
    # Record the hyperparameters before the (long) training starts, so they
    # are kept in the run even if training is interrupted or fails.
    mlflow.log_params(ml_params)

    trainer.train()

    # Save the model and tokenizer under a name that encodes the timestamp,
    # the configured number of epochs and the number of optimisation steps done.
    n_epochs = trainer.args.num_train_epochs
    g_steps = trainer.state.global_step
    fecha_hora = datetime.now().strftime("%Y%m%d-%H%M")
    ruta_modelo_ajustado = os.path.join(MODELS_DIR,f"{fecha_hora}_escrituras_QA_{n_epochs}-epoch_{g_steps}-steps")
    trainer.save_model(ruta_modelo_ajustado)
    tokenizer.save_pretrained(ruta_modelo_ajustado)

  0%|          | 0/6024 [00:00<?, ?it/s]

{'loss': 1.0132, 'learning_rate': 2.506224066390042e-06, 'epoch': 0.1}


  0%|          | 0/95 [00:00<?, ?it/s]

Checkpoint destination directory ..\training\QA\checkpoint-302 already exists and is non-empty.Saving will proceed but saved results may be invalid.


{'eval_loss': 0.2236010730266571, 'eval_f1_score': 0.9546375879610878, 'eval_exact_score': 0.942601194426012, 'eval_runtime': 141.164, 'eval_samples_per_second': 42.702, 'eval_steps_per_second': 0.673, 'epoch': 0.1}
{'loss': 0.1321, 'learning_rate': 5.012448132780084e-06, 'epoch': 0.2}


  0%|          | 0/95 [00:00<?, ?it/s]

{'eval_loss': 0.12046411633491516, 'eval_f1_score': 0.9686049579744994, 'eval_exact_score': 0.966821499668215, 'eval_runtime': 141.251, 'eval_samples_per_second': 42.676, 'eval_steps_per_second': 0.673, 'epoch': 0.2}
{'loss': 0.1097, 'learning_rate': 7.518672199170125e-06, 'epoch': 0.3}


  0%|          | 0/95 [00:00<?, ?it/s]

{'eval_loss': 0.10524611175060272, 'eval_f1_score': 0.9811953203604471, 'eval_exact_score': 0.9781021897810219, 'eval_runtime': 141.449, 'eval_samples_per_second': 42.616, 'eval_steps_per_second': 0.672, 'epoch': 0.3}
{'loss': 0.0774, 'learning_rate': 9.993774642041917e-06, 'epoch': 0.4}


  0%|          | 0/95 [00:00<?, ?it/s]

{'eval_loss': 0.1250726729631424, 'eval_f1_score': 0.9739426979428766, 'eval_exact_score': 0.9698075646980756, 'eval_runtime': 141.447, 'eval_samples_per_second': 42.617, 'eval_steps_per_second': 0.672, 'epoch': 0.4}
{'loss': 0.1118, 'learning_rate': 9.367088607594937e-06, 'epoch': 0.5}


  0%|          | 0/95 [00:00<?, ?it/s]

{'eval_loss': 0.07257575541734695, 'eval_f1_score': 0.9793948972080436, 'eval_exact_score': 0.9774386197743862, 'eval_runtime': 141.556, 'eval_samples_per_second': 42.584, 'eval_steps_per_second': 0.671, 'epoch': 0.5}
{'loss': 0.0405, 'learning_rate': 8.740402573147956e-06, 'epoch': 0.6}


  0%|          | 0/95 [00:00<?, ?it/s]

{'eval_loss': 0.08856762200593948, 'eval_f1_score': 0.988843183247588, 'eval_exact_score': 0.9854014598540146, 'eval_runtime': 141.576, 'eval_samples_per_second': 42.578, 'eval_steps_per_second': 0.671, 'epoch': 0.6}
{'loss': 0.0473, 'learning_rate': 8.113716538700976e-06, 'epoch': 0.7}


  0%|          | 0/95 [00:00<?, ?it/s]

{'eval_loss': 0.08544055372476578, 'eval_f1_score': 0.9869060164353542, 'eval_exact_score': 0.9850696748506967, 'eval_runtime': 141.333, 'eval_samples_per_second': 42.651, 'eval_steps_per_second': 0.672, 'epoch': 0.7}
{'loss': 0.0629, 'learning_rate': 7.487030504253995e-06, 'epoch': 0.8}


  0%|          | 0/95 [00:00<?, ?it/s]

{'eval_loss': 0.06621473282575607, 'eval_f1_score': 0.9881833381329653, 'eval_exact_score': 0.9858991373589914, 'eval_runtime': 141.314, 'eval_samples_per_second': 42.657, 'eval_steps_per_second': 0.672, 'epoch': 0.8}
{'loss': 0.0693, 'learning_rate': 6.8603444698070145e-06, 'epoch': 0.9}


  0%|          | 0/95 [00:00<?, ?it/s]

{'eval_loss': 0.06027235463261604, 'eval_f1_score': 0.9896903907617908, 'eval_exact_score': 0.9883875248838753, 'eval_runtime': 141.278, 'eval_samples_per_second': 42.668, 'eval_steps_per_second': 0.672, 'epoch': 0.9}
{'loss': 0.0277, 'learning_rate': 6.233658435360034e-06, 'epoch': 1.0}


  0%|          | 0/95 [00:00<?, ?it/s]

{'eval_loss': 0.05897032469511032, 'eval_f1_score': 0.9899613196307058, 'eval_exact_score': 0.988885202388852, 'eval_runtime': 141.241, 'eval_samples_per_second': 42.679, 'eval_steps_per_second': 0.673, 'epoch': 1.0}
{'loss': 0.0179, 'learning_rate': 5.606972400913053e-06, 'epoch': 1.1}


  0%|          | 0/95 [00:00<?, ?it/s]

{'eval_loss': 0.0520549900829792, 'eval_f1_score': 0.9923955156050476, 'eval_exact_score': 0.9918712674187127, 'eval_runtime': 141.359, 'eval_samples_per_second': 42.643, 'eval_steps_per_second': 0.672, 'epoch': 1.1}
{'loss': 0.0203, 'learning_rate': 4.9802863664660725e-06, 'epoch': 1.2}


  0%|          | 0/95 [00:00<?, ?it/s]

{'eval_loss': 0.03773544728755951, 'eval_f1_score': 0.9936564812151959, 'eval_exact_score': 0.9927007299270073, 'eval_runtime': 141.186, 'eval_samples_per_second': 42.695, 'eval_steps_per_second': 0.673, 'epoch': 1.2}
{'loss': 0.0205, 'learning_rate': 4.353600332019091e-06, 'epoch': 1.3}


  0%|          | 0/95 [00:00<?, ?it/s]

{'eval_loss': 0.05073569715023041, 'eval_f1_score': 0.992079740544747, 'eval_exact_score': 0.9903782349037823, 'eval_runtime': 141.118, 'eval_samples_per_second': 42.716, 'eval_steps_per_second': 0.673, 'epoch': 1.3}
{'loss': 0.0235, 'learning_rate': 3.7269142975721105e-06, 'epoch': 1.4}


  0%|          | 0/95 [00:00<?, ?it/s]

{'eval_loss': 0.038729164749383926, 'eval_f1_score': 0.9935578648777007, 'eval_exact_score': 0.9925348374253484, 'eval_runtime': 141.062, 'eval_samples_per_second': 42.733, 'eval_steps_per_second': 0.673, 'epoch': 1.4}
{'loss': 0.0115, 'learning_rate': 3.10022826312513e-06, 'epoch': 1.5}


  0%|          | 0/95 [00:00<?, ?it/s]

{'eval_loss': 0.029656022787094116, 'eval_f1_score': 0.9949539355434138, 'eval_exact_score': 0.9943596549435966, 'eval_runtime': 141.244, 'eval_samples_per_second': 42.678, 'eval_steps_per_second': 0.673, 'epoch': 1.5}
{'loss': 0.0114, 'learning_rate': 2.473542228678149e-06, 'epoch': 1.6}


  0%|          | 0/95 [00:00<?, ?it/s]

{'eval_loss': 0.03464436158537865, 'eval_f1_score': 0.9942903655367781, 'eval_exact_score': 0.9936960849369608, 'eval_runtime': 141.283, 'eval_samples_per_second': 42.666, 'eval_steps_per_second': 0.672, 'epoch': 1.6}
{'loss': 0.02, 'learning_rate': 1.8468561942311686e-06, 'epoch': 1.7}


  0%|          | 0/95 [00:00<?, ?it/s]

{'eval_loss': 0.029813602566719055, 'eval_f1_score': 0.9951663943613278, 'eval_exact_score': 0.9943596549435966, 'eval_runtime': 141.311, 'eval_samples_per_second': 42.658, 'eval_steps_per_second': 0.672, 'epoch': 1.7}
{'loss': 0.0096, 'learning_rate': 1.2201701597841876e-06, 'epoch': 1.8}


  0%|          | 0/95 [00:00<?, ?it/s]

{'eval_loss': 0.03482209891080856, 'eval_f1_score': 0.9945945017898195, 'eval_exact_score': 0.9938619774386198, 'eval_runtime': 141.397, 'eval_samples_per_second': 42.632, 'eval_steps_per_second': 0.672, 'epoch': 1.8}
{'loss': 0.0077, 'learning_rate': 5.934841253372069e-07, 'epoch': 1.91}


  0%|          | 0/95 [00:00<?, ?it/s]

{'eval_loss': 0.029949210584163666, 'eval_f1_score': 0.9956087743657517, 'eval_exact_score': 0.9950232249502322, 'eval_runtime': 144.119, 'eval_samples_per_second': 41.827, 'eval_steps_per_second': 0.659, 'epoch': 1.91}
{'train_runtime': 6284.563, 'train_samples_per_second': 7.668, 'train_steps_per_second': 0.959, 'train_loss': 0.0920549368320075, 'epoch': 2.0}


Una comprobación de que el modelo está respondiendo a las preguntas correctamente

In [13]:
# Wrap the fine-tuned model in a question-answering pipeline, on GPU when one is available.
dispositivo = 'cuda' if torch.cuda.is_available() else 'cpu'
tuned_pipeline = pipeline(
    task="question-answering",
    model=trainer.model,
    tokenizer=tokenizer,
    device=dispositivo,
)

# Ask two questions about the same made-up deed excerpt and print each answer.
contexto_prueba = "DOS MIL TREINTA. En mi residencia, a quince de abril de dos mil quince. Ante mí, Paquito de los Palotes, notario del ilustre colegio de la Palmilla COMPARECEN Manolito y Jacinta para firmar la siguiente escritura de HERENCIA y para lo cual se sientan cómodamente."
for pregunta in ("¿qué notario ha firmado el documento?", "¿cuál es el número de protocolo?"):
    comprobando = {'question': pregunta, 'context': contexto_prueba}
    print(tuned_pipeline(comprobando))

{'score': 0.9999687671661377, 'start': 81, 'end': 103, 'answer': 'Paquito de los Palotes'}
{'score': 0.9892922043800354, 'start': 0, 'end': 15, 'answer': 'DOS MIL TREINTA'}


## Evaluación del modelo generado
Antes de la evaluación hay que reiniciar el kernel (Restart) en VS Code; de lo contrario, el sistema se queda sin memoria.

In [2]:
from datasets import load_dataset
from metrics.evaluar_metricas import evaluar_metricas_QA
import mlflow
import requests
import os
from globals import MODELS_DIR, DATA_DIR

In [3]:
SERVIDOR_MLFLOW = 'http://localhost:5000'


def mlflow_en_ejecucion(url, timeout=5):
    """Tell whether an HTTP server answers at ``url`` with status 200.

    The check is done up front because, without a running MLflow server, the
    later MLflow calls stall for a long time before failing.

    Args:
        url: Base URL of the MLflow tracking server.
        timeout: Seconds to wait for the connection/response before giving up.

    Returns:
        True if the GET request returns HTTP 200; False if the connection
        cannot be established, the request times out, or another status code
        is returned.
    """
    try:
        # Without a timeout a host that accepts the connection but never
        # answers would block this cell indefinitely.
        response = requests.get(url, timeout=timeout)
    except (requests.exceptions.ConnectionError, requests.exceptions.Timeout):
        # No connection or no answer in time: treat the server as not running.
        return False
    return response.status_code == 200


assert mlflow_en_ejecucion(SERVIDOR_MLFLOW), f"El servidor MLflow ({SERVIDOR_MLFLOW}) no está en ejecución. Lance 'mlflow ui' desde el terminal."

In [4]:
# Same tracking server and experiment as the training run.
mlflow.set_tracking_uri(SERVIDOR_MLFLOW)
mlflow.autolog()
mlflow.set_experiment("02 ENTRENAMIENTO Question-Answering")

# Split used for the final evaluation.
split_test = 'test'
test_dataset = load_dataset(os.path.join(DATA_DIR,'Escrituras'),'QA',trust_remote_code=True,split=split_test)
# Paste here the directory name of the last fine-tuned model. MODELS_DIR is
# joined only once; joining it twice produced a '<MODELS_DIR>/<MODELS_DIR>/...' path.
ruta_modelo_ajustado = os.path.join(MODELS_DIR,'20240219-0205_escrituras_QA_2-epoch_6024-steps')

with mlflow.start_run(run_name="VALIDACIÓN",description="Validación del modelo QA ajustado"):
    evaluar_metricas_QA(ruta_modelo_ajustado, test_dataset)
    

# for ruta in tqdm([r for r in os.listdir(MODELS_DIR) if os.path.isdir(MODELS_DIR+r)]):
#     if "escrituras_QA" in ruta:
#         ruta_modelo_ajustado = os.path.join(MODELS_DIR,ruta)
#         with mlflow.start_run(run_name="VALIDACIÓN",description="Validación del modelo ajustado"):
#             evaluar_metricas_QA(ruta_modelo_ajustado, test_dataset)            

2024/02/19 02:16:05 INFO mlflow.tracking.fluent: Autologging successfully enabled for transformers.
2024/02/19 02:16:06 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.


..\Models\..\Models\20240219-0205_escrituras_QA_2-epoch_6024-steps
	f1: 0.9691215891561086
	exact: 0.9676048858204992
