In [1]:
from datasets import load_dataset
from custom_evaluate import get_raw_scores_by_prediction, compute_exact, compute_f1
import evaluate
from statistics import mean
import mlflow
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, Trainer, TrainingArguments, \
    pipeline, DataCollatorWithPadding
import requests
import torch
import numpy as np
import os, shutil
import datetime
import pprint
# Pretty-printer for readable output of nested data; lines are wrapped at 150 columns.
pp = pprint.PrettyPrinter(width=150)

In [2]:
# GLOBAL SETTINGS
# Maximum number of examples kept per split (for quick trial runs); None means a full run.
train_max = None
# Folder handed to the trainer as its output directory.
training_output_dir = "../training/QA"
# Hyperparameters of the run. They are also logged to MLflow further down the notebook.
ml_params = {
    'num_epochs': 2,
    # 'batch_size': 8,
    'lr': 1e-5,
    # NOTE(review): values below 1 are presumably read by TrainingArguments as a
    # fraction of the total number of training steps -- confirm.
    'eval_steps': 0.05,
    'save_steps': 0.05,
    'eval_batch_size': 128,
    'model_name': 'PlanTL-GOB-ES/roberta-large-bne-sqac'
}
# Every hyperparameter that is exposed as a module-level global must be declared here
# first; the loop below refuses to create globals that were not declared.
num_epochs = lr = eval_steps = save_steps = eval_batch_size = model_name = 0
for key, value in ml_params.items():
    # Membership test instead of `globals()[key]`: an undeclared key must trigger this
    # assertion message rather than a bare KeyError.
    assert key in globals(), f'La variable global {key} debe estar definida'
    globals()[key] = value

In [3]:
# Base URL of the MLflow tracking server used by this notebook.
SERVIDOR_MLFLOW = 'http://localhost:5000'
# Check up front whether the MLflow server is running; otherwise later MLflow calls
# stall for a long time and end in an error.
def mlflow_en_ejecucion(url, timeout=5):
    """Tell whether an HTTP GET to ``url`` is answered with status 200.

    Args:
        url: Address to probe (the MLflow tracking server).
        timeout: Seconds to wait for the server before giving up, so that an
            unresponsive host cannot block the notebook indefinitely.

    Returns:
        True if the server replied with HTTP 200; False if it replied with any
        other status, refused the connection, or did not answer within ``timeout``.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except (requests.exceptions.ConnectionError, requests.exceptions.Timeout):
        # No connection could be established (or no reply arrived in time):
        # treat the server as not running.
        return False
    # A running server is expected to answer with HTTP 200.
    return response.status_code == 200
    
# Stop the notebook right away, with instructions, when the tracking server does not answer.
assert mlflow_en_ejecucion(SERVIDOR_MLFLOW), f"El servidor MLflow ({SERVIDOR_MLFLOW}) no está en ejecución. Lance 'mlflow ui' desde el terminal."

In [4]:
# Tracking server
# Point MLflow at the tracking server, enable autologging and select the experiment
# under which the runs of this notebook are recorded.
mlflow.set_tracking_uri(SERVIDOR_MLFLOW)
mlflow.autolog()
mlflow.set_experiment("ENTRENAMIENTO Question-Answering")

2024/02/13 20:48:57 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2024/02/13 20:48:57 INFO mlflow.tracking.fluent: Autologging successfully enabled for transformers.


<Experiment: artifact_location='mlflow-artifacts:/423598931169215837', creation_time=1707602045463, experiment_id='423598931169215837', last_update_time=1707602045463, lifecycle_stage='active', name='ENTRENAMIENTO Question-Answering', tags={}>

In [5]:
# Load the 'QA' configuration of the local dataset script and pick the two splits
# used for fine-tuning.
main_dataset = load_dataset('../Dataset/Escrituras', 'QA', trust_remote_code=True)
train_dataset = main_dataset['train']
val_dataset = main_dataset['validation']
if train_max:
    # Trial run: keep only the first `train_max` rows of each split. The cap is
    # clamped to the split size so that a split smaller than `train_max`
    # (typically validation) does not make `select` fail on out-of-range indices.
    train_dataset = train_dataset.select(range(min(train_max, len(train_dataset))))
    val_dataset = val_dataset.select(range(min(train_max, len(val_dataset))))

In [6]:
# Load the tokenizer that belongs to the checkpoint named by `model_name`.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Show whether a "fast" tokenizer was loaded. f_preproceso requests offset mappings and
# calls sequence_ids() -- presumably only available on fast tokenizers; confirm.
tokenizer.is_fast

True

In [7]:
## Adapted from the Hugging Face tutorial on Question-Answering
def f_preproceso(examples):
    """Tokenize a batch of QA examples and derive token-level answer labels.

    Each question/context pair becomes exactly one feature, padded or truncated
    to 512 tokens (only the context is truncated). The character span of the
    first answer of every example is converted into start/end token indices.

    Args:
        examples: Batch with the columns "question" and "context" (lists of
            strings) and "answers" (list of dicts holding the lists
            "answer_start" and "text"), as consumed below.

    Returns:
        The tokenizer output (without "offset_mapping") extended with
        "start_positions", "end_positions" and "labels" (the same indices as
        [start, end] pairs). An example is labelled (0, 0) when it has no
        answer or when its answer is not completely contained in the
        (possibly truncated) context.
    """
    questions = [q.strip() for q in examples["question"]]
    # NOTE(review): `stride` presumably only has an effect together with
    # return_overflowing_tokens=True, which is not requested here -- confirm.
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=512,
        stride=128,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]
        # sequence_ids tells which tokens belong to the question and which to the context
        sequence_ids = inputs.sequence_ids(i)

        # Locate the first and the last context token
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while idx < len(sequence_ids) and sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # Default label; kept when there is no answer or it is not fully inside the context
        start_token = 0
        end_token = 0

        if answer["answer_start"] and answer["text"]:
            start_char = answer["answer_start"][0]
            end_char = start_char + len(answer["text"][0])

            # The answer must lie completely inside the context. An answer that is only
            # partially present (cut off by truncation) would otherwise receive an end
            # index one past the last context token, i.e. on a special token.
            answer_in_context = (
                offset[context_start][0] <= start_char
                and offset[context_end][1] >= end_char
            )
            if answer_in_context:
                # Last context token that begins at or before the answer start
                idx = context_start
                while idx <= context_end and offset[idx][0] <= start_char:
                    idx += 1
                start_token = idx - 1

                # First context token (scanning backwards) that ends at or after the answer end
                idx = context_end
                while idx >= context_start and offset[idx][1] >= end_char:
                    idx -= 1
                end_token = idx + 1

        start_positions.append(start_token)
        end_positions.append(end_token)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    inputs["labels"] = [[start, end] for start, end in zip(start_positions, end_positions)]
    return inputs

In [8]:
# Tokenize both splits batch-wise; the original columns are dropped so that only the
# fields produced by f_preproceso remain.
train_tokenized = train_dataset.map(
    function=f_preproceso,
    batched=True,
    remove_columns=train_dataset.column_names,
)
eval_tokenized = val_dataset.map(
    function=f_preproceso,
    batched=True,
    remove_columns=val_dataset.column_names,
)

In [9]:
# Pretty-print the DatasetInfo of the tokenized training split (features, splits, ...).
pp.pprint(train_tokenized.info)

DatasetInfo(description='',
            citation='',
            homepage='',
            license='',
            features={'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None),
                      'end_positions': Value(dtype='int64', id=None),
                      'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
                      'labels': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None),
                      'start_positions': Value(dtype='int64', id=None)},
            post_processed=None,
            supervised_keys=None,
            task_templates=None,
            builder_name='escrituras',
            dataset_name='escrituras',
            config_name='QA',
            version=0.0.0,
            splits={'test': SplitInfo(name='test', num_bytes=11287326, num_examples=7532, shard_lengths=None, dataset_name='escrituras'),
                    'train': SplitInfo(name='train', num_bytes=358776

In [10]:
# Load the checkpoint named by `model_name` as a question-answering model.
model  = AutoModelForQuestionAnswering.from_pretrained(model_name)

# metric = evaluate.load('squad_v2')
     
def compute_metrics(eval_pred):
    """Score predicted answer spans against the labelled spans.

    For every evaluated example the predicted span runs from the arg-max of the
    first prediction array to the arg-max of the second one; the reference span
    comes from the two label arrays. Both spans are decoded from the example's
    input tokens and compared with ``compute_f1`` and ``compute_exact``.

    Args:
        eval_pred: Object exposing ``predictions`` (indexable; items 0 and 1 are
            used), ``label_ids`` (indexable; items 0 and 1 are used) and
            ``inputs`` (one token sequence per example).

    Returns:
        dict with the mean F1 under 'f1_score' and the mean exact match under
        'exact_score'.
    """
    def _decode_spans(first_tokens, last_tokens):
        # Decode the inclusive token range [first, last] of every example.
        return [
            tokenizer.decode(token_ids[first:last + 1]).strip()
            for token_ids, first, last in zip(eval_pred.inputs, first_tokens, last_tokens)
        ]

    predicted_first = np.argmax(eval_pred.predictions[0], axis=1)
    predicted_last = np.argmax(eval_pred.predictions[1], axis=1)
    predicted_text = _decode_spans(predicted_first, predicted_last)

    reference_text = _decode_spans(eval_pred.label_ids[0], eval_pred.label_ids[1])

    f1_values = []
    for reference, predicted in zip(reference_text, predicted_text):
        f1_values.append(compute_f1(reference, predicted))

    exact_values = []
    for reference, predicted in zip(reference_text, predicted_text):
        exact_values.append(compute_exact(reference, predicted))

    return {'f1_score': np.mean(f1_values), 'exact_score': np.mean(exact_values)}

In [11]:
# Borrar el directorio de entrenamiento si existe
# if os.path.exists(training_output_dir):
#     shutil.rmtree(training_output_dir)

training_arg = TrainingArguments(
    # Output location; an existing directory is overwritten.
    output_dir=training_output_dir,
    overwrite_output_dir=True,
    do_train=True,
    # Optimisation schedule.
    num_train_epochs=num_epochs,
    learning_rate=lr,
    warmup_ratio=0.2,
    # Batch sizes: the training batch size is searched automatically, the evaluation one is fixed.
    # per_device_train_batch_size=batch_size,
    auto_find_batch_size=True,
    per_device_eval_batch_size=eval_batch_size,
    # Evaluate, log and checkpoint by steps; logging shares the evaluation interval.
    evaluation_strategy='steps',
    eval_steps=eval_steps,
    logging_steps=eval_steps,
    save_strategy='steps',
    save_steps=save_steps,
    # Restore the checkpoint with the best 'f1_score' (the key returned by compute_metrics).
    load_best_model_at_end=True,
    metric_for_best_model='f1_score',
    # compute_metrics decodes answer text from the input tokens, so it needs the inputs.
    include_inputs_for_metrics=True,
)

trainer = Trainer(
    args=training_arg,
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_tokenized,
    eval_dataset=eval_tokenized,
    compute_metrics=compute_metrics,
)

In [12]:
# Run the fine-tuning inside an MLflow run named after the kind of execution
# (trial run limited to `train_max` examples, or full training).
with mlflow.start_run(run_name=f"Prueba con {train_max}" if train_max else "Entrenamiento"):
    # Log the hyperparameters before training starts, so that a run that crashes or is
    # interrupted half-way is still recorded together with its configuration.
    mlflow.log_params(ml_params)
    trainer.train()
    # The training batch size is only known after training because it is searched
    # automatically (auto_find_batch_size=True); read it back from the trainer.
    batch_size = trainer._train_batch_size
    mlflow.log_param('batch_size', batch_size)

  0%|          | 0/6024 [00:00<?, ?it/s]

{'loss': 1.0487, 'learning_rate': 2.506224066390042e-06, 'epoch': 0.1}


  0%|          | 0/48 [00:00<?, ?it/s]

{'eval_loss': 0.17089803516864777, 'eval_f1_score': 0.9661937282827577, 'eval_exact_score': 0.9570338420703384, 'eval_runtime': 142.373, 'eval_samples_per_second': 42.339, 'eval_steps_per_second': 0.337, 'epoch': 0.1}
{'loss': 0.1365, 'learning_rate': 5.012448132780084e-06, 'epoch': 0.2}


  0%|          | 0/48 [00:00<?, ?it/s]

{'eval_loss': 0.10775606334209442, 'eval_f1_score': 0.9789161994247927, 'eval_exact_score': 0.9757796947577969, 'eval_runtime': 142.629, 'eval_samples_per_second': 42.263, 'eval_steps_per_second': 0.337, 'epoch': 0.2}
{'loss': 0.1005, 'learning_rate': 7.518672199170125e-06, 'epoch': 0.3}


  0%|          | 0/48 [00:00<?, ?it/s]

{'eval_loss': 0.07987275719642639, 'eval_f1_score': 0.9862370092853056, 'eval_exact_score': 0.9820836098208361, 'eval_runtime': 142.508, 'eval_samples_per_second': 42.299, 'eval_steps_per_second': 0.337, 'epoch': 0.3}
{'loss': 0.1176, 'learning_rate': 9.993774642041917e-06, 'epoch': 0.4}


  0%|          | 0/48 [00:00<?, ?it/s]

{'eval_loss': 0.09616925567388535, 'eval_f1_score': 0.9857208424420072, 'eval_exact_score': 0.9820836098208361, 'eval_runtime': 142.658, 'eval_samples_per_second': 42.255, 'eval_steps_per_second': 0.336, 'epoch': 0.4}
{'loss': 0.1049, 'learning_rate': 9.367088607594937e-06, 'epoch': 0.5}


  0%|          | 0/48 [00:00<?, ?it/s]

{'eval_loss': 0.06462197750806808, 'eval_f1_score': 0.9848949190882225, 'eval_exact_score': 0.9819177173191772, 'eval_runtime': 142.539, 'eval_samples_per_second': 42.29, 'eval_steps_per_second': 0.337, 'epoch': 0.5}
{'loss': 0.099, 'learning_rate': 8.740402573147956e-06, 'epoch': 0.6}


  0%|          | 0/48 [00:00<?, ?it/s]

{'eval_loss': 0.06479574739933014, 'eval_f1_score': 0.9877687719718143, 'eval_exact_score': 0.9829130723291307, 'eval_runtime': 142.787, 'eval_samples_per_second': 42.217, 'eval_steps_per_second': 0.336, 'epoch': 0.6}
{'loss': 0.0819, 'learning_rate': 8.113716538700976e-06, 'epoch': 0.7}


  0%|          | 0/48 [00:00<?, ?it/s]

{'eval_loss': 0.06312631070613861, 'eval_f1_score': 0.9877083787438665, 'eval_exact_score': 0.9854014598540146, 'eval_runtime': 142.749, 'eval_samples_per_second': 42.228, 'eval_steps_per_second': 0.336, 'epoch': 0.7}
{'loss': 0.0848, 'learning_rate': 7.487030504253995e-06, 'epoch': 0.8}


  0%|          | 0/48 [00:00<?, ?it/s]

{'eval_loss': 0.06419671326875687, 'eval_f1_score': 0.9891364647771282, 'eval_exact_score': 0.9873921698739216, 'eval_runtime': 142.651, 'eval_samples_per_second': 42.257, 'eval_steps_per_second': 0.336, 'epoch': 0.8}
{'loss': 0.0771, 'learning_rate': 6.8603444698070145e-06, 'epoch': 0.9}


  0%|          | 0/48 [00:00<?, ?it/s]

{'eval_loss': 0.04588242992758751, 'eval_f1_score': 0.9909040588348684, 'eval_exact_score': 0.9890510948905109, 'eval_runtime': 149.828, 'eval_samples_per_second': 40.233, 'eval_steps_per_second': 0.32, 'epoch': 0.9}
{'loss': 0.0547, 'learning_rate': 6.233658435360034e-06, 'epoch': 1.0}


  0%|          | 0/48 [00:00<?, ?it/s]

{'eval_loss': 0.04843584820628166, 'eval_f1_score': 0.9916750041920095, 'eval_exact_score': 0.9892169873921699, 'eval_runtime': 147.957, 'eval_samples_per_second': 40.742, 'eval_steps_per_second': 0.324, 'epoch': 1.0}
{'loss': 0.0295, 'learning_rate': 5.606972400913053e-06, 'epoch': 1.1}


  0%|          | 0/48 [00:00<?, ?it/s]

{'eval_loss': 0.05349377542734146, 'eval_f1_score': 0.9917925737556933, 'eval_exact_score': 0.9900464499004645, 'eval_runtime': 149.332, 'eval_samples_per_second': 40.366, 'eval_steps_per_second': 0.321, 'epoch': 1.1}
{'loss': 0.0229, 'learning_rate': 4.9802863664660725e-06, 'epoch': 1.2}


  0%|          | 0/48 [00:00<?, ?it/s]

{'eval_loss': 0.04320317134261131, 'eval_f1_score': 0.9933081898457763, 'eval_exact_score': 0.9915394824153948, 'eval_runtime': 149.41, 'eval_samples_per_second': 40.345, 'eval_steps_per_second': 0.321, 'epoch': 1.2}
{'loss': 0.0296, 'learning_rate': 4.353600332019091e-06, 'epoch': 1.3}


  0%|          | 0/48 [00:00<?, ?it/s]

{'eval_loss': 0.03293217346072197, 'eval_f1_score': 0.9945102748022455, 'eval_exact_score': 0.9936960849369608, 'eval_runtime': 149.913, 'eval_samples_per_second': 40.21, 'eval_steps_per_second': 0.32, 'epoch': 1.3}
{'loss': 0.0302, 'learning_rate': 3.7269142975721105e-06, 'epoch': 1.4}


  0%|          | 0/48 [00:00<?, ?it/s]

{'eval_loss': 0.04069961607456207, 'eval_f1_score': 0.9939959664700349, 'eval_exact_score': 0.9928666224286662, 'eval_runtime': 148.924, 'eval_samples_per_second': 40.477, 'eval_steps_per_second': 0.322, 'epoch': 1.4}
{'loss': 0.0377, 'learning_rate': 3.10022826312513e-06, 'epoch': 1.5}


  0%|          | 0/48 [00:00<?, ?it/s]

{'eval_loss': 0.028856083750724792, 'eval_f1_score': 0.993830073968376, 'eval_exact_score': 0.9927007299270073, 'eval_runtime': 147.677, 'eval_samples_per_second': 40.819, 'eval_steps_per_second': 0.325, 'epoch': 1.5}
{'loss': 0.0208, 'learning_rate': 2.473542228678149e-06, 'epoch': 1.6}


  0%|          | 0/48 [00:00<?, ?it/s]

{'eval_loss': 0.0328320674598217, 'eval_f1_score': 0.992823493822118, 'eval_exact_score': 0.9915394824153948, 'eval_runtime': 147.252, 'eval_samples_per_second': 40.937, 'eval_steps_per_second': 0.326, 'epoch': 1.6}
{'loss': 0.0186, 'learning_rate': 1.8468561942311686e-06, 'epoch': 1.7}


  0%|          | 0/48 [00:00<?, ?it/s]

{'eval_loss': 0.029805831611156464, 'eval_f1_score': 0.9955708392857837, 'eval_exact_score': 0.9945255474452555, 'eval_runtime': 147.997, 'eval_samples_per_second': 40.731, 'eval_steps_per_second': 0.324, 'epoch': 1.7}
{'loss': 0.032, 'learning_rate': 1.2201701597841876e-06, 'epoch': 1.8}


  0%|          | 0/48 [00:00<?, ?it/s]

{'eval_loss': 0.02788090892136097, 'eval_f1_score': 0.9956876943812918, 'eval_exact_score': 0.9945255474452555, 'eval_runtime': 147.509, 'eval_samples_per_second': 40.865, 'eval_steps_per_second': 0.325, 'epoch': 1.8}
{'loss': 0.0241, 'learning_rate': 5.934841253372069e-07, 'epoch': 1.91}


  0%|          | 0/48 [00:00<?, ?it/s]

{'eval_loss': 0.021794382482767105, 'eval_f1_score': 0.9957652792213685, 'eval_exact_score': 0.9946914399469144, 'eval_runtime': 148.116, 'eval_samples_per_second': 40.698, 'eval_steps_per_second': 0.324, 'epoch': 1.91}
{'train_runtime': 6428.2526, 'train_samples_per_second': 7.497, 'train_steps_per_second': 0.937, 'train_loss': 0.10857136078564769, 'epoch': 2.0}


In [13]:
# Wrap the freshly trained model in a question-answering pipeline, on GPU when available.
tuned_pipeline = pipeline(
    task="question-answering",
    model=trainer.model,
    tokenizer=tokenizer,
    # batch_size=64,
    device='cuda' if torch.cuda.is_available() else 'cpu',
)

# Hand-written sanity check: one question about a short deed-like context.
comprobando = {
    'question': "¿qué notario ha firmado el documento?",
    'context': "DOS MIL TREINTA. En mi residencia, a quince de abril de dos mil quince. Ante mí, Paquito de los Palotes, notario del ilustre colegio de la Palmilla COMPARECEN Manolito y Jacinta para firmar la siguiente escritura de HERENCIA y para lo cual se sientan cómodamente.",
}

tuned_pipeline(comprobando)

{'score': 0.992911696434021,
 'start': 81,
 'end': 103,
 'answer': 'Paquito de los Palotes'}

In [14]:
# Save the model
# The target folder name combines the current date/time, the configured number of
# epochs and the trainer's final global step.
n_epochs = trainer.args.num_train_epochs
g_steps = trainer.state.global_step
fecha_hora = datetime.datetime.now().strftime("%Y%m%d-%H%M")
ruta_modelo_ajustado = f"../Models/{fecha_hora}_escrituras_QA_{n_epochs}-epoch_{g_steps}-steps"
# Store the model and the tokenizer in the same folder, so that both can later be
# loaded from that single path.
trainer.save_model(ruta_modelo_ajustado)
tokenizer.save_pretrained(ruta_modelo_ajustado)

('../Models/20240213-2010_escrituras_QA_2-epoch_6024-steps\\tokenizer_config.json',
 '../Models/20240213-2010_escrituras_QA_2-epoch_6024-steps\\special_tokens_map.json',
 '../Models/20240213-2010_escrituras_QA_2-epoch_6024-steps\\vocab.json',
 '../Models/20240213-2010_escrituras_QA_2-epoch_6024-steps\\merges.txt',
 '../Models/20240213-2010_escrituras_QA_2-epoch_6024-steps\\added_tokens.json',
 '../Models/20240213-2010_escrituras_QA_2-epoch_6024-steps\\tokenizer.json')

## Evaluación del modelo generado
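Antes de la evaluación hay que reiniciar el kernel (Restart del entorno en VS Code), porque tras el entrenamiento el sistema se queda sin memoria. Después del reinicio hay que volver a ejecutar las celdas de importaciones, variables globales, comprobación del servidor MLflow y carga del dataset, ya que la celda siguiente depende de ellas.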

In [7]:
# Evaluate a saved fine-tuned model on one dataset split and record the result in MLflow.
# Relies on names created by earlier cells (SERVIDOR_MLFLOW, main_dataset, ml_params and the imports).
mlflow.set_tracking_uri(SERVIDOR_MLFLOW)
mlflow.autolog()
mlflow.set_experiment("ENTRENAMIENTO Question-Answering")

# Split used for the evaluation; it is also logged as the 'split' parameter below.
split_test = 'validation'
test_dataset = main_dataset[split_test] # load_dataset('../Dataset/Escrituras','QA',trust_remote_code=True,split='validation')
# Hard-coded path of the saved model to evaluate; update it after every new training run.
ruta_modelo_ajustado = "../Models/20240213-2010_escrituras_QA_2-epoch_6024-steps"
modelo_ajustado = AutoModelForQuestionAnswering.from_pretrained(ruta_modelo_ajustado)
tokenizer_ajustado = AutoTokenizer.from_pretrained(ruta_modelo_ajustado)
# Batch size handed to the inference pipeline.
val_batch_size = 512

with mlflow.start_run(run_name="VALIDACIÓN",description="Validación del modelo ajustado"):
    # Build the pipeline and the list of question/context pairs to run through it
    qc_dataset_test = [{'question':q, 'context':c} for q,c in zip(test_dataset['question'],test_dataset['context'])]
    consulta_qc = pipeline("question-answering", model=modelo_ajustado, tokenizer=tokenizer_ajustado, 
                    device=0 if torch.cuda.is_available() else None, batch_size=val_batch_size)
    # Run the predictions and compute the metrics
    predicciones = consulta_qc(qc_dataset_test)
    # NOTE(review): get_raw_scores_by_prediction comes from custom_evaluate; it presumably returns
    # two mappings with one score per example (both are consumed through .values()) -- confirm.
    exact_scores, f1_scores  = get_raw_scores_by_prediction(test_dataset,predicciones)
    f1_mean = mean(f1_scores.values())
    exact_mean = mean(exact_scores.values())
    
    # Log the training hyperparameters, the evaluated split and the two mean scores with this run
    for param_name, param_value in ml_params.items():
        mlflow.log_param(param_name, param_value)
    mlflow.log_param('split', split_test)
    mlflow.log_metric('f1', f1_mean)
    mlflow.log_metric('exact', exact_mean)
    # Print the number of scored examples next to each mean
    print(len(f1_scores), 'f1:', f1_mean)
    print(len(exact_scores), 'exact:', exact_mean)

2024/02/13 20:49:14 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2024/02/13 20:49:14 INFO mlflow.tracking.fluent: Autologging successfully enabled for transformers.


OSError: Incorrect path_or_model_id: '../20240213-2010_escrituras_QA_2-epoch_6024-steps'. Please provide either the path to a local folder or the repo_id of a model on the Hub.