In [None]:
from datasets import load_dataset
from custom_evaluate import get_raw_scores_by_prediction, compute_exact, compute_f1
import evaluate
from statistics import mean
import mlflow
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, Trainer, TrainingArguments, \
    pipeline, DataCollatorWithPadding
import requests
import torch
import numpy as np
import os, shutil
import datetime
import pprint
# Pretty-printer used to display nested data readably (lines wrapped at 150 columns)
pp = pprint.PrettyPrinter(width=150)

In [None]:
# GLOBAL SETTINGS
# Maximum number of examples to keep per split (for quick test runs); None uses everything.
train_max = None
training_output_dir = "../training/QA"
# Hyper-parameters. This dict is what gets logged to MLflow, and every entry is also
# published below as a module-level variable with the same name.
ml_params = {
    'num_epochs': 2,
    # 'batch_size': 8,
    'lr': 1e-5,
    # eval_steps / save_steps are floats below 1 and are handed unchanged to TrainingArguments
    # (presumably interpreted there as a fraction of the total training steps - confirm in its docs).
    'eval_steps': 0.05,
    'save_steps': 0.05,
    'eval_batch_size': 128,
    'model_name': 'PlanTL-GOB-ES/roberta-large-bne-sqac'
}
# Placeholder declarations: every key of ml_params must be declared here so that the
# names are visible in the notebook before the loop overwrites them.
num_epochs = lr = eval_steps = save_steps = eval_batch_size = model_name = 0
for key, value in ml_params.items():
    # Membership test instead of globals()[key]: a key without a declaration must fail
    # with this assertion message, not with a bare KeyError raised while evaluating it.
    assert key in globals(), f'La variable global {key} debe estar definida'
    globals()[key] = value

In [None]:
SERVIDOR_MLFLOW = 'http://localhost:5000'


def mlflow_en_ejecucion(url, timeout=5):
    """Tell whether an MLflow tracking server answers at ``url``.

    The check is done up front because starting a run against a server that is
    not there only fails after a long delay.

    Args:
        url: Base URL of the tracking server.
        timeout: Seconds to wait for the connection/response. Without it the
            request could block indefinitely on a host that accepts the
            connection but never answers.

    Returns:
        True if the GET request returns HTTP 200; False if the connection cannot
        be established, the request times out, or another status is returned.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except (requests.exceptions.ConnectionError, requests.exceptions.Timeout):
        # No connection or no answer in time: treat the server as not running.
        return False
    # A running server is expected to answer the root URL with HTTP 200.
    return response.status_code == 200


assert mlflow_en_ejecucion(SERVIDOR_MLFLOW), f"El servidor MLflow ({SERVIDOR_MLFLOW}) no está en ejecución. Lance 'mlflow ui' desde el terminal."

In [None]:
# Tracking server
# Point the MLflow client at the server checked in the previous cell, enable automatic
# logging and select the experiment under which the runs of this notebook are recorded.
mlflow.set_tracking_uri(SERVIDOR_MLFLOW)
mlflow.autolog()
mlflow.set_experiment("ENTRENAMIENTO Question-Answering")

In [None]:
# Load the 'QA' configuration of the local Escrituras dataset and pick the two splits
# used for fine-tuning.
main_dataset = load_dataset('../Dataset/Escrituras', 'QA', trust_remote_code=True)
train_dataset = main_dataset['train']
val_dataset = main_dataset['validation']
if train_max:
    # Test mode: keep only the first `train_max` examples of each split. The limit is
    # clamped to the split size because selecting indices past the end of a split
    # (typically the smaller validation split) raises an error.
    train_dataset = train_dataset.select(range(min(train_max, len(train_dataset))))
    val_dataset = val_dataset.select(range(min(train_max, len(val_dataset))))

In [None]:
# Load the tokenizer that belongs to the base checkpoint named in ml_params.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Display whether this is a "fast" tokenizer. The preprocessing function below asks for
# offset mappings and sequence ids - presumably only fast tokenizers provide them; confirm.
tokenizer.is_fast

In [None]:
## Adapted from the Hugging Face tutorial on Question-Answering
def f_preproceso(examples):
    """Tokenize a batch of question/context pairs and attach answer-span labels.

    Args:
        examples: Batch dict with parallel lists under "question", "context" and
            "answers". Each "answers" entry holds the lists "answer_start"
            (character offsets) and "text"; only their first element is used.

    Returns:
        The tokenizer output without "offset_mapping", extended with
        "start_positions", "end_positions" and "labels" (one [start, end] pair per
        example), all expressed as token indices. An example is labelled (0, 0)
        when it has no answer or when the answer is not completely contained in
        the context tokens that survived truncation.
    """
    questions = [q.strip() for q in examples["question"]]
    # NOTE(review): `stride` is passed but overflowing tokens are not requested, so
    # this code assumes exactly one feature per example (answers are indexed by the
    # feature index below) - confirm whether long contexts were meant to be split.
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=512,
        stride=128,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]
        # Examples without an annotated answer cannot be located: label them (0, 0).
        if len(answer["answer_start"]) == 0 or len(answer["text"]) == 0:
            start_positions.append(0)
            end_positions.append(0)
            continue

        start_char = answer["answer_start"][0]
        end_char = start_char + len(answer["text"][0])
        # sequence_ids tells which tokens belong to the question and which to the context
        sequence_ids = inputs.sequence_ids(i)

        # Locate the first and last context token (sequence id 1).
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while idx < len(sequence_ids) and sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not entirely inside the context, label it (0, 0). Both ends
        # are checked: an answer that starts inside the context but was cut by
        # truncation would otherwise get an end label on the token after the context.
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start, end = 0, 0
        else:
            # Otherwise walk inwards from both ends of the context to the tokens
            # that contain the first and the last character of the answer.
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start = idx - 1

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end = idx + 1

        start_positions.append(start)
        end_positions.append(end)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    # Same information as the two columns above, kept as [start, end] pairs.
    inputs["labels"] = [[s, e] for s, e in zip(start_positions, end_positions)]
    return inputs

In [None]:
# Tokenize and label both splits in batches. The original columns are removed so that
# only the fields returned by f_preproceso remain in the tokenized datasets.
train_tokenized = train_dataset.map(f_preproceso,batched=True, remove_columns=train_dataset.column_names)
eval_tokenized = val_dataset.map(f_preproceso,batched=True, remove_columns=val_dataset.column_names)

In [None]:
# Show the metadata of the tokenized training set.
pp.pprint(train_tokenized.info)

In [None]:
# Load the question-answering checkpoint named in ml_params; this is the model that gets fine-tuned.
model  = AutoModelForQuestionAnswering.from_pretrained(model_name)

# metric = evaluate.load('squad_v2')
     
def compute_metrics(eval_pred):
    """Score predicted answer spans against the gold spans at text level.

    The predicted start/end token is the arg-max along axis 1 of
    ``eval_pred.predictions[0]`` / ``eval_pred.predictions[1]``; the gold start/end
    tokens are ``eval_pred.label_ids[0]`` / ``eval_pred.label_ids[1]``. Both spans
    are cut out of ``eval_pred.inputs`` (end token included), decoded with the
    notebook's tokenizer and stripped before being compared.

    Returns:
        Dict with the mean of ``compute_f1`` under 'f1_score' and the mean of
        ``compute_exact`` under 'exact_score'.
    """
    def _span_texts(first_tokens, last_tokens):
        # Decode, for every example, the tokens between first and last (inclusive).
        return [
            tokenizer.decode(token_ids[first:last + 1]).strip()
            for token_ids, first, last in zip(eval_pred.inputs, first_tokens, last_tokens)
        ]

    predicted_starts = np.argmax(eval_pred.predictions[0], axis=1)
    predicted_ends = np.argmax(eval_pred.predictions[1], axis=1)
    gold_starts = eval_pred.label_ids[0]
    gold_ends = eval_pred.label_ids[1]

    predicted_texts = _span_texts(predicted_starts, predicted_ends)
    gold_texts = _span_texts(gold_starts, gold_ends)

    f1_values = []
    for gold, predicted in zip(gold_texts, predicted_texts):
        f1_values.append(compute_f1(gold, predicted))
    exact_values = []
    for gold, predicted in zip(gold_texts, predicted_texts):
        exact_values.append(compute_exact(gold, predicted))

    scores = {}
    scores['f1_score'] = np.mean(f1_values)
    scores['exact_score'] = np.mean(exact_values)
    return scores

In [None]:
# Delete the training directory if it exists
# if os.path.exists(training_output_dir):
#     shutil.rmtree(training_output_dir)

# Training configuration. The values come from the global settings cell (ml_params).
training_arg = TrainingArguments(
    output_dir=training_output_dir,
    overwrite_output_dir=True,
    do_train=True,
    learning_rate=lr,
    warmup_ratio=0.2,
    # Evaluation, checkpointing and logging are all step-based and driven by the
    # eval_steps / save_steps values of ml_params (both 0.05 there).
    evaluation_strategy='steps',
    eval_steps=eval_steps,
    save_strategy='steps',
    save_steps=save_steps,
    load_best_model_at_end=True,
    # 'f1_score' is one of the keys returned by compute_metrics.
    metric_for_best_model='f1_score',
    logging_steps=eval_steps,
    # per_device_train_batch_size=batch_size,
    # The training batch size is not fixed here; auto_find_batch_size is enabled instead.
    auto_find_batch_size=True,
    per_device_eval_batch_size=eval_batch_size,
    num_train_epochs=num_epochs,
    # compute_metrics reads eval_pred.inputs, so the inputs are requested here.
    include_inputs_for_metrics=True
)

# Trainer wiring: model, arguments, the two tokenized splits, the tokenizer and the
# metric function defined above.
trainer = Trainer(
    model=model,
    args=training_arg,
    train_dataset=train_tokenized,
    eval_dataset=eval_tokenized,    
    tokenizer=tokenizer,    
    compute_metrics=compute_metrics
)

In [None]:
# Fine-tune inside an MLflow run. The run is named after the mode: a test run limited to
# `train_max` examples, or a full training.
with mlflow.start_run(run_name=f"{'Prueba con ' + str(train_max) if train_max else 'Entrenamiento'}"):
    trainer.train()
    # The batch size is chosen at run time (auto_find_batch_size), so it is not part of
    # ml_params: read the value the trainer ended up using and record it with the run.
    batch_size = trainer._train_batch_size
    mlflow.log_param('batch_size', batch_size)
    for param_name, param_value in ml_params.items():
        mlflow.log_param(param_name, param_value)

In [None]:
# Quick sanity check of the fine-tuned model: wrap it in a question-answering pipeline
# (on GPU when available) and ask one hand-written question.
tuned_pipeline = pipeline(
    task="question-answering",
    model=trainer.model,
    # batch_size=64,
    tokenizer=tokenizer,
    device='cuda' if torch.cuda.is_available() else 'cpu'
)
# Hand-written example; the context names the notary the question asks about.
comprobando = ({'question': "¿qué notario ha firmado el documento?", 'context': "DOS MIL TREINTA. En mi residencia, a quince de abril de dos mil quince. Ante mí, Paquito de los Palotes, notario del ilustre colegio de la Palmilla COMPARECEN Manolito y Jacinta para firmar la siguiente escritura de HERENCIA y para lo cual se sientan cómodamente."})

# Last expression of the cell: the pipeline's answer is shown as the cell output.
tuned_pipeline(comprobando)

In [None]:
# Save the model
# The target directory name is built from the current date/time, the configured number
# of epochs and the trainer's global step counter.
n_epochs = trainer.args.num_train_epochs
g_steps = trainer.state.global_step
fecha_hora = datetime.datetime.now().strftime("%Y%m%d-%H%M")
ruta_modelo_ajustado = f"../Models/{fecha_hora}_escrituras_QA_{n_epochs}-epoch_{g_steps}-steps"
trainer.save_model(ruta_modelo_ajustado)
# The tokenizer is saved into the same directory; the validation cell loads both from it.
tokenizer.save_pretrained(ruta_modelo_ajustado)

## Evaluación del modelo generado
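Antes de la evaluación hay que reiniciar el kernel (Restart) en VS Code, porque de lo contrario el sistema se queda sin memoria. Tras el reinicio, vuelva a ejecutar las celdas previas que definen los nombres usados en la evaluación (importaciones, `ml_params`, `SERVIDOR_MLFLOW`, etc.) antes de lanzar la celda siguiente.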

In [None]:
# Validation of a previously saved fine-tuned model, recorded as its own MLflow run.
mlflow.set_tracking_uri(SERVIDOR_MLFLOW)
mlflow.autolog()
mlflow.set_experiment("ENTRENAMIENTO Question-Answering")

split_test = 'validation'
# The split is loaded directly instead of being read from `main_dataset`: this cell is
# meant to run after a kernel restart, when the dataset loaded for training is gone.
test_dataset = load_dataset('../Dataset/Escrituras', 'QA', trust_remote_code=True, split=split_test)
# Checkpoint to evaluate. It is set by hand because the path produced by the saving
# cell does not survive the restart.
ruta_modelo_ajustado = "../Models/20240213-1643_escrituras_QA_1-epoch_3012-steps"
modelo_ajustado = AutoModelForQuestionAnswering.from_pretrained(ruta_modelo_ajustado)
tokenizer_ajustado = AutoTokenizer.from_pretrained(ruta_modelo_ajustado)
val_batch_size = 512

with mlflow.start_run(run_name="VALIDACIÓN",description="Validación del modelo ajustado"):
    # Pipeline and input set: one {'question', 'context'} dict per example of the split.
    qc_dataset_test = [{'question':q, 'context':c} for q,c in zip(test_dataset['question'],test_dataset['context'])]
    consulta_qc = pipeline("question-answering", model=modelo_ajustado, tokenizer=tokenizer_ajustado,
                    device=0 if torch.cuda.is_available() else None, batch_size=val_batch_size)
    # Run the predictions and compute the per-example scores with the project helper;
    # both results are mappings whose values are averaged below.
    predicciones = consulta_qc(qc_dataset_test)
    exact_scores, f1_scores  = get_raw_scores_by_prediction(test_dataset,predicciones)
    f1_mean = mean(f1_scores.values())
    exact_mean = mean(exact_scores.values())

    for param_name, param_value in ml_params.items():
        mlflow.log_param(param_name, param_value)
    mlflow.log_param('split', split_test)
    # ml_params only names the base checkpoint, so record which fine-tuned model was
    # evaluated; otherwise the run cannot be traced back to it.
    mlflow.log_param('model_path', ruta_modelo_ajustado)
    mlflow.log_metric('f1', f1_mean)
    mlflow.log_metric('exact', exact_mean)
    print(len(f1_scores), 'f1:', f1_mean)
    print(len(exact_scores), 'exact:', exact_mean)