In [None]:
# # Copiado del Tutorial para "Fine-Tuning Transformers" de MLflow

# # Disable tokenizers warnings when constructing pipelines
# %env TOKENIZERS_PARALLELISM=false

# import warnings

# # Disable a few less-than-useful UserWarnings from setuptools and pydantic
# warnings.filterwarnings("ignore", category=UserWarning)

In [None]:
from datasets import load_dataset
from custom_evaluate import get_raw_scores_by_prediction, compute_exact, compute_f1
from evaluate import load as load_metric
from statistics import mean
import mlflow
from transformers import (
    AutoTokenizer,
    AutoModelForQuestionAnswering,
    Trainer,
    TrainingArguments,
    pipeline,
)
import requests
import torch
import numpy as np
import time
import pprint
# Pretty-printer (150-column width) for readable output of data structures
pp = pprint.PrettyPrinter(width=150)

In [None]:
# --- Experiment configuration -------------------------------------------------
model_QA = 'PlanTL-GOB-ES/roberta-large-bne-sqac'  # base question-answering checkpoint
train_max = 100  # Maximum number of examples to use (for quick tests)
num_epochs = 5
batch_size = 8  # per-device training batch size
lr = 1e-5
eval_steps = 10
eval_batch_size = 32
# Forward slashes are accepted on Windows as well as Linux/macOS; the previous
# backslash-only form produced a directory literally named "..\training\QA" on
# POSIX systems and disagreed with the "../..." paths used elsewhere in the notebook.
training_output_dir = "../training/QA"

# Adapt the dataset for batched 'question-answering' training
def ajustar_datos_al_entrenamiento(lote):
    """Add ``start_positions`` and ``end_positions`` columns to a batch.

    For every entry of ``lote['answers']`` the start is the first value of its
    ``answer_start`` list, and the end is that start plus the length of the
    first string in its ``text`` list.

    Args:
        lote: mapping with an ``'answers'`` sequence; each answer provides
            ``'answer_start'`` and ``'text'`` sequences.

    Returns:
        The same ``lote`` object, updated in place with the two new keys.
    """
    inicios = []
    finales = []
    for respuesta in lote['answers']:
        inicio = respuesta['answer_start'][0]
        inicios.append(inicio)
        finales.append(inicio + len(respuesta['text'][0]))

    lote['start_positions'] = inicios
    lote['end_positions'] = finales

    return lote
    
# Load the train/validation splits of the 'QA' configuration.
# The path uses forward slashes: the previous '..\Dataset\Escrituras' relied on the
# invalid escape sequences '\D' and '\E' in a non-raw string (a SyntaxWarning on
# recent Python versions) and only resolved on Windows.
ruta_dataset = '../Dataset/Escrituras'
train_dataset = load_dataset(ruta_dataset, 'QA', split='train', trust_remote_code=True)
val_dataset = load_dataset(ruta_dataset, 'QA', split='validation', trust_remote_code=True)
if train_max:
    # Cap each split independently so that a split with fewer than `train_max`
    # rows does not make select() fail with an out-of-range index.
    train_dataset = train_dataset.select(range(min(train_max, len(train_dataset))))
    val_dataset = val_dataset.select(range(min(train_max, len(val_dataset))))

# Add the start/end position columns that the training step reads.
train_dataset = train_dataset.map(ajustar_datos_al_entrenamiento, batched=True)
val_dataset = val_dataset.map(ajustar_datos_al_entrenamiento, batched=True)

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_QA)

# Upper bound for the tokenized length.
# NOTE(review): 512 is assumed to be the usable maximum for this RoBERTa checkpoint;
# the min() guards against tokenizers whose model_max_length is left unset (very large).
max_seq_length = min(tokenizer.model_max_length, 512)


def tokenize_f(ejemplos):
    """Tokenize a batch of (context, question) pairs to a fixed length.

    Every example is truncated/padded to ``max_seq_length`` so that all rows have
    the same length.  The previous version requested ``return_tensors='pt'``
    without padding, which fails as soon as two examples in the batch differ in
    length; plain lists are returned instead of tensors.

    NOTE(review): ``start_positions``/``end_positions`` present in the dataset are
    derived from ``answer_start`` and ``len(text)``, i.e. they look like character
    offsets, while the tokenized sequence is indexed by token — confirm whether
    they must be converted (e.g. via ``return_offsets_mapping``) before training.

    Args:
        ejemplos: batch mapping with ``'context'`` and ``'question'`` lists.

    Returns:
        The tokenizer output for the batch.
    """
    return tokenizer(
        ejemplos['context'],
        ejemplos['question'],
        truncation=True,
        padding='max_length',
        max_length=max_seq_length,
    )


train_tokenized = train_dataset.map(tokenize_f, batched=True)
eval_tokenized = val_dataset.map(tokenize_f, batched=True)

In [None]:
# Pretty-print the `info` attribute of the tokenized training dataset
pp.pprint(train_tokenized.info)

In [None]:
# Load the pretrained question-answering model identified by `model_QA`
model  = AutoModelForQuestionAnswering.from_pretrained(model_QA)

def compute_metrics(eval_pred):
    """Compute the mean F1 and exact-match scores of an evaluation pass.

    Args:
        eval_pred: object that unpacks to ``(predictions, labels)`` or, when the
            evaluated inputs are included, ``(predictions, labels, inputs)``.
            ``predictions[0]``/``predictions[1]`` hold the start/end scores and
            ``labels[0]``/``labels[1]`` the reference start/end positions.

    Returns:
        dict with the keys ``'f1'`` and ``'exact_match'``.
    """
    # The `inputs` attribute can exist and still be None, in which case the object
    # unpacks to only two items; test the value, not the attribute's presence.
    if getattr(eval_pred, 'inputs', None) is not None:
        predictions, labels, inputs = eval_pred
        # NOTE(review): gathered inputs appear to be padded with -100 by the Trainer,
        # which cannot be decoded — swap it for the pad token id. Confirm.
        pad_id = tokenizer.pad_token_id
        if pad_id is not None:
            inputs = np.where(np.asarray(inputs) == -100, pad_id, inputs)
        contextos = tokenizer.batch_decode(inputs, skip_special_tokens=True)
    else:
        predictions, labels = eval_pred
        contextos = [""] * len(predictions[0])

    pred_start_pos = np.argmax(predictions[0], axis=1)
    pred_end_pos = np.argmax(predictions[1], axis=1)

    # NOTE(review): the argmax positions index the score axis (presumably tokens),
    # yet they are used here to slice the decoded *strings* character-wise —
    # verify that this matches the intended metric.
    txt_pred = [contextos[i][p_i:p_f] for i, (p_i, p_f) in enumerate(zip(pred_start_pos, pred_end_pos))]
    txt_ref = [contextos[i][p_i:p_f] for i, (p_i, p_f) in enumerate(zip(labels[0], labels[1]))]

    exact_scores = [compute_exact(r, p) for r, p in zip(txt_ref, txt_pred)]
    f1_scores = [compute_f1(r, p) for r, p in zip(txt_ref, txt_pred)]

    # Return a dictionary with the computed metrics
    return {'f1': np.mean(f1_scores), 'exact_match': np.mean(exact_scores)}

In [None]:
# Training configuration, grouped by concern.
training_arg = TrainingArguments(
    # Where checkpoints are written (existing content may be overwritten)
    output_dir=training_output_dir,
    overwrite_output_dir=True,
    do_train=True,
    # Optimisation
    learning_rate=lr,
    num_train_epochs=num_epochs,
    per_device_train_batch_size=batch_size,
    # Evaluation and logging cadence (logging happens every second evaluation)
    evaluation_strategy='steps',
    eval_steps=eval_steps,
    logging_steps=2*eval_steps,
    per_device_eval_batch_size=eval_batch_size,
    # compute_metrics decodes the evaluated inputs, so ask for them
    include_inputs_for_metrics=True,
)

# Trainer wiring: model, arguments, tokenized splits and the metric callback.
trainer = Trainer(
    args=training_arg,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=eval_tokenized,
    train_dataset=train_tokenized,
)

In [None]:
# Pretty-print the `info` attribute of the tokenized training dataset
# (same call as in the earlier cell that follows tokenization)
pp.pprint(train_tokenized.info)

In [None]:
# URL of the MLflow server used by this notebook
SERVIDOR_MLFLOW = 'http://localhost:5000'
# Check up front that the MLflow server is running; otherwise execution stalls
# and eventually ends in an error.
def mlflow_en_ejecucion(url, timeout=5):
    """Return True when an HTTP GET to ``url`` answers with status 200.

    Args:
        url: address of the server to probe.
        timeout: seconds to wait for the server before giving up.  Without a
            timeout the request could block indefinitely, which is the delay
            this check exists to avoid.

    Returns:
        ``True`` if the response status code is 200; ``False`` if the status is
        different or the request fails (connection refused, timeout, ...).
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.exceptions.RequestException:
        # Any request-level failure (no connection, timeout, invalid URL, ...)
        # is treated as "server not running".
        return False
    # A running server is expected to answer with HTTP status 200
    return response.status_code == 200
    
# Stop here with an explanatory message if the MLflow server does not respond
assert mlflow_en_ejecucion(SERVIDOR_MLFLOW), f"El servidor MLflow ({SERVIDOR_MLFLOW}) no está en ejecución. Lance 'mlflow ui' desde el terminal."

In [None]:
# Tracking server: point MLflow at SERVIDOR_MLFLOW, turn on autologging and
# select the experiment under which the following runs are recorded.
mlflow.set_tracking_uri(SERVIDOR_MLFLOW)
mlflow.autolog()
mlflow.set_experiment("ENTRENAMIENTO Question-Answering")

In [11]:
# Run the training inside an MLflow run; the run name states the subset size
# when training on a capped sample (`train_max` set).
with mlflow.start_run(
    run_name=('Prueba con ' + str(train_max)) if train_max else 'Entrenamiento'
):
    trainer.train()

KeyboardInterrupt: 

In [None]:
# Quick manual check of the fine-tuned model through a question-answering pipeline.
tuned_pipeline = pipeline(
    task="question-answering",
    tokenizer=tokenizer,
    model=trainer.model,
    device='cuda' if torch.cuda.is_available() else 'cpu',
    # batch_size=64,
)

# Single hand-written example: a question plus the context that contains the answer.
comprobando = dict(
    question="¿qué notario ha firmado el documento?",
    context="DOS MIL TREINTA. En mi residencia, a quince de abril de dos mil quince. Ante mí, Paquito de los Palotes, notario del ilustre colegio de la Palmilla COMPARECEN Manolito y Jacinta para firmar la siguiente escritura de HERENCIA y para lo cual se sientan cómodamente.",
)

tuned_pipeline(comprobando)

In [None]:
# Save the model and its tokenizer side by side.
# The path uses forward slashes: the previous "..\Models\escrituras_QA_..." relied on
# the invalid escape sequences '\M' and '\e' and only resolved on Windows, while the
# evaluation step reloads the model from a "../Models/..." path.
ruta_modelo_ajustado = f"../Models/escrituras_QA_{trainer.args.num_train_epochs}-epoch_{trainer.state.global_step}-steps"
trainer.save_model(ruta_modelo_ajustado)
tokenizer.save_pretrained(ruta_modelo_ajustado)

## Evaluación del modelo generado
Antes de la evaluación hay que reiniciar el kernel (Restart) en VS Code; de lo contrario, el sistema se queda sin memoria.

In [None]:
# Evaluate a saved fine-tuned checkpoint on the validation split and log the
# resulting metrics to MLflow.
dataset = load_dataset('../Dataset/Escrituras','QA',trust_remote_code=True,split='validation')
ruta_modelo_ajustado = "../Models/escrituras_QA_1-epoch_3211-steps/"
modelo_ajustado = AutoModelForQuestionAnswering.from_pretrained(ruta_modelo_ajustado)
tokenizer_ajustado = AutoTokenizer.from_pretrained(ruta_modelo_ajustado)
val_batch_size = 512

with mlflow.start_run(run_name="VALIDACIÓN",description="Validación del modelo ajustado"):
    # Pipeline definition and the question/context pairs to run through it
    qc_dataset_test = [{'question':q, 'context':c} for q,c in zip(dataset['question'],dataset['context'])]
    consulta_qc = pipeline("question-answering", model=modelo_ajustado, tokenizer=tokenizer_ajustado,
                    device=0 if torch.cuda.is_available() else None, batch_size=val_batch_size)
    # Run the predictions and compute the per-example scores
    predicciones = consulta_qc(qc_dataset_test)
    exact_scores, f1_scores  = get_raw_scores_by_prediction(dataset,predicciones)
    f1_mean = mean(f1_scores.values())
    exact_mean = mean(exact_scores.values())

    # Log the batch size actually used by this pipeline (`val_batch_size`); the
    # training `batch_size` is unrelated and is not defined after a kernel restart.
    mlflow.log_param('batch_size', val_batch_size)
    # Log the checkpoint path, not the model object (whose repr is the full module tree).
    mlflow.log_param('model_name', ruta_modelo_ajustado)
    mlflow.log_metric('f1', f1_mean)
    mlflow.log_metric('exact', exact_mean)
    print(len(f1_scores), 'f1:', f1_mean)
    print(len(exact_scores), 'exact:', exact_mean)