In [5]:
from datasets import load_dataset
# from custom_evaluate import get_raw_scores, compute_exact, compute_f1 #get_raw_scores_by_prediction
# import evaluate
from statistics import mean
import mlflow
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, Trainer, TrainingArguments, \
    pipeline, DataCollatorWithPadding, EarlyStoppingCallback
import requests
import torch
import numpy as np
import os, shutil
from datetime import datetime
import pprint
# Impresión elegante de datos en la terminal
pp = pprint.PrettyPrinter(width=150)
from tqdm import tqdm
import sys
from metrics.evaluate import (
    get_raw_scores,
    compute_f1,
    compute_exact
)

In [6]:
# GLOBAL VARIABLES
# Maximum number of examples per split (for quick tests); None for a full run.
train_max = None
# Directory where the Trainer writes its checkpoints.
training_output_dir = "../training/QA"
# Hyper-parameters. The dict is logged to MLflow as-is and every entry is also
# exposed as a module-level global of the same name (see the loop below).
ml_params = {
    'num_epochs': 3,
    # 'batch_size': 8,
    'lr': 5e-5,
    # Passed to TrainingArguments as eval/save/logging steps; a float below 1 is
    # presumably read as a fraction of the total training steps — confirm in the docs.
    'eval_steps': 0.05,
    'eval_batch_size': 64,
    'model_name': '../Models/PlanTL-GOB-ES/roberta-large-bne-sqac'
}
# Pre-declare one global per hyper-parameter so the names are statically visible;
# the placeholder value is overwritten by the loop.
num_epochs = lr = eval_steps = eval_batch_size = model_name = 0
for key, value in ml_params.items():
    # Fail with a readable message when a key of ml_params has no pre-declared
    # global (the previous `globals()[key]` lookup raised a bare KeyError and the
    # assertion itself could never fail).
    assert key in globals(), f'La variable global {key} debe estar definida'
    globals()[key] = value

In [7]:
# Load the project's QA dataset and pick the splits used for fine-tuning.
main_dataset = load_dataset('../Dataset/Escrituras', 'QA', trust_remote_code=True)
train_dataset = main_dataset['train']
val_dataset = main_dataset['validation']
if train_max:
    # Quick-test mode: keep at most `train_max` examples per split. Each split is
    # capped by its own length so a small validation split does not make
    # `select` fail with an out-of-range index.
    train_dataset = train_dataset.select(range(min(train_max, len(train_dataset))))
    val_dataset = val_dataset.select(range(min(train_max, len(val_dataset))))

In [8]:
SERVIDOR_MLFLOW = 'http://localhost:5000'


# Check up front that the MLflow server is running; otherwise later MLflow calls
# stall for a long time and end up failing.
def mlflow_en_ejecucion(url, timeout=5):
    """Return True when an HTTP GET to `url` answers with status code 200.

    Args:
        url: Base URL of the MLflow tracking server.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        True if the server answered with HTTP 200; False if the request failed
        (connection refused, timeout, invalid response, ...) or any other
        status code was returned.
    """
    try:
        # A timeout keeps the check from hanging when the port accepts the
        # connection but nothing answers.
        response = requests.get(url, timeout=timeout)
    except requests.exceptions.RequestException:
        # Any transport-level failure means the server is not usable.
        return False
    return response.status_code == 200


assert mlflow_en_ejecucion(SERVIDOR_MLFLOW), f"El servidor MLflow ({SERVIDOR_MLFLOW}) no está en ejecución. Lance 'mlflow ui' desde el terminal."

In [9]:
# Tracking server: send every MLflow call to SERVIDOR_MLFLOW.
mlflow.set_tracking_uri(SERVIDOR_MLFLOW)
# Turn on autologging for the supported libraries (the cell output reports
# transformers and sklearn).
mlflow.autolog()
# Select the experiment under which the runs of this notebook are recorded.
mlflow.set_experiment("ENTRENAMIENTO Question-Answering")

2024/02/15 00:13:29 INFO mlflow.tracking.fluent: Autologging successfully enabled for transformers.
2024/02/15 00:13:30 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.


<Experiment: artifact_location='mlflow-artifacts:/423598931169215837', creation_time=1707602045463, experiment_id='423598931169215837', last_update_time=1707602045463, lifecycle_stage='active', name='ENTRENAMIENTO Question-Answering', tags={}>

In [10]:
# Load the tokenizer that belongs to the base checkpoint in `model_name`.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Display whether a fast tokenizer was loaded. f_preproceso asks for
# return_offsets_mapping, which per the Transformers docs needs a fast tokenizer.
tokenizer.is_fast

True

In [11]:
## Extraído del tutorial de HF sobre Question-Answering
def f_preproceso(examples):
    """Tokenize a batch of question/context pairs and label the answer span.

    Args:
        examples: Batch in dict-of-lists form with the keys "question",
            "context" and "answers". Each entry of "answers" is a dict with the
            parallel lists "text" and "answer_start" (character offset of the
            answer inside the context). Only the first answer is used.

    Returns:
        The tokenizer output (without "offset_mapping") extended with
        "start_positions", "end_positions" and "labels" (``[start, end]``
        pairs), all expressed as token indices. An example is labelled
        ``(0, 0)`` when it has no answer or when its answer is not fully
        contained in the (possibly truncated) context.
    """
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=512,
        # NOTE(review): overflowing chunks are not requested here (no
        # return_overflowing_tokens), so `stride` presumably has no effect and
        # each example yields exactly one feature — confirm.
        stride=128,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Character span covered by every token; one list of offsets per example.
    offset_mapping = inputs.pop("offset_mapping")
    # One answers dict per question.
    answers = examples["answers"]
    start_positions = []
    end_positions = []
    labels = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]
        # Default label: no usable answer in this feature.
        start, end = 0, 0

        # Guard against examples without any answer (indexing [0] would fail).
        if answer["answer_start"] and answer["text"]:
            # Only the first answer is considered.
            start_char = answer["answer_start"][0]
            end_char = start_char + len(answer["text"][0])
            # Tells which tokens belong to the question (0) and which to the context (1).
            sequence_ids = inputs.sequence_ids(i)

            # Locate the first and last context token.
            idx = 0
            while sequence_ids[idx] != 1:
                idx += 1
            context_start = idx
            while idx < len(sequence_ids) and sequence_ids[idx] == 1:
                idx += 1
            context_end = idx - 1

            # The answer must lie completely inside the tokenized context. The
            # previous test compared the opposite endpoints, so an answer cut
            # by truncation got a label on a token outside the context.
            if offset[context_start][0] <= start_char and offset[context_end][1] >= end_char:
                # Last context token that starts at or before the answer start.
                idx = context_start
                while idx <= context_end and offset[idx][0] <= start_char:
                    idx += 1
                start = idx - 1

                # First context token that ends at or after the answer end.
                idx = context_end
                while idx >= context_start and offset[idx][1] >= end_char:
                    idx -= 1
                end = idx + 1

        start_positions.append(start)
        end_positions.append(end)
        labels.append([start, end])

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    inputs["labels"] = labels
    return inputs

In [12]:
# Tokenize both splits with f_preproceso, dropping the raw columns so that only
# the model inputs and labels remain.
train_tokenized, eval_tokenized = [
    split.map(f_preproceso, batched=True, remove_columns=split.column_names)
    for split in (train_dataset, val_dataset)
]

In [13]:
# Pretty-print the metadata (features, splits, ...) of the tokenized training set.
pp.pprint(train_tokenized.info)

DatasetInfo(description='',
            citation='',
            homepage='',
            license='',
            features={'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None),
                      'end_positions': Value(dtype='int64', id=None),
                      'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
                      'labels': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None),
                      'start_positions': Value(dtype='int64', id=None)},
            post_processed=None,
            supervised_keys=None,
            task_templates=None,
            builder_name='escrituras',
            dataset_name='escrituras',
            config_name='QA',
            version=0.0.0,
            splits={'test': SplitInfo(name='test', num_bytes=11002385, num_examples=7532, shard_lengths=None, dataset_name='escrituras'),
                    'train': SplitInfo(name='train', num_bytes=350137

In [14]:
# Load the base checkpoint in `model_name` as a question-answering model.
model  = AutoModelForQuestionAnswering.from_pretrained(model_name)

# metric = evaluate.load('squad_v2')
     
def compute_metrics(eval_pred):
    """Compute mean F1 and exact-match between predicted and gold answer texts.

    `eval_pred.predictions` is indexed as (start logits, end logits) and
    `eval_pred.label_ids` as (gold start tokens, gold end tokens). Both spans
    are decoded from `eval_pred.inputs` with the module-level tokenizer and
    compared as text.

    Returns:
        Dict with the keys 'f1_score' and 'exact_score'.
    """
    def decode_spans(first_tokens, last_tokens):
        # Decode the inclusive token span [first, last] of every example.
        return [
            tokenizer.decode(ids[first:last + 1]).strip()
            for ids, first, last in zip(eval_pred.inputs, first_tokens, last_tokens)
        ]

    # Predicted span: most likely start and end token for each example.
    pred_txt = decode_spans(
        np.argmax(eval_pred.predictions[0], axis=1),
        np.argmax(eval_pred.predictions[1], axis=1),
    )
    # Gold span: a single answer per example (start token, end token).
    gold_txt = decode_spans(eval_pred.label_ids[0], eval_pred.label_ids[1])

    pairs = list(zip(gold_txt, pred_txt))
    f1s = [compute_f1(gold, pred) for gold, pred in pairs]
    ems = [compute_exact(gold, pred) for gold, pred in pairs]

    # return metric.compute(predictions=eval_pred.predictions, references=eval_pred.label_ids)
    return {'f1_score': np.mean(f1s), 'exact_score': np.mean(ems)}

In [15]:
# Borrar el directorio de entrenamiento si existe
# if os.path.exists(training_output_dir):
#     shutil.rmtree(training_output_dir)

training_arg = TrainingArguments(
    # --- Output location ---
    output_dir=training_output_dir,
    overwrite_output_dir=True,
    # --- Optimisation ---
    do_train=True,
    num_train_epochs=num_epochs,
    learning_rate=lr,
    warmup_ratio=0.2,
    # per_device_train_batch_size=batch_size,
    auto_find_batch_size=True,
    per_device_eval_batch_size=eval_batch_size,
    # --- Evaluate, checkpoint and log on the same cadence ---
    evaluation_strategy='steps',
    eval_steps=eval_steps,
    save_strategy='steps',
    save_steps=eval_steps,
    logging_steps=eval_steps,
    # --- Best-checkpoint selection: the lowest eval_loss wins ---
    load_best_model_at_end=True,
    metric_for_best_model='eval_loss',
    greater_is_better=False,
    # compute_metrics decodes answers from eval_pred.inputs, so inputs must be kept.
    include_inputs_for_metrics=True,
)

# Bundle model, data, tokenizer and metric function into the Trainer.
trainer = Trainer(
    args=training_arg,
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_tokenized,
    eval_dataset=eval_tokenized,
    compute_metrics=compute_metrics,
    # callbacks=[EarlyStoppingCallback(5,1e-4)],
)

In [17]:
# Name the run after the mode: quick test with `train_max` examples, or full training.
nombre_run = ('Prueba con ' + str(train_max)) if train_max else 'Entrenamiento'
with mlflow.start_run(run_name=nombre_run):
    trainer.train()

    # Log the batch size the Trainer ended up using, followed by the notebook's
    # own hyper-parameters.
    batch_size = trainer._train_batch_size
    for nombre, valor in [('batch_size', batch_size), *ml_params.items()]:
        mlflow.log_param(nombre, valor)

    # Save the model under a name made of timestamp, epochs and global steps.
    n_epochs = trainer.args.num_train_epochs
    g_steps = trainer.state.global_step
    fecha_hora = datetime.now().strftime("%Y%m%d-%H%M")
    ruta_modelo_ajustado = f"../Models/{fecha_hora}_escrituras_QA_{n_epochs}-epoch_{g_steps}-steps"
    mlflow.log_param('finetuned_name', os.path.basename(ruta_modelo_ajustado))
    trainer.save_model(ruta_modelo_ajustado)
    # Store the tokenizer next to the model so the directory is self-contained.
    tokenizer.save_pretrained(ruta_modelo_ajustado)

  0%|          | 0/9036 [00:00<?, ?it/s]

{'loss': 0.1033, 'learning_rate': 1.25e-05, 'epoch': 0.15}


  0%|          | 0/95 [00:00<?, ?it/s]

Checkpoint destination directory ../training/QA\checkpoint-452 already exists and is non-empty.Saving will proceed but saved results may be invalid.


{'eval_loss': 0.12210901081562042, 'eval_f1_score': 0.9825615587510601, 'eval_exact_score': 0.9795952222959522, 'eval_runtime': 148.157, 'eval_samples_per_second': 40.687, 'eval_steps_per_second': 0.641, 'epoch': 0.15}
{'loss': 0.131, 'learning_rate': 2.5e-05, 'epoch': 0.3}


  0%|          | 0/95 [00:00<?, ?it/s]

{'eval_loss': 0.24429607391357422, 'eval_f1_score': 0.9590440396349825, 'eval_exact_score': 0.9494027869940279, 'eval_runtime': 143.77, 'eval_samples_per_second': 41.928, 'eval_steps_per_second': 0.661, 'epoch': 0.3}


PermissionError: [Errno 13] Permission denied: '../training/QA\\checkpoint-904'

In [None]:
# Smoke test: wrap the fine-tuned model in a QA pipeline and ask one question.
dispositivo = 'cuda' if torch.cuda.is_available() else 'cpu'
tuned_pipeline = pipeline(
    task="question-answering",
    model=trainer.model,
    tokenizer=tokenizer,
    device=dispositivo,
    # batch_size=64,
)
comprobando = {
    'question': "¿qué notario ha firmado el documento?",
    'context': "DOS MIL TREINTA. En mi residencia, a quince de abril de dos mil quince. Ante mí, Paquito de los Palotes, notario del ilustre colegio de la Palmilla COMPARECEN Manolito y Jacinta para firmar la siguiente escritura de HERENCIA y para lo cual se sientan cómodamente.",
}

tuned_pipeline(comprobando)

## Evaluación del modelo generado
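Antes de la evaluación hay que reiniciar el kernel (Restart) en VS Code; de lo contrario, el sistema se queda sin memoria. Tras reiniciar, vuelva a ejecutar las celdas de importación, variables globales, carga del dataset y comprobación del servidor MLflow, ya que la celda siguiente usa `main_dataset`, `ml_params` y `SERVIDOR_MLFLOW`.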

In [None]:
# Evaluate every fine-tuned model found under ../Models/ on the test split and
# record the mean F1 / exact-match scores in MLflow.
# Requires `main_dataset`, `ml_params` and `SERVIDOR_MLFLOW` from the earlier cells.
mlflow.set_tracking_uri(SERVIDOR_MLFLOW)
mlflow.autolog()
mlflow.set_experiment("ENTRENAMIENTO Question-Answering")

split_test = 'test'
test_dataset = main_dataset[split_test] # load_dataset('../Dataset/Escrituras','QA',trust_remote_code=True,split='validation')
val_batch_size = 512

# Loop-invariant inputs, built once instead of once per model.
qc_dataset_test = [{'question': q, 'context': c} for q, c in zip(test_dataset['question'], test_dataset['context'])]
# Each answers entry is a dict with parallel "text"/"answer_start" lists (see
# f_preproceso); the first text is the reference answer, '' when there is none.
gold_answers = [answer['text'][0] if answer['text'] else '' for answer in test_dataset['answers']]

# Only directories produced by the training cell ("..._escrituras_QA_..."),
# filtered up front so the progress bar counts evaluated models only.
rutas_modelos = sorted(
    r for r in os.listdir('../Models/')
    if os.path.isdir(os.path.join('../Models/', r)) and "escrituras_QA" in r
)

for ruta in tqdm(rutas_modelos):
    ruta_modelo_ajustado = f"../Models/{ruta}"

    modelo_ajustado = AutoModelForQuestionAnswering.from_pretrained(ruta_modelo_ajustado)
    tokenizer_ajustado = AutoTokenizer.from_pretrained(ruta_modelo_ajustado)

    with mlflow.start_run(run_name="VALIDACIÓN", description="Validación del modelo ajustado"):
        # Pipeline definition
        consulta_qc = pipeline("question-answering", model=modelo_ajustado, tokenizer=tokenizer_ajustado,
                               device=0 if torch.cuda.is_available() else None, batch_size=val_batch_size)
        # Inference: one prediction dict per question, the text is under 'answer'.
        predicciones = consulta_qc(qc_dataset_test)
        if isinstance(predicciones, dict):
            # A single input comes back as a bare dict rather than a list.
            predicciones = [predicciones]
        pred_answers = [p['answer'] for p in predicciones]

        # compute_f1 / compute_exact score ONE (gold, prediction) pair of strings,
        # exactly as compute_metrics uses them during training; the previous
        # code passed whole lists and then called .values() on the result.
        # exact_scores, f1_scores  = get_raw_scores(test_dataset,predicciones)
        f1_scores = [compute_f1(g, p) for g, p in zip(gold_answers, pred_answers)]
        exact_scores = [compute_exact(g, p) for g, p in zip(gold_answers, pred_answers)]

        f1_mean = mean(f1_scores)
        exact_mean = mean(exact_scores)

        # NOTE(review): ml_params holds the notebook's current configuration,
        # which is not necessarily what this particular model was trained with.
        for param_name, param_value in ml_params.items():
            mlflow.log_param(param_name, param_value)
        mlflow.log_param('split', split_test)
        mlflow.log_param('finetuned_name', os.path.basename(modelo_ajustado.name_or_path))
        mlflow.log_metric('f1', f1_mean)
        mlflow.log_metric('exact', exact_mean)
        print(modelo_ajustado.name_or_path)
        print('\tf1:', f1_mean)
        print('\texact:', exact_mean)