In [1]:
from datasets import load_dataset
from metrics.evaluate import compute_exact, compute_f1
from statistics import mean
import mlflow
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, Trainer, TrainingArguments, \
    pipeline, DataCollatorWithPadding
import requests
import torch
import numpy as np
import os
import datetime
import pprint
# Impresión elegante de datos en la terminal
pp = pprint.PrettyPrinter(width=150)
from tqdm import tqdm
from globals import TRAINING_DIR,MODELS_DIR, DATA_DIR

Definición de variables globales, parámetros de entrenamiento y MLflow

In [2]:
# GLOBAL VARIABLES
# Maximum number of examples used for training (for quick tests); None for a real run
train_max = None
training_output_dir = os.path.join(TRAINING_DIR,"QA")
# Parameters that are also logged to MLflow with every training run
ml_params = {
    'num_epochs': 2,
    'lr' : 1e-5,
    'eval_steps' : 0.05,
    # load_best_model_at_end can only restore checkpoints that were actually saved,
    # so checkpoints are written on the same schedule as the evaluations
    'save_steps' : 0.05,
    'eval_batch_size' : 64,
    'model_name': os.path.join(MODELS_DIR,'PlanTL-GOB-ES','roberta-large-bne-sqac')
}
# Placeholders: every key of ml_params must already exist as a module-level name
# before the loop below overwrites it with the configured value
num_epochs = lr = eval_steps = save_steps = eval_batch_size = model_name = 0
for key, value in ml_params.items():
    # A membership test makes the message reachable; indexing globals() would raise
    # KeyError before the assertion could report the missing name
    assert key in globals(), f'La variable global {key} debe estar definida'
    globals()[key] = value

Carga del conjunto de datos 

In [3]:
# Load the 'QA' configuration of the local 'Escrituras' dataset and pick the splits
main_dataset = load_dataset(os.path.join(DATA_DIR,'Escrituras'), 'QA', trust_remote_code=True)
train_dataset = main_dataset['train']
val_dataset = main_dataset['validation']
if train_max:
    # Quick-test mode: keep at most train_max examples per split. The cap is clamped to
    # each split's own size so a train_max larger than the (smaller) validation split
    # does not request out-of-range indices.
    train_dataset = train_dataset.select(range(min(train_max, len(train_dataset))))
    val_dataset = val_dataset.select(range(min(train_max, len(val_dataset))))

Comprobación de que el servidor MLflow está funcionando para las pruebas

In [5]:
# Base URL of the MLflow tracking server used throughout this notebook
SERVIDOR_MLFLOW = 'http://localhost:5000'
# The MLflow server must be checked up front; otherwise execution stalls and ends with an error
def mlflow_en_ejecucion(url, timeout=5):
    """Return True when an HTTP GET to `url` answers with status code 200.

    Args:
        url: address of the MLflow server to probe.
        timeout: seconds to wait for the server before giving up, so an
            unresponsive host cannot block the notebook indefinitely.

    Returns:
        True if the server replied with HTTP 200; False for any other status code or
        when the request fails (connection refused, timeout, invalid URL, ...).
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.exceptions.RequestException:
        # Any request-level failure means the server cannot be used
        return False
    # A running server is expected to answer with HTTP 200
    return response.status_code == 200
    
# Stop the notebook here, with instructions, when the MLflow server check fails
assert mlflow_en_ejecucion(SERVIDOR_MLFLOW), f"El servidor MLflow ({SERVIDOR_MLFLOW}) no está en ejecución. Lance 'mlflow ui' desde el terminal."

### Iniciamos el entrenamiento

In [6]:
# Tracking server
mlflow.set_tracking_uri(SERVIDOR_MLFLOW)
# Turn on MLflow automatic logging
mlflow.autolog()
# Select the experiment under which the runs of this notebook are recorded
mlflow.set_experiment("ENTRENAMIENTO Question-Answering")

2024/02/17 13:00:54 INFO mlflow.tracking.fluent: Autologging successfully enabled for transformers.
2024/02/17 13:00:57 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.


<Experiment: artifact_location='mlflow-artifacts:/423598931169215837', creation_time=1707602045463, experiment_id='423598931169215837', last_update_time=1707602045463, lifecycle_stage='active', name='ENTRENAMIENTO Question-Answering', tags={}>

In [7]:
# Load the question-answering model and its tokenizer from the location held in model_name
model  = AutoModelForQuestionAnswering.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [8]:
## Adapted from the Hugging Face tutorial on question answering
def f_preproceso(examples):
    """Tokenize a batch of question/context pairs and locate each answer span in tokens.

    Args:
        examples: batch (dict of lists) with the keys "question", "context" and
            "answers". Every answer provides "answer_start" and "text" lists, of which
            only the first element is used.

    Returns:
        The tokenizer output (without "offset_mapping") extended with
        "start_positions", "end_positions" and "labels" (one [start, end] pair per
        example). An answer that is not fully contained in the tokenized context is
        labelled (0, 0).
    """
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=512,
        stride=128,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []
    labels = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]
        start_char = answer["answer_start"][0]
        end_char = start_char + len(answer["text"][0])
        # sequence_ids tells which tokens belong to the question and which to the context
        sequence_ids = inputs.sequence_ids(i)

        # Locate the first and last token of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # The answer must lie entirely inside the context kept after truncation.
        # Comparing the context start against start_char and the context end against
        # end_char also rejects answers that are cut in the middle; those would
        # otherwise receive an end label pointing at the token after the context.
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start, end = 0, 0
        else:
            # Last context token whose offset starts at or before the answer start
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start = idx - 1

            # First context token (scanning backwards) whose offset ends at or after the answer end
            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end = idx + 1

        start_positions.append(start)
        end_positions.append(end)
        labels.append([start, end])

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    inputs["labels"] = labels
    return inputs

In [9]:
# Tokenize both splits in batches; the original columns are removed so that only the
# features returned by f_preproceso remain
train_tokenized = train_dataset.map(f_preproceso,batched=True, remove_columns=train_dataset.column_names)
eval_tokenized = val_dataset.map(f_preproceso,batched=True, remove_columns=val_dataset.column_names)

Map:   0%|          | 0/24096 [00:00<?, ? examples/s]

Map:   0%|          | 0/6028 [00:00<?, ? examples/s]

In [10]:
# Pretty-print the metadata of the tokenized training set
pp.pprint(train_tokenized.info)

DatasetInfo(description='',
            citation='',
            homepage='',
            license='',
            features={'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None),
                      'end_positions': Value(dtype='int64', id=None),
                      'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
                      'labels': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None),
                      'start_positions': Value(dtype='int64', id=None)},
            post_processed=None,
            supervised_keys=None,
            task_templates=None,
            builder_name='escrituras',
            dataset_name='escrituras',
            config_name='QA',
            version=0.0.0,
            splits={'test': SplitInfo(name='test', num_bytes=11002385, num_examples=7532, shard_lengths=None, dataset_name='escrituras'),
                    'train': SplitInfo(name='train', num_bytes=350137

In [None]:
# metric = evaluate.load('squad_v2')
     
def compute_metrics(eval_pred):
    """Compute mean F1 and exact-match scores between predicted and gold answer texts.

    Args:
        eval_pred: evaluation output exposing `predictions`, `label_ids` and `inputs`.
            Element 0 / 1 of `predictions` are reduced with argmax along axis 1 to get
            the predicted first / last token of each answer (presumably the start and
            end logits of the model); element 0 / 1 of `label_ids` are the gold first /
            last tokens.

    Returns:
        dict with the keys 'f1_score' and 'exact_score', each the mean over all examples.
    """
    def _decode_spans(first_tokens, last_tokens):
        # Decode the inclusive token range [first, last] of every input sequence
        return [
            tokenizer.decode(token_ids[first:last + 1]).strip()
            for token_ids, first, last in zip(eval_pred.inputs, first_tokens, last_tokens)
        ]

    start_scores = eval_pred.predictions[0]
    end_scores = eval_pred.predictions[1]
    predicted_texts = _decode_spans(np.argmax(start_scores, axis=1), np.argmax(end_scores, axis=1))
    gold_texts = _decode_spans(eval_pred.label_ids[0], eval_pred.label_ids[1])

    pairs = list(zip(gold_texts, predicted_texts))
    f1_values = [compute_f1(gold, guess) for gold, guess in pairs]
    exact_values = [compute_exact(gold, guess) for gold, guess in pairs]

    return {'f1_score': np.mean(f1_values), 'exact_score': np.mean(exact_values)}

In [None]:
# Training configuration. lr, eval_steps, save_steps, eval_batch_size and num_epochs are
# module-level variables defined in the configuration cell.
training_arg = TrainingArguments(
    output_dir=training_output_dir,
    overwrite_output_dir=True,
    do_train=True,
    learning_rate=lr,
    warmup_ratio=0.2,
    # Evaluation and checkpointing both follow the 'steps' strategy
    evaluation_strategy='steps',
    eval_steps=eval_steps,
    save_strategy='steps',
    save_steps=save_steps,
    # The best model is selected with the 'f1_score' key returned by compute_metrics
    load_best_model_at_end=True,
    metric_for_best_model='f1_score',
    logging_steps=eval_steps,
    # The train batch size is not fixed here; auto_find_batch_size is enabled instead
    # per_device_train_batch_size=batch_size,
    auto_find_batch_size=True,
    per_device_eval_batch_size=eval_batch_size,
    num_train_epochs=num_epochs,
    # compute_metrics reads eval_pred.inputs to decode the answer texts
    include_inputs_for_metrics=True
)

# Trainer wiring the model, the tokenized splits and the metric function together
trainer = Trainer(
    model=model,
    args=training_arg,
    train_dataset=train_tokenized,
    eval_dataset=eval_tokenized,    
    tokenizer=tokenizer,    
    compute_metrics=compute_metrics
)

In [None]:
# Train inside an MLflow run, log the run parameters and save the fine-tuned model
with mlflow.start_run(run_name=f"{'Prueba con ' + str(train_max) if train_max else 'Entrenamiento'}"):
    trainer.train()
    # Read after training: the train batch size is not set explicitly (auto_find_batch_size)
    batch_size = trainer._train_batch_size
    mlflow.log_param('batch_size',batch_size)
    for param_name, param_value in ml_params.items():
        mlflow.log_param(param_name, param_value)
    # Save the model
    n_epochs = trainer.args.num_train_epochs
    g_steps = trainer.state.global_step
    # `datetime` is imported as a module, so the class has to be qualified; the bare
    # datetime.now() would raise AttributeError after training, before anything is saved
    fecha_hora = datetime.datetime.now().strftime("%Y%m%d-%H%M")
    ruta_modelo_ajustado = os.path.join(MODELS_DIR,f"{fecha_hora}_escrituras_QA_{n_epochs}-epoch_{g_steps}-steps")
    mlflow.log_param('finetuned_name', os.path.basename(ruta_modelo_ajustado))
    trainer.save_model(ruta_modelo_ajustado)
    tokenizer.save_pretrained(ruta_modelo_ajustado)

In [13]:
# Smoke test: ask the fine-tuned model two questions about the same sample deed text
dispositivo = 'cuda' if torch.cuda.is_available() else 'cpu'
tuned_pipeline = pipeline(
    task="question-answering",
    model=trainer.model,
    tokenizer=tokenizer,
    device=dispositivo,
)

contexto_prueba = "DOS MIL TREINTA. En mi residencia, a quince de abril de dos mil quince. Ante mí, Paquito de los Palotes, notario del ilustre colegio de la Palmilla COMPARECEN Manolito y Jacinta para firmar la siguiente escritura de HERENCIA y para lo cual se sientan cómodamente."
preguntas_prueba = (
    "¿qué notario ha firmado el documento?",
    "¿cuál es el número de protocolo?",
)
for pregunta_prueba in preguntas_prueba:
    # One question/context pair per call; the answer dict is printed as returned
    comprobando = {'question': pregunta_prueba, 'context': contexto_prueba}
    print(tuned_pipeline(comprobando))

{'score': 0.992911696434021,
 'start': 81,
 'end': 103,
 'answer': 'Paquito de los Palotes'}

In [14]:
# Save the model
# The target directory name combines a timestamp with the epoch and step counts of the run
n_epochs = trainer.args.num_train_epochs
g_steps = trainer.state.global_step
fecha_hora = datetime.datetime.now().strftime("%Y%m%d-%H%M")
ruta_modelo_ajustado = os.path.join(MODELS_DIR,f"{fecha_hora}_escrituras_QA_{n_epochs}-epoch_{g_steps}-steps")
trainer.save_model(ruta_modelo_ajustado)
# The tokenizer is stored in the same directory as the model
tokenizer.save_pretrained(ruta_modelo_ajustado)

('../Models/20240213-2010_escrituras_QA_2-epoch_6024-steps\\tokenizer_config.json',
 '../Models/20240213-2010_escrituras_QA_2-epoch_6024-steps\\special_tokens_map.json',
 '../Models/20240213-2010_escrituras_QA_2-epoch_6024-steps\\vocab.json',
 '../Models/20240213-2010_escrituras_QA_2-epoch_6024-steps\\merges.txt',
 '../Models/20240213-2010_escrituras_QA_2-epoch_6024-steps\\added_tokens.json',
 '../Models/20240213-2010_escrituras_QA_2-epoch_6024-steps\\tokenizer.json')

## Evaluación del modelo generado
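Antes de la evaluación hay que reiniciar el entorno (Restart) en VS Code, porque el sistema se queda sin memoria. Tras el reinicio, vuelva a ejecutar las celdas de imports, variables globales, carga de datos y comprobación del servidor MLflow, ya que la celda de evaluación depende de ellas.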

In [7]:
# Evaluate every fine-tuned QA model found in MODELS_DIR on the test split and record
# the mean F1 / exact-match scores in MLflow, one run per model.
mlflow.set_tracking_uri(SERVIDOR_MLFLOW)
mlflow.autolog()
mlflow.set_experiment("ENTRENAMIENTO Question-Answering")

split_test = 'test'
test_dataset = main_dataset[split_test]
val_batch_size = 64

# The pipeline inputs and the gold answers do not depend on the model: build them once
qc_dataset_test = [{'question':q, 'context':c} for q,c in zip(test_dataset['question'],test_dataset['context'])]
gold_answers = [answer['text'][0] for answer in test_dataset['answers']]

# Only directories of fine-tuned QA models are listed, so the progress bar counts real
# work; MODELS_DIR is the same location the training cells save to
rutas_modelos = [
    r for r in os.listdir(MODELS_DIR)
    if "escrituras_QA" in r and os.path.isdir(os.path.join(MODELS_DIR, r))
]
for ruta in tqdm(rutas_modelos):
    ruta_modelo_ajustado = os.path.join(MODELS_DIR, ruta)

    modelo_ajustado = AutoModelForQuestionAnswering.from_pretrained(ruta_modelo_ajustado)
    tokenizer_ajustado = AutoTokenizer.from_pretrained(ruta_modelo_ajustado)

    with mlflow.start_run(run_name="VALIDACIÓN",description="Validación del modelo ajustado"):
        # Pipeline definition
        consulta_qc = pipeline("question-answering", model=modelo_ajustado, tokenizer=tokenizer_ajustado,
                        device=0 if torch.cuda.is_available() else None, batch_size=val_batch_size)
        # Run the predictions and compute the metrics
        predicciones = consulta_qc(qc_dataset_test)
        pred_answers = [pred['answer'] for pred in predicciones]
        f1_scores = [compute_f1(g,p) for g,p in zip(gold_answers,pred_answers)]
        exact_scores = [compute_exact(g,p) for g,p in zip(gold_answers,pred_answers)]

        f1_mean = mean(f1_scores)
        exact_mean = mean(exact_scores)

        # NOTE(review): ml_params holds the current notebook configuration, which is not
        # necessarily the one each stored model was trained with — confirm this is intended
        for param_name, param_value in ml_params.items():
            mlflow.log_param(param_name, param_value)
        mlflow.log_param('split', split_test)
        mlflow.log_param('finetuned_name', os.path.basename(modelo_ajustado.name_or_path))
        mlflow.log_metric('f1', f1_mean)
        mlflow.log_metric('exact', exact_mean)
        print(modelo_ajustado.name_or_path)
        print('\tf1:', f1_mean)
        print('\texact:', exact_mean)

2024/02/15 16:13:23 INFO mlflow.tracking.fluent: Autologging successfully enabled for transformers.
2024/02/15 16:13:23 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
 14%|█▍        | 1/7 [04:37<27:43, 277.23s/it]

../Models/20240213-1643_escrituras_QA_1-epoch_3012-steps
	f1: 0.8830989511427486
	exact: 0.873473181093999


 29%|██▊       | 2/7 [09:13<23:02, 276.54s/it]

../Models/20240213-2010_escrituras_QA_2-epoch_6024-steps
	f1: 0.8838459227071735
	exact: 0.8749336165693044


 43%|████▎     | 3/7 [13:49<18:25, 276.49s/it]

../Models/20240214-0110_escrituras_QA_5-epoch_5271-steps
	f1: 0.8780158036477616
	exact: 0.8671003717472119


 57%|█████▋    | 4/7 [18:24<13:46, 275.62s/it]

../Models/20240214-0746_escrituras_QA_5-epoch_4518-steps
	f1: 0.8814044241018563
	exact: 0.8720127456186936


 71%|███████▏  | 5/7 [22:58<09:10, 275.20s/it]

../Models/20240214-1608_escrituras_QA_5-epoch_4518-steps
	f1: 0.8814044241018563
	exact: 0.8720127456186936


100%|██████████| 7/7 [27:32<00:00, 236.11s/it]

../Models/escrituras_QA_1-epoch_750-steps
	f1: 0.880648175345845
	exact: 0.8720127456186936



