In [None]:
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback
import torch
import json
import time
import pandas as pd
import random
import numpy as np
from torch.utils.data import Dataset
from huggingface_hub import login


In [None]:
# The random seed must always be fixed to the same value, and the model loaded
# afterwards, so that repeated runs return the same results.
# SET RANDOM SEED
def set_seed(seed=42):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (the default generator and all CUDA devices), then switches cuDNN to its
    deterministic mode with auto-tuning (benchmark) disabled.

    Args:
        seed: Integer seed applied to every generator. Defaults to 42.
    """
    # Same seeding order as before: random -> numpy -> torch -> torch.cuda.
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False


set_seed(42)


# Checkpoint to fine-tune. Alternatives that were tried (swap the assignment
# below to use one of them):
#   "BSC-LT/salamandra-2b-instruct"
#   "meta-llama/Llama-3.2-3B-Instruct"
#   "datificate/gpt2-small-spanish"
#   "BSC-LT/Flor-6.3B-Instruct-4096"
model_id = "meta-llama/Llama-3.2-1B-Instruct"

# The EOS token doubles as the padding token for this tokenizer.
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token

# Weights are loaded in bfloat16 and placed automatically on the available
# devices.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=torch.bfloat16,
)


#ENTRENAMIENTO

In [None]:
OPTION_LETTERS = ["A", "B", "C", "D", "E"]
def convert_to_chat_format(input_path):
    """Convert a multiple-choice JSON file into prompt/target training records.

    The file must hold a top-level ``"data"`` list. Every item needs the keys
    ``"context"``, ``"question"`` and ``"choices"``; each choice is a dict with
    ``"text"`` and ``"type"``, and the right one has type ``"correct answer"``.

    Args:
        input_path: Path to the UTF-8 encoded JSON file.

    Returns:
        A list of dicts with the keys ``"system"``, ``"input"`` and
        ``"target"``. ``"target"`` is the correct option written exactly as it
        is listed in the prompt (``"<letter>. <text>"``), so the model learns
        to answer with the option letter that the evaluation code looks for.
        Malformed items are reported on stdout and skipped.
    """
    with open(input_path, "r", encoding="utf-8") as f:
        data = json.load(f)["data"]

    chat_formatted = []
    for item in data:
        try:
            context = item["context"]
            question = item["question"]
            choices = [choice["text"] for choice in item["choices"]]

            # Fail with a readable message instead of an opaque IndexError
            # when there are more options than letters.
            if len(choices) > len(OPTION_LETTERS):
                raise ValueError(
                    f"demasiadas opciones ({len(choices)}); máximo {len(OPTION_LETTERS)}"
                )

            # A default of None avoids a StopIteration whose message is empty.
            correct_index = next(
                (i for i, choice in enumerate(item["choices"]) if choice["type"] == "correct answer"),
                None,
            )
            if correct_index is None:
                raise ValueError("ninguna opción está marcada como 'correct answer'")

            correct_text = choices[correct_index]
            correct_letter = OPTION_LETTERS[correct_index]

            opciones = "\n".join([f"{OPTION_LETTERS[i]}. {opt}" for i, opt in enumerate(choices)])
            input_text = (
                f"Dado el siguiente contexto:\n{context}\n\n"
                f"Pregunta: {question}\n\n"
                f"Opciones:\n{opciones}\n\n"
                f"Selecciona la respuesta correcta:"
            )

            chat_formatted.append({
                "system": "Eres un asistente de inteligencia artificial especializado en responder a preguntas de opción múltiple. Tu tarea es responder correctamente a las preguntas que se te planteen.",
                "input": input_text,
                # Same "<letter>. <text>" shape used for the options above.
                "target": f"{correct_letter}. {correct_text}"
            })
        except (KeyError, IndexError, TypeError, ValueError) as e:
            # Best effort: one malformed entry must not abort the conversion.
            print(f"Error en una entrada: {e}")
            continue
    return chat_formatted


In [None]:
from torch.utils.data import Dataset

class CausalLMDataset(Dataset):
    """Dataset that turns chat-style records into causal-LM training examples.

    Each record is a dict with the keys ``"system"``, ``"input"`` and
    ``"target"``. The three strings are joined with single spaces, an
    end-of-sequence marker is appended, and the result is tokenized to a fixed
    length.

    Args:
        data: Sequence of record dicts.
        tokenizer: Callable tokenizer that accepts ``truncation``, ``padding``,
            ``max_length`` and ``return_tensors`` and returns a mapping of
            tensors with a leading batch dimension of 1.
        max_length: Length every example is padded or truncated to.
    """

    def __init__(self, data, tokenizer, max_length=512):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return the tokenized example at ``idx`` with a ``"labels"`` entry.

        ``labels`` equals ``input_ids`` except at padding positions, which are
        set to -100 so they do not contribute to the loss.
        """
        record = self.data[idx]

        # Explicit EOS so the model is supervised to stop after the answer.
        # This used to happen only by accident, through the loss on padding
        # tokens (pad == eos in this notebook).
        eos = getattr(self.tokenizer, "eos_token", None) or ""
        full_text = record["system"] + " " + record["input"] + " " + record["target"] + eos

        # NOTE(review): truncation cuts from the tokenizer's default side; if
        # that is the right side, an over-long context drops the target.
        enc = self.tokenizer(full_text, truncation=True, padding="max_length",
                             max_length=self.max_length, return_tensors="pt")
        enc = {k: v.squeeze(0) for k, v in enc.items()}

        labels = enc["input_ids"].clone()
        attention_mask = enc.get("attention_mask")
        if attention_mask is not None:
            # Mask by attention_mask rather than by token id: the pad id may
            # equal the EOS id, and the real EOS must stay supervised.
            labels[attention_mask == 0] = -100
        enc["labels"] = labels
        return enc


In [None]:
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback


# Build the training and validation datasets from their JSON files.
train_data = convert_to_chat_format("1-training.json")
dev_data = convert_to_chat_format("1-dev.json")
train_dataset, dev_dataset = (
    CausalLMDataset(records, tokenizer) for records in (train_data, dev_data)
)

# The timer covers trainer set-up plus the training run itself.
start_time = time.time()

training_args = TrainingArguments(
    output_dir="./model-finetuned",
    report_to="none",
    logging_steps=10,
    # Optimisation
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.1,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,
    # Evaluate and checkpoint once per epoch, then restore the best checkpoint.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

# Stop as soon as one evaluation fails to improve on the best one so far.
early_stopping = EarlyStoppingCallback(early_stopping_patience=1)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    callbacks=[early_stopping],
)
trainer.train()

end_time = time.time()
elapsed_time = end_time - start_time

# Report the elapsed time in minutes and seconds.
mins, secs = divmod(elapsed_time, 60)
print(f"Tiempo total de entrenamiento: {int(mins)} min {int(secs)} sec")

# Persist the fine-tuned weights together with the tokenizer.
trainer.save_model("modelo_finetuned")
tokenizer.save_pretrained("modelo_finetuned")

#EVALUACIÓN

In [None]:
# Option labels; presumably repeated here so that this cell can run without the
# training cells.
OPTION_LETTERS = ["A", "B", "C", "D", "E"]

# Reload the fine-tuned checkpoint saved by the training cell.
modelo_finetuned = "modelo_finetuned"
tokenizer = AutoTokenizer.from_pretrained(modelo_finetuned)
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(
    modelo_finetuned,
    device_map="auto",
    torch_dtype=torch.bfloat16,
).eval()

# Load the test set (alternative file used previously:
# "4-test-LLAMA1B-1024.json").
with open("1-test.json", encoding="utf-8") as test_file:
    test_dataset = json.load(test_file)["data"]

# Letters used to label the answer options
OPTION_LETTERS = ["A", "B", "C", "D", "E"]

def prompt_format(context, question, choices):
    """Build the evaluation prompt for one multiple-choice question.

    Args:
        context: Passage the question refers to.
        question: Question text.
        choices: Option texts, labelled in order with ``OPTION_LETTERS``.

    Returns:
        The prompt string, ending with ``"Respuesta correcta:"``.

    Raises:
        IndexError: If there are more choices than option letters.
    """
    # NOTE(review): this prompt differs from the one convert_to_chat_format
    # builds for training (no system text, single newlines, different final
    # instruction) — confirm the mismatch is intended.
    labelled_options = []
    for position, option_text in enumerate(choices):
        labelled_options.append(f"{OPTION_LETTERS[position]}. {option_text}")
    options_block = "\n".join(labelled_options)

    return (
        f"Dado el siguiente contexto:\n{context}\n"
        f"Pregunta: {question}\n"
        f"Opciones:\n{options_block}\n"
        "Respuesta correcta:"
    )



# Evaluation
def detect_option_letter(generated, choices):
    """Map the model's generated answer to an option letter.

    Checks, in order:
      1. a letter on its own or followed by ".", ")" or ":" (e.g. "B. ...");
      2. an answer that starts with the full text of an option (the longest
         such option wins);
      3. a letter followed by any other non-alphanumeric character;
      4. an answer that is the beginning of exactly one option's text.

    A bare ``startswith(letter)`` is not enough: any answer beginning with a
    word such as "El" or "Durante" would be read as option E or D.

    Args:
        generated: Text produced by the model for one question.
        choices: Option texts in the order they were shown in the prompt.

    Returns:
        The detected letter from ``OPTION_LETTERS``, or ``None``.
    """
    answer = generated.strip()
    if not answer:
        return None

    letters = OPTION_LETTERS[:len(choices)]
    starts_with_letter = answer[0] in letters
    if starts_with_letter and (len(answer) == 1 or answer[1] in ".):"):
        return answer[0]

    folded_answer = answer.casefold()
    folded_choices = [
        (letter, str(choice).strip().casefold())
        for letter, choice in zip(letters, choices)
    ]

    full_matches = [
        (len(text), letter)
        for letter, text in folded_choices
        if text and folded_answer.startswith(text)
    ]
    if full_matches:
        return max(full_matches, key=lambda match: match[0])[1]

    if starts_with_letter and not answer[1].isalnum():
        return answer[0]

    # The answer may have been cut short by max_new_tokens.
    partial_matches = [letter for letter, text in folded_choices if text.startswith(folded_answer)]
    if len(partial_matches) == 1:
        return partial_matches[0]
    return None


correct_predictions = 0
predictions = []

start_time = time.time()

for item in test_dataset:
    context = item["context"]
    question = item["question"]
    choices = [choice["text"] for choice in item["choices"]]
    correct_answer = next(choice["text"] for choice in item["choices"] if choice["type"] == "correct answer")

    if len(choices) > len(OPTION_LETTERS):
        print(f"Demasiadas opciones para la pregunta: '{question}'")
        continue

    prompt = prompt_format(context, question, choices)
    inputs_ids = tokenizer(prompt, return_tensors="pt", max_length=1024, truncation=True).to(model.device)
    # Number of prompt tokens actually fed to the model (after truncation).
    prompt_length = inputs_ids["input_ids"].shape[1]

    # Estimate the generation budget from the mean option length in words:
    # roughly 1.5 tokens per word, plus 10 tokens of margin.
    avg_words_per_option = np.mean([len(option.split()) for option in choices])
    estimated_tokens_needed = int(avg_words_per_option * 1.5) + 10

    with torch.no_grad():
        outputs = model.generate(**inputs_ids,
                                 max_new_tokens=estimated_tokens_needed,
                                 do_sample=False,
                                 pad_token_id=tokenizer.pad_token_id)

    # Decode only the new tokens. Slicing the decoded string by len(prompt)
    # breaks when the prompt was truncated or decoding does not reproduce the
    # prompt text character for character.
    generated_text = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
    respuesta_generada = generated_text.strip()

    prediction_letter = detect_option_letter(respuesta_generada, choices)
    if prediction_letter is not None:
        predicted_answer = choices[OPTION_LETTERS.index(prediction_letter)]
    else:
        predicted_answer = "[NO DETECTADA]"

    is_correct = predicted_answer == correct_answer
    if is_correct:
        correct_predictions += 1

    predictions.append({
        "context": context,
        "question": question,
        "choices": [f"{OPTION_LETTERS[i]}. {opt}" for i, opt in enumerate(choices)],
        "correct_answer": correct_answer,
        "generated_text": respuesta_generada,
        "predicted_answer": predicted_answer,
        "predicted_letter": prediction_letter,
        "is_correct": is_correct
    })


# Metrics
end_time = time.time()
# Only questions that were actually evaluated count; skipped ones are excluded.
total_questions = len(predictions)
accuracy = correct_predictions / total_questions if total_questions else 0.0
print(f"\nTiempo total: {end_time - start_time:.2f} segundos")
print(f"\nPreguntas evaluadas: {total_questions}")
print(f"Correctas: {correct_predictions}")
print(f"Precisión: {accuracy:.2%}")

# Save the per-question results
with open("2-predictionsLlama1B.json", "w", encoding="utf-8") as f:
    json.dump(predictions, f, indent=2, ensure_ascii=False)