In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import transformers
import torch
import json
import time
import random
import numpy as np
from huggingface_hub import login

In [None]:
# The random seed must always be fixed to the same value, and only afterwards is the
# model loaded, so that the model always returns the same output.
def set_seed(seed=42):
    """Seed Python's `random`, NumPy and PyTorch (CPU and every CUDA device) with `seed`.

    cuDNN is additionally put in deterministic mode and its benchmark mode is
    switched off.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    cudnn = torch.backends.cudnn
    cudnn.benchmark = False
    cudnn.deterministic = True

# Seed every random number generator before the checkpoint is loaded.
set_seed(seed=42)

# Checkpoint under evaluation. To evaluate another one, replace the active
# value with one of the alternatives listed here:
#   "datificate/gpt2-small-spanish"
#   "BSC-LT/salamandra-2b-instruct"
#   "meta-llama/Llama-3.2-1B-Instruct"
#   "rsepulvedat/llama_v1_512_epoch_3"
#   "rsepulvedat/llama_v1_1024_epoch_3"
#   "rsepulvedat/sala_v1_512_epoch_3"
#   "rsepulvedat/sala_v1_1024_epoch_3"
model_id = "meta-llama/Llama-3.2-3B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(model_id)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

# Load the weights in bfloat16 and let `device_map="auto"` decide their placement.
model_load_options = {"torch_dtype": torch.bfloat16, "device_map": "auto"}
model = AutoModelForCausalLM.from_pretrained(model_id, **model_load_options)
model.eval()








In [None]:
# Load the evaluation set; the examples are stored under the top-level "data" key.
with open("1-test.json", encoding="utf-8") as f:
    test_payload = json.load(f)
test_dataset = test_payload["data"]

# Letras para las opciones
OPTION_LETTERS = ["A", "B", "C", "D", "E"]

def prompt_format(context, question, choices):
  prompt = f"Dado el siguiente contexto:\n{context}\nPregunta: {question}\nOpciones:\n" + \
         "\n".join([f"{OPTION_LETTERS[i]}. {opt}" for i, opt in enumerate(choices)]) + \
         "\nRespuesta correcta:"
  return prompt

# Unused earlier draft of a predict_answer helper. It generates text with the model (it does not score options by loss).
# def predict_answer(context, question, choices):
#     losses = []
#     prompt = prompt_format(context, question, choices)
#     inputs_ids = tokenizerLlama(prompt, return_tensors="pt", max_length=512).inputs_ids.to(modelLlama.device)

#     with torch.no_grad():
#         outputs = modelLlama.generate(**inputs_ids, max_new_tokens=5,do_sample=False)
#         generated_text = tokenizerLlama.batch_decode(outputs[0], skip_special_tokens=True)
#         return generated_text

In [None]:
# Evaluation: greedily generate an answer for every test item, map it to an
# option letter, score it against the correct answer and save all predictions.
SHOW_EXAMPLES = False  # set to True to print every question with its prediction


def _extract_option_letter(answer_text, valid_letters):
    """Return the option letter that `answer_text` opens with, or None.

    The letter only counts when it stands alone, i.e. it is followed by a
    non-alphanumeric character (".", ")", ":", whitespace, ...) or by the end
    of the text. This keeps an answer that starts with a word such as "El ..."
    or "De ..." from being read as option E or D.
    """
    first_char = answer_text[:1]
    following_char = answer_text[1:2]
    if first_char in valid_letters and not following_char.isalnum():
        return first_char
    return None


correct_predictions = 0
evaluated_questions = 0
skipped_questions = 0
total_questions = len(test_dataset)
predictions = []

start_time = time.time()

for item in test_dataset:
    context = item["context"]
    question = item["question"]
    choices = [choice["text"] for choice in item["choices"]]
    correct_answer = next(choice["text"] for choice in item["choices"] if choice["type"] == "correct answer")

    if len(choices) > len(OPTION_LETTERS):
        print(f"Demasiadas opciones para la pregunta: '{question}'")
        # Skipped items are counted separately so they do not distort the accuracy.
        skipped_questions += 1
        continue
    evaluated_questions += 1

    prompt = prompt_format(context, question, choices)
    # NOTE(review): prompts longer than 1024 tokens are truncated. If the tokenizer
    # truncates on the right, the question/options/"Respuesta correcta:" tail is
    # what gets dropped — confirm this is the intended behaviour.
    inputs = tokenizer(prompt, return_tensors="pt", max_length=1024, truncation=True).to(model.device)
    prompt_token_count = inputs["input_ids"].shape[-1]

    # Estimate the average option length in words; 1.5x words ~ tokens, plus 5 of margin.
    avg_words_per_option = np.mean([len(option.split()) for option in choices])
    estimated_tokens_needed = int(avg_words_per_option * 1.5) + 5

    with torch.no_grad():
        outputs = model.generate(**inputs,
                                 max_new_tokens=estimated_tokens_needed,
                                 do_sample=False,
                                 pad_token_id=tokenizer.pad_token_id)

    # Decode only the newly generated tokens. Slicing the decoded text by
    # len(prompt) is unreliable: the decoded prompt can differ from `prompt`
    # (truncation, tokenizer round-trip differences).
    generated_answer = tokenizer.decode(outputs[0][prompt_token_count:], skip_special_tokens=True).strip()

    prediction_letter = _extract_option_letter(generated_answer, OPTION_LETTERS[:len(choices)])

    if prediction_letter is not None:
        predicted_answer = choices[OPTION_LETTERS.index(prediction_letter)]
    else:
        predicted_answer = "[NO DETECTADA]"

    is_correct = prediction_letter is not None and predicted_answer == correct_answer
    if is_correct:
        correct_predictions += 1

    predictions.append({
        "context": context,
        "question": question,
        "choices": [f"{OPTION_LETTERS[i]}. {opt}" for i, opt in enumerate(choices)],
        "correct_answer": correct_answer,
        "generated_text": generated_answer,
        "predicted_answer": predicted_answer,
        "predicted_letter": prediction_letter,
        "is_correct": is_correct
    })

    if SHOW_EXAMPLES:
        print(f"\nPregunta: {question}")
        for i, option in enumerate(choices):
            print(f"  {OPTION_LETTERS[i]}. {option}")
        print(f"Generado: {generated_answer}")
        print(f"Predicción: {prediction_letter} → {predicted_answer}")
        print(f"Correcta: {correct_answer}")
        print(f"{'Correcto' if is_correct else 'Incorrecto'}")
        print("-" * 80)

# Metrics: accuracy is computed over the questions that were actually evaluated;
# an empty evaluation reports 0% instead of dividing by zero.
end_time = time.time()
accuracy = correct_predictions / evaluated_questions if evaluated_questions else 0.0
print(f"\nTiempo total: {end_time - start_time:.2f} segundos")
print(f"\nPreguntas evaluadas: {evaluated_questions}")
if skipped_questions:
    print(f"Preguntas omitidas: {skipped_questions}")
print(f"Correctas: {correct_predictions}")
print(f"Precisión: {accuracy:.2%}")

# Save the per-question results.
with open("predictionsLLAMA-3B-1024.json", "w", encoding="utf-8") as f:
    json.dump(predictions, f, indent=2, ensure_ascii=False)