In [None]:
!pip install -q -U \
  "bitsandbytes==0.46.0" \
  "transformers==4.41.2" \
  "peft==0.11.1" \
  "accelerate==0.31.0" \
  "datasets==2.19.2" \
  "trl==0.8.6" \
  "huggingface_hub" \
  "minijinja" \
  "triton==3.2.0"

In [None]:
import random
import numpy as np
import torch

def fix_seed(seed=42):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random``, NumPy's legacy global generator and PyTorch
    (CPU plus all CUDA devices), then switches cuDNN to deterministic mode
    with autotuning (benchmark) turned off.

    Args:
        seed: Integer seed applied to every generator. Defaults to 42.
    """
    # Same seeding order as before: random -> numpy -> torch (CPU) -> torch (CUDA).
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    cudnn_backend = torch.backends.cudnn
    cudnn_backend.deterministic = True
    cudnn_backend.benchmark = False

In [None]:
from huggingface_hub import login
import os

# Authenticate with the Hugging Face Hub. `login` is called without arguments,
# so no token is hard-coded in the notebook.
# NOTE(review): presumably required because the base model repository loaded
# further down is access-gated — confirm.
login()

In [None]:

import zipfile

# Archive that holds the fine-tuned LoRA adapter.
ADAPTER_ZIP_PATH = "Meta-Llama-3-8B-Instruct-synthetic-qlora-results/Llama3-8B_Qlora1.zip"

# Extract into the current working directory, exactly where `unzip` without `-d`
# placed the files. Unlike the shell call, a missing or corrupt archive raises
# here, so the success message below is only printed when extraction really
# worked, and re-running the cell overwrites existing files instead of waiting
# on an interactive "replace?" prompt.
# NOTE(review): the next cell loads the adapter from
# "Meta-Llama-3-8B-Instruct-synthetic-qlora-results/Llama3-8B_Qlora1"; confirm the
# archive's internal paths produce that folder when extracted here.
with zipfile.ZipFile(ADAPTER_ZIP_PATH) as adapter_archive:
    adapter_archive.extractall()

print("✅ Arquivo descompactado com sucesso!")

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel
import torch

# Configuration parameters
base_model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
ADAPTER_PATH = "Meta-Llama-3-8B-Instruct-synthetic-qlora-results/Llama3-8B_Qlora1"


print(f"Carregando o modelo base: {base_model_id}")

# Tokenizer of the base model. The MMLU evaluation cell further down passes
# `tokenizer` to `evaluate_mmlu`, but nothing in the notebook defined it before,
# so a fresh "Restart & Run All" ended in a NameError.
tokenizer = AutoTokenizer.from_pretrained(base_model_id)

# 4-bit NF4 quantization with bfloat16 compute.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# NOTE(review): `trust_remote_code=True` allows code shipped in the model
# repository to run locally; confirm it is actually needed for this checkpoint.
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
)

print(f"\nCarregando o adaptador LoRA de: {ADAPTER_PATH}")

# Best effort on purpose: if the adapter cannot be loaded or merged,
# `finetuned_model` is set to None and the evaluation cell checks for that
# before running.
try:
    finetuned_model = PeftModel.from_pretrained(base_model, ADAPTER_PATH)
    # Fold the LoRA weights into the base model and drop the PEFT wrapper.
    finetuned_model = finetuned_model.merge_and_unload()
    print("✅ Modelo fine-tuned pronto para avaliação!")
except Exception as e:
    print(f"❌ Erro ao carregar o adaptador: {e}")
    finetuned_model = None

In [None]:
import json
from tqdm import tqdm
import os


# Local JSON files containing the MMLU questions to evaluate.
MMLU_INPUT_FILES = [
    "mmlu_data_1.json",
    "mmlu_data_2.json",
    "mmlu_data_3.json",
]

# JSONL file that receives the results of the FINE-TUNED model.
MMLU_FT_OUTPUT_FILE = "mmlu_evaluation_results_finetuned.jsonl"

# Prompt template for the MMLU evaluation.
# It contains four fixed worked examples followed by the question to solve, and
# ends with "Answer:" so the text generated by the model starts with its answer.
# `evaluate_mmlu` fills the `{question}` and `{choices}` placeholders with
# `str.format`, so any literal brace added to this text would have to be doubled.
MMLU_PROMPT_TEMPLATE = """The following are multiple choice questions (with answers) about various subjects. Choose the single most likely answer.

--- BEGIN EXAMPLES ---
[FEW-SHOT EXAMPLE 1]
Question: Which of the following is a type of sedimentary rock?
Choices:
A. Granite
B. Marble
C. Sandstone
D. Slate
Answer: C
[FEW-SHOT EXAMPLE 2]
Question: What is the capital of Japan?
Choices:
A. Beijing
B. Seoul
C. Tokyo
D. Bangkok
Answer: C
[FEW-SHOT EXAMPLE 3]
Question: Solve for x: 2x + 3 = 7
Choices:
A. 1
B. 2
C. 3
D. 5
Answer: B
[FEW-SHOT EXAMPLE 4]
Question: Who wrote "Hamlet"?
Choices:
A. Charles Dickens
B. William Shakespeare
C. Leo Tolstoy
D. Mark Twain
Answer: B
--- END EXAMPLES ---

Now, solve the following question. Provide only the letter of the correct answer.

Question: {question}
Choices:
{choices}
Answer:"""


def _answer_to_letter(answer):
    """Normalize a gold answer (0-based index or single letter) to an uppercase letter, else "N/A"."""
    if isinstance(answer, bool):
        return "N/A"
    if isinstance(answer, int):
        return chr(65 + answer) if 0 <= answer < 26 else "N/A"
    if isinstance(answer, str):
        candidate = answer.strip().upper()
        if len(candidate) == 1 and "A" <= candidate <= "Z":
            return candidate
    return "N/A"


def _extract_choice_letter(text, num_choices):
    """Return the leading stand-alone letter of `text` if it names one of the choices, else None."""
    stripped = text.strip().lstrip("(").strip()
    if not stripped:
        return None
    letter = stripped[0].upper()
    # "B", "B.", "B)" or "B foo" count as an answer; a word such as "Both" does not.
    stands_alone = len(stripped) == 1 or not stripped[1].isalnum()
    if stands_alone and 0 <= ord(letter) - 65 < num_choices:
        return letter
    return None


def evaluate_mmlu(model, tokenizer, input_files, output_file, prompt_template, max_new_tokens=5):
    """Evaluate a model on MMLU-style questions stored in local JSON files.

    Each input file is expected to hold a JSON list of items with the keys
    ``question`` and ``choices`` and, optionally, ``answer`` (0-based index or
    letter) and ``subject``. One JSON line per question is written to
    ``output_file``; a question that fails is logged as
    ``{"index": ..., "error": ...}`` and the run continues.

    Args:
        model: Object exposing ``device`` and ``generate(**kwargs)``.
        tokenizer: Callable tokenizer that also exposes ``decode``.
        input_files: Iterable of JSON file paths. Missing or invalid files are
            skipped with a warning.
        output_file: Path of the JSONL results file. An existing file is replaced.
        prompt_template: ``str.format`` template with ``{question}`` and
            ``{choices}`` placeholders.
        max_new_tokens: Maximum number of tokens generated per question.

    Returns:
        ``None`` when no question could be loaded. Otherwise a dict with the keys
        ``total``, ``scored`` (questions that had a usable gold answer),
        ``correct``, ``errors`` and ``accuracy`` (``None`` when nothing was scored).
    """
    print("Iniciando avaliação MMLU...")

    all_questions = []
    for file_path in input_files:
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
                all_questions.extend(data)
                print(f" - Carregadas {len(data)} questões de '{file_path}'")
        except FileNotFoundError:
            print(f"AVISO: Arquivo '{file_path}' não encontrado. Pulando.")
        except json.JSONDecodeError:
            print(f"AVISO: Arquivo '{file_path}' não é um JSON válido. Pulando.")

    if not all_questions:
        print("Nenhuma questão foi carregada. Abortando a avaliação.")
        return None

    print(f"Total de {len(all_questions)} questões para avaliar.")

    if os.path.exists(output_file):
        os.remove(output_file)
        print(f"Arquivo de log antigo '{output_file}' removido.")

    # Greedy decoding: a benchmark score must not depend on sampling noise, and a
    # model's default generation config may enable sampling.
    generation_kwargs = {"max_new_tokens": max_new_tokens, "do_sample": False}
    # Pass an explicit pad token (falling back to EOS) when the tokenizer has one,
    # so `generate` does not have to guess it on every call.
    pad_token_id = getattr(tokenizer, "pad_token_id", None)
    if pad_token_id is None:
        pad_token_id = getattr(tokenizer, "eos_token_id", None)
    if pad_token_id is not None:
        generation_kwargs["pad_token_id"] = pad_token_id

    num_scored = 0
    num_correct = 0
    num_errors = 0

    # Open the log once and flush after every record: partial results still reach
    # the file if the run is interrupted, without reopening it per question.
    with open(output_file, 'w', encoding='utf-8') as log_file:
        for i, item in enumerate(tqdm(all_questions, desc="Avaliando MMLU")):
            try:
                question, choices = item['question'], item['choices']
                correct_letter = _answer_to_letter(item.get('answer'))
                formatted_choices = "\n".join(
                    f"{chr(65 + j)}. {choice}" for j, choice in enumerate(choices)
                )
                final_prompt = prompt_template.format(question=question, choices=formatted_choices)
                inputs = tokenizer(final_prompt, return_tensors="pt").to(model.device)

                with torch.no_grad():
                    outputs = model.generate(**inputs, **generation_kwargs)

                # Keep only the tokens produced after the prompt.
                prompt_length = inputs['input_ids'].shape[-1]
                generated_answer = tokenizer.decode(
                    outputs[0][prompt_length:], skip_special_tokens=True
                ).strip()

                predicted_letter = _extract_choice_letter(generated_answer, len(choices))
                has_gold = correct_letter != "N/A"
                is_correct = (predicted_letter == correct_letter) if has_gold else None

                result_log = {
                    "index": i, "subject": item.get('subject', 'N/A'), "question": question,
                    "choices": choices, "correct_answer_letter": correct_letter,
                    "generated_answer": generated_answer,
                    "predicted_letter": predicted_letter,
                    "is_correct": is_correct,
                }
                # Serialize inside the try so an unserializable item is logged as
                # an error instead of aborting the whole run.
                log_line = json.dumps(result_log, ensure_ascii=False)
            except Exception as e:
                print(f"\nErro ao processar a questão {i}. Erro: {e}")
                log_line = json.dumps({"index": i, "error": str(e)}, ensure_ascii=False)
                num_errors += 1
            else:
                if has_gold:
                    num_scored += 1
                    num_correct += int(is_correct)

            log_file.write(log_line + '\n')
            log_file.flush()

    accuracy = num_correct / num_scored if num_scored else None

    print(f"\n🎉 Avaliação MMLU concluída! Resultados salvos em '{output_file}'.")
    if accuracy is not None:
        print(f"Acurácia: {accuracy:.2%} ({num_correct}/{num_scored})")

    return {
        "total": len(all_questions),
        "scored": num_scored,
        "correct": num_correct,
        "errors": num_errors,
        "accuracy": accuracy,
    }



# Run the MMLU benchmark on the fine-tuned model, unless the adapter-loading
# cell left `finetuned_model` empty (it assigns None when loading fails).
if not finetuned_model:
    print("\nA variável 'finetuned_model' não foi carregada. A avaliação MMLU foi abortada.")
else:
    print("\n--- Iniciando avaliação MMLU no MODELO FINE-TUNED ---")
    evaluate_mmlu(
        finetuned_model,
        tokenizer,
        input_files=MMLU_INPUT_FILES,
        output_file=MMLU_FT_OUTPUT_FILE,
        prompt_template=MMLU_PROMPT_TEMPLATE,
    )