In [None]:
import json
from transformers import AutoTokenizer,BitsAndBytesConfig,AutoModelForCausalLM
from sklearn.metrics import precision_score, recall_score, f1_score
import torch

In [None]:
device = "cuda" if torch.cuda.is_available() else "cpu"
test_data = []
with open("test.jsonl", "r", encoding="utf-8") as file:
    for line in file:
        test_data.append(json.loads(line.strip()))


In [4]:
TEMPLATE_PROMPT = (
    """
    You are a useful assistant and you will heed all the instructions asked of you.
    Instructions: You must respond with ONLY a single letter: A, B, C, D, E, or F. If the correct answer corresponds to one of the given options, respond with its letter (e.g., A, B, C, etc.). If the question cannot be answered or if no correct option is available, respond with 'F'. Do not explain, do not add reasoning, do not repeat the question. Only respond with the letter.

Example:
Question: ¿Cuál es la capital de Francia?
Options:
A. Berlín
B. Madrid
C. París
D. Roma
E. Londres
Answer: C

Now, answer the following question:

Question: {question}
Options:
{options}
Answer:"""
)

In [None]:
def load_model_and_tokenizer(model_path):
    tokenizer = AutoTokenizer.from_pretrained(model_path,trust_remote=True)
    model = AutoModelForCausalLM.from_pretrained(model_path,
                                                load_in_4bit=True,
                                                optimize_model=True,
                                                trust_remote_code=True,
                                                use_cache=True)
    model = model.half().to(device)
    return model, tokenizer

In [6]:
vanilla_model_path = "meta-llama/Llama-3.2-3B-Instruct"
fine_tuned_model_path = "../finetuned/merged-1244"

In [7]:
vanilla_model, vanilla_tokenizer = load_model_and_tokenizer(vanilla_model_path)
fine_tuned_model, fine_tuned_tokenizer = load_model_and_tokenizer(fine_tuned_model_path)

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  6.58it/s]
2024-11-16 14:15:37,629 - INFO - Converting the current model to sym_int4 format......
Loading checkpoint shards: 100%|██████████| 3/3 [00:02<00:00,  1.39it/s]
2024-11-16 14:15:52,869 - INFO - Converting the current model to sym_int4 format......


In [9]:
import string
import re
def extract_letter_response(response):
    """
    Extracts the first single uppercase letter (A-F) surrounded by spaces from the response.
    If no valid letter is found, returns 'F'.
    """
    response = re.sub(f"[{string.punctuation}]", " ", response)
    matches = re.findall(r"\b[A-F]\b", response)
    return matches[0] if matches else "F"


In [10]:
def generate_response(model, tokenizer, question, options=None):
    formatted_options = "\n".join([f"{key}. {value}" for key, value in options.items()])
    prompt = TEMPLATE_PROMPT.format(question=question, options=formatted_options)
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    outputs = model.generate(inputs.input_ids, max_new_tokens=120, temperature=0.7)
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    response = response.strip().split("\n")[-1].strip() 

    return extract_letter_response(response)

In [11]:
def evaluate_model(model, tokenizer, dataset):
    exact_match = 0
    correct_predictions = 0
    total = len(dataset)
    
    y_true = []
    y_pred = []
    
    for entry in dataset:
        question = entry["question"]
        answer_idx = entry["answer_idx"]
        options = entry.get("options", None)
        
        generated_response = generate_response(model, tokenizer, question, options)


        if generated_response == answer_idx:
            exact_match += 1
            correct_predictions += 1
        

        y_true.append(answer_idx)
        y_pred.append(generated_response)
    

    em_score = exact_match / total
    accuracy = correct_predictions / total
    f1 = f1_score(y_true, y_pred, average='weighted', zero_division=0)
    precision = precision_score(y_true, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_true, y_pred, average='weighted', zero_division=0)
    
    return {
        "exact_match": em_score,
        "accuracy": accuracy,
        "f1_score": f1,
        "precision": precision,
        "recall": recall
    }

In [None]:
print("Evaluando modelo vanilla...")
vanilla_results = evaluate_model(vanilla_model, vanilla_tokenizer, test_data)
vanilla_results['model']="Llama-3.2-3B-Instruct"

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Evaluando modelo vanilla...


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's 

In [None]:
print("Evaluando modelo fine-tuned...")
fine_tuned_results = evaluate_model(fine_tuned_model, fine_tuned_tokenizer, test_data)
fine_tuned_results['model']="Fine-tuned LLaMA"

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Evaluando modelo fine-tuned...


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's 

In [None]:
import pandas as pd
results_df = pd.DataFrame([vanilla_results, fine_tuned_results])

results_csv_path = "evaluation_results.csv"
results_df.to_csv(results_csv_path, index=False)