<a href="https://colab.research.google.com/github/ashwath-tech/llama-3.2-grumpy-it-finetune/blob/main/Evaluation/Evaluator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -U transformers peft datasets bitsandbytes accelerate

In [None]:
from huggingface_hub import login
from google.colab import userdata
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer, BitsAndBytesConfig
from peft import PeftModel
import torch
import gc

In [None]:
# Read the Hugging Face access token stored as the Colab secret 'HF_TOKEN'
# and use it to authenticate this session with the Hugging Face Hub.
hf = userdata.get('HF_TOKEN')
login(token=hf)

In [None]:
# Evaluation prompts sent verbatim, one at a time, to the fine-tuned model.
# They are grouped by the behaviour each category is meant to probe; the
# grouping is informational only (the list itself is flat).
eval_dataset = [
    # --- Category 1: The "Classic" IT Problems (Should be grumpy but helpful) ---
    "My printer isn't working. It says 'PC Load Letter'.",
    "I forgot my password again. Can you reset it?",
    "The Wi-Fi is slow in the breakroom.",
    "My computer is making a weird buzzing noise.",
    "I deleted a file on the shared drive by accident.",
    "How do I turn on the projector in the conference room?",
    "My mouse is acting funny.",

    # --- Category 2: "PICNIC" Issues (Problem In Chair Not In Computer) ---
    # (Expect High Sarcasm here)
    "Where is the 'Any' key?",
    "I think I downloaded more RAM, but my PC is slower now.",
    "My cup holder (CD tray) is broken.",
    "I spilled a 'little' bit of soda on my laptop keyboard.",
    "Can you hack into my ex's Facebook for me?",
    "My screen is black. (Is the monitor on? No.)",
    "I accepted a meeting invite but I don't want to go. Can you delete it from the other person's calendar?",

    # --- Category 3: Vague / Low-Effort Tickets ---
    # (Model should demand more info angrily)
    "It doesn't work.",
    "Fix the internet.",
    "My email is weird.",
    "The thingy is broken.",
    "I need admin access. Now.",
    "System is down.",

    # --- Category 4: Actual Technical Questions ---
    # (Model should be competent but condescending)
    "What is the command to list all files in a directory in Linux?",
    "Explain the difference between TCP and UDP.",
    "How do I check my IP address on Windows?",
    "What is DNS propagation?",
    "Write a Python script to print 'Hello World'.",

    # --- Category 5: Off-Topic / "Not My Job" ---
    # (Model should Refuse)
    "Can you fix the coffee machine?",
    "What is the meaning of life?",
    "Do you think aliens exist?",
    "Can you recommend a good place for lunch?",
    "Who is the president of the United States?",
]

In [None]:
# System message for the evaluator ("judge") model. It defines the three
# grading criteria; the refusal check allows "NA" so that it matches the
# "Pass/Fail/NA" field of the JSON format requested in the user message.
system_prompt = '''You are an expert critic of roleplay AI characters. You are evaluating a specialized language model that has been fine-tuned to have the persona of a **"Cynical, Grumpy IT Systems Administrator."**

Your goal is to grade the model's response based on three criteria:
1.  **Persona Adherence (1-5):** Does the model sound like a burnt-out, sarcastic IT vet? (5 = Perfectly grumpy/sarcastic, 1 = Too polite, helpful, or robotic).
2.  **Technical Accuracy (1-5):** Is the advice actually correct, or at least a valid refusal? (5 = Correct command/advice, 1 = Hallucinated nonsense).
3.  **Refusal of Non-Technical Requests (Pass/Fail/NA):** If the user asks about non-IT topics (e.g., coffee, life, cooking), the model MUST refuse to answer. If it answers helpfully, it Fails. If the request is IT-related, this check does not apply: use NA.

Be harsh but fair. The model SHOULD be rude.'''

In [None]:
def getPrompt(question, grumpy_response):
  """Build the user message that asks the evaluator model to grade one exchange.

  Args:
    question: The prompt that was sent to the model under test.
    grumpy_response: The text that model produced in reply.

  Returns:
    A string that quotes both texts and asks for a critique in a fixed
    JSON layout (reasoning, persona_score, accuracy_score, refusal_check).
  """
  # The doubled braces are literal `{` / `}` in the f-string. Every field of
  # the example object is comma-separated so the judge is shown well-formed
  # JSON structure to imitate.
  return f'''Please evaluate the following interaction:

  **User Question:** "{question}"
  **Model Response:** "{grumpy_response}"

  Provide a structured critique and final scores in the following JSON format:
  {{
    "reasoning": "Explain why you gave these scores...",
    "persona_score": int 1-5,
    "accuracy_score": int 1-5,
    "refusal_check": "Pass/Fail/NA"
  }}'''

In [None]:
# Hub identifiers: the instruct base checkpoint and the PEFT adapter that is
# applied on top of it.
base_model_name = "meta-llama/Llama-3.2-3B-Instruct"
adapter_model_name = "NotSure123/grumpy-llama-3.2-3B"

# bitsandbytes settings: 4-bit loading, NF4 quantization type, float16 compute.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    load_in_4bit=True,
)

# The tokenizer comes from the base checkpoint, not from the adapter repo.
tokenizer = AutoTokenizer.from_pretrained(base_model_name)

# Load the quantized base model, then wrap it with the adapter weights.
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    device_map="auto",
    quantization_config=bnb_config,
)
model = PeftModel.from_pretrained(base_model, adapter_model_name)

In [None]:
# Clear caches: run Python's garbage collector, ask PyTorch to release its
# cached-but-unused CUDA memory, then wait for outstanding GPU work to finish.
# NOTE(review): `base_model` and `model` are still referenced (and used by the
# evaluation loop), so this cannot free the memory held by their weights.
gc.collect()
torch.cuda.empty_cache()
torch.cuda.synchronize()

In [None]:
# Evaluator ("judge") model: Qwen2.5-7B-Instruct, loaded with the same
# bitsandbytes quantization config that was used for the model under test.
eval_model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen2.5-7B-Instruct",
    quantization_config=bnb_config,
    device_map="auto",
)

In [None]:
# Tokenizer for the evaluator model, loaded from the same checkpoint id.
eval_tokenizer = AutoTokenizer.from_pretrained(
    "Qwen/Qwen2.5-7B-Instruct",
)
# Use the end-of-sequence token as the padding token.
eval_tokenizer.pad_token = eval_tokenizer.eos_token

In [None]:
def _generate_reply(chat_model, chat_tokenizer, chat_messages, max_new_tokens=512):
  """Run one chat exchange through a causal LM and return only the new text.

  Args:
    chat_model: Model exposing `generate` and `device`.
    chat_tokenizer: Tokenizer whose chat template matches `chat_model`.
    chat_messages: List of {"role": ..., "content": ...} dicts.
    max_new_tokens: Upper bound on the number of generated tokens.

  Returns:
    The decoded completion with the prompt tokens and special tokens removed.
  """
  encoded = chat_tokenizer.apply_chat_template(
      chat_messages,
      # End the prompt with the assistant header so the model answers the
      # user instead of continuing the user's turn.
      add_generation_prompt=True,
      # Ask explicitly for a dict (input_ids + attention_mask) rather than
      # relying on the library default, which has differed between versions.
      return_dict=True,
      return_tensors="pt",
  ).to(chat_model.device)

  # Passing the whole encoding forwards the attention mask along with the ids.
  output_ids = chat_model.generate(**encoded, max_new_tokens=max_new_tokens)

  # `generate` returns prompt + completion; slice the prompt off.
  prompt_length = encoded["input_ids"].shape[1]
  return chat_tokenizer.decode(output_ids[0][prompt_length:], skip_special_tokens=True)


# System message given to the model under test; identical for every prompt,
# so it is defined once outside the loop.
grumpy_system_prompt = "You are a highly competent but grumpy and sarcastic IT specialist who provides technically correct answers."

# One record per prompt, kept so the critiques can be inspected after the run
# instead of only being printed.
eval_results = []

for test in eval_dataset:
  # Step 1: get the fine-tuned model's answer to the test prompt.
  messages = [
      {"role": "system", "content": grumpy_system_prompt},
      {"role": "user", "content": test}
  ]
  response = _generate_reply(model, tokenizer, messages)

  # Step 2: have the evaluator model grade that answer.
  eval_messages = [
      {"role": "system", "content": system_prompt},
      {"role": "user", "content": getPrompt(test, response)}
  ]
  eval_response = _generate_reply(eval_model, eval_tokenizer, eval_messages)

  eval_results.append({
      "question": test,
      "response": response,
      "evaluation": eval_response,
  })
  print(eval_response)
