# **LLM Evaluation (no LLM-Judge)**
This notebook contains ONLY the evaluation of the Qwen models. The LLaMA models follow the same process; they are simply loaded in a separate notebook. The LLM-as-judge evaluation can be found in LLM_Judge.ipynb.

# **Libraries and Pre-requisites**

In [None]:
# OpenAI client (only needed for the LLM-as-judge step, which lives in LLM_Judge.ipynb)
!pip install openai

# Model loading: transformers plus a CUDA 12.4 build of torch / torchvision / torchaudio.
# NOTE(review): --index-url applies to every package on this command line, including
# transformers, which is left unpinned here — confirm it resolves as intended.
!pip install transformers torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 --index-url https://download.pytorch.org/whl/cu124
# Alternative without torchvision / torchaudio:
# !pip install transformers torch==2.6.0  --index-url https://download.pytorch.org/whl/cu124

# Adapters, fine-tuning utilities and the evaluation metrics (BLEU, ROUGE, BERTScore)
!pip install peft trl evaluate sacrebleu rouge_score bert_score

Looking in indexes: https://download.pytorch.org/whl/cu124
Collecting torch==2.6.0
  Downloading https://download.pytorch.org/whl/cu124/torch-2.6.0%2Bcu124-cp312-cp312-linux_x86_64.whl.metadata (28 kB)
Collecting torchvision==0.21.0
  Downloading https://download.pytorch.org/whl/cu124/torchvision-0.21.0%2Bcu124-cp312-cp312-linux_x86_64.whl.metadata (6.1 kB)
Collecting torchaudio==2.6.0
  Downloading https://download.pytorch.org/whl/cu124/torchaudio-2.6.0%2Bcu124-cp312-cp312-linux_x86_64.whl.metadata (6.6 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch==2.6.0)
  Downloading https://download.pytorch.org/whl/cu124/nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl (24.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m113.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-runtime-cu12==12.4.127 (from torch==2.6.0)
  Downloading https://download.pytorch.org/whl/cu124/nvidia_cuda_runtime_cu12-12.4.127-py3-none-many

In [None]:
# Env
from google.colab import userdata
import os

# Data
from google.colab import drive
import json
import copy
import html

# OpenAI
from openai import OpenAI

# HuggingFace
from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer, TrainingArguments
from datasets import load_dataset, Dataset
import torch
from peft import PeftModel, LoraConfig, get_peft_model, prepare_model_for_kbit_training, TaskType
from trl import SFTTrainer, SFTConfig
import evaluate


In [None]:
# Mount Google Drive; the LoRA adapters and the results file are read from /
# written to paths under ./drive/MyDrive later in this notebook.
drive.mount('/content/drive')

# Secrets come from the Colab secret store, never from literals in the notebook.
# The OpenAI key is exported through the environment.
OPENAI_API_KEY = userdata.get('OPENAI_API_KEY')
os.environ['OPENAI_API_KEY'] = OPENAI_API_KEY
# NOTE(review): HF_TOKEN is fetched but not referenced again in the visible cells.
HF_TOKEN = userdata.get('HF_TOKEN')

Mounted at /content/drive


In [None]:
# Reference-based text metrics, loaded in this order:
# sacreBLEU, ROUGE (used for ROUGE-L) and BERTScore.
bleu, rouge, bertscore = (
    evaluate.load(metric_name)
    for metric_name in ("sacrebleu", "rouge", "bertscore")
)

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

In [None]:
# The dataset only ships a "train" split, so carve it into a 90% training part
# and a 10% held-out part by slicing.
train_ds, test_ds = (
    load_dataset("ggeraldo/transformed_dataset", split=split_spec)
    for split_spec in ('train[:90%]', 'train[90%:]')
)

def preprocess(dataset, eval=False):
  """Build the bilingual (BL) and English-system-prompt (ENG) chat datasets.

  Every entry must provide ``entry['messages']``, where index 0 is taken as the
  system message, index 1 as the user message and the last item as the
  assistant message. The tool-related keys ``name``, ``tool_call_id`` and
  ``tool_calls`` are removed from every message that is emitted.

  The input entries are never modified: stripped copies of the messages are
  built instead, so the function can be called repeatedly on the same data and
  tolerates messages in which some of those keys are already absent.

  Args:
    dataset: iterable of dict-like entries with a ``'messages'`` list.
    eval: when False (training), each record holds the full conversation.
      When True, the BL record holds only the user message, the ENG record
      holds the English system prompt plus the user message, and both carry
      the assistant message under ``'reference'``. (The name shadows the
      builtin ``eval``; it is kept because callers pass it by keyword.)

  Returns:
    ``(BL_dataset, ENG_dataset)``: two lists of dicts, aligned by index.
  """
  dropped_keys = ('name', 'tool_call_id', 'tool_calls')
  eng_system_prompt = "You are an AI assistant specializing in Jeju Island tourism. We help users plan their Jeju trips and provide information on tourist attractions, restaurants, accommodations, and more. You can understand and reply in both Korean and English as a tour guide."

  def strip_tool_fields(message):
    # New dict without the tool-related keys; missing keys are simply ignored.
    return {key: value for key, value in message.items() if key not in dropped_keys}

  BL_dataset = []
  ENG_dataset = []
  for entry in dataset:
    turns = entry['messages']
    system = strip_tool_fields(turns[0])
    user = strip_tool_fields(turns[1])
    asst = strip_tool_fields(turns[-1])

    # ENG variant: same message layout, but with a fixed English system prompt.
    # It is derived from the *stripped* system message so that it has the same
    # keys in training and in evaluation mode.
    eng_system = copy.deepcopy(system)
    eng_system['content'] = eng_system_prompt

    if eval:
      # The assistant answer is held out as the reference for the metrics.
      container = {'messages': [user], 'reference': asst}
      eng_container = {'messages': [eng_system, user], 'reference': asst}
    else:
      container = {'messages': [system, user, asst]}
      eng_container = {'messages': [eng_system, user, asst]}

    BL_dataset.append(container)
    ENG_dataset.append(eng_container)
  return BL_dataset, ENG_dataset

# Training split: full conversations for both system-prompt variants.
BL_dataset, ENG_dataset = preprocess(train_ds)
train_BL_dataset = Dataset.from_list(BL_dataset)
train_ENG_dataset = Dataset.from_list(ENG_dataset)
# Held-out split. NOTE: this rebinds BL_dataset / ENG_dataset, so from here on
# they hold the evaluation records (prompt + 'reference'), not the training data.
BL_dataset, ENG_dataset = preprocess(test_ds, eval=True)
# Only the BL-style prompts (user message only) are turned into an evaluation Dataset.
eval_dataset = Dataset.from_list(BL_dataset)
# test_ENG_dataset = Dataset.from_list(ENG_dataset)


README.md:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

transformed_google_0_999.json:   0%|          | 0.00/8.58M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
# First evaluation record as a plain dict (BL_dataset holds the eval split at this point).
BL_dataset[0]

{'messages': [{'content': 'How far is Petrichor Jeju from Gureumri Beach? Is it possible to walk there with children?',
   'role': 'user'}],
 'reference': {'content': "It's about 350m from Petrichor Jeju to Gueomri Beach, a distance that can be easily walked in about 5 minutes. You can rest assured that it's a distance that is not too difficult to walk slowly with children.\n\n**Distance and Transportation Information**\n- Distance: Approximately 350m\n- Time Required: 5 minutes on foot\n- Difficulty: Easy (flat village road)\n- Stroller Accessible: Yes\n\n**Why It's Great for Walking with Children**\n\nGueompogu Village in Aewol, where Petrichor Jeju is located, is made up of quiet and safe village roads, making it a great place for a walk with children. The road from the guesthouse to the beach is flat, making it easy to push a stroller, and there's not much traffic, making it safe.\n\nGueomri Beach is a relatively unknown and quiet beach in Jeju, making it perfect for families to le

In [None]:
# Same record read back from the Dataset built from that list.
eval_dataset[0]

{'messages': [{'content': 'How far is Petrichor Jeju from Gureumri Beach? Is it possible to walk there with children?',
   'role': 'user'}],
 'reference': {'content': "It's about 350m from Petrichor Jeju to Gueomri Beach, a distance that can be easily walked in about 5 minutes. You can rest assured that it's a distance that is not too difficult to walk slowly with children.\n\n**Distance and Transportation Information**\n- Distance: Approximately 350m\n- Time Required: 5 minutes on foot\n- Difficulty: Easy (flat village road)\n- Stroller Accessible: Yes\n\n**Why It's Great for Walking with Children**\n\nGueompogu Village in Aewol, where Petrichor Jeju is located, is made up of quiet and safe village roads, making it a great place for a walk with children. The road from the guesthouse to the beach is flat, making it easy to push a stroller, and there's not much traffic, making it safe.\n\nGueomri Beach is a relatively unknown and quiet beach in Jeju, making it perfect for families to le

In [None]:
def generate_response(model, tokenizer, messages, max_new_tokens=512):
    """Greedy-decode a single assistant reply for a chat-format prompt.

    Args:
        model: causal LM exposing ``.device`` and ``.generate``.
        tokenizer: tokenizer exposing ``apply_chat_template``,
            ``convert_tokens_to_ids`` and ``decode``.
        messages: list of ``{'role': ..., 'content': ...}`` dicts.
        max_new_tokens: upper bound on the number of generated tokens.

    Returns:
        The decoded text of the newly generated tokens only (the prompt is
        sliced off and special tokens are skipped).
    """
    # return_dict=True also yields the attention mask. It is passed to
    # generate() explicitly so the mask does not have to be inferred from the
    # pad token.
    encoded = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        enable_thinking=False,  # forwarded to the chat template
        return_tensors="pt",
        return_dict=True,
        truncation=True
    )
    input_ids = encoded["input_ids"].to(model.device)
    attention_mask = encoded["attention_mask"].to(model.device)

    # Stop token: prefer "<|end_of_text|>" when the vocabulary has it, then the
    # tokenizer's EOS, then CLS / 0 as a last resort. A lookup that merely
    # resolves to the unknown-token id means the token does not exist, so it is
    # treated like a miss instead of being used as the stop token.
    eos_token_id = tokenizer.convert_tokens_to_ids("<|end_of_text|>")
    unk_token_id = getattr(tokenizer, "unk_token_id", None)
    if unk_token_id is not None and eos_token_id == unk_token_id:
        eos_token_id = None
    if eos_token_id is None:
        eos_token_id = tokenizer.eos_token_id  # fallback
    if eos_token_id is None:
        eos_token_id = tokenizer.cls_token_id or 0  # final fallback

    with torch.no_grad():
        # Deterministic greedy decoding. temperature / top_p are not passed
        # because they have no effect when do_sample=False.
        outputs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_new_tokens=max_new_tokens,
            eos_token_id=eos_token_id,
            do_sample=False
        )
    prompt_length = input_ids.shape[-1]
    return tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

# **Qwen Evals**

In [None]:
# Base checkpoint that the LoRA adapters below are attached to.
qwen_base_model_id = "Qwen/Qwen3-4B"
# dtype / device placement are left to the library ("auto").
# NOTE(review): presumably this picks the checkpoint dtype and places the
# weights on the available GPU — confirm on the target runtime.
qwen_base_model = AutoModelForCausalLM.from_pretrained(
    qwen_base_model_id,
    dtype="auto",
    device_map="auto"
)

qwen_tokenizer = AutoTokenizer.from_pretrained(qwen_base_model_id)
# Reuse the EOS token as the padding token.
qwen_tokenizer.pad_token = qwen_tokenizer.eos_token

config.json:   0%|          | 0.00/726 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/3.99G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/99.6M [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/3.96G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

In [None]:
# LoRA adapter fine-tuned with the bilingual (BL) setup, stored on Google Drive.
qwen_bl_adapters = './drive/MyDrive/hyperscale/models/qwen-bl-lora-adapter'
# NOTE(review): PeftModel.from_pretrained appears to inject the LoRA layers into
# qwen_base_model itself (the repr below shows lora.Linear inside the wrapped
# model), so qwen_base_model should not be treated as the plain base model
# after this cell — confirm against the PEFT documentation.
qwen_bl_model = PeftModel.from_pretrained(
    qwen_base_model,
    qwen_bl_adapters
)
# Inference mode (disables dropout); the last expression also displays the module tree.
qwen_bl_model.eval()

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): Qwen3ForCausalLM(
      (model): Qwen3Model(
        (embed_tokens): Embedding(151936, 2560)
        (layers): ModuleList(
          (0-35): 36 x Qwen3DecoderLayer(
            (self_attn): Qwen3Attention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=2560, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2560, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lora.Linear

In [None]:
# LoRA adapter fine-tuned with the English system prompt (ENG), stored on Google Drive.
qwen_eng_adapters = './drive/MyDrive/hyperscale/models/qwen-eng-lora-adapter-v2'
# qwen_base_model already carries the BL adapter from the previous cell.
# Wrapping that same object in a second PeftModel under the same default adapter
# name would make both wrappers share LoRA layers. The ENG weights are
# therefore loaded as a second, separately named adapter on the existing
# PeftModel and made the active one.
qwen_bl_model.load_adapter(qwen_eng_adapters, adapter_name="eng")
qwen_bl_model.set_adapter("eng")
# Same object as qwen_bl_model; only the active adapter differs. To evaluate
# the BL adapter again, call qwen_bl_model.set_adapter("default") first.
qwen_eng_model = qwen_bl_model
qwen_eng_model.eval()

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): Qwen3ForCausalLM(
      (model): Qwen3Model(
        (embed_tokens): Embedding(151936, 2560)
        (layers): ModuleList(
          (0-35): 36 x Qwen3DecoderLayer(
            (self_attn): Qwen3Attention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=2560, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2560, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lora.Linear

In [None]:
# Accumulator for one record per evaluated question (filled by the evaluation loop below).
results = []

In [None]:
# Generate an answer for every held-out question with the ENG-adapter model and
# score it against the reference answer with BLEU, ROUGE and BERTScore.
# (LLM-as-judge scoring is done separately in LLM_Judge.ipynb.)
#
# The loop walks eval_dataset — the same object the progress counter reports
# on — and starts from an empty list, so re-running this cell does not append
# duplicate records.
results = []
total = len(eval_dataset)
for i, entry in enumerate(eval_dataset):
  messages = entry['messages']
  reference = entry['reference']['content']
  prediction = generate_response(qwen_eng_model, qwen_tokenizer, messages)

  # Each metric is computed on this single prediction / reference pair.
  metrics = {
      'bleu': bleu.compute(predictions=[prediction], references=[reference]),
      'rouge': rouge.compute(predictions=[prediction], references=[reference]),
      'bertscore': bertscore.compute(predictions=[prediction], references=[reference], lang="en"),
  }

  results.append({
      'question_id': i,
      'question_text': messages[0]['content'],
      'answer': prediction,
      'reference': reference,
      'metrics': metrics,
  })

  print(f"{i + 1}/{total}")


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


1/100
2/100
3/100
4/100
5/100
6/100
7/100
8/100
9/100
10/100
11/100
12/100
13/100
14/100
15/100
16/100
17/100
18/100
19/100
20/100
21/100
22/100
23/100
24/100
25/100
26/100
27/100
28/100
29/100
30/100
31/100
32/100
33/100
34/100
35/100
36/100
37/100
38/100
39/100
40/100
41/100
42/100
43/100
44/100
45/100
46/100
47/100
48/100
49/100
50/100
51/100
52/100
53/100
54/100
55/100
56/100
57/100
58/100
59/100
60/100
61/100
62/100
63/100
64/100
65/100
66/100
67/100
68/100
69/100
70/100
71/100
72/100
73/100
74/100
75/100
76/100
77/100
78/100
79/100
80/100
81/100
82/100
83/100
84/100
85/100
86/100
87/100
88/100
89/100
90/100
91/100
92/100
93/100
94/100
95/100
96/100
97/100
98/100
99/100
100/100


In [None]:
# Destination on Google Drive for the per-question results of the ENG-adapter run.
results_path = "./drive/MyDrive/hyperscale/results/qwen_eng_results_v2.json"

In [None]:
# Create the output folder first so the write cannot fail with
# FileNotFoundError after the long evaluation run.
results_dir = os.path.dirname(results_path)
if results_dir:
    os.makedirs(results_dir, exist_ok=True)
with open(results_path, 'w', encoding='utf-8') as outfile:
    # ensure_ascii=False writes non-ASCII text as-is instead of \u escapes.
    json.dump(results, outfile, indent=2, ensure_ascii=False)