# **Libraries and Pre-requisites**

In [None]:
# OpenAI client: used for model responses and for the LLM-as-judge helper.
# NOTE(review): none of the installs in this cell pin a version.
!pip install openai

# LLM Loading, etc (local model stack; currently disabled)
# !pip install transformers torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 --index-url https://download.pytorch.org/whl/cu124
# !pip install transformers torch==2.6.0  --index-url https://download.pytorch.org/whl/cu124

# Evaluation metrics (evaluate, sacrebleu, rouge_score, bert_score) plus peft / trl
!pip install peft trl evaluate sacrebleu rouge_score bert_score

Collecting trl
  Downloading trl-0.26.2-py3-none-any.whl.metadata (11 kB)
Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting bert_score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting portalocker (from sacrebleu)
  Downloading portalocker-3.2.0-py3-none-any.whl.metadata (8.7 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading trl-0.26.2-py3-none-any.whl (518 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m518.9/518.9 kB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading evaluate-0.4.6-py3-no

In [None]:
# Env
from google.colab import userdata
import os

# Data
from google.colab import drive
import json
import copy
import html

# OpenAI
from openai import OpenAI

# HuggingFace
# from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer, TrainingArguments
from datasets import load_dataset, Dataset
# import torch
# from peft import PeftModel, LoraConfig, get_peft_model, prepare_model_for_kbit_training, TaskType
from trl import SFTTrainer, SFTConfig
import evaluate




In [None]:
# GDrive Data: mount Google Drive at /content/drive
# (the results file at the end of the notebook is written under drive/MyDrive).
drive.mount('/content/drive')

# OpenAI: read the API key from Colab secrets and export it as an environment
# variable; the OpenAI() clients below are constructed without an explicit key.
OPENAI_API_KEY = userdata.get('OPENAI_API_KEY')
os.environ['OPENAI_API_KEY'] = OPENAI_API_KEY
# NOTE(review): HF_TOKEN is fetched here but not referenced in the visible cells.
HF_TOKEN = userdata.get('HF_TOKEN')

Mounted at /content/drive


In [None]:
# Load each metric once up front; the evaluation loop calls .compute() on them per sample.
bleu = evaluate.load("sacrebleu") # For BLEU (sacreBLEU implementation)
rouge = evaluate.load("rouge") # For ROUGE-L
bertscore = evaluate.load("bertscore") # For BERTScore

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

In [None]:
# Both subsets are sliced from the 'train' split:
# the first 90% is used for training and the last 10% is held out for evaluation.
train_ds = load_dataset("ggeraldo/transformed_dataset", split='train[:90%]')
test_ds = load_dataset("ggeraldo/transformed_dataset", split='train[90%:]')

def preprocess(dataset, eval=False):
  """Build bilingual (BL) and English-system-prompt (ENG) chat records.

  For every entry, the first, second and last items of ``entry['messages']``
  are taken as the system, user and assistant messages. The keys ``name``,
  ``tool_call_id`` and ``tool_calls`` are dropped from every message that is
  emitted; all other keys are kept in their original order.

  Args:
    dataset: Iterable of dicts, each with a ``'messages'`` list of message
      dicts (at least two items).
    eval: When False (training), each record holds the full conversation.
      When True, BL records hold only the user message, ENG records hold the
      English system prompt plus the user message, and both carry the
      assistant message under ``'reference'``. (The name shadows the builtin
      but is kept for backward compatibility with existing callers.)

  Returns:
    Tuple ``(BL_dataset, ENG_dataset)`` of two lists of record dicts.

  Notes:
    Input entries are never mutated; every emitted message is a fresh copy.
    Messages that lack the tool-related keys are accepted as-is.
  """
  tool_keys = ('name', 'tool_call_id', 'tool_calls')
  eng_prompt = "You are an AI assistant specializing in Jeju Island tourism. We help users plan their Jeju trips and provide information on tourist attractions, restaurants, accommodations, and more. You can understand and reply in both Korean and English as a tour guide."

  def _strip(msg):
    # Copy a message without the tool-related keys, leaving the input untouched.
    return {key: copy.deepcopy(value) for key, value in msg.items() if key not in tool_keys}

  BL_dataset = []
  ENG_dataset = []
  for entry in dataset:
    turns = entry['messages']
    raw_system, raw_user, raw_asst = turns[0], turns[1], turns[-1]

    # ENG SETUP: same system message, English prompt text. Built from the
    # stripped copy so the tool keys are absent in eval mode as well.
    eng_system = _strip(raw_system)
    eng_system['content'] = eng_prompt

    if not eval:
      container = {'messages': [_strip(raw_system), _strip(raw_user), _strip(raw_asst)]}
      eng_container = {'messages': [eng_system, _strip(raw_user), _strip(raw_asst)]}
    else:
      # Evaluation records expose only the prompt side; the assistant turn is
      # kept separately as the reference answer.
      container = {'messages': [_strip(raw_user)], 'reference': _strip(raw_asst)}
      eng_container = {'messages': [eng_system, _strip(raw_user)], 'reference': _strip(raw_asst)}

    BL_dataset.append(container)
    ENG_dataset.append(eng_container)
  return BL_dataset, ENG_dataset

# Training data: full conversations for both system-prompt variants.
train_BL_dataset, train_ENG_dataset = (
  Dataset.from_list(records) for records in preprocess(train_ds)
)

# Held-out data: prompt-only records with the assistant turn kept as the reference.
BL_dataset, ENG_dataset = preprocess(test_ds, eval=True)
eval_dataset = Dataset.from_list(BL_dataset)
# test_ENG_dataset = Dataset.from_list(ENG_dataset)


README.md:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

transformed_google_0_999.json:   0%|          | 0.00/8.58M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
train_BL_dataset[0]

{'messages': [{'content': '당신은 제주도 관광 전문 AI 어시스턴트입니다. 사용자의 제주 여행 계획을 도와주고, 관광지, 맛집, 숙소 등에 대한 정보를 제공합니다.',
   'role': 'system'},
  {'content': "I'm planning to visit Masichju in Samdo-il-dong, Jeju-si next Tuesday. Can you check if they're open that day?",
   'role': 'user'},
  {'content': "Unfortunately, Masichju will be closed on Tuesday, January 16th.\n\n**Masichju Operation Information**\n\n- Business Hours: 11:00 - 21:00 every day\n- Closed: Every Tuesday\n- Location: Samdo-il-dong, Jeju-si\n\n**Recommended Visit Schedule**\n\nWe recommend that you adjust your visit plans to avoid Tuesdays. If you reschedule your visit to Monday or Wednesday, you will be able to taste Masichju's signature menu items.\n\n**Recommended Menu**\n\nHere are the must-try menu items when you visit Masichju:\n\n1. Stir-fried octopus on a hot plate (10,000-13,000 won)\n - A harmony of spicy yet chewy octopus and crunchy vegetables\n - You can adjust the spiciness so everyone can enjoy it\n\n2. Octopus Rice 

In [None]:
eval_dataset[0]

{'messages': [{'content': 'How far is Petrichor Jeju from Gureumri Beach? Is it possible to walk there with children?',
   'role': 'user'}],
 'reference': {'content': "It's about 350m from Petrichor Jeju to Gueomri Beach, a distance that can be easily walked in about 5 minutes. You can rest assured that it's a distance that is not too difficult to walk slowly with children.\n\n**Distance and Transportation Information**\n- Distance: Approximately 350m\n- Time Required: 5 minutes on foot\n- Difficulty: Easy (flat village road)\n- Stroller Accessible: Yes\n\n**Why It's Great for Walking with Children**\n\nGueompogu Village in Aewol, where Petrichor Jeju is located, is made up of quiet and safe village roads, making it a great place for a walk with children. The road from the guesthouse to the beach is flat, making it easy to push a stroller, and there's not much traffic, making it safe.\n\nGueomri Beach is a relatively unknown and quiet beach in Jeju, making it perfect for families to le

In [None]:
client = OpenAI()
def generate_response(model_id, message):
  """Ask an OpenAI model to answer the first message of a conversation.

  Args:
    model_id: Model identifier passed straight to the Responses API.
    message: List of chat message dicts; only ``message[0]['content']`` is
      sent as the model input.
      NOTE(review): any later messages in the list are ignored, so a leading
      system message would be sent in place of the user question — confirm
      this is intended for the ENG-style records.

  Returns:
    The ``output_text`` of the API response.
  """
  response = client.responses.create(
    model=model_id,
    # Read from the argument, not from a same-named notebook global.
    input=message[0]['content'],
    max_output_tokens=512,
    temperature=0.6,
    top_p=0.9,
  )
  return response.output_text

# Prompt template for the LLM-as-judge call. The {question}, {generated_answer}
# and {reference_answer} markers are filled in by llm_as_judge through plain
# text substitution; str.format() cannot be used on this template because the
# JSON example below contains literal braces.
JUDGE_PROMPT = """You are an expert judge evaluating answers from a Jeju tour guide LLM.

Question: {question}
Answer: {generated_answer}
Reference (optional): {reference_answer}

Score the answer from 1 (worst) to 5 (best) on the following criteria:
1. Relevance
2. Factual correctness
3. Helpfulness
4. Conciseness

Return your response in strict JSON format:

{
  "relevance": <int>,
  "factual_correctness": <int>,
  "helpfulness": <int>,
  "conciseness": <int>
}
"""

# NOTE(review): a client is already constructed earlier in this cell; this
# rebinds `client` to a second instance.
client = OpenAI()
def llm_as_judge(question, generated_answer, reference_answer):
    """Score a generated answer with an LLM judge.

    Fills JUDGE_PROMPT with the question, the generated answer and the
    reference answer, sends it to the judge model and parses the reply as JSON.

    Args:
        question: The user question that was asked.
        generated_answer: The answer produced by the model under evaluation.
        reference_answer: The reference answer shown to the judge.

    Returns:
        The parsed JSON reply (per the prompt, a dict of integer scores).

    Raises:
        ValueError: If the judge reply is not valid JSON, even after removing
            a surrounding Markdown code fence.
    """
    import re  # local import: only this helper needs it

    fields = {
        "question": question,
        "generated_answer": generated_answer,
        "reference_answer": reference_answer,
    }
    # Substitute all placeholders in a single pass so that placeholder-like
    # text inside an inserted question/answer is never substituted again.
    prompt = re.sub(
        r"\{(question|generated_answer|reference_answer)\}",
        lambda match: fields[match.group(1)],
        JUDGE_PROMPT,
    )

    response = client.responses.create(
        model="gpt-4.1-mini",  # good balance of cost & reasoning
        input=[
            {
                "role": "system",
                "content": "You are a strict evaluator. Output JSON only."
            },
            {
                "role": "user",
                "content": prompt
            }
        ],
        temperature=0,
        max_output_tokens=150,
    )

    # Extract text output
    output_text = response.output_text.strip()

    # Tolerate a reply wrapped in a Markdown fence (``` or ```json).
    payload = output_text
    fenced = re.fullmatch(r"```(?:json)?\s*(.*?)\s*```", output_text, flags=re.DOTALL | re.IGNORECASE)
    if fenced:
        payload = fenced.group(1)

    # Parse JSON safely, keeping the decode error as the cause for debugging.
    try:
        scores = json.loads(payload)
    except json.JSONDecodeError as err:
        raise ValueError(f"Invalid JSON from judge:\n{output_text}") from err

    return scores

# **OpenAI Evals**

In [None]:
# Model identifiers keyed by the short name used when selecting a model to evaluate.
models = dict(
  bl="ft:gpt-4.1-nano-2025-04-14:personal:bl:CsuvF7jo",
  eng="ft:gpt-4.1-nano-2025-04-14:personal:english:CtAHprZc",
  base="gpt-4.1-nano-2025-04-14",
)

In [None]:
results = []

In [None]:
# Generate an answer for every held-out question and score it against the
# reference with BLEU, ROUGE and BERTScore. One entry per question is appended
# to `results`.
# Iterates `eval_dataset` directly (the same object the progress counter
# reports on) rather than the temporary list it was built from.
for i, entry in enumerate(eval_dataset, start=1):

  messages = entry['messages']
  reference = entry['reference']['content']
  prediction = generate_response(models['eng'], messages)

  # results_judge = llm_as_judge(messages[0]['content'], prediction, reference)
  results_bleu = bleu.compute(predictions=[prediction], references=[reference])
  results_rouge = rouge.compute(predictions=[prediction], references=[reference])
  results_bertscore = bertscore.compute(predictions=[prediction], references=[reference], lang="en")

  result_entry = {
    'question_id': i - 1,  # ids stay zero-based
    'question_text': messages[0]['content'],
    'answer': prediction,
    'reference': reference,
    'metrics': {
      'bleu': results_bleu,
      'rouge': results_rouge,
      'bertscore': results_bertscore,
      # 'LLM_Judge': results_judge,
    },
  }
  results.append(result_entry)

  print(f"{i}/{len(eval_dataset)}")


1/100
2/100
3/100
4/100
5/100
6/100
7/100
8/100
9/100
10/100
11/100
12/100
13/100
14/100
15/100
16/100
17/100
18/100
19/100
20/100
21/100
22/100
23/100
24/100
25/100
26/100
27/100
28/100
29/100
30/100
31/100
32/100
33/100
34/100
35/100
36/100
37/100
38/100
39/100
40/100
41/100
42/100
43/100
44/100
45/100
46/100
47/100
48/100
49/100
50/100
51/100
52/100
53/100
54/100
55/100
56/100
57/100
58/100
59/100
60/100
61/100
62/100
63/100
64/100
65/100
66/100
67/100
68/100
69/100
70/100
71/100
72/100
73/100
74/100
75/100
76/100
77/100
78/100
79/100
80/100
81/100
82/100
83/100
84/100
85/100
86/100
87/100
88/100
89/100
90/100
91/100
92/100
93/100
94/100
95/100
96/100
97/100
98/100
99/100
100/100


In [None]:
results_path = "./drive/MyDrive/hyperscale/results/openai_eng_results.json"

In [None]:
# Persist all collected results as pretty-printed JSON.
# ensure_ascii=False writes non-ASCII characters as-is instead of \u escapes.
# NOTE(review): results_path is relative and its folder must already exist —
# presumably the working directory is /content with Drive mounted; confirm.
with open(results_path, 'w', encoding='utf-8') as outfile:
    json.dump(results, outfile, indent=2, ensure_ascii=False)