In [None]:

import os
import json
import re
from datetime import datetime
from langchain_openai import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.evaluation.scoring.eval_chain import ScoreStringResultOutputParser
from dotenv import load_dotenv
load_dotenv()

# LangSmith-related environment variables, set unconditionally so they
# replace any values already present in the process environment.
os.environ.update(
    {
        "LANGSMITH_TRACING": "true",
        "LANGSMITH_ENDPOINT": "https://api.smith.langchain.com",
        "LANGCHAIN_PROJECT": "scratch",
    }
)

# Single chat-model instance shared by the evaluation helpers in this notebook.
# temperature=0 asks for the least-random sampling the model offers.
LLM = ChatOpenAI(
    temperature=0,
    model="gpt-4o-mini",
)


In [None]:

# Question templates keyed by category; each template has a single
# ``{city}`` placeholder to be filled in with ``str.format``.
PROMPTS = dict(
    [
        ("Unusual", "What are the best unusual attractions and things to do in {city}?"),
        ("Food", "What are the best local food to eat in {city}?"),
    ]
)

# Scoring rubric handed to the judge model as the "Evaluation Criteria"
# section of the prompt. It defines three 0-5 criteria and asks for the
# final 1-10 score in double brackets (e.g. [[9]]).
CRITERIA_TEXT_SCORE = "".join(
    [
        # --- a) Correctness: overlap with the reference answer ---
        "a) **Correctness** – Does the answer provide recommendations that *match* the reference response?\n",
        "- 0: No overlap with reference ideas; mostly generic or irrelevant\n",
        "- 1: One idea overlaps partially with a reference point\n",
        "- 2: Multiple partial overlaps, but many important ideas missing\n",
        "- 3: Substantial overlap with reference, but some key omissions\n",
        "- 4: Near-complete coverage of specific reference items\n",
        "- 5: Covers all reference ideas, including less obvious ones\n",
        "\n",
        # --- b) Interesting: only the two end-points are anchored ---
        "b) **Interesting** – Are the recommendations offbeat or non-mainstream?\n",
        "- 0: Entirely standard tourist traps\n",
        "- 5: Entirely unusual, creative, or niche\n",
        "\n",
        # --- c) Believability: only the two end-points are anchored ---
        "c) **Believability** – Does the response justify why each recommendation is interesting?\n",
        "- 0: No justification provided\n",
        "- 5: Every recommendation well-justified with insight\n",
        "\n",
        # --- Aggregation and required output format ---
        "Sum the three criterion scores (0–15) and map to a final score (1–10).\n",
        "Output each criterion score, and on a new line, output the final score wrapped in double brackets, e.g. [[9]].",
    ]
)


In [None]:

def evaluate_response(input_text, prediction, reference, criteria_text, llm=None):
    """Ask a chat model to grade ``prediction`` against ``reference``.

    The four text arguments are laid out as markdown-style sections
    (``### Input``, ``### Submission``, ``### Reference``,
    ``### Evaluation Criteria``) in a single prompt, the prompt is sent to
    the model, and the reply text is handed to
    ``ScoreStringResultOutputParser``.

    Args:
        input_text: The question that was asked.
        prediction: The answer being graded.
        reference: The reference answer to compare against.
        criteria_text: Rubric and output-format instructions for the judge.
            The parser looks for a double-bracketed score, so the rubric
            should request one (e.g. ``[[7]]``).
        llm: Optional object exposing ``invoke(prompt)`` whose result has a
            ``.content`` attribute. Defaults to the module-level ``LLM``,
            which keeps existing four-argument calls working unchanged and
            lets callers swap in another model or a stub.

    Returns:
        Whatever ``ScoreStringResultOutputParser().parse`` returns for the
        reply text (per the LangChain docs, a dict with ``"reasoning"`` and
        ``"score"`` keys -- confirm against the installed version).

    Raises:
        Any exception raised by the model call or by the parser is
        propagated (the parser is expected to reject replies without a
        valid double-bracketed score -- confirm against LangChain docs).
    """
    # Resolve the judge at call time so a later rebinding of the global
    # LLM is still honoured when no explicit model is passed.
    judge = LLM if llm is None else llm

    prompt = (
        f"### Input\n{input_text}\n\n"
        f"### Submission\n{prediction}\n\n"
        f"### Reference\n{reference}\n\n"
        f"### Evaluation Criteria\n{criteria_text}\n\n"
    )
    response = judge.invoke(prompt)
    return ScoreStringResultOutputParser().parse(response.content)


In [None]:

# Hand-written example case for the judge: the submission lists different
# attractions than the reference does.
reference = "Explore lava caves, WWII tunnels, and behind-the-scenes tours in Auckland."
submission = "Auckland has many attractions including Sky Tower, harbor cruises, and Waiheke Island vineyards."

# Fill the "Unusual" question template for the city under test.
unusual_template = PROMPTS["Unusual"]
input_text = unusual_template.format(city="Auckland")

# Rubric the judge should apply to this case.
criteria = CRITERIA_TEXT_SCORE


In [None]:

# Run the judge on the example case and show the parsed result.
result = evaluate_response(
    input_text=input_text,
    prediction=submission,
    reference=reference,
    criteria_text=criteria,
)
print(result)
