In [None]:
import pandas as pd
from langchain.chat_models import init_chat_model

# --- Judge model: Claude on Amazon Bedrock via the Converse provider ---
# Alternative model ids kept here for quick switching:
#   "us.anthropic.claude-3-7-sonnet-20250219-v1:0"
#   "us.anthropic.claude-sonnet-4-20250514-v1:0"
judge_llm = init_chat_model(
    model="us.anthropic.claude-sonnet-4-5-20250929-v1:0",
    model_provider="bedrock_converse",
    region_name="us-east-1",
    # Sampling settings for the judge: temperature 0 and a 4096-token cap.
    temperature=0.0,
    max_tokens=4096,
    # top_p=1.0,
)

In [None]:
import pandas as pd

# --- Load ---
# df  : system output (one row per prompt with its recommendations text)
# exp : benchmark queries, optionally with a clarification ground-truth column
df  = pd.read_csv("exp1_300m_rerank600m_rev_eval.csv")[["prompt", "recommendations"]]
exp = pd.read_csv("final_benchmark.csv")

# --- Ensure we have Clarification Ground Truth; create blank if missing ---
# The header is matched ignoring case and surrounding whitespace, then renamed to
# the canonical spelling. Downstream code looks the column up by this exact name,
# so keeping a variant header (e.g. "clarification ground truth ") would make every
# lookup miss and silently drop the clarification text from the judge prompt.
clar_col = "Clarification Ground Truth"
if clar_col not in exp.columns:
    variant = next(
        (c for c in exp.columns if str(c).strip().lower() == clar_col.lower()),
        None,
    )
    if variant is None:
        # No such column at all: add an empty one so later steps need no special case.
        exp[clar_col] = ""
    else:
        exp = exp.rename(columns={variant: clar_col})

# Keep only needed cols from exp
exp = exp[["query", clar_col]]

# --- Canonical join key: trimmed, single-spaced, lowercase text ---
def canon(s: pd.Series) -> pd.Series:
    """Normalize a text column into a key suitable for exact-match joins.

    Every value is cast to ``str``, leading/trailing whitespace is removed,
    each run of whitespace is collapsed to one space, and the result is
    lowercased. The returned Series keeps the index of ``s``.
    """
    as_text = s.astype(str)
    trimmed = as_text.str.strip()
    single_spaced = trimmed.str.replace(r"\s+", " ", regex=True)
    return single_spaced.str.lower()

exp["_key"] = canon(exp["query"])
df["_key"]  = canon(df["prompt"])

# --- Collapse duplicates on the right (df) so the merge is many-to-one ---
# Keep first occurrence; rename recommendations -> answer
df_uni = (
    df.sort_index()
      .drop_duplicates(subset=["_key"], keep="first")
      .rename(columns={"recommendations": "answer"})
)

# --- Left-merge from exp -> df_uni ---
# Right-hand keys are unique (enforced by validate="many_to_one"), so the result
# has exactly one row per benchmark query, in the benchmark's order.
merged = exp.merge(
    df_uni[["_key", "prompt", "answer"]],
    on="_key",
    how="left",
    validate="many_to_one"
).drop(columns=["_key"])

# --- Final columns & fill for missing values ---
# Unmatched queries get an empty prompt/answer. The clarification column is filled
# as well: blank CSV cells are read as NaN, and NaN stringifies to the literal text
# "nan", which would otherwise end up inside the judge prompt as if it were a
# real clarification.
final = merged[["query", "prompt", "answer", clar_col]].fillna(
    {"prompt": "", "answer": "", clar_col: ""}
)

final


In [None]:
import pandas as pd
from typing import Mapping, Optional

# Instruction header sent to the judge model ahead of every (query, answer) pair.
# Adjacent string literals are concatenated by Python with nothing in between, so
# every visual line that should end a line in the prompt needs its own "\n".
# The output-format section previously lacked them, which fused the bullets,
# "Example:" and the closing instruction into one run-on line.
RELEVANCE_HEADER = (
    "You are a Relevance Judge for HOTEL RECOMMENDATIONS.\n"
    "Evaluate ONLY using the provided hotel descriptions and reviews (ignore any outside knowledge).\n\n"
    "Task: Rate how well the ANSWER addresses the USER QUERY on a 1–5 scale:\n"
    "1 = Not relevant at all — completely misses the user's needs.\n"
    "2 = Slightly relevant — touches minor aspects but not the core requirements.\n"
    "3 = Moderately relevant — addresses some key points but misses important requirements.\n"
    "4 = Very relevant — covers most requirements well, with minor omissions.\n"
    "5 = Perfectly relevant — comprehensively addresses all requirements with appropriate detail.\n\n"
    "When evaluating, consider:\n"
    "• The answer represents the top 3 retrieved documents corresponding to the top 3 hotel candidates — these documents are raw text.\n"
    "• Focus on evaluating the knowledge contained in these documents and how well it satisfies the user's stated requirements, rather than the presentation quality or completeness of the text.\n\n"

    "Output format: Return ONLY a valid JSON object with two fields:\n"
    "- score: an integer from 1 to 5\n"
    "- explanation: a brief explanation for the chosen score\n\n"
    "Example:\n"
    '{\n  "score": 4,\n  "explanation": "The answer is mostly correct and relevant but misses a minor detail."\n}\n\n'
    "Do not include any text outside the JSON object."
)

def _nz(x: Optional[str]) -> str:
    return ("" if x is None else str(x)).strip()

def build_prompt_exact(
    row: Mapping,
    query_col: str = "query",
    answer_col: str = "answer",
    clar_col: str = "Clarification Ground Truth",
) -> str:
    """Assemble the full judge prompt for a single row.

    The prompt is ``RELEVANCE_HEADER`` followed by three sections, each
    introduced by a line of asterisks: the user's query, the answer, and the
    user's clarification.

    Args:
        row: Mapping-like record; values are fetched with ``row.get``.
        query_col: Key holding the user query.
        answer_col: Key holding the answer text to be judged.
        clar_col: Key holding the clarification text.

    Returns:
        The prompt string. When the clarification normalizes to an empty
        string, a fixed placeholder sentence is inserted instead.
    """
    divider = "**********\n"

    user_query = _nz(row.get(query_col))
    judged_answer = _nz(row.get(answer_col))
    clarification = _nz(row.get(clar_col))
    if not clarification:
        clarification = "(no implicit explanation provided)"

    # Note the spacing: the query section ends with a blank line, the answer
    # and clarification sections end with a single newline.
    pieces = [
        f"{RELEVANCE_HEADER}\n\n",
        divider,
        "User's query:\n",
        f"{user_query}\n\n",
        divider,
        "Answer:\n",
        f"{judged_answer}\n",
        divider,
        "Clarification of the user:\n",
        f"{clarification}\n",
    ]
    return "".join(pieces)

def add_prompt_column_exact(
    df: pd.DataFrame,
    query_col: str = "query",
    answer_col: str = "answer",
    clar_col: str = "Clarification Ground Truth",
    new_col: str = "prompt"
) -> pd.DataFrame:
    """Return a copy of ``df`` with one judge prompt per row in ``new_col``.

    The input frame is not modified. If ``new_col`` already exists in ``df``
    (the default name is ``"prompt"``), that column is overwritten in the
    returned copy.

    Args:
        df: Source frame with the query, answer and clarification columns.
        query_col: Name of the query column.
        answer_col: Name of the answer column.
        clar_col: Name of the clarification column.
        new_col: Name of the column that receives the generated prompts.
    """
    def _prompt_for(record):
        # Each row is handed to the prompt builder as a Series.
        return build_prompt_exact(record, query_col, answer_col, clar_col)

    result = df.copy()
    result[new_col] = result.apply(_prompt_for, axis=1)
    return result


In [None]:
import json
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm.auto import tqdm

def parse_json_maybe(s: str):
    """Best-effort conversion of a model reply into parsed JSON.

    Steps:
      1. Strip surrounding whitespace and an optional Markdown code fence.
         The opening fence may be bare or tagged ``json`` in any letter case;
         the opener and closer are removed independently, so a reply that
         only has one of them is still handled.
      2. Try to parse the remainder as JSON.
      3. If that fails, try the span from the first ``{`` to the last ``}``;
         this recovers an object that the model wrapped in extra prose.

    Returns:
        The parsed JSON value, or the fence-stripped text when nothing parses.
    """
    fence = "```"
    clean = s.strip()
    if clean.startswith(fence):
        clean = clean[len(fence):]
        # Optional language tag directly after the opening fence.
        if clean[:4].lower() == "json":
            clean = clean[4:]
    clean = clean.removesuffix(fence).strip()

    try:
        return json.loads(clean)
    except (ValueError, RecursionError):
        pass  # not pure JSON; try to salvage an embedded object below

    start, end = clean.find("{"), clean.rfind("}")
    if 0 <= start < end:
        try:
            return json.loads(clean[start:end + 1])
        except (ValueError, RecursionError):
            pass

    return clean  # fall back to raw text

# If judge_llm isn't thread-safe, uncomment the lock below.
# from threading import Lock
# _invoke_lock = Lock()

def call_judge(i, prompt):
    """Send one prompt to the judge model and return ``(i, result)``.

    ``i`` is passed through untouched so the caller can put the result back
    at the right position when replies arrive out of order.

    ``result`` is whatever ``parse_json_maybe`` produces for the reply text.
    Any exception raised while invoking or parsing is captured and returned
    as ``{"error": "<message>"}`` instead of propagating, so a single failed
    call does not abort a batch.
    """
    try:
        # If judge_llm is not thread-safe, guard this call with a lock.
        response = judge_llm.invoke(prompt)
        content = getattr(response, "content", str(response))

        # Message content is not always a plain string: it can be a list of
        # content blocks (bare strings or dicts with a "text" entry). Flatten
        # the textual blocks so the parser always receives a str.
        if isinstance(content, list):
            text_parts = []
            for block in content:
                if isinstance(block, str):
                    text_parts.append(block)
                elif (
                    isinstance(block, dict)
                    and block.get("type", "text") == "text"
                    and "text" in block
                ):
                    text_parts.append(str(block["text"]))
            content = "".join(text_parts)

        return i, parse_json_maybe(content)
    except Exception as e:
        # return the error payload so you can inspect failures later
        return i, {"error": str(e)}

# Build one judge prompt per row. The clarification column name is forwarded
# explicitly: the loading cell detects that column ignoring case/whitespace, so
# its real header can differ from the function's default, and relying on the
# default would then silently drop every clarification from the prompts.
final_with_prompts = add_prompt_column_exact(final, clar_col=clar_col)
prompts = final_with_prompts["prompt"].tolist()

# Pre-sized so each verdict can be stored at its row position even though the
# futures complete in arbitrary order.
evaluations = [None] * len(prompts)

with ThreadPoolExecutor(max_workers=4) as ex:
    futures = [ex.submit(call_judge, i, p) for i, p in enumerate(prompts)]
    for fut in tqdm(as_completed(futures), total=len(futures), desc="Judging"):
        i, result = fut.result()
        evaluations[i] = result
