In [None]:
import os
import pandas as pd
import nest_asyncio
# Apply the nest_asyncio patch before anything else runs.
# NOTE(review): presumably needed because the notebook kernel already owns a running event loop — confirm.
nest_asyncio.apply()
# Point Phoenix at a collector on this machine (port 6006). This must be set
# before the Phoenix client is created in the next cell.
os.environ["PHOENIX_COLLECTOR_ENDPOINT"] = "http://127.0.0.1:6006"

In [None]:
import phoenix as px
# Phoenix client used later to fetch the dataset the experiment runs on.
# NOTE(review): presumably picks up PHOENIX_COLLECTOR_ENDPOINT set in the first cell — confirm.
client = px.Client()

# Hotel table: source of the hotel descriptions quoted in the judge prompt (file is read as latin1).
df = pd.read_csv("hotels.csv", encoding="latin1") # description data
# Benchmark table: its "query" and "Clarification Ground Truth" columns feed build_implicit_lookup.
exp = pd.read_csv("final_benchmark.csv")


In [None]:
import json
import re
from collections import Counter
from typing import List, Tuple, Dict
import pandas as pd
from typing import List, Tuple, Dict, Optional, Callable


def _normalize(s: str) -> str:
    return (s or "").strip()

def build_implicit_lookup(
    exp_df: pd.DataFrame,
    query_col: str = "query",
    implicit_col: str = 'Clarification Ground Truth'
) -> Dict[str, str]:
    """
    Create a mapping: normalized query text -> implicit explanation.

    Rows where either the query or the explanation is missing are dropped.
    When several rows normalize to the same query, the first one wins.

    Args:
        exp_df: Frame holding at least `query_col` and `implicit_col`.
        query_col: Column with the query text (normalized via `_normalize`).
        implicit_col: Column with the explanation stored as the dict value.

    Returns:
        Dict mapping each normalized query string to its explanation (as str).

    Raises:
        KeyError: If `query_col` or `implicit_col` is not a column of `exp_df`.
    """
    for col in (query_col, implicit_col):
        if col not in exp_df.columns:
            raise KeyError(f"'{col}' not found in exp_df columns: {list(exp_df.columns)}")

    pairs = exp_df[[query_col, implicit_col]].dropna()
    # Build the key column with .assign() instead of writing into the filtered
    # frame: item assignment on the result of dropna() can be flagged by pandas
    # with SettingWithCopyWarning.
    deduped = (
        pairs
        .assign(_key=pairs[query_col].astype(str).map(_normalize))
        .drop_duplicates(subset=["_key"], keep="first")  # keep first occurrence per normalized query
    )
    return dict(zip(deduped["_key"], deduped[implicit_col].astype(str)))



# Matches bracketed source citations such as "[Hotel review: <name>]",
# "[Hotel description: <name>]" or "[Web: <url>]" (case-insensitive).
# Group 1 is the source label; group 2 is the optional text after the colon.
_CITATION_RE = re.compile(
    r"\[(Hotel review|Hotel description|Web)(?::\s*([^\]]*?))?\]",
    flags=re.IGNORECASE
)



def extract_citations(text: str, return_unique: bool = True) -> Tuple[List[dict], List[dict]]:
    """
    Extract only citations where the value (hotel name, URL, etc.) is not empty.

    Args:
        text: Text to scan. None or an empty string yields no citations.
        return_unique: When False, the de-duplicated list is skipped and
            returned as an empty list.

    Returns:
        (all_hits, unique_hits).
        all_hits: one dict per match, in order of appearance, with keys
            "raw", "type" ("review" / "description" / "web"), "value",
            "start" and "end".
        unique_hits: one dict per distinct (type, case-insensitive value) with
            keys "type", "value" (spelling of the first occurrence) and
            "count", sorted by (type, value).
    """
    type_map = {
        "hotel review": "review",
        "hotel description": "description",
        "web": "web",
    }

    # Nothing to scan: avoids a TypeError from re when the text is None.
    if not text:
        return [], []

    all_hits = []
    for m in _CITATION_RE.finditer(text):
        value = (m.group(2) or "").strip()
        if not value:
            # Bare labels such as "[Web]" or "[Hotel review: ]" carry no target.
            continue
        label = m.group(1).strip().lower()
        all_hits.append({
            "raw": m.group(0),
            "type": type_map.get(label, label),
            "value": value,
            "start": m.start(),
            "end": m.end(),
        })

    if not return_unique:
        return all_hits, []

    # Single pass: count occurrences per (type, lowercased value) and remember
    # the first spelling seen. Dicts keep insertion order, so the result is
    # deterministic before the final sort.
    grouped: Dict[Tuple[str, str], dict] = {}
    for hit in all_hits:
        key = (hit["type"], hit["value"].lower())
        entry = grouped.setdefault(
            key, {"type": hit["type"], "value": hit["value"], "count": 0}
        )
        entry["count"] += 1

    unique_hits = sorted(grouped.values(), key=lambda d: (d["type"], d["value"]))
    return all_hits, unique_hits


def get_descriptions_from_df(
    unique_hits: List[Dict],
    df: pd.DataFrame,
    name_col: str = "Name",
    desc_col: str = "description",
) -> List[Tuple[str, str]]:
    """
    Return list of (hotel_name, description) for names found in unique_hits (type='description').

    Names are compared after stripping surrounding whitespace and lowercasing
    on both sides. Citation values arrive stripped and are de-duplicated
    case-insensitively upstream, so an exact comparison would miss hotels whose
    table entry differs only in case or padding.

    Args:
        unique_hits: Citation dicts with at least "type" and "value" keys.
        df: Hotel table.
        name_col: Column of `df` holding the hotel name.
        desc_col: Column of `df` holding the hotel description.

    Returns:
        De-duplicated (name, description) pairs in table order, with both
        values exactly as stored in `df` (converted to str). Rows with a
        missing name or description are skipped. Empty list when no
        'description' citation is present.

    Raises:
        KeyError: If `name_col` or `desc_col` is not a column of `df`
            (only checked when there is at least one name to look up).
    """
    wanted = {
        str(hit["value"]).strip().lower()
        for hit in unique_hits
        if hit["type"] == "description"
    }
    if not wanted:
        return []

    for col in (name_col, desc_col):
        if col not in df.columns:
            raise KeyError(f"'{col}' not found in df columns: {list(df.columns)}")

    # Normalized view of the name column, used for matching only.
    name_keys = df[name_col].astype(str).str.strip().str.lower()
    # notna() keeps a missing name from matching through its "nan" string form.
    mask = name_keys.isin(wanted) & df[name_col].notna()
    rows = df.loc[mask, [name_col, desc_col]].dropna().drop_duplicates()
    return [(str(n), str(d)) for n, d in rows.itertuples(index=False, name=None)]





def build_prompt_from_phoenix_row(
    row: dict,
    hotels_df: pd.DataFrame,
    name_col: str = " HotelName",
    desc_col: str = " Description",
    implicit_lookup: Optional[Dict[str, str]] = None,
) -> str:
    """
    Build an evaluation prompt for an LLM-as-a-judge from a Phoenix dataset row.

    The prompt consists of the factuality-judge instructions followed by the
    user query, the answer, the optional 'implicit' clarification, and the
    evidence the answer cites: hotel descriptions (from `hotels_df`), hotel
    reviews (from "reviews" tool messages, kept only for hotels cited as
    reviews) and web results (from "web_search" tool messages).

    Args:
        row: Dict with "input" and "output" entries, each holding a "messages" list.
        hotels_df: Hotel table used to look up descriptions of cited hotels.
        name_col: Hotel-name column of `hotels_df`.
        desc_col: Description column of `hotels_df`.
        implicit_lookup: Optional mapping normalized query -> clarification text.

    Returns:
        The complete prompt string.
    """

    # 1) Query.
    # NOTE(review): this takes the last element of the FIRST input message —
    # presumably a (role, content) pair; confirm against the dataset schema.
    query = row["input"]["messages"][0][-1]
    norm_query = _normalize(str(query))

    # 2) Answer: content of the last output message.
    answer = row["output"]["messages"][-1]["content"]

    # 3) Citations in answer
    _, unique_hits = extract_citations(answer, return_unique=True)

    # Descriptions (from hotels_df)
    desc_pairs = get_descriptions_from_df(unique_hits, hotels_df, name_col, desc_col)
    descriptions = "\n".join(f"\n- {name}: {desc.strip()}" for name, desc in desc_pairs)
    if not descriptions:
        descriptions = "(no matching hotel descriptions found)"

    # Reviews and web results (filtered).
    # Hotel names are compared case-insensitively, consistent with how
    # extract_citations de-duplicates citation values.
    cited_review_names = {
        hit["value"].strip().lower() for hit in unique_hits if hit["type"] == "review"
    }
    filtered_reviews = []
    website = []

    for msg in row["output"]["messages"]:
        tool_name = msg.get("name")
        if tool_name == "reviews":
            # Chunks are separated by "***" lines or blank lines; each chunk is
            # expected to end with a one-line JSON metadata object.
            for chunk in re.split(r"\n\*\*\*\n|\n\n", msg["content"].strip()):
                try:
                    _, json_str = chunk.rsplit("\n", 1)
                    meta = json.loads(json_str)
                except ValueError:
                    # No metadata line, or the last line is not valid JSON
                    # (JSONDecodeError is a ValueError): not a review chunk.
                    continue
                if not isinstance(meta, dict):
                    continue
                hotel = meta.get("Name")
                if isinstance(hotel, str) and hotel.strip().lower() in cited_review_names:
                    filtered_reviews.append(chunk.strip())

        elif tool_name == "web_search":
            try:
                data = json.loads(msg["content"])
                website += [sample["content"] for sample in data.get("results", [])]
            except (ValueError, TypeError, KeyError, AttributeError):
                # Best effort: a payload that is not the expected
                # {"results": [{"content": ...}, ...]} JSON is skipped entirely.
                continue

    reviews = "\n\n***\n\n".join(filtered_reviews) or "(no matching hotel reviews found)"
    web = "\n\n***\n\n".join(website) or "(no websites found)"

    # 4) Implicit (optional)
    implicit = implicit_lookup.get(norm_query) if implicit_lookup else None
    implicit_block = implicit or "(no implicit explanation provided)"

    # Alternative relevance-judge header, currently disabled and kept for reference:
    # prompt_header = (
    #     "You are a Relevance Judge for HOTEL RECOMMENDATIONS.\n"
    #     "Evaluate ONLY using the provided hotel descriptions and reviews (ignore any outside knowledge).\n\n"
    #     "Task: Rate how well the ANSWER addresses the USER QUERY on a 1–5 scale:\n"
    #     "1 = Not relevant at all — completely misses the user's needs.\n"
    #     "2 = Slightly relevant — touches minor aspects but not the core requirements.\n"
    #     "3 = Moderately relevant — addresses some key points but misses important requirements.\n"
    #     "4 = Very relevant — covers most requirements well, with minor omissions.\n"
    #     "5 = Perfectly relevant — comprehensively addresses all requirements with appropriate detail.\n\n"
    #     "When evaluating, consider:\n"
    #     "• Does the answer directly address the specific hotel requirements (location, budget, amenities, dates, party size)?\n"
    #     "• Are concrete hotel recommendations provided (names + pertinent details) rather than generic advice?\n"
    #     "• Is the reasoning clear, structured, and evidence-based from the given context?\n"
    #     "• Are trade-offs/limitations explained when relevant, and is a clear recommendation given?\n"
    #     "• IMPORTANT: If no concrete hotel recommendation is made when the query calls for it, score = 1.\n"
    #     # "• IMPORTANT: Only consider claims that are supported within the given context—do not rely solely on the answer text.\n\n"

    #     # "Output format: Return ONLY the integer score (1–5) with no additional text."
    #     "Output format: Return ONLY a valid JSON object with two fields:"
    #     "- score: an integer from 1 to 5"
    #     "- explanation: a brief explanation for the chosen score"
    #     "Example:\n"
    #     '{\n  "score": 4,\n  "explanation": "The answer is mostly correct and relevant but misses a minor detail."\n}'
    #     "Do not include any text outside the JSON object."
    # )

    prompt_header = (
        "You are a Factuality Judge for HOTEL RECOMMENDATIONS.\n"
        "Your goal is to assess the factual accuracy of the ANSWER strictly based on the provided hotel descriptions and reviews.\n"
        "IGNORE any outside knowledge or assumptions — only consider information verifiable from the given sources.\n\n"
        "Task: Rate how FACTUALLY ACCURATE the ANSWER is on a 1–5 scale:\n"
        "1 = Completely inaccurate — contains mostly false or unsupported statements.\n"
        "2 = Poor factuality — some facts are correct, but most claims lack evidence or contradict the sources.\n"
        "3 = Partially factual — roughly half the claims are supported, others are vague or unverified.\n"
        "4 = Mostly factual — nearly all claims align with the sources, with only minor inaccuracies or omissions.\n"
        "5 = Fully factual — every factual statement is accurate and directly supported by a cited source.\n\n"
        "When evaluating, consider:\n"
        "• Does each factual statement about the hotel (e.g., location, amenities, ratings, accessibility, services) have explicit evidence from the provided descriptions or reviews?\n"
        "• Are there any hallucinated details or claims not grounded in the sources?\n"
        "• Are sources cited clearly and correctly linked to each factual statement?\n"
        "• Is the information consistent with the evidence, without contradictions or exaggerations?\n"
        "• IMPORTANT: If any factual statement lacks an explicit source, deduct points proportionally.\n\n"
        "Output format: Return ONLY a valid JSON object with two fields:\n"
        "- score: an integer from 1 to 5\n"
        "- explanation: a concise justification mentioning which parts are well-supported and which are not.\n\n"
        "Example:\n"
        # The trailing "\n" keeps the closing instruction on its own line
        # instead of gluing it to the example's closing brace.
        '{\n  "score": 4,\n  "explanation": "Most details (location, breakfast, and accessibility) are supported by the descriptions, but the mention of a rooftop bar lacks evidence."\n}\n'
        "Do not include any text outside the JSON object."
    )

    final_prompt = f"""{prompt_header}

**********
User's query:
{query}

**********
Answer:
{answer}
**********
Clarification of the user:
{implicit_block}

Descriptions:
{descriptions}

Reviews:
{reviews}

Web:
{web}
"""

    return final_prompt



In [None]:
# Map each normalized benchmark query to its ground-truth clarification text;
# my_task passes this lookup to build_prompt_from_phoenix_row.
implicit_lookup = build_implicit_lookup(
    exp,
    query_col="query",  # benchmark column holding the query text
    implicit_col='Clarification Ground Truth'
)

In [None]:
import json
import re
import pandas as pd
from phoenix.experiments.types import Example
from phoenix.experiments import run_experiment
from langchain.chat_models import init_chat_model

# Judge model: a Claude Sonnet 4.5 model id served through the
# "bedrock_converse" provider in us-east-1. Every judge call in my_task goes through it.
# NOTE(review): temperature=0.0 is presumably chosen to keep scores as repeatable as possible — confirm.
judge_llm = init_chat_model(
    "us.anthropic.claude-sonnet-4-5-20250929-v1:0",
    model_provider="bedrock_converse",
    region_name="us-east-1",
    max_tokens=4096,
    temperature=0.0,
)

# --- helpers: robust JSON parsing from model output ---
# Greedy matcher spanning the first "{" to the last "}" of a text. No longer
# used by parse_judge_json (see below) but kept so existing references to the
# name keep working.
_JSON_RE = re.compile(r"\{.*\}", re.DOTALL)

def parse_judge_json(text: str) -> dict:
    """
    Extract the first JSON object from text and parse it.

    The text is scanned for "{" characters and a JSON value is decoded
    starting at each one in turn; the first value that decodes to a dict is
    returned. Unlike a greedy first-"{"-to-last-"}" match, this tolerates
    surrounding prose, code fences, and additional braces before or after the
    object.

    Args:
        text: Raw model output.

    Returns:
        The parsed object, or {} when `text` is not a string or contains no
        decodable JSON object.
    """
    if not isinstance(text, str):
        return {}

    decoder = json.JSONDecoder()
    start = text.find("{")
    while start != -1:
        try:
            # raw_decode parses one JSON value beginning at `start` and
            # ignores whatever follows it.
            data, _ = decoder.raw_decode(text, start)
        except ValueError:  # json.JSONDecodeError is a ValueError
            data = None
        if isinstance(data, dict):
            return data
        start = text.find("{", start + 1)
    return {}



def my_task(example: Example) -> dict:
    """
    Experiment task: ask the judge model to score one dataset example.

    Builds the judge prompt from the example's input/output, invokes
    `judge_llm`, and parses the JSON verdict with `parse_judge_json`.

    Args:
        example: Dataset example providing `.input` and `.output`.

    Returns:
        Dict with:
          - "score": int clamped to 1–5, or None when missing/unparseable,
          - "explanation": stripped explanation text ("" when absent), or
            None when no JSON object could be parsed,
          - "raw_text": the judge's raw reply as a string,
          - "error": present only when no JSON object could be parsed.
    """
    row_like = {"input": example.input, "output": example.output}
    prompt = build_prompt_from_phoenix_row(row_like, df, " HotelName", " Description", implicit_lookup)

    resp = judge_llm.invoke(prompt)

    # Chat-model message content may be a plain string or a list of content
    # blocks (strings or dicts carrying a "text" field); flatten the latter.
    content = getattr(resp, "content", None)
    if isinstance(content, list):
        content = "".join(
            block if isinstance(block, str) else str(block.get("text") or "")
            for block in content
            if isinstance(block, (str, dict))
        )
    text = content or getattr(resp, "text", None) or str(resp)
    if not isinstance(text, str):
        text = str(text)

    parsed = {"score": None, "explanation": None, "raw_text": text}

    # Reuse the shared helper rather than duplicating the fence-stripping and
    # brace-extraction logic here.
    verdict = parse_judge_json(text)
    if not verdict:
        parsed["error"] = "Failed to parse JSON: no JSON object found in judge output"
        return parsed

    try:
        parsed["score"] = max(1, min(5, int(verdict.get("score"))))  # clamp 1–5
    except (TypeError, ValueError, OverflowError):
        # Missing, non-numeric, or non-finite score.
        parsed["score"] = None
    parsed["explanation"] = str(verdict.get("explanation") or "").strip()

    return parsed



import re

def _int_from_text(text: str) -> float:
    try:
        v = int(str(text).strip())
        return float(max(1, min(5, v)))
    except Exception:
        m = re.search(r"\b([1-5])\b", str(text))
        return float(m.group(1)) if m else 1.0

def relevance_metric(input, output, metadata=None) -> float:
    """Evaluator: read the "score" entry of the task output and return it as a float (via `_int_from_text`)."""
    raw_score = output.get("score", "")
    return _int_from_text(raw_score)

# Single evaluator: converts the "score" field returned by my_task into a float.
# NOTE(review): the evaluator is named relevance_metric, but the prompt header
# currently active in build_prompt_from_phoenix_row is the factuality judge —
# confirm the metric name matches what is being measured.
evaluators = [relevance_metric]




# NOTE(review): version_id="..." looks like a placeholder — supply the real dataset version id.
dataset = client.get_dataset(name="agent", version_id="...")

# Run my_task on the dataset and score its outputs with the evaluators above.
experiment = run_experiment(dataset, my_task, evaluators=evaluators)


In [None]:
# Show the stats of the first evaluation summary.
# NOTE(review): presumably the summary for relevance_metric, the only evaluator registered — confirm.
experiment.eval_summaries[0].stats