In [None]:
# Prompt template for the qualifier-labelling LLM call. It describes the two
# label sets (Qualifier Type / Qualifier Content), the JSON output contract and
# one worked example. The user query is injected by replacing the literal
# "<<QUERY>>" marker with str.replace (not str.format), which is why braces such
# as {City} and the JSON example can appear unescaped in the text.
QUALIFIER_PROMPT = """
You are a classifier that extracts qualifiers from a hotel search query and assigns two independent label sets:
(1) Qualifier Type — how the qualifier is expressed; (2) Qualifier Content — what the qualifier is about.
A qualifier may have only Type, only Content, or both.

TASK
1) Split the user query into the maximum number of distinct, minimal qualifier phrases possible. 
   - Each qualifier should represent a single, atomic idea, property, or constraint.
   - Break compound statements into separate qualifiers whenever possible 
     (e.g., "family-friendly hotel with pool and near the beach" → 
      ["family-friendly", "with pool", "near the beach"]).
2) For each qualifier, assign zero, one, or multiple labels from Qualifier Type and/or Qualifier Content.
3) Return ONLY a valid JSON list of objects, one per qualifier.

QUALIFIER TYPE (zero or more):
- Explicit — directly stated fact or constraint
  Examples: "from Marriott chain", "must have gym", "with pool"
- Implicit — inferred preference, not literally stated
  Examples: "budget-friendly", "luxury", "popular hotel"
- Negation — excludes something
  Example: "not too loud"
- Similarity — compares to another known entity
  Example: "similar to Hilton Miami"
- Range — price/distance/numeric limit
  Example: "up to 60 pounds a night"
- Time-sensitive — urgency, trend, or recency
  Example: "popular hotel", "last-minute deal"
- Optional vs. Mandatory — indicates importance
  Examples: "optionally pet-friendly" (Optional), "must be wheelchair-accessible" (Mandatory)
- Objective — factual, measurable, or verifiable constraint
  Examples: "within 5 km", "under $100", "4-star rating", "must have gym"
- Subjective — opinion-based, emotional, or qualitative
  Examples: "quiet place", "romantic vibe", "luxury", "not too loud"

QUALIFIER CONTENT (zero or more):
- Purpose — reason for the trip
  Examples: "solo trip", "business conference"
- Location — place/region/proximity
  Examples: "{City}/{State}", "next to the beach", "near TICC"
- Population — group type/size
  Examples: "for a couple", "for a family of 4",
            "for a family of 4 with a 1 year old and a 5 year old"
- Seasonality — time of year/season
  Examples: "summer getaway", "monsoon-season travel"
- Description — amenities/property/style
  Explicit examples: "with gym", "with pool", "{Chain}"
  Implicit examples: "luxury", "family-friendly", "cheap"
- Reviews — rating/reputation
  Examples: "4 star ratings", "highly recommended by backpackers"

RULES
- Split the query as granularly as possible to maximize the number of qualifiers.
- Keep "text" exactly as it appears in the query.
- "type" and "content" MUST be present for every qualifier, but each may be an empty array [].
- A qualifier can be both Objective and Subjective only if it contains mixed factual and opinion-based elements.
- Do not invent information not present or reasonably implied by the query.
- Return ONLY a JSON array of objects.

EXAMPLE
Input Query:
"Looking for a budget-friendly hotel near TICC for a business conference, must have gym and not too loud."

Output:
[
  {
    "text": "budget-friendly",
    "type": ["Implicit", "Subjective"],
    "content": ["Description"]
  },
  {
    "text": "near TICC",
    "type": ["Explicit", "Objective"],
    "content": ["Location"]
  },
  {
    "text": "business conference",
    "type": ["Objective"],
    "content": ["Purpose"]
  },
  {
    "text": "must have gym",
    "type": ["Explicit", "Mandatory", "Objective"],
    "content": ["Description"]
  },
  {
    "text": "not too loud",
    "type": ["Negation", "Subjective"],
    "content": ["Description"]
  }
]

USER QUERY
<<QUERY>>
"""
# Example usage (plain substitution of the "<<QUERY>>" marker):
# prompt = QUALIFIER_PROMPT.replace("<<QUERY>>", "Looking for a quiet romantic hotel with pool and near the beach")


In [None]:
from langchain.chat_models import init_chat_model

# --- Bedrock-hosted chat model that labels the queries ---
# The sampling/runtime options are gathered in one mapping and unpacked into
# the factory call; temperature 0.0 is presumably chosen to keep the labelling
# as repeatable as the service allows.
judge_llm_options = dict(
    model_provider="bedrock_converse",
    region_name="us-east-1",
    max_tokens=4096,
    temperature=0.0,
)
judge_llm = init_chat_model(
    "us.anthropic.claude-sonnet-4-5-20250929-v1:0",
    **judge_llm_options,
)

In [None]:
import pandas as pd

# Read the benchmark file and keep its "query" column as a plain Python list.
df = pd.read_csv("final_benchmark.csv")
queries = df.loc[:, "query"].tolist()

In [None]:
from tqdm.auto import tqdm

# Send one prompt per query to the judge model and keep the raw responses in
# query order; `answers` is filled incrementally so partial results survive an
# interrupted run.
answers = []

for query in tqdm(queries):
    answers.append(judge_llm.invoke(QUALIFIER_PROMPT.replace("<<QUERY>>", query)))

# Only the `.content` payload of each response is used by the later cells.
answers_content = [response.content for response in answers]

In [None]:
import json
from collections import Counter

# Corpus-level label frequencies: every label on every qualifier is counted,
# and each percentage is that label's share of all label occurrences of the
# same kind (type or content).
type_counter = Counter()
content_counter = Counter()

for item in answers_content:
    try:
        # Accept both ```json and bare ``` fences so the same answers are
        # parsed here as in build_presence_df.
        clean = (
            item.strip()
            .removeprefix("```json")
            .removeprefix("```")
            .removesuffix("```")
            .strip()
        )
        qualifiers = json.loads(clean)
    except json.JSONDecodeError as e:
        print(f"Skipping invalid JSON: {e}")
        continue

    # The model is asked for a JSON array, but a lone object is tolerated.
    if not isinstance(qualifiers, list):
        qualifiers = [qualifiers]

    for q in qualifiers:
        if not isinstance(q, dict):
            continue  # stray scalars/strings carry no labels
        for key, counter in (("type", type_counter), ("content", content_counter)):
            labels = q.get(key)
            if labels is None:
                continue
            # A bare string label counts once instead of once per character.
            counter.update(labels if isinstance(labels, list) else [labels])

total_types = sum(type_counter.values())
total_contents = sum(content_counter.values())

print("=== Type Statistics ===")
for t, count in type_counter.most_common():
    pct = (count / total_types) * 100 if total_types > 0 else 0
    print(f"{t}: {count} ({pct:.2f}%)")

print("\n=== Content Statistics ===")
for c, count in content_counter.most_common():
    pct = (count / total_contents) * 100 if total_contents > 0 else 0
    print(f"{c}: {count} ({pct:.2f}%)")


In [None]:
import json
from collections import Counter

# Per-query label presence: a label is counted at most once per query, and the
# percentage is the share of queries in which it appears.
type_counter = Counter()
content_counter = Counter()

# The denominator is every answer, parseable or not: an unparseable answer
# counts as a query with no labels.
total_queries = len(answers_content)
unparsed_answers = 0

for item in answers_content:
    try:
        # Accept both ```json and bare ``` fences so the same answers are
        # parsed here as in build_presence_df.
        clean = (
            item.strip()
            .removeprefix("```json")
            .removeprefix("```")
            .removesuffix("```")
            .strip()
        )
        qualifiers = json.loads(clean)
    except json.JSONDecodeError:
        unparsed_answers += 1
        continue

    # The model is asked for a JSON array, but a lone object is tolerated.
    if not isinstance(qualifiers, list):
        qualifiers = [qualifiers]

    types_in_query = set()
    contents_in_query = set()

    for q in qualifiers:
        if not isinstance(q, dict):
            continue  # stray scalars/strings carry no labels
        for key, seen in (("type", types_in_query), ("content", contents_in_query)):
            labels = q.get(key)
            if labels is None:
                continue
            # A bare string label is one label, not a sequence of characters.
            seen.update(labels if isinstance(labels, list) else [labels])

    type_counter.update(types_in_query)
    content_counter.update(contents_in_query)

if unparsed_answers:
    # Make the silent skips visible: they lower every percentage below.
    print(
        f"Note: {unparsed_answers} of {total_queries} answers were not valid JSON "
        "and are counted as queries without labels."
    )

print("=== Type Presence per Query ===")
for t, count in type_counter.most_common():
    pct = (count / total_queries) * 100
    print(f"{t}: {count}/{total_queries} queries ({pct:.2f}%)")

print("\n=== Content Presence per Query ===")
for c, count in content_counter.most_common():
    pct = (count / total_queries) * 100
    print(f"{c}: {count}/{total_queries} queries ({pct:.2f}%)")


In [None]:
import re
import json
import pandas as pd
from typing import Iterable, Optional, List, Tuple, Set

def build_presence_df(
    queries: List[str],
    answers_content: List[str],
    target_types: Optional[Iterable[str]] = None,
    target_contents: Optional[Iterable[str]] = None,
    drop_all_zero_cols: bool = False,
) -> Tuple[pd.DataFrame, Set[str], Set[str]]:
    """
    Build a per-query 0/1 presence matrix of qualifier labels.

    The result has:
      - a 'query' column (aligned to input 'queries')
      - one 0/1 column per unique 'type' and 'content' label found in
        answers_content (or restricted to target_types / target_contents if
        provided). Column names are 'type_<label>' / 'content_<label>' with
        the label lower-cased and non-word characters replaced by '_'.

    Args:
        queries: query strings, positionally aligned with answers_content.
        answers_content: raw model answers (JSON, optionally wrapped in
            ``` / ```json fences). If the two inputs differ in length, both
            are truncated to the shorter one.
        target_types: optional iterable of type labels to keep; any iterable
            (including a one-shot generator) is accepted.
        target_contents: optional iterable of content labels to keep.
        drop_all_zero_cols: if True, label columns that are 0 in every row
            are removed from the returned frame.

    Returns:
        (df, all_types, all_contents), where the two sets hold the labels
        that survived the optional target filters.

    Answers that are not strings or not valid JSON yield an all-zero row.
    """
    # --- helpers ---
    def strip_fences(s: str) -> str:
        """Remove a leading ``` / ```json fence and a trailing ``` fence."""
        s = s.strip()
        s = re.sub(r'^```(?:json)?\s*', '', s)
        s = re.sub(r'\s*```$', '', s)
        return s.strip()

    def ensure_list(x):
        """Wrap scalars in a list; None becomes an empty list."""
        if x is None:
            return []
        return x if isinstance(x, list) else [x]

    def colify(prefix: str, name: str) -> str:
        """Turn a label into a safe, lower-case column name with a prefix."""
        safe = re.sub(r'\W+', '_', str(name).strip().lower()).strip('_')
        return f"{prefix}_{safe}" if safe else f"{prefix}_empty"

    def parse_labels(raw):
        """Return (types, contents, parsed_ok) for one raw answer."""
        # Missing answers (None / NaN) are treated like unparseable JSON
        # rather than crashing on `.strip()`.
        if not isinstance(raw, str):
            return set(), set(), False
        try:
            data = json.loads(strip_fences(raw))
        except json.JSONDecodeError:
            return set(), set(), False

        qualifiers = data if isinstance(data, list) else [data]
        types_in_q, contents_in_q = set(), set()
        for qobj in qualifiers:
            if not isinstance(qobj, dict):
                continue
            types_in_q.update(ensure_list(qobj.get("type", [])))
            contents_in_q.update(ensure_list(qobj.get("content", [])))
        return types_in_q, contents_in_q, True

    n = min(len(queries), len(answers_content))
    queries = queries[:n]
    answers_content = answers_content[:n]

    parsed_cache = [parse_labels(raw) for raw in answers_content]

    all_types: Set[str] = set()
    all_contents: Set[str] = set()
    for types_in_q, contents_in_q, _ in parsed_cache:
        all_types |= types_in_q
        all_contents |= contents_in_q

    # Materialise each target iterable exactly once: rebuilding the set per
    # element would exhaust a generator after the first membership test.
    if target_types is not None:
        all_types &= set(target_types)
    if target_contents is not None:
        all_contents &= set(target_contents)

    type_col = {t: colify("type", t) for t in sorted(all_types)}
    content_col = {c: colify("content", c) for c in sorted(all_contents)}
    # Different labels can sanitise to the same column name; keep each column
    # once so that selecting by name never duplicates it.
    label_cols = list(dict.fromkeys([*type_col.values(), *content_col.values()]))

    df = pd.DataFrame({"query": queries})
    for col in label_cols:
        df[col] = 0

    for i, (types_in_q, contents_in_q, valid) in enumerate(parsed_cache):
        if not valid:
            continue
        for t in types_in_q & all_types:
            df.at[i, type_col[t]] = 1
        for c in contents_in_q & all_contents:
            df.at[i, content_col[c]] = 1

    if drop_all_zero_cols:
        keep = ["query"] + [col for col in label_cols if df[col].sum() > 0]
        df = df[keep]

    return df, all_types, all_contents



# Build the label presence matrix for the benchmark queries (all labels kept).
df, all_types, all_contents = build_presence_df(
    queries=queries,
    answers_content=answers_content,
)


In [None]:
# Load the benchmark table again under its own name; it is joined onto the
# label matrix in the next cell.
bench = pd.read_csv(filepath_or_buffer="final_benchmark.csv")
bench

In [None]:
# Left join on the query text: every row of the label matrix is kept (use
# how="inner" to keep only matching rows). Benchmark columns whose names clash
# with existing ones receive the "_bench" suffix.
merged_df = df.merge(bench, how="left", on="query", suffixes=("", "_bench"))
merged_df

In [None]:
import numpy as np

# Drop the leftover CSV index column if it is present.
merged_df = merged_df.drop(columns=["Unnamed: 0"], errors="ignore")

# Binary flag: 1 when the benchmark provides a clarification ground truth.
# The source column is consumed here, so the step is guarded to keep the cell
# re-runnable after the column has already been dropped.
# NOTE(review): "need_clarifiaction" is misspelled; the name is kept unchanged
# because other code may already select the column by this spelling.
if "Clarification Ground Truth" in merged_df.columns:
    merged_df["need_clarifiaction"] = (
        merged_df["Clarification Ground Truth"].notna().astype(int)
    )
    merged_df = merged_df.drop(columns=["Clarification Ground Truth"])

complexity_map = {
    "Simple": 1,
    "Moderate": 2,
    "Complex": 3
}
# Also map each code to itself, so that re-running the cell on already-encoded
# ratings keeps them instead of turning every value into 0.
complexity_codes = {**complexity_map, **{code: code for code in complexity_map.values()}}
merged_df["human_complexity_rating"] = (
    merged_df["human_complexity_rating"]
    .map(complexity_codes)
    .fillna(0)  # missing or unexpected values become 0
    .astype(int)
)

merged_df

In [None]:
# Number of whitespace-separated tokens in each query; missing queries are
# treated as empty text and therefore count 0 words.
query_text = merged_df["query"].fillna("")
merged_df["query_word_count"] = query_text.apply(
    lambda text: len(str(text).split())
)

merged_df

In [None]:
# Count how many distinct qualifier types were flagged for each query by
# summing the 0/1 "type_*" indicator columns row-wise.
type_cols = merged_df.columns[merged_df.columns.str.startswith("type_")].tolist()
merged_df["num_types_in_query"] = merged_df.loc[:, type_cols].sum(axis=1)
merged_df

In [None]:
import pandas as pd
import ast


# Base table of queries with their stored answers, plus the human complexity
# annotations that are binarised below.
df = pd.read_csv("queries_with_answers.csv")
complexity_df = pd.read_csv("dataset_v1.csv")
complexity_map = {"Simple": 0, "Moderate": 0, "Complex": 1}  # only "Complex" maps to 1

# part 1: keep the rated prompts and attach the mapped complexity
has_rating = complexity_df["human_complexity_rating"].notna()
df1 = complexity_df.loc[has_rating, ["prompt", "human_complexity_rating"]].copy()
df1 = df1.assign(complexity=df1["human_complexity_rating"].map(complexity_map))
complexity_df = df1.loc[:, ["prompt", "complexity"]].dropna()

# Column-wise concat: rows are aligned on the index, not on the prompt text.
df = pd.concat([df, complexity_df], axis=1).drop(columns=["prompt"])


# One result file per system, read in the order used for relevance_score_1..7;
# rows containing any missing value are dropped from each file.
agent_csvs = [
    "exp9_eval_v3.csv",
    "retrievals_baselines/bm25_top3_recommendations.csv",
    "retrievals_baselines/dense_sentence-transformers-all-MiniLM-L6-v2.csv",
    "retrievals_baselines/reranked_BAAI-bge-m3__Qwen-Qwen3-Reranker-4B.csv",
    "retrievals_baselines/reranked_BAAI-bge-m3__Qwen-Qwen3-Reranker-0.6B.csv",
    "exp_haiku.csv",
    "retrievals_baselines/qwen25_32B_answers.csv",
]
df_agent, df_agent2, df_agent3, df_agent4, df_agent5, df_agent6, df_agent7 = (
    pd.read_csv(path).dropna() for path in agent_csvs
)


def _judge_score(cell):
    """Parse a stringified output dict and return its 'judge_text' as an int."""
    return int(ast.literal_eval(cell)["judge_text"])


# These two files keep the score inside the stringified "output" column.
df_agent6["relevance_score"] = df_agent6["output"].apply(_judge_score)
df_agent["relevance_score"] = df_agent["output"].apply(_judge_score)

dfs = [df_agent, df_agent2, df_agent3, df_agent4, df_agent5, df_agent6, df_agent7]

# Give every system its own numbered score column.
renamed = [
    d[["relevance_score"]].rename(columns={"relevance_score": f"relevance_score_{i}"})
    for i, d in enumerate(dfs, start=1)
]

# Append the score columns to the base table (index-aligned).
df = pd.concat([df, *renamed], axis=1)



    

In [None]:
import pandas as pd
import ast


def parse_items(cell):
    """
    Parse one stored answer cell into a list of qualifier dicts.

    The cell is expected to hold the Python-literal text of a list of dicts.
    The result is always a list of dicts, so callers can safely use `len()`
    on it and `.get()` on its elements:
      - unparseable or missing cells (e.g. NaN) give [];
      - a single dict is wrapped in a one-element list;
      - any other literal (number, string, ...) gives [];
      - non-dict entries inside a list are dropped.
    """
    try:
        # literal_eval only evaluates Python literals, unlike eval.
        items = ast.literal_eval(cell)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # The exceptions literal_eval is documented to raise on malformed input.
        return []

    if isinstance(items, dict):
        items = [items]
    if not isinstance(items, (list, tuple)):
        return []
    return [it for it in items if isinstance(it, dict)]

def _labels_of(item, key):
    """Return the labels stored under `key` as a list.

    Tolerates non-dict items, missing/None values and a bare scalar label
    (which is treated as one label, not as a sequence of characters).
    """
    if not isinstance(item, dict):
        return []
    value = item.get(key)
    if value is None:
        return []
    return value if isinstance(value, list) else [value]


df["parsed"] = df["answers"].apply(parse_items)

df["num_items"] = df["parsed"].apply(len)

all_types = set()
all_contents = set()
for items in df["parsed"]:
    for it in items:
        all_types.update(_labels_of(it, "type"))
        all_contents.update(_labels_of(it, "content"))

# Iterate in sorted order: set iteration order can differ between runs, which
# would shuffle the column order and every table derived from it.
for t in sorted(all_types, key=str):
    df[f"type_{t}"] = df["parsed"].apply(
        lambda items: int(any(t in _labels_of(it, "type") for it in items))
    )

for c in sorted(all_contents, key=str):
    df[f"content_{c}"] = df["parsed"].apply(
        lambda items: int(any(c in _labels_of(it, "content") for it in items))
    )

df = df.drop(columns=["parsed", "answers"])
# Missing queries count as 0 words instead of raising on `.split()`, matching
# how query_word_count is computed for merged_df.
df["num_words"] = df["query"].fillna("").apply(lambda text: len(str(text).split()))
df


In [None]:
import pandas as pd
import numpy as np
from scipy.stats import mannwhitneyu, spearmanr, kruskal

def test_feature_effects(df, score_cols, feature_cols):
    results = []

    for score in score_cols:
        for feat in feature_cols:
            x = df[feat].dropna()
            y = df[score].dropna()


            common_idx = x.index.intersection(y.index)
            x, y = x.loc[common_idx], y.loc[common_idx]

            # binary feature
            if set(x.unique()) <= {0, 1}:
                group0 = y[x == 0]
                group1 = y[x == 1]
                if len(group0) > 1 and len(group1) > 1:
                    u, p_u = mannwhitneyu(group0, group1, alternative="two-sided")
                    results.append({
                        "score": score,
                        "feature": feat,
                        "mean_0": group0.mean(),
                        "mean_1": group1.mean(),
                        "mannwhitney_p": p_u
                    })

            # categorical with >2 groups
            elif x.nunique() > 2 and x.nunique() < 20:
                groups = [y[x == val] for val in x.unique()]
                if all(len(g) > 1 for g in groups):
                    stat, p_kw = kruskal(*groups)
                    results.append({
                        "score": score,
                        "feature": feat,
                        "kruskal_p": p_kw
                    })

            # numeric feature
            else:
                rho, p_rho = spearmanr(x, y)
                results.append({
                    "score": score,
                    "feature": feat,
                    "spearman_rho": rho,
                    "spearman_p": p_rho
                })
    
    return pd.DataFrame(results)


In [None]:
# map relevance_score_* → meaningful names (also the single source of truth
# for which score columns are analysed)
SCORE_NAME_MAP = {
    "relevance_score_1": "Sonnet",
    "relevance_score_2": "BM25",
    "relevance_score_3": "Dense MiniLM",
    "relevance_score_4": "BGE-M3 + Qwen Reranker (4B)",
    "relevance_score_5": "BGE-M3 + Qwen Reranker (0.6B)",
    "relevance_score_6": "Haiku",
    "relevance_score_7": "Qwen2.5 32B"
}

score_cols = list(SCORE_NAME_MAP)
feature_cols = [c for c in df.columns if c not in ["query"] + score_cols]

res = test_feature_effects(df, score_cols, feature_cols)

# keep only significant: a row qualifies when the p-value of whichever test was
# run for it is below 0.05 (NaN p-values compare as not significant).
# NOTE(review): no multiple-comparison correction is applied across the
# score x feature grid — confirm this is intended.
if res.empty:
    # No test could be run at all; keep an empty table with the key columns.
    sig = pd.DataFrame(columns=["score_name", "feature"])
else:
    p_cols = [c for c in ("mannwhitney_p", "kruskal_p", "spearman_p") if c in res.columns]
    # .copy() makes `sig` an independent frame, so adding a column below does
    # not trigger SettingWithCopyWarning.
    sig = res[(res[p_cols] < 0.05).any(axis=1)].copy()

    sig["score_name"] = sig["score"].map(SCORE_NAME_MAP)
    sig = sig[["score_name", "feature"] + [c for c in sig.columns if c not in ["score", "score_name", "feature"]]]
