In [None]:
import ast
import hashlib
import json
import os
import random
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from dotenv import load_dotenv
from openai import OpenAI
from qdrant_client import QdrantClient, models
from scipy.stats import entropy
from tqdm.auto import tqdm

# A local Qdrant instance is expected; it can be started with:
# docker run -p 6333:6333 -p 6334:6334 -v "${PWD}\qdrant_storage:/qdrant/storage" qdrant/qdrant

# Load environment variables (presumably from a local .env file) before reading the keys.
load_dotenv()

# API credentials come from the environment; each is None when the variable is unset.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
MISTRAL_API_KEY = os.environ.get("MISTRAL_API_KEY")

# Folder receiving the per-sample cache files and the aggregated result files.
OUTPUT_DIR = "../data/experiments_output"
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [67]:

# POI table, kept both as a DataFrame and as a list of row dicts.
poi_data = pd.read_csv('../data/krakow_pois_selected.csv')
documents = poi_data.to_dict('records')

# Evaluation questions, in the same two forms.
df_question = pd.read_csv("../data/ground-truth-retrieval.csv")
ground_truth = df_question.to_dict('records')


In [68]:
# POI columns whose text describes a place (name, category tags, Wikipedia summary).
text_columns = [
    'name',
    'amenity',
    'leisure',
    'natural',
    'tourism',
    'historic',
    'wiki_summary_en',
]

In [69]:

# Client for the Qdrant instance expected at localhost:6333; shared by all collection and search calls.
qdrant_client = QdrantClient("http://localhost:6333")

In [70]:
# Vector size of the dense embedding (presumably the output dimensionality of `model_handle` — confirm).
EMBEDDING_DIMENSIONALITY = 512
# Identifier of the dense embedding model.
model_handle = "jinaai/jina-embeddings-v2-small-en"

In [71]:
# Drop the "hybrid_search" collection so it can be recreated from scratch.
qdrant_client.delete_collection('hybrid_search')

True

In [72]:

# Create the collection with one named dense vector and one named sparse vector
# so that both can be queried and fused in a single hybrid search.
qdrant_client.create_collection(
    collection_name="hybrid_search",
    vectors_config={
        # Named dense vector for jinaai/jina-embeddings-v2-small-en.
        # The size comes from the shared constant instead of a repeated literal.
        "jina-small": models.VectorParams(
            size=EMBEDDING_DIMENSIONALITY,
            distance=models.Distance.COSINE,
        ),
    },
    sparse_vectors_config={
        # Sparse BM25 vector, configured with the IDF modifier.
        "bm25": models.SparseVectorParams(
            modifier=models.Modifier.IDF,
        )
    }
)

True

In [73]:
def _poi_text(doc):
    """Join the `text_columns` values of one POI row into a single space-separated string.

    The column order in `text_columns` matches the order that was previously
    spelled out by hand, so the indexed text is unchanged.
    """
    return ' '.join(doc[column] for column in text_columns)


# Index every POI under both the dense and the sparse (BM25) vector; the same
# text feeds both representations.
qdrant_client.upsert(
    collection_name="hybrid_search",
    points=[
        models.PointStruct(
            id=doc['id'],
            vector={
                "jina-small": models.Document(
                    text=_poi_text(doc),
                    model=model_handle,
                ),
                "bm25": models.Document(
                    text=_poi_text(doc),
                    model="Qdrant/bm25",
                ),
            },
            payload={
                "name": doc['name'],
                "wiki_summary_en": doc['wiki_summary_en'],
                'id': doc['id'],
            },
        )
        for doc in documents
    ]
)


UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

In [74]:

def rrf_search(query: str, limit: int = 1) -> list[models.ScoredPoint]:
    """Run a hybrid (dense + BM25) search fused with Reciprocal Rank Fusion.

    Args:
        query: Free-text user question.
        limit: Maximum number of fused points to return. Each prefetch
            retrieves ``5 * limit`` candidates so the fusion has more than
            ``limit`` items to rank.

    Returns:
        The fused points (with payload), at most ``limit`` of them.
    """
    results = qdrant_client.query_points(
        collection_name="hybrid_search",
        prefetch=[
            models.Prefetch(
                query=models.Document(
                    text=query,
                    model="jinaai/jina-embeddings-v2-small-en",
                ),
                using="jina-small",
                limit=(5 * limit),
            ),
            models.Prefetch(
                query=models.Document(
                    text=query,
                    model="Qdrant/bm25",
                ),
                using="bm25",
                limit=(5 * limit),
            ),
        ],
        # Fusion query enables fusion on the prefetched results
        query=models.FusionQuery(fusion=models.Fusion.RRF),
        # Apply the caller's limit to the fused result as well; without it the
        # server-side default result count is used regardless of `limit`.
        limit=limit,
        with_payload=True,
    )

    return results.points

In [75]:
# OpenAI client used both for answer generation (openai_llm) and for the LLM judge (judge_label).
openai_client = OpenAI(api_key=OPENAI_API_KEY)

In [76]:
# Plain-text layout for one POI record. build_context fills every `{field}`
# placeholder from the record dict via str.format, so each field listed here
# must exist as a key in the record. Surrounding whitespace is removed by .strip().
entry_template = """

phone : {phone}
cemetery : {cemetery}
emergency : {emergency}
opening_hours : {opening_hours}
website : {website}
pets_allowed : {pets_allowed}
geometry : {geometry}
historic : {historic}
wiki_summary_en : {wiki_summary_en}
postal_code : {postal_code}
toilets : {toilets}
natural : {natural}
description : {description}
visiting_time : {visiting_time}
leisure : {leisure}
tourism : {tourism}
public_transport : {public_transport}
brand : {brand}
alt_name : {alt_name}
amenity : {amenity}
reservation : {reservation}
attraction : {attraction}
highchair : {highchair}
parking : {parking}
swimming_pool : {swimming_pool}
contact_phone : {contact_phone}
community_centre : {community_centre}
addr_street : {addr_street}
contact_twitter : {contact_twitter}
social_facility : {social_facility}
contact_facebook : {contact_facebook}
zoo : {zoo}
email : {email}
wheelchair : {wheelchair}
cuisine : {cuisine}
contact_website : {contact_website}
internet_access : {internet_access}
opening_hours_reception : {opening_hours_reception}
guest_house : {guest_house}
addr_city : {addr_city}
contact_instagram : {contact_instagram}
image : {image}
location : {location}
outdoor_seating : {outdoor_seating}
museum : {museum}
takeaway : {takeaway}
smoking : {smoking}
name : {name}
id : {id} """.strip()

In [77]:
def build_context(search_results):
    """Render each record with `entry_template` and concatenate the entries.

    Every rendered entry is followed by a blank line; an empty input yields
    an empty string.
    """
    rendered_entries = (entry_template.format(**doc) + "\n\n" for doc in search_results)
    return "".join(rendered_entries)

In [78]:
def build_prompt(prompt_template, query, context):
    """Fill the template's {question} and {context} slots and trim outer whitespace."""
    filled = prompt_template.format(context=context, question=query)
    return filled.strip()

In [79]:
def openai_llm(prompt):
    """Send `prompt` as a single user message to gpt-4o-mini and return the reply text."""
    # temperature=0.0 is used so that differences between runs come from the
    # model/prompt being compared rather than from sampling noise.
    request = {
        "model": 'gpt-4o-mini',
        "messages": [{"role": "user", "content": prompt}],
        "temperature": 0.0,
    }
    completion = openai_client.chat.completions.create(**request)
    first_choice = completion.choices[0]
    return first_choice.message.content

In [80]:
def filter_rrf_results(results, docs=None):
    """Map search hits back to their full source records, in ranking order.

    Args:
        results: Iterable of hits exposing an ``id`` attribute, ordered from
            best to worst.
        docs: Records (dicts with an ``"id"`` key) to look the hits up in.
            Defaults to the module-level ``documents`` list.

    Returns:
        The records whose id appears in ``results``, ordered like the hits
        (best first) instead of in corpus order, so the ranking is preserved
        in the prompt context. Hits without a matching record are skipped; a
        repeated hit id contributes its records only once.
    """
    if docs is None:
        docs = documents

    # Index the corpus once so each hit is a dictionary lookup, not a list scan.
    docs_by_id = {}
    for doc in docs:
        docs_by_id.setdefault(doc["id"], []).append(doc)

    selected = []
    seen_ids = set()
    for record in results:
        if record.id in seen_ids:
            continue
        seen_ids.add(record.id)
        selected.extend(docs_by_id.get(record.id, []))
    return selected

In [81]:

def rag(llm, query, prompt_template):
    """Answer `query` end to end: retrieve, build the context and prompt, call `llm`.

    `llm` is any callable taking the prompt string and returning the answer.
    """
    hits = rrf_search(query)
    matched_docs = filter_rrf_results(hits)
    prompt = build_prompt(prompt_template, query, build_context(matched_docs))
    return llm(prompt)

In [82]:
from mistralai import Mistral

# Lazily created Mistral client, reused across calls.
_mistral_client = None


def mistral_llm(prompt):
    """Send `prompt` as a single user message to mistral-large-latest and return the reply text.

    The client is constructed on first use and then reused, instead of
    building a new client for every prompt.
    """
    global _mistral_client
    if _mistral_client is None:
        _mistral_client = Mistral(api_key=MISTRAL_API_KEY)
    response = _mistral_client.chat.complete(
        model="mistral-large-latest",
        messages=[{"role": "user", "content": prompt}],
        # temperature 0 keeps sampling noise out of the comparison
        temperature=0
    )
    return response.choices[0].message.content


In [None]:
# Answer-generation backends under comparison: display name -> callable that
# takes a prompt string and returns the model's reply text.
MODELS = {'openai_llm': openai_llm, 'mistral_llm': mistral_llm}

# Prompt variants under comparison. Each template exposes {question} and
# {context} placeholders, which build_prompt fills via str.format.
PROMPTS = {
    "prompt_A": """### Act as a **Kraków travel expert** with access to a comprehensive POI (Points of Interest) database.
Your role is to provide **precise, personalized, and actionable recommendations** for visitors by dynamically retrieving and synthesizing information from the POI database.

---
**Guidelines:**

1. **Always query the POI database** for the most accurate and up-to-date information.

2. **Filter POIs** based on:
   - User preferences (e.g., historical sites, vegan food, family-friendly)
   - Location (proximity to the user’s current area or planned route)
   - Time of day/year (e.g., seasonal attractions, nightlife)
   - Budget (free, mid-range, luxury)

3. **Structure your response** as follows:
   - **Direct Answer:** Start with a concise reply to the user’s question.
   - **POI Details:** Include **name, address, opening hours, price range, and a brief description** for each relevant POI.
   - **Contextual Tips:** Add practical advice (e.g., best time to visit, how to skip lines, nearby (geometry in context) POIs to combine).
   - **Personalization:** Tailor suggestions to the user’s interests, group size, and mobility.

4. **Proactive Suggestions:**
   - If the user mentions a POI, suggest nearby attractions or activities (e.g., *"After visiting Schindler’s Factory, you can walk to the MOCAK Museum or explore Kazimierz."*).
   - For multi-day trips, offer itinerary templates based on POI clusters (e.g., *"Day 1: Old Town + Wawel; Day 2: Kazimierz + Podgórze"*).

5. **Geospatial Queries:**
   - Use POI coordinates to suggest walking routes or clusters.

6. **Check**
   - Use context strictly.
   
   QUESTION: {question}

   CONTEXT: {context}
   
   Answer:""",


    "prompt_B": """### You are a Kraków Travel Assistant specialized in Points of Interest (POIs). Your goal is to provide accurate, actionable, 
    and personalized recommendations using both your knowledge and retrieved documents.

Instructions:

Evaluate retrieved POIs for relevance:

Prioritize attractions that match the user’s interests, duration of stay, and location preferences.

Rank POIs by importance, popularity, uniqueness, or cultural value.

For each recommended POI, include:

Name, location, and historical or cultural significance

Visiting hours, entry fees, and booking tips

Nearby attractions (geometry in context), restaurants, or cafes

Transportation options and accessibility details

Suggested activities or must-see highlights

Presentation style:

Friendly, engaging, and easy-to-follow

Provide concise lists or itineraries when relevant

Clearly indicate when information comes from retrieved sources

Flag any uncertainty or missing information

Personalization:

Tailor recommendations to visitor preferences (e.g., history, food, art, walking tours)

Suggest optimal itineraries based on number of days in Kraków

Always aim to help the user explore Kraków efficiently and enjoyably, focusing on the most relevant and high-value POIs.

QUESTION: {question}

CONTEXT: {context}

Answer:"""
,
'prompt_C': """### You are a Krakow travel assistant and expert guide. 
Your task is to answer the QUESTION based **strictly** on the information provided in the CONTEXT.
Do not use any external knowledge or make assumptions — rely only on the facts from the CONTEXT.

- Be clear and concise.
- Make your answer complete but do not add information not in the CONTEXT.
- If the CONTEXT does not contain the answer, say: "I don't have enough information to answer that."

QUESTION: {question}

CONTEXT: {context}

Answer:"""
}
# Sampling temperature for the experiment.
# NOTE(review): the LLM wrappers and the judge in this notebook pass their own
# literal temperature values rather than reading this constant — confirm
# whether it should be wired in.
TEMPERATURE = 0.0

# --- LLM-judge prompt: asks for one label per criterion, returned as a bare JSON object ---
# The template is filled with str.format ({question}, {context}, {answer}), which is
# why the literal JSON example at the end uses doubled braces.
JUDGE_PROMPT_TEMPLATE = """
You are an evaluator. Your task is to classify the quality of the answer provided by a RAG system.
Return ONLY JSON with labels (no extra text). Use one of the allowed labels for each criterion.

Faithfulness: ["NON_FAITHFUL","PARTLY_FAITHFUL","FAITHFUL"]
Groundedness: ["NON_GROUNDED","PARTLY_GROUNDED","GROUNDED"]
Relevance: ["NON_RELEVANT","PARTLY_RELEVANT","RELEVANT"]
Completeness: ["NON_COMPLETE","PARTLY_COMPLETE","COMPLETE"]
Coherence: ["NON_COHERENT","PARTLY_COHERENT","COHERENT"]
Conciseness: ["NON_CONCISE","PARTLY_CONCISE","CONCISE"]

Question: {question}
Context: {context}
Answer: {answer}

Return JSON exactly like:
{{"faithfulness":"...", "groundedness":"...", "relevance":"...", "completeness":"...", "coherence":"...", "conciseness":"..."}}
"""

def _parse_judge_labels(text):
    """Parse the judge's reply into a Python object.

    Markdown code fences are removed first. The reply is parsed as JSON; if
    that fails, it is parsed as a Python literal (e.g. a dict written with
    single quotes). `ast.literal_eval` is used for the fallback so that model
    output is never executed as code.

    Raises:
        Exception: whatever the literal parser raises (typically ValueError
            or SyntaxError) when the reply is neither JSON nor a literal; the
            offending text is printed first.
    """
    # strip optional markdown fences: ```json ... ```
    cleaned = re.sub(r"```json|```", "", text).strip()

    try:
        return json.loads(cleaned)
    except json.JSONDecodeError:
        # fallback: literal evaluation (no code execution)
        try:
            return ast.literal_eval(cleaned)
        except Exception:
            print("Failed to parse JSON from model response:", repr(cleaned))
            raise


def judge_label(question, context, answer):
    """Ask the LLM judge to label one (question, context, answer) sample.

    Args:
        question: The user question.
        context: The retrieved context that was shown to the answering model.
        answer: The generated answer to evaluate.

    Returns:
        The parsed labels, expected to be a dict keyed by criterion name.
    """
    prompt = JUDGE_PROMPT_TEMPLATE.format(question=question, context=context, answer=answer)
    resp = openai_client.chat.completions.create(
        model="gpt-4o-mini",  # LLM judge
        messages=[{"role":"user","content":prompt}],
        temperature=0.0
    )
    text = resp.choices[0].message.content.strip()
    return _parse_judge_labels(text)

# --- helper: turn judge labels into a numeric quality_score (0-6) ---
# For each criterion, the single label that counts as a point.
POSITIVE_MAPPING = dict(
    faithfulness="FAITHFUL",
    groundedness="GROUNDED",
    relevance="RELEVANT",
    completeness="COMPLETE",
    coherence="COHERENT",
    conciseness="CONCISE",
)


def quality_score_from_labels(labels):
    """Count the criteria whose label equals the top label in POSITIVE_MAPPING.

    Missing criteria and any other label contribute nothing.
    """
    return sum(
        1
        for criterion, best_label in POSITIVE_MAPPING.items()
        if labels.get(criterion) == best_label
    )




In [84]:
# Size of the full ground-truth set, for reference.
print(len(ground_truth))

# Seed the global RNG so the same 100 questions are drawn on every run.
random.seed(42)
ground_truth_sample = random.sample(ground_truth, k=100)

3340


In [None]:

def _cache_file(model_name, prompt_name, record):
    """Return the cache path for one (model, prompt, id, question) evaluation.

    The question text is hashed into the file name because the sample id alone
    is not guaranteed to be unique per question (NOTE(review): confirm against
    the ground-truth CSV); keying on the id only would let two different
    questions share one cache entry.
    """
    digest = hashlib.sha1(str(record["question"]).encode("utf-8")).hexdigest()[:12]
    file_name = f"{model_name}__{prompt_name}__{record['id']}__{digest}.json"
    return os.path.join(OUTPUT_DIR, file_name)


def _load_cached(path, question):
    """Return the cached record at `path` if it exists and belongs to `question`, else None."""
    if not os.path.exists(path):
        return None
    with open(path, "r", encoding="utf-8") as f:
        cached = json.load(f)
    return cached if cached.get("question") == question else None


records = []

for model_name, model in MODELS.items():
    for prompt_name, prompt_template in PROMPTS.items():
        print(f"Running: model={model_name} prompt={prompt_name}")
        for record in tqdm(ground_truth_sample):
            question = record["question"]
            # 1) generate the answer (cached to avoid repeating paid API calls)
            cache_path = _cache_file(model_name, prompt_name, record)
            # Cache files written under the older id-only naming are still
            # honoured, but only when they were produced for this question.
            legacy_path = os.path.join(OUTPUT_DIR, f"{model_name}__{prompt_name}__{record['id']}.json")
            rec = _load_cached(cache_path, question) or _load_cached(legacy_path, question)
            if rec is None:
                search_results = rrf_search(question)
                search_results = filter_rrf_results(search_results)
                context = build_context(search_results)
                prompt = build_prompt(prompt_template, question, context)
                answer = model(prompt)
                # 2) judge the answer and turn the labels into a 0-6 score
                labels = judge_label(question, context, answer)
                score = quality_score_from_labels(labels)
                rec = {
                    "model": model_name,
                    "prompt": prompt_name,
                    "id": record["id"],
                    "question": question,
                    "context": context,
                    "answer": answer,
                    "labels": labels,
                    "quality_score": score
                }
                with open(cache_path, "w", encoding="utf-8") as f:
                    json.dump(rec, f, ensure_ascii=False, indent=2)
            records.append(rec)


Running: model=openai_llm prompt=prompt_A


  7%|▋         | 7/100 [03:10<44:31, 28.73s/it]

In [None]:

# One row per (model, prompt, sample) evaluation; persisted for later analysis.
all_runs_path = os.path.join(OUTPUT_DIR, "all_runs.parquet")
df = pd.DataFrame(records)
df.to_parquet(all_runs_path, index=False)


In [None]:

# ---------- COMPARATIVE ANALYSIS ----------
# 1) distribution of quality_score per (model, prompt)
score_by_config = df.groupby(["model", "prompt"])["quality_score"]
summary = score_by_config.describe()
print(summary)


In [None]:

# 2) distribution of relevance labels per (model, prompt) pair
def _relevance_shares(label_dicts):
    """Relative frequency of each relevance label among the given label dicts."""
    relevance = pd.Series([labels["relevance"] for labels in label_dicts])
    return relevance.value_counts(normalize=True)


rel_dist = df.groupby(["model", "prompt"])["labels"].apply(_relevance_shares)
print(rel_dist)


In [None]:

# 3) Entropy of the label distributions (relevance)
def entropy_for_group(g):
    """Entropy, in bits (base 2), of the relevance labels found in g["labels"]."""
    relevance = pd.Series([labels["relevance"] for labels in g["labels"]])
    shares = relevance.value_counts(normalize=True)
    return entropy(shares, base=2)

# Select only the "labels" column before applying: the function needs nothing
# else, and this avoids the pandas deprecation warning about passing the
# grouping columns into DataFrameGroupBy.apply.
ent = df.groupby(["model","prompt"])[["labels"]].apply(entropy_for_group)
print("Entropy (relevance):\n", ent)


In [None]:

# 4) Bootstrap: compare the mean quality_score of two configurations
def bootstrap_mean_diff(a, b, n_boot=5000, seed=42):
    """Paired bootstrap of the difference in means, mean(a) - mean(b).

    The same resampled indices are applied to `a` and `b`, so the two inputs
    must be aligned element by element (same sample at the same position).

    Args:
        a, b: 1-D array-likes of equal, non-zero length (lists are accepted).
        n_boot: Number of bootstrap resamples.
        seed: Seed for the resampling RNG.

    Returns:
        (observed mean difference, 2.5th percentile, 97.5th percentile) of the
        bootstrap distribution.

    Raises:
        ValueError: if the inputs are not 1-D, differ in length, or are empty.
    """
    a = np.asarray(a, dtype=float)
    b = np.asarray(b, dtype=float)
    if a.ndim != 1 or b.ndim != 1 or len(a) != len(b):
        raise ValueError("a and b must be 1-D sequences of the same length")
    n = len(a)
    if n == 0:
        raise ValueError("a and b must not be empty")

    rng = np.random.RandomState(seed)
    diffs = np.empty(n_boot)
    for i in range(n_boot):
        idx = rng.randint(0, n, n)  # bootstrap indices, shared by both samples
        diffs[i] = np.mean(a[idx]) - np.mean(b[idx])
    lo, hi = np.percentile(diffs, [2.5, 97.5])
    return np.mean(a) - np.mean(b), lo, hi

# example: compare first two combos
combos = list(df.groupby(["model","prompt"]))
if len(combos) >= 2:
    (m1,p1), g1 = combos[0]
    (m2,p2), g2 = combos[1]
    # Pair the two runs sample by sample. The question is part of the join key
    # so that rows sharing an id but asking different questions are never
    # cross-paired.
    merged = pd.merge(g1, g2, on=["id", "question"], suffixes=("_a","_b"))
    if merged.empty:
        print(f"No matched samples between ({m1},{p1}) and ({m2},{p2}); skipping bootstrap.")
    else:
        a = merged["quality_score_a"].values
        b = merged["quality_score_b"].values
        mean_diff, lo, hi = bootstrap_mean_diff(a, b)
        print(f"Mean quality_score diff ({m1},{p1}) - ({m2},{p2}) = {mean_diff:.3f}, 95% CI [{lo:.3f}, {hi:.3f}]")


In [None]:

# Persist the per-configuration aggregates (mean / std / count of quality_score).
agg_path = os.path.join(OUTPUT_DIR, "agg_results.csv")
agg = df.groupby(["model", "prompt"])["quality_score"].agg(["mean", "std", "count"])
agg.to_csv(agg_path)

print(agg)

In [None]:

# Expects `df` with columns: model, prompt, labels
# (labels is a dict with 'faithfulness', 'groundedness', 'relevance', ...).

# Criteria that get a plot; defined once and reused by every loop below.
CRITERIA = ['faithfulness', 'groundedness', 'relevance']

# --- prepare the data for plotting ---
# Unpack the 'labels' column into one row per (sample, criterion).
records_plots = [
    {
        'model': row['model'],
        'prompt': row['prompt'],
        'criterion': crit,
        'label': row['labels'][crit],
    }
    for _, row in df.iterrows()
    for crit in CRITERIA
]

plot_df = pd.DataFrame(records_plots)

# --- bar charts of label counts ---
sns.set(style="whitegrid")

for crit in CRITERIA:
    plt.figure(figsize=(10,5))
    subset = plot_df[plot_df['criterion'] == crit]

    # label counts
    sns.countplot(data=subset, x='label', hue='model', palette='Set2')

    plt.title(f'Distribution of labels for {crit}')
    plt.xlabel('Label')
    plt.ylabel('Count')
    plt.legend(title='Model')
    plt.show()

# --- percentage charts ---
for crit in CRITERIA:
    plt.figure(figsize=(10,5))
    subset = plot_df[plot_df['criterion'] == crit]

    # Share of each label within its (model, prompt) configuration. The totals
    # are computed with transform so the result keeps the original
    # (model, prompt, label) index; a groupby-apply here can prepend the group
    # keys a second time, after which reset_index cannot insert them.
    counts = subset.groupby(['model','prompt','label']).size()
    totals = counts.groupby(level=['model','prompt']).transform('sum')
    pct_df = (100 * counts / totals).reset_index(name='percent')

    sns.barplot(data=pct_df, x='label', y='percent', hue='model', palette='Set2')
    plt.title(f'Percentage distribution of labels for {crit}')
    plt.xlabel('Label')
    plt.ylabel('Percent (%)')
    plt.legend(title='Model')
    plt.show()
