In [38]:
# for each json file in output/results_q2e, do the following stuff
# take average for all values in the json file
# use the name of the json file (without json) as the name for the evaluation metrics
# in a csv file, the first column is the model name, for this row is q2e, the remaining is the evaluation metrics
import os
import json
import csv
import numpy as np
import pandas as pd
domain = "travel_dest"
embedding = "all-MiniLM-L6-v2"
# embedding = "msmarco-distilbert-base-tas-b"
output_csv = f"output/{domain}/evaluation_results.csv"
# Dictionary to store results in {model: {metric1: score1, metric2: score2, ...}} format
results_dict = {}
# Collect all metric names across models
metric_set = set()
for dir in os.listdir(f"output/{domain}"):
    #test whether dir is a directory
    if not os.path.isdir(f"output/{domain}/{dir}"):
        continue
    for file in os.listdir(f"output/{domain}/{dir}"):
        if embedding not in file:
            continue
        results_dir = os.path.join(f"output/{domain}", dir, file, f"results_{dir}")
        if not os.path.isdir(results_dir):
            continue  # Skip if not a valid results directory
        model_metrics = {}
        for file in os.listdir(results_dir):
            if file.endswith(".json"):
                with open(os.path.join(results_dir, file), "r") as f_json:
                    data = json.load(f_json)
                    metric_name = file.split(".")[0]
                    average_score = np.mean(list(data.values()))  # Compute mean score
                    model_metrics[metric_name] = average_score
                    metric_set.add(metric_name)  # Store all unique metric names
        results_dict[dir] = model_metrics  # Store the model's metrics
# Ensure all models have the same metric columns
metric_list = sorted(metric_set)  # Sort metrics for consistent column order
# Write to CSV
with open(output_csv, "w", newline="") as f:
    writer = csv.writer(f)
    # Write header: model name + all metric names
    writer.writerow(["model"] + metric_list)
    # Write rows for each model
    for model, metrics in results_dict.items():
        row = [model] + [metrics.get(metric, "N/A") for metric in metric_list]  # Fill missing values with "N/A"
        writer.writerow(row)
df = pd.read_csv(output_csv)
df

Unnamed: 0,model,map_at10,map_at100,map_at30,map_at50,recall_at10,recall_at100,recall_at30,recall_at50,rprecision
0,eqr_10,0.687638,,0.620278,0.577103,0.09386,,0.225422,0.314697,0.391402
1,eqr_12,0.723609,,0.640437,0.594826,0.094927,,0.22885,0.319741,0.394312
2,eqr_15,0.712255,,0.636847,0.596476,0.092833,,0.228949,0.321003,0.400211
3,eqr_5,0.716768,,0.632836,0.593324,0.093041,,0.231076,0.319396,0.394187
4,eqr_8,0.730383,,0.640527,0.591337,0.095725,,0.226652,0.314055,0.389835
5,gqr,0.55915,,0.504337,0.469693,0.073195,,0.180866,0.259065,0.336732
6,none,0.527457,0.391093,0.472738,0.435117,0.066663,0.391625,0.160803,0.24294,0.312361
7,q2d,0.744667,,0.639104,0.585436,0.094013,,0.21007,0.298666,0.374102
8,q2e,0.626822,,0.552426,0.516304,0.077318,,0.200628,0.2879,0.368814


In [35]:
import os
import json
import csv
import numpy as np
import pandas as pd

domain1 = "restaurant_nol"
domain2 = "restaurant_phi"
# embedding = "msmarco-distilbert-base-tas-b"
embedding = "all-MiniLM-L6-v2"
output_csv = "output/combined_evaluation_results.csv"

def process_domain(domain):
    results_dict = {}
    metric_set = set()
    
    for dir in os.listdir(f"output/{domain}"):
        if not os.path.isdir(f"output/{domain}/{dir}"):
            continue
            
        for file in os.listdir(f"output/{domain}/{dir}"):
            if embedding not in file:
                continue
                
            results_dir = os.path.join(f"output/{domain}", dir, file, f"results_{dir}")
            if not os.path.isdir(results_dir):
                continue
                
            model_metrics = {}
            for result_file in os.listdir(results_dir):
                if result_file.endswith(".json"):
                    with open(os.path.join(results_dir, result_file), "r") as f_json:
                        data = json.load(f_json)
                        metric_name = result_file.split(".")[0]
                        average_score = np.mean(list(data.values()))
                        model_metrics[metric_name] = average_score
                        metric_set.add(metric_name)
                        
            results_dict[dir] = model_metrics
            
    return results_dict, metric_set

# Process both domains
results_dict1, metric_set1 = process_domain(domain1)
results_dict2, metric_set2 = process_domain(domain2)

# Combine metric sets
all_metrics = sorted(metric_set1.union(metric_set2))

# Compute averages across domains
combined_results = {}
for model in set(results_dict1.keys()).union(set(results_dict2.keys())):
    combined_metrics = {}
    for metric in all_metrics:
        scores = []
        if model in results_dict1 and metric in results_dict1[model]:
            scores.append(results_dict1[model][metric])
        if model in results_dict2 and metric in results_dict2[model]:
            scores.append(results_dict2[model][metric])
        if scores:
            combined_metrics[metric] = np.mean(scores)
    combined_results[model] = combined_metrics

# Write results to CSV
with open(output_csv, "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["model"] + all_metrics)
    
    for model, metrics in combined_results.items():
        row = [model] + [metrics.get(metric, "N/A") for metric in all_metrics]
        writer.writerow(row)

# Display results
df = pd.read_csv(output_csv)
df

Unnamed: 0,model,map_at10,map_at100,map_at15,map_at30,map_at50,map_at8,recall_at10,recall_at100,recall_at15,recall_at30,recall_at50,recall_at8,rprecision
0,q2d,0.794448,,0.697337,,0.70243,0.828674,0.11089,,0.030832,,0.251341,0.141224,0.455544
1,eqr_12,0.816286,,0.758718,,0.721566,0.820306,0.105569,,0.03864,,0.263048,0.130252,0.467052
2,gqr,0.793684,,0.703991,,0.690098,0.808852,0.108649,,0.031228,,0.258378,0.134931,0.450394
3,eqr_8,0.807628,,0.727038,,0.720348,0.817229,0.107895,,0.038792,,0.263006,0.123371,0.467882
4,eqr_10,0.821871,,0.769292,,0.723962,0.823489,0.107522,,0.038793,,0.262524,0.123081,0.466703
5,none,0.77123,0.63851,,0.709995,0.679858,,0.088044,0.275184,,0.152758,0.196165,,0.451708
6,eqr_15,0.811064,,0.776885,,0.722267,0.80892,0.106851,,0.039595,,0.260345,0.120181,0.469134
7,q2e,0.850059,,0.79057,,0.756967,0.860368,0.11384,,0.037733,,0.267499,0.142178,0.488384
8,eqr_5,0.815494,,0.726751,,0.723623,0.828075,0.107554,,0.038166,,0.261106,0.12401,0.468506


In [29]:
import os
import json
from collections import defaultdict
import scipy.stats as stats
# List of domains and result directories
# List of domains and result directories
domains = [
    "hotel_nyc", "hotel_nyc_2", "hotel_nyc_3",
    "hotel_beijing", "hotel_beijing_1", "hotel_beijing_2"
]
methods = ["eqr_10/results_eqr_10", "q2d/results_q2d", "q2e/results_q2e"]
# Initialize dictionaries to collect results
eqr_final = defaultdict(lambda: defaultdict(list))
q2d_final = defaultdict(lambda: defaultdict(list))
q2e_final = defaultdict(lambda: defaultdict(list))
# Helper function to load JSON data and append scores to the respective dictionaries
def load_scores(directory, file, metric_name, result_dict):
    file_path = os.path.join(directory, file)
    if os.path.isfile(file_path):  # Ensure the file exists before attempting to load it
        with open(file_path, "r") as f_json:
            data = json.load(f_json)
            for q, score in data.items():
                result_dict[q][metric_name].append(score)
# Iterate through all domains and methods to load results
for domain in domains:
    for method in methods:
        result_dir = f"output/{domain}/{method}"
        if os.path.exists(result_dir):  # Ensure the directory exists
            files = os.listdir(result_dir)
            for file in files:
                metric_name = file.split(".")[0]
                if "eqr" in method:
                    load_scores(result_dir, file, metric_name, eqr_final)
                elif "q2d" in method:
                    load_scores(result_dir, file, metric_name, q2d_final)
                elif "q2e" in method:
                    load_scores(result_dir, file, metric_name, q2e_final)
# Identify queries with statistically significant higher scores in EQR across all metrics
significant_queries = []
alpha = 0.32
required_significant_metrics = 3
for query, eqr_scores in eqr_final.items():
    if query in q2d_final and query in q2e_final:
        significant_metric_count = 0
        for metric, eqr_metric_scores in eqr_scores.items():
            q2d_metric_scores = q2d_final[query][metric]
            q2e_metric_scores = q2e_final[query][metric]
            # Perform paired t-tests for EQR vs Q2D and EQR vs Q2E
            t_stat_q2d, p_value_q2d = stats.ttest_rel(eqr_metric_scores, q2d_metric_scores)
            t_stat_q2e, p_value_q2e = stats.ttest_rel(eqr_metric_scores, q2e_metric_scores)
            # Check if EQR outperforms both Q2D and Q2E
            if p_value_q2d < alpha and p_value_q2e < alpha:
                significant_metric_count += 1
                if significant_metric_count >= required_significant_metrics:
                    significant_queries.append(query)
                    break

print(len(significant_queries))
for query in significant_queries:
    print(query)


51
Where can I find comfort and a chance to immerse myself in new surroundings?
I'm thinking of a quiet refuge that's not too far from interesting experiences.
What's a good place for someone who enjoys both downtime and a bit of excitement?
I'm looking for an inviting spot that can adapt to my changing itinerary.
Where do people go if they're seeking simplicity with a dash of adventure?
I'd like an environment that balances restfulness with discovery—ideas?
Is there a place that feels like a retreat but doesn't isolate me from the scene?
Any guidance on a spot that welcomes visitors of varied interests?
I want to keep my options open—any ideas for a hub that accommodates that?
Seeking a practical landing zone for indefinite plans—any recommendations?
I hope to find a place where I can unplug without being cut off—ideas?
Where might I land if I'm not sure what I'll do but want the option to do plenty?
I need a spot that doesn't lock me into a single type of experience—any leads?
Sugges

In [10]:
# for each json file in output/results_q2e, do the following stuff
# take average for all values in the json file
# use the name of the json file (without json) as the name for the evaluation metrics
# in a csv file, the first column is the model name, for this row is q2e, the remaining is the evaluation metrics
import os
import json
import csv
import numpy as np
import pandas as pd
domain = "restaurant_phi"
output_csv = f"output/{domain}/evaluation_results.csv"
# Dictionary to store results in {model: {metric1: score1, metric2: score2, ...}} format
results_dict = {}
# Collect all metric names across models
metric_set = set()
for dir in os.listdir(f"output/{domain}/none"):
    results_dir = os.path.join(f"output/{domain}/none", dir, f"results_none")
    if not os.path.isdir(results_dir):
        continue  # Skip if not a valid results directory
    model_metrics = {}
    for file in os.listdir(results_dir):
        if file.endswith(".json"):
            with open(os.path.join(results_dir, file), "r") as f_json:
                data = json.load(f_json)
                metric_name = file.split(".")[0]
                average_score = np.mean(list(data.values()))  # Compute mean score
                model_metrics[metric_name] = average_score
                metric_set.add(metric_name)  # Store all unique metric names
    results_dict[dir] = model_metrics  # Store the model's metrics
# Ensure all models have the same metric columns
metric_list = sorted(metric_set)  # Sort metrics for consistent column order
# Write to CSV
with open(output_csv, "w", newline="") as f:
    writer = csv.writer(f)
    # Write header: model name + all metric names
    writer.writerow(["model"] + metric_list)
    # Write rows for each model
    for model, metrics in results_dict.items():
        row = [model] + [metrics.get(metric, "N/A") for metric in metric_list]  # Fill missing values with "N/A"
        writer.writerow(row)
df = pd.read_csv(output_csv)
df

Unnamed: 0,model,map_at10,map_at100,map_at30,map_at50,recall_at10,recall_at100,recall_at30,recall_at50,rprecision
0,all-MiniLM-L6-v2_10,0.762361,0.622077,0.68092,0.656294,0.023162,0.169152,0.063038,0.098394,0.426198
1,all-MiniLM-L6-v2_11,0.772771,0.62305,0.686612,0.657263,0.022854,0.166886,0.062033,0.097401,0.423949
2,all-MiniLM-L6-v2_12,0.767042,0.622892,0.687769,0.656643,0.022635,0.165723,0.059166,0.097637,0.420288
3,all-MiniLM-L6-v2_13,0.764303,0.621236,0.684265,0.657267,0.022663,0.165642,0.059869,0.096532,0.419324
4,all-MiniLM-L6-v2_14,0.76311,0.619934,0.682454,0.657458,0.022885,0.166077,0.061673,0.095164,0.417825
5,all-MiniLM-L6-v2_15,0.763751,0.619435,0.682463,0.65744,0.023082,0.166507,0.061361,0.094338,0.416071
6,all-MiniLM-L6-v2_16,0.760818,0.618351,0.680895,0.656181,0.021924,0.16414,0.061065,0.09394,0.412341
7,all-MiniLM-L6-v2_17,0.769629,0.617059,0.679022,0.655197,0.0216,0.163596,0.061962,0.093849,0.412945
8,all-MiniLM-L6-v2_18,0.769052,0.615332,0.677254,0.653269,0.021373,0.163214,0.061702,0.093036,0.411509
9,all-MiniLM-L6-v2_19,0.766175,0.613107,0.673572,0.65051,0.02111,0.162156,0.061919,0.092536,0.409819


In [15]:
for domain in os.listdir(f"data"):
    print(f'analysis for {domain}')
    for file in os.listdir(f"data/{domain}"):
        if file == "ground_truth.json":
            with open(f"data/{domain}/{file}", "r") as f:
                data = json.load(f)
                gt_queries = set(data.keys())
        elif file == "queries.txt":
            with open(f"data/{domain}/{file}", "r") as f:
                queries = set(line.strip() for line in f)
    print(f'ground truth queries: {len(gt_queries)}')
    print(f'queries: {len(queries)}')
    print(f'difference: {len(queries - gt_queries)}')




analysis for hotel_beijing
ground truth queries: 100
queries: 51
difference: 0
analysis for hotel_nyc
ground truth queries: 100
queries: 51
difference: 0
analysis for restaurant_nol
ground truth queries: 91
queries: 50
difference: 0
analysis for restaurant_phi
ground truth queries: 98
queries: 50
difference: 50
analysis for travel_dest
ground truth queries: 50
queries: 50
difference: 0
