In [11]:
import json
import os
import numpy as np
import pandas as pd

# --- Configuration: adjust these to your setup ---
# Model identifiers; a result file belongs to a model when its file name
# contains the identifier as a substring.
llms = [
    'Llama-3.1-8B-Instruct',
    'gemma-2-2b-it',
    'mistral-instruct',
]
# One subfolder of the results root is expected per task.
tasks = [
    'education',
    'therapy',
    'chatting',
]
# How many result files are averaged for each (task, LLM) pair.
n_files_per_llm = 4

# Original JSON keys mapped to their desired display names. Insertion order
# here fixes the column order of the final table.
rename_map = {
    'P2_prompt_consistency_score': 'line-to-prompt Consistency',
    'P2_index_consistency_score': 'line-to-line consistency',
    'P2_q&a_consistency_score': 'q&a consistency',
}
# The metric keys are exactly the keys of the rename map, in the same order.
metric_keys = list(rename_map)

# Base directory containing one subfolder per task
tasks_root_dir = 'results'
def _load_json(path):
    """Return the parsed JSON document stored at *path*.

    The file is opened in a ``with`` block so the handle is closed even
    when parsing fails.

    Raises:
        OSError: the file cannot be opened or read.
        ValueError: the content is not valid JSON or not valid UTF-8
            (``json.JSONDecodeError`` and ``UnicodeDecodeError`` are both
            subclasses of ``ValueError``).
    """
    with open(path, encoding='utf-8') as fh:
        return json.load(fh)


def _per_file_means(data, fname, keys):
    """Return ``{key: mean score}`` for one parsed result file.

    Two layouts are recognised:

    * a list of dicts, each carrying one scalar per metric key;
    * a single dict mapping each metric key to a list of scores.

    Anything missing or malformed is reported on stdout and counted as 0.0,
    so one bad file never aborts the whole aggregation.

    Args:
        data: the object returned by ``json.load`` for the file.
        fname: file name, used only in the warning messages.
        keys: metric keys to extract.
    """
    if isinstance(data, list):
        means = {}
        for k in keys:
            vals = []
            for i, entry in enumerate(data):
                # A non-dict element has no ``.get``; route it through the
                # same "bad value" path instead of raising AttributeError.
                v = entry.get(k, 0.0) if isinstance(entry, dict) else None
                if not isinstance(v, (int, float)):
                    print(f"    • Bad '{k}' in element {i} of {fname}; using 0")
                    v = 0.0
                vals.append(v)
            means[k] = np.mean(vals) if vals else 0.0
        return means

    if isinstance(data, dict):
        means = {}
        for k in keys:
            arr = data.get(k, [])
            mean = None
            if isinstance(arr, list) and arr:
                try:
                    mean = np.mean(arr)
                except (TypeError, ValueError):
                    # Non-numeric or ragged content: fall back to zero below.
                    mean = None
            if mean is None:
                print(f"    • Missing/invalid '{k}' in {fname}; using 0")
                mean = 0.0
            means[k] = mean
        return means

    print(f"    • Unexpected structure in {fname}; padding zeros.")
    return dict.fromkeys(keys, 0.0)


# One record per (task, LLM) pair holding the mean of the per-file means.
records = []
for task in tasks:
    task_dir = os.path.join(tasks_root_dir, task)
    if not os.path.isdir(task_dir):
        print(f"Warning: task directory not found: {task_dir}; skipping.")
        continue

    # scan for JSON files once per task
    all_files = [f for f in os.listdir(task_dir) if f.endswith('.json')]
    for llm in llms:
        # Sorting makes the choice of the first n files deterministic.
        llm_files = sorted(f for f in all_files if llm in f)[:n_files_per_llm]
        metric_file_means = {k: [] for k in metric_keys}

        for fname in llm_files:
            path = os.path.join(task_dir, fname)
            try:
                data = _load_json(path)
            except (OSError, ValueError) as e:
                print(f"  • Couldn’t read {fname} ({e}); using zeros.")
                file_means = dict.fromkeys(metric_keys, 0.0)
            else:
                file_means = _per_file_means(data, fname, metric_keys)
            for k in metric_keys:
                metric_file_means[k].append(file_means[k])

        # pad if fewer runs
        # NOTE(review): padding with 0.0 pulls the average down, so a 0.000
        # in the final table can mean "no files found" rather than a real
        # score of zero. Confirm this is the intended treatment.
        for k in metric_keys:
            missing = n_files_per_llm - len(metric_file_means[k])
            if missing > 0:
                metric_file_means[k].extend([0.0] * missing)

        rec = {'Task': task, 'LLM': llm}
        for k in metric_keys:
            rec[k] = np.mean(metric_file_means[k])
        records.append(rec)

# Build DataFrame with multi-index and three metric columns.
# Passing the columns explicitly keeps set_index from raising KeyError when
# no records were collected (e.g. every task directory was missing).
df = pd.DataFrame(records, columns=['Task', 'LLM', *metric_keys])
table = df.set_index(['Task', 'LLM'])[metric_keys]
table = table.rename(columns=rename_map)

# Export to LaTeX.
# '&' is the column separator inside a tabular, so it has to be escaped both
# in the headers/cells (escape=True turns 'q&a' into 'q\&a') and in the
# caption, which is written by hand as '\&' here.
latex = table.to_latex(
    float_format="%.3f",
    escape=True,
    caption=r"All Metrics (Prompt \& Index Consistency) by Task and LLM",
    label="tab:all_metrics"
)
with open('all_llm_metrics.tex', 'w', encoding='utf-8') as f:
    f.write(latex)

# Print or display
print(latex)
print(table)
print("\nLaTeX table written to all_llm_metrics.tex")

\begin{table}
\caption{All Metrics (Prompt & Index Consistency) by Task and LLM}
\label{tab:all_metrics}
\begin{tabular}{llrrr}
\toprule
 &  & line-to-prompt Consistency & line-to-line consistency & q&a consistency \\
Task & LLM &  &  &  \\
\midrule
\multirow[t]{3}{*}{education} & Llama-3.1-8B-Instruct & 0.824 & 0.800 & 0.000 \\
 & gemma-2-2b-it & 0.511 & 0.928 & 0.000 \\
 & mistral-instruct & 0.728 & 0.975 & 0.000 \\
\cline{1-5}
\multirow[t]{3}{*}{therapy} & Llama-3.1-8B-Instruct & 0.566 & 0.496 & 0.000 \\
 & gemma-2-2b-it & 0.000 & 0.000 & 0.000 \\
 & mistral-instruct & 0.000 & 0.000 & 0.000 \\
\cline{1-5}
\multirow[t]{3}{*}{chatting} & Llama-3.1-8B-Instruct & 0.155 & 0.248 & 0.000 \\
 & gemma-2-2b-it & 0.871 & 0.900 & 0.000 \\
 & mistral-instruct & 0.955 & 0.984 & 0.000 \\
\cline{1-5}
\bottomrule
\end{tabular}
\end{table}

                                 line-to-prompt Consistency  \
Task      LLM                                                 
education Llama-3.1-8B-Instruct     