In [15]:
import json
import os
import numpy as np
import pandas as pd

# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
# Substrings used to pick each model's result files out of a task directory.
llms = ['Llama-3.1-8B-Instruct', 'gemma-2-2b-it', 'mistral-instruct']
# Sub-directories of `tasks_root_dir`, one per task.
tasks = ['education', 'therapy', 'chatting']
# At most this many files (first ones in sorted filename order) per task/LLM.
n_files_per_llm = 4

# JSON keys to aggregate, and the column headers they get in the table.
metric_keys = ['P2_prompt_consistency_score', 'P2_index_consistency_score', 'P2_q&a_consistency_score']
rename_map = {
    'P2_prompt_consistency_score': 'line-to-prompt Consistency',
    'P2_index_consistency_score':   'line-to-line Consistency',
    'P2_q&a_consistency_score':     'q&a Consistency'
}

tasks_root_dir = 'results'


def coerce_score(value, key, fname, suffix=""):
    """Return `value` if it is a usable score, otherwise report it and return 0.0.

    A usable score is an int or a finite float.  Booleans are rejected even
    though `bool` is a subclass of `int`, and NaN/inf are rejected because a
    single one would turn the whole mean/std into `nan`.

    `key` and `fname` only feed the diagnostic message; `suffix` is inserted
    after the quoted key (the dict-shaped branch passes " entry").
    """
    is_number = isinstance(value, (int, float)) and not isinstance(value, bool)
    if is_number and isinstance(value, float) and not np.isfinite(value):
        is_number = False
    if not is_number:
        print(f"    • Bad '{key}'{suffix} in {fname}; using 0")
        return 0.0
    return value


def extract_metric_values(data, fname, keys):
    """Collect the scores for each key in `keys` from one parsed JSON document.

    Two layouts are understood:
      * a list of objects, each holding one score per key;
      * a single object mapping each key to a list of scores.

    Returns a dict `{key: [scores...]}`.  `fname` is used for messages only.

    NOTE: in the list layout a key that is absent from an entry is counted as
    0.0 without a message, so a metric that is missing from every entry is
    reported as "0.000 ± 0.000" rather than as missing.
    """
    values = {k: [] for k in keys}

    if isinstance(data, list):
        for entry in data:
            if not isinstance(entry, dict):
                # Without this guard `entry.get` raises AttributeError and
                # aborts the whole run because of one malformed element.
                print(f"    • Non-object entry in {fname}; skipping.")
                continue
            for k in keys:
                values[k].append(coerce_score(entry.get(k, 0.0), k, fname))
    elif isinstance(data, dict):
        for k in keys:
            arr = data.get(k, [])
            if not isinstance(arr, list):
                print(f"    • Invalid format for '{k}' in {fname}; skipping.")
                continue
            values[k].extend(coerce_score(v, k, fname, " entry") for v in arr)
    else:
        print(f"    • Unexpected structure in {fname}; skipping.")

    return values


def format_mean_std(vals):
    """Format `vals` as "mean ± std" with three decimals.

    Uses `np.std` with its default (population) normalisation.  An empty
    sequence yields "0.000 ± 0.000".
    """
    mean = np.mean(vals) if vals else 0.0
    std = np.std(vals) if vals else 0.0
    return f"{mean:.3f} ± {std:.3f}"


def collect_records(root_dir, task_names, llm_names, keys, max_files):
    """Build one summary row per (task, LLM) pair.

    For every task directory under `root_dir`, the `.json` files whose name
    contains the LLM name are sorted, the first `max_files` are read, and the
    scores for each key are pooled across those files before being formatted
    by `format_mean_std`.  Missing task directories and unreadable files are
    reported and skipped.

    Returns a list of dicts with keys 'Task', 'LLM' and every entry of `keys`.
    """
    rows = []
    for task in task_names:
        task_dir = os.path.join(root_dir, task)
        if not os.path.isdir(task_dir):
            print(f"Warning: task directory not found: {task_dir}; skipping.")
            continue

        all_files = [f for f in os.listdir(task_dir) if f.endswith('.json')]

        for llm in llm_names:
            llm_files = sorted(f for f in all_files if llm in f)[:max_files]
            metric_values = {k: [] for k in keys}

            for fname in llm_files:
                path = os.path.join(task_dir, fname)
                try:
                    # `with` closes the handle even when parsing fails.
                    with open(path, encoding='utf-8') as fh:
                        data = json.load(fh)
                except (OSError, ValueError) as e:
                    # ValueError covers json.JSONDecodeError and decode errors.
                    print(f"  • Couldn’t read {fname} ({e}); skipping.")
                    continue

                for k, vals in extract_metric_values(data, fname, keys).items():
                    metric_values[k].extend(vals)

            rec = {'Task': task, 'LLM': llm}
            for k in keys:
                rec[k] = format_mean_std(metric_values[k])
            rows.append(rec)
    return rows


# In a notebook or when run as a script `__name__` is "__main__", so the
# statements below execute exactly as before and `records`, `df`, `table` and
# `latex` remain module-level names.
if __name__ == "__main__":
    records = collect_records(tasks_root_dir, tasks, llms, metric_keys, n_files_per_llm)

    # Explicit columns so `set_index` does not raise KeyError when no task
    # directory was found and `records` is empty.
    df = pd.DataFrame(records, columns=['Task', 'LLM', *metric_keys])
    table = df.set_index(['Task', 'LLM'])[metric_keys].rename(columns=rename_map)

    # escape=True: the "q&a Consistency" header contains '&', which LaTeX
    # would otherwise read as an extra column separator.  The caption is
    # written out as given, so its '&' is escaped by hand.
    latex = table.to_latex(
        escape=True,
        caption=r"All Metrics (Prompt \& Index Consistency) by Task and LLM (Mean ± Std)",
        label="tab:all_metrics"
    )

    # The table contains '±'; write it as UTF-8 regardless of the locale.
    with open('all_llm_metrics.tex', 'w', encoding='utf-8') as f:
        f.write(latex)

    print(latex)
    print("\nLaTeX table written to all_llm_metrics.tex")


\begin{table}
\caption{All Metrics (Prompt & Index Consistency) by Task and LLM (Mean ± Std)}
\label{tab:all_metrics}
\begin{tabular}{lllll}
\toprule
 &  & line-to-prompt Consistency & line-to-line Consistency & q&a Consistency \\
Task & LLM &  &  &  \\
\midrule
\multirow[t]{3}{*}{education} & Llama-3.1-8B-Instruct & 0.824 ± 0.132 & 0.800 ± 0.148 & 0.000 ± 0.000 \\
 & gemma-2-2b-it & 0.511 ± 0.250 & 0.928 ± 0.092 & 0.000 ± 0.000 \\
 & mistral-instruct & 0.728 ± 0.191 & 0.975 ± 0.063 & 0.000 ± 0.000 \\
\cline{1-5}
\multirow[t]{3}{*}{therapy} & Llama-3.1-8B-Instruct & 0.740 ± 0.163 & 0.711 ± 0.158 & 0.000 ± 0.000 \\
 & gemma-2-2b-it & 0.665 ± 0.247 & 0.984 ± 0.040 & 0.000 ± 0.000 \\
 & mistral-instruct & 0.833 ± 0.142 & 0.959 ± 0.079 & 0.000 ± 0.000 \\
\cline{1-5}
\multirow[t]{3}{*}{chatting} & Llama-3.1-8B-Instruct & 0.619 ± 0.249 & 0.992 ± 0.025 & 0.000 ± 0.000 \\
 & gemma-2-2b-it & 0.871 ± 0.230 & 0.900 ± 0.123 & 0.000 ± 0.000 \\
 & mistral-instruct & 0.955 ± 0.097 & 0.984 ± 0.038 & 0