# Model Comparison Report by Experiment Type

This report compares the performance of all models found under the `data/predictions` directory, grouped by experiment type (rag, zeroshot, fewshot, cot). For each experiment type, tables and line graphs show the metrics (correct_instances, total_instances, comet_score, meta_score) per language for every model and experiment variant.

In [27]:
import os
import json
import pandas as pd
import matplotlib.pyplot as plt

# Root directory that holds one sub-directory per model.
base_dir = "./data/predictions"

# Every sub-directory of base_dir is treated as a model. The names are sorted
# because os.listdir returns entries in arbitrary order, which would otherwise
# make the collected results depend on the file system. A missing base_dir
# produces an empty model list and a visible notice instead of a bare
# FileNotFoundError.
if os.path.isdir(base_dir):
    models = sorted(
        entry
        for entry in os.listdir(base_dir)
        if os.path.isdir(os.path.join(base_dir, entry))
    )
else:
    print(f"Warning: predictions directory not found: {base_dir}")
    models = []

# Experiment type -> names of the experiment folders that belong to that type.
experiment_map = {
    "rag": ["rag-wikidata", "rag-wikidata-entity-type-matching", "one_shot_rag_wikidata", "zero_shot_rag_wikidata"],
    "zeroshot": ["zero-shot", "zero_shot", "zero-shot-1", "zero-shot-2", "zero_shot_longer_prompt"],
    "fewshot": ["few-shot", "few(3)_shot", "few(3)_shot_longer_prompt", "one_shot_longer_prompt"],
    "cot": ["cot"],
}

# Gather scores as {experiment type: {model: {language: metrics}}}.
# A model gets an entry for an experiment type as soon as at least one of the
# type's folders has a "scores" directory. All folders of one type write into
# the same per-language dict, so when the same language file exists in several
# folders the folder processed last overwrites the earlier ones.
results = {}
for exp_group, exp_folders in experiment_map.items():
    group_scores = {}
    results[exp_group] = group_scores
    for model in models:
        validation_dir = os.path.join(base_dir, model, "validation")
        candidate_dirs = [
            os.path.join(validation_dir, folder, "scores") for folder in exp_folders
        ]
        score_dirs = [path for path in candidate_dirs if os.path.isdir(path)]
        if not score_dirs:
            # None of this type's folders exist for the model: no entry at all.
            continue
        per_lang = {}
        group_scores[model] = per_lang
        for score_dir in score_dirs:
            for json_name in os.listdir(score_dir):
                if not json_name.endswith(".json"):
                    continue
                json_path = os.path.join(score_dir, json_name)
                with open(json_path, "r", encoding="utf-8") as handle:
                    payload = json.load(handle)
                # The language code is the file name without ".json".
                per_lang[json_name.replace(".json", "")] = {
                    "correct_instances": payload.get("correct_instances", 0),
                    "total_instances": payload.get("total_instances", 0),
                    "comet_score": payload.get("comet_score", None),
                    "meta_score": payload.get("meta_score", None),
                }

In [None]:
# Gather scores as
# {experiment type: {model: {experiment folder: {language: metrics}}}}.
# Only experiment folders that contain a "scores" directory get an entry, and
# a model appears under an experiment type only if it has such a folder.
metric_defaults = (
    ("correct_instances", 0),
    ("total_instances", 0),
    ("comet_score", None),
    ("meta_score", None),
)
results = {}
for exp_group, exp_folders in experiment_map.items():
    group_scores = {}
    results[exp_group] = group_scores
    for model in models:
        validation_dir = os.path.join(base_dir, model, "validation")
        for exp_folder in exp_folders:
            scores_dir = os.path.join(validation_dir, exp_folder, "scores")
            if os.path.isdir(scores_dir):
                per_lang = group_scores.setdefault(model, {}).setdefault(exp_folder, {})
                for json_name in os.listdir(scores_dir):
                    if not json_name.endswith(".json"):
                        continue
                    json_path = os.path.join(scores_dir, json_name)
                    with open(json_path, "r", encoding="utf-8") as handle:
                        payload = json.load(handle)
                    # The language code is the file name without ".json";
                    # metrics absent from the file fall back to the defaults.
                    per_lang[json_name.replace(".json", "")] = {
                        key: payload.get(key, fallback)
                        for key, fallback in metric_defaults
                    }

In [None]:
# For each experiment type, print one table and draw one line chart per metric
# group: correct/total instances, COMET score and meta score. Every
# (model, experiment) pair is a table row and a chart series; every language
# is a table column and an x-axis position. Charts are saved as PNG files
# under "images/" and then shown.


def _metric_by_lang(lang_metrics, metric_key):
    """Return {language: value} for the languages whose metric_key is not None."""
    return {
        lang: metrics[metric_key]
        for lang, metrics in lang_metrics.items()
        if metrics.get(metric_key) is not None
    }


def _metric_rows(exp_data, all_model_exp, all_langs, metric_key):
    """Build one table row per (model, experiment), with "NA" for missing values."""
    rows = []
    for model, exp_name in all_model_exp:
        present = _metric_by_lang(exp_data[model][exp_name], metric_key)
        row = {"Model": f"{model} ({exp_name})"}
        for lang in all_langs:
            row[lang] = present.get(lang, "NA")
        rows.append(row)
    return rows


def _plot_series(all_langs, values_by_lang, **plot_kwargs):
    """Plot one series, skipping languages that have no value.

    Points are placed at the language's index in all_langs rather than at the
    language string: a categorical string axis orders categories by first
    appearance, so series covering different language subsets would scramble
    the x-axis order and make lines zig-zag.
    """
    positions = [i for i, lang in enumerate(all_langs) if lang in values_by_lang]
    values = [values_by_lang[all_langs[i]] for i in positions]
    plt.plot(positions, values, **plot_kwargs)


def _finish_chart(chart_title, y_label, all_langs):
    """Label the current figure, save it as a PNG under "images/" and show it."""
    plt.title(chart_title)
    plt.ylabel(y_label)
    plt.xlabel("Language")
    # One tick per language, matching the integer positions used by _plot_series.
    plt.xticks(list(range(len(all_langs))), all_langs, rotation=45, ha='right')
    plt.legend()
    plt.tight_layout()
    filename = chart_title.lower().replace(' ', '-').replace('/', '-').replace(':', '') + '.png'
    # savefig does not create missing directories.
    os.makedirs('images', exist_ok=True)
    plt.savefig(os.path.join('images', filename), bbox_inches='tight')
    plt.show()


for exp_group in results:
    display_name = exp_group.capitalize()
    print(f"\n## {display_name} Experiment")
    exp_data = results[exp_group]

    # All (model, experiment) pairs and the union of their languages, sorted
    # so rows, series and columns come out in a stable order.
    all_model_exp = sorted(
        (model, exp_name) for model in exp_data for exp_name in exp_data[model]
    )
    all_langs = sorted(
        {lang for model, exp_name in all_model_exp for lang in exp_data[model][exp_name]}
    )

    # --- Correct / total instances ---
    # The total for a language is taken from the first (model, experiment)
    # pair, in sorted order, that reports a non-None total for it.
    totals_by_lang = {}
    for lang in all_langs:
        for model, exp_name in all_model_exp:
            total = exp_data[model][exp_name].get(lang, {}).get("total_instances", None)
            if total is not None:
                totals_by_lang[lang] = total
                break
    total_instances_row = {"Model": "Total Instances"}
    for lang in all_langs:
        total_instances_row[lang] = totals_by_lang.get(lang, "NA")

    correct_rows = _metric_rows(exp_data, all_model_exp, all_langs, "correct_instances")
    merged_df = pd.DataFrame([total_instances_row] + correct_rows)
    print("\n### Correct / Total Instances Table")
    display(merged_df)

    plt.figure(figsize=(12, 6))
    for model, exp_name in all_model_exp:
        _plot_series(
            all_langs,
            _metric_by_lang(exp_data[model][exp_name], "correct_instances"),
            marker='o',
            label=f"{model} ({exp_name}) (correct)",
        )
    # Reference line: total instances, only for languages where it is known.
    _plot_series(
        all_langs,
        totals_by_lang,
        linestyle='--',
        marker='x',
        color='black',
        label='Total Instances',
    )
    _finish_chart(
        f"{display_name} Experiment: Correct / Total Instances per Language and Model",
        "Instances",
        all_langs,
    )

    # --- Score metrics: one table and one chart each ---
    for metric_key, metric_label in [("comet_score", "COMET Score"), ("meta_score", "Meta Score")]:
        metric_df = pd.DataFrame(_metric_rows(exp_data, all_model_exp, all_langs, metric_key))
        print(f"\n### {metric_label} Table")
        display(metric_df)

        plt.figure(figsize=(12, 6))
        for model, exp_name in all_model_exp:
            _plot_series(
                all_langs,
                _metric_by_lang(exp_data[model][exp_name], metric_key),
                marker='o',
                label=f"{model} ({exp_name})",
            )
        _finish_chart(
            f"{display_name} Experiment: {metric_label} per Language and Model",
            metric_label,
            all_langs,
        )