### Create data dictionary

In [1]:
import os
import json
import sys
import numpy as np

In [2]:
data = {}

#### Identify relevant runs

In [3]:
datasets = {"AIDA",
            "cweb",
            "reddit-comments",
            "reddit-posts",
            "shadowlinks-shadow",
            "shadowlinks-tail",
            "shadowlinks-top",
            "tweeki",
            "wned-wiki"}

results_folder_path = "Google/gemini-2.5-flash-lite"
req_dates = ["2025-10-30"]
run_criteria = {"top_k": 20}
split = "test"

In [4]:
def parse_line(line):
    try:
        gold = line.split("---")[0].split("/")[1].strip()
        de_pred = line.split("---")[1].strip()
        llm_pred = line.split("---")[2].strip()

        if llm_pred == "N/A":
            return None, None, None, None

        preds = []
        preds_str = line.split("---")[3].strip()[2:-2]  # Remove leading "[(" and trailing ")]"
        for pred_str in preds_str.split("), ("):
            preds.append((pred_str.split(", ")[0][1:-1].strip(), float(pred_str.split(", ")[1].strip())))

        return gold, de_pred, llm_pred, preds
    except:
        return None, None, None, None

In [5]:
def read_run_predictions(data, dataset, results_folder_path, subfolder, split):
    file_path = os.path.join("Results", dataset, results_folder_path, subfolder, f"predictions_{split}.txt")
    if not os.path.exists(file_path):
        print(f"File not found: {file_path}")
        return

    # Read File line by line
    with open(file_path, 'r') as f:
        data[dataset][subfolder] = {}
        for line in f:
            if not line.startswith('"'):
                continue
            # Parse the line
            gold, de_pred, llm_pred, preds = parse_line(line)

            if gold is None:    # Happens when there was an error parsing the line or the LLM did not make a prediction
                continue
            
            # Get the score (rounded to nearest integer) of the top prediction
            score = int(round(preds[0][1], 0))
            if not score in data[dataset][subfolder]:
                # Total: Number of spans that have this score
                # correct_llm: Number of spans with this score where LLM prediction is correct
                # correct_de: Number of spans with this score where DE prediction is correct
                # gold_in_preds: Number of spans with this score where gold entity is in predictions
                # gold_pred_ranks: List of ranks at which gold entity was found in predictions. Spans where gold entity was not found are not included.
                data[dataset][subfolder][score] = {"total": 0,
                                                   "correct_llm": 0,
                                                   "correct_de": 0,
                                                   "gold_in_preds": 0,
                                                   "gold_pred_ranks": [],
                                                   "gold_top_1_diff_abs": [],
                                                   "gold_top_1_diff_perc": []}

            # Update the counts
            data[dataset][subfolder][score]["total"] += 1
            if llm_pred == gold:
                data[dataset][subfolder][score]["correct_llm"] += 1
            if de_pred == gold:
                data[dataset][subfolder][score]["correct_de"] += 1
            
            # See if gold in preds and get rank
            for i, (pred, pred_score) in enumerate(preds):
                if pred == gold:
                    data[dataset][subfolder][score]["gold_pred_ranks"].append(i + 1)
                    data[dataset][subfolder][score]["gold_in_preds"] += 1
                    data[dataset][subfolder][score]["gold_top_1_diff_abs"].append(abs(pred_score - score))
                    data[dataset][subfolder][score]["gold_top_1_diff_perc"].append((pred_score - score) / score * 100 if score != 0 else 0)
                    break
    
    for score in data[dataset][subfolder]:
        data[dataset][subfolder][score]["gold_preds_rank_avg"] = float(round(np.mean(data[dataset][subfolder][score]["gold_pred_ranks"]) if data[dataset][subfolder][score]["gold_pred_ranks"] else 0.0, 2))
        data[dataset][subfolder][score]["gold_preds_rank_75th"] = float(round(np.percentile(data[dataset][subfolder][score]["gold_pred_ranks"], 90) if data[dataset][subfolder][score]["gold_pred_ranks"] else 0.0, 2))
    
        data[dataset][subfolder][score]["gold_top_1_diff_abs_avg"] = float(round(np.mean(data[dataset][subfolder][score]["gold_top_1_diff_abs"]) if data[dataset][subfolder][score]["gold_top_1_diff_abs"] else 0.0, 2))
        data[dataset][subfolder][score]["gold_top_1_diff_perc_avg"] = float(round(np.mean(data[dataset][subfolder][score]["gold_top_1_diff_perc"]) if data[dataset][subfolder][score]["gold_top_1_diff_perc"] else 0.0, 2))
    # Sort the dictionary by score
    data[dataset][subfolder] = dict(sorted(data[dataset][subfolder].items()))

            



In [6]:
for dataset in datasets:
    if not dataset in data:
        data[dataset] = {}

    # Get names of all Subfolders in dataset folder (Results/<dataset>/<results_folder_path>/)
    subfolders = [f.name for f in os.scandir(f"Results/{dataset}/{results_folder_path}/") if f.is_dir()]
    
    for subfolder in subfolders:
        # Check if subfolder timestamp contains a valid date
        valid = False
        for req_date in req_dates:
            if req_date in subfolder:
                valid = True
                break
        if not valid:
            continue

        # Check if run meets criteria
        run_args = json.load(open(f"Results/{dataset}/{results_folder_path}/{subfolder}/run_parameters.json", "r"))

        valid = True
        for key, value in run_criteria.items():
            if key not in run_args or run_args[key] != value:
                valid = False
                break

        if not valid:
            continue

        # Read Run results
        read_run_predictions(data, dataset, results_folder_path, subfolder, split)


print("Printing statistics")
for i in range(-30, 0, 1):
    print(f"Score: {i}")
    global_avgs = []
    global_75ths = []
    global_gold_top_1_diff_abs_list = []
    global_gold_top_1_diff_perc_list = []
    for dataset in data:
        _avg_rank_list = []
        _75th_rank_list = []
        _gold_top_1_diff_abs_list = []
        _gold_top_1_diff_perc_list = []
        for subfolder in data[dataset]:
            if i in data[dataset][subfolder]:
                _avg_rank_list.append(data[dataset][subfolder][i]["gold_preds_rank_avg"])
                _75th_rank_list.append(data[dataset][subfolder][i]["gold_preds_rank_75th"])
                _gold_top_1_diff_abs_list.append(data[dataset][subfolder][i]["gold_top_1_diff_abs_avg"])
                _gold_top_1_diff_perc_list.append(data[dataset][subfolder][i]["gold_top_1_diff_perc_avg"])
        if len(_avg_rank_list) > 0:
            avg_rank = float(round(np.mean(_avg_rank_list), 2))
            global_avgs.append(avg_rank)
            rank_75th = float(round(np.mean(_75th_rank_list), 2))
            global_75ths.append(rank_75th)
            print(f"{dataset}: Avg Rank: {avg_rank}, 75th Percentile Rank: {rank_75th}")
        
        if len(_gold_top_1_diff_abs_list) > 0:
            gold_top_1_diff_abs = float(round(np.mean(_gold_top_1_diff_abs_list), 2))
            global_gold_top_1_diff_abs_list.append(gold_top_1_diff_abs)
            gold_top_1_diff_rel = float(round(np.mean(_gold_top_1_diff_perc_list), 2))
            global_gold_top_1_diff_perc_list.append(gold_top_1_diff_rel)
            print(f"-      Avg Gold Top-1 Diff Abs: {gold_top_1_diff_abs}, Avg Gold Top-1 Diff Perc: {gold_top_1_diff_rel}")

    if len(global_avgs) > 0:
        print()
        print(f"Avg Global Avg Rank: {float(round(np.mean(global_avgs), 2))}, 75th: {float(round(np.percentile(global_avgs, 75), 2))}, Last: {float(round(np.max(global_avgs), 2))}")
        print(f"Avg Global 75th% Rank: {float(round(np.mean(global_75ths), 2))}, 75th: {float(round(np.percentile(global_75ths, 75), 2))}, Last: {float(round(np.max(global_75ths), 2))}")

    if len(global_gold_top_1_diff_abs_list) > 0:
        print(f"Avg Global Gold Top-1 Diff Abs: {float(round(np.mean(global_gold_top_1_diff_abs_list), 2))}, 75th: {float(round(np.percentile(global_gold_top_1_diff_abs_list, 75), 2))}, Last: {float(round(np.max(global_gold_top_1_diff_abs_list), 2))}")

    if len(global_gold_top_1_diff_perc_list) > 0:
        print(f"Avg Global Gold Top-1 Diff Perc: {float(round(np.mean(global_gold_top_1_diff_perc_list), 2))}, 75th: {float(round(np.percentile(global_gold_top_1_diff_perc_list, 75), 2))}, Last: {float(round(np.max(global_gold_top_1_diff_perc_list), 2))}")

    print()
    print("--------------------------------------------------")
    print()

Printing statistics
Score: -30

--------------------------------------------------

Score: -29

--------------------------------------------------

Score: -28
shadowlinks-top: Avg Rank: 4.0, 75th Percentile Rank: 4.0
-      Avg Gold Top-1 Diff Abs: 0.15, Avg Gold Top-1 Diff Perc: -0.54
tweeki: Avg Rank: 6.0, 75th Percentile Rank: 13.0
-      Avg Gold Top-1 Diff Abs: 0.55, Avg Gold Top-1 Diff Perc: 0.8
reddit-comments: Avg Rank: 9.0, 75th Percentile Rank: 13.8
-      Avg Gold Top-1 Diff Abs: 0.58, Avg Gold Top-1 Diff Perc: 1.13
wned-wiki: Avg Rank: 1.0, 75th Percentile Rank: 1.0
-      Avg Gold Top-1 Diff Abs: 0.31, Avg Gold Top-1 Diff Perc: -1.12
cweb: Avg Rank: 7.0, 75th Percentile Rank: 7.8
-      Avg Gold Top-1 Diff Abs: 0.3, Avg Gold Top-1 Diff Perc: 1.06

Avg Global Avg Rank: 5.4, 75th: 7.0, Last: 9.0
Avg Global 75th% Rank: 7.92, 75th: 13.0, Last: 13.8
Avg Global Gold Top-1 Diff Abs: 0.38, 75th: 0.55, Last: 0.58
Avg Global Gold Top-1 Diff Perc: 0.27, 75th: 1.06, Last: 1.13

------

### Generate graphics