### Import evaluation dataset and dependencies

`records` is a list of dictionaries. Each dictionary specifies the original language of the article, metadata such as the title and abstract, and a list of keywords. Each keyword is itself a dictionary that specifies the label, the `wikidata_url`, and the match. This information is present even if the keyword is not in the original language of the article.

In [None]:
from src.utils import parse_excel_file, compute_precision, compute_recall
from src.clients import OpenAIClient, AnthropicClient, GroqClient, OpenAIWebSearchClient
from src.pipelines import EntityExtractionPipeline
import os
import uuid
from datetime import datetime
import json

In [None]:
# Parse the evaluation workbook into the list of record dictionaries.
total_records = parse_excel_file("./data/Dset_Eval_KW_Alignment_Eval_def.xlsx")
# Only the first 10 records are used by the rest of the notebook.
records = total_records[:10]

### Definition of the client

Run `load_dotenv()` to load all the environment variables, then run only the cell that defines the client you want to use.

In [None]:
from dotenv import load_dotenv
# Load the environment variables (the API keys are read with os.getenv in the
# client cells) -- presumably from a local .env file; confirm the location.
load_dotenv()

In [None]:
# OpenAI client. `model` is the label embedded in the output file names;
# whichever client cell is executed last defines `model` and `client`.
model = 'openai'
client = OpenAIClient(api_key=os.getenv("OPENAI_API_KEY"), model_name="gpt-4o-mini")

In [None]:
# OpenAI web-search client. `model` is the label embedded in the output file
# names; whichever client cell is executed last defines `model` and `client`.
model = 'openai_websearch'
client = OpenAIWebSearchClient(api_key=os.getenv("OPENAI_API_KEY"), model_name="gpt-4o-search-preview")

In [None]:
# Anthropic client. `model` is the label embedded in the output file names;
# whichever client cell is executed last defines `model` and `client`.
model = 'anthropic'
client = AnthropicClient(api_key=os.getenv("ANTHROPIC_API_KEY"), model_name="claude-opus-4-20250514")

In [None]:
# Groq client. `model` is the label embedded in the output file names;
# whichever client cell is executed last defines `model` and `client`.
model = 'groq'
client = GroqClient(api_key=os.getenv("GROQ_API_KEY"), model_name="llama-3.1-8b-instant")

### 🔄 Pipeline Initialization

This cell runs the entity extraction pipeline using the input `.xlsx` file. It processes each article in the dataset and does the following for every keyword:

1. **Generate Entity Candidates**
   For each keyword, it generates up to 10 possible matching entities from Wikidata (default value set by `NUM_NAMES`).

2. **Select Relevant Entity**
   From the candidates, the model selects the best matching entity (by default, only 1 is selected).

3. **Track Progress and Save Results**
   The results are saved in a `.json` file with the original article data enriched by the LLM annotations for each keyword.

In [None]:
# Random UUID identifying this run; it is embedded in the output file names.
run_id = str(uuid.uuid4())

In [None]:
# Entity extraction pipeline built on the currently selected `client`.
pipeline = EntityExtractionPipeline(client)

In [None]:
# Create the output directory if it does not exist yet (no error if it does).
os.makedirs("./data/evaluation_output", exist_ok=True)

In [None]:
# Partial results are written to disk every SAVE_EVERY processed keywords.
SAVE_EVERY = 3
# Size of the thread pool used to process keywords concurrently.
MAX_WORKERS = 10
# Output locations: file names combine the client label and the run id.
base_output_dir = './data/evaluation_output'
base_filename = f'eval_{model}_{run_id}'
# Raw pipeline output.
results_file = os.path.join(base_output_dir, f"{base_filename}.json")
# Output after the string adjustments applied before evaluation.
adjusted_file = os.path.join(base_output_dir, f"{base_filename}_adjusted.json")

In [None]:
import json
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
import threading
import copy

# Lock held while the partial-results file is written.
# NOTE(review): the writes happen in the loop that consumes completed futures,
# not inside the worker function, so this lock looks uncontended -- confirm.
file_lock = threading.Lock()

def process_keyword(record_data):
    """Run the entity extraction pipeline for a single keyword.

    Args:
        record_data: 6-tuple ``(record_idx, kw_idx, language, title,
            abstract, keyword_label)`` describing the keyword to process.

    Returns:
        A 4-tuple ``(record_idx, kw_idx, llm_uris, error)``. On success
        ``llm_uris`` is whatever ``pipeline.extract_entities`` returned and
        ``error`` is ``None``. If the call raises, the failure is printed,
        ``llm_uris`` is an empty list and ``error`` is the exception text.
    """
    rec_pos, kw_pos, lang, title, abstract, label = record_data

    try:
        uris = pipeline.extract_entities(
            language=lang,
            title=title,
            abstract=abstract,
            keywords=label,
        )
    except Exception as exc:
        # Best effort: one failing keyword must not abort the whole run.
        print(f"LLM URIs cannot be computed for record {rec_pos}, keyword {kw_pos}: {exc}")
        return rec_pos, kw_pos, [], str(exc)

    return rec_pos, kw_pos, uris, None

# One task per (record, keyword) pair. Each task is a plain tuple so that the
# worker function receives everything it needs without touching `records`.
# NOTE(review): the English title/abstract fields are passed here, while the
# worker unpacks them into names ending in `_or` -- confirm which is intended.
tasks_data = [
    (
        record_idx,
        kw_idx,
        record['language'],
        record['title_eng'],
        record['abstract_eng'],
        kw['label'],
    )
    for record_idx, record in enumerate(records)
    for kw_idx, kw in enumerate(record['kws'])
]

print(f"Preparati {len(tasks_data)} task da processare")

In [None]:

# Process the keyword tasks concurrently.
with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
    # Submit every task; each future is mapped to its (record_idx, kw_idx).
    future_to_task = {}
    for task_data in tasks_data:
        future_to_task[executor.submit(process_keyword, task_data)] = task_data[0:2]

    processed_count = 0
    with tqdm(total=len(tasks_data), desc="Processing keywords") as pbar:
        for future in as_completed(future_to_task):
            record_idx, kw_idx, llm_uris, error = future.result()

            # Attach the LLM output to the keyword it belongs to.
            records[record_idx]['kws'][kw_idx]['llm_uris'] = llm_uris

            processed_count += 1
            pbar.update(1)

            # Periodic checkpoint: only every SAVE_EVERY completed keywords.
            if processed_count % SAVE_EVERY != 0:
                continue
            with file_lock, open(results_file, 'w', encoding='utf-8') as f:
                json.dump(records, f, ensure_ascii=False, indent=2)

# Final save once every keyword has been processed.
print("All keywords processed, saving final results.")
with open(results_file, 'w', encoding='utf-8') as f:
    json.dump(records, f, ensure_ascii=False, indent=2)

The following code cell applies some adjustments to the evaluation output that are required for the results analysis. These adjustments mainly involve string parsing. This cell must be run before the statistics can be computed.

In [None]:
# Reload the raw pipeline output from disk.
with open(results_file, 'r', encoding='utf-8') as f:
    records = json.load(f)

# Collapse any `llm_uris` value that is a 3-item list with "wikidata" in the
# middle into a single dot-joined string -- presumably a URI that was split
# on "." upstream; confirm. The records are modified in place.
for record in records:
    for kw in record['kws']:
        uri_parts = kw['llm_uris']
        if len(uri_parts) == 3 and uri_parts[1] == "wikidata":
            kw['llm_uris'] = ['.'.join(uri_parts)]

# Same record objects as `records`, collected in a separate list.
new_records = list(records)

with open(adjusted_file, 'w', encoding='utf-8') as f:
    json.dump(new_records, f, ensure_ascii=False, indent=2)
    

### Evaluation

The following cells can be used to obtain the statistics. The recall, precision, and F1 scores are printed for all the examples, per language (that is, results are grouped by the original language of the article the keyword belongs to), and per match type (the match type indicates how closely the Wikidata URL matches the actual keyword: 'e' for an exact match or 'r' for a related match).

As can be seen in the example below, the code prints a report with the results. 

In [None]:
# File scored by the evaluation cells; set here to the adjusted output.
file_to_evaluate = adjusted_file

In [None]:
with open(file_to_evaluate, 'r', encoding='utf-8') as f:
    records = json.load(f)

languages = set(record['language'] for record in records)


def _new_metric_bucket():
    """Return an empty accumulator holding a running Sum and Size per metric."""
    return {
        metric: {"Sum": 0, "Size": 0}
        for metric in ("recall", "precision", "f1")
    }


# Accumulators: overall, per match type ('e' / 'r') and per article language.
scores_llm = {
    "Total": _new_metric_bucket(),
    "Per_match_type": {
        "e": _new_metric_bucket(),
        "r": _new_metric_bucket(),
    },
    "Per_language": {
        language: _new_metric_bucket() for language in languages
    },
}

for record in records:
    for kw in record['kws']:
        match_type = kw['match']
        # Only keywords annotated with match type 'e' or 'r' are scored.
        if match_type not in ("e", "r"):
            continue

        # Rewrite the gold URLs (https -> http, /wiki/ -> /entity/) before
        # they are compared with the LLM output.
        correct_uris = [
            url.replace("https", "http").replace("/wiki/", "/entity/")
            for url in kw['wikidata_url']
        ]
        retrieved_uris_llm = kw['llm_uris']
        recall_llm = compute_recall(correct_uris, retrieved_uris_llm)
        precision_llm = compute_precision(correct_uris, retrieved_uris_llm)

        # F1 is defined as 0 when precision and recall are both 0.
        pr_sum = precision_llm + recall_llm
        f1_llm = 2 * precision_llm * recall_llm / pr_sum if pr_sum > 0 else 0

        keyword_scores = (
            ("recall", recall_llm),
            ("precision", precision_llm),
            ("f1", f1_llm),
        )
        # Every scored keyword contributes to three accumulators.
        target_buckets = (
            scores_llm["Total"],
            scores_llm["Per_match_type"][match_type],
            scores_llm["Per_language"][record['language']],
        )
        for bucket in target_buckets:
            for metric, value in keyword_scores:
                bucket[metric]["Sum"] += value
                bucket[metric]["Size"] += 1


In [None]:
import json

def compute_mean_metrics(stats_dict):
    """Turn accumulated Sum/Size statistics into mean metric values.

    Expected input structure::

        {
          'Total': {
            'recall': {'Sum': x, 'Size': y},
            'precision': {'Sum': x2, 'Size': y2},
            'f1': {'Sum': x3, 'Size': y3}
          },
          'Per_match_type': {
            'e': {<same three metrics>},
            ...
          },
          'Per_language': {
            'fr': {<same three metrics>},
            ...
          }
        }

    Any of the three top-level sections may be missing, and 'f1' is optional
    inside every group; 'recall' and 'precision' are required in each group
    that is present.

    Returns:
        A dictionary with the same three top-level keys in which every
        metric is replaced by ``Sum / Size`` (or 0 when ``Size`` is 0).
        Sections absent from the input are returned as empty dictionaries.
    """
    def _mean(cell):
        # Guard against empty accumulators.
        return cell['Sum'] / cell['Size'] if cell['Size'] != 0 else 0

    def _group_means(metrics):
        means = {
            'recall': _mean(metrics['recall']),
            'precision': _mean(metrics['precision']),
        }
        if 'f1' in metrics:
            means['f1'] = _mean(metrics['f1'])
        return means

    mean_dict = {'Total': {}, 'Per_match_type': {}, 'Per_language': {}}

    if 'Total' in stats_dict:
        mean_dict['Total'] = _group_means(stats_dict['Total'])

    # Both breakdowns share the same layout: group name -> metrics.
    for section in ('Per_match_type', 'Per_language'):
        if section not in stats_dict:
            continue
        for group_name, metrics in stats_dict[section].items():
            mean_dict[section][group_name] = _group_means(metrics)

    return mean_dict


def print_system_report(system_dict):
    """Print a readable report of the system (LLM) results.

    The accumulated statistics in ``system_dict`` are averaged with
    ``compute_mean_metrics`` and printed in three sections: overall totals,
    per match type, and per language (ordered by the string form of the
    language key). Recall, precision and F1 are shown with four decimals;
    a metric missing from a group is printed as 0.
    """
    def _print_group(header, vals):
        # Shared layout for the per-match-type and per-language entries.
        print(header)
        print(f"  LLM Recall:    {vals.get('recall', 0):.4f}")
        print(f"  LLM Precision: {vals.get('precision', 0):.4f}")
        print(f"  LLM F1:        {vals.get('f1', 0):.4f}")

    system_means = compute_mean_metrics(system_dict)

    print("======== SYSTEM METRICS REPORT ========")

    # Section 1: overall totals.
    print("\n--- TOTAL ---")
    total = system_means.get('Total', {})
    if not total:
        print("No total data found.")
    else:
        print(f"LLM Recall:    {total.get('recall', 0):.4f}")
        print(f"LLM Precision: {total.get('precision', 0):.4f}")
        print(f"LLM F1:        {total.get('f1', 0):.4f}")

    # Section 2: breakdown by match type.
    print("\n--- PER MATCH TYPE ---")
    per_match_data = system_means.get('Per_match_type', {})
    for mtype in per_match_data:
        _print_group(f"\nMatch Type: {mtype}", per_match_data[mtype])

    # Section 3: breakdown by language. Keys are compared as strings so that
    # mixed key types cannot make the sort fail.
    print("\n--- PER LANGUAGE ---")
    per_lang_data = system_means.get('Per_language', {})
    for lang in sorted(per_lang_data, key=str):
        _print_group(f"\nLanguage: {lang}", per_lang_data[lang])


# Example usage:
# 'scores_llm' holds the accumulated results of the system (LLM); it must have
# been populated by the evaluation cell before this call.
print_system_report(scores_llm)