In [24]:
def compute_metrics(pred_items, gt_items, match_keys):
    """Computes precision, recall, and F1-score based on matching keys"""
    pred_set = {tuple(item.get(key, None) for key in match_keys) for item in pred_items}
    gt_set = {tuple(item.get(key, None) for key in match_keys) for item in gt_items}

    correct_matches = len(pred_set & gt_set)  # Intersection of predicted & ground truth
    precision = correct_matches / len(pred_set) if pred_set else 0
    recall = correct_matches / len(gt_set) if gt_set else 0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) else 0

    return precision, recall, f1


def evaluate_json(generated_json, ground_truth_json):
    """Evaluates the generated JSON against the ground truth"""

    # Node Matching
    node_keys = ["type_of_node", "label"]
    node_precision, node_recall, node_f1 = compute_metrics(
        generated_json["nodes"], ground_truth_json["nodes"], node_keys
    )

    # Edge Matching
    edge_keys = ["source_label", "target_label", "relationship_type", "relationship_value"]
    edge_precision, edge_recall, edge_f1 = compute_metrics(
        generated_json["edges"], ground_truth_json["edges"], edge_keys
    )

    return {
        "node_precision": node_precision,
        "node_recall": node_recall,
        "node_f1": node_f1,
        "edge_precision": edge_precision,
        "edge_recall": edge_recall,
        "edge_f1": edge_f1,
    }


In [25]:
import os
import pandas as pd
from typing import Literal
import json

def read_df(location:Literal['base-it1', 'base-it2','it1','it2']):
    """Find benchmark score with the evaluation Ground Truth"""
    df = pd.DataFrame()
    gt_files = os.listdir("eval_gt")
    for file in gt_files:
        with (
            open(f"eval_gt/{file}") as gt_json_file,
            open(f"outputs/{location}/{file}") as generated_json_file,
        ):
            gt = json.load(gt_json_file)
            pred = json.load(generated_json_file)
        metrics = evaluate_json(pred, gt)
        
        # Create a new single-row DataFrame
        new_row = pd.DataFrame({
            "file": [file],
            "node_precision": [metrics["node_precision"]],
            "node_recall": [metrics["node_recall"]],
            "node_f1": [metrics["node_f1"]],
            "edge_precision": [metrics["edge_precision"]],
            "edge_recall": [metrics["edge_recall"]],
            "edge_f1": [metrics["edge_f1"]],
        })
        
        # Use pd.concat instead of append
        df = pd.concat([df, new_row], ignore_index=True)
    return df
    

In [26]:
base_it1, base_it2, fin_it1, fin_it2 = read_df('base-it1'), read_df('base-it2'), read_df('it1'), read_df('it2')

In [27]:
# Mean of the model's iterations
base_mean = pd.DataFrame()
fine_mean = pd.DataFrame()

for col in base_it1.columns:
    if col == 'file':
        base_mean['file'] = base_it1['file']
    else:
        base_mean[col] = (base_it1[col] + base_it2[col])/2

for col in fin_it1.columns:
    if col == 'file':
        fine_mean['file'] = fin_it1['file']
    else:
        fine_mean[col] = (fin_it1[col] + fin_it2[col])/2

In [29]:
comparison_df = pd.DataFrame()

comparison_df['file'] = base_mean['file']
metric_columns = ['node_precision', 'node_recall', 'node_f1', 'edge_precision', 'edge_recall', 'edge_f1']

for col in metric_columns:
    comparison_df[f'base_{col}'] = base_mean[col]
    comparison_df[f'fine_{col}'] = fine_mean[col]

comparison_df

Unnamed: 0,file,base_node_precision,fine_node_precision,base_node_recall,fine_node_recall,base_node_f1,fine_node_f1,base_edge_precision,fine_edge_precision,base_edge_recall,fine_edge_recall,base_edge_f1,fine_edge_f1
0,418.json,0.5,1.0,0.428571,1.0,0.461538,1.0,0.619048,0.714286,0.571429,0.714286,0.593407,0.714286
1,37.json,0.714286,0.666667,0.625,0.5,0.666667,0.571429,0.285714,0.285714,0.222222,0.222222,0.25,0.25
2,39.json,1.0,1.0,1.0,1.0,1.0,1.0,0.333333,1.0,0.2,0.6,0.25,0.75
3,76.json,0.5,0.833333,0.5,0.833333,0.5,0.833333,0.171429,0.666667,0.142857,0.571429,0.154762,0.615385
4,634.json,0.722222,0.777778,0.722222,0.777778,0.722222,0.777778,0.0,0.5,0.0,0.454545,0.0,0.47619
5,701.json,0.75,0.75,0.75,0.75,0.75,0.75,0.333333,0.6,0.25,0.75,0.285714,0.666667
6,49.json,0.6,1.0,0.6,1.0,0.6,1.0,1.0,1.0,1.0,1.0,1.0,1.0
7,332.json,0.6,1.0,0.6,1.0,0.6,1.0,1.0,1.0,1.0,1.0,1.0,1.0
8,930.json,1.0,1.0,1.0,1.0,1.0,1.0,0.6,0.714286,0.5,0.833333,0.545455,0.769231
9,728.json,0.75,0.8,0.6,0.8,0.666667,0.8,0.0,1.0,0.0,1.0,0.0,1.0


In [45]:
f1_only = pd.DataFrame()

f1_only['file'] = comparison_df['file']
f1_only['Base Node F1'] = comparison_df['base_node_f1']
f1_only['Fine Node F1'] = comparison_df['fine_node_f1']

f1_only['Base Edge F1'] = comparison_df['base_edge_f1']
f1_only['Fine Edge F1'] = comparison_df['fine_edge_f1']
f1_only = f1_only.round(2)
f1_only

Unnamed: 0,file,Base Node F1,Fine Node F1,Base Edge F1,Fine Edge F1
0,418.json,0.46,1.0,0.59,0.71
1,37.json,0.67,0.57,0.25,0.25
2,39.json,1.0,1.0,0.25,0.75
3,76.json,0.5,0.83,0.15,0.62
4,634.json,0.72,0.78,0.0,0.48
5,701.json,0.75,0.75,0.29,0.67
6,49.json,0.6,1.0,1.0,1.0
7,332.json,0.6,1.0,1.0,1.0
8,930.json,1.0,1.0,0.55,0.77
9,728.json,0.67,0.8,0.0,1.0


In [43]:
print(f1_only['Base Node F1'].mean(), f1_only['Fine Node F1'].mean(), f1_only['Base Edge F1'].mean(), f1_only['Fine Edge F1'].mean())
print(f'impovement in Node F1: {(f1_only["Fine Node F1"].mean() - f1_only["Base Node F1"].mean()):.2%}')
print(f'impovement in Edge F1: {(f1_only["Fine Edge F1"].mean() - f1_only["Base Edge F1"].mean()):.2%}')

0.749 0.891 0.4605 0.6945
impovement in Node F1: 14.20%
impovement in Edge F1: 23.40%


In [44]:
f1_only.to_csv('qwen-2-5-vl-finetune-benchmark.csv', index=False)

In [1]:
import pandas as pd

comparison = pd.read_csv('qwen-2-5-vl-finetune-benchmark.csv')
comparison

Unnamed: 0,file,(Base)Node F1,(Fine)Node F1,(Base)Edge F1,(Fine)Edge F1
0,418.json,0.46,1.0,0.59,0.71
1,37.json,0.67,0.57,0.25,0.25
2,39.json,1.0,1.0,0.25,0.75
3,76.json,0.5,0.83,0.15,0.62
4,634.json,0.72,0.78,0.0,0.48
5,701.json,0.75,0.75,0.29,0.67
6,49.json,0.6,1.0,1.0,1.0
7,332.json,0.6,1.0,1.0,1.0
8,930.json,1.0,1.0,0.55,0.77
9,728.json,0.67,0.8,0.0,1.0


In [3]:
comparison.loc['Mean'] = comparison.mean(numeric_only=True,skipna=True )
comparison.tail()

Unnamed: 0,file,(Base)Node F1,(Fine)Node F1,(Base)Edge F1,(Fine)Edge F1
16,652.json,0.8,1.0,0.63,0.73
17,345.json,0.83,0.83,0.33,0.43
18,113.json,0.75,0.8,0.06,0.22
19,171.json,0.81,1.0,0.23,0.75
Mean,,0.749,0.891,0.4605,0.6945
