### Evaluation on FIRE Dataset

In [1]:
import json
import pandas as pd

In [2]:
# FIRE Dataset - where the split files and the type inventory live
fire_data_path = "../../datasets/fire/data"

# Every file follows the fire_<name>.json naming scheme, so the mapping is
# derived from the names; insertion order is train, test, dev, types.
fire_files = {
    name: f"{fire_data_path}/fire_{name}.json"
    for name in ("train", "test", "dev", "types")
}

print("FIRE dataset files:")
for split_name, file_path in fire_files.items():
    print(f"{split_name}: {file_path}")

FIRE dataset files:
train: ../../datasets/fire/data/fire_train.json
test: ../../datasets/fire/data/fire_test.json
dev: ../../datasets/fire/data/fire_dev.json
types: ../../datasets/fire/data/fire_types.json


In [3]:
# Load FIRE dataset splits
def _load_json(path):
    """Parse and return the JSON document stored at ``path``.

    The file is opened explicitly as UTF-8 (the encoding JSON interchange
    files are expected to use) so the result does not depend on the
    platform's default locale encoding.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)


fire_train = _load_json(fire_files["train"])
fire_test = _load_json(fire_files["test"])
fire_dev = _load_json(fire_files["dev"])
fire_types = _load_json(fire_files["types"])

print(f"Dataset sizes: Train={len(fire_train)}, Test={len(fire_test)}, Dev={len(fire_dev)}")

Dataset sizes: Train=2117, Test=454, Dev=454


### Data Preprocessing

In [5]:
def preprocess_fire_data(fire_data):
    """
    Flatten raw FIRE samples into sentence-level records.

    For every sample the token list is joined with single spaces into one
    sentence string, each entity is reduced to its text/type/start/end
    fields, and each relation is reduced to its head/tail/type fields (head
    and tail are exposed as ``head_id`` and ``tail_id``).

    Args:
        fire_data: Iterable of sample dicts carrying ``tokens``, ``entities``
            and ``relations`` keys, and optionally ``orig_id``.

    Returns:
        A list of dicts with the keys ``sentence``, ``entities``,
        ``relations`` and ``orig_id`` (in that order). A sample without an
        ``orig_id`` falls back to its zero-based position in the result.

    Raises:
        KeyError: If a sample, entity or relation lacks a required key.
    """
    entity_fields = ('text', 'type', 'start', 'end')

    def to_record(position, raw):
        # `position` equals the number of records built so far, which is the
        # fallback id used when the sample carries no 'orig_id' of its own.
        return {
            'sentence': " ".join(raw['tokens']),
            'entities': [
                {field: ent[field] for field in entity_fields}
                for ent in raw['entities']
            ],
            'relations': [
                {'head_id': rel['head'], 'tail_id': rel['tail'], 'type': rel['type']}
                for rel in raw['relations']
            ],
            'orig_id': raw.get('orig_id', position),
        }

    return [to_record(position, raw) for position, raw in enumerate(fire_data)]

# Smoke-test the preprocessing on the first three training samples
test_sample = preprocess_fire_data(fire_train[:3])
print("Preprocessed samples:")
for number, record in enumerate(test_sample, start=1):
    # Only the first 100 characters of the sentence are shown.
    preview = record['sentence'][:100]
    print(f"\nSample {number}:")
    print(f"Sentence: {preview}...")
    print(f"Entities: {len(record['entities'])}")
    print(f"Relations: {len(record['relations'])}")

Preprocessed samples:

Sample 1:
Sentence: Albertsons and Rite Aid called off the planned merger after several advisory firms recommended votin...
Entities: 4
Relations: 0

Sample 2:
Sentence: EU clears merger of Essilor , Luxottica without conditions...
Entities: 4
Relations: 2

Sample 3:
Sentence: On February 21 , Broadcom reduced their offer to $117 billion in light of Qualcomms increased bid fo...
Entities: 10
Relations: 5


In [6]:
# Run the same preprocessing over every FIRE split
fire_train_processed, fire_test_processed, fire_dev_processed = (
    preprocess_fire_data(split) for split in (fire_train, fire_test, fire_dev)
)

print("Preprocessed dataset sizes:")
for label, processed in (
    ("Train", fire_train_processed),
    ("Test", fire_test_processed),
    ("Dev", fire_dev_processed),
):
    print(f"{label}: {len(processed)} samples")

Preprocessed dataset sizes:
Train: 2117 samples
Test: 454 samples
Dev: 454 samples


In [7]:
# One DataFrame per split, built from the list of processed record dicts
fire_train_df, fire_test_df, fire_dev_df = map(
    pd.DataFrame,
    (fire_train_processed, fire_test_processed, fire_dev_processed),
)

In [7]:
# Preview the first three processed training rows
fire_train_df.head(3)

Unnamed: 0,sentence,entities,relations,orig_id
0,Albertsons and Rite Aid called off the planned...,"[{'text': 'Albertsons', 'type': 'Company', 'st...",[],0
1,"EU clears merger of Essilor , Luxottica withou...","[{'text': 'EU', 'type': 'Location', 'start': 0...","[{'head_id': 1, 'tail_id': 3, 'type': 'Actiont...",2
2,"On February 21 , Broadcom reduced their offer ...","[{'text': 'February 21', 'type': 'Date', 'star...","[{'head_id': 2, 'tail_id': 1, 'type': 'Propert...",4


In [8]:

# Entity and relation type names as listed in the loaded FIRE type inventory
entities = list(fire_types['entities'])
relations = list(fire_types['relations'])

print(f"Entities: {entities}")
print(f"Relations: {relations}")

Entities: ['Action', 'BusinessUnit', 'Company', 'Date', 'Designation', 'FinancialEntity', 'GeopoliticalEntity', 'Location', 'Money', 'Person', 'Product', 'Quantity', 'Sector']
Relations: ['ActionBuy', 'Actionin', 'ActionMerge', 'ActionSell', 'Actionto', 'Constituentof', 'Designation', 'Employeeof', 'Locatedin', 'Productof', 'Propertyof', 'Quantity', 'Sector', 'Subsidiaryof', 'Value', 'ValueChangeDecreaseby', 'Valuein', 'ValueChangeIncreaseby']


### Initialize Evaluation

In [9]:
import sys
import os
from datetime import datetime
from getpass import getpass

# Add both the main source and current directory to path.
# The membership check keeps sys.path free of duplicates when the cell is re-run.
for extra_path in ('../../src', '.'):  # '.' is for fire_metrics in current directory
    if extra_path not in sys.path:
        sys.path.append(extra_path)

from ma_finkg.kg_construction_graph import FinancialKGConstructionGraph
from fire_metrics import evaluate_simple_baseline, calculate_fire_metrics

# SECURITY: the OpenRouter API key must never be stored in the notebook.
# It is read from the OPENROUTER_API_KEY environment variable; when that is
# not set, the user is prompted (input is not echoed and not saved).
# NOTE(review): an earlier revision of this cell contained a literal key —
# treat that key as compromised and revoke it.
openrouter_key = os.environ.get("OPENROUTER_API_KEY") or getpass("OpenRouter API key: ")
if not openrouter_key:
    raise RuntimeError(
        "No OpenRouter API key provided; set the OPENROUTER_API_KEY environment variable."
    )
model = "qwen/qwen3-30b-a3b-instruct-2507"

# Export the key to the process environment before building the KG system
os.environ["OPENROUTER_API_KEY"] = openrouter_key
kg_system = FinancialKGConstructionGraph(model_name=model, ontology="fire", prompts="fire")

### Baseline & Multi-Agent System Eval on FIRE Dataset

In [10]:
# Run baseline evaluation
start = datetime.now()
model_name = "qwen/qwen3-30b-a3b-instruct-2507"
print(f"Running simple {model_name} few-shot baseline...")
baseline_results = evaluate_simple_baseline(
    model_name,
    fire_test_processed,
    max_samples=454,
    entity_text_only=True,
)

print(f"\nSimple {model_name} Baseline Results:")
# Each entry pairs the printed label with the key prefix used in baseline_results.
for label, prefix in (
    ("RE (Head/Relation/Tail)", "re"),
    ("NER", "ner"),
    ("Relation Type", "relation_type"),
):
    precision = baseline_results[f"{prefix}_precision"]
    recall = baseline_results[f"{prefix}_recall"]
    f1 = baseline_results[f"{prefix}_f1"]
    print(f"{label} - P:{precision:.3f}, R: {recall:.3f}, F1: {f1:.3f}")
print(f"Samples: {baseline_results['samples']}")
end = datetime.now()
print(f"Eval Duration: {end - start}")


Running simple qwen/qwen3-30b-a3b-instruct-2507 few-shot baseline...

=== Sample 1 ===
Text: Mr. Rooke owns and manages Rooke Fiduciary Management , a private trust company , which specializes in the investment management of publicly held securities and the oversight of a multitude of trust investments .
Gold: [('Mr. Rooke', 'Employeeof', 'Rooke Fiduciary Management'), ('Rooke Fiduciary Management', 'Sector', 'trust')]
Predicted: [('Rooke Fiduciary Management', 'Productof', 'investment management of publicly held securities'), ('Rooke Fiduciary Management', 'Locatedin', 'private trust company'), ('Rooke Fiduciary Management', 'Productof', 'oversight of a multitude of trust investments'), ('Mr. Rooke', 'Employeeof', 'Rooke Fiduciary Management'), ('Rooke Fiduciary Management', 'Sector', 'Financial Services')]
TP: 1, FP: 4, FN: 1

=== Sample 2 ===
Text: Pegasus s profits and losses will be distributed at 80 % to APH and 20 % to PLF .
Gold: [('profits and losses', 'Quantity', '20 %'), ('p

In [11]:
# Run comprehensive evaluation of the multi-agent system on the test split
start_ma = datetime.now()
print("Running comprehensive NER, RE, and Relation Type evaluation...")
comprehensive_results = calculate_fire_metrics(fire_test_processed, kg_system, max_samples=454, entity_text_only=True)

print(f"\n=== COMPREHENSIVE EVALUATION RESULTS (n={comprehensive_results['processed_samples']}) ===")

# Each entry pairs a section title with its key in comprehensive_results.
# The third section previously re-printed the 're' scores under a
# "Triplets" title, so relation-type scores were never shown; it now reads
# the 'relation_type' entry (the same key the comparison cell uses).
for title, key in (
    ("NER", "ner"),
    ("RE", "re"),
    ("Relation Type", "relation_type"),
):
    scores = comprehensive_results[key]
    print(f"\n{title} Metrics:")
    print(f"  Precision: {scores['precision']:.3f}")
    print(f"  Recall:    {scores['recall']:.3f}")
    print(f"  F1:        {scores['f1']:.3f}")
    print(f"  (TP:{scores['tp']}, FP:{scores['fp']}, FN:{scores['fn']})")

# Save results to file
import json
with open('comprehensive_fire_results.json', 'w') as f:
    json.dump(comprehensive_results, f, indent=2)

print(f"\nResults {model} saved to comprehensive_fire_results.json")
end_ma = datetime.now()
print(f"Eval Duration: {end_ma - start_ma}")

Running comprehensive NER, RE, and Triplets evaluation...
Processing sample 1/454...
[0.0s] Starting knowledge graph construction...
[2.9s] Creating ontology...

[ONTOLOGY] Using predefined fire ontology
[4.2s] Extracting entities and relations...
[24.1s] NER completed: 10 entities                                                                  
[36.1s] RE completed: 4 filtered triples                                                            

[REVISION] Validated: 10/10 entities, 4/4 triples
[36.6s] Finalizing results...
[36.6s] Construction completed!

=== DEBUG SAMPLE 1 === [TEXT-ONLY]
NER: Gold entities (3): ['rooke fiduciary management', 'trust', 'mr. rooke'])
NER: Sys entities (10): ['owner', 'manages', 'manager', 'investment management', 'oversees', 'specializes', 'private trust company', 'oversight of trust investments', 'rooke fiduciary management', 'owns'])
Gold triples (2): [('mr. rooke', 'Employeeof', 'rooke fiduciary management'), ('rooke fiduciary management', 'Sector'

In [None]:
# Results Comparison Table

import pandas as pd

# (column prefix in the table, metric name in the result dicts)
metric_groups = (('NER', 'ner'), ('RE', 're'), ('RelType', 'relation_type'))
# (column suffix in the table, score name in the result dicts)
score_fields = (('P', 'precision'), ('R', 'recall'), ('F1', 'f1'))

# Create comparison dataframe
results_data = []

# Add baseline results (skipped when the baseline cell has not been run).
# Baseline scores are stored flat, e.g. baseline_results['ner_precision'].
if 'baseline_results' in locals():
    baseline_row = {
        # Label with the baseline's own model id only; concatenating `model`
        # and `model_name` printed the same id twice.
        'System': f"{model_name} Baseline",
    }
    for column, metric in metric_groups:
        for suffix, score in score_fields:
            baseline_row[f"{column}_{suffix}"] = baseline_results[f"{metric}_{score}"]
    results_data.append(baseline_row)

# Add multi-agent results.
# These scores are nested, e.g. comprehensive_results['ner']['precision'].
multi_agent_row = {'System': 'Multi-Agent System'}
for column, metric in metric_groups:
    for suffix, score in score_fields:
        multi_agent_row[f"{column}_{suffix}"] = comprehensive_results[metric][score]
results_data.append(multi_agent_row)

# Create and display comparison table
comparison_df = pd.DataFrame(results_data)
comparison_df = comparison_df.round(3)

print("=== FIRE EVALUATION COMPARISON ===")
print(comparison_df.to_string(index=False))

# Detailed breakdown for multi-agent system
print("\n=== DETAILED MULTI-AGENT BREAKDOWN ===")
print(f"Samples processed: {comprehensive_results['processed_samples']}")
for heading, metric in (
    ("NER (Entity Extraction)", 'ner'),
    ("RE (Relation Extraction)", 're'),
    ("Relation Types", 'relation_type'),
):
    counts = comprehensive_results[metric]
    print(f"\n{heading}:")
    print(f"  True Positives:  {counts['tp']}")
    print(f"  False Positives: {counts['fp']}")
    print(f"  False Negatives: {counts['fn']}")

# Save comparison table
comparison_df.to_csv('fire_evaluation_comparison.csv', index=False)
print("\nComparison table saved to fire_evaluation_comparison.csv")

=== FIRE EVALUATION COMPARISON ===
                                                                    System  NER_P  NER_R  NER_F1  RE_P  RE_R  RE_F1  RelType_P  RelType_R  RelType_F1
qwen/qwen3-30b-a3b-instruct-2507 qwen/qwen3-30b-a3b-instruct-2507 Baseline  0.412  0.493   0.449 0.084 0.178  0.114      0.356      0.769       0.487
                                                        Multi-Agent System  0.531  0.602   0.565 0.129 0.166  0.145      0.382      0.355       0.368

=== DETAILED MULTI-AGENT BREAKDOWN ===
Samples processed: 454

NER (Entity Extraction):
  True Positives:  1305
  False Positives: 1152
  False Negatives: 861

RE (Relation Extraction):
  True Positives:  200
  False Positives: 1351
  False Negatives: 1008

Relation Types:
  True Positives:  306
  False Positives: 494
  False Negatives: 557

Comparison table saved to fire_evaluation_comparison.csv
