In [13]:
import json 
import pandas as pd

In [14]:
# REFinD dataset: locations of the official JSON file of each split
refind_data_path = "../../datasets/refind"

# All split files share the "<split>_refind_official.json" naming scheme; the
# tuple order (train, test, dev) is the order in which the files are listed.
refind_files = {
    split_name: f"{refind_data_path}/{split_name}_refind_official.json"
    for split_name in ("train", "test", "dev")
}

print("REFinD dataset files:")
for split, path in refind_files.items():
    print(split, path, sep=": ")

REFinD dataset files:
train: ../../datasets/refind/train_refind_official.json
test: ../../datasets/refind/test_refind_official.json
dev: ../../datasets/refind/dev_refind_official.json


In [15]:
# Load refind dataset splits
def _load_refind_split(path):
    """Read one REFinD split file and return the decoded JSON payload.

    The file is opened explicitly as UTF-8 so that decoding does not depend on
    the locale default encoding of the machine running the notebook.

    Args:
        path: Location of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file content.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)


refind_train = _load_refind_split(refind_files["train"])
refind_test = _load_refind_split(refind_files["test"])
refind_dev = _load_refind_split(refind_files["dev"])

print(f"Dataset sizes: Train={len(refind_train)}, Test={len(refind_test)}, Dev={len(refind_dev)}")

Dataset sizes: Train=20070, Test=4300, Dev=4306


In [None]:
def preprocess_refind_data(refind_data):
    """Turn raw REFinD records into sentence / entities / relations dictionaries.

    Every record is read through the keys 'token', 'e1_start', 'e1_end',
    'e1_type', 'e2_start', 'e2_end', 'e2_type' and 'relation'; 'id' is optional.

    Args:
        refind_data: Iterable of raw records (dict-like objects).

    Returns:
        A list holding one dict per record with the keys:
          'sentence'      - the tokens joined by single spaces,
          'entities'      - [e1, e2]; each has 'text' (tokens[start:end] joined
                            by spaces), 'type', 'start' and 'end',
          'relations'     - empty when the label is 'no_relation', otherwise a
                            single entry pointing from entity 0 to entity 1,
          'gold_relation' - the record's relation label, unmodified,
          'orig_id'       - the record's 'id', or its position when it has none.
    """
    processed_samples = []

    for position, record in enumerate(refind_data):
        tokens = record['token']
        e1_start, e1_end = record['e1_start'], record['e1_end']
        e2_start, e2_end = record['e2_start'], record['e2_end']

        # Index 0 is always e1 and index 1 is always e2; the relation entry
        # below refers to the entities through these positions.
        entities = [
            {
                'text': " ".join(tokens[e1_start:e1_end]),
                'type': record['e1_type'],
                'start': e1_start,
                'end': e1_end,
            },
            {
                'text': " ".join(tokens[e2_start:e2_end]),
                'type': record['e2_type'],
                'start': e2_start,
                'end': e2_end,
            },
        ]

        gold_label = record['relation']
        if gold_label == 'no_relation':
            relations = []
        else:
            # e1 is the head, e2 is the tail
            relations = [{'head_id': 0, 'tail_id': 1, 'type': gold_label}]

        processed_samples.append({
            'sentence': " ".join(tokens),
            'entities': entities,
            'relations': relations,
            'gold_relation': gold_label,  # original label, kept for strict F1
            'orig_id': record.get('id', position),
        })

    return processed_samples

# Sanity-check the preprocessing on the first five test records
test_sample = preprocess_refind_data(refind_test[:5])
print("Preprocessed REFinD samples (including no_relation):")
for number, sample in enumerate(test_sample, start=1):
    found_entities = sample['entities']
    found_relations = sample['relations']

    print(f"\nSample {number}:")
    print(f"Sentence: {sample['sentence'][:80]}...")
    print(f"Gold relation: {sample['gold_relation']}")

    print(f"Entities: {len(found_entities)}")
    for position, entity in enumerate(found_entities, start=1):
        print(f"  Entity {position}: '{entity['text']}' ({entity['type']})")

    print(f"Relations: {len(found_relations)}")
    if not found_relations:
        print("  (No relations - gold is no_relation)")
    for relation in found_relations:
        # head_id / tail_id are positions inside this sample's entity list
        head_text = found_entities[relation['head_id']]['text']
        tail_text = found_entities[relation['tail_id']]['text']
        print(f"  Relation: {head_text} -> {relation['type']} -> {tail_text}")

Preprocessed REFinD samples (including no_relation):

Sample 1:
Sentence: other changes in the financial condition or future prospects of issuers of secur...
Gold relation: no_relation
Entities: 2
  Entity 1: 'Best Hometown Bancorp , Inc.' (ORG)
  Entity 2: 'FHLB' (ORG)
Relations: 0
  (No relations - gold is no_relation)

Sample 2:
Sentence: eWELLNESS HEALTHCARE Corp also agreed to change eWELLNESS HEALTHCARE Corp. name ...
Gold relation: no_relation
Entities: 2
  Entity 1: 'eWELLNESS HEALTHCARE Corp' (ORG)
  Entity 2: 'eWellness Healthcare Corporation' (ORG)
Relations: 0
  (No relations - gold is no_relation)

Sample 3:
Sentence: eWELLNESS HEALTHCARE Corp also agreed to change eWELLNESS HEALTHCARE Corp. name ...
Gold relation: no_relation
Entities: 2
  Entity 1: 'eWELLNESS HEALTHCARE Corp.' (ORG)
  Entity 2: 'eWellness Healthcare Corporation' (ORG)
Relations: 0
  (No relations - gold is no_relation)

Sample 4:
Sentence: above and if the Huawei Contract as described in 5.c above is con

In [17]:
# Run the preprocessing over every full split (train, then test, then dev)
refind_train_processed, refind_test_processed, refind_dev_processed = (
    preprocess_refind_data(raw_split)
    for raw_split in (refind_train, refind_test, refind_dev)
)

print("Preprocessed dataset sizes:")
print(
    f"Train: {len(refind_train_processed)} samples\n"
    f"Test: {len(refind_test_processed)} samples\n"
    f"Dev: {len(refind_dev_processed)} samples"
)

Preprocessed dataset sizes:
Train: 20070 samples
Test: 4300 samples
Dev: 4306 samples


In [29]:
# Stratified sampling for REFinD evaluation
import numpy as np
from collections import Counter
import random

def stratified_sample_refind(data, target_size=855, min_per_class=5, random_state=42):
    """Draw a relation-aware random subsample of preprocessed REFinD samples.

    Samples are grouped by their 'gold_relation' label and picked in two passes:

    1. Every relation type contributes ``min_per_class`` randomly chosen
       samples, or all of its samples when it has fewer than that.
    2. The budget left after pass 1 is shared between the types that have at
       least ``min_per_class`` samples, proportionally to their share of
       ``data``. Each share is truncated to an integer and capped by the number
       of samples the type still has available.

    Because the shares of pass 2 are truncated and capped, the result can be
    smaller than ``target_size``; when pass 1 alone needs more than
    ``target_size`` samples the result is larger and pass 2 is skipped.

    The class distribution, the allocation of each pass and the final
    per-class counts are printed as the function runs.

    Args:
        data: Sequence of dicts that each carry a 'gold_relation' key.
        target_size: Number of samples aimed for.
        min_per_class: Samples guaranteed to each relation type in pass 1.
        random_state: Seed applied to the global ``random`` and ``numpy.random``
            generators before sampling.

    Returns:
        List of the selected sample objects (the same objects as in ``data``):
        pass-1 picks first, followed by the pass-2 picks.
    """
    # numpy's global generator is seeded as well, although this function only
    # draws from ``random``.
    random.seed(random_state)
    np.random.seed(random_state)

    # Group by relation type (groups keep the order of first appearance)
    relation_groups = {}
    for sample in data:
        relation_groups.setdefault(sample['gold_relation'], []).append(sample)

    print("=== RELATION TYPE DISTRIBUTION ===")
    total_samples = len(data)
    for rel_type, samples in sorted(relation_groups.items(), key=lambda x: len(x[1]), reverse=True):
        pct = len(samples) / total_samples * 100
        print(f"{rel_type:25s}: {len(samples):4d} ({pct:5.1f}%)")

    sampled_data = []
    # Identity of every sample selected so far. Membership is tested on object
    # identity rather than with ``sample in sampled_data``: that avoids
    # comparing whole dicts against an ever-growing list, and it keeps distinct
    # samples that happen to have equal content from being treated as
    # "already selected".
    selected_ids = set()

    print(f"\n=== STRATIFIED SAMPLING (Target: {target_size}) ===")

    # First pass: Ensure minimum per class
    remaining_budget = target_size
    for rel_type, samples in relation_groups.items():
        if len(samples) < min_per_class:
            chosen = list(samples)
        else:
            chosen = random.sample(samples, min_per_class)

        sampled_data.extend(chosen)
        selected_ids.update(id(s) for s in chosen)
        remaining_budget -= len(chosen)
        print(f"{rel_type:25s}: {len(chosen):3d} samples (minimum)")

    # Second pass: Distribute remaining budget proportionally
    if remaining_budget > 0:
        print(f"\nDistributing remaining {remaining_budget} samples proportionally...")

        for rel_type, samples in relation_groups.items():
            if len(samples) < min_per_class:
                # Already fully included by the first pass
                continue

            # Every share is computed from the same post-pass-1 budget; the
            # budget is not reduced while the shares are handed out.
            proportion = len(samples) / total_samples
            additional = min(
                int(remaining_budget * proportion),
                len(samples) - min_per_class
            )
            if additional <= 0:
                continue

            available_samples = [s for s in samples if id(s) not in selected_ids]
            if not available_samples:
                continue

            additional_samples = random.sample(available_samples, min(additional, len(available_samples)))
            sampled_data.extend(additional_samples)
            selected_ids.update(id(s) for s in additional_samples)
            print(f"{rel_type:25s}: +{len(additional_samples):2d} samples (proportional)")

    print("\n=== FINAL STRATIFIED SAMPLE ===")
    print(f"Total sampled: {len(sampled_data)} / {target_size} target")

    # Verify distribution
    sampled_relations = Counter(s['gold_relation'] for s in sampled_data)
    for rel_type, count in sorted(sampled_relations.items(), key=lambda x: x[1], reverse=True):
        original_count = len(relation_groups[rel_type])
        pct_sampled = count / original_count * 100
        print(f"{rel_type:25s}: {count:3d}/{original_count:3d} ({pct_sampled:5.1f}%)")

    return sampled_data

# Draw the evaluation subset from the preprocessed test split
SAMPLE_SIZE = 855

# Keyword arguments handed to the sampler; random_state keeps its default.
sampling_options = {"target_size": SAMPLE_SIZE, "min_per_class": 5}

print("Applying stratified sampling to REFinD test set...")
refind_test_sampled = stratified_sample_refind(refind_test_processed, **sampling_options)

sampled_count = len(refind_test_sampled)
print(f"\n✅ Use refind_test_sampled ({sampled_count} samples) for evaluation")
print("✅ Maintains statistical validity with proper class representation")

Applying stratified sampling to REFinD test set...
=== RELATION TYPE DISTRIBUTION ===
no_relation              : 1953 ( 45.4%)
pers:title:title         :  671 ( 15.6%)
org:gpe:operations_in    :  605 ( 14.1%)
pers:org:employee_of     :  374 (  8.7%)
org:org:agreement_with   :  141 (  3.3%)
org:date:formed_on       :   96 (  2.2%)
pers:org:member_of       :   95 (  2.2%)
org:org:subsidiary_of    :   83 (  1.9%)
org:org:shares_of        :   61 (  1.4%)
org:money:revenue_of     :   47 (  1.1%)
org:money:loss_of        :   31 (  0.7%)
org:gpe:headquartered_in :   29 (  0.7%)
org:date:acquired_on     :   24 (  0.6%)
pers:org:founder_of      :   20 (  0.5%)
org:gpe:formed_in        :   17 (  0.4%)
org:org:acquired_by      :   12 (  0.3%)
pers:univ:employee_of    :   12 (  0.3%)
pers:gov_agy:member_of   :    8 (  0.2%)
pers:univ:attended       :    7 (  0.2%)
org:money:profit_of      :    5 (  0.1%)
pers:univ:member_of      :    5 (  0.1%)
org:money:cost_of        :    4 (  0.1%)

=== STRATIF

In [19]:
def extract_types_from_split(data, split_name):
    """Collect the distinct entity and relation labels used in one raw split.

    Prints a short report (upper-cased split name, sorted label lists with
    their counts, number of samples) followed by a blank line.

    Args:
        data: Sized collection of raw records exposing the keys 'e1_type',
            'e2_type' and 'relation'.
        split_name: Label used in the report header.

    Returns:
        Tuple ``(entity_types, relation_types)``; both are sets of labels.
    """
    entity_types, relation_types = set(), set()

    for record in data:
        # Both ends of the candidate pair contribute an entity label.
        entity_types.update((record['e1_type'], record['e2_type']))
        relation_types.add(record['relation'])

    header = f"{split_name.upper()} SPLIT:"
    print(header)
    print(f"  Entity Types ({len(entity_types)}): {sorted(entity_types)}")
    print(f"  Relation Types ({len(relation_types)}): {sorted(relation_types)}")
    # The trailing newline produces the blank separator line after the report.
    print(f"  Total Samples: {len(data)}\n")

    return entity_types, relation_types

# Label inventories (entity types, relation types) of each raw split
train_entities, train_relations = extract_types_from_split(data=refind_train, split_name="train")
test_entities, test_relations = extract_types_from_split(data=refind_test, split_name="test")
dev_entities, dev_relations = extract_types_from_split(data=refind_dev, split_name="dev")

TRAIN SPLIT:
  Entity Types (8): ['DATE', 'GOV_AGY', 'GPE', 'MONEY', 'ORG', 'PERSON', 'TITLE', 'UNIV']
  Relation Types (22): ['no_relation', 'org:date:acquired_on', 'org:date:formed_on', 'org:gpe:formed_in', 'org:gpe:headquartered_in', 'org:gpe:operations_in', 'org:money:cost_of', 'org:money:loss_of', 'org:money:profit_of', 'org:money:revenue_of', 'org:org:acquired_by', 'org:org:agreement_with', 'org:org:shares_of', 'org:org:subsidiary_of', 'pers:gov_agy:member_of', 'pers:org:employee_of', 'pers:org:founder_of', 'pers:org:member_of', 'pers:title:title', 'pers:univ:attended', 'pers:univ:employee_of', 'pers:univ:member_of']
  Total Samples: 20070

TEST SPLIT:
  Entity Types (8): ['DATE', 'GOV_AGY', 'GPE', 'MONEY', 'ORG', 'PERSON', 'TITLE', 'UNIV']
  Relation Types (22): ['no_relation', 'org:date:acquired_on', 'org:date:formed_on', 'org:gpe:formed_in', 'org:gpe:headquartered_in', 'org:gpe:operations_in', 'org:money:cost_of', 'org:money:loss_of', 'org:money:profit_of', 'org:money:revenue_

### REFinD Evaluation

In [None]:
import sys
import os
from datetime import datetime

# Add both the main source and current directory to path
sys.path.append('../../src')
sys.path.append('.')  # For refind_metrics in current directory

from ma_finkg.kg_construction_graph import FinancialKGConstructionGraph
from refind_metrics import evaluate_refind_dual_track, evaluate_track_a_full_pipeline, evaluate_track_b_direct_classification

# Set OpenRouter API key and model
# The key is read from the environment so that no secret has to be stored in
# the notebook and an already exported key is never overwritten with an empty
# value. When the variable is missing, a hidden prompt asks for the key.
openrouter_key = os.environ.get("OPENROUTER_API_KEY", "")
if not openrouter_key:
    from getpass import getpass  # only needed on this fallback path
    openrouter_key = getpass("OpenRouter API key: ")
model = "openai/gpt-4.1-mini"

# Set API key and initialize with REFinD ontology
os.environ["OPENROUTER_API_KEY"] = openrouter_key
kg_system = FinancialKGConstructionGraph(model_name=model, ontology="refind", prompts="refind")

In [25]:
# Timestamp taken before the run; the elapsed time is printed at the end of the cell.
start_ma = datetime.now()
print("Track A: Full pipeline with span matching (system capability)")
# Track A: Full Pipeline Evaluation
print("\nRunning Track A: Full Pipeline Evaluation...")
track_a_results = evaluate_track_a_full_pipeline(refind_test_sampled, kg_system, max_samples=787)

# Relation-level scores reported under the 'strict_f1_relation' key
strict_scores = track_a_results['strict_f1_relation']

print("\nTRACK A: Full Pipeline (System Capability)")
print(f"  Strict Micro-F1: {strict_scores['f1']:.3f}")
print(f"  Precision: {strict_scores['precision']:.3f}")
print(f"  Recall:    {strict_scores['recall']:.3f}")
print(f"  (TP:{strict_scores['tp']}, FP:{strict_scores['fp']}, FN:{strict_scores['fn']})")
print(f"  True negatives excluded: {track_a_results['true_negatives_excluded']}")

# Entity-level scores reported under the 'ner' key
ner_scores = track_a_results['ner']
print(f"  NER F1: {ner_scores['f1']:.3f} (P={ner_scores['precision']:.3f}, R={ner_scores['recall']:.3f})")

end_ma = datetime.now()
print(f"Evaluation Duration: {end_ma - start_ma}")

Track A: Full pipeline with span matching (system capability)

Running Track A: Full Pipeline Evaluation...
Processing sample 1/787...
[0.0s] Starting knowledge graph construction...
[1.0s] Creating ontology...

[ONTOLOGY] Using predefined refind ontology
[1.9s] Extracting entities and relations...
[7.2s] NER completed: 2 entities                                                                    
[18.5s] RE completed: 0 filtered triples                                                            

[REVISION] Validated: 2/2 entities, 0/0 triples
[19.5s] Finalizing results...
[19.5s] Construction completed!
Track A Sample 1: Gold='merk' → 'gmbh' (no_relation)
  System triples: 0
  Predicted: no_relation
  NER: Gold=2, Sys=2, TP=1


=== DEBUG SAMPLE 1 ===
Text: ( 3 ) The GmbH shall insure Mr Merk at its own expense against accidents as follows : ....
Gold relation: no_relation
Predicted relation: no_relation
Strict F1 category: TN
  → EXCLUDED from strict F1 calculation
[0.0s] Starting kn

In [None]:
# Track B evaluation, combined report and export of both tracks' results

print("Running separate Track A and Track B REFinD evaluations...")
print("Track B: Direct classification (Li et al. comparable)")

# Track B: Direct Classification Evaluation
print("\nRunning Track B: Direct Classification Evaluation...")
track_b_results = evaluate_track_b_direct_classification(refind_test_sampled, kg_system, max_samples=844)

# The sample count shown in the header is the one reported by Track A.
processed_count = track_a_results['processed_samples']
print(f"\n=== REFinD EVALUATION RESULTS (n={processed_count}) ===")

print("\nTRACK B: Direct Classification (Li et al. Comparable)")
print(f"  Macro-F1: {track_b_results['macro_f1']:.3f}")
print(f"  Relations evaluated: {track_b_results['num_relations']}")
print("  Per-relation breakdown:")
for rel_type, metrics in track_b_results['per_relation_f1'].items():
    # Only show relations that appear (at least one TP or FN)
    if not (metrics['tp'] > 0 or metrics['fn'] > 0):
        continue
    print(f"    {rel_type}: F1={metrics['f1']:.3f} (P={metrics['precision']:.3f}, R={metrics['recall']:.3f})")

# Summary
print("\n=== EVALUATION METHODOLOGY VALIDATION ===")
print("Track A: Proper entity pair span matching implemented")
print("Track B: Direct classification enables Li et al. comparison")
print(f"Strict F1: Excludes {track_a_results['true_negatives_excluded']} true negatives")
print(f"Macro-F1: Computed across {track_b_results['num_relations']} relation types")
print("NER: Evaluated only in Track A (system capability)")

# Save results to files
import json
evaluation_summary = {
    'track_a_micro_f1': track_a_results['strict_f1_relation']['f1'],
    'track_b_macro_f1': track_b_results['macro_f1'],
    'processed_samples': track_a_results['processed_samples'],
}
results = {
    'track_a': track_a_results,
    'track_b': track_b_results,
    'evaluation_summary': evaluation_summary,
}

with open('separate_track_refind_results.json', 'w') as results_file:
    json.dump(results, results_file, indent=2)

print(f"\nResults for {model} saved to separate_track_refind_results.json")
# NOTE(review): start_ma is not assigned in this cell; it is the timestamp set
# by the Track A cell, so this duration presumably spans both cells - confirm
# that this is intended.
end_ma = datetime.now()
print(f"Evaluation Duration: {end_ma - start_ma}")

Running separate Track A and Track B REFinD evaluations...
Track B: Direct classification (Li et al. comparable)

Running Track B: Direct Classification Evaluation...
Track B: Processing sample 1/787...

[ONTOLOGY] Using predefined refind ontology
Track B Sample 1: 'Merk' → 'GmbH' = no_relation (gold: no_relation)
[0.0s] Starting knowledge graph construction...
[0.7s] Creating ontology...

[ONTOLOGY] Using predefined refind ontology
[1.4s] Extracting entities and relations...
[6.7s] NER completed: 2 entities                                                                    
[15.6s] RE completed: 0 filtered triples                                                            

[REVISION] Validated: 2/2 entities, 0/0 triples
[16.5s] Finalizing results...
[16.5s] Construction completed!

[ONTOLOGY] Using predefined refind ontology
Track B Sample 2: 'Mirror' → 'Mississippi' = no_relation (gold: no_relation)
[0.0s] Starting knowledge graph construction...
[0.9s] Creating ontology...

[ONTOLO