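### NER Evaluation (Conllpp)

Evaluates the entity-extraction stage of `FinancialKGConstructionGraph` (relation extraction disabled) on the `ZihanWangKi/conllpp` test split and reports micro-averaged precision, recall, and F1. A prediction counts as correct only when its lower-cased text and its entity type both match a gold entity exactly.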

In [None]:
# Install `datasets` (imported below) and `huggingface_hub`, each capped below the stated major version.
# NOTE(review): presumably the caps keep `load_dataset("ZihanWangKi/conllpp")` working for this dataset repo — confirm the reason and record it here.
%pip install "datasets<4.0.0" "huggingface_hub<1.0.0"

In [None]:
import sys
import os
sys.path.append('.')

from datasets import load_dataset
from ma_finkg.kg_construction_graph import FinancialKGConstructionGraph
from ma_finkg.utils import set_global_timer

# OpenRouter credentials and model selection.
# Leave `openrouter_key` empty to use an OPENROUTER_API_KEY that is already set
# in the environment; do not commit a real key in the notebook.
openrouter_key = ""
model = "openai/gpt-3.5-turbo"

# Export the key only when one was actually provided, so the empty placeholder
# above never overwrites a valid key coming from the environment.
if openrouter_key:
    os.environ["OPENROUTER_API_KEY"] = openrouter_key
elif not os.environ.get("OPENROUTER_API_KEY"):
    print("Warning: OPENROUTER_API_KEY is not set in this environment.")

# Build the pipeline with the "conllpp" ontology/prompts; enable_re=False turns
# off relation extraction so only entities are produced.
kg_system = FinancialKGConstructionGraph(model_name=model, ontology="conllpp", prompts="conllpp", enable_re=False)

# Load the dataset (train / validation / test splits)
print("Loading Conllpp...")
conllpp_dataset = load_dataset("ZihanWangKi/conllpp")

Loading Conllpp...


### Conllpp NER Test

In [2]:

# Summarise each split (features and row count), one split per output line.
print(
    f"Train: {conllpp_dataset['train']}",
    f"Validation: {conllpp_dataset['validation']}",
    f"Test: {conllpp_dataset['test']}",
    sep="\n",
)

Train: Dataset({
    features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
    num_rows: 14041
})
Validation: Dataset({
    features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
    num_rows: 3250
})
Test: Dataset({
    features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
    num_rows: 3453
})


In [3]:
# Integer tag id -> BIO label; the list position is the tag id.
id2ner = dict(enumerate([
    'O',
    'B-PER', 'I-PER',
    'B-ORG', 'I-ORG',
    'B-LOC', 'I-LOC',
    'B-MISC', 'I-MISC',
]))

# Use the test sample at index 4 as a worked example.
example = conllpp_dataset['test'][4]
tokens = example['tokens']
ner_tags = example['ner_tags']
text = ' '.join(tokens)
text

'But China saw their luck desert them in the second match of the group , crashing to a surprise 2-0 defeat to newcomers Uzbekistan .'

In [4]:
# Decode the example's BIO tags into entity spans ({'type', 'text'} dicts).
entities = []
current = None  # the span being built, or None when outside an entity

for word, label_id in zip(tokens, ner_tags):
    label = id2ner[label_id]
    kind, ent_type = label[:2], label[2:]

    # An I- tag of the same type as the open span extends that span.
    if kind == 'I-' and current is not None and current['type'] == ent_type:
        current['text'] = current['text'] + ' ' + word
        continue

    # Every other tag closes the open span (if any); only B- opens a new one.
    if current is not None:
        entities.append(current)
    current = {'type': ent_type, 'text': word} if kind == 'B-' else None

# A span still open after the last token must be recorded too.
if current is not None:
    entities.append(current)

entities

[{'type': 'LOC', 'text': 'China'}, {'type': 'LOC', 'text': 'Uzbekistan'}]

### Conllpp Micro F1 Evaluation

In [5]:
# Evaluate the first `rows` test samples. Progress (counters and per-sample
# entity sets) is checkpointed to JSON every `batch_size` samples so an
# interrupted run can resume without repeating LLM calls.
import json
from pathlib import Path

rows = 50  # Change to 3453 for full evaluation (test set size)
batch_size = 5
results_file = "conll_evaluation.json"


def gold_entity_set(sample):
    """Decode one sample's BIO tags into a set of (lower-cased text, type) pairs.

    A B- tag opens an entity, a following I- tag of the same type extends it,
    and any other tag closes it (a stray I- token is not added to any entity).
    """
    entities = []
    current = None
    for token, tag_id in zip(sample['tokens'], sample['ner_tags']):
        tag = id2ner[tag_id]
        if tag.startswith('B-'):
            if current:
                entities.append(current)
            current = {'type': tag[2:], 'text': token}
        elif tag.startswith('I-') and current and tag[2:] == current['type']:
            current['text'] += ' ' + token
        else:
            if current:
                entities.append(current)
            current = None
    if current:
        entities.append(current)
    return set((e['text'].lower(), e['type']) for e in entities)


def restore_entity_sets(saved_lists, expected_len):
    """Rebuild per-sample sets from their JSON form.

    Returns None when the checkpoint holds no list or the list does not cover
    exactly `expected_len` samples, so misaligned data is never reused.
    """
    if not isinstance(saved_lists, list) or len(saved_lists) != expected_len:
        return None
    return [set(tuple(pair) for pair in sample_pairs) for sample_pairs in saved_lists]


all_gold_set = []
all_sys_set = []

# Load existing progress if resuming
results_path = Path(results_file)
if results_path.exists():
    with open(results_path, 'r') as f:
        saved_data = json.load(f)
    start_idx = saved_data.get('processed_samples', 0)
    total_tp = saved_data.get('total_tp', 0)
    total_fp = saved_data.get('total_fp', 0)
    total_fn = saved_data.get('total_fn', 0)
    print(f"Resuming from sample {start_idx}")

    # Restore the per-sample sets too; otherwise the final export would only
    # contain the samples processed in this run.
    restored_gold = restore_entity_sets(saved_data.get('all_gold_set'), start_idx)
    restored_sys = restore_entity_sets(saved_data.get('all_sys_set'), start_idx)
    if restored_gold is not None and restored_sys is not None:
        all_gold_set, all_sys_set = restored_gold, restored_sys
    elif start_idx > 0:
        print("Warning: checkpoint has no usable per-sample entity sets; "
              "entity-set export will be skipped for this run.")
else:
    start_idx = 0
    total_tp = total_fp = total_fn = 0

for i in range(start_idx, rows):
    sample = conllpp_dataset['test'][i]
    sample_text = ' '.join(sample['tokens'])

    gold_set = gold_entity_set(sample)
    all_gold_set.append(gold_set)

    # System extraction
    print(f"Processing sample {i+1}/{rows}...")

    result = kg_system.construct_kg(sample_text)
    sys_entities = result['finalize'].get("revised_entities", []) if 'finalize' in result else []
    sys_set = set((e.text.lower(), e.entity_type) for e in sys_entities)

    # Always append (even an empty set) to keep alignment with the gold sets
    all_sys_set.append(sys_set)

    # Exact match on (text, type): shared pairs are TP, extra predictions FP,
    # missed gold entities FN.
    total_tp += len(gold_set & sys_set)
    total_fp += len(sys_set - gold_set)
    total_fn += len(gold_set - sys_set)

    # Save progress every batch_size samples
    if (i + 1) % batch_size == 0 or (i + 1) == rows:
        save_data = {
            'processed_samples': i + 1,
            'total_tp': total_tp,
            'total_fp': total_fp,
            'total_fn': total_fn,
            'all_gold_set': [list(s) for s in all_gold_set],
            'all_sys_set': [list(s) for s in all_sys_set],
        }
        # Write to a temp file and swap it in, so an interrupt mid-write
        # cannot leave a truncated checkpoint behind.
        tmp_path = results_path.with_name(results_path.name + '.tmp')
        with open(tmp_path, 'w') as f:
            json.dump(save_data, f)
        tmp_path.replace(results_path)
        print(f"Progress saved: {i+1}/{rows}")

    if i % 20 == 0: print(f"Sample {i+1}/{rows}")

print(f"FINAL EXTRACTION: {sum(len(s) for s in all_sys_set)} System Entities, {sum(len(s) for s in all_gold_set)} Gold Entities")

# Export only when the lists cover every processed sample; a partial list
# would overwrite a complete export from an earlier run.
if len(all_sys_set) == max(start_idx, rows):
    with open("all_sys_set.json", 'w') as f:
        json.dump([list(s) for s in all_sys_set], f)
    with open("all_gold_set.json", 'w') as f:
        json.dump([list(s) for s in all_gold_set], f)
    print("Saved final entity sets to JSON files")
else:
    print(f"Skipped entity-set export: per-sample data is incomplete. "
          f"Delete {results_file} and re-run to rebuild it.")

Processing sample 1/50...
[0.0s] Starting knowledge graph construction...
[2.3s] Creating ontology...

[ONTOLOGY] Using predefined conllpp ontology (no LLM call needed)
[4.3s] Extracting entities and relations...
[12.2s] NER completed: 0 entities                                                                   
[SKIP] Relation extraction disabled for faster NER testing                                          

[REVISION] Validated: 0/0 entities, 0/0 triples
[13.3s] Finalizing results...
[13.3s] Construction completed!
Sample 1/50]
Processing sample 2/50...
[0.0s] Starting knowledge graph construction...
[7.0s] Creating ontology...

[ONTOLOGY] Using predefined conllpp ontology (no LLM call needed)
[8.3s] Extracting entities and relations...
[11.3s] NER completed: 1 entities                                                                   
[SKIP] Relation extraction disabled for faster NER testing                                          

[REVISION] Validated: 1/1 entities, 0/0 tripl

### Micro F1 Metrics

In [21]:
# Micro-averaged scores from the pooled TP/FP/FN counts; a zero denominator
# yields 0 rather than raising ZeroDivisionError.
predicted_count = total_tp + total_fp
gold_count = total_tp + total_fn
precision = total_tp / predicted_count if predicted_count > 0 else 0
recall = total_tp / gold_count if gold_count > 0 else 0
pr_sum = precision + recall
f1 = 2 * precision * recall / pr_sum if pr_sum > 0 else 0

print(
    f"\nMicro F1 Results ({rows} samples):",
    f"Precision: {precision:.3f}",
    f"Recall: {recall:.3f}",
    f"F1: {f1:.3f}",
    sep="\n",
)


Micro F1 Results (3453 samples):
Precision: 0.686
Recall: 0.638
F1: 0.661
