## NYT11-HRL Relation Extraction Evaluation

In [1]:
import hashlib
import json
import pandas as pd
import pathlib
import re

In [2]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("dykphd/nyt11-hrl-re")

print("Path to dataset files:", path)

  from .autonotebook import tqdm as notebook_tqdm


Path to dataset files: /Users/zohairy/.cache/kagglehub/datasets/dykphd/nyt11-hrl-re/versions/1


In [3]:
file_path = {
    "train": f"{path}/nyt11/train.json",
    "test": f"{path}/nyt11/test.json",
    "test-plus": f"{path}/nyt11/test-plus.json",
}

In [4]:
with open(file_path["train"]) as f:
    for i in range(3):
        print(f.readline())

{"sentext": " But that spasm of irritation by a master intimidator was minor compared with what Bobby Fischer , the erratic former world chess champion , dished out in March at a news conference in Reykjavik , Iceland . ", "entities": ["Bobby Fischer", "Reykjavik", "Iceland"], "ID": 1, "relations": [{"rtext": "/people/person/nationality", "em2": "Iceland", "em1": "Bobby Fischer", "tags": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 5, 0]}, {"rtext": "/location/country/capital", "em2": "Reykjavik", "em1": "Iceland", "tags": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 4, 0]}, {"rtext": "/people/deceased_person/place_of_death", "em2": "Reykjavik", "em1": "Bobby Fischer", "tags": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 6, 0]}]}

{"sentext": " But Schaap seems as comfortable in that role as Joe Buck , the

### Data Cleaning

In [5]:
def fix_json_format(file_path, save_fixed=False) -> str:
    with open(file_path, "r") as f:
        mal_formatted_json = f.read()
    
    # Add comma before newline
    comma_fixed = re.sub(r"\n", ",\n", mal_formatted_json)
    
    # Remove comma after last elements because JSON standard does not allow "trailing comma" (https://www.json.org/json-en.html)
    print(f"{file_path}: checking trailing comma - {repr(comma_fixed[-10:])}")
    comma_fixed = comma_fixed[:-2]
    
    # Make it a list
    list_fixed = "[" + comma_fixed + "]"

    # Save the fixed JSON data
    if save_fixed:
        file_path = pathlib.Path(file_path)
        parent = file_path.parent
        name = file_path.stem
        save_path = parent / f"{name}_fixed.json"
        with open(save_path, "w") as f:
            f.write(list_fixed)
        print(f"{file_path}: fixed json saved to file - {save_path}")

    return list_fixed

In [6]:
train_fixed_str = fix_json_format(file_path["train"], save_fixed=False)
test_fixed_str = fix_json_format(file_path["test"], save_fixed=False)
test_plus_fixed_str = fix_json_format(file_path["test-plus"], save_fixed=False)

/Users/zohairy/.cache/kagglehub/datasets/dykphd/nyt11-hrl-re/versions/1/nyt11/train.json: checking trailing comma - '5, 0]}]},\n'
/Users/zohairy/.cache/kagglehub/datasets/dykphd/nyt11-hrl-re/versions/1/nyt11/test.json: checking trailing comma - '0, 0]}]},\n'
/Users/zohairy/.cache/kagglehub/datasets/dykphd/nyt11-hrl-re/versions/1/nyt11/test-plus.json: checking trailing comma - 'her . "},\n'


### Data Conversion to Pandas DataFrame

In [7]:
train_fixed_json = json.loads(train_fixed_str)
test_fixed_json = json.loads(test_fixed_str)
test_plus_fixed_json = json.loads(test_plus_fixed_str)

In [8]:
def json_to_pandas(json_data) -> pd.DataFrame:
    triples = []
    for e in json_data:
        """
        - Definition of Triples: (subject, predicate, object) pairs
        - Column Description:
            - sentext: input sentence.
            - em1: subject
            - rtext: predicate
            - em2: object
        - Note
            - There are cases that one sentence has more than one triples.
            - Therefore, hashed `sentext` is used for unique `id`.
            - The `tags` data is omitted because I am only intereseted in entities and relationship between them.
        """
        id = hashlib.md5()
        id.update(e["sentext"].encode("utf-8"))
        for r in e["relations"]:
            t = {
                "id": id.hexdigest(),
                "sentext": e["sentext"],
                "entities": e["entities"],
                "em1": r["em1"],
                "rtext": r["rtext"],
                "em2": r["em2"],
                # "tags": r["tags"]
            }
            triples.append(t)
    return pd.DataFrame.from_dict(triples)

In [9]:
train_df = json_to_pandas(train_fixed_json)
test_df = json_to_pandas(test_fixed_json)
test_plus_df = json_to_pandas(test_plus_fixed_json)

### NYT11 Data Exploration

In [None]:
test_df.describe()

In [None]:
test_df.iloc[1]

In [None]:
test_plus_df.describe()

### Getting Unique Predicates (Relationships)

In [10]:
train_rtext = list(train_df["rtext"].unique())
test_rtext = list(test_df["rtext"].unique())
test_plus_rtext = list(test_plus_df["rtext"].unique())

In [None]:
train_rtext.sort()
test_rtext.sort()
test_plus_rtext.sort()

In [None]:
print(f"Predicates in train set:\n{train_rtext}\n")
print(f"Predicates in test set:\n{test_rtext}\n")
print(f"Predicates in test-plus set:\n{test_plus_rtext}\n")

In [None]:
unique_rtext = list(set(train_rtext + test_rtext + test_plus_rtext))
unique_rtext

### NYT11-HRL Evaluation

In [11]:
# Initialize KG System for evaluation
import sys
import os
sys.path.append('../../src/ma_finkg')

from kg_construction_graph import FinancialKGConstructionGraph
from utils import set_global_timer

# Set OpenRouter API key and model
openrouter_key = "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"
model = "openai/gpt-3.5-turbo"

# Set API key and initialize
os.environ["OPENROUTER_API_KEY"] = openrouter_key
test="nyt11"
kg_system = FinancialKGConstructionGraph(model_name=model, ontology=test, prompts=test)

In [12]:
# Evaluate NYT11-HRL relation extraction

import json
from pathlib import Path
from collections import defaultdict

rows = 10 # Test: 370
batch_size = 10
comparison_file = "nyt11_clean_comparison.json"

comparison_path = Path(comparison_file)
total_tp = total_fp = total_fn = 0
if comparison_path.exists():
    with open(comparison_path, 'r') as f:
        comparison_data = json.load(f)
    # Get already processed sentences from existing comparison data
    processed_sentences = set(item['sentence'] for item in comparison_data)
    
    # Recalculate metrics from existing data
    for item in comparison_data:
        gold_set = set(tuple(rel) for rel in item['gold'])
        sys_set = set(tuple(rel) for rel in item['sys'])
        total_tp += len(gold_set & sys_set)
        total_fp += len(sys_set - gold_set)
        total_fn += len(gold_set - sys_set)
    
    print(f"Resuming from {len(processed_sentences)} processed sentences")
else:
    comparison_data = []
    processed_sentences = set()

# Group relations by sentence to avoid duplicate processing
sentence_relations = defaultdict(list)
sentence_cache = {}

# Collect all gold relations by sentence
for i in range(rows):
    row = test_df.iloc[i]
    text = row['sentext'].strip()
    gold_relation = [row['em1'].lower(), row['rtext'], row['em2'].lower()]
    sentence_relations[text].append(gold_relation)

remaining_sentences = {text: relations for text, relations in sentence_relations.items() 
                      if text not in processed_sentences}

print(f"Processing {len(remaining_sentences)}/{len(sentence_relations)} remaining sentences...")

# Process each remaining sentence
for sentence_idx, (text, gold_relations) in enumerate(remaining_sentences.items(), 1):
    if text not in sentence_cache:
        print(f"Processing sentence {len(processed_sentences) + sentence_idx}/{len(sentence_relations)}...")
        result = kg_system.construct_kg(text)
        sys_triples = result['finalize'].get("revised_triples", []) if 'finalize' in result else []
        
        sys_relations = []
        for triple in sys_triples:
            if hasattr(triple, 'head') and hasattr(triple, 'relation') and hasattr(triple, 'tail'):
                sys_relations.append([
                    triple.head.lower(),
                    triple.relation,
                    triple.tail.lower()
                ])
        sentence_cache[text] = sys_relations
    
    sys_relations = sentence_cache[text]
    
    # skip empty-empty cases
    if gold_relations or sys_relations:
        comparison_data.append({
            "gold": gold_relations,
            "sys": sys_relations,
            "sentence": text 
        })
    
    # Calculate metrics for this sentence
    gold_set = set(tuple(rel) for rel in gold_relations)
    sys_set = set(tuple(rel) for rel in sys_relations)
    
    tp = len(gold_set & sys_set)
    fp = len(sys_set - gold_set) 
    fn = len(gold_set - sys_set)

    total_tp += tp
    total_fp += fp
    total_fn += fn

    # Save progress
    if sentence_idx % batch_size == 0 or sentence_idx == len(remaining_sentences):
        with open(comparison_path, 'w') as f:
            json.dump(comparison_data, f, indent=2)
        print(f"Progress saved: {len(comparison_data)} comparisons")

# Export final data
print(f"\nExporting {len(comparison_data)} non-empty sentence comparisons...")
with open(comparison_file, 'w') as f:
    json.dump(comparison_data, f, indent=2)

# Summary statistics
print(f"FINAL EXTRACTION: {sum(len(item['sys']) for item in comparison_data)} System Relations, {sum(len(item['gold']) for item in comparison_data)} Gold Relations")
print(f"Non-empty cases: {len(comparison_data)}/{len(sentence_relations)} sentences")
print("Saved clean comparison to nyt11_clean_comparison.json")

Processing 10/10 remaining sentences...
Processing sentence 1/10...
[0.0s] Starting knowledge graph construction...
[0.9s] Creating ontology...

[ONTOLOGY] Using predefined nyt11 ontology (no LLM call needed)
[1.5s] Extracting entities and relations...
[3.4s] NER completed: 3 entities                                                                    
[18.1s] RE completed: 8 filtered triples                                                            

[REVISION] Validated: 3/3 entities, 8/8 triples
[18.7s] Finalizing results...
[18.7s] Construction completed!
Processing sentence 2/10...
[0.0s] Starting knowledge graph construction...
[0.5s] Creating ontology...

[ONTOLOGY] Using predefined nyt11 ontology (no LLM call needed)
[1.1s] Extracting entities and relations...
[2.8s] NER completed: 3 entities                                                                    
[18.3s] RE completed: 9 filtered triples                                                            

[REVISION] Validat

KeyboardInterrupt: 

In [None]:
# Calculate micro F1 for relation extraction

precision = total_tp / (total_tp + total_fp) if total_tp + total_fp > 0 else 0
recall = total_tp / (total_tp + total_fn) if total_tp + total_fn > 0 else 0  
f1 = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0

print(f"\nNYT11-HRL Relation Extraction Micro F1 Results ({rows} samples):")
print(f"Precision: {precision:.3f}")
print(f"Recall: {recall:.3f}")
print(f"F1: {f1:.3f}")
print(f"Total TP: {total_tp}, FP: {total_fp}, FN: {total_fn}")

# Show sample comparisons from exported data
print(f"\n=== SAMPLE COMPARISONS FROM CLEAN EXPORT ===")
sample_comparisons = comparison_data[:5] 
for i, item in enumerate(sample_comparisons, 1):
    print(f"\nSample {i}:")
    print(f"Gold: {item['gold']}")
    print(f"Sys:  {item['sys']}")
    print(f"Sentence: {item['sentence']}")
    
    # Show match analysis
    gold_set = set(tuple(rel) for rel in item['gold'])
    sys_set = set(tuple(rel) for rel in item['sys'])
    
    if gold_set & sys_set:
        print(f"✓ MATCHES: {list(gold_set & sys_set)}")
    if sys_set - gold_set:
        print(f"✗ FALSE POSITIVES: {list(sys_set - gold_set)}")
    if gold_set - sys_set:
        print(f"✗ FALSE NEGATIVES: {list(gold_set - sys_set)}")
        
print(f"\n=== ERROR ANALYSIS BENEFITS ===")
print(f"• Total cases: {len(sentence_relations)} unique sentences")
print(f"• Non-empty cases: {len(comparison_data)} ({len(comparison_data)/len(sentence_relations)*100:.1f}%)")
print(f"• Avoided {len(sentence_relations) - len(comparison_data)} empty cases that would dilute analysis")