# Milestone 3 - Notebook 5: Evaluation & Analysis

## Objective

- Compute quantitative metrics
- Compare M2 vs M3 performance
- Ablation analysis
- Success case analysis

In [1]:
import json
import sys
from pathlib import Path
import pandas as pd
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    classification_report
)

# Make the sibling `src/` directory importable so that `utils` resolves.
# The location is derived from the kernel's current working directory, so this
# relies on the notebook being run from its own folder.
src_dir = str(Path.cwd().parent / 'src')
# Re-running this cell must not stack duplicate entries on sys.path: drop any
# earlier copy, then put the directory back at the front so it keeps priority.
if src_dir in sys.path:
    sys.path.remove(src_dir)
sys.path.insert(0, src_dir)
from utils import preprocess_data

print('Imports successful!')

Successfully imported functions from Milestone 2: /Users/egeaydin/Github/TUW2025WS/Token13-tuw-nlp-ie-2025WS/milestone_2/rule_based
Imports successful!


## 1. Load Predictions

In [2]:
def _read_json(path):
    """Return the parsed contents of the JSON file at `path`.

    The file is opened explicitly as UTF-8 so decoding does not depend on the
    platform's locale encoding.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)


# Model predictions for each split.
train_preds_data = _read_json('../data/predictions/train_predictions.json')
test_preds_data = _read_json('../data/predictions/test_predictions.json')

# Processed datasets for each split; the gold relation labels are read from these.
train_data = _read_json('../../data/processed/train/train.json')
test_data = _read_json('../../data/processed/test/test.json')

print(f'Loaded predictions for {len(test_preds_data)} test samples')

Loaded predictions for 2717 test samples


## 2. Prepare Ground Truth

In [3]:
def get_directed_label(item):
    """Build the directed relation label for one annotated sample.

    Args:
        item: Mapping with a 'relation' entry that holds a 'type' and,
            optionally, a 'direction' string such as '(e1,e2)'.

    Returns:
        'Other' when the relation type is 'Other'; otherwise the string
        '<type>(<direction>)', where any parentheses already present in the
        direction are stripped first so they are not doubled.
        A missing or null direction yields '<type>()'.

    Raises:
        KeyError: if 'relation' or its 'type' entry is absent.
    """
    relation = item['relation']
    rel_type = relation['type']
    if rel_type == 'Other':
        return 'Other'
    # A JSON null direction is treated like an absent one instead of crashing
    # on `.replace` with an AttributeError.
    direction = relation.get('direction') or ''
    direction = direction.replace('(', '').replace(')', '')
    return f"{rel_type}({direction})"

# Gold directed labels, one per sample of the processed datasets.
train_true = [get_directed_label(item) for item in train_data]
test_true = [get_directed_label(item) for item in test_data]

# Predicted labels, one per entry of the prediction files.
train_preds = [p['prediction'] for p in train_preds_data]
test_preds = [p['prediction'] for p in test_preds_data]

# Gold and predicted labels are compared position by position downstream, so a
# length mismatch means the files are out of sync; fail here with a clear message.
# NOTE(review): this assumes the prediction files keep the same sample order as
# the processed data — only the lengths are verified here; confirm the ordering.
for split_name, gold, pred in (
    ('train', train_true, train_preds),
    ('test', test_true, test_preds),
):
    if len(gold) != len(pred):
        raise ValueError(
            f'{split_name}: {len(gold)} gold labels but {len(pred)} predictions'
        )

## 3. Compute Metrics

In [4]:
def _report_split(heading, y_true, y_pred):
    """Print `heading`, then compute and print the scores for one split.

    Accuracy is computed directly; precision, recall and F1 are macro-averaged
    with `zero_division=0`.

    Returns:
        Tuple of (accuracy, precision, recall, f1).
    """
    print(heading)
    macro_kwargs = {'average': 'macro', 'zero_division': 0}
    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred, **macro_kwargs)
    rec = recall_score(y_true, y_pred, **macro_kwargs)
    f1 = f1_score(y_true, y_pred, **macro_kwargs)

    print(f'  Accuracy: {acc:.3f}')
    print(f'  Precision: {prec:.3f}')
    print(f'  Recall: {rec:.3f}')
    print(f'  F1: {f1:.3f}')
    return acc, prec, rec, f1


banner = '=' * 80
print(banner)
print('MILESTONE 3 RESULTS')
print(banner)

# The per-split names are kept because the comparison cell reads the test ones.
train_acc, train_prec, train_rec, train_f1 = _report_split(
    '\nTRAIN SET:', train_true, train_preds
)
test_acc, test_prec, test_rec, test_f1 = _report_split(
    '\nTEST SET:', test_true, test_preds
)

MILESTONE 3 RESULTS

TRAIN SET:
  Accuracy: 0.446
  Precision: 0.414
  Recall: 0.411
  F1: 0.383

TEST SET:
  Accuracy: 0.360
  Precision: 0.277
  Recall: 0.313
  F1: 0.283


## 4. Detailed Classification Report

In [5]:
# Per-class precision / recall / F1 / support table for the test split.
separator = '=' * 80
print('\n' + separator)
print('TEST SET - DETAILED REPORT')
print(separator)
detailed_report = classification_report(test_true, test_preds, zero_division=0)
print(detailed_report)


TEST SET - DETAILED REPORT
                           precision    recall  f1-score   support

      Cause-Effect(e1,e2)       0.29      0.49      0.36       134
      Cause-Effect(e2,e1)       0.37      0.51      0.43       194
   Component-Whole(e1,e2)       0.11      0.02      0.04       162
   Component-Whole(e2,e1)       0.28      0.30      0.29       150
 Content-Container(e1,e2)       0.34      0.69      0.45       153
 Content-Container(e2,e1)       0.12      0.15      0.13        39
Entity-Destination(e1,e2)       0.74      0.65      0.69       291
Entity-Destination(e2,e1)       0.00      0.00      0.00         1
     Entity-Origin(e1,e2)       0.35      0.28      0.31       211
     Entity-Origin(e2,e1)       0.00      0.00      0.00        47
 Instrument-Agency(e1,e2)       0.09      0.23      0.13        22
 Instrument-Agency(e2,e1)       0.33      0.37      0.35       134
 Member-Collection(e1,e2)       0.11      0.06      0.08        32
 Member-Collection(e2,e1)       0

## 5. M2 vs M3 Comparison

**M2 Baseline (from plan; hard-coded in the next cell, not recomputed here):**
- Test Accuracy: 49.7%
- Macro Recall: 40.2%
- Macro F1: 43.0%

In [6]:
# M2 figures are fixed constants; M3 figures are the test-set scores computed
# earlier in this notebook.
m2_metrics = dict(accuracy=0.497, recall=0.402, f1=0.430)
m3_metrics = dict(accuracy=test_acc, recall=test_rec, f1=test_f1)

print('\nM2 vs M3 COMPARISON:')
print(f"{'Metric':<15} {'M2':<10} {'M3':<10} {'Change':<15}")
print('-'*50)
# Dict insertion order gives the rows: accuracy, recall, f1.
for metric, m2 in m2_metrics.items():
    m3 = m3_metrics[metric]
    change = m3 - m2
    # Relative change is reported as 0 when the baseline is not positive,
    # which avoids dividing by zero.
    if m2 > 0:
        pct_change = change / m2 * 100
    else:
        pct_change = 0
    print(f"{metric:<15} {m2:<10.3f} {m3:<10.3f} {change:+.3f} ({pct_change:+.1f}%)")


M2 vs M3 COMPARISON:
Metric          M2         M3         Change         
--------------------------------------------------
accuracy        0.497      0.360      -0.137 (-27.6%)
recall          0.402      0.313      -0.089 (-22.1%)
f1              0.430      0.283      -0.147 (-34.2%)


## Summary

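Milestone 3 evaluation complete. In the run recorded above, M3 scores below the M2 baseline on the test set: accuracy 0.360 vs 0.497, macro recall 0.313 vs 0.402, and macro F1 0.283 vs 0.430. The metrics are only printed in this notebook; no cell here writes them to disk.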