# Analysis of TAC KBP 2015 submissions

In [52]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [57]:
# imports
import os
from collections import Counter, defaultdict

from tqdm import tqdm
from kbpo.util import Provenance, EvaluationEntry, OutputEntry, invert_dict
from kbpo.jupyter_util import to_table

import numpy as np

OUTPUT_DIR="data/KBP2015_Cold_Start_Slot-Filling_Evaluation_Results_2016-03-31/corrected_runs/"
GOLD_FILE="data/KBP2015_Cold_Start_Slot-Filling_Evaluation_Results_2016-03-31/SF_aux_files/batch_00_05_poolc.assessed.fqec"
QUERYSET="data/KBP2015_Cold_Start_Slot-Filling_Evaluation_Results_2016-03-31/SF_aux_files/batch_00_05_queryids.v3.0.txt"

In [3]:
# Construct a map of (a) the queries that were evaluated and 
# (b) the mapping from specific query entrypoints to the common query entity.
Q = {}
with open(QUERYSET) as f:
    for line in f:
        fields = line.split()
        if len(fields) != 2: # this is the case for a few queries that were only partially assessed.
            pass
        else:
            ldc_query, cssf_query = fields
            Q[cssf_query] = ldc_query

In [4]:
# Open an read files
evaluation_data = []
with open(GOLD_FILE) as f:
    for line in tqdm(f):
        entry = EvaluationEntry.from_line(line)
        if entry.query_id in Q:
            evaluation_data.append(entry)
print("Loaded {} evaluation entries".format(len(evaluation_data)))

363391it [00:04, 85999.56it/s]

Loaded 83562 evaluation entries





In [5]:
output_data = []
for fname in tqdm(os.listdir(OUTPUT_DIR)):
    if not fname.endswith('.txt'): continue
    with open(os.path.join(OUTPUT_DIR, fname)) as f:
        for line in f:
            entry = OutputEntry.from_line(line)
            if entry.query_id in Q:
                output_data.append(entry)
print("Loaded {} output entries from {} systems.".format(len(output_data), sum(1 for d in os.listdir(OUTPUT_DIR) if d.endswith('.txt'))))

100%|██████████| 72/72 [00:24<00:00,  2.30it/s]

Loaded 140385 output entries from 71 systems.





## Basic statistics of evaluation data

In [28]:
def p2k(provs):
    """Converts a provenance string into a key"""
    return tuple(map(str, provs))
def k(entry):
    """Key for both evaluation entries and output entries, based on their entity id, slot value and provenance."""
    return (entry.query_id,entry.slot_value,p2k(entry.slot_provenances),p2k(entry.relation_provenances))
def kk(entry):
    """Key for LDC queries without provenance"""
    return (Q[entry.query_id],entry.slot_value)
def k2kk(key):
    return (Q[key[0]], key[1])
def e(entry):
    """Get a representation of the equivalence class"""
    return entry.eq_class

### Class label distribution aggregated across systems
(Note: 'C' is correct and 'W' is incorrect. 'X' is "inexact" and is scored to be 'wrong'.)

In [7]:
labels = Counter()
for entry in evaluation_data:
    labels[entry.slot_value_label] +=1

labels["*"] = len(evaluation_data)
tbl = [(lbl, cnt, cnt/len(evaluation_data)) for lbl, cnt in labels.items()]

to_table(sorted(tbl, key=lambda r:r[0]), cols="label count %".split())

label,count,%
*,83562,1.0
C,20442,0.2446327277949307
W,59582,0.7130274526698739
X,3538,0.0423398195351954


### Class label distribution after collapsing output using key, aggregated across systems 
In order to match output of systems with the evaluation data, we have an imperfect key to compare the two, namely the query id, relation, slot value and provenance strings. This shows that the difference between using the key function `k` and not is minimal.

In [8]:
evaluation_data_ = {}
for entry in evaluation_data:
    evaluation_data_[k(entry)] = entry
evaluation_data_ = list(evaluation_data_.values())

labels = Counter()
for entry in evaluation_data_:
    labels[entry.slot_value_label] +=1

labels["*"] = len(evaluation_data_)
tbl = [(lbl, cnt, cnt/len(evaluation_data_)) for lbl, cnt in labels.items()]

to_table(sorted(tbl, key=lambda r:r[0]), cols="label count %".split())

label,count,%
*,83368,1.0
C,20317,0.2437026197102005
W,59533,0.7140989348431053
X,3518,0.0421984454466941


## How many lines of output are reported by each team?

First, we construct an index over the gold output labels.

In [9]:
print("Building index over gold output answers")
gold_labels = defaultdict(set)
for entry in evaluation_data_:
    gold_labels[entry.slot_value_label == "C"].add(k(entry))

print("Intermediate output")
to_table([(lbl, len(ks)) for lbl, ks in gold_labels.items()], cols="label count".split())
print("Removing overlaps")
print("C \ !C: {}".format(len(gold_labels[True].intersection(gold_labels[False]))))
gold_labels[False].difference_update(gold_labels[True])
to_table([(lbl, len(ks)) for lbl, ks in gold_labels.items()], cols="label count".split())

Building index over gold output answers
Intermediate output


label,count
False,63051
True,20317


Removing overlaps
C \ !C: 0


label,count
False,63051
True,20317


Let's also create an index over equivalence classes.

In [30]:
gold_eq = defaultdict(lambda: None)
print("Building index over gold eq classes")
for entry in evaluation_data_:
    if entry.slot_value_label == "C":
        gold_eq[k(entry)] = e(entry)
gold_es = invert_dict(gold_eq)        
print("%d entries map to %d eq classes"%(len(gold_eq), len(gold_es)))
print("Equivalence classes")
to_table([[e, len(ks)] for e, ks in gold_es.items()], cols="eq-class count".split())

Building index over gold eq classes
20317 entries map to 4322 eq classes
Equivalence classes


eq-class,count
CSSF15_ENG_0458206f71:2,7
CSSF15_ENG_056e81f24d:2,16
CSSF15_ENG_c623245dc3:31,3
CSSF15_ENG_76e7c9e1fb:1,21
CSSF15_ENG_5b9e9c5d4b:1,1
CSSF15_ENG_283200ba9f:1,24
CSSF15_ENG_80db4d4ceb:1,3
CSSF15_ENG_9e42bfe1cf:7,1
CSSF15_ENG_8b4f322c35:1,15
CSSF15_ENG_fbbb2d8249:3,6


In [10]:
# Build per system output
per_system_output = defaultdict(set)
for entry in output_data:
    per_system_output[entry.run_id].add(k(entry))

# restricted to correct entries.
per_system_true_output = defaultdict(set)
for run_id, vs in per_system_output.items():
    per_system_true_output[run_id] = vs.intersection(gold_labels[True])

This table summarizes the output produced by each team.

Note that we are only looking at assessed queries, so all unassessed % should be 0.

In [11]:
tbl = []
for run_id, vs in per_system_output.items():
    total = len(vs)
    correct = len(vs.intersection(gold_labels[True]))
    incorrect = len(vs.intersection(gold_labels[False]))
    evaluated = correct + incorrect
    unassessed = len(vs) - evaluated
    missing = len(gold_labels[True].difference(vs))
    tbl.append([run_id, correct, incorrect, evaluated, unassessed, missing, total])
tbl = [(run_id, c, i, e, u, m, t, "%.2f"%(e and 100*c/e), "%.2f"%((c+m) and 100*c/(c+m)), "%.2f"%(t and 100*u/t)) for (run_id, c, i, e, u, m, t) in tbl]
to_table(tbl, cols="run_id correct incorrect evaluated unassessed missing total prec% rec% unass%".split())

run_id,correct,incorrect,evaluated,unassessed,missing,total,prec%,rec%,unass%
KB_ICTCAS_OKN1,26,43,69,0,20291,69,37.68,0.13,0.0
SF_UWashington2,145,148,293,0,20172,293,49.49,0.71,0.0
SF_CMUML3,18,14,32,0,20299,32,56.25,0.09,0.0
KB_ICTCAS_OKN4,5,2,7,0,20312,7,71.43,0.02,0.0
KB_BBN1,1569,1732,3301,0,18748,3301,47.53,7.72,0.0
SF_UWashington1,485,621,1106,0,19832,1106,43.85,2.39,0.0
KB_Stanford1,652,484,1136,0,19665,1136,57.39,3.21,0.0
SF_ZJU_DCD_SF2,178,1403,1581,0,20139,1581,11.26,0.88,0.0
KB_BBN5,1447,1707,3154,0,18870,3154,45.88,7.12,0.0
KB_BBN3,1560,1975,3535,0,18757,3535,44.13,7.68,0.0


In [12]:
everything = set()
for x in per_system_output.values(): 
    everything.update(x)
print("Total quantity of system output: ", len(everything))
print("Total quantity of gold output: ", len(gold_labels[True]))
# These are true labels that aren't predicted by any system -- not sure why they exist.
print("Total quantity of gold output not produced by systems: ", len(gold_labels[True].difference(everything)))

Total quantity of system output:  60799
Total quantity of gold output:  20317
Total quantity of gold output not produced by systems:  4387


## How much overlap exists between system outputs?

In [13]:
tbl = []
for run_id, vs in per_system_output.items():
    diff = set(vs)
    for run_id_, vs_ in per_system_output.items():
        if run_id_ != run_id:
            diff.difference_update(vs_)
    tbl.append([run_id, len(diff), len(vs), len(everything)])
tbl = [(run_id, d, s, t, "%.2f"%(100*d/s), "%2.f"%(100*s/t), "%.2f"%(100*d/t)) for (run_id, d, s, t) in tbl]
to_table(tbl, cols="run_id unique system total unique% system% unique_total%".split())

run_id,unique,system,total,unique%,system%,unique_total%
KB_ICTCAS_OKN1,31,69,60799,44.93,0,0.05
SF_UWashington2,0,293,60799,0.0,0,0.0
SF_CMUML3,8,32,60799,25.0,0,0.01
KB_ICTCAS_OKN4,3,7,60799,42.86,0,0.0
KB_BBN1,2,3301,60799,0.06,5,0.0
SF_UWashington1,0,1106,60799,0.0,2,0.0
KB_Stanford1,123,1136,60799,10.83,2,0.2
SF_ZJU_DCD_SF2,1208,1581,60799,76.41,3,1.99
KB_BBN5,727,3154,60799,23.05,5,1.2
KB_BBN3,372,3535,60799,10.52,6,0.61


## Restricted to correct output

In [14]:
def get_system_name(name):
    return name[:-1].split('_', 1)[-1]

In [15]:
tbl = []
for run_id, vs in per_system_true_output.items():
    diff = set(vs)
    for run_id_, vs_ in per_system_true_output.items():
        if get_system_name(run_id_) != get_system_name(run_id): # the systems need to be distinct
            diff.difference_update(vs_)
    tbl.append([run_id, len(diff), len(vs)])
tbl.append(["total", sum(d for _, d, _ in tbl), sum(s for _, _, s in tbl)])
t = len(gold_labels[True])
tbl = [(run_id, d, s, t, "%.2f"%(100*d/s), "%.2f"%(100*s/t), "%.2f"%(100*d/t)) for (run_id, d, s) in tbl]
to_table(sorted(tbl), cols="run_id unique system total unique% system% unique_total%".split())

run_id,unique,system,total,unique%,system%,unique_total%
KB_BBN1,1375,1569,20317,87.64,7.72,6.77
KB_BBN2,1231,1413,20317,87.12,6.95,6.06
KB_BBN3,1371,1560,20317,87.88,7.68,6.75
KB_BBN4,1545,1738,20317,88.9,8.55,7.6
KB_BBN5,1249,1447,20317,86.32,7.12,6.15
KB_ICTCAS_OKN1,26,26,20317,100.0,0.13,0.13
KB_ICTCAS_OKN2,3,3,20317,100.0,0.01,0.01
KB_ICTCAS_OKN3,42,42,20317,100.0,0.21,0.21
KB_ICTCAS_OKN4,5,5,20317,100.0,0.02,0.02
KB_NYU1,736,736,20317,100.0,3.62,3.62


## Aggregating by query id instead of entry point

In [16]:
print("Building index over gold output answers")
gold_labels_by_id = defaultdict(set)
gold_labels_by_id[True].update(k2kk(key) for key in gold_labels[True])
gold_labels_by_id[False].update(k2kk(key) for key in gold_labels[False])

print("Intermediate output")
to_table([(lbl, len(ks)) for lbl, ks in gold_labels_by_id.items()], cols="label count".split())
print("Removing overlaps")
print("C \ !C: {}".format(len(gold_labels_by_id[True].intersection(gold_labels_by_id[False]))))
gold_labels_by_id[False].difference_update(gold_labels_by_id[True])
to_table([(lbl, len(ks)) for lbl, ks in gold_labels_by_id.items()], cols="label count".split())

per_system_true_output_by_id = {}
for run_id, vs in per_system_true_output.items():
    per_system_true_output_by_id[run_id] = set(k2kk(key) for key in vs)

Building index over gold output answers
Intermediate output


label,count
False,10400
True,1415


Removing overlaps
C \ !C: 179


label,count
False,10221
True,1415


In [17]:
tbl = []
for run_id, vs in per_system_true_output_by_id.items():
    diff = set(vs)
    for run_id_, vs_ in per_system_true_output_by_id.items():
        if get_system_name(run_id_) != get_system_name(run_id):
            diff.difference_update(vs_)
    tbl.append([run_id, len(diff), len(vs)])
tbl.append(["total", sum(d for _, d, _ in tbl), sum(s for _, _, s in tbl)])
t = len(gold_labels_by_id[True])
tbl = [(run_id, d, s, t, "%.2f"%(100*d/s), "%.2f"%(100*s/t), "%.2f"%(100*d/t)) for (run_id, d, s) in tbl]
to_table(sorted(tbl), cols="run_id unique system total unique% system% unique_total%".split())

run_id,unique,system,total,unique%,system%,unique_total%
KB_BBN1,77,480,1415,16.04,33.92,5.44
KB_BBN2,60,435,1415,13.79,30.74,4.24
KB_BBN3,82,479,1415,17.12,33.85,5.8
KB_BBN4,90,523,1415,17.21,36.96,6.36
KB_BBN5,70,450,1415,15.56,31.8,4.95
KB_ICTCAS_OKN1,1,12,1415,8.33,0.85,0.07
KB_ICTCAS_OKN2,0,3,1415,0.0,0.21,0.0
KB_ICTCAS_OKN3,1,20,1415,5.0,1.41,0.07
KB_ICTCAS_OKN4,0,5,1415,0.0,0.35,0.0
KB_NYU1,23,267,1415,8.61,18.87,1.63


# Evaluation
Here is a table evaluating the performance of submitted systems.

In [24]:
tbl = []
for run_id, vs in per_system_output.items():
    precision = len(gold_labels[True].intersection(vs))/len(vs)  if len(vs) > 0 else 0.
    recall = len(gold_labels[True].intersection(vs))/len(gold_labels[True])
    f1 = 2 * (precision * recall) / (precision + recall)
    tbl.append([run_id, "%.2f"%(100*precision), "%.2f"%(100*recall), "%.2f"%(100*f1)])
to_table(sorted(tbl), cols="run_id precision recall f1".split())

run_id,precision,recall,f1
KB_BBN1,47.53,7.72,13.29
KB_BBN2,50.16,6.95,12.22
KB_BBN3,44.13,7.68,13.08
KB_BBN4,36.51,8.55,13.86
KB_BBN5,45.88,7.12,12.33
KB_ICTCAS_OKN1,37.68,0.13,0.26
KB_ICTCAS_OKN2,75.0,0.01,0.03
KB_ICTCAS_OKN3,44.68,0.21,0.41
KB_ICTCAS_OKN4,71.43,0.02,0.05
KB_NYU1,43.09,3.62,6.68


## By entry point

For a single relation pair, many relation instances may exist. In the KBP output, systems are explicitly discouraged from producing more than one such output and hence different systems may output different lines. To account for this effect, we aggregate by entry point.


In [36]:
tbl = []
for run_id, vs in per_system_output.items():
    es = set([gold_eq[v] for v in vs if gold_eq[v] is not None])
    precision = len(es.intersection(gold_es))/len(vs) if len(vs) > 0 else 0. # Implicitly penalizing duplicates
    recall = len(es.intersection(gold_es))/len(gold_es)
    f1 = 2 * (precision * recall) / (precision + recall)
    tbl.append([run_id, "%.2f"%(100*precision), "%.2f"%(100*recall), "%.2f"%(100*f1)])
to_table(sorted(tbl), cols="run_id precision recall f1".split())

run_id,precision,recall,f1
KB_BBN1,45.29,34.59,39.22
KB_BBN2,47.64,31.05,37.6
KB_BBN3,42.07,34.41,37.85
KB_BBN4,34.92,38.45,36.6
KB_BBN5,43.98,32.09,37.11
KB_ICTCAS_OKN1,37.68,0.6,1.18
KB_ICTCAS_OKN2,75.0,0.07,0.14
KB_ICTCAS_OKN3,42.55,0.93,1.81
KB_ICTCAS_OKN4,71.43,0.12,0.23
KB_NYU1,39.11,15.46,22.16


## By query id
Finally, we also present results collapsed by query id.

In [None]:
# TODO
# Map equivalence classes through LDC

# Quantifying pooling bias
We run a leave-one-out and leave-team-out analysis to quantify the pooling bias present in the system.

In [46]:
def make_loo_pool(run_id=None):
    """
    Make a pool containing all inputs except run_id.
    """
    pool = set()
    for run_id_, vs in per_system_output.items():
        if run_id is None or run_id_ != run_id:
            pool.update([gold_eq[v] for v in vs if gold_eq[v] is not None])
    return pool

def make_lto_pool(run_id=None):
    """
    Make a pool containing all inputs except run_id.
    """
    pool = set()
    for run_id_, vs in per_system_output.items():
        if run_id is None or get_system_name(run_id_) != get_system_name(run_id):
            pool.update([gold_eq[v] for v in vs if gold_eq[v] is not None])
    return pool

## LOO

In [68]:
gold_pool = make_loo_pool()

tbl = []
for run_id, vs in per_system_output.items():
    if run_id == "LDC": continue
    biased_pool = make_loo_pool(run_id)
    es = set([gold_eq[v] for v in vs if gold_eq[v] is not None])
    precision = len(es.intersection(gold_pool))/len(vs) if len(vs) > 0 else 0. # Implicitly penalizing duplicates
    recall = len(es.intersection(gold_pool))/len(gold_pool)
    f1 = 2 * (precision * recall) / (precision + recall)
    biased_precision = len(es.intersection(biased_pool))/len(vs) if len(vs) > 0 else 0. # Implicitly penalizing duplicates
    biased_recall = len(es.intersection(biased_pool))/len(biased_pool)
    biased_f1 = 2 * (biased_precision * biased_recall) / (biased_precision + biased_recall)
    p_bias = precision - biased_precision
    r_bias = recall - biased_recall
    f1_bias  = f1 - biased_f1
    
    tbl.append([run_id, 
                100*precision, 100*recall, 100*f1,
                100*biased_precision, 100*biased_recall, 100*biased_f1,
                100*p_bias, 100*r_bias, 100*f1_bias,
               ])
to_table(sorted(tbl), cols="run_id precision recall f1 biased-p biased-r biased-f1 p-bias r-bias f1-bias".split())


p_bias = np.array([r[-3] for r in tbl])
r_bias = np.array([r[-2] for r in tbl])
f1_bias = np.array([r[-1] for r in tbl])
print("p_bias (min/avg/max): {}/{}/{}".format(p_bias.min(), p_bias.mean(), p_bias.max()))
print("r_bias (min/avg/max): {}/{}/{}".format(r_bias.min(), r_bias.mean(), r_bias.max()))
print("f1_bias (min/avg/max): {}/{}/{}".format(f1_bias.min(), f1_bias.mean(), f1_bias.max()))

run_id,precision,recall,f1,biased-p,biased-r,biased-f1,p-bias,r-bias,f1-bias
KB_BBN1,45.29,37.56,41.07,45.29,37.56,41.07,0.0,0.0,0.0
KB_BBN2,47.64,33.72,39.49,47.64,33.72,39.49,0.0,0.0,0.0
KB_BBN3,42.07,37.36,39.57,41.53,37.06,39.17,0.54,0.3,0.41
KB_BBN4,34.92,41.76,38.03,33.74,40.93,36.99,1.18,0.83,1.04
KB_BBN5,43.98,34.85,38.88,43.6,34.65,38.61,0.38,0.2,0.27
KB_ICTCAS_OKN1,37.68,0.65,1.28,37.68,0.65,1.28,0.0,0.0,0.0
KB_ICTCAS_OKN2,75.0,0.08,0.15,75.0,0.08,0.15,0.0,0.0,0.0
KB_ICTCAS_OKN3,42.55,1.01,1.96,42.55,1.01,1.96,0.0,0.0,0.0
KB_ICTCAS_OKN4,71.43,0.13,0.25,71.43,0.13,0.25,0.0,0.0,0.0
KB_NYU1,39.11,16.78,23.49,39.11,16.78,23.49,0.0,0.0,0.0


p_bias (min/avg/max): 0.0/0.1982489381264269/2.4896265560165998
r_bias (min/avg/max): 0.0/0.0848812698692956/1.222322650554769
f1_bias (min/avg/max): 0.0/0.1094125314988298/1.4284158378869538


## LTO

In [67]:
gold_pool = make_lto_pool()

tbl = []
for run_id, vs in per_system_output.items():
    if run_id == "LDC": continue
    biased_pool = make_lto_pool(run_id)
    es = set([gold_eq[v] for v in vs if gold_eq[v] is not None])
    precision = len(es.intersection(gold_pool))/len(vs) if len(vs) > 0 else 0. # Implicitly penalizing duplicates
    recall = len(es.intersection(gold_pool))/len(gold_pool)
    f1 = 2 * (precision * recall) / (precision + recall)
    biased_precision = len(es.intersection(biased_pool))/len(vs) if len(vs) > 0 else 0. # Implicitly penalizing duplicates
    biased_recall = len(es.intersection(biased_pool))/len(biased_pool)
    biased_f1 = 2 * (biased_precision * biased_recall) / (biased_precision + biased_recall)
    p_bias = precision - biased_precision
    r_bias = recall - biased_recall
    f1_bias  = f1 - biased_f1
    
    tbl.append([run_id, 
                100*precision, 100*recall, 100*f1,
                100*biased_precision, 100*biased_recall, 100*biased_f1,
                100*p_bias, 100*r_bias, 100*f1_bias,
               ])
to_table(sorted(tbl), cols="run_id precision recall f1 biased-p biased-r biased-f1 p-bias r-bias f1-bias".split())

p_bias = np.array([r[-3] for r in tbl])
r_bias = np.array([r[-2] for r in tbl])
f1_bias = np.array([r[-1] for r in tbl])
print("p_bias (min/avg/max): {}/{}/{}".format(p_bias.min(), p_bias.mean(), p_bias.max()))
print("r_bias (min/avg/max): {}/{}/{}".format(r_bias.min(), r_bias.mean(), r_bias.max()))
print("f1_bias (min/avg/max): {}/{}/{}".format(f1_bias.min(), f1_bias.mean(), f1_bias.max()))

run_id,precision,recall,f1,biased-p,biased-r,biased-f1,p-bias,r-bias,f1-bias
KB_BBN1,45.29,37.56,41.07,39.35,35.14,37.12,5.94,2.43,3.94
KB_BBN2,47.64,33.72,39.49,42.28,32.22,36.57,5.36,1.5,2.92
KB_BBN3,42.07,37.36,39.57,36.38,34.78,35.56,5.69,2.58,4.01
KB_BBN4,34.92,41.76,38.03,29.79,38.36,33.53,5.13,3.4,4.5
KB_BBN5,43.98,34.85,38.88,38.68,33.0,35.62,5.29,1.85,3.27
KB_ICTCAS_OKN1,37.68,0.65,1.28,33.33,0.58,1.14,4.35,0.07,0.15
KB_ICTCAS_OKN2,75.0,0.08,0.15,75.0,0.08,0.15,0.0,-0.0,-0.0
KB_ICTCAS_OKN3,42.55,1.01,1.96,39.36,0.93,1.82,3.19,0.07,0.15
KB_ICTCAS_OKN4,71.43,0.13,0.25,71.43,0.13,0.25,0.0,-0.0,-0.0
KB_NYU1,39.11,16.78,23.49,35.89,15.65,21.79,3.22,1.14,1.7


p_bias (min/avg/max): 0.0/2.3476752280966973/9.03225806451613
r_bias (min/avg/max): -0.05055766638068365/0.7531454242033278/3.40337065364692
f1_bias (min/avg/max): -0.08067934863853904/1.117708642354091/4.49768637086957
