# Analysis of TAC KBP 2015 submissions

In [3]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
# imports
import os
from collections import Counter, defaultdict

from tqdm import tqdm
from kbpo.util import Provenance, EvaluationEntry, OutputEntry
from kbpo.jupyter_util import to_table

OUTPUT_DIR="data/KBP2015_Cold_Start_Slot-Filling_Evaluation_Results_2016-03-31/corrected_runs/"
GOLD_FILE="data/KBP2015_Cold_Start_Slot-Filling_Evaluation_Results_2016-03-31/SF_aux_files/batch_00_05_poolc.assessed.fqec"
QUERYSET="data/KBP2015_Cold_Start_Slot-Filling_Evaluation_Results_2016-03-31/SF_aux_files/batch_00_05_queryids.v3.0.txt"

In [5]:
# Construct a map of (a) the queries that were evaluated and 
# (b) the mapping from specific query entrypoints to the common query entity.
Q = {}
with open(QUERYSET) as f:
    for line in f:
        fields = line.split()
        if len(fields) != 2: # this is the case for a few queries that were only partially assessed.
            pass
        else:
            ldc_query, cssf_query = fields
            Q[cssf_query] = ldc_query

In [6]:
# Open an read files
evaluation_data = []
with open(GOLD_FILE) as f:
    for line in tqdm(f):
        entry = EvaluationEntry.from_line(line)
        if entry.query_id in Q:
            evaluation_data.append(entry)
print("Loaded {} evaluation entries".format(len(evaluation_data)))

363391it [00:06, 58652.65it/s]

Loaded 83562 evaluation entries





In [7]:
output_data = []
for fname in tqdm(os.listdir(OUTPUT_DIR)):
    if not fname.endswith('.txt'): continue
    with open(os.path.join(OUTPUT_DIR, fname)) as f:
        for line in f:
            entry = OutputEntry.from_line(line)
            if entry.query_id in Q:
                output_data.append(entry)
print("Loaded {} output entries from {} systems.".format(len(output_data), sum(1 for d in os.listdir(OUTPUT_DIR) if d.endswith('.txt'))))

100%|██████████| 71/71 [00:32<00:00,  3.96it/s]

Loaded 140385 output entries from 71 systems.





## Basic statistics of evaluation data

In [8]:
def p2k(provs):
    """Converts a provenance string into a key"""
    return tuple(map(str, provs))
def k(entry):
    """Key for both evaluation entries and output entries, based on their entity id, slot value and provenance."""
    return (entry.query_id,entry.slot_value,p2k(entry.slot_provenances),p2k(entry.relation_provenances))
def kk(entry):
    """Key for LDC queries without provenance"""
    return (Q[entry.query_id],entry.slot_value)
def k2kk(key):
    return (Q[key[0]], key[1])

### Class label distribution aggregated across systems
(Note: 'C' is correct and 'W' is incorrect. 'X' is "inexact" and is scored to be 'wrong'.)

In [9]:
labels = Counter()
for entry in evaluation_data:
    labels[entry.slot_value_label] +=1

labels["*"] = len(evaluation_data)
tbl = [(lbl, cnt, cnt/len(evaluation_data)) for lbl, cnt in labels.items()]

to_table(sorted(tbl, key=lambda r:r[0]), cols="label count %".split())

label,count,%
*,83562,1.0
C,20442,0.2446327277949307
W,59582,0.7130274526698739
X,3538,0.0423398195351954


### Class label distribution after collapsing output using key, aggregated across systems 
In order to match output of systems with the evaluation data, we have an imperfect key to compare the two, namely the query id, relation, slot value and provenance strings. This shows that the difference between using the key function `k` and not is minimal.

In [10]:
evaluation_data_ = {}
for entry in evaluation_data:
    evaluation_data_[k(entry)] = entry
evaluation_data_ = list(evaluation_data_.values())

labels = Counter()
for entry in evaluation_data_:
    labels[entry.slot_value_label] +=1

labels["*"] = len(evaluation_data_)
tbl = [(lbl, cnt, cnt/len(evaluation_data_)) for lbl, cnt in labels.items()]

to_table(sorted(tbl, key=lambda r:r[0]), cols="label count %".split())

label,count,%
*,83368,1.0
C,20317,0.2437026197102005
W,59533,0.7140989348431053
X,3518,0.0421984454466941


## How many lines of output are reported by each team?

First, we construct an index over the gold output labels.

In [11]:
print("Building index over gold output answers")
gold_labels = defaultdict(set)
for entry in evaluation_data_:
    gold_labels[entry.slot_value_label == "C"].add(k(entry))

print("Intermediate output")
to_table([(lbl, len(ks)) for lbl, ks in gold_labels.items()], cols="label count".split())
print("Removing overlaps")
print("C \ !C: {}".format(len(gold_labels[True].intersection(gold_labels[False]))))
gold_labels[False].difference_update(gold_labels[True])
to_table([(lbl, len(ks)) for lbl, ks in gold_labels.items()], cols="label count".split())

Building index over gold output answers
Intermediate output


label,count
False,63051
True,20317


Removing overlaps
C \ !C: 0


label,count
False,63051
True,20317


In [12]:
# Build per system output
per_system_output = defaultdict(set)
for entry in output_data:
    per_system_output[entry.run_id].add(k(entry))

# restricted to correct entries.
per_system_true_output = defaultdict(set)
for run_id, vs in per_system_output.items():
    per_system_true_output[run_id] = vs.intersection(gold_labels[True])

This table summarizes the output produced by each team.

Note that we are only looking at assessed queries, so all unassessed % should be 0.

In [13]:
tbl = []
for run_id, vs in per_system_output.items():
    total = len(vs)
    correct = len(vs.intersection(gold_labels[True]))
    incorrect = len(vs.intersection(gold_labels[False]))
    evaluated = correct + incorrect
    unassessed = len(vs) - evaluated
    missing = len(gold_labels[True].difference(vs))
    tbl.append([run_id, correct, incorrect, evaluated, unassessed, missing, total])
tbl = [(run_id, c, i, e, u, m, t, "%.2f"%(e and 100*c/e), "%.2f"%((c+m) and 100*c/(c+m)), "%.2f"%(t and 100*u/t)) for (run_id, c, i, e, u, m, t) in tbl]
to_table(tbl, cols="run_id correct incorrect evaluated unassessed missing total prec% rec% unass%".split())

run_id,correct,incorrect,evaluated,unassessed,missing,total,prec%,rec%,unass%
KB_BBN5,1447,1707,3154,0,18870,3154,45.88,7.12,0.0
SF_BUPT_PRIS2,324,2279,2603,0,19993,2603,12.45,1.59,0.0
KB_BBN3,1560,1975,3535,0,18757,3535,44.13,7.68,0.0
SF_CIS1,527,441,968,0,19790,968,54.44,2.59,0.0
KB_NYU5,698,617,1315,0,19619,1315,53.08,3.44,0.0
KB_Stanford2,1148,2816,3964,0,19169,3964,28.96,5.65,0.0
SF_CIS3,1110,2960,4070,0,19207,4070,27.27,5.46,0.0
SF_MSIIPL_THU5,44,575,619,0,20273,619,7.11,0.22,0.0
SF_Stanford1,1046,1017,2063,0,19271,2063,50.7,5.15,0.0
KB_ICTCAS_OKN1,26,43,69,0,20291,69,37.68,0.13,0.0


In [14]:
everything = set()
for x in per_system_output.values(): 
    everything.update(x)
print("Total quantity of system output: ", len(everything))
print("Total quantity of gold output: ", len(gold_labels[True]))
# These are true labels that aren't predicted by any system -- not sure why they exist.
print("Total quantity of gold output not produced by systems: ", len(gold_labels[True].difference(everything)))

Total quantity of system output:  60799
Total quantity of gold output:  20317
Total quantity of gold output not produced by systems:  4387


## How much overlap exists between system outputs?

In [15]:
tbl = []
for run_id, vs in per_system_output.items():
    diff = set(vs)
    for run_id_, vs_ in per_system_output.items():
        if run_id_ != run_id:
            diff.difference_update(vs_)
    tbl.append([run_id, len(diff), len(vs), len(everything)])
tbl = [(run_id, d, s, t, "%.2f"%(100*d/s), "%2.f"%(100*s/t), "%.2f"%(100*d/t)) for (run_id, d, s, t) in tbl]
to_table(tbl, cols="run_id unique system total unique% system% unique_total%".split())

run_id,unique,system,total,unique%,system%,unique_total%
KB_BBN5,727,3154,60799,23.05,5,1.2
SF_BUPT_PRIS2,0,2603,60799,0.0,4,0.0
KB_BBN3,372,3535,60799,10.52,6,0.61
SF_CIS1,0,968,60799,0.0,2,0.0
KB_NYU5,3,1315,60799,0.23,2,0.0
KB_Stanford2,2753,3964,60799,69.45,7,4.53
SF_CIS3,1301,4070,60799,31.97,7,2.14
SF_MSIIPL_THU5,338,619,60799,54.6,1,0.56
SF_Stanford1,119,2063,60799,5.77,3,0.2
KB_ICTCAS_OKN1,31,69,60799,44.93,0,0.05


## Restricted to correct output

In [27]:
def get_system_name(name):
    return name[:-1].split('_', 1)[-1]

In [31]:
tbl = []
for run_id, vs in per_system_true_output.items():
    diff = set(vs)
    for run_id_, vs_ in per_system_true_output.items():
        if get_system_name(run_id_) != get_system_name(run_id): # the systems need to be distinct
            diff.difference_update(vs_)
    tbl.append([run_id, len(diff), len(vs)])
tbl.append(["total", sum(d for _, d, _ in tbl), sum(s for _, _, s in tbl)])
t = len(gold_labels[True])
tbl = [(run_id, d, s, t, "%.2f"%(100*d/s), "%.2f"%(100*s/t), "%.2f"%(100*d/t)) for (run_id, d, s) in tbl]
to_table(sorted(tbl), cols="run_id unique system total unique% system% unique_total%".split())

run_id,unique,system,total,unique%,system%,unique_total%
KB_BBN1,1375,1569,20317,87.64,7.72,6.77
KB_BBN2,1231,1413,20317,87.12,6.95,6.06
KB_BBN3,1371,1560,20317,87.88,7.68,6.75
KB_BBN4,1545,1738,20317,88.9,8.55,7.6
KB_BBN5,1249,1447,20317,86.32,7.12,6.15
KB_ICTCAS_OKN1,26,26,20317,100.0,0.13,0.13
KB_ICTCAS_OKN2,3,3,20317,100.0,0.01,0.01
KB_ICTCAS_OKN3,42,42,20317,100.0,0.21,0.21
KB_ICTCAS_OKN4,5,5,20317,100.0,0.02,0.02
KB_NYU1,736,736,20317,100.0,3.62,3.62


## Aggregating by query id instead of entry point

In [17]:
print("Building index over gold output answers")
gold_labels_by_id = defaultdict(set)
gold_labels_by_id[True].update(k2kk(key) for key in gold_labels[True])
gold_labels_by_id[False].update(k2kk(key) for key in gold_labels[False])

print("Intermediate output")
to_table([(lbl, len(ks)) for lbl, ks in gold_labels_by_id.items()], cols="label count".split())
print("Removing overlaps")
print("C \ !C: {}".format(len(gold_labels_by_id[True].intersection(gold_labels_by_id[False]))))
gold_labels_by_id[False].difference_update(gold_labels_by_id[True])
to_table([(lbl, len(ks)) for lbl, ks in gold_labels_by_id.items()], cols="label count".split())

per_system_true_output_by_id = {}
for run_id, vs in per_system_true_output.items():
    per_system_true_output_by_id[run_id] = set(k2kk(key) for key in vs)

Building index over gold output answers
Intermediate output


label,count
False,10400
True,1415


Removing overlaps
C \ !C: 179


label,count
False,10221
True,1415


In [30]:
tbl = []
for run_id, vs in per_system_true_output_by_id.items():
    diff = set(vs)
    for run_id_, vs_ in per_system_true_output_by_id.items():
        if get_system_name(run_id_) != get_system_name(run_id):
            diff.difference_update(vs_)
    tbl.append([run_id, len(diff), len(vs)])
tbl.append(["total", sum(d for _, d, _ in tbl), sum(s for _, _, s in tbl)])
t = len(gold_labels_by_id[True])
tbl = [(run_id, d, s, t, "%.2f"%(100*d/s), "%.2f"%(100*s/t), "%.2f"%(100*d/t)) for (run_id, d, s) in tbl]
to_table(sorted(tbl), cols="run_id unique system total unique% system% unique_total%".split())

run_id,unique,system,total,unique%,system%,unique_total%
KB_BBN1,77,480,1415,16.04,33.92,5.44
KB_BBN2,60,435,1415,13.79,30.74,4.24
KB_BBN3,82,479,1415,17.12,33.85,5.8
KB_BBN4,90,523,1415,17.21,36.96,6.36
KB_BBN5,70,450,1415,15.56,31.8,4.95
KB_ICTCAS_OKN1,1,12,1415,8.33,0.85,0.07
KB_ICTCAS_OKN2,0,3,1415,0.0,0.21,0.0
KB_ICTCAS_OKN3,1,20,1415,5.0,1.41,0.07
KB_ICTCAS_OKN4,0,5,1415,0.0,0.35,0.0
KB_NYU1,23,267,1415,8.61,18.87,1.63
