# Analyze the annotation results

The script includes three parts:
1.Data Loading:
    Read mTurk format file and collect annotation data, a part of input data

2.Clustering:
    Gather annotation of coreference into clusters

3.Perform Evaluation
    This include Kappa Cohen, etc.



In [1]:
import csv
import json
from copy import deepcopy
csv.field_size_limit(1131072)

131072

In [2]:
from utils.data_util import read_annotation
from utils.data_util import gather_by_annotator
from utils.data_util import gather_by_scene

## Part 1: Data Loading

In [3]:
# Load Kate Annotation
kate = read_annotation('data/kate.csv')
for item in kate:
    item['Turkle.Username'] = "ksanders"

chenyu = read_annotation('data/chenyu.csv')
for item in chenyu:
    item['Turkle.Username'] = "chenyu"

boyuan_patrick = read_annotation('data/results.csv')

results = []
results.extend(kate)
results.extend(boyuan_patrick)
results.extend(chenyu)

scenes = gather_by_scene(results)

In [4]:
print(scenes.keys())

dict_keys([":|Where|'d|you|go|?|I|ca|n't", ':|Oh|,|yeah|,|no|,|this|thing', ':|We|just|wanted|to|see|how|your|class'])


# Evaluation with self-implemented MUC

In [5]:
from conll import muc, b_cubed, blanc

In [7]:
for scene_key in scenes:
    scene = scenes[scene_key]
    print(len(scene[0]['sentences']))
    for anno1 in scene:
        name1 = anno1['Turkle.Username']
        for anno2 in scene:
            name2 = anno2['Turkle.Username']
            print("All", name1, name2, blanc(anno1['clusters_all'], anno2['clusters_all']))
            print("No Plural:", name1, name2, blanc(anno1['clusters_no_plural'], anno2['clusters_no_plural']))

9
All ksanders ksanders (1.0, 1.0, 1.0)
No Plural: ksanders ksanders (1.0, 1.0, 1.0)
All ksanders bzheng (0.9989316239316239, 0.9506944924331422, 0.9741649884794716)
No Plural: ksanders bzheng (0.9989316239316239, 0.9506944924331422, 0.9741649884794716)
All ksanders paxia (0.949023199023199, 0.8920865719646207, 0.9187062937062938)
No Plural: ksanders paxia (0.949023199023199, 0.8920865719646207, 0.9187062937062938)
All ksanders chenyu (0.8354700854700854, 0.6129309287204023, 0.7070401417569686)
No Plural: ksanders chenyu (0.824023199023199, 0.8568524970963995, 0.827549061851228)
All bzheng ksanders (0.9506944924331422, 0.9989316239316239, 0.9741649884794716)
No Plural: bzheng ksanders (0.9506944924331422, 0.9989316239316239, 0.9741649884794716)
All bzheng bzheng (1.0, 1.0, 1.0)
No Plural: bzheng bzheng (1.0, 1.0, 1.0)
All bzheng paxia (0.9334531131227973, 0.9202291610828196, 0.9262035903712771)
No Plural: bzheng paxia (0.9334531131227973, 0.9202291610828196, 0.9262035903712771)
All bzh

In [24]:
sample = scenes[":|Where|'d|you|go|?|I|ca|n't"][3]['answers']

In [26]:
for item in sample:
    print(item)

[(0, 4, 5), [(1, 0, 1)]]
[(0, 7, 8), [(0, 0, 1)]]
[(0, 12, 14), 'notPresent']
[(3, 2, 3), [(3, 0, 1)]]
[(3, 6, 8), 'notPresent']
[(3, 9, 10), [(3, 0, 1)]]
[(3, 13, 14), 'notPresent']
[(3, 16, 17), [(2, 0, 1)]]
[(3, 19, 20), [(3, 0, 1)]]
[(3, 24, 25), [(2, 0, 1)]]
[(3, 26, 29), 'notPresent']
[(3, 30, 31), [(3, 0, 1)]]
[(3, 32, 33), 'notPresent']
[(4, 2, 3), [(4, 0, 1)]]
[(4, 6, 7), [(3, 28, 29), (3, 27, 28), (3, 26, 27)]]
[(4, 8, 10), 'notPresent']
[(4, 11, 12), [(4, 0, 1)]]
[(4, 15, 16), 'notPresent']
[(4, 18, 19), [(3, 27, 28), (3, 28, 29), (3, 26, 27)]]
[(4, 20, 21), [(4, 0, 1)]]
[(4, 26, 27), [(3, 27, 28), (3, 28, 29), (3, 26, 27)]]
[(5, 5, 6), 'notPresent']
[(5, 8, 9), [(3, 27, 28), (3, 28, 29), (3, 26, 27)]]
[(5, 12, 15), 'notPresent']
[(5, 16, 17), [(3, 27, 28), (3, 28, 29), (3, 26, 27)]]
[(7, 2, 3), 'notPresent']
[(7, 4, 5), [(3, 27, 28), (3, 28, 29), (3, 26, 27)]]
[(7, 9, 10), 'notPresent']
[(7, 11, 12), 'notPresent']
[(7, 13, 16), 'notMention']
[(7, 17, 18), [(3, 28, 29), (3, 

In [105]:
for item in cluster:
    for x in item:
        print(x)
    print()

('0_4_5',)
('1_0_1',)

('0_7_8',)
('0_0_1',)

('0_12_14',)

('3_2_3',)
('3_0_1',)
('3_9_10',)
('3_0_1',)
('3_19_20',)
('3_0_1',)
('3_30_31',)
('3_0_1',)

('3_6_8',)

('3_13_14',)

('3_16_17',)
('2_0_1',)
('3_24_25',)
('2_0_1',)

('3_26_29',)

('3_32_33',)

('4_2_3',)
('4_0_1',)
('4_11_12',)
('4_0_1',)
('4_20_21',)
('4_0_1',)

('4_6_7',)
('3_28_29', '3_27_28', '3_26_27')
('7_17_18',)
('3_28_29', '3_27_28', '3_26_27')

('4_8_10',)

('4_15_16',)

('4_18_19',)
('3_27_28', '3_28_29', '3_26_27')
('4_26_27',)
('3_27_28', '3_28_29', '3_26_27')
('5_8_9',)
('3_27_28', '3_28_29', '3_26_27')
('5_16_17',)
('3_27_28', '3_28_29', '3_26_27')
('7_4_5',)
('3_27_28', '3_28_29', '3_26_27')

('5_5_6',)

('5_12_15',)

('7_2_3',)

('7_9_10',)

('7_11_12',)



In [106]:
cluster1 = generate_clusters(scenes[":|Where|'d|you|go|?|I|ca|n't"][0])
cluster2 = generate_clusters(scenes[":|Where|'d|you|go|?|I|ca|n't"][2])

In [107]:
print(cluster1)
print(cluster2)

[[('0_4_5',)], [('0_7_8',), ('0_0_1',)], [('0_12_14',)], [('3_2_3',), ('3_0_1',), ('3_9_10',), ('3_2_3',), ('3_19_20',), ('3_9_10',), ('3_30_31',), ('3_9_10',)], [('3_6_8',), ('3_13_14',), ('3_6_8',)], [('3_16_17',), ('2_0_1',), ('3_24_25',), ('3_16_17',)], [('3_26_29',), ('4_6_7',), ('3_26_29',), ('4_18_19',), ('4_6_7',), ('4_26_27',), ('4_18_19',)], [('3_32_33',)], [('4_2_3',), ('4_0_1',), ('4_11_12',), ('4_2_3',), ('4_20_21',), ('4_0_1',)], [('4_8_10',)], [('4_15_16',), ('5_5_6',), ('4_15_16',)], [('5_16_17',)], [('7_4_5',)], [('7_9_10',)], [('7_11_12',)], [('7_13_16',)], [('7_17_18',)]]
[[('0_4_5',)], [('0_7_8',), ('0_0_1',)], [('0_12_14',)], [('3_2_3',), ('3_0_1',), ('3_9_10',), ('3_0_1',), ('3_19_20',), ('3_0_1',), ('3_30_31',), ('3_0_1',)], [('3_6_8',)], [('3_13_14',)], [('3_16_17',), ('2_0_1',), ('3_24_25',), ('2_0_1',)], [('3_26_29',), ('4_6_7',), ('3_26_29',), ('4_18_19',), ('3_26_29',), ('4_26_27',), ('3_26_29',)], [('3_32_33',)], [('4_2_3',), ('4_0_1',), ('4_11_12',), ('4_0

In [109]:
print(muc(cluster1, cluster2))

(0.9615384615384616, 0.9615384615384616, 0.9615384615384615)


In [14]:
for scene_key in scenes:
    scene = scenes[scene_key]
    for sample in scene:
        for item in sample['answer_spans']:
            answers = []
            for answer in item['span_list']:
                answers.append((answer['sentenceIndex'], answer['startToken'], answer['endToken']))
            print(answers)
        print("=="*50)

[(-1, -1, -1)]
[(0, 0, 1)]
[(-1, -1, -1)]
[(3, 0, 1)]
[(-1, -1, -1)]
[(3, 2, 3)]
[(3, 6, 8)]
[(2, 0, 1)]
[(3, 9, 10)]
[(3, 16, 17)]
[(-1, -1, -1)]
[(3, 9, 10)]
[(-1, -1, -1)]
[(4, 0, 1)]
[(3, 26, 29)]
[(-1, -1, -1)]
[(4, 2, 3)]
[(-1, -1, -1)]
[(4, 6, 7)]
[(4, 0, 1)]
[(4, 18, 19)]
[(4, 15, 16)]
[(-1, -1, -1)]
[(-1, -1, -1)]
[(-1, -1, -1)]
[(-1, -1, -1)]
[(-1, -1, -1)]
[(-1, -1, -1)]
[(-1, -1, -1)]
[(-1, -1, -1)]
[(-1, -1, -1)]
[(-1, -1, -1)]
[(0, 0, 1)]
[(-1, -1, -1)]
[(3, 0, 1)]
[(-1, -1, -1)]
[(3, 0, 1)]
[(3, 6, 8)]
[(2, 0, 1)]
[(3, 0, 1)]
[(2, 0, 1)]
[(-1, -1, -1)]
[(3, 0, 1)]
[(-1, -1, -1)]
[(4, 0, 1)]
[(3, 26, 29)]
[(3, 32, 33)]
[(4, 0, 1)]
[(-1, -1, -1)]
[(3, 26, 29)]
[(4, 0, 1)]
[(3, 26, 29)]
[(4, 15, 16)]
[(-1, -1, -1)]
[(-1, -1, -1)]
[(-1, -1, -1)]
[(-1, -1, -1)]
[(-1, -1, -1)]
[(-1, -1, -1)]
[(-1, -1, -1)]
[(-1, -1, -1)]
[(-1, -1, -1)]
[(-1, -1, -1)]
[(0, 0, 1)]
[(-1, -1, -1)]
[(3, 0, 1)]
[(-1, -1, -1)]
[(3, 0, 1)]
[(-1, -1, -1)]
[(2, 0, 1)]
[(3, 0, 1)]
[(2, 0, 1)]
[(-1, -1, -

In [7]:
sample = scenes[":|Where|'d|you|go|?|I|ca|n't"][0]['answer_spans']

In [10]:
for item in sample:
    print(item['span_list'])

[{'sentenceIndex': -1, 'startToken': -1, 'endToken': -1}]
[{'sentenceIndex': 0, 'startToken': 0, 'endToken': 1}]
[{'sentenceIndex': -1, 'startToken': -1, 'endToken': -1}]
[{'sentenceIndex': 3, 'startToken': 0, 'endToken': 1}]
[{'sentenceIndex': -1, 'startToken': -1, 'endToken': -1}]
[{'sentenceIndex': 3, 'startToken': 2, 'endToken': 3}]
[{'sentenceIndex': 3, 'startToken': 6, 'endToken': 8}]
[{'sentenceIndex': 2, 'startToken': 0, 'endToken': 1}]
[{'sentenceIndex': 3, 'startToken': 9, 'endToken': 10}]
[{'sentenceIndex': 3, 'startToken': 16, 'endToken': 17}]
[{'sentenceIndex': -1, 'startToken': -1, 'endToken': -1}]
[{'sentenceIndex': 3, 'startToken': 9, 'endToken': 10}]
[{'sentenceIndex': -1, 'startToken': -1, 'endToken': -1}]
[{'sentenceIndex': 4, 'startToken': 0, 'endToken': 1}]
[{'sentenceIndex': 3, 'startToken': 26, 'endToken': 29}]
[{'sentenceIndex': -1, 'startToken': -1, 'endToken': -1}]
[{'sentenceIndex': 4, 'startToken': 2, 'endToken': 3}]
[{'sentenceIndex': -1, 'startToken': -1, 

In [24]:
for item in cluster2:
    for x in item:
        print(x)

(13, 12, 13)
(7, 0, 1)
(7, 6, 7)
(11, 0, 1)
(0, 2, 3)
(13, 4, 5)
(11, 6, 7)
(0, 0, 1)
(0, 8, 10)
(0, 8, 9)
(0, 15, 16)
(1, 4, 6)
(1, 0, 1)
(2, 3, 4)
(2, 5, 6)
(2, 7, 11)
(3, 2, 4)
(4, 4, 5)
(4, 7, 9)
(4, 7, 8)
(5, 2, 3)
(5, 4, 7)
(5, 9, 10)
(5, 14, 15)
(5, 0, 1)
(5, 16, 18)
(5, 22, 23)
(8, 4, 5)
(8, 0, 1)
(9, 4, 5)
(8, 9, 10)
(8, 12, 15)
(8, 13, 14)
(8, 16, 18)
(9, 6, 7)
(9, 8, 9)
(10, 6, 8)
(10, 23, 24)
(10, 9, 11)
(10, 11, 12)
(10, 15, 16)
(10, 18, 21)
(10, 28, 29)
(10, 30, 32)
(10, 34, 35)
(10, 56, 57)
(10, 0, 1)
(10, 39, 40)
(10, 40, 41)
(10, 44, 46)
(10, 49, 51)
(10, 52, 53)
(10, 59, 61)
(11, 8, 10)
(11, 8, 9)
(13, 6, 7)
(13, 14, 15)
(15, 0, 1)
(15, 2, 3)
(16, 25, 26)
(15, 7, 9)
(15, 7, 8)
(15, 11, 12)
(12, 0, 1)
(16, 0, 1)
(16, 6, 7)
(16, 21, 22)
(16, 31, 32)
(16, 8, 10)
(16, 12, 13)
(16, 14, 20)
(16, 28, 31)
(18, 0, 1)
(18, 4, 5)
(18, 12, 14)
(18, 12, 13)
(20, 0, 1)
(21, 4, 5)
(21, 6, 9)
(21, 10, 13)
(21, 14, 17)
(23, 3, 4)
(22, 0, 1)
(22, 2, 3)
(22, 15, 16)
(22, 5, 9)
(22, 12, 

In [12]:
from conll import muc, b_cubed, blanc

In [10]:
muc(cluster1, cluster2)

(1.0, 0.9375, 0.967741935483871)

In [11]:
b_cubed(cluster1, cluster2)

(1.0, 0.9393939393939394, 0.96875)

In [13]:
blanc(cluster1, cluster2)

(0.9989316239316239, 0.9506944924331422, 0.9741649884794716)

## Part 2: Evaluation

This part consists of: 1.Mention Recall  2.Kappa Cohen  3.MUC

In [6]:
from utils.evaluation_util import kappa
from utils.evaluation_util import muc
from utils.evaluation_util import phi4
from utils.evaluation_util import b_cubed
from utils.evaluation_util import exact_match
from utils.evaluation_util import ceafe

In [7]:
results = []
results.extend(read_annotation('data/results.csv'))
scenes = gather_by_scene(results)

[(0, 4, 5)]
[(0, 7, 8), (0, 0, 1)]
[(0, 12, 14)]
[(3, 2, 3), (3, 0, 1)]
[(3, 6, 8)]
[(3, 9, 10), (3, 0, 1)]
[(3, 13, 14), (3, 6, 8)]
[(3, 16, 17), (2, 0, 1)]
[(3, 19, 20), (3, 0, 1)]
[(3, 24, 25), (2, 0, 1)]
[(3, 26, 29)]
[(3, 30, 31), (3, 0, 1)]
[(3, 32, 33)]
[(4, 2, 3), (4, 0, 1)]
[(4, 6, 7), (3, 26, 29)]
[(4, 8, 10), (3, 32, 33)]
[(4, 11, 12), (4, 0, 1)]
[(4, 15, 16)]
[(4, 18, 19), (3, 26, 29)]
[(4, 20, 21), (4, 0, 1)]
[(4, 26, 27), (3, 26, 29)]
[(5, 5, 6), (4, 15, 16)]
[(5, 16, 17)]
[(7, 2, 3)]
[(7, 4, 5)]
[(7, 9, 10)]
[(7, 11, 12)]
[(7, 13, 16)]
[(7, 17, 18)]
[(0, 4, 5)]
[(0, 7, 8), (0, 0, 1)]
[(0, 12, 14)]
[(3, 2, 3), (3, 0, 1)]
[(3, 6, 8)]
[(3, 9, 10), (3, 0, 1)]
[(3, 13, 14)]
[(3, 16, 17), (2, 0, 1)]
[(3, 19, 20), (3, 0, 1)]
[(3, 24, 25), (2, 0, 1)]
[(3, 26, 29)]
[(3, 30, 31), (3, 0, 1)]
[(3, 32, 33)]
[(4, 2, 3), (4, 0, 1)]
[(4, 6, 7), (3, 26, 29)]
[(4, 8, 10)]
[(4, 11, 12), (4, 0, 1)]
[(4, 15, 16)]
[(4, 18, 19), (3, 26, 29)]
[(4, 20, 21), (4, 0, 1)]
[(4, 26, 27), (3, 26, 29)]


In [8]:
for item in scenes:
    sample1, sample2 = scenes[item][0], scenes[item][1]
    print(len(sample1['sentences']), "Kappa:", kappa(sample1, sample2), "|", "Exact Match:",exact_match(sample1, sample2) , "|", "MUC:", muc(sample1, sample2), "|", "CEAFE:", ceafe(sample1, sample2), "|", "B Cubed:", b_cubed(sample1, sample2))

9 Kappa: 0.7819971870604782 | Exact Match: 0.8387096774193549 | MUC: (0, 28) | CEAFE: (7.0, 8, 7.0, 19) | B Cubed: (0.0, 36)
11 Kappa: 0.9090909090909091 | Exact Match: 0.9333333333333333 | MUC: (0, 19) | CEAFE: (4.0, 4, 4.0, 10) | B Cubed: (0.0, 23)
31 Kappa: 0.6983758700696056 | Exact Match: 0.7523809523809524 | MUC: (0, 99) | CEAFE: (24.734523809523807, 30, 24.734523809523807, 65) | B Cubed: (0.0, 129)


In [10]:
from statsmodels.stats.inter_rater import fleiss_kappa
from statsmodels.stats.inter_rater import cohens_kappa

In [11]:
# Load Kate Annotation
kate = read_annotation('data/kate.csv')
for item in kate:
    item['Turkle.Username'] = "ksanders"

chenyu = read_annotation('data/chenyu.csv')
for item in chenyu:
    item['Turkle.Username'] = "chenyu"

boyuan_patrick = read_annotation('data/results.csv')

results = []
results.extend(kate)
results.extend(boyuan_patrick)
results.extend(chenyu)

scenes = gather_by_scene(results)

[(0, 4, 5)]
[(0, 7, 8), (0, 0, 1)]
[(0, 12, 14)]
[(3, 2, 3), (3, 0, 1)]
[(3, 6, 8)]
[(3, 9, 10), (3, 2, 3)]
[(3, 13, 14), (3, 6, 8)]
[(3, 16, 17), (2, 0, 1)]
[(3, 19, 20), (3, 9, 10)]
[(3, 24, 25), (3, 16, 17)]
[(3, 26, 29)]
[(3, 30, 31), (3, 9, 10)]
[(3, 32, 33)]
[(4, 2, 3), (4, 0, 1)]
[(4, 6, 7), (3, 26, 29)]
[(4, 8, 10)]
[(4, 11, 12), (4, 2, 3)]
[(4, 15, 16)]
[(4, 18, 19), (4, 6, 7)]
[(4, 20, 21), (4, 0, 1)]
[(4, 26, 27), (4, 18, 19)]
[(5, 5, 6), (4, 15, 16)]
[(5, 16, 17)]
[(7, 4, 5)]
[(7, 9, 10)]
[(7, 11, 12)]
[(7, 13, 16)]
[(7, 17, 18)]
[(0, 8, 10)]
[(1, 9, 10), (1, 0, 1)]
[(1, 12, 14)]
[(2, 3, 4), (1, 9, 10)]
[(4, 5, 6), (3, 0, 1)]
[(4, 9, 10)]
[(6, 2, 5), (5, 5, 7)]
[(7, 2, 3), (6, 2, 5), (5, 0, 1)]
[(7, 5, 6)]
[(8, 2, 4), (7, 2, 3)]
[(8, 13, 17)]
[(9, 2, 3), (8, 2, 4)]
[(9, 7, 8), (9, 2, 3)]
[(0, 2, 3), (0, 0, 1)]
[(0, 8, 10)]
[(0, 8, 9)]
[(0, 15, 16)]
[(1, 4, 6), (0, 8, 10)]
[(2, 3, 4), (1, 0, 1)]
[(2, 5, 6), (0, 15, 16)]
[(2, 7, 11)]
[(3, 2, 4)]
[(4, 7, 9)]
[(4, 7, 8), (5, 0,

In [12]:
def label_mention_to_cluster(instance, golden_clusters):
    """
    Assign cluster id to mention query
    If answer is notMention, label==-1
    If answer is singleton, label==-2
    Else, label==cluster_id
    """
    mentions = instance['answers']
    labels = []
    for query, answer in mentions:
        # -1: notMention, -2: Singleton, index: cluster_id
        # If it is not mention
        if answer == "notMention":
            labels.append(-1)
        else:
            idx = -2
            for i, cluster in enumerate(golden_clusters):
                if (answer[0] in cluster) and (len(cluster) != 1):
                    idx = i
            labels.append(idx)
    return labels

In [89]:
def muc_temp(instance1, instance2):
    """
    Counts the mentions in each predicted cluster which need to be re-allocated in
    order for each predicted cluster to be contained by the respective gold cluster.
    <https://aclweb.org/anthology/M/M95/M95-1005.pdf>
    """
    clusters = []
    for cluster in instance1['clusters']:
        if len(cluster)>1:
            clusters.append(cluster)
    # clusters = instance1['clusters']
    mention_to_gold = {}
    for item in instance2['answers']:
        if item[1] not in ["notPresent", "notMention"]:
            mention_to_gold[item[0]] = item[1][0]
    for item in clusters:
        print(item)
    # print(clusters)
    # print(mention_to_gold)
    true_p, all_p = 0, 0
    for cluster in clusters:
        all_p += len(cluster) - 1
        true_p += len(cluster)
        linked = set()
        for mention in cluster:
            if mention in mention_to_gold:
                linked.add(mention_to_gold[mention])
            else:
                true_p -= 1
        true_p -= len(linked)
    return true_p, all_p

In [90]:
print(scenes.keys())

dict_keys([":|Where|'d|you|go|?|I|ca|n't", ':|Oh|,|yeah|,|no|,|this|thing', ':|We|just|wanted|to|see|how|your|class'])


In [91]:
scene = scenes[":|We|just|wanted|to|see|how|your|class"]
score = muc_temp(scene[0], scene[2])
print(score, score[0]/score[1])

[(0, 2, 3), (0, 0, 1)]
[(0, 8, 10), (1, 4, 6), (0, 8, 10)]
[(0, 15, 16), (2, 5, 6), (0, 15, 16)]
[(2, 3, 4), (1, 0, 1)]
[(4, 7, 8), (5, 0, 1), (5, 2, 3), (5, 0, 1), (5, 14, 15), (5, 2, 3)]
[(7, 6, 7), (7, 0, 1)]
[(8, 9, 10), (8, 0, 1), (8, 13, 14), (8, 9, 10), (9, 4, 5), (8, 13, 14)]
[(9, 8, 9), (9, 0, 1), (6, 0, 1), (2, 0, 1), (9, 8, 9), (9, 0, 1), (6, 0, 1), (2, 0, 1), (9, 8, 9), (9, 0, 1), (6, 0, 1), (2, 0, 1), (9, 8, 9), (9, 0, 1), (6, 0, 1), (2, 0, 1)]
[(10, 6, 8), (10, 44, 46), (10, 6, 8)]
[(10, 9, 11), (10, 23, 24), (10, 9, 11)]
[(10, 28, 29), (10, 39, 40), (10, 28, 29), (10, 40, 41), (10, 39, 40)]
[(10, 34, 35), (10, 0, 1), (10, 56, 57), (10, 34, 35), (11, 8, 9), (10, 56, 57)]
[(10, 59, 61), (10, 45, 46)]
[(11, 6, 7), (11, 0, 1), (13, 4, 5), (11, 6, 7), (13, 12, 13), (11, 6, 7)]
[(11, 8, 10), (1, 5, 6)]
[(15, 2, 3), (15, 0, 1), (15, 7, 8), (15, 2, 3), (16, 25, 26), (15, 0, 1)]
[(15, 11, 12), (12, 0, 1)]
[(16, 6, 7), (16, 0, 1), (16, 21, 22), (16, 6, 7), (16, 31, 32), (16, 21, 2

In [17]:
scene = scenes[":|Where|'d|you|go|?|I|ca|n't"]

golden_clusters = scene[0]['clusters']
label1 = label_mention_to_cluster(scene[0], golden_clusters)
label2 = label_mention_to_cluster(scene[1], golden_clusters)
label3 = label_mention_to_cluster(scene[2], golden_clusters)
label4 = label_mention_to_cluster(scene[3], golden_clusters)

In [29]:
temp = {}
for item in temp:
    print(item, temp[item])

for item in scene[0]['answers']:
    print(item[0], item[1])

(0, 4, 5) notPresent
(0, 7, 8) [(0, 0, 1)]
(0, 12, 14) notPresent
(3, 2, 3) [(3, 0, 1)]
(3, 6, 8) notPresent
(3, 9, 10) [(3, 2, 3)]
(3, 13, 14) [(3, 6, 8)]
(3, 16, 17) [(2, 0, 1)]
(3, 19, 20) [(3, 9, 10)]
(3, 24, 25) [(3, 16, 17)]
(3, 26, 29) notPresent
(3, 30, 31) [(3, 9, 10)]
(3, 32, 33) notPresent
(4, 2, 3) [(4, 0, 1)]
(4, 6, 7) [(3, 26, 29)]
(4, 8, 10) notPresent
(4, 11, 12) [(4, 2, 3)]
(4, 15, 16) notPresent
(4, 18, 19) [(4, 6, 7)]
(4, 20, 21) [(4, 0, 1)]
(4, 26, 27) [(4, 18, 19)]
(5, 5, 6) [(4, 15, 16)]
(5, 8, 9) notMention
(5, 12, 15) notMention
(5, 16, 17) notPresent
(7, 2, 3) notMention
(7, 4, 5) notPresent
(7, 9, 10) notPresent
(7, 11, 12) notPresent
(7, 13, 16) notPresent
(7, 17, 18) notPresent


In [26]:
print(scene[0]['answers'])

[[(0, 4, 5), 'notPresent'], [(0, 7, 8), [(0, 0, 1)]], [(0, 12, 14), 'notPresent'], [(3, 2, 3), [(3, 0, 1)]], [(3, 6, 8), 'notPresent'], [(3, 9, 10), [(3, 2, 3)]], [(3, 13, 14), [(3, 6, 8)]], [(3, 16, 17), [(2, 0, 1)]], [(3, 19, 20), [(3, 9, 10)]], [(3, 24, 25), [(3, 16, 17)]], [(3, 26, 29), 'notPresent'], [(3, 30, 31), [(3, 9, 10)]], [(3, 32, 33), 'notPresent'], [(4, 2, 3), [(4, 0, 1)]], [(4, 6, 7), [(3, 26, 29)]], [(4, 8, 10), 'notPresent'], [(4, 11, 12), [(4, 2, 3)]], [(4, 15, 16), 'notPresent'], [(4, 18, 19), [(4, 6, 7)]], [(4, 20, 21), [(4, 0, 1)]], [(4, 26, 27), [(4, 18, 19)]], [(5, 5, 6), [(4, 15, 16)]], [(5, 8, 9), 'notMention'], [(5, 12, 15), 'notMention'], [(5, 16, 17), 'notPresent'], [(7, 2, 3), 'notMention'], [(7, 4, 5), 'notPresent'], [(7, 9, 10), 'notPresent'], [(7, 11, 12), 'notPresent'], [(7, 13, 16), 'notPresent'], [(7, 17, 18), 'notPresent']]


In [25]:
for item in scene[0]:
    print(item)

HITId
HITTypeId
Title
CreationTime
MaxAssignments
AssignmentDurationInSeconds
AssignmentId
WorkerId
AcceptTime
SubmitTime
WorkTimeInSeconds
sentences
answer_spans
Turkle.Username
clusters
answers


In [18]:
print(label1)
print(label2)
print(label3)
print(label4)

[-2, 1, -2, 3, -2, 3, 4, 5, 3, 5, -2, 3, -2, 8, 6, -2, 8, -2, 6, 8, 6, 10, -1, -1, -2, -1, -2, -2, -2, -2, -2]
[-2, 1, -2, 3, -2, 3, 4, 5, 3, 5, -2, 3, -2, 8, 6, -2, 8, -2, 6, 8, 6, 10, -1, -1, -2, -2, -2, -2, -2, -2, -2]
[-2, 1, -2, 3, -2, 3, -2, 5, 3, 5, -2, 3, -2, 8, 6, -2, 8, -2, 6, 8, 6, 10, -2, -2, -2, -2, -2, -1, -2, -2, -2]
[-2, 1, -2, 3, -2, 3, -2, 5, 3, 5, -2, 3, -2, 8, -2, -2, 8, -2, -2, 8, -2, -2, -2, -2, -2, -2, -2, -2, -2, -1, -2]


In [15]:
from utils.b_score import calc_b3

In [93]:
# print(calc_b3(label1, label2))
# print(calc_b3(label1, label3))
print(calc_b3(label1, label4))

[0.7468516826947749, 0.6193548387096774, 0.9404466501240695]


In [21]:
table = [label1, label2]
print(fleiss_kappa(table))

AssertionError: 

In [11]:
print(len(scenes[":|Where|'d|you|go|?|I|ca|n't"]))

4


In [145]:
import csv
import json
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import accuracy_score
from scipy.optimize import linear_sum_assignment
import numpy as np
from typing import Any, Dict, List, Tuple
from collections import Counter


def mention_detection_label(instance):
    """
    Assign cluster id to mention query
    If answer is notMention, label==-1
    If answer is singleton, label==-2
    Else, label==cluster_id
    """
    mentions = instance['answers']
    labels = []
    for query, answer in mentions:
        # -1: notMention, -2: Singleton, index: cluster_id
        # If it is not mention
        if answer == "notMention":
            labels.append(1)
        else:
            labels.append(2)
    return labels


def kappa_mention_detection(instance1, instance2):
    label1 = mention_detection_label(instance1)
    label2 = mention_detection_label(instance2)
    return cohen_kappa_score(label1, label2)

def exact_match_mention_detection(instance1, instance2):
    label1 = mention_detection_label(instance1)
    label2 = mention_detection_label(instance2)
    return accuracy_score(label1, label2)

In [146]:
for scene_key in scenes:
    scene = scenes[scene_key]
    for sample1 in scene:
        for sample2 in scene:
            print(sample1['Turkle.Username'], sample2['Turkle.Username'], len(sample1['sentences']), "Kappa:", kappa_mention_detection(sample1, sample2), "|", "Exact:", exact_match_mention_detection(sample1, sample2))
    print("=="*50)

ksanders ksanders 9 Kappa: 1.0 | Exact: 1.0
ksanders bzheng 9 Kappa: 0.7832167832167832 | Exact: 0.967741935483871
ksanders paxia 9 Kappa: -0.05084745762711851 | Exact: 0.8709677419354839
ksanders chenyu 9 Kappa: -0.05084745762711851 | Exact: 0.8709677419354839
bzheng ksanders 9 Kappa: 0.7832167832167832 | Exact: 0.967741935483871
bzheng bzheng 9 Kappa: 1.0 | Exact: 1.0
bzheng paxia 9 Kappa: -0.04494382022471899 | Exact: 0.9032258064516129
bzheng chenyu 9 Kappa: -0.04494382022471899 | Exact: 0.9032258064516129
paxia ksanders 9 Kappa: -0.05084745762711851 | Exact: 0.8709677419354839
paxia bzheng 9 Kappa: -0.04494382022471899 | Exact: 0.9032258064516129
paxia paxia 9 Kappa: 1.0 | Exact: 1.0
paxia chenyu 9 Kappa: -0.033333333333333215 | Exact: 0.9354838709677419
chenyu ksanders 9 Kappa: -0.05084745762711851 | Exact: 0.8709677419354839
chenyu bzheng 9 Kappa: -0.04494382022471899 | Exact: 0.9032258064516129
chenyu paxia 9 Kappa: -0.033333333333333215 | Exact: 0.9354838709677419
chenyu cheny

  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)


In [None]:
name = ['ksanders', 'bzheng', 'paxia', 'chenyu']

for scene_key in scenes:
    scene = scenes[scene_key]
    for sample1 in scene:
        for sample2 in scene:
            print(sample1['Turkle.Username'], sample2['Turkle.Username'], len(sample1['sentences']), "Kappa:", kappa(sample1, sample2), "|", "Exact Match:", exact_match(sample1, sample2))
    print("=="*50)

In [137]:
name = ['ksanders', 'bzheng', 'paxia', 'chenyu']

for scene_key in scenes:
    scene = scenes[scene_key]
    for sample1 in scene:
        for sample2 in scene:
            print(sample1['Turkle.Username'], sample2['Turkle.Username'], len(sample1['sentences']), len(sample1['answers']), "Kappa:", kappa(sample1, sample2), "|", "Exact Match:",exact_match(sample1, sample2))
    print("=="*50)

ksanders ksanders 9 31 Kappa: 1.0 | Exact Match: 1.0
ksanders bzheng 9 31 Kappa: 0.9576502732240437 | Exact Match: 0.967741935483871
ksanders paxia 9 31 Kappa: 0.7816901408450704 | Exact Match: 0.8387096774193549
ksanders chenyu 9 31 Kappa: 0.5823353293413174 | Exact Match: 0.7096774193548387
bzheng ksanders 9 31 Kappa: 0.9167785234899328 | Exact Match: 0.9354838709677419
bzheng bzheng 9 31 Kappa: 1.0 | Exact Match: 1.0
bzheng paxia 9 31 Kappa: 0.7819971870604782 | Exact Match: 0.8387096774193549
bzheng chenyu 9 31 Kappa: 0.5829596412556053 | Exact Match: 0.7096774193548387
paxia ksanders 9 31 Kappa: 0.7810734463276836 | Exact Match: 0.8387096774193549
paxia bzheng 9 31 Kappa: 0.8213256484149856 | Exact Match: 0.8709677419354839
paxia paxia 9 31 Kappa: 1.0 | Exact Match: 1.0
paxia chenyu 9 31 Kappa: 0.6555555555555554 | Exact Match: 0.7741935483870968
chenyu ksanders 9 31 Kappa: 0.735494880546075 | Exact Match: 0.8387096774193549
chenyu bzheng 9 31 Kappa: 0.7816901408450704 | Exact Mat