In [11]:
import json
import matplotlib.pyplot as plt
import operator

In [18]:
def get_mentions_of_cluster(dataset, cluster_id):
    mentions = []
    for mention in dataset:
        if mention['coref_chain'] == cluster_id:
            mentions.append(mention)

    return mentions


def get_all_chains(mentions):
    clusters = {}
    for mention_dic in mentions:
        chain = mention_dic['coref_chain']
        clusters[chain] = [] if chain not in clusters else clusters[chain]
        clusters[chain].append(mention_dic)

    return clusters


def get_cluster_by_mention_num(clusters, num):
    clusters_names = []
    for cluster, doc_mention in clusters.items():
        num_of_mentions = len(doc_mention)
        if num_of_mentions == num:
            clusters_names.append(cluster)

    return clusters_names


def get_gold_within_doc(mentions):
    wd_cluster = {}
    for mention in mentions:
        chain = mention['coref_chain']
        doc = mention['doc_id']
        id_within_doc = chain + '_' + doc
        wd_cluster[id_within_doc] = [] if id_within_doc not in wd_cluster else wd_cluster[id_within_doc]
        wd_cluster[id_within_doc].append(mention)

    return wd_cluster



def get_metainfo(clusters):
    """
    print num of mentions per clusters
    :param clusters:
    :return:
    """
    dic = {}
    for cluster, doc_mention in clusters.items():
        num_of_mentions = len(doc_mention)
        dic[num_of_mentions] = dic.get(num_of_mentions, 0) + 1

    for length, num_of_clusters in sorted(dic.items()):
        print("There are {} clusters with {} mentions".format(num_of_clusters, length))

    number = dic.values()
    labels = dic.keys()

    #get_pie_chart(number, labels)

def extract_mention_text(cluster):
    mentions = []
    for mention in cluster:
        mention.append(mention['MENTION_TEXT'])
    return mentions


def get_pie_chart(values, labels):
    patches, texts = plt.pie(values, shadow=True, startangle=90)
    plt.legend(patches, labels, loc="best")
    plt.axis('equal')
    plt.show()


def within_to_cross(within_doc_cluster):
    cross_doc = {}
    for within in within_doc_cluster:
        name = within.split('_')[0]
        if name != 'INTRA' and name != 'Singleton':
            cross_doc[name] = [] if name not in cross_doc else cross_doc[name]
            cross_doc[name].append(within)

    return cross_doc


def find_most_popular_word(clusters, within_doc_cluster):
    words = {}
    for cluster in clusters:
        mentions = within_doc_cluster[cluster]
        vocab = set()
        for mention in mentions:
            text = mention['MENTION_TEXT']
            vocab.add(text)

        for word in vocab:
            words[word] = words.get(word, 0) + 1

    most_word = max(words.items(), key=operator.itemgetter(1))
    return most_word[0], most_word[1]/len(clusters)




def get_prob(within_doc_cluster):
    cross_doc = within_to_cross(within_doc_cluster)
    length = 0
    prob = 0
    for cluster, within in cross_doc.items():
        word, coverage = find_most_popular_word(within, within_doc_cluster)
        length += len(within)
        prob += coverage * len(within)

    return prob / length



Select data to explore (ECB+ or MEANTIME)

In [86]:
data = 'ecb_data'

In [87]:
with open(data + '/all_entity_gold_mentions.json', 'r') as f:
    entity_mentions = json.load(f)

with open(data + '/all_event_gold_mentions.json', 'r') as f:
    event_mentions = json.load(f)
    
print('{} entity mentions'.format(len(entity_mentions)))
print('{} event mentions'.format(len(event_mentions)))

8289 entity mentions
6833 event mentions


In [88]:
event_cross_clusters = get_all_chains(event_mentions)
event_within_clusters = get_gold_within_doc(event_mentions)
entity_cross_clusters = get_all_chains(entity_mentions)
entity_within_clusters = get_gold_within_doc(entity_mentions)
print('Number of entity chains: {}'.format(len(entity_cross_clusters)))
print('Number of event chains: {}'.format(len(event_cross_clusters)))

Number of entity chains: 2221
Number of event chains: 2741


In [89]:
entity_singleton = sum([1 for m in entity_mentions if m["coref_chain"].startswith('Singleton')])
event_singleton =  sum([1 for m in event_mentions if m["coref_chain"].startswith('Singleton')])
print('Number of entity singleton: {}'.format(entity_singleton))
print('Number of event singleton: {}'.format(event_singleton))

Number of entity singleton: 1231
Number of event singleton: 1775


In [90]:
print('Event Cross chains: {}'.format(len(event_cross_clusters)))
print('Event Within chains: {}'.format(len(event_within_clusters)))

print('Entity Cross chains: {}'.format(len(entity_cross_clusters)))
print('Entity Within chains: {}'.format(len(entity_within_clusters)))

Event Cross chains: 2741
Event Within chains: 5496
Entity Cross chains: 2221
Entity Within chains: 5850


In [91]:
get_metainfo(event_within_clusters)

There are 4460 clusters with 1 mentions
There are 807 clusters with 2 mentions
There are 179 clusters with 3 mentions
There are 37 clusters with 4 mentions
There are 7 clusters with 5 mentions
There are 4 clusters with 6 mentions
There are 1 clusters with 7 mentions
There are 1 clusters with 8 mentions


In [92]:
get_cluster_by_mention_num(event_within_clusters, 4)

['ACT17239767913146830_32_10ecbplus.xml',
 'ACT17239767913146830_32_11ecbplus.xml',
 'ACT27398642960429027_33_3ecb.xml',
 'ACT16632565623246075_27_1ecb.xml',
 'ACT16869980928391781_7_5ecbplus.xml',
 'ACT16762501657378707_28_11ecb.xml',
 'ACT15986681471021312_1_1ecbplus.xml',
 'ACT16236402809085484_1_7ecb.xml',
 'ACT17050876360621871_31_5ecb.xml',
 'ACT17042418372710956_30_13ecbplus.xml',
 'ACT16979318564046465_30_10ecb.xml',
 'ACT16979225864150215_30_3ecb.xml',
 'ACT16979267820424588_30_3ecb.xml',
 'ACT17042122379497712_30_6ecbplus.xml',
 'ACT16194654011907361_4_4ecbplus.xml',
 'ACT16194654011907361_4_8ecbplus.xml',
 'ACT16190399378875259_3_6ecbplus.xml',
 'ACT16190399378875259_3_8ecbplus.xml',
 'ACT17642048248481893_35_2ecb.xml',
 'ACT18007234386480781_21_7ecbplus.xml',
 'ACT18241002218927644_23_5ecbplus.xml',
 'ACT18241002218927644_23_11ecbplus.xml',
 'ACT18240959809845886_23_1ecbplus.xml',
 'ACT18241002218927644_23_10ecbplus.xml',
 'ACT16240564368961988_2_1ecbplus.xml',
 'ACT1623936

In [93]:
len(event_cross_clusters['OTH57180939930115798'])

KeyError: 'OTH57180939930115798'

In [94]:
outputs = []
for mention in event_cross_clusters['OTH40105343779700492']:
    if mention['doc_id'] != '9549_Reactions_to_Apple.xml':
        outputs.append(mention)

sorted(outputs, key=lambda m: m['sent_id'])

KeyError: 'OTH40105343779700492'

In [101]:
cross_keys = set()
for cluster, mentions in event_cross_clusters.items():
    doc = mentions[0]['doc_id']
    for m in mentions[1:]:
        if m['doc_id'] != doc:
            cross_keys.add(m['coref_chain'])

In [102]:
len(cross_keys)

669

In [61]:
cross_keys

{'OTH39740769611526334',
 'OTH40105343779700492',
 'OTH57180939930115798',
 'OTH57301135384410360',
 'OTH57384457951243371'}