In [1]:
import json
import matplotlib.pyplot as plt
import operator

In [2]:
def get_mentions_of_cluster(dataset, cluster_id):
    mentions = []
    for mention in dataset:
        if mention['coref_chain'] == cluster_id:
            mentions.append(mention)

    return mentions


def get_all_chains(mentions):
    clusters = {}
    for mention_dic in mentions:
        chain = mention_dic['coref_chain']
        clusters[chain] = [] if chain not in clusters else clusters[chain]
        clusters[chain].append(mention_dic)

    return clusters


def get_cluster_by_mention_num(clusters, num):
    clusters_names = []
    for cluster, doc_mention in clusters.items():
        num_of_mentions = len(doc_mention)
        if num_of_mentions == num:
            clusters_names.append(cluster)

    return clusters_names


def get_gold_within_doc(mentions):
    wd_cluster = {}
    for mention in mentions:
        chain = mention['coref_chain']
        doc = mention['doc_id']
        id_within_doc = chain + '_' + doc
        wd_cluster[chain] = [] if id_within_doc not in wd_cluster else wd_cluster[chain]
        wd_cluster[chain].append(mention)

    return wd_cluster



def get_metainfo(clusters):
    """
    print num of mentions per clusters
    :param clusters:
    :return:
    """
    dic = {}
    for cluster, doc_mention in clusters.items():
        num_of_mentions = len(doc_mention)
        dic[num_of_mentions] = dic.get(num_of_mentions, 0) + 1

    for length, num_of_clusters in sorted(dic.items()):
        print("There are {} clusters with {} mentions".format(num_of_clusters, length))

    number = dic.values()
    labels = dic.keys()

    #get_pie_chart(number, labels)

def extract_mention_text(cluster):
    mentions = []
    for mention in cluster:
        mention.append(mention['MENTION_TEXT'])
    return mentions


def get_pie_chart(values, labels):
    patches, texts = plt.pie(values, shadow=True, startangle=90)
    plt.legend(patches, labels, loc="best")
    plt.axis('equal')
    plt.show()


def within_to_cross(within_doc_cluster):
    cross_doc = {}
    for within in within_doc_cluster:
        name = within.split('_')[0]
        if name != 'INTRA' and name != 'Singleton':
            cross_doc[name] = [] if name not in cross_doc else cross_doc[name]
            cross_doc[name].append(within)

    return cross_doc


def find_most_popular_word(clusters, within_doc_cluster):
    words = {}
    for cluster in clusters:
        mentions = within_doc_cluster[cluster]
        vocab = set()
        for mention in mentions:
            text = mention['MENTION_TEXT']
            vocab.add(text)

        for word in vocab:
            words[word] = words.get(word, 0) + 1

    most_word = max(words.items(), key=operator.itemgetter(1))
    return most_word[0], most_word[1]/len(clusters)




def get_prob(within_doc_cluster):
    cross_doc = within_to_cross(within_doc_cluster)
    length = 0
    prob = 0
    for cluster, within in cross_doc.items():
        word, coverage = find_most_popular_word(within, within_doc_cluster)
        length += len(within)
        prob += coverage * len(within)

    return prob / length



Select data to explore (ECB+ or MEANTIME)

In [16]:
data = 'meantime_data'

In [20]:
with open(data + '/all_entity_gold_mentions.json', 'r') as f:
    entity_mentions = json.load(f)

with open(data + '/all_event_gold_mentions.json', 'r') as f:
    event_mentions = json.load(f)
    
print('{} entity mentions'.format(len(entity_mentions)))
print('{} event mentions'.format(len(event_mentions)))

2866 entity mentions
2107 event mentions


In [21]:
entity_cross_clusters = get_all_chains(entity_mentions)
event_cross_clusters = get_all_chains(event_mentions)
print('Number of entity chains: {}'.format(len(entity_cross_clusters)))
print('Number of event chains: {}'.format(len(event_cross_clusters)))

Number of entity chains: 873
Number of event chains: 1892


In [25]:
entity_singleton = sum([1 for m in entity_mentions if m["coref_chain"].startswith('Singleton')])
event_singleton =  sum([1 for m in event_mentions if m["coref_chain"].startswith('Singleton')])
print('Number of entity without instance_id: {}'.format(entity_singleton))
print('Number of event without instance_id: {}'.format(event_singleton))

Number of entity without instance_id: 122
Number of event without instance_id: 1315


In [31]:
a = entity_mentions[0]

In [32]:
a.keys()

dict_keys(['cluster_desc', 'coref_chain', 'doc_id', 'event_entity', 'full_sentence', 'is_pronoun', 'left_sentence', 'm_id', 'mention_type', 'right_sentence', 'sent_id', 'tokens_ids', 'tokens_str', 'topic'])

In [33]:
types = {}
for m in entity_mentions:
    m_type = m['mention_type']
    types[m_type] = types.get(m_type, 0) + 1

In [34]:
types

{'PRO': 750, 'ORG': 903, 'PER': 357, 'LOC': 374, 'FIN': 360, '': 122}

In [35]:
750+903+357+374+360

2744

In [57]:
descriptor = {}
for m in entity_mentions:
    if m["coref_chain"].startswith('Singleton_'):
        desc = m["coref_chain"]
        descriptor[desc] = descriptor.get(desc, 0) + 1

len(descriptor)

In [58]:
len(descriptor)

122

In [59]:
sum([1 for m in entity_mentions if m['coref_chain'].startswith('Singleton')])

122

In [77]:
import csv
event = set()
entity = set()
with open('datasets/meantime_newsreader_english_oct15/list_instances.csv', 'r') as f:
    reader = csv.reader(f, delimiter='\t')
    for line in reader:
        if line[1] == 'EVENT':
            event.add(line[0])
        else:
            entity.add(line[0])

print('Event chains: {}'.format(len(event)))
print('Entity chains: {}'.format(len(entity)))

Event chains: 1185
Entity chains: 791


In [79]:
chains_from_mentions = set()
for m in entity_mentions:
    chains_from_mentions.add(m['coref_chain'])
print(len(chains_from_mentions))

752


In [82]:
for c in entity:
    if c not in chains_from_mentions:
        print(c)

PRO19274271812050726
FIN36825527747670669
FIN36825527589495792
PRO27737916609991057
PRO47037235314943369
PRO36042008717728899
PRO56150570102047852
LOC39494944810590513
FIN27982806499965672
PRO55898465939579653
#ID
ORG48295538177040363
ORG45374187885304059
PRO39503428423512753
PRO39493678270400934
PRO27737342834503629
PRO56092201179325546
ORG27744131024879930
FIN27982311252830625
PRO27738187661870589
PRO56087535324635302
PRO55741017831652172
ORG19093168857338940
PRO47041690110532455
ORG27980955745626409
PER25304436893090468
PRO39493700907346891
ORG27733658216837514
ORG27980020499213139
ORG27984880689122584
PRO47186532201508230
PRO41488049088478591
ORG35073822103445135
PRO27573052674952028
FIN44153078778154325
PRO36276501830687088
PRO27733281450580678
PRO47184870591447505
PRO27734230348663983
ORG19093169632160927
ORG19274271618816482


In [81]:
for m in entity_mentions:
    if m['coref_chain'] == 'ORG44081990077682597':
        print(m)

{'cluster_desc': 'European Central Bank', 'coref_chain': 'ORG44081990077682597', 'doc_id': '113278_Markets_rally_as_world_central_banks_infuse_cash.xml', 'event_entity': 'entity', 'full_sentence': 'Totaling US$ 180 billion , the Federal Reserve arranged to increase existing swap lines with the European Central Bank from US$ 55 billion to US$ 110 billion , and with the Swiss National Bank from US$ 12 billion to US$ 27 billion .', 'is_pronoun': False, 'left_sentence': 'Totaling US$ 180 billion , the Federal Reserve arranged to increase existing swap lines with the', 'm_id': '49', 'mention_type': 'ORG', 'right_sentence': 'from US$ 55 billion to US$ 110 billion , and with the Swiss National Bank from US$ 12 billion to US$ 27 billion .', 'sent_id': '5', 'tokens_ids': [138, 139, 140], 'tokens_str': 'European Central Bank', 'topic': 'stock'}
{'cluster_desc': 'European Central Bank', 'coref_chain': 'ORG44081990077682597', 'doc_id': '113278_Markets_rally_as_world_central_banks_infuse_cash.xml',