In [23]:
from collections import defaultdict
import copy
import json

from json_extractor import from_file_get_n_docs

from tagged_document_parsers import parse_stanford_doc

# Read in Data

In [8]:
# Relative paths to the tagged-document input files, one per tagger
# (Stanford, Wikifier, TagMe). They are resolved against the notebook's
# working directory.
stanford_file_name = './50_tagged_by_stanford.jsonl'
wikifier_file_name = './100_tagged_by_wikifier.jsonl'
tagme_file_name = './100_tagged_by_tagme_longtext_0_epsilon_dot1_includecategories_includeallspots.jsonl'

In [24]:
# Load documents from the Stanford-tagged file. from_file_get_n_docs comes from
# json_extractor; presumably the second argument caps the number of docs read
# (10 here) -- TODO confirm against that module.
stanford_docs = from_file_get_n_docs(stanford_file_name, 10)

In [25]:
# Load documents from the Wikifier-tagged file. from_file_get_n_docs comes from
# json_extractor; presumably the second argument caps the number of docs read
# (10 here) -- TODO confirm against that module.
wikifier_docs = from_file_get_n_docs(wikifier_file_name, 10)

In [26]:
# Load documents from the TagMe-tagged file. from_file_get_n_docs comes from
# json_extractor; presumably the second argument caps the number of docs read
# (10 here) -- TODO confirm against that module.
tagme_docs = from_file_get_n_docs(tagme_file_name, 10)

# Parse Stanford
From here on, "parsing" means converting the output of the Stanford tagger into a single dictionary with two main keys: `entities` and `tagged-words`.

`entities` is a map whose keys are an entity's surface_form and type, serialized as JSON, and whose values are the lists of mentions of that entity in the doc.

`tagged-words` is a map whose keys are token (word) indices and whose values are the mention whose surface_form contains that token.

In [14]:
# Run the project's Stanford parser (tagged_document_parsers.parse_stanford_doc)
# over every loaded Stanford document, keeping the input order.
stanford_parsed_docs = [parse_stanford_doc(doc) for doc in stanford_docs]

# Parse Wikifier

In [37]:
def wikifier_mention_surface_form(m, words):
    """Return the surface form of a Wikifier mention as a single string.

    Args:
        m: Mention dict with 'wFrom' and 'wTo' entries, used as the first and
            last (inclusive) indices of the mention inside ``words``.
        words: Sequence of the document's tokens.

    Returns:
        The tokens from ``words[wFrom]`` through ``words[wTo]`` joined by a
        single space.
    """
    # 'wTo' is inclusive, hence the + 1 on the slice's upper bound.
    return u' '.join(words[m['wFrom']:m['wTo'] + 1])

def wikifier_entity_surface_forms(e, words):
    """List the surface form of every supporting mention of an annotation.

    Args:
        e: Annotation dict whose 'support' entry is a list of mentions.
        words: Sequence of the document's tokens.

    Returns:
        One surface-form string per mention in ``e['support']``, in order.
    """
    surface_forms = []
    for support_mention in e['support']:
        surface_forms.append(wikifier_mention_surface_form(support_mention, words))
    return surface_forms

def type_mention(mention_type, mention):
    """Return a typed deep copy of ``mention``.

    Args:
        mention_type: Value to store under the ``u'type'`` key.
        mention: Mention dict to copy; it is left unmodified.

    Returns:
        A deep copy of ``mention`` whose ``u'type'`` key is ``mention_type``.
    """
    # Copy first, then tag the copy, so the caller's dict is never mutated.
    typed_mention = copy.deepcopy(mention)
    typed_mention[u'type'] = mention_type
    return typed_mention

def generate_mentions_for_surface_form(surface_form):
    """Produce one candidate mention per entity type for a surface form.

    Args:
        surface_form: Value stored under each mention's u'surface_form' key.

    Returns:
        A list of three mentions built with ``type_mention``, typed
        ORGANIZATION, LOCATION and PERSON, in that order.
    """
    base_mention = {u'surface_form': surface_form}

    typed_mentions = []
    for entity_type in (u'ORGANIZATION', u'LOCATION', u'PERSON'):
        typed_mentions.append(type_mention(entity_type, base_mention))

    return typed_mentions

def extend_reducer(l1, l2):
    """Combine two lists; judging by the name, meant for use with ``reduce``.

    Args:
        l1: Accumulated list so far (may be empty or None).
        l2: List whose items are appended to the accumulator.

    Returns:
        ``l2`` itself when ``l1`` is falsy; otherwise ``l1``, after it has
        been extended in place with the items of ``l2``.
    """
    if not l1:
        return l2
    # list.extend mutates in place and returns None, so the accumulator has
    # to be returned explicitly -- otherwise a reduction collapses to None.
    l1.extend(l2)
    return l1
def all_possible_mentions_for_surface_forms(sfs):
    """Build the flat list of candidate mentions for several surface forms.

    Args:
        sfs: Iterable of surface forms.

    Returns:
        A single list containing, in input order, every mention returned by
        ``generate_mentions_for_surface_form`` for each surface form. Empty
        when ``sfs`` is empty.
    """
    # An explicit comprehension instead of ``map(mentions.extend, ...)``:
    # calling map for its side effects only works where map is eager
    # (Python 2); where map is lazy (Python 3) nothing would run and the
    # result would silently stay empty.
    return [
        mention
        for surface_form in sfs
        for mention in generate_mentions_for_surface_form(surface_form)
    ]

def annotation_important_information(a):
    """Pick the subset of an annotation's fields that is kept per tagged word.

    Args:
        a: Annotation mapping (anything exposing ``.get``).

    Returns:
        A dict with the keys 'cosine', 'dbPediaIri', 'dbPediaTypes',
        'secTitle', 'secUrl' and 'secLang'; a key missing from ``a`` maps to
        u'Not present'.
    """
    summary = {}
    for field in ('cosine', 'dbPediaIri', 'dbPediaTypes',
                  'secTitle', 'secUrl', 'secLang'):
        summary[field] = a.get(field, u'Not present')
    return summary

def add_annotation_to_tagged_words_dict(a, d):
    """Record annotation ``a`` against every word index its mentions cover.

    Args:
        a: Annotation with a 'support' list of mentions, each carrying
            'wFrom' and 'wTo' word indices (both inclusive).
        d: Dict keyed by word index; updated in place. An index that is
            already present is overwritten.

    Returns:
        The same ``d`` object, after the update.
    """
    for support_mention in a['support']:
        first_index = support_mention['wFrom']
        last_index = support_mention['wTo']
        # A fresh summary dict is built per index, so entries never alias.
        for word_index in range(first_index, last_index + 1):
            d[word_index] = annotation_important_information(a)
    return d

def parse_wikifier_doc(doc):
    """Convert one Wikifier-tagged document into the parsed-document layout.

    Args:
        doc: Wikifier output with a 'words' token list and an 'annotations'
            list. Each annotation needs 'support' and 'title' and may carry
            'wikiDataClasses' (entries with an 'enLabel').

    Returns:
        A dict with three keys:
          'entities': defaultdict(list) mapping a JSON-serialised candidate
              mention to a list of {u'types': ..., u'title': ...} entries;
          'tagged-words': dict mapping a word index to the summary of the
              annotation most recently recorded for it;
          'words': ``doc['words']``, unchanged.
        Annotations that yield no class labels or no candidate mentions
        contribute nothing to either map.
    """
    entities = defaultdict(list)
    tagged_words = {}

    for annotation in doc['annotations']:
        candidate_mentions = all_possible_mentions_for_surface_forms(
            wikifier_entity_surface_forms(annotation, doc['words'])
        )
        class_labels = [
            wiki_class['enLabel']
            for wiki_class in annotation.get('wikiDataClasses', [])
        ]
        title = annotation['title']

        # Skip annotations that carry no type information or no mentions.
        if not class_labels or not candidate_mentions:
            continue

        for candidate in candidate_mentions:
            entity_key = json.dumps(candidate, ensure_ascii=False)
            entities[entity_key].append({
                u'types': class_labels,
                u'title': title
            })
        tagged_words = add_annotation_to_tagged_words_dict(annotation, tagged_words)

    return {
        'entities': entities,
        'tagged-words': tagged_words,
        'words': doc['words']
    }

In [38]:
# Parse every loaded Wikifier document with parse_wikifier_doc, keeping the
# input order.
parsed_wikifier_docs = [parse_wikifier_doc(doc) for doc in wikifier_docs]

In [39]:
# Inspect the token list of the first parsed Wikifier document.
parsed_wikifier_docs[0]['words']

[u'VETERANS',
 u'saluted',
 u"Worcester's",
 u'first',
 u'ever',
 u'breakfast',
 u'club',
 u'for',
 u'ex-soldiers',
 u'which',
 u'won',
 u'over',
 u'hearts',
 u'minds',
 u'and',
 u'bellies',
 u'The',
 u'Worcester',
 u'Breakfast',
 u'Club',
 u'for',
 u'HM',
 u'Forces',
 u'Veterans',
 u'met',
 u'at',
 u'the',
 u'Postal',
 u'Order',
 u'in',
 u'Foregate',
 u'Street',
 u'at',
 u'10am',
 u'on',
 u'Saturday',
 u'The',
 u'club',
 u'is',
 u'designed',
 u'to',
 u'allow',
 u'veterans',
 u'a',
 u'place',
 u'to',
 u'meet',
 u'socialise',
 u'eat',
 u'and',
 u'drink',
 u'giving',
 u'hunger',
 u'and',
 u'loneliness',
 u'their',
 u'marching',
 u'orders',
 u'Father-of-two',
 u'Dave',
 u'Carney',
 u'aged',
 u'43',
 u'of',
 u'Merrimans',
 u'Hill',
 u'Worcester',
 u'set',
 u'up',
 u'the',
 u'club',
 u'after',
 u'being',
 u'inspired',
 u'by',
 u'other',
 u'similar',
 u'clubs',
 u'across',
 u'the',
 u'country',
 u'He',
 u'said',
 u'As',
 u'you',
 u'can',
 u'see',
 u'from',
 u'the',
 u'picture',
 u'we',
 u'ha