In [20]:
from collections import defaultdict
import copy
import json

from json_extractor import from_file_get_n_docs

from tagged_document_parsing_lib import \
all_possible_mentions_for_surface_forms,\
generate_mentions_for_surface_form,\
flatten

from stanford_parser import parse_stanford_doc
from wikifier_parser import parse_wikifier_doc

# Read in Data

In [21]:
stanford_file_name = './50_tagged_by_stanford.jsonl'
wikifier_file_name = './100_tagged_by_wikifier.jsonl'
tagme_file_name = './100_tagged_by_tagme_longtext_0_epsilon_dot1_includecategories_includeallspots.jsonl'

In [22]:
stanford_docs = from_file_get_n_docs(stanford_file_name, 10)

In [23]:
wikifier_docs = from_file_get_n_docs(wikifier_file_name, 10)

In [24]:
tagme_docs = from_file_get_n_docs(tagme_file_name, 10)

# Parse Stanford
From here on, parsing means, converting the output from the stanford tagger into one dictionary with two main keys: entities and tagged-words.

Entities is a map with the entity surface_form and type in json form as a key, and a list of mentions in the doc as value.

tagged-words is a map with a token word index as key, and the mention in which surface_form it is contained as value.

In [25]:
stanford_parsed_docs = [parse_stanford_doc(doc) for doc in stanford_docs]

# Parse Wikifier

In [26]:
wikifier_parsed_docs = [parse_wikifier_doc(doc) for doc in wikifier_docs]

# Parse Tagme

In [34]:
def parse_tagme_doc(doc):
    indexed_tagme_annotations = defaultdict(lambda: [])
    parsed_doc = {}
    parsed_doc[u'tokens'] = []
    parsed_doc[u'tagged-words'] = {}
    for i, annotation in enumerate(doc['annotations']):
        all_possible_annotation_indexes = generate_mentions_for_surface_form(annotation['spot'])
        print u"Indexing annotation {0}".format(i)
        for index in all_possible_annotation_indexes:
            print u"Indexing {0}".format(json.dumps(index, ensure_ascii=False))
            indexed_tagme_annotations[json.dumps(index, ensure_ascii=False)].append(annotation)
            
        tokens = annotation['spot'].split()
        for tok in tokens:
            print u'adding token {0}'.format(tok)
            parsed_doc[u'tagged-words'][len(tokens)] = copy.deepcopy(annotation)
            parsed_doc[u'tokens'].append(tok)
            
    parsed_doc[u'entities'] = indexed_tagme_annotations

    return parsed_doc

In [35]:
tagme_parsed_docs = map(parse_tagme_doc, tagme_docs)
tagme_parsed_docs[0]['tagged-words'].__len__(), tagme_parsed_docs[0]['tagged-words']

Indexing annotation 0
Indexing {"surface_form": "VETERANS", "type": "ORGANIZATION"}
Indexing {"surface_form": "VETERANS", "type": "LOCATION"}
Indexing {"surface_form": "VETERANS", "type": "PERSON"}
adding token VETERANS
Indexing annotation 1
Indexing {"surface_form": "saluted", "type": "ORGANIZATION"}
Indexing {"surface_form": "saluted", "type": "LOCATION"}
Indexing {"surface_form": "saluted", "type": "PERSON"}
adding token saluted
Indexing annotation 2
Indexing {"surface_form": "Worcester", "type": "ORGANIZATION"}
Indexing {"surface_form": "Worcester", "type": "LOCATION"}
Indexing {"surface_form": "Worcester", "type": "PERSON"}
adding token Worcester
Indexing annotation 3
Indexing {"surface_form": "breakfast club", "type": "ORGANIZATION"}
Indexing {"surface_form": "breakfast club", "type": "LOCATION"}
Indexing {"surface_form": "breakfast club", "type": "PERSON"}
adding token breakfast
adding token club
Indexing annotation 4
Indexing {"surface_form": "won", "type": "ORGANIZATION"}
Inde

(3,
 {1: {u'dbpedia_categories': [u'Future', u'Philosophy of time'],
   u'end': 2334,
   u'id': 163103,
   u'link_probability': u'0.00596',
   u'rho': u'0.06048',
   u'spot': u'future',
   u'start': 2328,
   u'title': u'Future'},
  2: {u'dbpedia_categories': [u'Marian devotions',
    u'Our Lady of Fatima',
    u'Roman Catholic devotions'],
   u'end': 2281,
   u'id': 531486,
   u'link_probability': u'0.01185',
   u'rho': u'0.00593',
   u'spot': u'first Saturday',
   u'start': 2267,
   u'title': u'First Saturday Devotions'},
  3: {u'dbpedia_categories': [u'Foods', u'Food and drink', u'Cuisine'],
   u'end': 2040,
   u'id': 10646,
   u'link_probability': u'0.02989',
   u'rho': u'0.07839',
   u'spot': u'food and drink',
   u'start': 2026,
   u'title': u'Food'}})

In [None]:
len(wikifier_parsed_docs[0]['tokens'])

In [None]:
len(stanford_parsed_docs[0]['tokens'])

In [None]:
wikifier_docs[0]['annotations'].__len__()

In [None]:
tagme_docs[0]['annotations'][0]

In [33]:
print u"Indexing {0}".format(json.dumps({u'surface_form': u'\u2019'}, ensure_ascii=False))

Indexing {"surface_form": "’"}
