In [40]:
from json_extractor import from_file_get_n_docs
from collections import defaultdict
from difflib import SequenceMatcher
import json

# Normalizing the data

## What questions do I want to answer?
    How many entities finds Stanford vs Wikifier and Tagme? 
    How many ORG entities finds Stanford vs Wikifier and Tagme?
    How many LOC entities finds Stanford vs Wikifier and Tagme?
    How many PER entities finds Stanford vs Wikifier and Tagme?

I therefore need to normalize the data in the following way.

Each entity id'd by the NER or linker should be represented as:
````
    {
        Surface form:
        Type:
    }
```

So a document will yield an object with:
```
    {
        Entities: [],
        Mentions: [], //Same as entities, but it contains all the repetitions in order.
        ORG:
        LOC:
        PER:
    }
```
This way I can compare documents into a comparison object: 
```
    {
        Long-Tail: []
        ORG:
        LOC:
        PER:
    }
```

# Normalizing Stanford

In [7]:
stanford_docs = from_file_get_n_docs('./50_tagged_by_stanford.jsonl', 50)

In [8]:
doc = stanford_docs[0]
doc

[[u'VETERANS', u'O'],
 [u'saluted', u'O'],
 [u"Worcester's", u'O'],
 [u'first', u'O'],
 [u'ever', u'O'],
 [u'breakfast', u'O'],
 [u'club', u'O'],
 [u'for', u'O'],
 [u'ex-soldiers', u'O'],
 [u'which', u'O'],
 [u'won', u'O'],
 [u'over', u'O'],
 [u'hearts,', u'O'],
 [u'minds', u'O'],
 [u'and', u'O'],
 [u'bellies.', u'O'],
 [u'The', u'O'],
 [u'Worcester', u'ORGANIZATION'],
 [u'Breakfast', u'ORGANIZATION'],
 [u'Club', u'ORGANIZATION'],
 [u'for', u'ORGANIZATION'],
 [u'HM', u'ORGANIZATION'],
 [u'Forces', u'ORGANIZATION'],
 [u'Veterans', u'ORGANIZATION'],
 [u'met', u'O'],
 [u'at', u'O'],
 [u'the', u'O'],
 [u'Postal', u'O'],
 [u'Order', u'O'],
 [u'in', u'O'],
 [u'Foregate', u'LOCATION'],
 [u'Street', u'LOCATION'],
 [u'at', u'O'],
 [u'10am', u'O'],
 [u'on', u'O'],
 [u'Saturday.', u'O'],
 [u'The', u'O'],
 [u'club', u'O'],
 [u'is', u'O'],
 [u'designed', u'O'],
 [u'to', u'O'],
 [u'allow', u'O'],
 [u'veterans', u'O'],
 [u'a', u'O'],
 [u'place', u'O'],
 [u'to', u'O'],
 [u'meet,', u'O'],
 [u'socialise

In [9]:
starts_new_entity = lambda prev, curr: prev != curr and curr != 'O'
is_in_entity = lambda prev, curr: prev == curr and curr != 'O'
is_outside_entity = lambda prev, curr: prev != curr and curr == 'O'

def stanford_entities_mentions_and_types(doc):
    previous_type = 'O'
    start_word = 0
    end_word = 0
    surface_form = ''
    type_counts = defaultdict(lambda: 0)
    
    document = {}
    document['mentions'] = []
    document['entities'] = defaultdict(lambda: {'counts': 0, 'mentions': []})
    for i, [w, t] in enumerate(doc):
        if starts_new_entity(previous_type, t):
            start_word = i
            end_word = i
            surface_form = w
        elif is_in_entity(previous_type, t):
            end_word = i
            surface_form += u" {0}".format(w)
        elif is_outside_entity(previous_type, t):
            mention = {
                'surface-form': surface_form,
                'type': previous_type,
            }
            json_mention = json.dumps(mention)

            type_counts[previous_type+'_M'] += 1
            type_counts[previous_type+'_E'] += 1 if json_mention not in document['entities'] else 0
            
            document['entities'][json_mention]['counts'] += 1
            document['entities'][json_mention]['mentions'].append({
                'start' : start_word,
                'end' : end_word
            })
            
            mention['start'] = start_word
            mention['end'] = end_word 
            document['mentions'].append(mention)
            surface_form = ''
            
        previous_type = t
    
    document['ORG_MENTIONS'] = type_counts['ORGANIZATION_M']
    document['LOC_MENTIONS'] = type_counts['LOCATION_M']
    document['PER_MENTIONS'] = type_counts['PERSON_M']
    document['ORG_ENTITIES'] = type_counts['ORGANIZATION_E']
    document['LOC_ENTITIES'] = type_counts['LOCATION_E']
    document['PER_ENTITIES'] = type_counts['PERSON_E']
    return document
        
# stanford_entities_mentions_and_types(doc)

In [10]:
stanford_parsed_docs = [stanford_entities_mentions_and_types(doc) for doc in stanford_docs]
# stanford_parsed_docs

In [11]:
stanford_parsed_docs[0]

{'LOC_ENTITIES': 3,
 'LOC_MENTIONS': 3,
 'ORG_ENTITIES': 2,
 'ORG_MENTIONS': 2,
 'PER_ENTITIES': 6,
 'PER_MENTIONS': 7,
 'entities': defaultdict(<function __main__.<lambda>>,
             {'{"surface-form": "Andy Wilson", "type": "PERSON"}': {'counts': 1,
               'mentions': [{'end': 261, 'start': 260}]},
              '{"surface-form": "Bromsgrove", "type": "LOCATION"}': {'counts': 1,
               'mentions': [{'end': 229, 'start': 229}]},
              '{"surface-form": "Carney", "type": "PERSON"}': {'counts': 2,
               'mentions': [{'end': 173, 'start': 173},
                {'end': 290, 'start': 290}]},
              '{"surface-form": "Dave", "type": "PERSON"}': {'counts': 1,
               'mentions': [{'end': 59, 'start': 59}]},
              '{"surface-form": "Derek Hardman", "type": "PERSON"}': {'counts': 1,
               'mentions': [{'end': 248, 'start': 247}]},
              '{"surface-form": "Droitwich", "type": "PERSON"}': {'counts': 1,
               'me

# Normalizing Wikifier

In [12]:
wikifier_docs = from_file_get_n_docs('./100_tagged_by_wikifier.jsonl', 100)

In [13]:
doc = wikifier_docs[0]
doc['annotations'][2]['pageRank']

0.002150784571337383

In [94]:
wikifier_mention_surface_form = lambda m, words: ' '.join(words[m['wFrom'] : m['wTo'] + 1])
wikifier_entity_surface_forms = lambda e, words: [wikifier_mention_surface_form(mention, words) for mention in e['support']]

all_types_for_surface_form = lambda sf: [{'surface_form': sf, 'type': 'ORGANIZATION'},{'surface_form': sf, 'type': 'LOCATION'},{'surface_form': sf, 'type': 'PERSON'}]

extend_reducer = lambda l1, l2: l1.extend(l2) if l1 else l2
add_type_to_surface_forms = lambda sfs: reduce(extend_reducer, [all_types_for_surface_form(sf) for sf in sfs], [])

def wikifier_parsed_entity_dictionary(doc):
    annotation_dictionary = defaultdict(lambda: [])

    for a in doc['annotations']:
        surface_forms = wikifier_entity_surface_forms(a, doc['words'])
        mention_dicts = add_type_to_surface_forms(surface_forms)
        types = [c['enLabel'] for c in a.get('wikiDataClasses', [])]
        title = a['title']
        
        if (len(types) > 0 and mention_dicts):
            for mention in mention_dicts:
                annotation_dictionary[json.dumps(mention)].append({
                    'types': types,
                    'title': title
                })
                
    return annotation_dictionary

similar = lambda a, b, r=0.95: SequenceMatcher(None, a, b).ratio() >= r

In [68]:
%%time
wikifier_parsed_docs = [wikifier_parsed_entity_dictionary(doc) for doc in wikifier_docs]

CPU times: user 1.01 s, sys: 23.6 ms, total: 1.04 s
Wall time: 1.04 s


# Comparing Stanford with a normalized linker functions

In [95]:
find_similar_to_a_in_dict_b = lambda a, b, r=0.95: [(a, b_key) for b_key in b if similar(a, b_key, r)]
a_is_not_in_dict_b = lambda a, b: not find_similar_to_a_in_dict_b(a, b)
compare_linkers_parsed_docs = lambda a, b, n: [(i, a_entity) for (i, (a_doc, b_doc)) in enumerate(zip(a[0:n], b[0:n])) for a_entity in a_doc['entities'] if a_is_not_in_dict_b(a_entity, b_doc)]

# Comparing Stanford with Wikifier

In [83]:
%%time
not_in_wikifier = compare_linkers_parsed_docs(stanford_parsed_docs, wikifier_parsed_docs, 50)

CPU times: user 45.1 s, sys: 147 ms, total: 45.2 s
Wall time: 45.3 s


In [84]:
not_in_wikifier

[(0, '{"surface-form": "Royal British Legion", "type": "ORGANIZATION"}'),
 (0, '{"surface-form": "Carney", "type": "PERSON"}'),
 (0,
  '{"surface-form": "Worcester Breakfast Club for HM Forces Veterans", "type": "ORGANIZATION"}'),
 (1, '{"surface-form": "Bulleit Group", "type": "ORGANIZATION"}'),
 (1, '{"surface-form": "Kelly Mayes", "type": "PERSON"}'),
 (2, '{"surface-form": "Nike Air Max", "type": "ORGANIZATION"}'),
 (3,
  '{"surface-form": "Tory Burch See it \\u00bb Carolina Herrera See", "type": "ORGANIZATION"}'),
 (3,
  '{"surface-form": "Rodarte See it \\u00bb Diesel Black Gold See", "type": "ORGANIZATION"}'),
 (3, '{"surface-form": "Bionic Dong Listen", "type": "ORGANIZATION"}'),
 (3,
  '{"surface-form": "Us Grub Street Bedford & Bowery FOLLOW: Facebook Twitter", "type": "ORGANIZATION"}'),
 (3, '{"surface-form": "Gloria Steinem", "type": "PERSON"}'),
 (3,
  '{"surface-form": "Intelligencer Vulture Science of Us Grub Street Bedford & Bowery Like", "type": "ORGANIZATION"}'),
 (3,

## Why doesn't wikifier find Sean Penn????

In [90]:
recreate_stanford_text = lambda doc: " ".join([u"{0}[{1}]".format(w[0], w[1]) for w in doc])

In [112]:
sean = {
    'surface_form': 'Sean Penn', 'type': 'PERSON'
}
json_sean = json.dumps(sean)

find_similar_to_a_in_dict_b(json.dumps(sean), wikifier_parsed_docs[3], 0.93)


[('{"surface_form": "Sean Penn", "type": "PERSON"}',
  '{"surface_form": "Penn", "type": "PERSON"}')]

In [116]:
wikifier_parsed_docs[3].get('{"surface_form": "Penn", "type": "PERSON"}')

[{'title': u'University of Pennsylvania',
  'types': [u'private university',
   u'research university',
   u'Colonial Colleges',
   u'private not-for-profit educational institution',
   u'university',
   u'private educational institution',
   u'colleges and universities in the United States',
   u'nonprofit organization',
   u'higher education institution',
   u'academic institution',
   u'educational institution',
   u'organization',
   u'school',
   u'educational organization',
   u'facility',
   u'social group',
   u'agent',
   u'instrumental value',
   u'geographical object',
   u'geographic location',
   u'group of humans',
   u'system',
   u'entity',
   u'value',
   u'physical object',
   u'location',
   u'living thing group',
   u'manifestation',
   u'object',
   u'position',
   u'group of objects',
   u'structure',
   u'point',
   u'class',
   u'group',
   u'primitive notion',
   u'mathematical concept',
   u'class',
   u'concept',
   u'abstract object',
   u'mental representat

In [91]:
recreate_stanford_text(stanford_docs[3])

u'NYMag.com[O] Daily[O] Intelligencer[O] Vulture[O] The[O] Cut[O] Science[O] of[O] Us[ORGANIZATION] Grub[ORGANIZATION] Street[ORGANIZATION] Bedford[ORGANIZATION] &[ORGANIZATION] Bowery[ORGANIZATION] FOLLOW:[ORGANIZATION] Facebook[ORGANIZATION] Twitter[ORGANIZATION] UserName[O] LOG[O] IN[O] REGISTER[O] Fashions[O] Runway[O] Street[O] Style[O] Designers[O] Fame[O] Beauty[O] Goods[O] Love[O] &[O] War[O] search[O] Sections[O] Fashions[O] Fame[O] Beauty[O] Goods[O] Love[O] &[O] War[O] Plus[O] Runway[O] Street[O] Style[O] Designers[O] Sites[O] NYMag.com[O] Daily[O] Intelligencer[ORGANIZATION] Vulture[ORGANIZATION] Science[ORGANIZATION] of[ORGANIZATION] Us[ORGANIZATION] Grub[ORGANIZATION] Street[ORGANIZATION] Bedford[ORGANIZATION] &[ORGANIZATION] Bowery[ORGANIZATION] Like[ORGANIZATION] UsFollow[O] Us[O] Popular[O] on[O] The[O] Cut[O] Ask[O] Polly:[O] Should[O] I[O] Just[O] Give[O] Up[O] on[O] My[O] Writing?[O] \xbb[O] Top[O] Shows[O] Oscar[ORGANIZATION] de[ORGANIZATION] la[ORGANIZATION] Renta

# Normalizing Tagme

In [85]:
tagme_docs = from_file_get_n_docs('./100_tagged_by_tagme.jsonl', 100)

In [86]:
doc = tagme_docs[0]
doc

{u'annotations': [{u'end': 8,
   u'id': 327806,
   u'link_probability': u'0.01097',
   u'rho': u'0.04691',
   u'spot': u'VETERANS',
   u'start': 0,
   u'title': u'Veteran'},
  {u'end': 16,
   u'id': 28977,
   u'link_probability': u'0.00269',
   u'rho': u'0.03323',
   u'spot': u'saluted',
   u'start': 9,
   u'title': u'Salute'},
  {u'end': 26,
   u'id': 58681,
   u'link_probability': u'0.26996',
   u'rho': u'0.18812',
   u'spot': u'Worcester',
   u'start': 17,
   u'title': u'Worcester'},
  {u'end': 34,
   u'id': 4764461,
   u'link_probability': u'0.00208',
   u'rho': u'0.08081',
   u'spot': u'first',
   u'start': 29,
   u'title': u'World War I'},
  {u'end': 54,
   u'id': 4406401,
   u'link_probability': u'0.67213',
   u'rho': u'0.33607',
   u'spot': u'breakfast club',
   u'start': 40,
   u'title': u'Breakfast Club (band)'},
  {u'end': 80,
   u'id': 3031782,
   u'link_probability': u'0.00167',
   u'rho': u'0.02555',
   u'spot': u'won',
   u'start': 77,
   u'title': u'Warrant Officer of t

In [92]:
wikifier_parsed_docs[3]

defaultdict(<function __main__.<lambda>>,
            {'{"surface_form": "1", "type": "LOCATION"}': [{'title': u'Record chart',
               'types': [u'information',
                u'abstract object',
                u'object',
                u'entity']}],
             '{"surface_form": "1", "type": "ORGANIZATION"}': [{'title': u'Record chart',
               'types': [u'information',
                u'abstract object',
                u'object',
                u'entity']}],
             '{"surface_form": "1", "type": "PERSON"}': [{'title': u'Record chart',
               'types': [u'information',
                u'abstract object',
                u'object',
                u'entity']}],
             '{"surface_form": "14", "type": "LOCATION"}': [{'title': u'February 14',
               'types': [u'determinator for date of periodic occurrence',
                u'time interval',
                u'interval',
                u'set',
                u'mathematical object',
         