In [48]:
from json_extractor import from_file_get_n_docs
from collections import defaultdict
import json

# Normalizing the data

## What questions do I want to answer?
    How many entities does Stanford find vs. Wikifier and Tagme?
    How many ORG entities does Stanford find vs. Wikifier and Tagme?
    How many LOC entities does Stanford find vs. Wikifier and Tagme?
    How many PER entities does Stanford find vs. Wikifier and Tagme?

I therefore need to normalize the data in the following way.

Each entity id'd by the NER or linker should be represented as:
```
    {
        Surface form:
        Type:
    }
```

So a document will yield an object with:
```
    {
        Entities: [],
        Mentions: [], //Same as entities, but it contains all the repetitions in order.
        ORG:
        LOC:
        PER:
    }
```
This way I can combine the per-document results of the different tools into a comparison object:
```
    {
        Long-Tail: []
        ORG:
        LOC:
        PER:
    }
```

## Stanford

In [49]:
# Load the Stanford-tagged documents from the JSONL file (50 requested).
# NOTE(review): from_file_get_n_docs is a project helper not shown here;
# presumably it returns a list with one parsed document per JSONL line -- confirm.
stanford_docs = from_file_get_n_docs('./50_tagged_by_stanford.jsonl', 50)

In [56]:
# Inspect the first Stanford document. The displayed output shows it as a
# list of [token, NER tag] pairs, where the tag 'O' marks a non-entity token.
doc = stanford_docs[0]
doc

[[u'VETERANS', u'O'],
 [u'saluted', u'O'],
 [u"Worcester's", u'O'],
 [u'first', u'O'],
 [u'ever', u'O'],
 [u'breakfast', u'O'],
 [u'club', u'O'],
 [u'for', u'O'],
 [u'ex-soldiers', u'O'],
 [u'which', u'O'],
 [u'won', u'O'],
 [u'over', u'O'],
 [u'hearts,', u'O'],
 [u'minds', u'O'],
 [u'and', u'O'],
 [u'bellies.', u'O'],
 [u'The', u'O'],
 [u'Worcester', u'ORGANIZATION'],
 [u'Breakfast', u'ORGANIZATION'],
 [u'Club', u'ORGANIZATION'],
 [u'for', u'ORGANIZATION'],
 [u'HM', u'ORGANIZATION'],
 [u'Forces', u'ORGANIZATION'],
 [u'Veterans', u'ORGANIZATION'],
 [u'met', u'O'],
 [u'at', u'O'],
 [u'the', u'O'],
 [u'Postal', u'O'],
 [u'Order', u'O'],
 [u'in', u'O'],
 [u'Foregate', u'LOCATION'],
 [u'Street', u'LOCATION'],
 [u'at', u'O'],
 [u'10am', u'O'],
 [u'on', u'O'],
 [u'Saturday.', u'O'],
 [u'The', u'O'],
 [u'club', u'O'],
 [u'is', u'O'],
 [u'designed', u'O'],
 [u'to', u'O'],
 [u'allow', u'O'],
 [u'veterans', u'O'],
 [u'a', u'O'],
 [u'place', u'O'],
 [u'to', u'O'],
 [u'meet,', u'O'],
 [u'socialise

In [87]:
def starts_new_entity(prev, curr):
    """Return True when `curr` is an entity tag that differs from `prev`."""
    return prev != curr and curr != 'O'


def is_in_entity(prev, curr):
    """Return True when `curr` continues the entity tagged `prev`."""
    return prev == curr and curr != 'O'


def is_outside_entity(prev, curr):
    """Return True when the tags go from an entity tag `prev` to 'O'.

    Not used by stanford_entities_mentions_and_types any more; kept because
    other cells may still reference it.
    """
    return prev != curr and curr == 'O'


def stanford_entities_mentions_and_types(doc):
    """Normalize one Stanford-NER-tagged document into mentions and entities.

    Consecutive tokens carrying the same non-'O' tag are merged into a single
    mention whose surface form is the tokens joined by single spaces.

    A mention is closed whenever the tag changes away from an entity tag --
    either to 'O' or directly to a different entity tag -- and also when the
    document ends while still inside an entity. (Previously only the
    transition to 'O' closed a mention, so back-to-back entities of different
    types and a document-final entity were silently dropped.)

    Parameters
    ----------
    doc : iterable of [token, tag] pairs
        Tokens in document order; the tag 'O' means "not an entity".

    Returns
    -------
    dict with the keys:
        'mentions' : list of {'surface-form', 'type', 'start', 'end'} dicts,
            one per mention in document order; 'start' and 'end' are
            inclusive token indices.
        'entities' : defaultdict keyed by the JSON dump of
            {'surface-form', 'type'}; each value is
            {'counts': int, 'mentions': [{'start', 'end'}, ...]}.
        'ORG_MENTIONS', 'LOC_MENTIONS', 'PER_MENTIONS' : mention counts for
            the tags 'ORGANIZATION', 'LOCATION' and 'PERSON'.
        'ORG_ENTITIES', 'LOC_ENTITIES', 'PER_ENTITIES' : counts of distinct
            (surface form, type) pairs for the same tags.
    """
    type_counts = defaultdict(int)
    document = {
        'mentions': [],
        'entities': defaultdict(lambda: {'counts': 0, 'mentions': []}),
    }

    def _record_mention(surface_form, entity_type, start_word, end_word):
        # Register one finished mention in both the flat mention list and the
        # per-entity aggregate, updating the per-type counters.
        mention = {
            'surface-form': surface_form,
            'type': entity_type,
        }
        # The key is built before start/end are added, so repeated mentions
        # of the same (surface form, type) collapse onto one entity.
        json_mention = json.dumps(mention)

        type_counts[entity_type + '_M'] += 1
        # Membership must be tested before indexing: indexing the defaultdict
        # would create the entry.
        if json_mention not in document['entities']:
            type_counts[entity_type + '_E'] += 1

        entity = document['entities'][json_mention]
        entity['counts'] += 1
        entity['mentions'].append({
            'start': start_word,
            'end': end_word
        })

        mention['start'] = start_word
        mention['end'] = end_word
        document['mentions'].append(mention)

    previous_type = 'O'
    start_word = 0
    end_word = 0
    surface_form = ''
    for i, (w, t) in enumerate(doc):
        if is_in_entity(previous_type, t):
            end_word = i
            surface_form += u" {0}".format(w)
        else:
            # The tag changed (or we are between 'O' tokens). Close the
            # pending entity, if any, before possibly opening a new one.
            if previous_type != 'O':
                _record_mention(surface_form, previous_type, start_word, end_word)
                surface_form = ''
            if starts_new_entity(previous_type, t):
                start_word = i
                end_word = i
                surface_form = w
        previous_type = t

    # The document may end while still inside an entity.
    if previous_type != 'O':
        _record_mention(surface_form, previous_type, start_word, end_word)

    document['ORG_MENTIONS'] = type_counts['ORGANIZATION_M']
    document['LOC_MENTIONS'] = type_counts['LOCATION_M']
    document['PER_MENTIONS'] = type_counts['PERSON_M']
    document['ORG_ENTITIES'] = type_counts['ORGANIZATION_E']
    document['LOC_ENTITIES'] = type_counts['LOCATION_E']
    document['PER_ENTITIES'] = type_counts['PERSON_E']
    return document
        
# stanford_entities_mentions_and_types(doc)

In [88]:
# Normalize every Stanford-tagged document into the mentions / entities /
# per-type-count structure.
# NOTE(review): the u'' reprs in the outputs suggest Python 2, where the
# comprehension variable `doc` leaks and rebinds the notebook-level `doc`
# to the last document -- confirm, or rename the loop variable.
stanford_parsed_docs = [stanford_entities_mentions_and_types(doc) for doc in stanford_docs]
# stanford_parsed_docs

In [89]:
# Spot-check the normalized result for the first Stanford document.
stanford_parsed_docs[0]

{'LOC_ENTITIES': 3,
 'LOC_MENTIONS': 3,
 'ORG_ENTITIES': 2,
 'ORG_MENTIONS': 2,
 'PER_ENTITIES': 6,
 'PER_MENTIONS': 7,
 'entities': defaultdict(<function __main__.<lambda>>,
             {'{"surface-form": "Andy Wilson", "type": "PERSON"}': {'counts': 1,
               'mentions': [{'end': 261, 'start': 260}]},
              '{"surface-form": "Bromsgrove", "type": "LOCATION"}': {'counts': 1,
               'mentions': [{'end': 229, 'start': 229}]},
              '{"surface-form": "Carney", "type": "PERSON"}': {'counts': 2,
               'mentions': [{'end': 173, 'start': 173},
                {'end': 290, 'start': 290}]},
              '{"surface-form": "Dave", "type": "PERSON"}': {'counts': 1,
               'mentions': [{'end': 59, 'start': 59}]},
              '{"surface-form": "Derek Hardman", "type": "PERSON"}': {'counts': 1,
               'mentions': [{'end': 248, 'start': 247}]},
              '{"surface-form": "Droitwich", "type": "PERSON"}': {'counts': 1,
               'me

## Wikifier

In [73]:
# Load the Wikifier-tagged documents from the JSONL file (50 requested).
# NOTE(review): from_file_get_n_docs is a project helper not shown here;
# presumably it returns a list with one parsed document per JSONL line -- confirm.
wikifier_docs = from_file_get_n_docs('./50_tagged_by_wikifier.jsonl', 50)

In [74]:
# Inspect the first Wikifier document. The displayed output shows a list of
# annotation dicts (e.g. 'title', 'dbPediaTypes', and 'support' entries that
# carry 'wFrom' / 'wTo').
# Note: this rebinds `doc`, which previously held a Stanford document.
doc = wikifier_docs[0]
doc

[{u'cosine': 0.1873502906125954,
  u'dbPediaIri': u'http://dbpedia.org/resource/Salute',
  u'dbPediaTypes': [],
  u'lang': u'en',
  u'pageRank': 0.0005204443788780796,
  u'secLang': u'en',
  u'secTitle': u'Salute',
  u'secUrl': u'http://en.wikipedia.org/wiki/Salute',
  u'support': [{u'pMentionGivenSurface': 0.005611672278338945,
    u'pageRank': 5.642965647183741e-05,
    u'wFrom': 1,
    u'wTo': 1}],
  u'supportLen': 1,
  u'title': u'Salute',
  u'url': u'http://en.wikipedia.org/wiki/Salute',
  u'wikiDataClasses': [],
  u'wikiDataItemId': u'Q858893'},
 {u'cosine': 0.2378448458365748,
  u'dbPediaIri': u'http://dbpedia.org/resource/Worcester',
  u'dbPediaTypes': [u'City', u'Settlement', u'PopulatedPlace', u'Place'],
  u'lang': u'en',
  u'pageRank': 0.003229281907652416,
  u'secLang': u'en',
  u'secTitle': u'Worcester',
  u'secUrl': u'http://en.wikipedia.org/wiki/Worcester',
  u'support': [{u'pMentionGivenSurface': 0.01518987341772152,
    u'pageRank': 0.0001527457941764267,
    u'wFrom':

In [None]:
def wikifier_entities_mentions_and_types(doc):
    