In [138]:
from json_extractor import from_file_get_n_docs
from sample_data import n_samples
from pprint import PrettyPrinter
from similarity import similar
import copy
import json

In [34]:
# Document count passed to every loader call in this notebook
# (n_samples and from_file_get_n_docs).
NUMBER_OF_DOCS = 5

# Raw Docs

In [35]:
# Raw sample documents from the project helper sample_data.n_samples.
# Presumably returns NUMBER_OF_DOCS items — TODO confirm. Items are later read
# as raw_docs[doc_index]['content'].
raw_docs = n_samples(NUMBER_OF_DOCS)

# Tagged Docs

In [36]:
# JSONL files with the raw output of each tagger/linker.
# NOTE: the same three names are rebound to the *parsed* files in a later cell,
# so the loader cells that use them are order-dependent.
STANFORD_FILENAME = '100_tagged_by_stanford.jsonl'
WIKIFIER_FILENAME = '100_tagged_by_wikifier.jsonl'
TAGME_FILENAME    = '100_tagged_by_tagme_longtext_0_epsilon_dot1_includecategories_includeallspots.jsonl'

In [37]:
# Stanford tagger output. Must run while STANFORD_FILENAME still names the
# tagged file (the name is rebound to the parsed file further down).
stanford_tagged_docs = from_file_get_n_docs(STANFORD_FILENAME, NUMBER_OF_DOCS)

In [38]:
# Wikifier output. Must run while WIKIFIER_FILENAME still names the tagged
# file (the name is rebound to the parsed file further down).
wikifier_tagged_docs = from_file_get_n_docs(WIKIFIER_FILENAME, NUMBER_OF_DOCS)

In [39]:
# Tagme output. Must run while TAGME_FILENAME still names the tagged file
# (the name is rebound to the parsed file further down).
tagme_tagged_docs    = from_file_get_n_docs(TAGME_FILENAME, NUMBER_OF_DOCS)

# Parsed Tagged Docs

In [40]:
# JSONL files with the parsed form of each tagger's output.
# NOTE: this rebinds the three names that previously pointed at the tagged
# files. Re-running a "tagged" loader cell after this one would read the
# parsed files instead, so the cells must be executed top to bottom.
STANFORD_FILENAME = '100_parsed_from_stanford.jsonl'
WIKIFIER_FILENAME = '100_parsed_from_wikifier.jsonl'
TAGME_FILENAME    = '100_parsed_from_tagme.jsonl'

In [41]:
# Parsed Stanford documents; relies on STANFORD_FILENAME having been rebound
# to the parsed file in the preceding cell.
stanford_parsed_docs = from_file_get_n_docs(STANFORD_FILENAME, NUMBER_OF_DOCS)

In [42]:
# Parsed Wikifier documents; relies on WIKIFIER_FILENAME having been rebound
# to the parsed file in the preceding cell.
wikifier_parsed_docs = from_file_get_n_docs(WIKIFIER_FILENAME, NUMBER_OF_DOCS)

In [43]:
# Parsed Tagme documents; relies on TAGME_FILENAME having been rebound to the
# parsed file in the preceding cell.
tagme_parsed_docs    = from_file_get_n_docs(TAGME_FILENAME, NUMBER_OF_DOCS)

# Processing Steps Dictionary

In [45]:
# Lookup table over every processing step. The same document lists are
# reachable two ways:
#   docs[<tagger>][<stage>]   and   docs[<stage>][<tagger>]
# with <tagger> in 'stanford' | 'wikifier' | 'tagme' and <stage> in
# 'tagged' | 'parsed'. The untouched input lives under docs['raw'].
docs = {'raw': raw_docs}
docs.update(
    stanford=dict(tagged=stanford_tagged_docs, parsed=stanford_parsed_docs),
    wikifier=dict(tagged=wikifier_tagged_docs, parsed=wikifier_parsed_docs),
    tagme=dict(tagged=tagme_tagged_docs, parsed=tagme_parsed_docs),
)
# Build the stage-first view from the tagger-first entries so both views
# point at the very same list objects.
docs.update({
    stage: {name: docs[name][stage] for name in ('stanford', 'wikifier', 'tagme')}
    for stage in ('tagged', 'parsed')
})

# Helper Functions

In [68]:
def describe(d, levels=1, lists=1):
    """Summarise the structure of a nested dict/list value for display.

    Parameters
    ----------
    d : any value. Dicts and lists are traversed; anything else is a leaf.
    levels : number of further dict levels to expand below the current dict.
        Once it is 0 (or less), the values of the current dict are replaced
        by their type only.
    lists : maximum number of leading elements kept from each list.

    Returns
    -------
    dict  -> new dict with the same keys and described (or type-only) values
    list  -> new list with at most ``lists`` described elements
    other -> the tuple ``(type(d), d)``

    Notes
    -----
    ``lists`` is forwarded on every recursive call, so the limit applies to
    nested lists as well as to the outermost one. Elements of a list are
    described with the default ``levels`` again: the depth budget is not
    carried through a list.
    """
    if isinstance(d, dict):
        if levels > 0:
            return {k: describe(v, levels - 1, lists) for k, v in d.items()}
        return {k: type(v) for k, v in d.items()}
    if isinstance(d, list):
        # Slicing an empty list yields [], so no emptiness check is needed.
        return [describe(v, lists=lists) for v in d[:lists]]
    return (type(d), d)


pp = PrettyPrinter()

# View Tagger Pipeline

In [187]:
# Tagger whose pipeline is inspected below. It must be a key of `docs` that
# holds 'tagged' and 'parsed' entries: 'stanford', 'wikifier' or 'tagme'.
tagger = 'tagme'

In [188]:
# Structure of the raw tagger output: first document only (lists=1).
pp.pprint(describe(docs[tagger]['tagged'], 1, 1))

[{u'annotations': [{u'dbpedia_categories': [(<type 'unicode'>,
                                             u'Military personnel')],
                    u'end': (<type 'int'>, 8),
                    u'id': (<type 'int'>, 327806),
                    u'link_probability': (<type 'unicode'>, u'0.01097'),
                    u'rho': (<type 'unicode'>, u'0.04836'),
                    u'spot': (<type 'unicode'>, u'VETERANS'),
                    u'start': (<type 'int'>, 0),
                    u'title': (<type 'unicode'>, u'Veteran')}],
  u'api': (<type 'unicode'>, u'tag'),
  u'lang': (<type 'unicode'>, u'en'),
  u'test': (<type 'unicode'>, u'5'),
  u'time': (<type 'int'>, 17310),
  u'timestamp': (<type 'unicode'>, u'2016-07-29T13:04:10')}]


In [154]:
# Structure of the parsed output: first document only (lists=1).
# NOTE: levels=0 only reaches the outer list. describe() falls back to its
# default depth for list elements, which is why the 'entities' dict is still
# expanded one level in the output.
pp.pprint(describe(docs[tagger]['parsed'], 0, 1))

[{u'entities': {u'{"surface_form": "10am", "type": "LOCATION"}': <type 'list'>,
                u'{"surface_form": "10am", "type": "ORGANIZATION"}': <type 'list'>,
                u'{"surface_form": "10am", "type": "PERSON"}': <type 'list'>,
                u'{"surface_form": "Andy Wilson", "type": "LOCATION"}': <type 'list'>,
                u'{"surface_form": "Andy Wilson", "type": "ORGANIZATION"}': <type 'list'>,
                u'{"surface_form": "Andy Wilson", "type": "PERSON"}': <type 'list'>,
                u'{"surface_form": "Bromsgrove", "type": "LOCATION"}': <type 'list'>,
                u'{"surface_form": "Bromsgrove", "type": "ORGANIZATION"}': <type 'list'>,
                u'{"surface_form": "Bromsgrove", "type": "PERSON"}': <type 'list'>,
                u'{"surface_form": "Carney", "type": "LOCATION"}': <type 'list'>,
                u'{"surface_form": "Carney", "type": "ORGANIZATION"}': <type 'list'>,
                u'{"surface_form": "Carney", "type": "PERSON"}': <t

In [103]:
# Pick one entity record (the third in the dict's iteration order) from the
# first parsed document and show its structure.
# list(d.values())[2] selects the same object as d[d.keys()[2]] but also works
# where keys()/values() are views rather than lists (Python 3).
example = list(docs[tagger]['parsed'][0]['entities'].values())[2]
pp.pprint(describe(example, 3, 5))

{u'counts': (<type 'int'>, 1),
 u'mentions': [{u'end': (<type 'int'>, 231), u'start': (<type 'int'>, 231)}]}


# Parsed Doc

In [105]:
# Parsed documents of the tagger selected above; used by the cells that follow.
parsed_docs = docs[tagger]['parsed']

In [106]:
# Position of the document inspected in the rest of the notebook; it indexes
# both parsed_docs and raw_docs.
doc_index = 0

In [107]:
# The single parsed document under inspection.
doc = parsed_docs[doc_index]

## Entities

In [108]:
# Display only: entity records of the selected document. In the output below
# the keys are JSON-encoded strings and the values hold 'counts' and 'mentions'.
doc['entities']

{u'{"surface-form": "Andy Wilson", "type": "PERSON"}': {u'counts': 1,
  u'mentions': [{u'end': 261, u'start': 260}]},
 u'{"surface-form": "Bromsgrove", "type": "LOCATION"}': {u'counts': 1,
  u'mentions': [{u'end': 229, u'start': 229}]},
 u'{"surface-form": "Carney", "type": "PERSON"}': {u'counts': 2,
  u'mentions': [{u'end': 173, u'start': 173}, {u'end': 290, u'start': 290}]},
 u'{"surface-form": "Dave", "type": "PERSON"}': {u'counts': 1,
  u'mentions': [{u'end': 59, u'start': 59}]},
 u'{"surface-form": "Derek Hardman", "type": "PERSON"}': {u'counts': 1,
  u'mentions': [{u'end': 248, u'start': 247}]},
 u'{"surface-form": "Droitwich", "type": "PERSON"}': {u'counts': 1,
  u'mentions': [{u'end': 121, u'start': 121}]},
 u'{"surface-form": "Foregate Street", "type": "LOCATION"}': {u'counts': 1,
  u'mentions': [{u'end': 31, u'start': 30}]},
 u'{"surface-form": "Gloucester", "type": "LOCATION"}': {u'counts': 1,
  u'mentions': [{u'end': 231, u'start': 231}]},
 u'{"surface-form": "Hull", "type"

## Tokens

In [98]:
# Display only: token list of the selected document.
# NOTE(review): this echoes the whole list; consider slicing if it grows.
doc['tokens']

[u'VETERANS',
 u'saluted',
 u'Worcester',
 u'breakfast',
 u'club',
 u'won',
 u'hearts,',
 u'minds',
 u'HM',
 u'Forces',
 u'met',
 u'Postal',
 u'Order',
 u'Foregate',
 u'Street',
 u'10am',
 u'socialise',
 u'eat',
 u'hunger',
 u'loneliness',
 u'marching',
 u'orders',
 u'Dave',
 u'Carney',
 u'aged',
 u'Hill',
 u'set',
 u'up',
 u'clubs',
 u'country',
 u'you',
 u'can',
 u'picture',
 u'we',
 u'good',
 u'Five',
 u'out',
 u'saw',
 u'article',
 u'newspaper',
 u'turned',
 u'old',
 u'chap',
 u'travel',
 u'Droitwich',
 u'parade',
 u'hours',
 u'generated',
 u'lot',
 u'interest',
 u'estimate',
 u'who',
 u'next',
 u'month',
 u'meeting',
 u'will',
 u'people',
 u'Onwards',
 u'and',
 u'upwards',
 u'management',
 u'pub',
 u'hospitable',
 u'bent',
 u'backwards',
 u'us',
 u'looked',
 u'after',
 u'well',
 u'the',
 u'best',
 u'best',
 u'choice',
 u'choice',
 u'of',
 u'venue',
 u'reserved',
 u'armed',
 u'forces',
 u'Promoted',
 u'stories',
 u'reserve',
 u'veteran',
 u'Royal',
 u'Engineers',
 u'wanted',
 u'go'

## Tagged Words

In [99]:
# Display only: annotations of the selected document, keyed by position as a
# string (u'0', u'1', u'10', ... in the output below).
doc['tagged-words']

{u'0': {u'dbpedia_categories': [u'Military personnel',
   u"Military veterans' affairs",
   u'Aftermath of war'],
  u'end': 0,
  u'id': 327806,
  u'link_probability': u'0.01097',
  u'rho': u'0.04836',
  u'spot': u'VETERANS',
  u'start': 0,
  u'title': u'Veteran'},
 u'1': {u'dbpedia_categories': [u'Military life',
   u'Hand gestures',
   u'Greetings'],
  u'end': 1,
  u'id': 28977,
  u'link_probability': u'0.00269',
  u'rho': u'0.03156',
  u'spot': u'saluted',
  u'start': 1,
  u'title': u'Salute'},
 u'10': {u'dbpedia_categories': [u'Metropolitan Police',
   u'Police forces of London',
   u'Organizations established in 1829',
   u'1829 establishments in England'],
  u'end': 10,
  u'id': 192450,
  u'link_probability': u'0.00127',
  u'rho': u'0.05313',
  u'spot': u'met',
  u'start': 10,
  u'title': u'Metropolitan Police Service'},
 u'100': {u'dbpedia_categories': [u'Group processes'],
  u'end': 101,
  u'id': 648520,
  u'link_probability': u'0.01211',
  u'rho': u'0.01286',
  u'spot': u'one u

# Visualize tagged words together to identify differences between taggers and linkers

### PROBLEM: Common list of tokens is missing
Stanford, Wikifier and Tagme all use different tokenizers, so token positions from one tool do not line up with those from another.

    In the case of Wikifier, I get back the list of words tokenized from the document. Tagme does not return such a list; it only returns the recognized 'spots'.
    

### Partial Solution: Map token lists
    I am currently looking for a way to map two different token lists onto each other: for each token in one list, find its corresponding token in the other list, accepting a match when Python's difflib.SequenceMatcher(None, a, b).ratio() > 0.6.
    
### Up next: Identifying mistakes made by linkers and taggers.
    By visualizing all the different tags assigned by the taggers together, I can see their main differences and systematically build a set of statistics based on this.
   
### So far: the only pattern I can identify is that entity linkers quite commonly tag groups of words that are only part of the name of a larger, long-tail organization.
    
##### Example
**Worcester Breakfast Club for HM Forces Veterans** is not tagged by the linkers.

   

In [118]:
# Text of the selected raw document, truncated by the [:5000] slice
# (assumes 'content' is a text string — TODO confirm).
content = raw_docs[doc_index]['content'][:5000]

In [116]:
def merge_dicts(x, y):
    """Return a new dict holding the entries of ``x`` overlaid with those of ``y``.

    Neither argument is modified. The copy is shallow, and on duplicate keys
    the value from ``y`` wins.
    """
    merged = x.copy()
    merged.update(y)
    return merged


# Index the selected document's tagged words by integer position and attach
# the highlight colour. The tagged-word dict is the second argument to
# merge_dicts, so a 'color' key inside it would override 'tomato'.
with_color = {int(k): merge_dicts({'color': 'tomato'}, v) for k, v in parsed_docs[doc_index]['tagged-words'].items()}

In [131]:
# Entry used for every position that carries no tag.
no_match = {'color': 'white'}

# Tagged words of the selected document; keys are positions as strings.
tagged_words = parsed_docs[doc_index]['tagged-words']


def tag_word(i):
    """Return the display entry for position ``i``.

    A position with a non-empty tagged-word record gets its coloured entry
    from ``with_color``; any other position gets a fresh copy of ``no_match``.
    """
    if tagged_words.get(str(i), None):
        return with_color[i]
    return copy.deepcopy(no_match)


# One single-element list per whitespace-separated word of ``content``.
for_display = [[tag_word(i)] for i in range(len(content.split()))]

In [132]:
# Serialize the display structure to JSON without escaping non-ASCII
# characters, then encode the result as UTF-8.
# NOTE(review): on Python 2, json.dumps(..., ensure_ascii=False) may return a
# byte str instead of unicode depending on the input; .encode() would then do
# an implicit ASCII decode first — confirm all strings in for_display are unicode.
utf_8_display = json.dumps(for_display, ensure_ascii=False).encode('utf-8')

In [133]:
# Display only: echo the serialized payload.
utf_8_display

'[[{"color": "white"}], [{"color": "white"}], [{"color": "white"}], [{"color": "white"}], [{"color": "white"}], [{"color": "white"}], [{"color": "white"}], [{"color": "white"}], [{"color": "white"}], [{"color": "white"}], [{"color": "white"}], [{"color": "white"}], [{"color": "white"}], [{"color": "white"}], [{"color": "white"}], [{"color": "white"}], [{"color": "white"}], [{"color": "tomato", "surface-form": "Worcester Breakfast Club for HM Forces Veterans", "type": "ORGANIZATION", "end": 23, "start": 17}], [{"color": "tomato", "surface-form": "Worcester Breakfast Club for HM Forces Veterans", "type": "ORGANIZATION", "end": 23, "start": 17}], [{"color": "tomato", "surface-form": "Worcester Breakfast Club for HM Forces Veterans", "type": "ORGANIZATION", "end": 23, "start": 17}], [{"color": "tomato", "surface-form": "Worcester Breakfast Club for HM Forces Veterans", "type": "ORGANIZATION", "end": 23, "start": 17}], [{"color": "tomato", "surface-form": "Worcester Breakfast Club for HM Fo

In [143]:
def inverse_scoring(a_i, b_i):
    """Turn the distance between two positions into a score.

    Equal positions score 1.0; otherwise the score is 1 / (distance + 1),
    so it shrinks as the positions move apart.
    """
    distance = abs(a_i - b_i)
    return 1.0 / float(distance + 1.0)

# change a_index for a relative position in its list
def elements_similarity(a_i, a, a_len, b_i, b, b_len):
    """Score how well element ``a`` (at index ``a_i``) matches ``b`` (at ``b_i``).

    Each index is divided by the length of its list, so positions in lists of
    different lengths are compared on a relative scale.

    Returns the tuple ``(total, text_similarity, a_i, a, b_i, b)``, where
    ``total`` is the positional score plus ``similar(a, b, None)``. The score
    comes first so that such tuples can be ranked with ``max``.

    Raises ZeroDivisionError if ``a_len`` or ``b_len`` is 0.
    """
    positional = inverse_scoring(float(a_i) / float(a_len), float(b_i) / float(b_len))
    # similar() is the project helper imported from `similarity`; evaluate it
    # once and reuse the value rather than calling it twice per pair.
    # NOTE(review): assumes similar() returns the same value for the same
    # arguments — confirm.
    text_similarity = similar(a, b, None)
    return (positional + text_similarity, text_similarity, a_i, a, b_i, b)

def similar_to_a_in_list(a_index, a_val, a_list, l):
    """Find the element of ``l`` that best matches ``a_val``.

    Parameters
    ----------
    a_index : index of ``a_val`` within ``a_list``.
    a_val : the element to match.
    a_list : the list ``a_val`` comes from (only its length is used).
    l : the candidate list to search.

    Returns the largest tuple produced by ``elements_similarity`` over all
    elements of ``l``. Tuples are compared field by field, so ties on the
    total score are broken by the following fields in order.

    Raises ValueError if ``l`` is empty.
    """
    a_len = len(a_list)
    b_len = len(l)
    if b_len == 0:
        # max() would raise ValueError anyway; say what actually went wrong.
        raise ValueError('cannot match %r against an empty list' % (a_val,))
    # A generator expression replaces the tuple-unpacking lambda, which is
    # Python 2-only syntax (removed by PEP 3113).
    return max(
        elements_similarity(a_index, a_val, a_len, b_i, b, b_len)
        for b_i, b in enumerate(l)
    )

def map_list_a_to_b(a, b):
    """Match every element of ``a`` to its best counterpart in ``b``.

    Returns a list with one entry per element of ``a``, in order; each entry
    is the tuple returned by ``similar_to_a_in_list`` for that element.
    """
    matches = []
    for position, token in enumerate(a):
        matches.append(similar_to_a_in_list(position, token, a, b))
    return matches

In [146]:
# Align the Wikifier token list of the selected document with the Stanford one.
wikifier_parsed = docs['wikifier']['parsed'][doc_index]
stanford_parsed = docs['stanford']['parsed'][doc_index]
mapping = map_list_a_to_b(
    wikifier_parsed['tokens'],
    stanford_parsed['tokens']
)
# Each match tuple is (total, similarity, a_i, a, b_i, b); keep only
# Wikifier index -> Stanford index.
mapping_dict = {a_i: b_i for (_total, _sim, a_i, _a, b_i, _b) in mapping}

In [147]:
# Display only: inspect the index mapping. Entries whose key and value differ
# (82 -> 102 in the output below) are tokens whose best match sits at a
# different index in the other list.
mapping_dict

{0: 0,
 1: 1,
 2: 2,
 3: 3,
 4: 4,
 5: 5,
 6: 6,
 7: 7,
 8: 8,
 9: 9,
 10: 10,
 11: 11,
 12: 12,
 13: 13,
 14: 14,
 15: 15,
 16: 16,
 17: 17,
 18: 18,
 19: 19,
 20: 20,
 21: 21,
 22: 22,
 23: 23,
 24: 24,
 25: 25,
 26: 26,
 27: 27,
 28: 28,
 29: 29,
 30: 30,
 31: 31,
 32: 32,
 33: 33,
 34: 34,
 35: 35,
 36: 36,
 37: 37,
 38: 38,
 39: 39,
 40: 40,
 41: 41,
 42: 42,
 43: 43,
 44: 44,
 45: 45,
 46: 46,
 47: 47,
 48: 48,
 49: 49,
 50: 50,
 51: 51,
 52: 52,
 53: 53,
 54: 54,
 55: 55,
 56: 56,
 57: 57,
 58: 58,
 59: 59,
 60: 60,
 61: 61,
 62: 62,
 63: 63,
 64: 64,
 65: 65,
 66: 66,
 67: 67,
 68: 68,
 69: 69,
 70: 70,
 71: 71,
 72: 72,
 73: 73,
 74: 74,
 75: 75,
 76: 76,
 77: 77,
 78: 78,
 79: 79,
 80: 80,
 81: 81,
 82: 102,
 83: 83,
 84: 84,
 85: 85,
 86: 86,
 87: 87,
 88: 88,
 89: 89,
 90: 90,
 91: 91,
 92: 92,
 93: 93,
 94: 94,
 95: 95,
 96: 96,
 97: 97,
 98: 98,
 99: 99,
 100: 100,
 101: 101,
 102: 102,
 103: 103,
 104: 104,
 105: 105,
 106: 106,
 107: 107,
 108: 108,
 109: 109,
 110: 110

In [149]:
# Same construction as with_color, for the Wikifier document: tagged words
# indexed by integer position, with 'royalblue' as the highlight colour
# (a 'color' key inside a tagged-word dict would override it).
with_color_wiki = {int(k): merge_dicts({'color': 'royalblue'}, v) for k, v in wikifier_parsed['tagged-words'].items()}