# Entity Recognition from Parsed Data

This notebook is an extension of `entity_recognition_prototype.ipynb` that enables the use of the parsed data provided by the original authors of the paper.

Please see the other notebook for documentation and an explanation of the methods.

In [1]:
import os
import sys
# Put the parent directory on the import path so that `preproc` can be imported.
preproc_path = os.path.abspath(os.pardir)
print(preproc_path)
if preproc_path not in sys.path:
    sys.path.append(preproc_path)

/Users/andrew/Documents/college/cs8803-css/replication-project/code/src


In [2]:
import json

# One pre-parsed article from the data supplied by the paper's authors.
parsed_data_path = '../../data/data-from-authors/train_data/AFP_ENG_20090609.0525.json'

# Open the file in a context manager so the handle is closed as soon as the
# JSON has been read, instead of lingering until garbage collection.
with open(parsed_data_path, 'r') as parsed_file:
    parsed_data = json.load(parsed_file)

# Show which top-level fields the parsed document provides.
parsed_data.keys()

dict_keys(['docId', 'headline', 'named_entity', 'cluster_json', 'text', 'part_of_speech', 'totalParse', 'processed'])

- `docId`
- `headline`
- `named_entity` named entities and occurrences
- `cluster_json` co-reference chains
- `text` tokens by sentence
- `part_of_speech`
- `totalParse`
- `processed`

## Transforming annotation format

First, we'll do our best to transform the given parsed data into a skeleton of the token annotation used by Stanford CoreNLP's API.

In [3]:
# Skeleton of the CoreNLP annotation layout: one dict per sentence holding a
# list of token dicts. Sentences are numbered from 0, tokens from 1.
sentences = [
    {
        'index': sent_no,
        'tokens': [
            {'index': tok_no, 'originalText': word}
            for tok_no, word in enumerate(words, start=1)
        ],
    }
    for sent_no, words in enumerate(parsed_data['text'])
]

sentences

[{'index': 0,
  'tokens': [{'index': 1, 'originalText': 'EU'},
   {'index': 2, 'originalText': 'seeks'},
   {'index': 3, 'originalText': 'common'},
   {'index': 4, 'originalText': 'strategy'},
   {'index': 5, 'originalText': 'on'},
   {'index': 6, 'originalText': 'swine'},
   {'index': 7, 'originalText': 'flu'},
   {'index': 8, 'originalText': 'vaccine'},
   {'index': 9, 'originalText': '.'}]},
 {'index': 1,
  'tokens': [{'index': 1, 'originalText': 'EU'},
   {'index': 2, 'originalText': 'Health'},
   {'index': 3, 'originalText': 'Commissioner'},
   {'index': 4, 'originalText': 'Androulla'},
   {'index': 5, 'originalText': 'Vassiliou'},
   {'index': 6, 'originalText': 'on'},
   {'index': 7, 'originalText': 'Tuesday'},
   {'index': 8, 'originalText': 'called'},
   {'index': 9, 'originalText': 'for'},
   {'index': 10, 'originalText': 'a'},
   {'index': 11, 'originalText': 'common'},
   {'index': 12, 'originalText': 'European'},
   {'index': 13, 'originalText': 'strategy'},
   {'index': 1

## Named Entity Recognition (`named_entity` field)

Create a mapping of all recognized named entities to their occurrences in the text. Aliases and acronyms are automatically merged (using `wikidata`). `PERSON` entities are also merged using the last-name heuristic.

In [4]:
# Entity surface text -> list of [sentence number, entity type, start token, end token]
# occurrences, as supplied in the parsed data. The positions appear to be 1-based
# with an inclusive end, judging by the index conversion applied when they are
# loaded into the extractor.
p_entities = parsed_data['named_entity']
p_entities

{'Americas': [[10, 'LOCATION', 41, 41], [11, 'LOCATION', 23, 23]],
 'Androulla Vassiliou': [[2, 'PERSON', 4, 5]],
 'EU': [[1, 'ORGANIZATION', 1, 1], [3, 'ORGANIZATION', 36, 36]],
 'EU Health': [[2, 'ORGANIZATION', 1, 2]],
 'European': [[2, 'MISC', 12, 12], [12, 'MISC', 19, 19], [14, 'MISC', 3, 3]],
 'German': [[12, 'MISC', 1, 1]],
 'Luxembourg': [[3, 'LOCATION', 40, 40], [12, 'LOCATION', 11, 11]],
 'Mexico': [[11, 'LOCATION', 15, 15]],
 'Schmidt': [[14, 'PERSON', 1, 1]],
 'Ulla Schmidt': [[12, 'PERSON', 4, 5]],
 'Vassiliou': [[6, 'PERSON', 33, 33]],
 'WHO': [[8, 'MISC', 26, 26], [15, 'ORGANIZATION', 4, 4]],
 'World Health Organisation': [[6, 'ORGANIZATION', 5, 7]],
 'day': [[5, 'TIME', 13, 13]]}

In [5]:
import importlib
from preproc import entity_extractor

In [6]:
# Development aid: re-execute the already-imported module so that local edits to
# it take effect without restarting the kernel. The trailing `;` hides the output.
importlib.reload(entity_extractor);

In [7]:
# Fresh extractor instance; it accumulates the entity ids (`ee.ids`) and their
# occurrences (`ee.occurances`) inspected in the following cells.
ee = entity_extractor.EntityExtractor()

In [8]:
named_types = entity_extractor.NAMED_TYPES

# Register every occurrence whose entity type is one of the named types;
# occurrences of any other type are skipped.
for entity, occurances in p_entities.items():
    for sent_idx, e_type, start_idx, end_idx in occurances:
        if e_type not in named_types:
            continue
        # Change to indexing from 0 with an exclusive end: sentence and start
        # shift down by one, while the inclusive end is already the exclusive one.
        ee.add_occurance((e_type, entity), sent_idx - 1, start_idx - 1, end_idx)

In [9]:
# Display the (entity type, surface text) -> resolved id mapping as a plain dict.
dict(ee.ids)

{('LOCATION', 'Americas'): ('wikidata', 'Q828'),
 ('LOCATION', 'Luxembourg'): ('wikidata', 'Q32'),
 ('LOCATION', 'Mexico'): ('wikidata', 'Q96'),
 ('MISC', 'European'): ('wikidata', 'Q1286'),
 ('MISC', 'German'): ('wikidata', 'Q188'),
 ('MISC', 'WHO'): ('wikidata', 'Q7817'),
 ('ORGANIZATION', 'EU'): ('wikidata', 'Q458'),
 ('ORGANIZATION', 'EU Health'): ('wikidata', 'Q40901196'),
 ('ORGANIZATION', 'WHO'): ('wikidata', 'Q7817'),
 ('ORGANIZATION', 'World Health Organisation'): ('wikidata', 'Q7817'),
 ('PERSON', 'Androulla Vassiliou'): ('wikidata', 'Q262719'),
 ('PERSON', 'Schmidt'): ('wikidata', 'Q15240355'),
 ('PERSON', 'Ulla Schmidt'): ('wikidata', 'Q61307'),
 ('PERSON', 'Vassiliou'): ('wikidata', 'Q21450206')}

In [10]:
# Resolved id -> set of (sentence, start, end) spans, using the 0-based,
# exclusive-end positions produced by the conversion above.
ee.occurances

{('wikidata', 'Q1286'): {(1, 11, 12), (11, 18, 19), (13, 2, 3)},
 ('wikidata', 'Q15240355'): {(13, 0, 1)},
 ('wikidata', 'Q188'): {(11, 0, 1)},
 ('wikidata', 'Q21450206'): {(5, 32, 33)},
 ('wikidata', 'Q262719'): {(1, 3, 5)},
 ('wikidata', 'Q32'): {(2, 39, 40), (11, 10, 11)},
 ('wikidata', 'Q40901196'): {(1, 0, 2)},
 ('wikidata', 'Q458'): {(0, 0, 1), (2, 35, 36)},
 ('wikidata', 'Q61307'): {(11, 3, 5)},
 ('wikidata', 'Q7817'): {(5, 4, 7), (7, 25, 26), (14, 3, 4)},
 ('wikidata', 'Q828'): {(9, 40, 41), (10, 22, 23)},
 ('wikidata', 'Q96'): {(10, 14, 15)}}

### Last name merging for `PERSON` types

In [11]:
# Number of distinct entity ids before merging people by last name.
len(ee.occurances)

12

In [12]:
# Collapse PERSON entities that share a last name onto a single id.
# `ee` is modified in place.
entity_extractor.merge_people_by_last_name(ee)

In [13]:
# Count again to confirm the merge reduced the number of distinct entity ids.
len(ee.occurances)

10

In [14]:
# Re-inspect the key -> id mapping; PERSON keys that were merged now share an id.
dict(ee.ids)

{('LOCATION', 'Americas'): ('wikidata', 'Q828'),
 ('LOCATION', 'Luxembourg'): ('wikidata', 'Q32'),
 ('LOCATION', 'Mexico'): ('wikidata', 'Q96'),
 ('MISC', 'European'): ('wikidata', 'Q1286'),
 ('MISC', 'German'): ('wikidata', 'Q188'),
 ('MISC', 'WHO'): ('wikidata', 'Q7817'),
 ('ORGANIZATION', 'EU'): ('wikidata', 'Q458'),
 ('ORGANIZATION', 'EU Health'): ('wikidata', 'Q40901196'),
 ('ORGANIZATION', 'WHO'): ('wikidata', 'Q7817'),
 ('ORGANIZATION', 'World Health Organisation'): ('wikidata', 'Q7817'),
 ('PERSON', 'Androulla Vassiliou'): ('wikidata', 'Q21450206'),
 ('PERSON', 'Schmidt'): ('wikidata', 'Q15240355'),
 ('PERSON', 'Ulla Schmidt'): ('wikidata', 'Q15240355'),
 ('PERSON', 'Vassiliou'): ('wikidata', 'Q21450206')}

## Annotating Tokens with Entities

In [15]:
from preproc import annotate

In [16]:
# Development aid: re-execute the already-imported module so that local edits to
# it take effect without restarting the kernel. The trailing `;` hides the output.
importlib.reload(annotate);

In [17]:
# Tag the tokens in `sentences` with an `entity_id` taken from the extractor.
# `sentences` is modified in place; the next cell reads the new key.
annotate.mark_entities(sentences, ee)

In [18]:
# List every token that carries an entity id: (sentence, token) position, text, id.
for sentence in sentences:
    tagged = (tok for tok in sentence['tokens'] if 'entity_id' in tok)
    for tok in tagged:
        print((sentence['index'], tok['index']), tok['originalText'], tok['entity_id'])

(0, 1) EU ('wikidata', 'Q458')
(1, 1) EU ('wikidata', 'Q40901196')
(1, 2) Health ('wikidata', 'Q40901196')
(1, 4) Androulla ('wikidata', 'Q21450206')
(1, 5) Vassiliou ('wikidata', 'Q21450206')
(1, 12) European ('wikidata', 'Q1286')
(2, 36) EU ('wikidata', 'Q458')
(2, 40) Luxembourg ('wikidata', 'Q32')
(5, 5) World ('wikidata', 'Q7817')
(5, 6) Health ('wikidata', 'Q7817')
(5, 7) Organisation ('wikidata', 'Q7817')
(5, 33) Vassiliou ('wikidata', 'Q21450206')
(7, 26) WHO ('wikidata', 'Q7817')
(9, 41) Americas ('wikidata', 'Q828')
(10, 15) Mexico ('wikidata', 'Q96')
(10, 23) Americas ('wikidata', 'Q828')
(11, 1) German ('wikidata', 'Q188')
(11, 4) Ulla ('wikidata', 'Q15240355')
(11, 5) Schmidt ('wikidata', 'Q15240355')
(11, 11) Luxembourg ('wikidata', 'Q32')
(11, 19) European ('wikidata', 'Q1286')
(13, 1) Schmidt ('wikidata', 'Q15240355')
(13, 3) European ('wikidata', 'Q1286')
(14, 4) WHO ('wikidata', 'Q7817')


## Identifying and Annotating Entity Mentions with Co-reference Chains

In [19]:
# Raw co-reference clusters: a list of chains, each a list of mention dicts with
# the keys `sent_ind`, `token_ind`, `end_ind`, `mention_type` and `word`.
parsed_data['cluster_json']

[[{'end_ind': 2,
   'mention_type': 'PROPER',
   'sent_ind': 1,
   'token_ind': 1,
   'word': 'EU'},
  {'end_ind': 3,
   'mention_type': 'PROPER',
   'sent_ind': 2,
   'token_ind': 1,
   'word': 'EU Health'},
  {'end_ind': 37,
   'mention_type': 'PROPER',
   'sent_ind': 3,
   'token_ind': 36,
   'word': 'EU'}],
 [{'end_ind': 8,
   'mention_type': 'PROPER',
   'sent_ind': 2,
   'token_ind': 1,
   'word': 'EU Health Commissioner Androulla Vassiliou on Tuesday'},
  {'end_ind': 34,
   'mention_type': 'PROPER',
   'sent_ind': 6,
   'token_ind': 33,
   'word': 'Vassiliou'}],
 [{'end_ind': 22,
   'mention_type': 'NOMINAL',
   'sent_ind': 2,
   'token_ind': 18,
   'word': 'a swine flu vaccine'},
  {'end_ind': 24,
   'mention_type': 'NOMINAL',
   'sent_ind': 3,
   'token_ind': 22,
   'word': 'the vaccine'},
  {'end_ind': 11,
   'mention_type': 'NOMINAL',
   'sent_ind': 5,
   'token_ind': 9,
   'word': 'the vaccine'}],
 [{'end_ind': 31,
   'mention_type': 'PRONOMINAL',
   'sent_ind': 3,
   'toke

Remap the keys to match the coreference output from Stanford CoreNLP.

In [20]:
# Rename each mention's position keys to the CoreNLP names
# (sent_ind -> sentNum, token_ind -> startIndex, end_ind -> endIndex).
# The values are copied unchanged; other mention fields are dropped.
coref_chains = [
    [
        dict(
            sentNum=mention['sent_ind'],
            startIndex=mention['token_ind'],
            endIndex=mention['end_ind'],
        )
        for mention in cluster
    ]
    for cluster in parsed_data['cluster_json']
]
coref_chains

[[{'endIndex': 2, 'sentNum': 1, 'startIndex': 1},
  {'endIndex': 3, 'sentNum': 2, 'startIndex': 1},
  {'endIndex': 37, 'sentNum': 3, 'startIndex': 36}],
 [{'endIndex': 8, 'sentNum': 2, 'startIndex': 1},
  {'endIndex': 34, 'sentNum': 6, 'startIndex': 33}],
 [{'endIndex': 22, 'sentNum': 2, 'startIndex': 18},
  {'endIndex': 24, 'sentNum': 3, 'startIndex': 22},
  {'endIndex': 11, 'sentNum': 5, 'startIndex': 9}],
 [{'endIndex': 31, 'sentNum': 3, 'startIndex': 30},
  {'endIndex': 3, 'sentNum': 3, 'startIndex': 2},
  {'endIndex': 27, 'sentNum': 3, 'startIndex': 26},
  {'endIndex': 32, 'sentNum': 2, 'startIndex': 29}],
 [{'endIndex': 12, 'sentNum': 12, 'startIndex': 11},
  {'endIndex': 41, 'sentNum': 3, 'startIndex': 40}],
 [{'endIndex': 29, 'sentNum': 6, 'startIndex': 28},
  {'endIndex': 3, 'sentNum': 4, 'startIndex': 2},
  {'endIndex': 3, 'sentNum': 5, 'startIndex': 2},
  {'endIndex': 23, 'sentNum': 4, 'startIndex': 22},
  {'endIndex': 29, 'sentNum': 3, 'startIndex': 28},
  {'endIndex': 10, 

In [21]:
# Extend the `entity_id` tags on `sentences` using the co-reference chains.
# `sentences` is modified in place; compare the listing printed next with the
# earlier one to see which tokens were added.
annotate.mark_coref_mentions(sentences, coref_chains)

In [22]:
# Same listing as before, now including tokens tagged through co-reference.
for sentence in sentences:
    for token in sentence['tokens']:
        if 'entity_id' not in token:
            continue
        position = (sentence['index'], token['index'])
        print(position, token['originalText'], token['entity_id'])

(0, 1) EU ('wikidata', 'Q458')
(1, 1) EU ('wikidata', 'Q40901196')
(1, 2) Health ('wikidata', 'Q40901196')
(1, 4) Androulla ('wikidata', 'Q21450206')
(1, 5) Vassiliou ('wikidata', 'Q21450206')
(1, 12) European ('wikidata', 'Q1286')
(2, 33) a ('wikidata', 'Q458')
(2, 34) meeting ('wikidata', 'Q458')
(2, 35) of ('wikidata', 'Q458')
(2, 36) EU ('wikidata', 'Q458')
(2, 37) health ('wikidata', 'Q458')
(2, 38) ministers ('wikidata', 'Q458')
(2, 40) Luxembourg ('wikidata', 'Q32')
(5, 5) World ('wikidata', 'Q7817')
(5, 6) Health ('wikidata', 'Q7817')
(5, 7) Organisation ('wikidata', 'Q7817')
(5, 33) Vassiliou ('wikidata', 'Q21450206')
(7, 24) April ('wikidata', 'Q7817')
(7, 26) WHO ('wikidata', 'Q7817')
(9, 40) the ('wikidata', 'Q828')
(9, 41) Americas ('wikidata', 'Q828')
(10, 15) Mexico ('wikidata', 'Q96')
(10, 22) the ('wikidata', 'Q828')
(10, 23) Americas ('wikidata', 'Q828')
(11, 1) German ('wikidata', 'Q188')
(11, 4) Ulla ('wikidata', 'Q15240355')
(11, 5) Schmidt ('wikidata', 'Q15240355'