# Testing e2e Library Implementations

## From MPQA Data

In [1]:
from pycorenlp import StanfordCoreNLP
# Client handle pointing at a CoreNLP server on localhost, port 9000.
corenlp_url = 'http://localhost:9000'
nlp = StanfordCoreNLP(corenlp_url)

In [2]:
import importlib
import annotate, entity_extractor, util

In [3]:
# Reload the project modules (same order as they were imported) so that
# on-disk edits are picked up without restarting the kernel.
for _module in (annotate, entity_extractor, util):
    importlib.reload(_module)

In [4]:
# Read one raw MPQA document and send it to the CoreNLP server for
# named-entity recognition and coreference resolution.
test_file = '../data/database.mpqa.3.0/docs/20010926/23.17.57-23406'
# Context manager closes the file handle as soon as the text has been read
# (the bare open(...).read() form left it open until garbage collection).
with open(test_file, 'r') as doc_file:
    text = doc_file.read()
output = nlp.annotate(text, properties={
    'annotators': 'ner,coref',
    'outputFormat': 'json'
})
# Keep the two parts of the JSON response that the following cells consume.
sentences = output['sentences']
coref_chains = output['corefs'].values()

In [5]:
# Build the entity extractor from the CoreNLP sentences loaded in the previous cell.
ee = entity_extractor.EntityExtractor.from_sentences(sentences)
# NOTE(review): judging by its name, this merges person entities that share a
# last name — confirm in entity_extractor.
entity_extractor.merge_people_by_last_name(ee)
# Return values are discarded, so these calls presumably annotate `sentences`
# in place — TODO confirm in annotate. The next cell prints every token that
# carries an 'entity_id' key.
annotate.mark_entities(sentences, ee)
annotate.mark_coref_mentions(sentences, coref_chains)

In [6]:
# List every token that carries an entity id as:
# (sentence index, token index) original-text entity-id
for sent in sentences:
    for tok in sent['tokens']:
        if 'entity_id' not in tok:
            continue
        position = (sent['index'], tok['index'])
        print(position, tok['originalText'], tok['entity_id'])

(0, 1) TAIPEI ('wikidata', 'Q1867')
(0, 6) AFP ('wikidata', 'Q40464')
(0, 9) Taiwan ('wikidata', 'Q22502')
(0, 11) Chen ('wikidata', 'Q22368')
(0, 12) Shui-bian ('wikidata', 'Q22368')
(0, 16) Taipei ('wikidata', 'Q1867')
(0, 17) 's ('wikidata', 'Q1867')
(0, 22) United ('wikidata', 'Q30')
(0, 23) States ('wikidata', 'Q30')
(0, 25) Washington ('wikidata', 'Q61')
(0, 31) Afghanistan ('wikidata', 'Q889')
(1, 11) Republic ('wikidata', 'Q865')
(1, 12) of ('wikidata', 'Q865')
(1, 13) China ('wikidata', 'Q865')
(1, 15) Taiwan ('wikidata', 'Q22502')
(1, 16) 's ('wikidata', 'Q22502')
(1, 26) our ('wikidata', 'Q22502')
(1, 31) George ('wikidata', 'Q42295')
(1, 32) W. ('wikidata', 'Q42295')
(1, 33) Bush ('wikidata', 'Q42295')
(1, 45) Chen ('wikidata', 'Q804988')
(1, 49) Oregon ('wikidata', 'Q824')
(1, 51) John ('wikidata', 'Q740345')
(1, 52) Kitzhaber ('wikidata', 'Q740345')
(2, 1) Taiwan ('wikidata', 'Q22502')
(2, 20) US ('wikidata', 'Q30')
(2, 31) Chen ('wikidata', 'Q804988')
(3, 3) ROC ('wikida

## From Authors' Parsed Data

In [7]:
import json
# Load one pre-parsed document and convert it to the sentence / coref-chain
# structures used by the annotation cells.
parsed_data_path = '../data/data-from-authors/train_data/AFP_ENG_20090609.0525.json'
# Context manager closes the file handle once the JSON has been parsed
# (json.load(open(...)) left it open until garbage collection).
with open(parsed_data_path, 'r') as parsed_file:
    parsed_data = json.load(parsed_file)
sentences = util.sentences_from_parsed(parsed_data['text'])
coref_chains = util.chains_from_parsed(parsed_data['cluster_json'])

In [8]:
# Build the entity extractor from the 'named_entity' field of the parsed JSON document.
ee = entity_extractor.EntityExtractor.from_parsed_data(parsed_data['named_entity'])
# NOTE(review): judging by its name, this merges person entities that share a
# last name — confirm in entity_extractor.
entity_extractor.merge_people_by_last_name(ee)
# Return values are discarded, so these calls presumably annotate `sentences`
# in place — TODO confirm in annotate. The next cell prints every token that
# carries an 'entity_id' key.
annotate.mark_entities(sentences, ee)
annotate.mark_coref_mentions(sentences, coref_chains)

In [9]:
# Lazily pair each sentence with those of its tokens that carry an entity id,
# then print one line per pair:
# (sentence index, token index) original-text entity-id
tagged_tokens = (
    (sent, tok)
    for sent in sentences
    for tok in sent['tokens']
    if 'entity_id' in tok
)
for sent, tok in tagged_tokens:
    print((sent['index'], tok['index']), tok['originalText'], tok['entity_id'])

(0, 1) EU ('wikidata', 'Q458')
(1, 1) EU ('wikidata', 'Q40901196')
(1, 2) Health ('wikidata', 'Q40901196')
(1, 4) Androulla ('wikidata', 'Q21450206')
(1, 5) Vassiliou ('wikidata', 'Q21450206')
(1, 12) European ('wikidata', 'Q1286')
(2, 33) a ('wikidata', 'Q458')
(2, 34) meeting ('wikidata', 'Q458')
(2, 35) of ('wikidata', 'Q458')
(2, 36) EU ('wikidata', 'Q458')
(2, 37) health ('wikidata', 'Q458')
(2, 38) ministers ('wikidata', 'Q458')
(2, 40) Luxembourg ('wikidata', 'Q32')
(5, 5) World ('wikidata', 'Q7817')
(5, 6) Health ('wikidata', 'Q7817')
(5, 7) Organisation ('wikidata', 'Q7817')
(5, 33) Vassiliou ('wikidata', 'Q21450206')
(7, 24) April ('wikidata', 'Q7817')
(7, 26) WHO ('wikidata', 'Q7817')
(9, 40) the ('wikidata', 'Q828')
(9, 41) Americas ('wikidata', 'Q828')
(10, 15) Mexico ('wikidata', 'Q96')
(10, 22) the ('wikidata', 'Q828')
(10, 23) Americas ('wikidata', 'Q828')
(11, 1) German ('wikidata', 'Q188')
(11, 4) Ulla ('wikidata', 'Q15240355')
(11, 5) Schmidt ('wikidata', 'Q15240355'