In [50]:
try:
    __IPYTHON__
    USING_IPYTHON = True
    %load_ext autoreload
    %autoreload 2
except NameError:
    USING_IPYTHON = False

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


#### Argparse

In [595]:
import argparse
ap = argparse.ArgumentParser()
ap.add_argument('project_root', help='')
ap.add_argument('--mrp-data-dir', default='data', help='')
ap.add_argument('--graphviz-sub-dir', default='visualization/graphviz', help='')
ap.add_argument('--train-sub-dir', default='training', help='')
ap.add_argument('--companion-sub-dir', default='companion')
ap.add_argument('--jamr-alignment-file', default='jamr.mrp')


ap.add_argument('--mrp-file-extension', default='.mrp')
ap.add_argument('--companion-file-extension', default='.conllu')
ap.add_argument('--graphviz-file-template', default='http://localhost:8000/files/proj29_ds1/home/slai/mrp2019/visualization/graphviz/{}/{}.mrp/{}.png')
arg_string = """
    /data/proj29_ds1/home/slai/mrp2019
"""
arguments = [arg for arg_line in arg_string.split(r'\\n') for arg in arg_line.split()]

In [596]:
if USING_IPYTHON:
    args = ap.parse_args(arguments)
else:
    args = ap.parse_args()

In [597]:
args

Namespace(companion_file_extension='.conllu', companion_sub_dir='companion', graphviz_file_template='http://localhost:8000/files/proj29_ds1/home/slai/mrp2019/visualization/graphviz/{}/{}.mrp/{}.png', graphviz_sub_dir='visualization/graphviz', jamr_alignment_file='jamr.mrp', mrp_data_dir='data', mrp_file_extension='.mrp', project_root='/data/proj29_ds1/home/slai/mrp2019', train_sub_dir='training')

#### Library imports

In [572]:
import json
import logging
import os
import pprint
import re
import string
from collections import Counter, defaultdict, deque

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import plot_util
from preprocessing import CompanionParseDataset, MrpDataset, JamrAlignmentDataset
from action_state import mrp_json2parser_states, _generate_parser_action_states
                           
from tqdm import tqdm

#### ipython notebook specific imports

In [55]:
if USING_IPYTHON:
    # matplotlib config
    %matplotlib inline

matplotlib.pyplot - DEBUG - Loaded backend module://ipykernel.pylab.backend_inline version unknown.


In [56]:
sh = logging.StreamHandler()
formatter = logging.Formatter('%(name)s - %(levelname)s - %(message)s')
sh.setFormatter(formatter)
logging.basicConfig(level=logging.DEBUG, handlers=[sh])
logger = logging.getLogger(__name__)  # pylint: disable=invalid-name
logger.setLevel(logging.INFO)

### Constants

In [57]:
UNKWOWN = 'UNKWOWN'

### Load data

In [58]:
train_dir = os.path.join(args.project_root, args.mrp_data_dir, args.train_sub_dir)

In [59]:
mrp_dataset = MrpDataset()

In [60]:
frameworks, framework2dataset2mrp_jsons = mrp_dataset.load_mrp_json_dir(
    train_dir, args.mrp_file_extension)

frameworks:   0%|          | 0/5 [00:00<?, ?it/s]
dataset_name:   0%|          | 0/2 [00:00<?, ?it/s][A
dataset_name:  50%|█████     | 1/2 [00:00<00:00,  4.36it/s][A
frameworks:  20%|██        | 1/5 [00:00<00:01,  2.28it/s]s][A
dataset_name:   0%|          | 0/1 [00:00<?, ?it/s][A
frameworks:  40%|████      | 2/5 [00:06<00:06,  2.12s/it]t][A
dataset_name:   0%|          | 0/1 [00:00<?, ?it/s][A
frameworks:  60%|██████    | 3/5 [00:09<00:04,  2.30s/it]t][A
dataset_name:   0%|          | 0/1 [00:00<?, ?it/s][A
frameworks:  80%|████████  | 4/5 [00:16<00:03,  3.83s/it]t][A
dataset_name:   0%|          | 0/14 [00:00<?, ?it/s][A
dataset_name:  43%|████▎     | 6/14 [00:00<00:00, 21.86it/s][A
dataset_name:  57%|█████▋    | 8/14 [00:00<00:00, 18.64it/s][A
dataset_name:  71%|███████▏  | 10/14 [00:01<00:00,  6.36it/s][A
dataset_name:  79%|███████▊  | 11/14 [00:01<00:00,  5.48it/s][A
frameworks: 100%|██████████| 5/5 [00:18<00:00,  3.14s/it]t/s][A


### Data Preprocessing companion

In [61]:
companion_dir = os.path.join(args.project_root, args.mrp_data_dir, args.companion_sub_dir)

In [62]:
cparse_dataset = CompanionParseDataset()

In [63]:
dataset2cid2parse = cparse_dataset.load_companion_parse_dir(companion_dir, args.companion_file_extension)

preprocessing - INFO - framework amr found
dataset: 100%|██████████| 13/13 [00:01<00:00,  9.44it/s]
preprocessing - INFO - framework dm found
dataset: 100%|██████████| 5/5 [00:06<00:00,  1.31s/it]
preprocessing - INFO - framework ucca found
dataset: 100%|██████████| 6/6 [00:00<00:00, 37.65it/s]


In [64]:
dataset2cid2parse.keys()

dict_keys(['amr-guidelines', 'bolt', 'cctv', 'dfa', 'dfb', 'fables', 'lorelei', 'mt09sdl', 'proxy', 'rte', 'wb', 'wiki', 'xinhua', 'wsj', 'ewt'])

In [65]:
# Some data is missing
'20003001' in dataset2cid2parse['wsj']

False

### Load JAMR alignment data

In [66]:
jalignment_dataset = JamrAlignmentDataset()

In [67]:
cid2alignment = jalignment_dataset.load_jamr_alignment_file(os.path.join(
    args.project_root,
    args.mrp_data_dir,
    args.companion_sub_dir,
    args.jamr_alignment_file
))

### Define the state at each step

In [68]:
for framework in framework2dataset2mrp_jsons:
    logger.info(framework)
    logger.info(list(framework2dataset2mrp_jsons[framework].keys()))

__main__ - INFO - ucca
__main__ - INFO - ['wiki', 'ewt']
__main__ - INFO - psd
__main__ - INFO - ['wsj']
__main__ - INFO - eds
__main__ - INFO - ['wsj']
__main__ - INFO - dm
__main__ - INFO - ['wsj']
__main__ - INFO - amr
__main__ - INFO - ['xinhua', 'wsj', 'wiki', 'wb', 'rte', 'proxy', 'mt09sdl', 'lorelei', 'fables', 'dfb', 'dfa', 'cctv', 'bolt', 'amr-guidelines']


### Test module

In [758]:
from action_state import mrp_json2parser_states, _generate_parser_action_states

In [785]:
from action_state import sentence_spliter

In [821]:
framework, dataset = [
    ('dm', 'wsj'),
    ('psd', 'wsj'),
    ('eds', 'wsj'),
    ('ucca', 'wiki'),
    ('amr', 'wsj'),
    ('amr', 'wiki'),
][0]

mrp_jsons = framework2dataset2mrp_jsons[framework][dataset]
framework, dataset

('dm', 'wsj')

In [822]:
mrp_json = mrp_jsons[0]

In [823]:
mrp_json = [mrp_json for mrp_json in mrp_jsons if mrp_json.get('id') == '20209013'][0]

In [824]:
mrp_json['input']

'A similar technique is almost impossible to apply to other crops, such as cotton, soybeans and rice.'

In [825]:
alignment = {}
if framework == 'amr':
    cid = mrp_json.get('id', '')
    alignment = cid2alignment[cid]

In [826]:
logger.info(args.graphviz_file_template.format(
    framework, dataset, mrp_json.get('id')))

__main__ - INFO - http://localhost:8000/files/proj29_ds1/home/slai/mrp2019/visualization/graphviz/dm/wsj.mrp/20209013.png


In [827]:
parser_states, meta_data = mrp_json2parser_states(mrp_json, framework, alignment)

In [812]:
(
    doc,
    nodes,
    node_id2node,
    edge_id2edge,
    top_oriented_edges,
    token_nodes,
    abstract_node_id_set,
    parent_id2indegree,
    parent_id2child_id_set,
    child_id2parent_id_set,
    child_id2edge_id_set,
    parent_id2edge_id_set,
    token_node_id_set,
    actions,
    anchor2token_id,
) = meta_data

In [813]:
actions

[(1, None),
 (1, None),
 (2, 1),
 (2, 2),
 (1, None),
 (2, 2),
 (1, None),
 (1, None),
 (1, None),
 (1, None),
 (2, 1),
 (1, None),
 (1, None),
 (1, None),
 (2, 1),
 (1, None),
 (2, 1),
 (1, None),
 (1, None),
 (2, 1),
 (2, 3),
 (1, None),
 (2, 3),
 (2, 2),
 (2, 3),
 (2, 3),
 (2, 2),
 (2, 2)]

In [814]:
parser_node_id_set = set()
parser_edge_id_set = set()
for (node_id, actions, edge_state, abstract_node_state, 
     complete_node_state, node_state, token_stack, pending_token_stack) in parser_states:
    parser_node_id_set.add(node_id)
    for edge_id in edge_state:
        parser_edge_id_set.add(edge_id)
    
    node = node_id2node[node_id]
    node_edges = [edge_id2edge[edge_id] for edge_id in edge_state]
    pprint.pprint((
        node.get('id'),
        actions, 
        node.get('label'), 
        [edge.get('label') for edge in node_edges], 
#         abstract_node_state,
        complete_node_state,
        node_state,
        token_stack,
        pending_token_stack,
    ))
    
print({node.get('id', -1) for node in mrp_json.get('nodes')} - parser_node_id_set)
assert len(parser_node_id_set) == len(mrp_json.get('nodes'))
print({edge_id for edge_id, edge in enumerate(mrp_json.get('edges'))} - parser_edge_id_set)
assert len(parser_edge_id_set) == len(mrp_json.get('edges'))

(0, [], '_a_q', [], [], [], [], [])
(1, [(1, None)], '_similar_a_to', [], [], [(1, 1, None)], [1], [])
(2,
 [(1, None), (2, 1)],
 'comp',
 ['ARG1'],
 [1],
 [(1, 1, None), (2, 2, [(2, 2, None)])],
 [1, 2],
 [])
(1,
 [(2, 2)],
 '_similar_a_to',
 ['ARG1'],
 [3],
 [(1, 1, [(1, 1, None), (2, 2, [(2, 2, None)])])],
 [1],
 [])
(3,
 [(1, None), (2, 2)],
 '_technique_n_1',
 [],
 [],
 [(3, 3, [(1, 1, [(1, 1, None), (2, 2, [(2, 2, None)])]), (3, 3, None)])],
 [3],
 [])
(3,
 [],
 '_technique_n_1',
 [],
 [],
 [(3, 3, [(1, 1, [(1, 1, None), (2, 2, [(2, 2, None)])]), (3, 3, None)])],
 [3],
 [])
(4,
 [(1, None)],
 '_almost_a_1',
 [],
 [],
 [(3, 3, [(1, 1, [(1, 1, None), (2, 2, [(2, 2, None)])]), (3, 3, None)]),
  (4, 4, None)],
 [3, 4],
 [])
(5,
 [(1, None)],
 '_impossible_a_for',
 ['ARG1'],
 [],
 [(3, 3, [(1, 1, [(1, 1, None), (2, 2, [(2, 2, None)])]), (3, 3, None)]),
  (4, 4, None),
  (5, 5, None)],
 [3, 4, 5],
 [])
(6,
 [(1, None)],
 '_apply_v_to',
 ['ARG1'],
 [],
 [(3, 3, [(1, 1, [(1, 1, None), (2

AssertionError: 

In [709]:
action_states = [s[1] for s in parser_states]

In [696]:
token_poss = []
prev_token_pos = 0
for token in sentence_spliter(doc):
    token_poss.append((prev_token_pos, prev_token_pos + len(token)))
    prev_token_pos += len(token) + 1

In [697]:
list(sentence_spliter(doc))

['Pierre',
 'Vinken',
 ',',
 '61',
 'years',
 'old',
 ',',
 'will',
 'join',
 'the',
 'board',
 'as',
 'a',
 'nonexecutive',
 'director',
 'Nov.',
 '29.']

In [684]:
token_poss

[(0, 6),
 (7, 13),
 (14, 15),
 (16, 18),
 (19, 24),
 (25, 28),
 (29, 30),
 (31, 35),
 (36, 40),
 (41, 44),
 (45, 50),
 (51, 53),
 (54, 55),
 (56, 68),
 (69, 77),
 (78, 82),
 (83, 86)]

In [687]:
[n['anchors'] for n in nodes]

[[{'from': 0, 'to': 6}],
 [{'from': 7, 'to': 13}],
 [{'from': 15, 'to': 17}],
 [{'from': 18, 'to': 23}],
 [{'from': 24, 'to': 27}],
 [{'from': 34, 'to': 38}],
 [{'from': 39, 'to': 42}],
 [{'from': 43, 'to': 48}],
 [{'from': 49, 'to': 51}],
 [{'from': 52, 'to': 53}],
 [{'from': 54, 'to': 66}],
 [{'from': 67, 'to': 75}],
 [{'from': 76, 'to': 80}],
 [{'from': 81, 'to': 83}]]

In [683]:
nodes

[{'id': 0,
  'label': 'Pierre',
  'properties': ['pos', 'frame'],
  'values': ['NNP', 'named:x-c'],
  'anchors': [{'from': 0, 'to': 6}]},
 {'id': 1,
  'label': 'Vinken',
  'properties': ['pos', 'frame'],
  'values': ['NNP', 'named:x-c'],
  'anchors': [{'from': 7, 'to': 13}]},
 {'id': 3,
  'label': '61',
  'properties': ['pos', 'frame'],
  'values': ['CD', 'card:i-i-c'],
  'anchors': [{'from': 15, 'to': 17}]},
 {'id': 4,
  'label': 'year',
  'properties': ['pos', 'frame'],
  'values': ['NNS', 'n:x'],
  'anchors': [{'from': 18, 'to': 23}]},
 {'id': 5,
  'label': 'old',
  'properties': ['pos', 'frame'],
  'values': ['JJ', 'a:e-p'],
  'anchors': [{'from': 24, 'to': 27}]},
 {'id': 8,
  'label': 'join',
  'properties': ['pos', 'frame'],
  'values': ['VB', 'v:e-i-p'],
  'anchors': [{'from': 34, 'to': 38}]},
 {'id': 9,
  'label': 'the',
  'properties': ['pos', 'frame'],
  'values': ['DT', 'q:i-h-h'],
  'anchors': [{'from': 39, 'to': 42}]},
 {'id': 10,
  'label': 'board',
  'properties': ['pos'

In [669]:
nodes = mrp_json['nodes']

In [670]:
nodes

[{'id': 0,
  'label': 'Pierre',
  'properties': ['pos', 'frame'],
  'values': ['NNP', 'named:x-c'],
  'anchors': [{'from': 0, 'to': 6}]},
 {'id': 1,
  'label': 'Vinken',
  'properties': ['pos', 'frame'],
  'values': ['NNP', 'named:x-c'],
  'anchors': [{'from': 7, 'to': 13}]},
 {'id': 3,
  'label': '61',
  'properties': ['pos', 'frame'],
  'values': ['CD', 'card:i-i-c'],
  'anchors': [{'from': 15, 'to': 17}]},
 {'id': 4,
  'label': 'year',
  'properties': ['pos', 'frame'],
  'values': ['NNS', 'n:x'],
  'anchors': [{'from': 18, 'to': 23}]},
 {'id': 5,
  'label': 'old',
  'properties': ['pos', 'frame'],
  'values': ['JJ', 'a:e-p'],
  'anchors': [{'from': 24, 'to': 27}]},
 {'id': 8,
  'label': 'join',
  'properties': ['pos', 'frame'],
  'values': ['VB', 'v:e-i-p'],
  'anchors': [{'from': 34, 'to': 38}]},
 {'id': 9,
  'label': 'the',
  'properties': ['pos', 'frame'],
  'values': ['DT', 'q:i-h-h'],
  'anchors': [{'from': 39, 'to': 42}]},
 {'id': 10,
  'label': 'board',
  'properties': ['pos'

In [671]:
[n['anchors'] for n in nodes]

[[{'from': 0, 'to': 6}],
 [{'from': 7, 'to': 13}],
 [{'from': 15, 'to': 17}],
 [{'from': 18, 'to': 23}],
 [{'from': 24, 'to': 27}],
 [{'from': 34, 'to': 38}],
 [{'from': 39, 'to': 42}],
 [{'from': 43, 'to': 48}],
 [{'from': 49, 'to': 51}],
 [{'from': 52, 'to': 53}],
 [{'from': 54, 'to': 66}],
 [{'from': 67, 'to': 75}],
 [{'from': 76, 'to': 80}],
 [{'from': 81, 'to': 83}]]

In [672]:
APPEND = 1  # APPEND_TO_CURRENT_GROUP
RESOLVE = 2  # RESOLVE_GROUP

In [711]:
actions = []
for action_state in action_states:
    for action in action_state:
        action_type, arg = action
        if action_type == APPEND:
            actions.append(action)

[[(1, None), (2, 1)],
 [(1, None)],
 [(0, None), (1, None), (2, 1)],
 [(1, None), (2, 2)],
 [(1, None), (2, 2)],
 [(2, 3)],
 [],
 [],
 [(0, None), (0, None), (1, None)],
 [(1, None), (2, 1)],
 [(1, None), (2, 2)],
 [],
 [(1, None)],
 [(1, None), (2, 1)],
 [(1, None), (2, 1)],
 [(1, None), (2, 3)],
 [(2, 2)],
 [],
 [(1, None), (2, 1)],
 [(1, None), (2, 2)],
 [(2, 5)],
 []]

In [712]:
len(actions)

14

In [501]:
# for i in range(1000):
#     mrp_json = mrp_jsons[i]
#     parser_states, meta_data = mrp_json2parser_states(mrp_json, framework, alignment)

In [469]:
# count = 0
# for i, mrp_json in enumerate(mrp_jsons):
#     if len(mrp_json.get('tops', [1])) > 1:
# #         print(i)
#         count += 1
# count

In [470]:
# set([len(mrp_json.get('tops', [1])) for mrp_json in mrp_jsons])

In [494]:
logger.setLevel(logging.INFO)

In [525]:
num_pops = []
error_num = 0
for i, mrp_json in tqdm(enumerate(mrp_jsons)):
#     print(i)
    parser_states, meta_data = mrp_json2parser_states(mrp_json, framework, alignment)
    if not parser_states:
        logger.info(i)
        error_num += 1
        continue
    action_states = [s[1] for s in parser_states]
    for action_state in action_states:
        for action in action_state:
            action_type, arg = action
            if action_type == RESOLVE:
                num_pop = arg
                num_pops.append(num_pop)
















0it [00:00, ?it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A














__main__ - INFO - 4
__main__ - INFO - 7
__main__ - INFO - 9















__main__ - INFO - 10
__main__ - INFO - 14















__main__ - INFO - 17
__main__ - INFO - 18
__main__ - INFO - 19















__main__ - INFO - 20
__main__ - INFO - 25















__main__ - INFO - 27
__main__ - INFO - 30















__main__ - INFO - 33















__main__ - INFO - 38
__main__ - INFO - 40
__main__ - INFO - 42















__main__ - INFO - 45
__main__ - INFO - 46
__main__ - INFO - 49















__main__ - INFO - 52
__main__ - INFO - 54
__main__ - INFO - 57
__main__ - INFO - 59















__main__ - INFO - 60
__main__ - INFO - 62
__main__ - INFO - 63
__main__ - INFO - 66















__main__ - INFO - 67
__main__ - INFO - 68
__main__ - INFO - 70
__main__ - INFO - 71
__main__ - INFO - 72
__main__ - INFO - 73
__main__ - INFO - 74
__main__ - INFO - 75
__main__ - INFO - 76






KeyboardInterrupt: 

In [527]:
mrp_json['input']

"South Korea registered a trade deficit of $101 million in October, reflecting the country's economic sluggishness, according to government figures released Wednesday."

In [528]:
action_states

[[(1, None), (2, 1)],
 [(1, None), (2, 1)],
 [(1, None), (2, 1)],
 [(1, None)],
 [(1, None), (2, 1)],
 [(1, None), (2, 1)],
 [(1, None)],
 [(1, None), (2, 1)],
 [(1, None), (2, 1)],
 [(2, 5)],
 [(2, 5)]]

In [418]:
len(mrp_jsons)

1662

In [419]:
sum(num_pops) / len(num_pops)

1.9701025551885973

In [591]:
action_states

[[(1, None), (2, 1)],
 [(1, None), (2, 2)],
 [],
 [(1, None)],
 [(1, None)],
 [(1, None), (2, 1)],
 [(1, None), (2, 2)],
 [],
 [],
 [(1, None), (2, 1)],
 [(1, None), (2, 1)],
 [(1, None), (2, 3)],
 [(2, 3)],
 [(2, 3)],
 []]

In [145]:
error_num

53

In [112]:
len(mrp_jsons)

35656

In [777]:
mrp_json['input']

'Pierre Vinken, 61 years old, will join the board as a nonexecutive director Nov. 29.'

In [771]:
parser_states = _generate_parser_action_states(
    doc,
    nodes,
    # edges,
    top_oriented_edges,
    node_id2node,
    token_nodes,
    abstract_node_id_set,
    child_id2edge_id_set,
    parent_id2indegree,
    parent_id2child_id_set,
    child_id2parent_id_set,
    token_node_id_set,
)

action_state - INFO - ('curr_node_id', 0)
action_state - INFO - (0, [], True, True, True, True)
action_state - INFO - (0, 0, [(0, 0, [(0, 0, None)])])
action_state - INFO - [(1, None), (2, 1)]
action_state - INFO - (0, 4, 1, {5})
action_state - INFO - ('token stack', [0], [])
action_state - INFO - ('visited states', {0}, {0}, set(), {0})
action_state - INFO - ('curr_node_id', 1)
action_state - INFO - (1, [(0, 0, [(0, 0, None)])], False, False, True, True)
action_state - INFO - (1, 1, [(0, 0, [(0, 0, None)]), (1, 1, None)])
action_state - INFO - [(1, None)]
action_state - INFO - (1, 8, 8, {16, 1, 10, 11})
action_state - INFO - ('token stack', [0, 1], [])
action_state - INFO - ('visited states', {0, 1}, {0, 1}, set(), {0})
action_state - INFO - ('curr_node_id', 3)
action_state - INFO - (3, [(0, 0, [(0, 0, None)]), (1, 1, None)], True, True, True, True)
action_state - INFO - (3, 3, [(0, 0, [(0, 0, None)]), (1, 1, None), (3, 3, [(3, 3, None)])])
action_state - INFO - [(1, None), (2, 1)]
ac

action_state - INFO - (12,
 12,
 [(1,
   1,
   [(0, 0, [(0, 0, None)]),
    (1, 1, None),
    (5, 5, [(4, 4, [(3, 3, [(3, 3, None)]), (4, 4, None)]), (5, 5, None)])]),
  (8, 8, None),
  (10, 10, [(9, 9, [(9, 9, None)]), (10, 10, None)]),
  (11, 11, None),
  (12, 12, [(12, 12, None)])])
action_state - INFO - [(1, None), (2, 1)]
action_state - INFO - (12, 2, 14, {13})
action_state - INFO - ('token stack', [1, 8, 10, 11, 12], [])
action_state - INFO - ('visited states',
 {0, 1, 3, 4, 5, 8, 9, 10, 11, 12},
 {0, 1, 3, 4, 5, 8, 9, 10, 11, 12},
 {1, 10, 4, 5},
 {0, 1, 3, 4, 5, 9, 10, 12})
action_state - INFO - ('curr_node_id', 13)
action_state - INFO - (13,
 [(1,
   1,
   [(0, 0, [(0, 0, None)]),
    (1, 1, None),
    (5, 5, [(4, 4, [(3, 3, [(3, 3, None)]), (4, 4, None)]), (5, 5, None)])]),
  (8, 8, None),
  (10, 10, [(9, 9, [(9, 9, None)]), (10, 10, None)]),
  (11, 11, None),
  (12, 12, [(12, 12, None)])],
 True,
 True,
 True,
 True)
action_state - INFO - (13,
 13,
 [(1,
   1,
   [(0, 0, [(0

action_state - INFO - [(2, 5)]
action_state - INFO - ('token stack', [8], [])
action_state - INFO - ('visited states',
 {0, 1, 3, 4, 5, 8, 9, 10, 11, 12, 13, 14, 15, 16},
 {0, 1, 3, 4, 5, 8, 9, 10, 11, 12, 13, 14, 15, 16},
 {1, 4, 5, 8, 10, 11, 14, 16},
 {0, 1, 3, 4, 5, 8, 9, 10, 11, 12, 13, 14, 15, 16})
action_state - INFO - ('curr_node_id', 16)
action_state - INFO - (16,
 [(8,
   8,
   [(1,
     1,
     [(0, 0, [(0, 0, None)]),
      (1, 1, None),
      (5, 5, [(4, 4, [(3, 3, [(3, 3, None)]), (4, 4, None)]), (5, 5, None)])]),
    (8, 8, None),
    (10, 10, [(9, 9, [(9, 9, None)]), (10, 10, None)]),
    (11,
     11,
     [(11, 11, None),
      (14,
       14,
       [(12, 12, [(12, 12, None)]),
        (13, 13, [(13, 13, None)]),
        (14, 14, None)])]),
    (16, 16, [(15, 15, [(15, 15, None)]), (16, 16, None)])])],
 False,
 True,
 False,
 False)
action_state - INFO - (16,
 16,
 [(8,
   8,
   [(1,
     1,
     [(0, 0, [(0, 0, None)]),
      (1, 1, None),
      (5, 5, [(4, 4, [(3, 

[[(1, None), (2, 1)],
 [(1, None)],
 [(1, None), (2, 1)],
 [(1, None), (2, 2)],
 [(1, None), (2, 2)],
 [(2, 3)],
 [],
 [],
 [(1, None)],
 [(1, None), (2, 1)],
 [(1, None), (2, 2)],
 [],
 [(1, None)],
 [(1, None), (2, 1)],
 [(1, None), (2, 1)],
 [(1, None), (2, 3)],
 [(2, 2)],
 [],
 [(1, None), (2, 1)],
 [(1, None), (2, 2)],
 [(2, 5)],
 []]

In [721]:
top_oriented_edges

[{'source': 20, 'target': 1, 'label': 'P', 'id': 0, 'parent': 20, 'child': 1},
 {'source': 25, 'target': 8, 'label': 'E', 'id': 1, 'parent': 25, 'child': 8},
 {'source': 21,
  'target': 24,
  'label': 'H',
  'id': 2,
  'parent': 21,
  'child': 24},
 {'source': 24,
  'target': 25,
  'label': 'A',
  'id': 3,
  'parent': 24,
  'child': 25},
 {'source': 28,
  'target': 18,
  'label': 'E',
  'id': 4,
  'parent': 28,
  'child': 18},
 {'source': 22, 'target': 3, 'label': 'C', 'id': 5, 'parent': 22, 'child': 3},
 {'source': 25,
  'target': 26,
  'label': 'E',
  'id': 6,
  'parent': 25,
  'child': 26},
 {'source': 24,
  'target': 0,
  'label': 'A',
  'properties': ['remote'],
  'values': [True],
  'id': 7,
  'parent': 24,
  'child': 0},
 {'source': 21, 'target': 6, 'label': 'U', 'id': 8, 'parent': 21, 'child': 6},
 {'source': 28,
  'target': 19,
  'label': 'U',
  'id': 9,
  'parent': 28,
  'child': 19},
 {'source': 23, 'target': 5, 'label': 'C', 'id': 10, 'parent': 23, 'child': 5},
 {'source': 

In [722]:
mrp_json['input']

'Bowie moved to Switzerland in 1976, purchasing a chalet in the hills to the north of Lake Geneva.'

In [723]:
parent_id2child_id_set

defaultdict(set,
            {20: {0, 1, 22, 23},
             25: {8, 9, 26},
             21: {6, 20, 24},
             24: {7, 25},
             28: {16, 17, 18, 19, 27},
             22: {2, 3},
             23: {4, 5},
             27: {13, 14, 15},
             26: {10, 11, 12, 28}})

In [724]:
child_id2parent_id_set

defaultdict(set,
            {1: {20},
             8: {25},
             24: {21},
             25: {24},
             18: {28},
             3: {22},
             26: {25},
             6: {21},
             19: {28},
             5: {23},
             9: {25},
             16: {28},
             15: {27},
             22: {20},
             28: {26},
             10: {26},
             20: {21},
             12: {26},
             13: {27},
             4: {23},
             27: {28},
             11: {26},
             17: {28},
             23: {20},
             0: {20},
             14: {27},
             2: {22},
             7: {24}})

In [725]:
child_id2parent_id_set

defaultdict(set,
            {1: {20},
             8: {25},
             24: {21},
             25: {24},
             18: {28},
             3: {22},
             26: {25},
             6: {21},
             19: {28},
             5: {23},
             9: {25},
             16: {28},
             15: {27},
             22: {20},
             28: {26},
             10: {26},
             20: {21},
             12: {26},
             13: {27},
             4: {23},
             27: {28},
             11: {26},
             17: {28},
             23: {20},
             0: {20},
             14: {27},
             2: {22},
             7: {24}})

In [726]:
top_oriented_edges

[{'source': 20, 'target': 1, 'label': 'P', 'id': 0, 'parent': 20, 'child': 1},
 {'source': 25, 'target': 8, 'label': 'E', 'id': 1, 'parent': 25, 'child': 8},
 {'source': 21,
  'target': 24,
  'label': 'H',
  'id': 2,
  'parent': 21,
  'child': 24},
 {'source': 24,
  'target': 25,
  'label': 'A',
  'id': 3,
  'parent': 24,
  'child': 25},
 {'source': 28,
  'target': 18,
  'label': 'E',
  'id': 4,
  'parent': 28,
  'child': 18},
 {'source': 22, 'target': 3, 'label': 'C', 'id': 5, 'parent': 22, 'child': 3},
 {'source': 25,
  'target': 26,
  'label': 'E',
  'id': 6,
  'parent': 25,
  'child': 26},
 {'source': 24,
  'target': 0,
  'label': 'A',
  'properties': ['remote'],
  'values': [True],
  'id': 7,
  'parent': 24,
  'child': 0},
 {'source': 21, 'target': 6, 'label': 'U', 'id': 8, 'parent': 21, 'child': 6},
 {'source': 28,
  'target': 19,
  'label': 'U',
  'id': 9,
  'parent': 28,
  'child': 19},
 {'source': 23, 'target': 5, 'label': 'C', 'id': 10, 'parent': 23, 'child': 5},
 {'source': 

In [727]:
node_id2node

{0: {'id': 0, 'anchors': [{'from': 0, 'to': 5}], 'label': 'Bowie'},
 1: {'id': 1, 'anchors': [{'from': 6, 'to': 11}], 'label': 'moved'},
 2: {'id': 2, 'anchors': [{'from': 12, 'to': 14}], 'label': 'to'},
 3: {'id': 3, 'anchors': [{'from': 15, 'to': 26}], 'label': 'Switzerland'},
 4: {'id': 4, 'anchors': [{'from': 27, 'to': 29}], 'label': 'in'},
 5: {'id': 5, 'anchors': [{'from': 30, 'to': 34}], 'label': '1976'},
 6: {'id': 6, 'anchors': [{'from': 34, 'to': 35}], 'label': ','},
 7: {'id': 7, 'anchors': [{'from': 36, 'to': 46}], 'label': 'purchasing'},
 8: {'id': 8, 'anchors': [{'from': 47, 'to': 48}], 'label': 'a'},
 9: {'id': 9, 'anchors': [{'from': 49, 'to': 55}], 'label': 'chalet'},
 10: {'id': 10, 'anchors': [{'from': 56, 'to': 58}], 'label': 'in'},
 11: {'id': 11, 'anchors': [{'from': 59, 'to': 62}], 'label': 'the'},
 12: {'id': 12, 'anchors': [{'from': 63, 'to': 68}], 'label': 'hills'},
 13: {'id': 13, 'anchors': [{'from': 69, 'to': 71}], 'label': 'to'},
 14: {'id': 14, 'anchors':

In [728]:
token_nodes

[{'id': 0, 'anchors': [{'from': 0, 'to': 5}], 'label': 'Bowie'},
 {'id': 1, 'anchors': [{'from': 6, 'to': 11}], 'label': 'moved'},
 {'id': 2, 'anchors': [{'from': 12, 'to': 14}], 'label': 'to'},
 {'id': 3, 'anchors': [{'from': 15, 'to': 26}], 'label': 'Switzerland'},
 {'id': 4, 'anchors': [{'from': 27, 'to': 29}], 'label': 'in'},
 {'id': 5, 'anchors': [{'from': 30, 'to': 34}], 'label': '1976'},
 {'id': 6, 'anchors': [{'from': 34, 'to': 35}], 'label': ','},
 {'id': 7, 'anchors': [{'from': 36, 'to': 46}], 'label': 'purchasing'},
 {'id': 8, 'anchors': [{'from': 47, 'to': 48}], 'label': 'a'},
 {'id': 9, 'anchors': [{'from': 49, 'to': 55}], 'label': 'chalet'},
 {'id': 10, 'anchors': [{'from': 56, 'to': 58}], 'label': 'in'},
 {'id': 11, 'anchors': [{'from': 59, 'to': 62}], 'label': 'the'},
 {'id': 12, 'anchors': [{'from': 63, 'to': 68}], 'label': 'hills'},
 {'id': 13, 'anchors': [{'from': 69, 'to': 71}], 'label': 'to'},
 {'id': 14, 'anchors': [{'from': 72, 'to': 75}], 'label': 'the'},
 {'id'

In [729]:
edge_id2edge

{0: {'source': 20, 'target': 1, 'label': 'P', 'id': 0},
 1: {'source': 25, 'target': 8, 'label': 'E', 'id': 1},
 2: {'source': 21, 'target': 24, 'label': 'H', 'id': 2},
 3: {'source': 24, 'target': 25, 'label': 'A', 'id': 3},
 4: {'source': 28, 'target': 18, 'label': 'E', 'id': 4},
 5: {'source': 22, 'target': 3, 'label': 'C', 'id': 5},
 6: {'source': 25, 'target': 26, 'label': 'E', 'id': 6},
 7: {'source': 24,
  'target': 0,
  'label': 'A',
  'properties': ['remote'],
  'values': [True],
  'id': 7},
 8: {'source': 21, 'target': 6, 'label': 'U', 'id': 8},
 9: {'source': 28, 'target': 19, 'label': 'U', 'id': 9},
 10: {'source': 23, 'target': 5, 'label': 'C', 'id': 10},
 11: {'source': 25, 'target': 9, 'label': 'C', 'id': 11},
 12: {'source': 28, 'target': 16, 'label': 'R', 'id': 12},
 13: {'source': 27, 'target': 15, 'label': 'C', 'id': 13},
 14: {'source': 20, 'target': 22, 'label': 'A', 'id': 14},
 15: {'source': 26, 'target': 28, 'label': 'E', 'id': 15},
 16: {'source': 26, 'target':

In [730]:
for parent_id, child_id_set in sorted(parent_id2child_id_set.items()):
    print(
        (parent_id, node_id2node[parent_id].get('label')),
        [(child_id, node_id2node[child_id].get('label')) for child_id in child_id_set]
    )

(20, None) [(0, 'Bowie'), (1, 'moved'), (22, None), (23, None)]
(21, None) [(24, None), (20, None), (6, ',')]
(22, None) [(2, 'to'), (3, 'Switzerland')]
(23, None) [(4, 'in'), (5, '1976')]
(24, None) [(25, None), (7, 'purchasing')]
(25, None) [(8, 'a'), (9, 'chalet'), (26, None)]
(26, None) [(10, 'in'), (11, 'the'), (28, None), (12, 'hills')]
(27, None) [(13, 'to'), (14, 'the'), (15, 'north')]
(28, None) [(16, 'of'), (17, 'Lake'), (18, 'Geneva'), (19, '.'), (27, None)]


In [731]:
for child_id, parent_id_set in sorted(child_id2parent_id_set.items()):
    print(
        (child_id, node_id2node[child_id].get('label')),
        [(parent_id, node_id2node[parent_id].get('label')) for parent_id in parent_id_set]
    )

(0, 'Bowie') [(20, None)]
(1, 'moved') [(20, None)]
(2, 'to') [(22, None)]
(3, 'Switzerland') [(22, None)]
(4, 'in') [(23, None)]
(5, '1976') [(23, None)]
(6, ',') [(21, None)]
(7, 'purchasing') [(24, None)]
(8, 'a') [(25, None)]
(9, 'chalet') [(25, None)]
(10, 'in') [(26, None)]
(11, 'the') [(26, None)]
(12, 'hills') [(26, None)]
(13, 'to') [(27, None)]
(14, 'the') [(27, None)]
(15, 'north') [(27, None)]
(16, 'of') [(28, None)]
(17, 'Lake') [(28, None)]
(18, 'Geneva') [(28, None)]
(19, '.') [(28, None)]
(20, None) [(21, None)]
(22, None) [(20, None)]
(23, None) [(20, None)]
(24, None) [(21, None)]
(25, None) [(24, None)]
(26, None) [(25, None)]
(27, None) [(28, None)]
(28, None) [(26, None)]


In [732]:
for parent_id, indegree in sorted(parent_id2indegree.items()):
    print((parent_id, node_id2node[parent_id].get('label'), indegree))

(20, None, 4)
(21, None, 3)
(22, None, 2)
(23, None, 2)
(24, None, 2)
(25, None, 3)
(26, None, 4)
(27, None, 3)
(28, None, 5)


In [733]:
# mrp_json.get('edges')[17]

In [734]:
parser_node_id_set = set()
parser_edge_id_set = set()
for (node_id, actions, edge_state, abstract_node_state, 
     complete_node_state, node_state, token_stack, pending_token_stack) in parser_states:
    parser_node_id_set.add(node_id)
    for edge_id in edge_state:
        parser_edge_id_set.add(edge_id)
    
    node = node_id2node[node_id]
    node_edges = [edge_id2edge[edge_id] for edge_id in edge_state]
    pprint.pprint((
        node.get('id'),
        actions, 
        node.get('label'), 
        [edge.get('label') for edge in node_edges], 
#         abstract_node_state,
        complete_node_state,
        node_state,
        token_stack,
        pending_token_stack,
    ))
    
print({node.get('id', -1) for node in mrp_json.get('nodes')} - parser_node_id_set)
assert len(parser_node_id_set) == len(mrp_json.get('nodes'))
print({edge_id for edge_id, edge in enumerate(mrp_json.get('edges'))} - parser_edge_id_set)
assert len(parser_edge_id_set) == len(mrp_json.get('edges'))

(0, [(1, None), (2, 1)], 'Bowie', [], [], [(0, 0, [(0, 0, None)])], [0], [])
(1,
 [(1, None), (2, 1)],
 'moved',
 [],
 [],
 [(0, 0, [(0, 0, None)]), (1, 1, [(1, 1, None)])],
 [0, 1],
 [])
(2,
 [(1, None), (2, 1)],
 'to',
 [],
 [],
 [(0, 0, [(0, 0, None)]), (1, 1, [(1, 1, None)]), (2, 2, [(2, 2, None)])],
 [0, 1, 2],
 [])
(3,
 [(1, None), (2, 1)],
 'Switzerland',
 ['C'],
 [22],
 [(0, 0, [(0, 0, None)]),
  (1, 1, [(1, 1, None)]),
  (2, 2, [(2, 2, None)]),
  (3, 3, [(3, 3, None)])],
 [0, 1, 2, 3],
 [])
(22,
 [(1, None), (2, 3)],
 None,
 [],
 [],
 [(0, 0, [(0, 0, None)]),
  (1, 1, [(1, 1, None)]),
  (22, 22, [(2, 2, [(2, 2, None)]), (3, 3, [(3, 3, None)]), (22, 22, None)])],
 [0, 1, 22],
 [])
(4,
 [(1, None), (2, 1)],
 'in',
 [],
 [],
 [(0, 0, [(0, 0, None)]),
  (1, 1, [(1, 1, None)]),
  (22, 22, [(2, 2, [(2, 2, None)]), (3, 3, [(3, 3, None)]), (22, 22, None)]),
  (4, 4, [(4, 4, None)])],
 [0, 1, 22, 4],
 [])
(5,
 [(1, None), (2, 1)],
 '1976',
 ['C'],
 [23],
 [(0, 0, [(0, 0, None)]),
  (1,

AssertionError: 

In [707]:
mrp_json['input'][61: 63 + 1]

'uti'

In [575]:
token_nodes

[{'id': 1, 'label': '_the_q', 'anchors': [{'from': 0, 'to': 3}]},
 {'id': 4, 'label': '_asbestos_n_1', 'anchors': [{'from': 4, 'to': 12}]},
 {'id': 5, 'label': '_fiber_n_1', 'anchors': [{'from': 13, 'to': 19}]},
 {'id': 7,
  'label': '_crocidolite_n_unknown',
  'anchors': [{'from': 20, 'to': 32}]},
 {'id': 8, 'label': '_unusually_x_deg', 'anchors': [{'from': 36, 'to': 45}]},
 {'id': 9,
  'label': '_resilient_a_unknown',
  'anchors': [{'from': 46, 'to': 55}]},
 {'id': 10, 'label': '_once_x_subord', 'anchors': [{'from': 56, 'to': 60}]},
 {'id': 13, 'label': '_enter_v_1', 'anchors': [{'from': 64, 'to': 70}]},
 {'id': 14, 'label': '_the_q', 'anchors': [{'from': 71, 'to': 74}]},
 {'id': 15, 'label': '_lung_n_1', 'anchors': [{'from': 75, 'to': 81}]},
 {'id': 16, 'label': '_with_x_subord', 'anchors': [{'from': 82, 'to': 86}]},
 {'id': 17, 'label': '_even_x_deg', 'anchors': [{'from': 87, 'to': 91}]},
 {'id': 19, 'label': '_brief_a_1', 'anchors': [{'from': 92, 'to': 97}]},
 {'id': 20,
  'label'

In [571]:
nodes

[{'id': 0, 'label': 'appos', 'anchors': [{'from': 0, 'to': 32}]},
 {'id': 1, 'label': '_the_q', 'anchors': [{'from': 0, 'to': 3}]},
 {'id': 2, 'label': 'compound', 'anchors': [{'from': 4, 'to': 19}]},
 {'id': 3, 'label': 'udef_q', 'anchors': [{'from': 4, 'to': 12}]},
 {'id': 4, 'label': '_asbestos_n_1', 'anchors': [{'from': 4, 'to': 12}]},
 {'id': 5, 'label': '_fiber_n_1', 'anchors': [{'from': 13, 'to': 19}]},
 {'id': 6, 'label': 'udef_q', 'anchors': [{'from': 20, 'to': 32}]},
 {'id': 7,
  'label': '_crocidolite_n_unknown',
  'anchors': [{'from': 20, 'to': 32}]},
 {'id': 8, 'label': '_unusually_x_deg', 'anchors': [{'from': 36, 'to': 45}]},
 {'id': 9,
  'label': '_resilient_a_unknown',
  'anchors': [{'from': 46, 'to': 55}]},
 {'id': 10, 'label': '_once_x_subord', 'anchors': [{'from': 56, 'to': 60}]},
 {'id': 11, 'label': 'pron', 'anchors': [{'from': 61, 'to': 63}]},
 {'id': 12, 'label': 'pronoun_q', 'anchors': [{'from': 61, 'to': 63}]},
 {'id': 13, 'label': '_enter_v_1', 'anchors': [{'f

In [149]:
token_node_id_set

{0, 1, 3, 4, 5, 8, 10, 13, 14, 15, 16}

In [517]:
mrp_json['nodes'][]

[{'id': 0,
  'label': 'the',
  'properties': ['pos', 'frame'],
  'values': ['DT', 'q:i-h-h'],
  'anchors': [{'from': 0, 'to': 3}]},
 {'id': 1,
  'label': 'asbestos',
  'properties': ['pos', 'frame'],
  'values': ['NN', 'n:x'],
  'anchors': [{'from': 4, 'to': 12}]},
 {'id': 2,
  'label': 'fiber',
  'properties': ['pos', 'frame'],
  'values': ['NN', 'n:x'],
  'anchors': [{'from': 13, 'to': 18}]},
 {'id': 4,
  'label': 'crocidolite',
  'properties': ['pos', 'frame'],
  'values': ['NN', 'n:x'],
  'anchors': [{'from': 20, 'to': 31}]},
 {'id': 7,
  'label': 'unusually',
  'properties': ['pos', 'frame'],
  'values': ['RB', 'x:e-u'],
  'anchors': [{'from': 36, 'to': 45}]},
 {'id': 8,
  'label': 'resilient',
  'properties': ['pos', 'frame'],
  'values': ['JJ', 'a:e-u'],
  'anchors': [{'from': 46, 'to': 55}]},
 {'id': 9,
  'label': 'once',
  'properties': ['pos', 'frame'],
  'values': ['IN', 'x:e-h-h'],
  'anchors': [{'from': 56, 'to': 60}]},
 {'id': 10,
  'label': 'it',
  'properties': ['pos', 

In [514]:
mrp_json['tops'][0]

30

set()

In [671]:
mrp_json

{'id': '20001001',
 'flavor': 1,
 'framework': 'eds',
 'version': 0.9,
 'time': '2019-04-10 (20:21)',
 'input': 'Pierre Vinken, 61 years old, will join the board as a nonexecutive director Nov. 29.',
 'tops': [10],
 'nodes': [{'id': 0, 'label': 'proper_q', 'anchors': [{'from': 0, 'to': 28}]},
  {'id': 1, 'label': 'compound', 'anchors': [{'from': 0, 'to': 14}]},
  {'id': 2, 'label': 'proper_q', 'anchors': [{'from': 0, 'to': 6}]},
  {'id': 3,
   'label': 'named',
   'properties': ['carg'],
   'values': ['Pierre'],
   'anchors': [{'from': 0, 'to': 6}]},
  {'id': 4,
   'label': 'named',
   'properties': ['carg'],
   'values': ['Vinken'],
   'anchors': [{'from': 7, 'to': 14}]},
  {'id': 5, 'label': 'measure', 'anchors': [{'from': 15, 'to': 23}]},
  {'id': 6, 'label': 'udef_q', 'anchors': [{'from': 15, 'to': 23}]},
  {'id': 7,
   'label': 'card',
   'properties': ['carg'],
   'values': ['61'],
   'anchors': [{'from': 15, 'to': 17}]},
  {'id': 8, 'label': '_year_n_1', 'anchors': [{'from': 18,

In [502]:
{node.get('id') for node in mrp_json.get('nodes')} - parser_node_id_set

set()

In [690]:
child_id2edge_id_set

defaultdict(set,
            {22: {0, 9, 16},
             8: {1, 5, 11},
             18: {2, 10},
             4: {3, 6, 7, 15},
             16: {4, 13, 21},
             10: {8, 12},
             12: {14, 20},
             3: {17, 19},
             9: {18},
             2: set(),
             1: set(),
             0: set(),
             7: set(),
             6: set(),
             5: set(),
             11: set(),
             13: set(),
             14: set(),
             15: set(),
             21: set(),
             17: set(),
             19: set(),
             20: set()})