In [144]:
try:
    __IPYTHON__
    USING_IPYTHON = True
    %load_ext autoreload
    %autoreload 2
except NameError:
    USING_IPYTHON = False

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


#### Argparse

In [145]:
import argparse
ap = argparse.ArgumentParser()
ap.add_argument('project_root', help='')
ap.add_argument('--mrp-data-dir', default='data', help='')
ap.add_argument('--mrp-test-dir', default='src/tests', help='')
ap.add_argument('--tests-fixtures-file', default='fixtures/test.jsonl', help='')

ap.add_argument('--graphviz-sub-dir', default='visualization/graphviz', help='')
ap.add_argument('--train-sub-dir', default='training', help='')
ap.add_argument('--companion-sub-dir', default='companion')
ap.add_argument('--jamr-alignment-file', default='jamr.mrp')

ap.add_argument('--test-input-file', default='evaluation/input.mrp', help='')
ap.add_argument('--test-companion-file', default='evaluation/udpipe.mrp', help='')
ap.add_argument('--allennlp-mrp-json-file-template', default='allennlp-mrp-json-small-{}.jsonl', help='')


ap.add_argument('--mrp-file-extension', default='.mrp')
ap.add_argument('--companion-file-extension', default='.conllu')
ap.add_argument('--graphviz-file-template', default='http://localhost:8000/files/proj29_ds1/home/slai/mrp2019/visualization/graphviz/{}/{}.mrp/{}.png')
ap.add_argument('--parse-plot-file-template', default='http://localhost:8000/files/proj29_ds1/home/slai/mrp2019/visualization/graphviz/{}/{}.png')

arg_string = """
    /data/proj29_ds1/home/slai/mrp2019
"""
arguments = [arg for arg_line in arg_string.split(r'\\n') for arg in arg_line.split()]

In [146]:
if USING_IPYTHON:
    args = ap.parse_args(arguments)
else:
    args = ap.parse_args()

In [147]:
args

Namespace(allennlp_mrp_json_file_template='allennlp-mrp-json-small-{}.jsonl', companion_file_extension='.conllu', companion_sub_dir='companion', graphviz_file_template='http://localhost:8000/files/proj29_ds1/home/slai/mrp2019/visualization/graphviz/{}/{}.mrp/{}.png', graphviz_sub_dir='visualization/graphviz', jamr_alignment_file='jamr.mrp', mrp_data_dir='data', mrp_file_extension='.mrp', mrp_test_dir='src/tests', parse_plot_file_template='http://localhost:8000/files/proj29_ds1/home/slai/mrp2019/visualization/graphviz/{}/{}.png', project_root='/data/proj29_ds1/home/slai/mrp2019', test_companion_file='evaluation/udpipe.mrp', test_input_file='evaluation/input.mrp', tests_fixtures_file='fixtures/test.jsonl', train_sub_dir='training')

#### Library imports

In [5]:
import json
import logging
import os
import pprint
import re
import string
from collections import Counter, defaultdict, deque

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import plot_util
import torch
from action_state import mrp_json2parser_states, _generate_parser_action_states
from action_state import ERROR, APPEND, RESOLVE, IGNORE
from preprocessing import (CompanionParseDataset, MrpDataset, JamrAlignmentDataset,
                           read_companion_parse_json_file, read_mrp_json_file, parse2parse_json)            
from torch import nn
from tqdm import tqdm

#### ipython notebook specific imports

In [6]:
if USING_IPYTHON:
    # matplotlib config
    %matplotlib inline

In [40]:
sh = logging.StreamHandler()
formatter = logging.Formatter('%(levelname)-8s [%(name)s:%(lineno)d] %(message)s')
sh.setFormatter(formatter)
logging.basicConfig(level=logging.DEBUG, handlers=[sh])
logger = logging.getLogger(__name__)  # pylint: disable=invalid-name
logger.setLevel(logging.INFO)

### Constants

In [8]:
UNKWOWN = 'UNKWOWN'

### Load data

In [9]:
train_dir = os.path.join(args.project_root, args.mrp_data_dir, args.train_sub_dir)

In [10]:
mrp_dataset = MrpDataset()

In [11]:
frameworks, framework2dataset2mrp_jsons = mrp_dataset.load_mrp_json_dir(
    train_dir, args.mrp_file_extension)

frameworks:   0%|          | 0/5 [00:00<?, ?it/s]
dataset_name:   0%|          | 0/2 [00:00<?, ?it/s][A
dataset_name:  50%|█████     | 1/2 [00:00<00:00,  2.87it/s][A
frameworks:  20%|██        | 1/5 [00:00<00:03,  1.32it/s]s][A
dataset_name:   0%|          | 0/1 [00:00<?, ?it/s][A
frameworks:  40%|████      | 2/5 [00:04<00:05,  1.74s/it]t][A
dataset_name:   0%|          | 0/1 [00:00<?, ?it/s][A
frameworks:  60%|██████    | 3/5 [00:09<00:05,  2.70s/it]t][A
dataset_name:   0%|          | 0/1 [00:00<?, ?it/s][A
frameworks:  80%|████████  | 4/5 [00:14<00:03,  3.40s/it]t][A
dataset_name:   0%|          | 0/14 [00:00<?, ?it/s][A
dataset_name:  36%|███▌      | 5/14 [00:00<00:00, 43.14it/s][A
dataset_name:  50%|█████     | 7/14 [00:00<00:00, 17.24it/s][A
dataset_name:  64%|██████▍   | 9/14 [00:00<00:00, 15.97it/s][A
dataset_name:  79%|███████▊  | 11/14 [00:01<00:00,  5.12it/s][A
frameworks: 100%|██████████| 5/5 [00:16<00:00,  2.86s/it]t/s][A


### Data Preprocessing companion

In [12]:
companion_dir = os.path.join(args.project_root, args.mrp_data_dir, args.companion_sub_dir)

In [13]:
cparse_dataset = CompanionParseDataset()

In [14]:
dataset2cid2parse = cparse_dataset.load_companion_parse_dir(companion_dir, args.companion_file_extension)

INFO     [preprocessing.py:168] framework amr found
dataset: 100%|██████████| 13/13 [00:01<00:00,  9.58it/s]
INFO     [preprocessing.py:168] framework dm found
dataset: 100%|██████████| 5/5 [00:04<00:00,  1.11it/s]
INFO     [preprocessing.py:168] framework ucca found
dataset: 100%|██████████| 6/6 [00:00<00:00, 25.91it/s]


In [15]:
dataset2cid2parse_json = cparse_dataset.convert_parse2parse_json()

In [16]:
dataset2cid2parse.keys()

dict_keys(['amr-guidelines', 'bolt', 'cctv', 'dfa', 'dfb', 'fables', 'lorelei', 'mt09sdl', 'proxy', 'rte', 'wb', 'wiki', 'xinhua', 'wsj', 'ewt'])

In [17]:
# Some data is missing
'20003001' in dataset2cid2parse['wsj']

False

### Load JAMR alignment data

In [18]:
jalignment_dataset = JamrAlignmentDataset()

In [19]:
cid2alignment = jalignment_dataset.load_jamr_alignment_file(os.path.join(
    args.project_root,
    args.mrp_data_dir,
    args.companion_sub_dir,
    args.jamr_alignment_file
))

### Load testing data

In [20]:
test_input_filename = os.path.join(args.project_root, args.mrp_data_dir, args.test_input_file)
test_companion_filename = os.path.join(args.project_root, args.mrp_data_dir, args.test_companion_file)

In [21]:
test_mrp_jsons = read_mrp_json_file(test_input_filename)
test_parse_jsons = read_companion_parse_json_file(test_companion_filename)

In [22]:
parse_json = test_parse_jsons['102990']

In [120]:
mrp_json = framework2dataset2mrp_jsons['psd']['wsj'][1]

In [175]:
framework = 'ucca'
dataset = 'wiki'

# framework = 'dm'
# dataset = 'wsj'

In [180]:
cid = list(dataset2cid2parse_json[dataset].keys())[1]

In [181]:
idx, mrp_json = [
    (idx, mrp_json)
    for idx, mrp_json in enumerate(framework2dataset2mrp_jsons[framework][dataset])
    if mrp_json.get('id') == cid
][0]
idx

70

In [130]:
parse_json = dataset2cid2parse_json[dataset][cid]

In [131]:
doc = mrp_json['input']

In [132]:
doc

"An investment company said it offered to acquire Arby's Inc., the fast-food operator, for $205 million."

In [133]:
token_pos = 0
anchors = []
char_pos2tokenized_parse_node_id = []

for node_id, node in enumerate(parse_json.get('nodes')):
    label = node.get('label')
    label_size = len(label)
    while doc[token_pos] == ' ':
        token_pos += 1
        char_pos2tokenized_parse_node_id.append(node_id)
    anchors.append((token_pos, token_pos + label_size))
    char_pos2tokenized_parse_node_id.extend([node_id] * (label_size))
    print(node_id, doc[token_pos: token_pos + label_size], anchors[-1], len(char_pos2tokenized_parse_node_id))
    token_pos += label_size

0 An (0, 2) 2
1 investment (3, 13) 13
2 company (14, 21) 21
3 said (22, 26) 26
4 it (27, 29) 29
5 offered (30, 37) 37
6 to (38, 40) 40
7 acquire (41, 48) 48
8 Arby (49, 53) 53
9 's (53, 55) 55
10 Inc. (56, 60) 60
11 , (60, 61) 61
12 the (62, 65) 65
13 fast-food (66, 75) 75
14 operator (76, 84) 84
15 , (84, 85) 85
16 for (86, 89) 89
17 $ (90, 91) 91
18 205 (91, 94) 94
19 million (95, 102) 102
20 . (102, 103) 103


In [134]:
doc

"An investment company said it offered to acquire Arby's Inc., the fast-food operator, for $205 million."

In [135]:
len(char_pos2tokenized_parse_node_id)

103

In [136]:
doc = mrp_json['input']

In [137]:
mrp_json['tops']

[3]

In [143]:
mrp_parser_states, mrp_meta_data = mrp_json2parser_states(
    mrp_json, 
    tokenized_parse_nodes=parse_json['nodes'],
)

DEBUG    [action_state.py:206] {3}
DEBUG    [action_state.py:206] {2, 5}
DEBUG    [action_state.py:206] {0, 1, 4, 7}
DEBUG    [action_state.py:206] {16, 7}
DEBUG    [action_state.py:206] {17}
DEBUG    [action_state.py:206] {19}
DEBUG    [action_state.py:206] {18}
DEBUG    [action_state.py:414] ('prev anchors', 0)
DEBUG    [action_state.py:427] ('anchors', 0, 2, 0, 1)
DEBUG    [action_state.py:433] ('curr_node_id', 0)
DEBUG    [action_state.py:456] (0, [], True, True, True, True)
DEBUG    [action_state.py:502] (0, 0, [(0, 0, [(0, 0, None)])])
DEBUG    [action_state.py:503] [(0, None), (1, (1, {'id': 0, 'label': 'an', 'properties': ['pos', 'frame'], 'values': ['DT', 'q:i-h-h'], 'anchors': [{'from': 0, 'to': 2}]}, [[]]))]
DEBUG    [action_state.py:516] (0, 1, 2, {1})
DEBUG    [action_state.py:576] ('token stack', [(0, 'an', [(0, 'an', 'An')])])
DEBUG    [action_state.py:578] ('visited states', {0}, {0}, set(), {0})
DEBUG    [action_state.py:414] ('prev anchors', 1)
DEBUG    [action_state.

DEBUG    [action_state.py:578] ('visited states', {0, 1, 2, 3, 4, 5, 7, 12}, {0, 1, 2, 3, 4, 5, 7, 12}, {2}, {0, 1, 2})
DEBUG    [action_state.py:414] ('prev anchors', 13)
DEBUG    [action_state.py:427] ('anchors', 66, 75, 13, 14)
DEBUG    [action_state.py:433] ('curr_node_id', 13)
DEBUG    [action_state.py:456] (13,
 [(2, 2, [(0, 0, [(0, 0, None)]), (1, 1, [(1, 1, None)]), (2, 2, None)]),
  (3, 3, None),
  (4, 4, None),
  (5, 5, None),
  (7, 7, None)],
 True,
 True,
 True,
 True)
DEBUG    [action_state.py:502] (13,
 13,
 [(2, 2, [(0, 0, [(0, 0, None)]), (1, 1, [(1, 1, None)]), (2, 2, None)]),
  (3, 3, None),
  (4, 4, None),
  (5, 5, None),
  (7, 7, None)])
DEBUG    [action_state.py:503] []
DEBUG    [action_state.py:576] ('token stack',
 [(2,
   'company',
   [(0, 'an', [(0, 'an', 'An')]),
    (1, 'investment', [(1, 'investment', 'investment')]),
    (2, 'company', 'company')]),
  (3, 'say', 'said'),
  (4, 'it', 'it'),
  (5, 'offer', 'offered'),
  (7, 'acquire', 'acquire')])
DEBUG    [

DEBUG    [action_state.py:516] (17, 10, 16, set())
DEBUG    [action_state.py:576] ('token stack',
 [(2,
   'company',
   [(0, 'an', [(0, 'an', 'An')]),
    (1, 'investment', [(1, 'investment', 'investment')]),
    (2, 'company', 'company')]),
  (3, 'say', 'said'),
  (4, 'it', 'it'),
  (5, 'offer', 'offered'),
  (7, 'acquire', 'acquire'),
  (16, 'for', 'fast-food'),
  (17,
   '$',
   [(17, '$', 'operator'),
    (19,
     'million',
     [(18, '205', [(18, '205', ',')]), (19, 'million', 'for')])])])
DEBUG    [action_state.py:578] ('visited states', {0, 1, 2, 3, 4, 5, 7, 12, 13, 14, 16, 17, 18, 19}, {0, 1, 2, 3, 4, 5, 7, 12, 13, 14, 16, 17, 18, 19}, {16, 17, 2, 19}, {0, 1, 2, 17, 18, 19})
DEBUG    [action_state.py:433] ('curr_node_id', 16)
DEBUG    [action_state.py:456] (16,
 [(2, 2, [(0, 0, [(0, 0, None)]), (1, 1, [(1, 1, None)]), (2, 2, None)]),
  (3, 3, None),
  (4, 4, None),
  (5, 5, None),
  (7, 7, None),
  (16, 16, None),
  (17,
   17,
   [(17, 17, None), (19, 19, [(18, 18, [(18, 18

DEBUG    [action_state.py:433] ('curr_node_id', 3)
DEBUG    [action_state.py:456] (3,
 [(2, 2, [(0, 0, [(0, 0, None)]), (1, 1, [(1, 1, None)]), (2, 2, None)]),
  (5,
   5,
   [(3, 3, None),
    (4, 4, None),
    (4,
     4,
     [(5, 5, None),
      (7,
       7,
       [(7, 7, None),
        (16,
         16,
         [(16, 16, None),
          (17,
           17,
           [(17, 17, None),
            (19, 19, [(18, 18, [(18, 18, None)]), (19, 19, None)])])])])])])],
 False,
 True,
 False,
 True)


In [139]:
mrp_meta_data[-1]

[]

In [140]:
companion_parser_states, companion_meta_data = mrp_json2parser_states(
    parse_json,
    mrp_doc=doc,
    tokenized_parse_nodes=parse_json['nodes'],
)

DEBUG    [action_state.py:206] {3}
DEBUG    [action_state.py:206] {2, 20, 5}
DEBUG    [action_state.py:206] {0, 1, 4, 7}
DEBUG    [action_state.py:206] {17, 10, 6}
DEBUG    [action_state.py:206] {8, 11, 14, 15, 16, 19}
DEBUG    [action_state.py:206] {9, 18, 12, 13}
DEBUG    [action_state.py:433] ('curr_node_id', 0)
DEBUG    [action_state.py:456] (0, [], True, True, True, True)
DEBUG    [action_state.py:502] (0, 0, [(0, 0, [(0, 0, None)])])
DEBUG    [action_state.py:503] [(0, None), (1, (1, {'id': 0, 'label': 'An', 'properties': ['lemma', 'upos', 'xpos'], 'values': ['a', 'DET', 'DT']}, [[]]))]
DEBUG    [action_state.py:516] (0, 0, 2, {1})
DEBUG    [action_state.py:576] ('token stack', [(0, 'An', [(0, 'An', 'An')])])
DEBUG    [action_state.py:578] ('visited states', {0}, {0}, set(), {0})
DEBUG    [action_state.py:433] ('curr_node_id', 1)
DEBUG    [action_state.py:456] (1, [(0, 0, [(0, 0, None)])], True, True, True, True)
DEBUG    [action_state.py:502] (1, 1, [(0, 0, [(0, 0, None)]), (1, 

DEBUG    [action_state.py:516] (8, 7, 10, {8, 11, 14, 15})
DEBUG    [action_state.py:576] ('token stack',
 [(2,
   'company',
   [(0, 'An', [(0, 'An', 'An')]),
    (1, 'investment', [(1, 'investment', 'investment')]),
    (2, 'company', 'company')]),
  (3, 'said', 'said'),
  (4, 'it', [(4, 'it', 'it')]),
  (5, 'offered', 'offered'),
  (6, 'to', [(6, 'to', 'to')]),
  (7, 'acquire', 'acquire'),
  (8, 'Arby', 'Arby')])
DEBUG    [action_state.py:578] ('visited states', {0, 1, 2, 3, 4, 5, 6, 7, 8}, {0, 1, 2, 3, 4, 5, 6, 7, 8}, {2}, {0, 1, 2, 4, 6})
DEBUG    [action_state.py:433] ('curr_node_id', 9)
DEBUG    [action_state.py:456] (9,
 [(2, 2, [(0, 0, [(0, 0, None)]), (1, 1, [(1, 1, None)]), (2, 2, None)]),
  (3, 3, None),
  (4, 4, [(4, 4, None)]),
  (5, 5, None),
  (6, 6, [(6, 6, None)]),
  (7, 7, None),
  (8, 8, None)],
 True,
 True,
 True,
 True)
DEBUG    [action_state.py:502] (9,
 9,
 [(2, 2, [(0, 0, [(0, 0, None)]), (1, 1, [(1, 1, None)]), (2, 2, None)]),
  (3, 3, None),
  (4, 4, [(4, 4,

DEBUG    [action_state.py:503] [(0, None), (1, (1, {'id': 13, 'label': 'fast-food', 'properties': ['lemma', 'upos', 'xpos'], 'values': ['fast-food', 'NOUN', 'NN']}, [[]]))]
DEBUG    [action_state.py:516] (13, 12, 14, set())
DEBUG    [action_state.py:576] ('token stack',
 [(2,
   'company',
   [(0, 'An', [(0, 'An', 'An')]),
    (1, 'investment', [(1, 'investment', 'investment')]),
    (2, 'company', 'company')]),
  (3, 'said', 'said'),
  (4, 'it', [(4, 'it', 'it')]),
  (5, 'offered', 'offered'),
  (6, 'to', [(6, 'to', 'to')]),
  (7, 'acquire', 'acquire'),
  (8, 'Arby', [(8, 'Arby', 'Arby'), (9, '’s', [(9, '’s', '’s')])]),
  (10, 'Inc.', 'Inc.'),
  (11, ',', [(11, ',', ',')]),
  (12, 'the', [(12, 'the', 'the')]),
  (13, 'fast-food', [(13, 'fast-food', 'fast-food')])])
DEBUG    [action_state.py:578] ('visited states', {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13}, {8, 2, 14}, {0, 1, 2, 4, 6, 8, 9, 11, 12, 13})
DEBUG    [action_state.py:

DEBUG    [action_state.py:516] (10, 9, 7, {17})
DEBUG    [action_state.py:576] ('token stack',
 [(2,
   'company',
   [(0, 'An', [(0, 'An', 'An')]),
    (1, 'investment', [(1, 'investment', 'investment')]),
    (2, 'company', 'company')]),
  (3, 'said', 'said'),
  (4, 'it', [(4, 'it', 'it')]),
  (5, 'offered', 'offered'),
  (6, 'to', [(6, 'to', 'to')]),
  (7, 'acquire', 'acquire'),
  (10,
   'Inc.',
   [(8, 'Arby', [(8, 'Arby', 'Arby'), (9, '’s', [(9, '’s', '’s')])]),
    (10, 'Inc.', 'Inc.'),
    (11, ',', [(11, ',', ',')]),
    (14,
     'operator',
     [(12, 'the', [(12, 'the', 'the')]),
      (13, 'fast-food', [(13, 'fast-food', 'fast-food')]),
      (14, 'operator', 'operator')]),
    (15, ',', [(15, ',', ',')])])])
DEBUG    [action_state.py:578] ('visited states', {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, {8, 2, 10, 14}, {0, 1, 2, 4, 6, 8, 9, 10, 11, 12, 13, 14, 15})
DEBUG    [action_state.py:433] ('curr_node_

DEBUG    [action_state.py:502] (19,
 19,
 [(2, 2, [(0, 0, [(0, 0, None)]), (1, 1, [(1, 1, None)]), (2, 2, None)]),
  (3, 3, None),
  (4, 4, [(4, 4, None)]),
  (5, 5, None),
  (6, 6, [(6, 6, None)]),
  (7, 7, None),
  (10,
   10,
   [(8, 8, [(8, 8, None), (9, 9, [(9, 9, None)])]),
    (10, 10, None),
    (11, 11, [(11, 11, None)]),
    (14,
     14,
     [(12, 12, [(12, 12, None)]), (13, 13, [(13, 13, None)]), (14, 14, None)]),
    (15, 15, [(15, 15, None)])]),
  (16, 16, [(16, 16, None)]),
  (17, 17, None),
  (19, 19, [(18, 18, [(18, 18, None)]), (19, 19, None)])])
DEBUG    [action_state.py:503] [(0, None), (1, (2, {'id': 19, 'label': 'million', 'properties': ['lemma', 'upos', 'xpos'], 'values': ['million', 'NUM', 'CD']}, [[{'source': 18, 'target': 19, 'label': 'compound', 'id': 17, 'parent': 19, 'child': 18}], []]))]
DEBUG    [action_state.py:516] (19, 18, 17, set())
DEBUG    [action_state.py:576] ('token stack',
 [(2,
   'company',
   [(0, 'An', [(0, 'An', 'An')]),
    (1, 'investmen

DEBUG    [action_state.py:502] (5,
 5,
 [(2, 2, [(0, 0, [(0, 0, None)]), (1, 1, [(1, 1, None)]), (2, 2, None)]),
  (3, 3, None),
  (5,
   5,
   [(4, 4, [(4, 4, None)]),
    (5, 5, None),
    (7,
     7,
     [(6, 6, [(6, 6, None)]),
      (7, 7, None),
      (10,
       10,
       [(8, 8, [(8, 8, None), (9, 9, [(9, 9, None)])]),
        (10, 10, None),
        (11, 11, [(11, 11, None)]),
        (14,
         14,
         [(12, 12, [(12, 12, None)]),
          (13, 13, [(13, 13, None)]),
          (14, 14, None)]),
        (15, 15, [(15, 15, None)])]),
      (17,
       17,
       [(16, 16, [(16, 16, None)]),
        (17, 17, None),
        (19, 19, [(18, 18, [(18, 18, None)]), (19, 19, None)])])])])])
DEBUG    [action_state.py:503] [(1, (3, {'id': 5, 'label': 'offered', 'properties': ['lemma', 'upos', 'xpos'], 'values': ['offer', 'VERB', 'VBD']}, [[{'source': 4, 'target': 5, 'label': 'nsubj', 'id': 3, 'parent': 5, 'child': 4}], [], [{'source': 7, 'target': 5, 'label': 'xcomp', 'id': 6

DEBUG    [action_state.py:433] ('curr_node_id', 3)
DEBUG    [action_state.py:456] (3,
 [(2, 2, [(0, 0, [(0, 0, None)]), (1, 1, [(1, 1, None)]), (2, 2, None)]),
  (3, 3, None),
  (5,
   5,
   [(4, 4, [(4, 4, None)]),
    (5, 5, None),
    (7,
     7,
     [(6, 6, [(6, 6, None)]),
      (7, 7, None),
      (10,
       10,
       [(8, 8, [(8, 8, None), (9, 9, [(9, 9, None)])]),
        (10, 10, None),
        (11, 11, [(11, 11, None)]),
        (14,
         14,
         [(12, 12, [(12, 12, None)]),
          (13, 13, [(13, 13, None)]),
          (14, 14, None)]),
        (15, 15, [(15, 15, None)])]),
      (17,
       17,
       [(16, 16, [(16, 16, None)]),
        (17, 17, None),
        (19, 19, [(18, 18, [(18, 18, None)]), (19, 19, None)])])])]),
  (20, 20, [(20, 20, None)])],
 False,
 True,
 False,
 True)
DEBUG    [action_state.py:502] (3,
 3,
 [(3,
   3,
   [(2, 2, [(0, 0, [(0, 0, None)]), (1, 1, [(1, 1, None)]), (2, 2, None)]),
    (3, 3, None),
    (5,
     5,
     [(4, 4, [(4, 4,

In [89]:
logger.info(args.graphviz_file_template.format(
    framework, dataset, cid))

INFO     [<ipython-input-89-f68a566eeeb4>:2] http://localhost:8000/files/proj29_ds1/home/slai/mrp2019/visualization/graphviz/dm/wsj.mrp/20988006.png


In [55]:
mrp_json['input']

"Its parking lot is inconvenient, the MGM lion's-head logo still appears in places, and customers still call it the Grand, rather than the Bally Grand."

In [56]:
mrp_parser_states

[(0,
  [(0, None),
   (1,
    (1,
     {'id': 0,
      'label': '#PersPron',
      'properties': ['pos'],
      'values': ['PRP$'],
      'anchors': [{'from': 0, 'to': 3}]},
     [[]]))],
  [],
  [],
  [],
  [(0, 0, [(0, 0, None)])],
  [(0, 'Its')]),
 (1,
  [(0, None),
   (1,
    (1,
     {'id': 1,
      'label': 'parking',
      'properties': ['pos'],
      'values': ['NN'],
      'anchors': [{'from': 4, 'to': 11}]},
     [[]]))],
  [2],
  [],
  [2],
  [(0, 0, [(0, 0, None)]), (1, 1, [(1, 1, None)])],
  [(0, 'Its'), (1, 'parking')]),
 (2,
  [(0, None),
   (1,
    (3,
     {'id': 2,
      'label': 'lot',
      'properties': ['pos'],
      'values': ['NN'],
      'anchors': [{'from': 12, 'to': 15}]},
     [[{'source': 2,
        'target': 0,
        'label': 'APP',
        'id': 13,
        'parent': 2,
        'child': 0}],
      [{'source': 2,
        'target': 1,
        'label': 'RSTR',
        'id': 2,
        'parent': 2,
        'child': 1}],
      []]))],
  [],
  [],
  [],
  [(2

In [170]:
[(node['id'], node.get('label')) for node in mrp_json['nodes']]

[(1, 'be'),
 (2, 'clear'),
 (3, 'what'),
 (4, 'effect'),
 (6, 'sale'),
 (9, 'shopping'),
 (10, 'center'),
 (12, 'have'),
 (14, 'earnings')]

In [175]:
doc

'It is unclear what effect the sale of the shopping centers will have on earnings.'

In [172]:
parse_json['nodes']

[{'id': 0,
  'label': 'It',
  'properties': ['lemma', 'upos', 'xpos'],
  'values': ['it', 'PRON', 'PRP']},
 {'id': 1,
  'label': 'is',
  'properties': ['lemma', 'upos', 'xpos'],
  'values': ['be', 'VERB', 'VBZ']},
 {'id': 2,
  'label': 'unclear',
  'properties': ['lemma', 'upos', 'xpos'],
  'values': ['unclear', 'ADJ', 'JJ']},
 {'id': 3,
  'label': 'what',
  'properties': ['lemma', 'upos', 'xpos'],
  'values': ['what', 'DET', 'WDT']},
 {'id': 4,
  'label': 'effect',
  'properties': ['lemma', 'upos', 'xpos'],
  'values': ['effect', 'NOUN', 'NN']},
 {'id': 5,
  'label': 'the',
  'properties': ['lemma', 'upos', 'xpos'],
  'values': ['the', 'DET', 'DT']},
 {'id': 6,
  'label': 'sale',
  'properties': ['lemma', 'upos', 'xpos'],
  'values': ['sale', 'NOUN', 'NN']},
 {'id': 7,
  'label': 'of',
  'properties': ['lemma', 'upos', 'xpos'],
  'values': ['of', 'ADP', 'IN']},
 {'id': 8,
  'label': 'the',
  'properties': ['lemma', 'upos', 'xpos'],
  'values': ['the', 'DET', 'DT']},
 {'id': 9,
  'labe

In [169]:
[(node['id'], node['label']) for node in parse_json['nodes']]

[(0, 'It'),
 (1, 'is'),
 (2, 'unclear'),
 (3, 'what'),
 (4, 'effect'),
 (5, 'the'),
 (6, 'sale'),
 (7, 'of'),
 (8, 'the'),
 (9, 'shopping'),
 (10, 'centers'),
 (11, 'will'),
 (12, 'have'),
 (13, 'on'),
 (14, 'earnings'),
 (15, '.')]

In [154]:
anchors

[(0, 1),
 (1, 4),
 (4, 8),
 (8, 12),
 (12, 19),
 (19, 20),
 (20, 23),
 (23, 25),
 (25, 28),
 (28, 36),
 (36, 41),
 (41, 43),
 (43, 45),
 (45, 49),
 (49, 52),
 (52, 61),
 (61, 63),
 (63, 65),
 (65, 70),
 (70, 73),
 (73, 82),
 (82, 89),
 (89, 90)]

### Create training instance

In [150]:
total_count = 0
with_parse_count = 0
data_size_limit = 100
ignore_framework_set = {'amr', 'dm', 'psd', 'eds'}

In [160]:
allennlp_tests_fixtures_output_file = os.path.join(
    args.project_root, args.mrp_test_dir, args.tests_fixtures_file)
allennlp_train_output_file = os.path.join(
    args.project_root, args.allennlp_mrp_json_file_template.format('train'))
allennlp_test_output_file = os.path.join(
    args.project_root, args.allennlp_mrp_json_file_template.format('test'))

In [185]:
# Create tests fixture jsonl
fixture_combinations = [
    ('ucca', 'wiki', 70)
]

with open(allennlp_tests_fixtures_output_file, 'w') as wf:
    for framework, dataset, idx in fixture_combinations:
        mrp_json = framework2dataset2mrp_jsons[framework][dataset][idx]
        cid = mrp_json.get('id')
        doc = mrp_json.get('input')
        
        alignment = {}
        if framework == 'amr':
            alignment = cid2alignment[cid]  
        parse_json = dataset2cid2parse_json.get(dataset, {}).get(cid, {})

        if parse_json:
            with_parse_count += 1
            mrp_parser_states, mrp_meta_data = mrp_json2parser_states(
                mrp_json, 
                tokenized_parse_nodes=parse_json['nodes'],
                alignment=alignment,
            )
            companion_parser_states, companion_meta_data = mrp_json2parser_states(
                parse_json, 
                mrp_doc=doc,
                tokenized_parse_nodes=parse_json['nodes'],
            )

            data_instance = {
                'mrp_json': mrp_json,
                'parse_json': parse_json,
                'mrp_parser_states': mrp_parser_states,
                'mrp_meta_data': mrp_meta_data,
                'companion_parser_states': companion_parser_states,
                'companion_meta_data': companion_meta_data,
            }
            json_encoded_instance = json.dumps(data_instance)
            wf.write(json_encoded_instance + '\n')

DEBUG    [action_state.py:60] ('remote 1', 3)
DEBUG    [action_state.py:60] ('remote 1', 10)
DEBUG    [action_state.py:60] ('remote 1', 11)
DEBUG    [action_state.py:206] {34}
DEBUG    [action_state.py:206] {33, 39, 17, 18, 19}
DEBUG    [action_state.py:206] {32, 35, 36, 38, 7, 8, 9, 40, 12, 24}
DEBUG    [action_state.py:206] {4, 5, 6, 37, 41, 10, 11, 13, 16, 20, 21, 22, 23, 25, 26, 27, 31}
DEBUG    [action_state.py:206] {0, 1, 2, 3, 14, 15, 28, 29, 30}
DEBUG    [action_state.py:84] ('remote 2', -1)
DEBUG    [action_state.py:84] ('remote 2', -1)
DEBUG    [action_state.py:84] ('remote 2', -1)
DEBUG    [action_state.py:414] ('prev anchors', 0)
DEBUG    [action_state.py:427] ('anchors', 0, 2, 0, 1)
DEBUG    [action_state.py:433] ('curr_node_id', 0)
DEBUG    [action_state.py:456] (0, [], True, True, True, True)
DEBUG    [action_state.py:502] (0, 0, [(0, 0, [(0, 0, None)])])
DEBUG    [action_state.py:503] [(0, None), (1, (1, {'id': 0, 'anchors': [{'from': 0, 'to': 2}], 'label': 'In'}, [[]])

DEBUG    [action_state.py:578] ('visited states', {0, 1, 2, 3, 4, 5, 6, 32, 31}, {0, 1, 2, 3, 4, 5, 6, 31}, {32, 31}, {0, 1, 2, 3, 4, 5, 6, 31})
DEBUG    [action_state.py:433] ('curr_node_id', 32)
DEBUG    [action_state.py:456] (32,
 [(31,
   31,
   [(0, 0, [(0, 0, None)]),
    (1, 1, [(1, 1, None)]),
    (2, 2, [(2, 2, None)]),
    (3, 3, [(3, 3, None)])]),
  (4, 4, [(4, 4, None)]),
  (5, 5, [(5, 5, None)]),
  (6, 6, [(6, 6, None)])],
 False,
 True,
 False,
 True)
DEBUG    [action_state.py:502] (32,
 32,
 [(32,
   32,
   [(31,
     31,
     [(0, 0, [(0, 0, None)]),
      (1, 1, [(1, 1, None)]),
      (2, 2, [(2, 2, None)]),
      (3, 3, [(3, 3, None)])]),
    (4, 4, [(4, 4, None)]),
    (5, 5, [(5, 5, None)]),
    (6, 6, [(6, 6, None)])])])
DEBUG    [action_state.py:503] [(1, (4, {'id': 32}, [[{'source': 32, 'target': 31, 'label': 'E', 'id': 42, 'parent': 32, 'child': 31}], [{'source': 32, 'target': 4, 'label': 'R', 'id': 0, 'parent': 32, 'child': 4}], [{'source': 32, 'target': 5, 'la

DEBUG    [action_state.py:414] ('prev anchors', 11)
DEBUG    [action_state.py:427] ('anchors', 49, 53, 11, 12)
DEBUG    [action_state.py:433] ('curr_node_id', 11)
DEBUG    [action_state.py:456] (11,
 [(32,
   32,
   [(31,
     31,
     [(0, 0, [(0, 0, None)]),
      (1, 1, [(1, 1, None)]),
      (2, 2, [(2, 2, None)]),
      (3, 3, [(3, 3, None)])]),
    (4, 4, [(4, 4, None)]),
    (5, 5, [(5, 5, None)]),
    (6, 6, [(6, 6, None)])]),
  (7, 7, [(7, 7, None)]),
  (8, 8, [(8, 8, None)]),
  (9, 9, [(9, 9, None)]),
  (10, 10, [(10, 10, None)])],
 True,
 True,
 True,
 True)
DEBUG    [action_state.py:502] (11,
 11,
 [(32,
   32,
   [(31,
     31,
     [(0, 0, [(0, 0, None)]),
      (1, 1, [(1, 1, None)]),
      (2, 2, [(2, 2, None)]),
      (3, 3, [(3, 3, None)])]),
    (4, 4, [(4, 4, None)]),
    (5, 5, [(5, 5, None)]),
    (6, 6, [(6, 6, None)])]),
  (7, 7, [(7, 7, None)]),
  (8, 8, [(8, 8, None)]),
  (9, 9, [(9, 9, None)]),
  (10, 10, [(10, 10, None)]),
  (11, 11, [(11, 11, None)])])
DEBU

DEBUG    [action_state.py:414] ('prev anchors', 14)
DEBUG    [action_state.py:427] ('anchors', 64, 71, 14, 15)
DEBUG    [action_state.py:433] ('curr_node_id', 14)
DEBUG    [action_state.py:456] (14,
 [(32,
   32,
   [(31,
     31,
     [(0, 0, [(0, 0, None)]),
      (1, 1, [(1, 1, None)]),
      (2, 2, [(2, 2, None)]),
      (3, 3, [(3, 3, None)])]),
    (4, 4, [(4, 4, None)]),
    (5, 5, [(5, 5, None)]),
    (6, 6, [(6, 6, None)])]),
  (7, 7, [(7, 7, None)]),
  (8, 8, [(8, 8, None)]),
  (9, 9, [(9, 9, None)]),
  (35, 35, [(10, 10, [(10, 10, None)]), (11, 11, [(11, 11, None)])]),
  (12, 12, [(12, 12, None)]),
  (13, 13, [(13, 13, None)])],
 True,
 True,
 True,
 True)
DEBUG    [action_state.py:502] (14,
 14,
 [(32,
   32,
   [(31,
     31,
     [(0, 0, [(0, 0, None)]),
      (1, 1, [(1, 1, None)]),
      (2, 2, [(2, 2, None)]),
      (3, 3, [(3, 3, None)])]),
    (4, 4, [(4, 4, None)]),
    (5, 5, [(5, 5, None)]),
    (6, 6, [(6, 6, None)])]),
  (7, 7, [(7, 7, None)]),
  (8, 8, [(8, 8, 

DEBUG    [action_state.py:503] [(0, None), (1, (1, {'id': 16, 'anchors': [{'from': 79, 'to': 85}, {'from': 86, 'to': 92}], 'label': 'RobertParish'}, [[]]))]
DEBUG    [action_state.py:516] (16, 4, 36, set())
DEBUG    [action_state.py:576] ('token stack',
 [(32,
   '',
   [(31,
     '',
     [(0, 'In', [(0, 'In', 'In')]),
      (1, 'the', [(1, 'the', 'the')]),
      (2, 'final', [(2, 'final', 'final')]),
      (3, 'minute', [(3, 'minute', 'minute')])]),
    (4, 'of', [(4, 'of', 'of')]),
    (5, 'the', [(5, 'the', 'the')]),
    (6, 'game', [(6, 'game', 'game')])]),
  (7, ',', [(7, ',', ',')]),
  (8, 'Johnson', [(8, 'Johnson', 'Johnson')]),
  (9, 'had', [(9, 'had', 'had')]),
  (35,
   '',
   [(10, 'the', [(10, 'the', 'the')]), (11, 'ball', [(11, 'ball', 'ball')])]),
  (12, 'stolen', [(12, 'stolen', 'stolen')]),
  (13, 'by', [(13, 'by', 'by')]),
  (37,
   '',
   [(14, 'Celtics', [(14, 'Celtics', 'Celtics')]),
    (15, 'center', [(15, 'center', 'center')])]),
  (16, 'RobertParish', [(16, 'Ro

DEBUG    [action_state.py:502] (17,
 17,
 [(33,
   33,
   [(32,
     32,
     [(31,
       31,
       [(0, 0, [(0, 0, None)]),
        (1, 1, [(1, 1, None)]),
        (2, 2, [(2, 2, None)]),
        (3, 3, [(3, 3, None)])]),
      (4, 4, [(4, 4, None)]),
      (5, 5, [(5, 5, None)]),
      (6, 6, [(6, 6, None)])]),
    (7, 7, [(7, 7, None)]),
    (8, 8, [(8, 8, None)]),
    (9, 9, [(9, 9, None)]),
    (35, 35, [(10, 10, [(10, 10, None)]), (11, 11, [(11, 11, None)])]),
    (12, 12, [(12, 12, None)]),
    (36,
     36,
     [(13, 13, [(13, 13, None)]),
      (37, 37, [(14, 14, [(14, 14, None)]), (15, 15, [(15, 15, None)])]),
      (16, 16, [(16, 16, None)])])]),
  (17, 17, [(17, 17, None)])])
DEBUG    [action_state.py:503] [(2, None), (0, None), (1, (1, {'id': 17, 'anchors': [{'from': 92, 'to': 93}], 'label': ','}, [[]]))]
DEBUG    [action_state.py:516] (17, 23, 34, {39, 18, 19})
DEBUG    [action_state.py:576] ('token stack',
 [(33,
   '',
   [(32,
     '',
     [(31,
       '',
       [

DEBUG    [action_state.py:414] ('prev anchors', 21)
DEBUG    [action_state.py:427] ('anchors', 103, 109, 21, 22)
DEBUG    [action_state.py:433] ('curr_node_id', 20)
DEBUG    [action_state.py:456] (20,
 [(33,
   33,
   [(32,
     32,
     [(31,
       31,
       [(0, 0, [(0, 0, None)]),
        (1, 1, [(1, 1, None)]),
        (2, 2, [(2, 2, None)]),
        (3, 3, [(3, 3, None)])]),
      (4, 4, [(4, 4, None)]),
      (5, 5, [(5, 5, None)]),
      (6, 6, [(6, 6, None)])]),
    (7, 7, [(7, 7, None)]),
    (8, 8, [(8, 8, None)]),
    (9, 9, [(9, 9, None)]),
    (35, 35, [(10, 10, [(10, 10, None)]), (11, 11, [(11, 11, None)])]),
    (12, 12, [(12, 12, None)]),
    (36,
     36,
     [(13, 13, [(13, 13, None)]),
      (37, 37, [(14, 14, [(14, 14, None)]), (15, 15, [(15, 15, None)])]),
      (16, 16, [(16, 16, None)])])]),
  (17, 17, [(17, 17, None)]),
  (18, 18, [(18, 18, None)]),
  (19, 19, [(19, 19, None)])],
 True,
 True,
 True,
 True)
DEBUG    [action_state.py:502] (20,
 20,
 [(33,
   3

DEBUG    [action_state.py:503] [(0, None), (1, (1, {'id': 22, 'anchors': [{'from': 114, 'to': 118}], 'label': 'free'}, [[]]))]
DEBUG    [action_state.py:516] (22, 1, 38, {23})
DEBUG    [action_state.py:576] ('token stack',
 [(33,
   '',
   [(32,
     '',
     [(31,
       '',
       [(0, 'In', [(0, 'In', 'In')]),
        (1, 'the', [(1, 'the', 'the')]),
        (2, 'final', [(2, 'final', 'final')]),
        (3, 'minute', [(3, 'minute', 'minute')])]),
      (4, 'of', [(4, 'of', 'of')]),
      (5, 'the', [(5, 'the', 'the')]),
      (6, 'game', [(6, 'game', 'game')])]),
    (7, ',', [(7, ',', ',')]),
    (8, 'Johnson', [(8, 'Johnson', 'Johnson')]),
    (9, 'had', [(9, 'had', 'had')]),
    (35,
     '',
     [(10, 'the', [(10, 'the', 'the')]), (11, 'ball', [(11, 'ball', 'ball')])]),
    (12, 'stolen', [(12, 'stolen', 'stolen')]),
    (36,
     '',
     [(13, 'by', [(13, 'by', 'by')]),
      (37,
       '',
       [(14, 'Celtics', [(14, 'Celtics', 'Celtics')]),
        (15, 'center', [(15, 

DEBUG    [action_state.py:578] ('visited states', {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 31, 32, 33, 35, 36, 37, 38}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 31, 32, 33, 35, 36, 37, 38}, {32, 33, 35, 36, 37, 38, 31}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 31, 32, 33, 35, 36, 37, 38})
DEBUG    [action_state.py:414] ('prev anchors', 25)
DEBUG    [action_state.py:427] ('anchors', 126, 130, 25, 26)
DEBUG    [action_state.py:433] ('curr_node_id', 24)
DEBUG    [action_state.py:456] (24,
 [(33,
   33,
   [(32,
     32,
     [(31,
       31,
       [(0, 0, [(0, 0, None)]),
        (1, 1, [(1, 1, None)]),
        (2, 2, [(2, 2, None)]),
        (3, 3, [(3, 3, None)])]),
      (4, 4, [(4, 4, None)]),
      (5, 5, [(5, 5, None)]),
      (6, 6, [(6, 6, None)])]),
    (7, 7, [(7, 7, None)]),
    (8, 8, [(8, 8, None)]),
    (9, 9, [(9, 9, None)]),
    (35, 35

DEBUG    [action_state.py:414] ('prev anchors', 27)
DEBUG    [action_state.py:427] ('anchors', 137, 141, 27, 28)
DEBUG    [action_state.py:433] ('curr_node_id', 26)
DEBUG    [action_state.py:456] (26,
 [(33,
   33,
   [(32,
     32,
     [(31,
       31,
       [(0, 0, [(0, 0, None)]),
        (1, 1, [(1, 1, None)]),
        (2, 2, [(2, 2, None)]),
        (3, 3, [(3, 3, None)])]),
      (4, 4, [(4, 4, None)]),
      (5, 5, [(5, 5, None)]),
      (6, 6, [(6, 6, None)])]),
    (7, 7, [(7, 7, None)]),
    (8, 8, [(8, 8, None)]),
    (9, 9, [(9, 9, None)]),
    (35, 35, [(10, 10, [(10, 10, None)]), (11, 11, [(11, 11, None)])]),
    (12, 12, [(12, 12, None)]),
    (36,
     36,
     [(13, 13, [(13, 13, None)]),
      (37, 37, [(14, 14, [(14, 14, None)]), (15, 15, [(15, 15, None)])]),
      (16, 16, [(16, 16, None)])])]),
  (17, 17, [(17, 17, None)]),
  (18, 18, [(18, 18, None)]),
  (19, 19, [(19, 19, None)]),
  (38,
   38,
   [(20, 20, [(20, 20, None)]),
    (21, 21, [(21, 21, None)]),
   

DEBUG    [action_state.py:414] ('prev anchors', 29)
DEBUG    [action_state.py:427] ('anchors', 146, 149, 29, 30)
DEBUG    [action_state.py:433] ('curr_node_id', 28)
DEBUG    [action_state.py:456] (28,
 [(33,
   33,
   [(32,
     32,
     [(31,
       31,
       [(0, 0, [(0, 0, None)]),
        (1, 1, [(1, 1, None)]),
        (2, 2, [(2, 2, None)]),
        (3, 3, [(3, 3, None)])]),
      (4, 4, [(4, 4, None)]),
      (5, 5, [(5, 5, None)]),
      (6, 6, [(6, 6, None)])]),
    (7, 7, [(7, 7, None)]),
    (8, 8, [(8, 8, None)]),
    (9, 9, [(9, 9, None)]),
    (35, 35, [(10, 10, [(10, 10, None)]), (11, 11, [(11, 11, None)])]),
    (12, 12, [(12, 12, None)]),
    (36,
     36,
     [(13, 13, [(13, 13, None)]),
      (37, 37, [(14, 14, [(14, 14, None)]), (15, 15, [(15, 15, None)])]),
      (16, 16, [(16, 16, None)])])]),
  (17, 17, [(17, 17, None)]),
  (18, 18, [(18, 18, None)]),
  (19, 19, [(19, 19, None)]),
  (38,
   38,
   [(20, 20, [(20, 20, None)]),
    (21, 21, [(21, 21, None)]),
   

DEBUG    [action_state.py:578] ('visited states', {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 31, 32, 33, 35, 36, 37, 38}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 31, 32, 33, 35, 36, 37, 38}, {32, 33, 35, 36, 37, 38, 31}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 31, 32, 33, 35, 36, 37, 38})
DEBUG    [action_state.py:414] ('prev anchors', 31)
DEBUG    [action_state.py:427] ('anchors', 154, 155, 31, 32)
DEBUG    [action_state.py:433] ('curr_node_id', 30)
DEBUG    [action_state.py:456] (30,
 [(33,
   33,
   [(32,
     32,
     [(31,
       31,
       [(0, 0, [(0, 0, None)]),
        (1, 1, [(1, 1, None)]),
        (2, 2, [(2, 2, None)]),
        (3, 3, [(3, 3, None)])]),
      (4, 4, [(4, 4, None)]),
      (5, 5, [(5, 5, None)]),
      (6, 6, [(6, 6, None)])]),
    (7, 7, [(7, 7, None

DEBUG    [action_state.py:578] ('visited states', {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 35, 36, 37, 38, 40, 41}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 35, 36, 37, 38, 41}, {32, 33, 35, 36, 37, 38, 40, 41, 31}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 35, 36, 37, 38, 41})
DEBUG    [action_state.py:433] ('curr_node_id', 40)
DEBUG    [action_state.py:456] (40,
 [(33,
   33,
   [(32,
     32,
     [(31,
       31,
       [(0, 0, [(0, 0, None)]),
        (1, 1, [(1, 1, None)]),
        (2, 2, [(2, 2, None)]),
        (3, 3, [(3, 3, None)])]),
      (4, 4, [(4, 4, None)]),
      (5, 5, [(5, 5, None)]),
      (6, 6, [(6, 6, None)])]),
    (7, 7, [(7, 7, None)]),
    (8, 8, [(8, 8, None)]),
    (9, 9, [(9, 9, None)]),
    (35, 35, [(1

DEBUG    [action_state.py:516] (39, 33, 34, set())
DEBUG    [action_state.py:576] ('token stack',
 [(33,
   '',
   [(32,
     '',
     [(31,
       '',
       [(0, 'In', [(0, 'In', 'In')]),
        (1, 'the', [(1, 'the', 'the')]),
        (2, 'final', [(2, 'final', 'final')]),
        (3, 'minute', [(3, 'minute', 'minute')])]),
      (4, 'of', [(4, 'of', 'of')]),
      (5, 'the', [(5, 'the', 'the')]),
      (6, 'game', [(6, 'game', 'game')])]),
    (7, ',', [(7, ',', ',')]),
    (8, 'Johnson', [(8, 'Johnson', 'Johnson')]),
    (9, 'had', [(9, 'had', 'had')]),
    (35,
     '',
     [(10, 'the', [(10, 'the', 'the')]), (11, 'ball', [(11, 'ball', 'ball')])]),
    (12, 'stolen', [(12, 'stolen', 'stolen')]),
    (36,
     '',
     [(13, 'by', [(13, 'by', 'by')]),
      (37,
       '',
       [(14, 'Celtics', [(14, 'Celtics', 'Celtics')]),
        (15, 'center', [(15, 'center', 'center')])]),
      (16, 'RobertParish', [(16, 'RobertParish', 'Robert')])])]),
  (17, ',', [(17, ',', ',')]),
  (

DEBUG    [action_state.py:516] (0, 0, 3, {1, 2, 6})
DEBUG    [action_state.py:576] ('token stack', [(0, 'In', [(0, 'In', 'In')])])
DEBUG    [action_state.py:578] ('visited states', {0}, {0}, set(), {0})
DEBUG    [action_state.py:433] ('curr_node_id', 1)
DEBUG    [action_state.py:456] (1, [(0, 0, [(0, 0, None)])], True, True, True, True)
DEBUG    [action_state.py:502] (1, 1, [(0, 0, [(0, 0, None)]), (1, 1, [(1, 1, None)])])
DEBUG    [action_state.py:503] [(0, None), (1, (1, {'id': 1, 'label': 'the', 'properties': ['lemma', 'upos', 'xpos'], 'values': ['the', 'DET', 'DT']}, [[]]))]
DEBUG    [action_state.py:516] (1, 1, 3, {2, 6})
DEBUG    [action_state.py:576] ('token stack', [(0, 'In', [(0, 'In', 'In')]), (1, 'the', [(1, 'the', 'the')])])
DEBUG    [action_state.py:578] ('visited states', {0, 1}, {0, 1}, set(), {0, 1})
DEBUG    [action_state.py:433] ('curr_node_id', 2)
DEBUG    [action_state.py:456] (2, [(0, 0, [(0, 0, None)]), (1, 1, [(1, 1, None)])], True, True, True, True)
DEBUG    [ac

DEBUG    [action_state.py:502] (7,
 7,
 [(3,
   3,
   [(0, 0, [(0, 0, None)]),
    (1, 1, [(1, 1, None)]),
    (2, 2, [(2, 2, None)]),
    (3, 3, None),
    (6, 6, [(4, 4, [(4, 4, None)]), (5, 5, [(5, 5, None)]), (6, 6, None)])]),
  (7, 7, [(7, 7, None)])])
DEBUG    [action_state.py:503] [(0, None), (1, (1, {'id': 7, 'label': ',', 'properties': ['lemma', 'upos', 'xpos'], 'values': [',', 'PUNCT', ',']}, [[]]))]
DEBUG    [action_state.py:516] (7, 7, 9, {8, 12, 18, 21, 31})
DEBUG    [action_state.py:576] ('token stack',
 [(3,
   'minute',
   [(0, 'In', [(0, 'In', 'In')]),
    (1, 'the', [(1, 'the', 'the')]),
    (2, 'final', [(2, 'final', 'final')]),
    (3, 'minute', 'minute'),
    (6,
     'game',
     [(4, 'of', [(4, 'of', 'of')]),
      (5, 'the', [(5, 'the', 'the')]),
      (6, 'game', 'game')])]),
  (7, ',', [(7, ',', ',')])])
DEBUG    [action_state.py:578] ('visited states', {0, 1, 2, 3, 4, 5, 6, 7}, {0, 1, 2, 3, 4, 5, 6, 7}, {3, 6}, {0, 1, 2, 3, 4, 5, 6, 7})
DEBUG    [action_state

DEBUG    [action_state.py:578] ('visited states', {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}, {11, 3, 6}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 10, 11})
DEBUG    [action_state.py:433] ('curr_node_id', 12)
DEBUG    [action_state.py:456] (12,
 [(3,
   3,
   [(0, 0, [(0, 0, None)]),
    (1, 1, [(1, 1, None)]),
    (2, 2, [(2, 2, None)]),
    (3, 3, None),
    (6, 6, [(4, 4, [(4, 4, None)]), (5, 5, [(5, 5, None)]), (6, 6, None)])]),
  (7, 7, [(7, 7, None)]),
  (8, 8, [(8, 8, None)]),
  (9, 9, None),
  (11, 11, [(10, 10, [(10, 10, None)]), (11, 11, None)])],
 False,
 False,
 True,
 True)
DEBUG    [action_state.py:502] (12,
 12,
 [(3,
   3,
   [(0, 0, [(0, 0, None)]),
    (1, 1, [(1, 1, None)]),
    (2, 2, [(2, 2, None)]),
    (3, 3, None),
    (6, 6, [(4, 4, [(4, 4, None)]), (5, 5, [(5, 5, None)]), (6, 6, None)])]),
  (7, 7, [(7, 7, None)]),
  (8, 8, [(8, 8, None)]),
  (9, 9, None),
  (11, 11, [(10, 10, [(10, 10, None)]), (11, 11, None)]),
  (12, 12, None)])
DEBUG 

DEBUG    [action_state.py:502] (16,
 16,
 [(3,
   3,
   [(0, 0, [(0, 0, None)]),
    (1, 1, [(1, 1, None)]),
    (2, 2, [(2, 2, None)]),
    (3, 3, None),
    (6, 6, [(4, 4, [(4, 4, None)]), (5, 5, [(5, 5, None)]), (6, 6, None)])]),
  (7, 7, [(7, 7, None)]),
  (8, 8, [(8, 8, None)]),
  (9, 9, None),
  (11, 11, [(10, 10, [(10, 10, None)]), (11, 11, None)]),
  (12, 12, None),
  (13, 13, [(13, 13, None)]),
  (14, 14, [(14, 14, None)]),
  (15, 15, [(15, 15, None)]),
  (16, 16, [(16, 16, None)])])
DEBUG    [action_state.py:503] [(0, None), (1, (1, {'id': 16, 'label': 'Robert', 'properties': ['lemma', 'upos', 'xpos'], 'values': ['Robert', 'PROPN', 'NNP']}, [[]]))]
DEBUG    [action_state.py:516] (16, 15, 17, set())
DEBUG    [action_state.py:576] ('token stack',
 [(3,
   'minute',
   [(0, 'In', [(0, 'In', 'In')]),
    (1, 'the', [(1, 'the', 'the')]),
    (2, 'final', [(2, 'final', 'final')]),
    (3, 'minute', 'minute'),
    (6,
     'game',
     [(4, 'of', [(4, 'of', 'of')]),
      (5, 'the',

DEBUG    [action_state.py:503] []
DEBUG    [action_state.py:516] (17, 16, 12, set())
DEBUG    [action_state.py:576] ('token stack',
 [(3,
   'minute',
   [(0, 'In', [(0, 'In', 'In')]),
    (1, 'the', [(1, 'the', 'the')]),
    (2, 'final', [(2, 'final', 'final')]),
    (3, 'minute', 'minute'),
    (6,
     'game',
     [(4, 'of', [(4, 'of', 'of')]),
      (5, 'the', [(5, 'the', 'the')]),
      (6, 'game', 'game')])]),
  (7, ',', [(7, ',', ',')]),
  (8, 'Johnson', [(8, 'Johnson', 'Johnson')]),
  (9, 'had', 'had'),
  (12,
   'stolen',
   [(11, 'ball', [(10, 'the', [(10, 'the', 'the')]), (11, 'ball', 'ball')]),
    (12, 'stolen', 'stolen'),
    (17,
     'Parish',
     [(13, 'by', [(13, 'by', 'by')]),
      (14, 'Celtics', [(14, 'Celtics', 'Celtics')]),
      (15, 'center', [(15, 'center', 'center')]),
      (16, 'Robert', [(16, 'Robert', 'Robert')]),
      (17, 'Parish', 'Parish')])])])
DEBUG    [action_state.py:578] ('visited states', {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15

DEBUG    [action_state.py:578] ('visited states', {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20}, {3, 6, 11, 12, 17}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20})
DEBUG    [action_state.py:433] ('curr_node_id', 21)
DEBUG    [action_state.py:456] (21,
 [(3,
   3,
   [(0, 0, [(0, 0, None)]),
    (1, 1, [(1, 1, None)]),
    (2, 2, [(2, 2, None)]),
    (3, 3, None),
    (6, 6, [(4, 4, [(4, 4, None)]), (5, 5, [(5, 5, None)]), (6, 6, None)])]),
  (7, 7, [(7, 7, None)]),
  (8, 8, [(8, 8, None)]),
  (9, 9, None),
  (12,
   12,
   [(11, 11, [(10, 10, [(10, 10, None)]), (11, 11, None)]),
    (12, 12, None),
    (17,
     17,
     [(13, 13, [(13, 13, None)]),
      (14, 14, [(14, 14, None)]),
      (15, 15, [(15, 15, None)]),
      (16, 16, [(16, 16, None)]),
      (17, 17, None)])]),
  (18, 18, [(18, 18, None)]),
  (19, 19, [(19, 19, None)]),
  (20, 20, [(20, 20, No

DEBUG    [action_state.py:578] ('visited states', {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}, {3, 6, 11, 12, 17}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 22, 23})
DEBUG    [action_state.py:433] ('curr_node_id', 24)
DEBUG    [action_state.py:456] (24,
 [(3,
   3,
   [(0, 0, [(0, 0, None)]),
    (1, 1, [(1, 1, None)]),
    (2, 2, [(2, 2, None)]),
    (3, 3, None),
    (6, 6, [(4, 4, [(4, 4, None)]), (5, 5, [(5, 5, None)]), (6, 6, None)])]),
  (7, 7, [(7, 7, None)]),
  (8, 8, [(8, 8, None)]),
  (9, 9, None),
  (12,
   12,
   [(11, 11, [(10, 10, [(10, 10, None)]), (11, 11, None)]),
    (12, 12, None),
    (17,
     17,
     [(13, 13, [(13, 13, None)]),
      (14, 14, [(14, 14, None)]),
      (15, 15, [(15, 15, None)]),
      (16, 16, [(16, 16, None)]),
      (17, 17, None)])]),
  (18, 18, [(18, 18, None)]),
  (19, 19, [(19, 19, 

DEBUG    [action_state.py:503] [(0, None), (1, (1, {'id': 26, 'label': 'could', 'properties': ['lemma', 'upos', 'xpos'], 'values': ['could', 'AUX', 'MD']}, [[]]))]
DEBUG    [action_state.py:516] (26, 25, 28, {27, 30})
DEBUG    [action_state.py:576] ('token stack',
 [(3,
   'minute',
   [(0, 'In', [(0, 'In', 'In')]),
    (1, 'the', [(1, 'the', 'the')]),
    (2, 'final', [(2, 'final', 'final')]),
    (3, 'minute', 'minute'),
    (6,
     'game',
     [(4, 'of', [(4, 'of', 'of')]),
      (5, 'the', [(5, 'the', 'the')]),
      (6, 'game', 'game')])]),
  (7, ',', [(7, ',', ',')]),
  (8, 'Johnson', [(8, 'Johnson', 'Johnson')]),
  (9, 'had', 'had'),
  (12,
   'stolen',
   [(11, 'ball', [(10, 'the', [(10, 'the', 'the')]), (11, 'ball', 'ball')]),
    (12, 'stolen', 'stolen'),
    (17,
     'Parish',
     [(13, 'by', [(13, 'by', 'by')]),
      (14, 'Celtics', [(14, 'Celtics', 'Celtics')]),
      (15, 'center', [(15, 'center', 'center')]),
      (16, 'Robert', [(16, 'Robert', 'Robert')]),
      (

DEBUG    [action_state.py:578] ('visited states', {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28}, {3, 6, 11, 12, 17}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 22, 23, 25, 26, 27})
DEBUG    [action_state.py:433] ('curr_node_id', 29)
DEBUG    [action_state.py:456] (29,
 [(3,
   3,
   [(0, 0, [(0, 0, None)]),
    (1, 1, [(1, 1, None)]),
    (2, 2, [(2, 2, None)]),
    (3, 3, None),
    (6, 6, [(4, 4, [(4, 4, None)]), (5, 5, [(5, 5, None)]), (6, 6, None)])]),
  (7, 7, [(7, 7, None)]),
  (8, 8, [(8, 8, None)]),
  (9, 9, None),
  (12,
   12,
   [(11, 11, [(10, 10, [(10, 10, None)]), (11, 11, None)]),
    (12, 12, None),
    (17,
     17,
     [(13, 13, [(13, 13, None)]),
      (14, 14, [(14, 14, None)]),
      (15, 15, [(15, 15, None)]),
      (16, 16, [(16, 16, None)]),
      (17, 17, None)])]),

DEBUG    [action_state.py:502] (28,
 28,
 [(3,
   3,
   [(0, 0, [(0, 0, None)]),
    (1, 1, [(1, 1, None)]),
    (2, 2, [(2, 2, None)]),
    (3, 3, None),
    (6, 6, [(4, 4, [(4, 4, None)]), (5, 5, [(5, 5, None)]), (6, 6, None)])]),
  (7, 7, [(7, 7, None)]),
  (8, 8, [(8, 8, None)]),
  (9, 9, None),
  (12,
   12,
   [(11, 11, [(10, 10, [(10, 10, None)]), (11, 11, None)]),
    (12, 12, None),
    (17,
     17,
     [(13, 13, [(13, 13, None)]),
      (14, 14, [(14, 14, None)]),
      (15, 15, [(15, 15, None)]),
      (16, 16, [(16, 16, None)]),
      (17, 17, None)])]),
  (18, 18, [(18, 18, None)]),
  (19, 19, [(19, 19, None)]),
  (20, 20, [(20, 20, None)]),
  (21, 21, None),
  (22, 22, [(22, 22, None)]),
  (23, 23, [(23, 23, None)]),
  (24, 24, None),
  (28,
   28,
   [(25, 25, [(25, 25, None)]),
    (26, 26, [(26, 26, None)]),
    (27, 27, [(27, 27, None)]),
    (28, 28, None),
    (30, 30, [(29, 29, [(29, 29, None)]), (30, 30, None)])])])
DEBUG    [action_state.py:503] [(1, (5, {'id':

DEBUG    [action_state.py:502] (21,
 21,
 [(3,
   3,
   [(0, 0, [(0, 0, None)]),
    (1, 1, [(1, 1, None)]),
    (2, 2, [(2, 2, None)]),
    (3, 3, None),
    (6, 6, [(4, 4, [(4, 4, None)]), (5, 5, [(5, 5, None)]), (6, 6, None)])]),
  (7, 7, [(7, 7, None)]),
  (8, 8, [(8, 8, None)]),
  (9, 9, None),
  (12,
   12,
   [(11, 11, [(10, 10, [(10, 10, None)]), (11, 11, None)]),
    (12, 12, None),
    (17,
     17,
     [(13, 13, [(13, 13, None)]),
      (14, 14, [(14, 14, None)]),
      (15, 15, [(15, 15, None)]),
      (16, 16, [(16, 16, None)]),
      (17, 17, None)])]),
  (18, 18, [(18, 18, None)]),
  (21,
   21,
   [(19, 19, [(19, 19, None)]),
    (20, 20, [(20, 20, None)]),
    (21, 21, None),
    (24,
     24,
     [(22, 22, [(22, 22, None)]),
      (23, 23, [(23, 23, None)]),
      (24, 24, None),
      (28,
       28,
       [(25, 25, [(25, 25, None)]),
        (26, 26, [(26, 26, None)]),
        (27, 27, [(27, 27, None)]),
        (28, 28, None),
        (30, 30, [(29, 29, [(29, 29

DEBUG    [action_state.py:502] (31,
 31,
 [(3,
   3,
   [(0, 0, [(0, 0, None)]),
    (1, 1, [(1, 1, None)]),
    (2, 2, [(2, 2, None)]),
    (3, 3, None),
    (6, 6, [(4, 4, [(4, 4, None)]), (5, 5, [(5, 5, None)]), (6, 6, None)])]),
  (7, 7, [(7, 7, None)]),
  (8, 8, [(8, 8, None)]),
  (9, 9, None),
  (12,
   12,
   [(11, 11, [(10, 10, [(10, 10, None)]), (11, 11, None)]),
    (12, 12, None),
    (17,
     17,
     [(13, 13, [(13, 13, None)]),
      (14, 14, [(14, 14, None)]),
      (15, 15, [(15, 15, None)]),
      (16, 16, [(16, 16, None)]),
      (17, 17, None)])]),
  (18, 18, [(18, 18, None)]),
  (21,
   21,
   [(19, 19, [(19, 19, None)]),
    (20, 20, [(20, 20, None)]),
    (21, 21, None),
    (24,
     24,
     [(22, 22, [(22, 22, None)]),
      (23, 23, [(23, 23, None)]),
      (24, 24, None),
      (28,
       28,
       [(25, 25, [(25, 25, None)]),
        (26, 26, [(26, 26, None)]),
        (27, 27, [(27, 27, None)]),
        (28, 28, None),
        (30, 30, [(29, 29, [(29, 29

In [148]:
# Create train jsonl
if os.path.isfile(allennlp_train_output_file):
    logger.info('allennlp_train_output_file found, stop generation')
else:
    data_size = 0
    with open(allennlp_train_output_file, 'w') as wf:
        for _, dataset, mrp_json in tqdm(mrp_dataset.mrp_json_generator(
            ignore_framework_set=ignore_framework_set
        )):
            total_count += 1
            if data_size >= data_size_limit:
                break
            cid = mrp_json.get('id')
            framework = mrp_json.get('framework')
            alignment = {}
            if framework == 'amr':
                alignment = cid2alignment[cid]  
            parse_json = dataset2cid2parse_json.get(dataset, {}).get(cid, {})
            
            if parse_json:
                data_size += 1
                with_parse_count += 1
                mrp_parser_states, mrp_meta_data = mrp_json2parser_states(mrp_json, alignment)
                companion_parser_states, companion_meta_data = mrp_json2parser_states(parse_json, {})

                data_instance = {
                    'mrp_json': mrp_json,
                    'parse_json': parse_json,
                    'mrp_parser_states': mrp_parser_states,
                    'mrp_meta_data': mrp_meta_data,
                    'companion_parser_states': companion_parser_states,
                    'companion_meta_data': companion_meta_data,
                }
                json_encoded_instance = json.dumps(data_instance)
                wf.write(json_encoded_instance + '\n')

                
# Create test jsonl
if os.path.isfile(allennlp_test_output_file):
    logger.info('allennlp_test_output_file found, stop generation')
else:
    data_size = 0
    with open(allennlp_test_output_file, 'w') as wf:
        alignment = {}
        for mrp_json in tqdm(test_mrp_jsons):
            data_size += 1
            if data_size >= data_size_limit:
                break
            cid = mrp_json.get('id', '')
            framework = mrp_json.get('framework', '')
            if framework in ignore_framework_set:
                continue
            parse_json = test_parse_jsons[cid]
            companion_parser_states, companion_meta_data = mrp_json2parser_states(parse_json, {})
            data_instance = {
                'mrp_json': mrp_json,
                'parse_json': parse_json,
                'companion_parser_states': companion_parser_states,
                'companion_meta_data': companion_meta_data,
            }
            json_encoded_instance = json.dumps(data_instance)
            wf.write(json_encoded_instance + '\n')

INFO     [<ipython-input-148-ffe7de92f95e>:17] allennlp_train_output_file found, stop generation
INFO     [<ipython-input-148-ffe7de92f95e>:53] allennlp_test_output_file found, stop generation


In [146]:
mrp_meta_data[0]

'In 1989, he joined forces with fellow Merseysiders Gerry Marsden and Holly Johnson to record an updated version of "Ferry Cross the Mersey", for the Hillsborough disaster appeal fund.'

In [169]:
mrp_parser_states

[(0,
  [(0, None),
   (1,
    (1, {'id': 0, 'anchors': [{'from': 0, 'to': 2}], 'label': 'In'}, [[]]))],
  [],
  [],
  [],
  [(0, 0, [(0, 0, None)])],
  [0],
  []),
 (1,
  [(0, None),
   (1,
    (1, {'id': 1, 'anchors': [{'from': 3, 'to': 7}], 'label': '1989'}, [[]]))],
  [19],
  [],
  [29],
  [(0, 0, [(0, 0, None)]), (1, 1, [(1, 1, None)])],
  [0, 1],
  []),
 (29,
  [(1,
    (2,
     {'id': 29},
     [[{'source': 29,
        'target': 0,
        'label': 'R',
        'id': 8,
        'parent': 29,
        'child': 0}],
      [{'source': 29,
        'target': 1,
        'label': 'C',
        'id': 19,
        'parent': 29,
        'child': 1}]]))],
  [],
  [],
  [],
  [(29, 29, [(0, 0, [(0, 0, None)]), (1, 1, [(1, 1, None)])])],
  [29],
  []),
 (2,
  [(0, None),
   (1, (1, {'id': 2, 'anchors': [{'from': 7, 'to': 8}], 'label': ','}, [[]]))],
  [],
  [],
  [],
  [(29, 29, [(0, 0, [(0, 0, None)]), (1, 1, [(1, 1, None)])]),
   (2, 2, [(2, 2, None)])],
  [29, 2],
  []),
 (3,
  [(0, None),
  

In [145]:
mrp_json['input']

'Now my sorrow is comforted a little.'

### Test allennlp dataset reader

In [121]:
import torch.optim as optim

from mrp_library.dataset_readers.mrp_jsons import MRPDatasetReader
from allennlp.common.file_utils import cached_path
from allennlp.data.vocabulary import Vocabulary
from allennlp.modules.text_field_embedders import TextFieldEmbedder, BasicTextFieldEmbedder
from allennlp.modules.token_embedders import Embedding
from allennlp.modules.seq2seq_encoders import Seq2SeqEncoder, PytorchSeq2SeqWrapper
from allennlp.training.metrics import CategoricalAccuracy

from allennlp.data.iterators import BucketIterator
from allennlp.training.trainer import Trainer

In [124]:
import json
import logging
from typing import Dict

from allennlp.common.file_utils import cached_path
from allennlp.data.dataset_readers.dataset_reader import DatasetReader
from allennlp.data.fields import LabelField, TextField
from allennlp.data.instance import Instance
from allennlp.data.token_indexers import SingleIdTokenIndexer, TokenIndexer
from allennlp.data.tokenizers import Token, Tokenizer, WordTokenizer
from allennlp.models import Model
from overrides import overrides

logger = logging.getLogger(__name__)  # pylint: disable=invalid-name

UNKNOWN_POS = 'XXX'
UNKNOWN_LABEL = 'XXXXXX'

# @DatasetReader.register("mrp_json_reader")
class MRPDatasetReader(DatasetReader):
    def __init__(
            self,
            lazy: bool = False,
#             tokenizer: Tokenizer = None,
            token_indexers: Dict[str, TokenIndexer] = None,
            pos_indexers: Dict[str, TokenIndexer] = None,
            parse_feature_window_size: int = 5,
    ) -> None:
        super().__init__(lazy)
#         self._tokenizer = tokenizer or WordTokenizer()
        self._token_indexers = token_indexers or {
            "tokens": SingleIdTokenIndexer(namespace='token'),
        }
        self._pos_indexers = pos_indexers or {
            'pos': SingleIdTokenIndexer(namespace='pos'),
        }
        self.parse_feature_window_size = parse_feature_window_size

    @overrides
    def _read(self, file_path):
        with open(cached_path(file_path), "r") as data_file:
            logger.info("Reading instances from lines in file at: %s",
                        file_path)
            for line in data_file:
                line = line.strip("\n")
                if not line:
                    continue

                data_instance = json.loads(line)
                yield from self.process_mrp_graph(data_instance)

    def process_mrp_graph(self, data_instance):
        mrp_json = data_instance.get('mrp_json', {})
        parse_json = data_instance.get('parse_json', {})
        mrp_parser_states = data_instance.get('mrp_parser_states', {})
        mrp_meta_data = data_instance.get('mrp_meta_data', {})
        companion_parser_states = data_instance.get('companion_parser_states',
                                                    {})
        companion_meta_data = data_instance.get('companion_meta_data', {})

        # Generate features
        framework = mrp_json.get('framework', '')
        parse_nodes = parse_json.get('nodes', [])
        parse_node_labels = []
        parse_node_lemmas = []
        parse_node_uposs = []
        parse_node_xposs = []
        
        for node in parse_nodes:
            parse_node_labels.append(Token(node.get('label', UNKNOWN_LABEL)))
            for prop, value in zip(node.get('properties', []), node.get('values', [])):
                if prop == 'lemma':
                    parse_node_lemmas.append(Token(value))
                if prop == 'upos':
                    parse_node_uposs.append(Token(value))
                if prop == 'xpos':
                    parse_node_xposs.append(Token(value))
            
        for token_id in range(len(parse_nodes)):
            yield self.text_to_instance(token_id, parse_node_labels,
                                        parse_node_lemmas, parse_node_uposs,
                                        parse_node_xposs)

    @overrides
    def text_to_instance(self, token_id, parse_node_labels, parse_node_lemmas,
                         parse_node_uposs,
                         parse_node_xposs) -> Instance:  # type: ignore
        # pylint: disable=arguments-differ
        tokenized_curr_label = [parse_node_labels[token_id]]
        tokenized_curr_lemma = [parse_node_lemmas[token_id]]
        tokenized_curr_upos = [parse_node_uposs[token_id]]
        tokenized_curr_xpos = [parse_node_xposs[token_id]]
        
        curr_label_field = TextField(tokenized_curr_label, self._token_indexers)
        curr_lemma_field = TextField(tokenized_curr_lemma, self._token_indexers)
        curr_upos_field = TextField(tokenized_curr_upos, self._pos_indexers)
        curr_xpos_field = TextField(tokenized_curr_xpos, self._pos_indexers)
        
        instance_fields = {
            'curr_label_field': curr_label_field,
            'curr_lemma_field': curr_lemma_field,
            'curr_upos_field': curr_upos_field,
            'curr_xpos_field': curr_xpos_field,
        }
        return Instance(instance_fields)

In [126]:
reader = MRPDatasetReader()

In [127]:
train_dataset = reader.read(cached_path(allennlp_train_output_file))

0it [00:00, ?it/s]__main__ - INFO - Reading instances from lines in file at: /data/proj29_ds1/home/slai/mrp2019/allennlp-mrp-json-small-train.jsonl
2875it [00:00, 7205.88it/s]


In [128]:
test_dataset = reader.read(cached_path(allennlp_test_output_file))

0it [00:00, ?it/s]__main__ - INFO - Reading instances from lines in file at: /data/proj29_ds1/home/slai/mrp2019/allennlp-mrp-json-small-test.jsonl
1314it [00:00, 14901.57it/s]


In [129]:
vocab = Vocabulary.from_instances(train_dataset + test_dataset)

allennlp.data.vocabulary - INFO - Fitting token dictionary from dataset.
100%|██████████| 4189/4189 [00:00<00:00, 104713.87it/s]


In [165]:
vocab.print_statistics()

allennlp.data.vocabulary - INFO - Printed vocabulary statistics are only for the part of the vocabulary generated from instances. If vocabulary is constructed by extending saved vocabulary with dataset instances, the directly loaded portion won't be considered here.




----Vocabulary Statistics----


Top 10 most frequent tokens in namespace 'token':
	Token: the		Frequency: 464
	Token: ,		Frequency: 414
	Token: .		Frequency: 360
	Token: a		Frequency: 169
	Token: and		Frequency: 168
	Token: to		Frequency: 166
	Token: in		Frequency: 163
	Token: of		Frequency: 158
	Token: be		Frequency: 157
	Token: “		Frequency: 146

Top 10 longest tokens in namespace 'token':
	Token: African-American		length: 16	Frequency: 1
	Token: african-american		length: 16	Frequency: 1
	Token: mainstream-from		length: 15	Frequency: 2
	Token: insubordination		length: 15	Frequency: 2
	Token: professionally		length: 14	Frequency: 2
	Token: lowest-selling		length: 14	Frequency: 2
	Token: coast-to-coast		length: 14	Frequency: 2
	Token: collaboration		length: 13	Frequency: 2
	Token: collaborators		length: 13	Frequency: 2
	Token: Bryan-Michael		length: 13	Frequency: 2

Top 10 shortest tokens in namespace 'token':
	Token: –		length: 1	Frequency: 2
	Token: -		length: 1	Frequency: 2
	Token

In [99]:
vocab.

['default']

In [167]:
vocab.get_vocab_size('token')

1718

In [168]:
vocab.get_vocab_size('pos')

60

In [64]:
EMBEDDING_DIM = 6
HIDDEN_DIM = 6

In [65]:
token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                            embedding_dim=EMBEDDING_DIM)
word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})

In [77]:
token = Token(
    'apple',
    pos_='NNP'
)

In [78]:
token.pos_

'NNP'

In [66]:
class LstmTagger(Model):
    def __init__(self,
                 word_embeddings: TextFieldEmbedder,
                 encoder: Seq2SeqEncoder,
                 vocab: Vocabulary) -> None:
        super().__init__(vocab)
        self.word_embeddings = word_embeddings
        self.encoder = encoder
        self.hidden2tag = torch.nn.Linear(in_features=encoder.get_output_dim(),
                                          out_features=vocab.get_vocab_size('labels'))
        self.accuracy = CategoricalAccuracy()
        
    def forward(self,
                sentence: Dict[str, torch.Tensor],
                labels: torch.Tensor = None) -> Dict[str, torch.Tensor]:
        mask = get_text_field_mask(sentence)
        embeddings = self.word_embeddings(sentence)
        encoder_out = self.encoder(embeddings, mask)
        tag_logits = self.hidden2tag(encoder_out)
        output = {"tag_logits": tag_logits}
        if labels is not None:
            self.accuracy(tag_logits, labels, mask)
            output["loss"] = sequence_cross_entropy_with_logits(tag_logits, labels, mask)
        return output
    
    def get_metrics(self, reset: bool = False) -> Dict[str, float]:
        return {"accuracy": self.accuracy.get_metric(reset)}

In [67]:
lstm = PytorchSeq2SeqWrapper(torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))
model = LstmTagger(word_embeddings, lstm, vocab)

In [71]:
cuda_device = -1
optimizer = optim.SGD(model.parameters(), lr=0.1)
iterator = BucketIterator(batch_size=2, sorting_keys=[("sentence", "num_tokens")])
iterator.index_with(vocab)
trainer = Trainer(model=model,
                  optimizer=optimizer,
                  iterator=iterator,
                  train_dataset=train_dataset,
                  validation_dataset=train_dataset,
                  test_datase=test_dataset
                  patience=10,
                  num_epochs=1000,
                  cuda_device=cuda_device)

In [72]:
trainer.train()

allennlp.training.trainer - INFO - Beginning training.
allennlp.training.trainer - INFO - Epoch 0/999
allennlp.training.trainer - INFO - Peak CPU memory usage MB: 5777.16
allennlp.training.trainer - INFO - GPU 0 memory usage MB: 0
allennlp.training.trainer - INFO - GPU 1 memory usage MB: 0
allennlp.training.trainer - INFO - GPU 2 memory usage MB: 0
allennlp.training.trainer - INFO - GPU 3 memory usage MB: 0
allennlp.training.trainer - INFO - GPU 4 memory usage MB: 0
allennlp.training.trainer - INFO - GPU 5 memory usage MB: 0
allennlp.training.trainer - INFO - GPU 6 memory usage MB: 0
allennlp.training.trainer - INFO - GPU 7 memory usage MB: 0
allennlp.training.trainer - INFO - Training
  0%|          | 0/25903 [00:00<?, ?it/s]

KeyError: 'sentence'

In [None]:
dataset2cid2parse_json['wiki']['502000']

In [None]:
mrp_json

In [None]:
mrp_parser_states[0]

In [None]:
with_parse_count, total_count

In [None]:
class Resolver(nn.Module):
    def __init__(self, vocab_size,vocab_embed_dim):
        super(Resolver, self).__init__()
        self.vocab_embeds = nn.Embedding(vocab_size, vocab_embed_dim)
        self.pos_embeds = nn.Embedding(pos_size, pos_embed_dim)
        self.reset()
    
    def reset(self):
        self.curr_token_id = 0
        self.token_stack = []
        self.token_id2property_dict = {}
        
    def forward(self, token_tensor, lemma_tensor, upos_tensor, xpos_tensor):
        token_id = self.curr_token_id
        token_embeds = self.vocab_embeds(token_tensor)
        
        lemma_embeds = self.vocab_embeds(lemma_tensor)
        upos_embeds = self.pos_embeds(upos_tensor)
        xpos_embeds = self.pos_embeds(xpos_tensor)
        
        features = (token_embed, lemma_embed, upos_embed, xpos_embed)
        
        action_prob = self.choose_action(features)
        action_type = 
        
        if action_type == APPEND:
            self.token_stack.append(token)
            params = None
        elif action_type == RESOLVE:
            
            params = (num_pop, resolve_node, resolved_edges)
        else:
            params = None
        self.curr_token_id += 1
        return action_type, params
        

In [None]:
torch.LongTensor([12])

In [None]:
parse_json

In [None]:
resolver = Resolver()

In [None]:
mrp_parser_states

In [None]:
# plot parse json
cid = mrp_json.get('id')
framework = mrp_json.get('framework')

dataset_dir = os.path.join(args.project_root, args.graphviz_sub_dir, dataset)
plot_util.draw_mrp_graphviz(parse_json, dataset_dir)

logger.info(args.graphviz_file_template.format(framework, dataset, cid))
logger.info(args.parse_plot_file_template.format(dataset, cid))

In [None]:
parser_states, meta_data = mrp_json2parser_states(parse_json, {})
actions = meta_data[-1]

In [None]:
actions

In [None]:
for framework in framework2dataset2mrp_jsons:
    logger.info(framework)
    logger.info(list(framework2dataset2mrp_jsons[framework].keys()))

### Test module

In [None]:
from action_state import mrp_json2parser_states, _generate_parser_action_states

In [None]:
from action_state import sentence_spliter

In [None]:
framework, dataset = [
    ('dm', 'wsj'),
    ('psd', 'wsj'),
    ('eds', 'wsj'),
    ('ucca', 'wiki'),
    ('amr', 'wsj'),
    ('amr', 'wiki'),
][0]

mrp_jsons = framework2dataset2mrp_jsons[framework][dataset]
framework, dataset

In [None]:
mrp_json = mrp_jsons[0]

In [None]:
# mrp_json = [mrp_json for mrp_json in mrp_jsons if mrp_json.get('id') == '20209013'][0]

In [None]:
mrp_json['input']

In [None]:
alignment = {}
if framework == 'amr':
    cid = mrp_json.get('id', '')
    alignment = cid2alignment[cid]

In [None]:
logger.info(args.graphviz_file_template.format(
    framework, dataset, mrp_json.get('id')))

In [None]:
parser_states, meta_data = mrp_json2parser_states(mrp_json, alignment)

In [None]:
import allennlp
from allennlp.data import Instance
from allennlp.data.tokenizers import Tokenizer, WordTokenizer
from allennlp.modules import Seq2SeqEncoder, TimeDistributed, TextFieldEmbedder
from allennlp.data.fields import LabelField, TextField
from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer

In [None]:
wt = WordTokenizer()

In [None]:
tokenized_doc = wt.tokenize(mrp_json['input'])

In [None]:
token_indexers = {"tokens": SingleIdTokenIndexer()}

In [None]:
doc_field = TextField(tokenized_doc, token_indexers)

In [None]:
instance_fields = {'doc': doc_field}

In [None]:
instance = Instance(instance_fields)

In [None]:
doc_field.as_tensor(padding_lengths=2)

In [None]:
doc_field._indexer_name_to_indexed_token

In [None]:
tokenized_doc[1].text_id

In [None]:
allennlp.data.tokenizers.token.Token('Apple').text

In [None]:
type(tokenized_doc[0])

In [None]:
mrp_json['input']

In [None]:
(
    doc,
    nodes,
    node_id2node,
    edge_id2edge,
    top_oriented_edges,
    token_nodes,
    abstract_node_id_set,
    parent_id2indegree,
    parent_id2child_id_set,
    child_id2parent_id_set,
    child_id2edge_id_set,
    parent_id2edge_id_set,
    token_node_id_set,
    actions,
    anchor2token_id,
    parent_child_id2edge_id_set,
) = meta_data

In [None]:
assert all([edge.get('id') == i for i, edge in enumerate(top_oriented_edges)])

In [None]:
abstract_node_id_set

In [None]:
parent_id2edge_id_set

In [None]:
parent_id2child_id_set

In [None]:
# node_id2node

In [None]:
top_oriented_edges[27]

In [None]:
top_oriented_edges[5]

In [None]:
tokenized_doc

In [None]:
nodes

In [None]:
parser_node_id_set = set()
parser_edge_id_set = set()
for (node_id, actions, edge_state, abstract_node_state, 
     complete_node_state, node_state, token_stack, pending_token_stack) in parser_states:
    parser_node_id_set.add(node_id)
    for edge_id in edge_state:
        parser_edge_id_set.add(edge_id)
    
    node = node_id2node[node_id]
    node_edges = [edge_id2edge[edge_id] for edge_id in edge_state]
    pprint.pprint((
        node.get('id'),
        actions, 
        node.get('label'), 
        [edge.get('label') for edge in node_edges], 
#         abstract_node_state,
        complete_node_state,
        node_state,
        token_stack,
        pending_token_stack,
    ))
    
print({node.get('id', -1) for node in mrp_json.get('nodes')} - parser_node_id_set)
assert len(parser_node_id_set) == len(mrp_json.get('nodes'))
print({edge_id for edge_id, edge in enumerate(mrp_json.get('edges'))} - parser_edge_id_set)
assert len(parser_edge_id_set) == len(mrp_json.get('edges'))

In [None]:
action_states = [s[1] for s in parser_states]

In [None]:
token_poss = []
prev_token_pos = 0
for token in sentence_spliter(doc):
    token_poss.append((prev_token_pos, prev_token_pos + len(token)))
    prev_token_pos += len(token) + 1

In [None]:
list(sentence_spliter(doc))

In [None]:
token_poss

In [None]:
[n['anchors'] for n in nodes]

In [None]:
nodes

In [None]:
nodes = mrp_json['nodes']

In [None]:
logger.setLevel(logging.INFO)

In [None]:
num_pops = []
error_num = 0
for i, mrp_json in tqdm(enumerate(mrp_jsons)):
#     print(i)
    parser_states, meta_data = mrp_json2parser_states(mrp_json, framework, alignment)
    if not parser_states:
        logger.info(i)
        error_num += 1
        continue
    action_states = [s[1] for s in parser_states]
    for action_state in action_states:
        for action in action_state:
            action_type, arg = action
            if action_type == RESOLVE:
                num_pop = arg
                num_pops.append(num_pop)