In [115]:
try:
    __IPYTHON__
    USING_IPYTHON = True
    %load_ext autoreload
    %autoreload 2
except NameError:
    USING_IPYTHON = False

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


#### Argparse

In [314]:
import argparse
ap = argparse.ArgumentParser()
ap.add_argument('project_root', help='')
ap.add_argument('--mrp-data-dir', default='data', help='')
ap.add_argument('--mrp-test-dir', default='src/tests', help='')
ap.add_argument('--tests-fixtures-file-template', default='fixtures/{}-test.jsonl', help='')

ap.add_argument('--graphviz-sub-dir', default='visualization/graphviz', help='')
ap.add_argument('--train-sub-dir', default='training', help='')
ap.add_argument('--companion-sub-dir', default='companion')
ap.add_argument('--jamr-alignment-file', default='jamr.mrp')

ap.add_argument('--test-input-file', default='evaluation/input.mrp', help='')
ap.add_argument('--test-companion-file', default='evaluation/udpipe.mrp', help='')
ap.add_argument('--allennlp-mrp-json-file-template', default='allennlp-mrp-json-small-{}-{}.jsonl', help='')
ap.add_argument('--data-size-limit', type=int, default=1000, help='')

ap.add_argument('--mrp-file-extension', default='.mrp')
ap.add_argument('--companion-file-extension', default='.conllu')
ap.add_argument('--graphviz-file-template', default='http://localhost:8000/files/proj29_ds1/home/slai/mrp2019/visualization/graphviz/{}/{}.mrp/{}.png')
ap.add_argument('--parse-plot-file-template', default='http://localhost:8000/files/proj29_ds1/home/slai/mrp2019/visualization/graphviz/{}/{}.png')

ap.add_argument('--cuda-device', type=int, default=0)


arg_string = """
    /data/proj29_ds1/home/slai/mrp2019
"""
arguments = [arg for arg_line in arg_string.split(r'\\n') for arg in arg_line.split()]

In [315]:
if USING_IPYTHON:
    args = ap.parse_args(arguments)
else:
    args = ap.parse_args()

In [316]:
args

Namespace(allennlp_mrp_json_file_template='allennlp-mrp-json-small-{}-{}.jsonl', companion_file_extension='.conllu', companion_sub_dir='companion', cuda_device=0, data_size_limit=1000, graphviz_file_template='http://localhost:8000/files/proj29_ds1/home/slai/mrp2019/visualization/graphviz/{}/{}.mrp/{}.png', graphviz_sub_dir='visualization/graphviz', jamr_alignment_file='jamr.mrp', mrp_data_dir='data', mrp_file_extension='.mrp', mrp_test_dir='src/tests', parse_plot_file_template='http://localhost:8000/files/proj29_ds1/home/slai/mrp2019/visualization/graphviz/{}/{}.png', project_root='/data/proj29_ds1/home/slai/mrp2019', test_companion_file='evaluation/udpipe.mrp', test_input_file='evaluation/input.mrp', tests_fixtures_file_template='fixtures/{}-test.jsonl', train_sub_dir='training')

#### Library imports

In [119]:
import json
import logging
import os
import pprint
import re
import string
from collections import Counter, defaultdict, deque

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import plot_util
import torch
from action_state import mrp_json2parser_states, _generate_parser_action_states
from action_state import ERROR, APPEND, RESOLVE, IGNORE
from preprocessing import (CompanionParseDataset, MrpDataset, JamrAlignmentDataset,
                           read_companion_parse_json_file, read_mrp_json_file, parse2parse_json)            
from torch import nn
from tqdm import tqdm

#### ipython notebook specific imports

In [120]:
if USING_IPYTHON:
    # matplotlib config
    %matplotlib inline

DEBUG    [matplotlib.pyplot:219] Loaded backend module://ipykernel.pylab.backend_inline version unknown.


In [121]:
sh = logging.StreamHandler()
formatter = logging.Formatter('%(levelname)-8s [%(name)s:%(lineno)d] %(message)s')
sh.setFormatter(formatter)
logging.basicConfig(
    level=logging.DEBUG, 
    handlers=[sh]
)
mute_logger_names = ['allennlp.data.iterators.data_iterator']
for logger_name in mute_logger_names:
    logger = logging.getLogger(logger_name)  # pylint: disable=invalid-name
    logger.setLevel(logging.INFO)

logger = logging.getLogger(__name__)  # pylint: disable=invalid-name
logger.setLevel(logging.INFO)

### Constants

In [122]:
UNKWOWN = 'UNKWOWN'

### Load data

In [123]:
train_dir = os.path.join(args.project_root, args.mrp_data_dir, args.train_sub_dir)

In [124]:
mrp_dataset = MrpDataset()

In [125]:
frameworks, framework2dataset2mrp_jsons = mrp_dataset.load_mrp_json_dir(
    train_dir, args.mrp_file_extension)


frameworks:   0%|          | 0/5 [00:00<?, ?it/s][A

dataset_name:   0%|          | 0/2 [00:00<?, ?it/s][A[A

dataset_name:  50%|█████     | 1/2 [00:00<00:00,  4.67it/s][A[A

dataset_name: 100%|██████████| 2/2 [00:00<00:00,  4.89it/s][A[A
frameworks:  20%|██        | 1/5 [00:00<00:01,  2.48it/s][A

dataset_name:   0%|          | 0/1 [00:00<?, ?it/s][A[A

dataset_name: 100%|██████████| 1/1 [00:06<00:00,  6.71s/it][A[A
frameworks:  40%|████      | 2/5 [00:07<00:06,  2.30s/it][A

dataset_name:   0%|          | 0/1 [00:00<?, ?it/s][A[A

dataset_name: 100%|██████████| 1/1 [00:02<00:00,  2.69s/it][A[A
frameworks:  60%|██████    | 3/5 [00:09<00:04,  2.42s/it][A

dataset_name:   0%|          | 0/1 [00:00<?, ?it/s][A[A

dataset_name: 100%|██████████| 1/1 [00:08<00:00,  8.44s/it][A[A
frameworks:  80%|████████  | 4/5 [00:18<00:04,  4.23s/it][A

dataset_name:   0%|          | 0/14 [00:00<?, ?it/s][A[A

dataset_name:  43%|████▎     | 6/14 [00:00<00:00, 21.37it/s][A[A

da

In [126]:
framework2dataset2mrp_jsons.keys()

dict_keys(['ucca', 'psd', 'eds', 'dm', 'amr'])

### Data Preprocessing companion

In [13]:
companion_dir = os.path.join(args.project_root, args.mrp_data_dir, args.companion_sub_dir)

In [14]:
cparse_dataset = CompanionParseDataset()

In [15]:
dataset2cid2parse = cparse_dataset.load_companion_parse_dir(companion_dir, args.companion_file_extension)

INFO     [preprocessing:179] framework amr found
dataset: 100%|██████████| 13/13 [00:01<00:00, 10.32it/s]
INFO     [preprocessing:179] framework dm found
dataset: 100%|██████████| 5/5 [00:04<00:00,  1.03s/it]
INFO     [preprocessing:179] framework ucca found
dataset: 100%|██████████| 6/6 [00:00<00:00, 35.65it/s]


In [16]:
dataset2cid2parse_json = cparse_dataset.convert_parse2parse_json()

In [17]:
dataset2cid2parse.keys()

dict_keys(['amr-guidelines', 'bolt', 'cctv', 'dfa', 'dfb', 'fables', 'lorelei', 'mt09sdl', 'proxy', 'rte', 'wb', 'wiki', 'xinhua', 'wsj', 'ewt'])

In [18]:
# Some data is missing
'20003001' in dataset2cid2parse['wsj']

False

### Load JAMR alignment data

In [19]:
jalignment_dataset = JamrAlignmentDataset()

In [20]:
cid2alignment = jalignment_dataset.load_jamr_alignment_file(os.path.join(
    args.project_root,
    args.mrp_data_dir,
    args.companion_sub_dir,
    args.jamr_alignment_file
))

### Load testing data

In [21]:
test_input_filename = os.path.join(args.project_root, args.mrp_data_dir, args.test_input_file)
test_companion_filename = os.path.join(args.project_root, args.mrp_data_dir, args.test_companion_file)

In [22]:
test_mrp_jsons = read_mrp_json_file(test_input_filename)
test_parse_jsons = read_companion_parse_json_file(test_companion_filename)

In [23]:
parse_json = test_parse_jsons['102990']

In [24]:
mrp_json = framework2dataset2mrp_jsons['psd']['wsj'][1]

In [25]:
test_configs = [
    ('ucca', 'wiki', 70),
]
framework, dataset, idx = test_configs[0]

In [26]:
mrp_json = framework2dataset2mrp_jsons[framework][dataset][idx]
cid = mrp_json.get('id')

In [27]:
parse_json = dataset2cid2parse_json[dataset][cid]

In [28]:
doc = mrp_json['input']

In [29]:
doc

'In the final minute of the game, Johnson had the ball stolen by Celtics center Robert Parish, and then missed two free throws that could have won the game.'

In [30]:
token_pos = 0
anchors = []
char_pos2tokenized_parse_node_id = []

for node_id, node in enumerate(parse_json.get('nodes')):
    label = node.get('label')
    label_size = len(label)
    while doc[token_pos] == ' ':
        token_pos += 1
        char_pos2tokenized_parse_node_id.append(node_id)
    anchors.append((token_pos, token_pos + label_size))
    char_pos2tokenized_parse_node_id.extend([node_id] * (label_size))
    print(node_id, doc[token_pos: token_pos + label_size], anchors[-1], len(char_pos2tokenized_parse_node_id))
    token_pos += label_size

0 In (0, 2) 2
1 the (3, 6) 6
2 final (7, 12) 12
3 minute (13, 19) 19
4 of (20, 22) 22
5 the (23, 26) 26
6 game (27, 31) 31
7 , (31, 32) 32
8 Johnson (33, 40) 40
9 had (41, 44) 44
10 the (45, 48) 48
11 ball (49, 53) 53
12 stolen (54, 60) 60
13 by (61, 63) 63
14 Celtics (64, 71) 71
15 center (72, 78) 78
16 Robert (79, 85) 85
17 Parish (86, 92) 92
18 , (92, 93) 93
19 and (94, 97) 97
20 then (98, 102) 102
21 missed (103, 109) 109
22 two (110, 113) 113
23 free (114, 118) 118
24 throws (119, 125) 125
25 that (126, 130) 130
26 could (131, 136) 136
27 have (137, 141) 141
28 won (142, 145) 145
29 the (146, 149) 149
30 game (150, 154) 154
31 . (154, 155) 155


In [31]:
doc

'In the final minute of the game, Johnson had the ball stolen by Celtics center Robert Parish, and then missed two free throws that could have won the game.'

In [32]:
len(char_pos2tokenized_parse_node_id)

155

In [33]:
doc = mrp_json['input']

In [34]:
mrp_json['tops']

[34]

In [35]:
mrp_parser_states, mrp_meta_data = mrp_json2parser_states(
    mrp_json, 
    tokenized_parse_nodes=parse_json['nodes'],
)

In [36]:
(
    doc,
    nodes,
    node_id2node,
    edge_id2edge,
    top_oriented_edges,
    token_nodes,
    # abstract_node_id_set,
    parent_id2indegree,
    # parent_id2child_id_set,
    # child_id2parent_id_set,
    # child_id2edge_id_set,
    # parent_id2edge_id_set,
    # parent_child_id2edge_id_set,
    parse_nodes_anchors,
    char_pos2tokenized_node_id,
    curr_node_ids,
    token_states,
    actions,
) = mrp_meta_data

In [37]:
curr_node_ids = mrp_meta_data[-3]
token_states = mrp_meta_data[-2]
actions = mrp_meta_data[-1]

In [38]:
*_, curr_node_ids, token_states, actions = mrp_meta_data

In [39]:
actions[:4]

[(0, None),
 (1,
  (1,
   0,
   {'id': 0,
    'anchors': [{'from': 0, 'to': 2}],
    'label': 'In',
    'propagate_label': 'R'},
   [[]])),
 (0, None),
 (1,
  (1,
   0,
   {'id': 1,
    'anchors': [{'from': 3, 'to': 6}],
    'label': 'the',
    'propagate_label': 'E'},
   [[]]))]

In [40]:
for curr_node_id, action, token_state in zip(curr_node_ids, actions, token_states):
    action_type, params = action
#     pprint.pprint((curr_node_id, action[0]))
#     pprint.pprint(([token_group[:4] for token_group in token_state]))
    pprint.pprint((curr_node_id, action[0], [token_group[:4] for token_group in token_state]))

(0, 0, [(0, False, 'In', [])])
(1, 1, [(0, True, 'R', [(0, False, 'In', [])])])
(1, 0, [(0, True, 'R', [(0, False, 'In', [])]), (1, False, 'the', [])])
(2,
 1,
 [(0, True, 'R', [(0, False, 'In', [])]),
  (1, True, 'E', [(1, False, 'the', [])])])
(2,
 0,
 [(0, True, 'R', [(0, False, 'In', [])]),
  (1, True, 'E', [(1, False, 'the', [])]),
  (2, False, 'final', [])])
(3,
 1,
 [(0, True, 'R', [(0, False, 'In', [])]),
  (1, True, 'E', [(1, False, 'the', [])]),
  (2, True, 'E', [(2, False, 'final', [])])])
(3,
 0,
 [(0, True, 'R', [(0, False, 'In', [])]),
  (1, True, 'E', [(1, False, 'the', [])]),
  (2, True, 'E', [(2, False, 'final', [])]),
  (3, False, 'minute', [])])
(4,
 1,
 [(0, True, 'R', [(0, False, 'In', [])]),
  (1, True, 'E', [(1, False, 'the', [])]),
  (2, True, 'E', [(2, False, 'final', [])]),
  (3, True, 'C', [(3, False, 'minute', [])])])
(4,
 1,
 [(32,
   True,
   'E',
   [(0, True, 'R', [(0, False, 'In', [])]),
    (1, True, 'E', [(1, False, 'the', [])]),
    (2, True, 'E', [(

    (12, True, 'P', [(12, False, 'stolen', [])]),
    (36,
     True,
     'A',
     [(13, True, 'R', [(13, False, 'by', [])]),
      (35,
       True,
       'E',
       [(14, True, 'A', [(14, False, 'Celtics', [])]),
        (15, True, 'S', [(15, False, 'center', [])])]),
      (16, True, 'C', [(16, False, 'Robert', [])])])]),
  (18, True, 'U', [(18, False, ',', [])]),
  (19, True, 'L', [(19, False, 'and', [])]),
  (20, True, 'L', [(20, False, 'then', [])]),
  (21, False, 'missed', [])])
(22,
 1,
 [(37,
   True,
   'H',
   [(33,
     True,
     'T',
     [(32,
       True,
       'E',
       [(0, True, 'R', [(0, False, 'In', [])]),
        (1, True, 'E', [(1, False, 'the', [])]),
        (2, True, 'E', [(2, False, 'final', [])]),
        (3, True, 'C', [(3, False, 'minute', [])])]),
      (4, True, 'R', [(4, False, 'of', [])]),
      (5, True, 'E', [(5, False, 'the', [])]),
      (6, True, 'C', [(6, False, 'game', [])])]),
    (7, True, 'U', [(7, False, ',', [])]),
    (8, True, 'A',

In [41]:
for curr_node_id, action, token_state in zip(curr_node_ids, actions, [[]] + token_states):
    action_type, params = action
    pprint.pprint((curr_node_id, action[0], [token_group[:4] for token_group in token_state]))

(0, 0, [])
(1, 1, [(0, False, 'In', [])])
(1, 0, [(0, True, 'R', [(0, False, 'In', [])])])
(2, 1, [(0, True, 'R', [(0, False, 'In', [])]), (1, False, 'the', [])])
(2,
 0,
 [(0, True, 'R', [(0, False, 'In', [])]),
  (1, True, 'E', [(1, False, 'the', [])])])
(3,
 1,
 [(0, True, 'R', [(0, False, 'In', [])]),
  (1, True, 'E', [(1, False, 'the', [])]),
  (2, False, 'final', [])])
(3,
 0,
 [(0, True, 'R', [(0, False, 'In', [])]),
  (1, True, 'E', [(1, False, 'the', [])]),
  (2, True, 'E', [(2, False, 'final', [])])])
(4,
 1,
 [(0, True, 'R', [(0, False, 'In', [])]),
  (1, True, 'E', [(1, False, 'the', [])]),
  (2, True, 'E', [(2, False, 'final', [])]),
  (3, False, 'minute', [])])
(4,
 1,
 [(0, True, 'R', [(0, False, 'In', [])]),
  (1, True, 'E', [(1, False, 'the', [])]),
  (2, True, 'E', [(2, False, 'final', [])]),
  (3, True, 'C', [(3, False, 'minute', [])])])
(4,
 0,
 [(32,
   True,
   'E',
   [(0, True, 'R', [(0, False, 'In', [])]),
    (1, True, 'E', [(1, False, 'the', [])]),
    (2, Tr

      (11, True, 'C', [(11, False, 'ball', [])])]),
    (12, True, 'P', [(12, False, 'stolen', [])]),
    (36,
     True,
     'A',
     [(13, True, 'R', [(13, False, 'by', [])]),
      (35,
       True,
       'E',
       [(14, True, 'A', [(14, False, 'Celtics', [])]),
        (15, True, 'S', [(15, False, 'center', [])])]),
      (16, True, 'C', [(16, False, 'Robert', [])])])]),
  (18, True, 'U', [(18, False, ',', [])]),
  (19, True, 'L', [(19, False, 'and', [])]),
  (20, True, 'L', [(20, False, 'then', [])]),
  (21, True, 'D', [(21, False, 'missed', [])]),
  (22, True, 'D', [(22, False, 'two', [])]),
  (23, True, 'D', [(23, False, 'free', [])])])
(25,
 1,
 [(37,
   True,
   'H',
   [(33,
     True,
     'T',
     [(32,
       True,
       'E',
       [(0, True, 'R', [(0, False, 'In', [])]),
        (1, True, 'E', [(1, False, 'the', [])]),
        (2, True, 'E', [(2, False, 'final', [])]),
        (3, True, 'C', [(3, False, 'minute', [])])]),
      (4, True, 'R', [(4, False, 'of', [])

In [42]:
actions

[(0, None),
 (1,
  (1,
   0,
   {'id': 0,
    'anchors': [{'from': 0, 'to': 2}],
    'label': 'In',
    'propagate_label': 'R'},
   [[]])),
 (0, None),
 (1,
  (1,
   0,
   {'id': 1,
    'anchors': [{'from': 3, 'to': 6}],
    'label': 'the',
    'propagate_label': 'E'},
   [[]])),
 (0, None),
 (1,
  (1,
   0,
   {'id': 2,
    'anchors': [{'from': 7, 'to': 12}],
    'label': 'final',
    'propagate_label': 'E'},
   [[]])),
 (0, None),
 (1,
  (1,
   0,
   {'id': 3,
    'anchors': [{'from': 13, 'to': 19}],
    'label': 'minute',
    'propagate_label': 'C'},
   [[]])),
 (1,
  (4,
   3,
   {'id': 31, 'propagate_label': 'E'},
   [[{'source': 31,
      'target': 0,
      'label': 'R',
      'id': 31,
      'parent': 31,
      'child': 0}],
    [{'source': 31,
      'target': 1,
      'label': 'E',
      'id': 28,
      'parent': 31,
      'child': 1}],
    [{'source': 31,
      'target': 2,
      'label': 'E',
      'id': 21,
      'parent': 31,
      'child': 2}],
    [{'source': 31,
      't

In [43]:
token_states[1]

[(0, True, 'R', [(0, False, 'In', [])])]

In [44]:
[n['label'] for n in parse_json['nodes']]

['In',
 'the',
 'final',
 'minute',
 'of',
 'the',
 'game',
 ',',
 'Johnson',
 'had',
 'the',
 'ball',
 'stolen',
 'by',
 'Celtics',
 'center',
 'Robert',
 'Parish',
 ',',
 'and',
 'then',
 'missed',
 'two',
 'free',
 'throws',
 'that',
 'could',
 'have',
 'won',
 'the',
 'game',
 '.']

In [45]:
token_states[-1]

[(42,
  True,
  '<UCCA-TOP-NODE>',
  [(37,
    True,
    'H',
    [(33,
      True,
      'T',
      [(32,
        True,
        'E',
        [(0, True, 'R', [(0, False, 'In', [])]),
         (1, True, 'E', [(1, False, 'the', [])]),
         (2, True, 'E', [(2, False, 'final', [])]),
         (3, True, 'C', [(3, False, 'minute', [])])]),
       (4, True, 'R', [(4, False, 'of', [])]),
       (5, True, 'E', [(5, False, 'the', [])]),
       (6, True, 'C', [(6, False, 'game', [])])]),
     (7, True, 'U', [(7, False, ',', [])]),
     (8, True, 'A', [(8, False, 'Johnson', [])]),
     (9, True, 'F', [(9, False, 'had', [])]),
     (34,
      True,
      'A',
      [(10, True, 'E', [(10, False, 'the', [])]),
       (11, True, 'C', [(11, False, 'ball', [])])]),
     (12, True, 'P', [(12, False, 'stolen', [])]),
     (36,
      True,
      'A',
      [(13, True, 'R', [(13, False, 'by', [])]),
       (35,
        True,
        'E',
        [(14, True, 'A', [(14, False, 'Celtics', [])]),
         (

In [46]:
companion_parser_states, companion_meta_data = mrp_json2parser_states(
    parse_json,
    mrp_doc=doc,
    tokenized_parse_nodes=parse_json['nodes'],
)

In [47]:
logger.info(args.graphviz_file_template.format(
    framework, dataset, cid))

INFO     [__main__:2] http://localhost:8000/files/proj29_ds1/home/slai/mrp2019/visualization/graphviz/ucca/wiki.mrp/470004.png


In [48]:
mrp_json['input']

'In the final minute of the game, Johnson had the ball stolen by Celtics center Robert Parish, and then missed two free throws that could have won the game.'

In [49]:
mrp_parser_states

[(0,
  [(0, None),
   (1,
    (1,
     0,
     {'id': 0,
      'anchors': [{'from': 0, 'to': 2}],
      'label': 'In',
      'propagate_label': 'R'},
     [[]]))],
  [],
  [],
  [],
  [(0, 0, [(0, 0, None)])],
  [(0, True, 'R', [(0, False, 'In', 'In')])]),
 (1,
  [(0, None),
   (1,
    (1,
     0,
     {'id': 1,
      'anchors': [{'from': 3, 'to': 6}],
      'label': 'the',
      'propagate_label': 'E'},
     [[]]))],
  [],
  [],
  [],
  [(0, 0, [(0, 0, None)]), (1, 1, [(1, 1, None)])],
  [(0, True, 'R', [(0, False, 'In', 'In')]),
   (1, True, 'E', [(1, False, 'the', 'the')])]),
 (2,
  [(0, None),
   (1,
    (1,
     0,
     {'id': 2,
      'anchors': [{'from': 7, 'to': 12}],
      'label': 'final',
      'propagate_label': 'E'},
     [[]]))],
  [],
  [],
  [],
  [(0, 0, [(0, 0, None)]), (1, 1, [(1, 1, None)]), (2, 2, [(2, 2, None)])],
  [(0, True, 'R', [(0, False, 'In', 'In')]),
   (1, True, 'E', [(1, False, 'the', 'the')]),
   (2, True, 'E', [(2, False, 'final', 'final')])]),
 (3,
  

In [50]:
[(node['id'], node.get('label')) for node in mrp_json['nodes']]

[(0, 'In'),
 (1, 'the'),
 (2, 'final'),
 (3, 'minute'),
 (4, 'of'),
 (5, 'the'),
 (6, 'game'),
 (7, ','),
 (8, 'Johnson'),
 (9, 'had'),
 (10, 'the'),
 (11, 'ball'),
 (12, 'stolen'),
 (13, 'by'),
 (14, 'Celtics'),
 (15, 'center'),
 (16, 'RobertParish'),
 (17, ','),
 (18, 'and'),
 (19, 'then'),
 (20, 'missed'),
 (21, 'two'),
 (22, 'free'),
 (23, 'throws'),
 (24, 'that'),
 (25, 'could'),
 (26, 'have'),
 (27, 'won'),
 (28, 'the'),
 (29, 'game'),
 (30, '.'),
 (31, None),
 (32, None),
 (33, None),
 (34, None),
 (35, None),
 (36, None),
 (37, None),
 (38, None),
 (39, None),
 (40, None),
 (41, None)]

In [51]:
doc

'In the final minute of the game, Johnson had the ball stolen by Celtics center Robert Parish, and then missed two free throws that could have won the game.'

In [52]:
parse_json['nodes']

[{'id': 0,
  'label': 'In',
  'properties': ['lemma', 'upos', 'xpos'],
  'values': ['in', 'ADP', 'IN']},
 {'id': 1,
  'label': 'the',
  'properties': ['lemma', 'upos', 'xpos'],
  'values': ['the', 'DET', 'DT']},
 {'id': 2,
  'label': 'final',
  'properties': ['lemma', 'upos', 'xpos'],
  'values': ['final', 'ADJ', 'JJ']},
 {'id': 3,
  'label': 'minute',
  'properties': ['lemma', 'upos', 'xpos'],
  'values': ['minute', 'NOUN', 'NN']},
 {'id': 4,
  'label': 'of',
  'properties': ['lemma', 'upos', 'xpos'],
  'values': ['of', 'ADP', 'IN']},
 {'id': 5,
  'label': 'the',
  'properties': ['lemma', 'upos', 'xpos'],
  'values': ['the', 'DET', 'DT']},
 {'id': 6,
  'label': 'game',
  'properties': ['lemma', 'upos', 'xpos'],
  'values': ['game', 'NOUN', 'NN']},
 {'id': 7,
  'label': ',',
  'properties': ['lemma', 'upos', 'xpos'],
  'values': [',', 'PUNCT', ',']},
 {'id': 8,
  'label': 'Johnson',
  'properties': ['lemma', 'upos', 'xpos'],
  'values': ['Johnson', 'PROPN', 'NNP']},
 {'id': 9,
  'label

In [53]:
[(node['id'], node['label']) for node in parse_json['nodes']]

[(0, 'In'),
 (1, 'the'),
 (2, 'final'),
 (3, 'minute'),
 (4, 'of'),
 (5, 'the'),
 (6, 'game'),
 (7, ','),
 (8, 'Johnson'),
 (9, 'had'),
 (10, 'the'),
 (11, 'ball'),
 (12, 'stolen'),
 (13, 'by'),
 (14, 'Celtics'),
 (15, 'center'),
 (16, 'Robert'),
 (17, 'Parish'),
 (18, ','),
 (19, 'and'),
 (20, 'then'),
 (21, 'missed'),
 (22, 'two'),
 (23, 'free'),
 (24, 'throws'),
 (25, 'that'),
 (26, 'could'),
 (27, 'have'),
 (28, 'won'),
 (29, 'the'),
 (30, 'game'),
 (31, '.')]

In [54]:
anchors

[(0, 2),
 (3, 6),
 (7, 12),
 (13, 19),
 (20, 22),
 (23, 26),
 (27, 31),
 (31, 32),
 (33, 40),
 (41, 44),
 (45, 48),
 (49, 53),
 (54, 60),
 (61, 63),
 (64, 71),
 (72, 78),
 (79, 85),
 (86, 92),
 (92, 93),
 (94, 97),
 (98, 102),
 (103, 109),
 (110, 113),
 (114, 118),
 (119, 125),
 (126, 130),
 (131, 136),
 (137, 141),
 (142, 145),
 (146, 149),
 (150, 154),
 (154, 155)]

### Create training instance

In [56]:
# framework = 'ucca'
# ignore_framework_set = {'amr', 'dm', 'psd', 'eds'}
# dataset = 'wiki'
# ignore_dataset_set = {}

# framework = 'dm'
# ignore_framework_set = {'amr', 'ucca', 'psd', 'eds'}
# dataset = 'wsj'
# ignore_dataset_set = {}

framework = 'ucca'
ignore_framework_set = {'amr', 'psd', 'eds'}
dataset = 'wiki'
ignore_dataset_set = {}

In [57]:
frameworks

['ucca', 'psd', 'eds', 'dm', 'amr']

In [58]:
framework_names = '-'.join([
    framework 
    for framework in frameworks 
    if framework not in ignore_framework_set
])
framework_names

'ucca-dm'

In [59]:
allennlp_tests_fixtures_output_file = os.path.join(
    args.project_root, args.mrp_test_dir, args.tests_fixtures_file_template.format(framework_names))

allennlp_framework_train_output_file = os.path.join(
    args.project_root, args.allennlp_mrp_json_file_template.format(framework_names, 'train'))

allennlp_framework_test_output_file = os.path.join(
    args.project_root, args.allennlp_mrp_json_file_template.format(framework_names, 'test'))

In [60]:
# Create tests fixture jsonl
fixture_combinations = [
    ('ucca', 'wiki', 70),
    ('dm', 'wsj', 3)
] * 5

with open(allennlp_tests_fixtures_output_file, 'w') as wf:
    for framework, dataset, idx in fixture_combinations:
        mrp_json = framework2dataset2mrp_jsons[framework][dataset][idx]
        cid = mrp_json.get('id')
        doc = mrp_json.get('input')
        
        alignment = {}
        if framework == 'amr':
            alignment = cid2alignment[cid]  
        parse_json = dataset2cid2parse_json.get(dataset, {}).get(cid, {})

        if parse_json:
            with_parse_count += 1
            mrp_parser_states, mrp_meta_data = mrp_json2parser_states(
                mrp_json, 
                tokenized_parse_nodes=parse_json['nodes'],
                alignment=alignment,
            )
            companion_parser_states, companion_meta_data = mrp_json2parser_states(
                parse_json, 
                mrp_doc=doc,
                tokenized_parse_nodes=parse_json['nodes'],
            )

            data_instance = {
                'mrp_json': mrp_json,
                'parse_json': parse_json,
                'mrp_parser_states': mrp_parser_states,
                'mrp_meta_data': mrp_meta_data,
                'companion_parser_states': companion_parser_states,
                'companion_meta_data': companion_meta_data,
            }
            json_encoded_instance = json.dumps(data_instance)
            wf.write(json_encoded_instance + '\n')

In [61]:
for idx in range(20):
    mrp_json = framework2dataset2mrp_jsons[framework][dataset][idx]
    cid = mrp_json.get('id')
    if cid in dataset2cid2parse[dataset]:
        print(idx)

3
8
13
18


In [62]:
with_parse_count

10

In [63]:
[state[-1] for state in mrp_parser_states]

[[(0, True, 'the', [(0, False, 'the', 'The')])],
 [(0, True, 'the', [(0, False, 'the', 'The')]),
  (1, True, 'asbestos', [(1, False, 'asbestos', 'asbestos')])],
 [(0, True, 'the', [(0, False, 'the', 'The')]),
  (1, True, 'asbestos', [(1, False, 'asbestos', 'asbestos')]),
  (2, False, 'fiber', 'fiber')],
 [(0, True, 'the', [(0, False, 'the', 'The')]),
  (1, True, 'asbestos', [(1, False, 'asbestos', 'asbestos')]),
  (2, False, 'fiber', 'fiber'),
  (4, True, 'crocidolite', [(4, False, 'crocidolite', 'crocidolite')])],
 [(2,
   True,
   'fiber',
   [(0, True, 'the', [(0, False, 'the', 'The')]),
    (1, True, 'asbestos', [(1, False, 'asbestos', 'asbestos')]),
    (2, False, 'fiber', 'fiber'),
    (4, True, 'crocidolite', [(4, False, 'crocidolite', 'crocidolite')])])],
 [(2,
   True,
   'fiber',
   [(0, True, 'the', [(0, False, 'the', 'The')]),
    (1, True, 'asbestos', [(1, False, 'asbestos', 'asbestos')]),
    (2, False, 'fiber', 'fiber'),
    (4, True, 'crocidolite', [(4, False, 'crocidol

In [64]:
mrp_meta_data[-1]

[(0, None),
 (1,
  (1,
   0,
   {'id': 0,
    'label': 'the',
    'properties': ['pos', 'frame'],
    'values': ['DT', 'q:i-h-h'],
    'anchors': [{'from': 0, 'to': 3}]},
   [[]])),
 (0, None),
 (1,
  (1,
   0,
   {'id': 1,
    'label': 'asbestos',
    'properties': ['pos', 'frame'],
    'values': ['NN', 'n:x'],
    'anchors': [{'from': 4, 'to': 12}]},
   [[]])),
 (0, None),
 (2, None),
 (0, None),
 (1,
  (1,
   0,
   {'id': 4,
    'label': 'crocidolite',
    'properties': ['pos', 'frame'],
    'values': ['NN', 'n:x'],
    'anchors': [{'from': 20, 'to': 31}]},
   [[]])),
 (1,
  (4,
   2,
   {'id': 2,
    'label': 'fiber',
    'properties': ['pos', 'frame'],
    'values': ['NN', 'n:x'],
    'anchors': [{'from': 13, 'to': 18}]},
   [[{'source': 0,
      'target': 2,
      'label': 'BV',
      'id': 2,
      'parent': 2,
      'child': 0}],
    [{'source': 1,
      'target': 2,
      'label': 'compound',
      'id': 1,
      'parent': 2,
      'child': 1}],
    [],
    [{'source': 4,
    

In [65]:
doc

'The asbestos fiber, crocidolite, is unusually resilient once it enters the lungs, with even brief exposures to it causing symptoms that show up decades later, researchers said.'

In [66]:
parse_json

{'id': '20003002',
 'tops': [30],
 'nodes': [{'id': 0,
   'label': 'The',
   'properties': ['lemma', 'upos', 'xpos'],
   'values': ['the', 'DET', 'DT']},
  {'id': 1,
   'label': 'asbestos',
   'properties': ['lemma', 'upos', 'xpos'],
   'values': ['asbestos', 'NOUN', 'NN']},
  {'id': 2,
   'label': 'fiber',
   'properties': ['lemma', 'upos', 'xpos'],
   'values': ['fiber', 'NOUN', 'NN']},
  {'id': 3,
   'label': ',',
   'properties': ['lemma', 'upos', 'xpos'],
   'values': [',', 'PUNCT', ',']},
  {'id': 4,
   'label': 'crocidolite',
   'properties': ['lemma', 'upos', 'xpos'],
   'values': ['crocidolite', 'NOUN', 'NN']},
  {'id': 5,
   'label': ',',
   'properties': ['lemma', 'upos', 'xpos'],
   'values': [',', 'PUNCT', ',']},
  {'id': 6,
   'label': 'is',
   'properties': ['lemma', 'upos', 'xpos'],
   'values': ['be', 'VERB', 'VBZ']},
  {'id': 7,
   'label': 'unusually',
   'properties': ['lemma', 'upos', 'xpos'],
   'values': ['unusually', 'ADV', 'RB']},
  {'id': 8,
   'label': 'resil

In [67]:
[n['values'][2] for n in parse_json['nodes']]

['DT',
 'NN',
 'NN',
 ',',
 'NN',
 ',',
 'VBZ',
 'RB',
 'JJ',
 'IN',
 'PRP',
 'VBZ',
 'DT',
 'NNS',
 ',',
 'IN',
 'RB',
 'JJ',
 'NNS',
 'TO',
 'PRP',
 'VBG',
 'NNS',
 'WDT',
 'VBP',
 'RP',
 'NNS',
 'RB',
 ',',
 'NNS',
 'VBD',
 '.']

In [127]:
# Create train jsonl
if os.path.isfile(allennlp_framework_train_output_file) and os.path.isfile(
    allennlp_framework_train_output_file):
    logger.info('allennlp_train_output_file found, stop generation')
else:
    pass
if 1==1:
    data_size = 0
    with open(allennlp_framework_train_output_file, 'w') as train_wf:
        with open(allennlp_framework_test_output_file, 'w') as test_wf:
            for _, dataset, idx, mrp_json in tqdm(mrp_dataset.mrp_json_generator(
                ignore_framework_set=ignore_framework_set,
                ignore_dataset_set=ignore_dataset_set,
                data_size_limit=args.data_size_limit * 2
            )):
                total_count += 1
                cid = mrp_json.get('id')
                doc = mrp_json.get('input')

                framework = mrp_json.get('framework')
                alignment = {}
                if framework == 'amr':
                    alignment = cid2alignment[cid]  
                parse_json = dataset2cid2parse_json.get(dataset, {}).get(cid, {})

                if parse_json:
                    mrp_parser_states, mrp_meta_data = mrp_json2parser_states(
                        mrp_json, 
                        tokenized_parse_nodes=parse_json['nodes'],
                        alignment=alignment,
                    )
                    companion_parser_states, companion_meta_data = mrp_json2parser_states(
                        parse_json, 
                        mrp_doc=doc,
                        tokenized_parse_nodes=parse_json['nodes'],
                    )

                    # Continue if error
                    if not mrp_parser_states:
                        continue

                    data_instance = {
                        'mrp_json': mrp_json,
                        'parse_json': parse_json,
                        'mrp_parser_states': mrp_parser_states,
                        'mrp_meta_data': mrp_meta_data,
                        'companion_parser_states': companion_parser_states,
                        'companion_meta_data': companion_meta_data,
                    }
                    json_encoded_instance = json.dumps(data_instance)
                    if idx <= data_size_limit:
                        train_wf.write(json_encoded_instance + '\n')
                    else:
                        test_wf.write(json_encoded_instance + '\n')

INFO     [__main__:4] allennlp_train_output_file found, stop generation

0it [00:00, ?it/s][A
1it [00:00,  9.29it/s][A
2it [00:00,  7.43it/s][A
3it [00:00,  4.00it/s][A

6it [00:01,  4.70it/s][A
8it [00:01,  5.90it/s][A
9it [00:01,  6.26it/s][A
10it [00:01,  5.09it/s][A
12it [00:02,  4.28it/s][A

17it [00:03,  4.84it/s][A
19it [00:03,  6.13it/s][A
21it [00:03,  7.46it/s][A
23it [00:04,  6.19it/s][A
25it [00:04,  7.65it/s][A
27it [00:04,  8.08it/s][A
29it [00:05,  5.01it/s][A
31it [00:05,  6.43it/s][A
33it [00:05,  7.03it/s][A
35it [00:06,  5.06it/s][A
38it [00:06,  6.59it/s][A
40it [00:06,  7.22it/s][A
42it [00:06,  6.53it/s][A
43it [00:07,  2.32it/s][A

47it [00:08,  3.38it/s][A
49it [00:08,  3.99it/s][A
50it [00:09,  4.12it/s][A
51it [00:09,  4.13it/s][A

55it [00:09,  6.47it/s][A
58it [00:09,  8.18it/s][A
60it [00:10,  8.72it/s][A
62it [00:11,  4.01it/s][A
64it [00:11,  5.07it/s][A
66it [00:11,  5.60it/s][A
67it [00:11,  4.03it/s][A
68it [00:12,  3.

352it [01:20,  2.51it/s][A

356it [01:20,  3.94it/s][A
357it [01:20,  4.61it/s][A
358it [01:20,  4.74it/s][A
360it [01:21,  5.11it/s][A
362it [01:22,  3.52it/s][A
363it [01:22,  4.16it/s][A
364it [01:22,  4.22it/s][A

367it [01:22,  6.18it/s][A
369it [01:23,  6.73it/s][A
370it [01:23,  4.14it/s][A
371it [01:24,  3.01it/s][A
373it [01:24,  3.39it/s][A
374it [01:24,  4.06it/s][A
375it [01:25,  3.29it/s][A
377it [01:25,  4.07it/s][A
378it [01:25,  3.91it/s][A
379it [01:25,  4.54it/s][A
380it [01:25,  5.43it/s][A
381it [01:26,  5.36it/s][A
382it [01:26,  6.00it/s][A
383it [01:26,  4.79it/s][A
385it [01:26,  5.30it/s][A
386it [01:26,  5.68it/s][A

390it [01:27,  8.24it/s][A
392it [01:27,  8.43it/s][A
394it [01:27,  9.04it/s][A
396it [01:27,  8.46it/s][A
398it [01:28,  4.34it/s][A
399it [01:29,  3.36it/s][A

403it [01:29,  5.05it/s][A

408it [01:29,  7.27it/s][A
410it [01:31,  3.69it/s][A
412it [01:31,  3.87it/s][A
413it [01:31,  4.74it/s][A
415it [01:31,  


701it [02:35,  5.22it/s][A
702it [02:35,  4.48it/s][A
704it [02:36,  5.35it/s][A
705it [02:36,  4.24it/s][A
706it [02:36,  4.76it/s][A
707it [02:36,  5.43it/s][A
708it [02:43,  2.13s/it][A
709it [02:43,  1.52s/it][A
711it [02:43,  1.09s/it][A
712it [02:43,  1.24it/s][A
714it [02:43,  1.69it/s][A
716it [02:44,  2.22it/s][A


723it [02:45,  3.60it/s][A
725it [02:45,  4.49it/s][A
727it [02:45,  4.77it/s][A
728it [02:45,  4.59it/s][A
730it [02:46,  5.87it/s][A
732it [02:48,  2.22it/s][A
733it [02:48,  2.51it/s][A
734it [02:48,  2.87it/s][A
735it [02:48,  3.47it/s][A
736it [02:49,  3.60it/s][A
738it [02:49,  4.50it/s][A
739it [02:49,  5.03it/s][A
740it [02:49,  4.16it/s][A
741it [02:50,  4.46it/s][A
742it [02:50,  4.00it/s][A
743it [02:50,  4.45it/s][A
744it [02:50,  4.97it/s][A
745it [02:50,  4.41it/s][A
746it [02:51,  5.17it/s][A
747it [02:51,  5.17it/s][A
748it [02:51,  5.10it/s][A
750it [02:51,  5.06it/s][A
751it [02:52,  5.66it/s][A
753it [02:52,  6.

1056it [03:57,  4.17it/s][A
1058it [03:57,  5.19it/s][A
1061it [03:57,  6.46it/s][A

1065it [03:58,  8.25it/s][A
1067it [03:58,  6.42it/s][A
1071it [03:58,  8.40it/s][A
1073it [03:58,  8.88it/s][A
1075it [03:59,  6.56it/s][A
1077it [03:59,  5.99it/s][A
1079it [03:59,  7.21it/s][A
1081it [04:00,  7.25it/s][A
1082it [04:00,  7.89it/s][A
1083it [04:00,  7.83it/s][A
1084it [04:00,  7.45it/s][A
1085it [04:00,  6.33it/s][A
1086it [04:00,  6.89it/s][A
1087it [04:01,  6.35it/s][A
1088it [04:01,  5.63it/s][A
1089it [04:01,  6.43it/s][A
1090it [04:01,  5.14it/s][A


1098it [04:02,  4.77it/s][A
1100it [04:03,  5.97it/s][A
1102it [04:04,  3.36it/s][A
1103it [04:04,  3.00it/s][A
1104it [04:05,  1.80it/s][A
1105it [04:05,  2.10it/s][A
1106it [04:06,  2.43it/s][A
1107it [04:06,  2.30it/s][A
1109it [04:06,  3.08it/s][A
1110it [04:07,  2.88it/s][A
1111it [04:07,  3.00it/s][A

1115it [04:07,  4.64it/s][A
1117it [04:08,  5.90it/s][A
1119it [04:08,  7.19it/s][A
1122it [04

1401it [05:34,  7.22it/s][A
1403it [05:36,  2.82it/s][A

1406it [05:37,  2.47it/s][A
1407it [05:37,  3.00it/s][A
1409it [05:37,  3.98it/s][A
1411it [05:37,  4.87it/s][A
1413it [05:38,  5.06it/s][A
1414it [05:38,  5.68it/s][A
1416it [05:38,  5.76it/s][A

1420it [05:39,  7.88it/s][A
1422it [05:39,  7.68it/s][A
1424it [05:40,  4.43it/s][A
1426it [05:40,  5.23it/s][A
1427it [05:41,  2.13it/s][A
1428it [05:41,  2.79it/s][A
1430it [05:41,  3.67it/s][A
1432it [05:41,  4.75it/s][A
1434it [05:42,  4.26it/s][A
1437it [05:43,  4.60it/s][A
1438it [05:43,  4.03it/s][A
1439it [05:43,  4.62it/s][A
1440it [05:43,  5.47it/s][A
1441it [05:43,  5.84it/s][A

1444it [05:44,  4.38it/s][A
1445it [05:44,  4.41it/s][A
1446it [05:45,  2.68it/s][A
1448it [05:45,  3.38it/s][A
1450it [05:46,  3.64it/s][A
1453it [05:46,  4.59it/s][A
1454it [05:46,  2.85it/s][A
1455it [05:48,  1.52it/s][A
1457it [05:48,  1.94it/s][A
1458it [05:48,  2.35it/s][A
1459it [05:49,  3.01it/s][A
1460it [05:




4677it [07:03, 29.65it/s][A

4721it [07:04, 39.06it/s][A


4765it [07:05, 53.56it/s][A
4776it [07:05, 53.74it/s][A
4785it [07:05, 47.64it/s][A

4814it [07:06, 47.22it/s][A
4820it [07:06, 37.20it/s][A
4825it [07:06, 38.28it/s][A
4835it [07:07, 30.77it/s][A


4863it [07:07, 36.60it/s][A
4881it [07:07, 43.97it/s][A

4901it [07:08, 52.04it/s][A
4908it [07:08, 53.58it/s][A
4916it [07:08, 45.02it/s][A
4924it [07:08, 39.93it/s][A
4937it [07:09, 42.62it/s][A
4942it [07:09, 28.00it/s][A

4957it [07:09, 33.81it/s][A

4990it [07:10, 42.22it/s][A
4996it [07:10, 27.96it/s][A
5005it [07:10, 34.15it/s][A
5013it [07:11, 32.49it/s][A
5023it [07:11, 37.77it/s][A
5028it [07:11, 36.67it/s][A


5053it [07:12, 40.47it/s][A
5068it [07:12, 51.74it/s][A

5090it [07:12, 49.87it/s][A

5110it [07:13, 48.49it/s][A
5129it [07:13, 57.82it/s][A
5139it [07:13, 56.35it/s][A

5167it [07:14, 44.11it/s][A

5190it [07:14, 55.27it/s][A
5205it [07:14, 66.41it/s][A
5217it [07:14, 74.02it/s]

### Test allennlp dataset reader

In [437]:
import torch.optim as optim

from mrp_library.dataset_readers.mrp_jsons import MRPDatasetReader
from allennlp.common.file_utils import cached_path
from allennlp.data.vocabulary import Vocabulary
from allennlp.modules.text_field_embedders import TextFieldEmbedder, BasicTextFieldEmbedder
from allennlp.modules.token_embedders.embedding import Embedding
from allennlp.modules.seq2seq_encoders import Seq2SeqEncoder, PytorchSeq2SeqWrapper
from allennlp.modules.feedforward import FeedForward

from allennlp.training.metrics import CategoricalAccuracy

from allennlp.data.iterators import BucketIterator
from allennlp.training.trainer import Trainer

import json
import logging
from typing import Dict

from allennlp.common.file_utils import cached_path
from allennlp.data.dataset_readers.dataset_reader import DatasetReader
from allennlp.data.fields import LabelField, TextField
from allennlp.data.instance import Instance
from allennlp.data.token_indexers import SingleIdTokenIndexer, TokenIndexer
from allennlp.data.tokenizers import Token, Tokenizer, WordTokenizer
from allennlp.models import Model
from overrides import overrides

In [438]:
from mrp_library.dataset_readers.mrp_jsons_actions import MRPDatasetActionReader

In [439]:
reader = MRPDatasetActionReader()

In [440]:
train_dataset = reader.read(cached_path(allennlp_framework_train_output_file))




0it [00:00, ?it/s][A[A[AINFO     [mrp_library.dataset_readers.mrp_jsons_actions:113] Reading instances from lines in file at: /data/proj29_ds1/home/slai/mrp2019/allennlp-mrp-json-small-ucca-dm-train.jsonl



655it [00:00, 6489.82it/s][A[A[A


1372it [00:00, 6679.74it/s][A[A[A


1920it [00:00, 6266.40it/s][A[A[A


2552it [00:00, 6281.05it/s][A[A[A


3135it [00:00, 6110.23it/s][A[A[A


3841it [00:00, 6366.53it/s][A[A[A


4407it [00:00, 6006.38it/s][A[A[A


5017it [00:00, 6029.04it/s][A[A[A


5738it [00:00, 6340.27it/s][A[A[A


6355it [00:01, 6244.92it/s][A[A[A


7011it [00:01, 6335.68it/s][A[A[A


7765it [00:01, 6639.40it/s][A[A[A


8484it [00:01, 6747.31it/s][A[A[A


9164it [00:01, 6737.53it/s][A[A[A


9903it [00:01, 6894.20it/s][A[A[A


10653it [00:01, 7063.27it/s][A[A[A


11361it [00:01, 6403.29it/s][A[A[A


12015it [00:01, 6214.80it/s][A[A[A


12696it [00:01, 6381.80it/s][A[A[A


13343it [00:02, 6407.14it/s][A[A[A


13

In [441]:
# test_dataset = reader.read(cached_path(allennlp_train_output_file))
test_dataset = reader.read(cached_path(allennlp_framework_test_output_file))

0it [00:00, ?it/s]INFO     [mrp_library.dataset_readers.mrp_jsons_actions:113] Reading instances from lines in file at: /data/proj29_ds1/home/slai/mrp2019/allennlp-mrp-json-small-ucca-dm-test.jsonl
5201it [00:00, 7785.62it/s]


In [442]:
tests_fixtures_dataset = reader.read(cached_path(allennlp_tests_fixtures_output_file))


0it [00:00, ?it/s][AINFO     [mrp_library.dataset_readers.mrp_jsons_actions:113] Reading instances from lines in file at: /data/proj29_ds1/home/slai/mrp2019/src/tests/fixtures/ucca-dm-test.jsonl

503it [00:00, 4709.56it/s][A
965it [00:00, 5707.06it/s][A

In [443]:
vocab = Vocabulary.from_instances(train_dataset + test_dataset + tests_fixtures_dataset)

INFO     [allennlp.data.vocabulary:396] Fitting token dictionary from dataset.

  0%|          | 0/85877 [00:00<?, ?it/s][A
  1%|          | 950/85877 [00:00<00:08, 9494.87it/s][A
  2%|▏         | 1864/85877 [00:00<00:08, 9384.38it/s][A
  3%|▎         | 2651/85877 [00:00<00:09, 8870.41it/s][A
  4%|▍         | 3402/85877 [00:00<00:09, 8409.80it/s][A
  5%|▍         | 4238/85877 [00:00<00:09, 8391.55it/s][A
  6%|▌         | 4947/85877 [00:00<00:10, 7951.10it/s][A
  7%|▋         | 5837/85877 [00:00<00:09, 8211.98it/s][A
  8%|▊         | 6594/85877 [00:00<00:09, 8007.20it/s][A
  9%|▉         | 7682/85877 [00:00<00:08, 8695.14it/s][A
 10%|█         | 8659/85877 [00:01<00:08, 8989.98it/s][A
 11%|█▏        | 9667/85877 [00:01<00:08, 9290.37it/s][A
 12%|█▏        | 10652/85877 [00:01<00:07, 9450.12it/s][A
 14%|█▎        | 11596/85877 [00:01<00:08, 8564.53it/s][A
 15%|█▍        | 12468/85877 [00:01<00:08, 8381.92it/s][A
 16%|█▌        | 13397/85877 [00:01<00:08, 8634.62it/s][A
 1

In [444]:
vocab.print_statistics()

INFO     [allennlp.data.vocabulary:664] Printed vocabulary statistics are only for the part of the vocabulary generated from instances. If vocabulary is constructed by extending saved vocabulary with dataset instances, the directly loaded portion won't be considered here.




----Vocabulary Statistics----


Top 10 most frequent tokens in namespace 'word':
	Token: <START-WORD>		Frequency: 858770
	Token: <END-WORD>		Frequency: 858770
	Token: ,		Frequency: 297964
	Token: the		Frequency: 238314
	Token: .		Frequency: 171118
	Token: and		Frequency: 117392
	Token: in		Frequency: 111534
	Token: a		Frequency: 108434
	Token: of		Frequency: 105518
	Token: to		Frequency: 82726

Top 10 longest tokens in namespace 'word':
	Token: Bridgestone/Firestone		length: 21	Frequency: 69
	Token: Bridgestone/fiRestone		length: 21	Frequency: 69
	Token: dollar-denominated		length: 18	Frequency: 70
	Token: Corton-Charlemagne		length: 18	Frequency: 42
	Token: Corton-CHARlemagne		length: 18	Frequency: 42
	Token: substance-abusing		length: 17	Frequency: 152
	Token: extraterrestrial		length: 16	Frequency: 188
	Token: sesquicentennial		length: 16	Frequency: 178
	Token: price-depressing		length: 16	Frequency: 178
	Token: interest-bearing		length: 16	Frequency: 174

Top 10 shortest tokens i

In [445]:
vocab.get_vocab_size('token_node_label')

5563

In [446]:
vocab.get_vocab_size('word')

6601

In [447]:
vocab.get_vocab_size('pos')

64

In [448]:
vocab.get_vocab_size('label')

2

In [449]:
EMBEDDING_DIM = 100
HIDDEN_DIM = 50

### Test model

In [450]:
from mrp_library.models.generalizer import ActionGeneralizer
from mrp_library.iterators.same_representation_iterator import SameRepresentationIterator

from allennlp.nn import InitializerApplicator, RegularizerApplicator, util
from allennlp.nn.activations import Activation
from allennlp.common.params import Params
from allennlp.nn.util import get_text_field_mask, sequence_cross_entropy_with_logits
from allennlp.modules.seq2vec_encoders.pytorch_seq2vec_wrapper import PytorchSeq2VecWrapper

In [451]:
field_types = ['word', 'pos', 'resolved', 'token_node_label', 'token_node_prev_action']
field_type2embedder = {}
field_type2seq2vec_encoder = {}
field_type2seq2seq_encoder = {}

for field_type in field_types:
    embedding = Embedding(num_embeddings=vocab.get_vocab_size(field_type),
                            embedding_dim=EMBEDDING_DIM)
    embedder = BasicTextFieldEmbedder({field_type: embedding})
    field_type2embedder[field_type] = embedder
    
    field_type2seq2vec_encoder[field_type] = PytorchSeq2VecWrapper(
        torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))
    field_type2seq2seq_encoder[field_type] = PytorchSeq2SeqWrapper(
        torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))

In [452]:
# word_embedding = Embedding(num_embeddings=vocab.get_vocab_size('word'),
#                             embedding_dim=EMBEDDING_DIM)
# pos_embedding = Embedding(num_embeddings=vocab.get_vocab_size('pos'),
#                             embedding_dim=EMBEDDING_DIM)

# word_embedder = BasicTextFieldEmbedder({
#     "word": word_embedding,
#     "pos": pos_embedding,
# })
# parse_label = {
#     'word': torch.LongTensor(
#         [
#             [ 1,  0,  3,  7,  2,  9,  4],
#             [ 0,  0,  5,  0,  0,  0,  4]
#         ]
#     ),
#     'pos': torch.LongTensor(
#         [
#             [ 1,  0,  3,  7,  2,  9,  4],
#             [ 0,  0,  5,  0,  0,  0,  4]
#         ]
#     )
# }

In [453]:
# embedded_parse_label = word_embedder(parse_label)

In [454]:
# embedded_parse_label.shape

In [455]:
classifier_params = Params({
  "input_dim": HIDDEN_DIM * 3,
  "num_layers": 2,
  "hidden_dims": [50, 3],
  "activations": ["sigmoid", "linear"],
  "dropout": [0.0, 0.0]
})

In [456]:
classifier_feedforward = FeedForward.from_params(classifier_params)

INFO     [allennlp.common.from_params:340] instantiating class <class 'allennlp.modules.feedforward.FeedForward'> from params {'input_dim': 150, 'num_layers': 2, 'hidden_dims': [50, 3], 'activations': ['sigmoid', 'linear'], 'dropout': [0.0, 0.0]} and extras set()
INFO     [allennlp.common.params:252] input_dim = 150
INFO     [allennlp.common.params:252] num_layers = 2
INFO     [allennlp.common.params:252] hidden_dims = [50, 3]
INFO     [allennlp.common.params:252] hidden_dims = [50, 3]
INFO     [allennlp.common.params:252] activations = ['sigmoid', 'linear']
INFO     [allennlp.common.from_params:340] instantiating class <class 'allennlp.nn.activations.Activation'> from params ['sigmoid', 'linear'] and extras set()
INFO     [allennlp.common.params:252] activations = ['sigmoid', 'linear']
INFO     [allennlp.common.from_params:340] instantiating class <class 'allennlp.nn.activations.Activation'> from params sigmoid and extras set()
INFO     [allennlp.common.params:252] type = sigmoid
INFO

In [457]:
field_type = 'word'

In [458]:
parse_label = {
    field_type: torch.LongTensor(
        [
            [ 1,  0,  3,  7,  2,  9,  4],
            [ 0,  0,  5,  0,  0,  0,  4]
        ]
    )
}
embedded_parse_label = field_type2embedder[field_type](parse_label)

In [459]:
feature_mask = util.get_text_field_mask(parse_label)

In [460]:
seq2vec_encoder = field_type2seq2vec_encoder[field_type]

In [461]:
encoded_feature = seq2vec_encoder(embedded_parse_label, feature_mask)

In [462]:
encoded_features = [encoded_feature] * 3

In [463]:
torch.cat(encoded_features, dim=-1).shape

torch.Size([2, 150])

In [464]:
logits = classifier_feedforward(torch.cat(encoded_features, dim=-1))

In [465]:
logits.shape

torch.Size([2, 3])

In [466]:
label = torch.tensor([1, 0])

In [467]:
# loss_func = torch.nn.CrossEntropyLoss()
# loss = loss_func(logits, label)

In [511]:
ActionGeneralizer = None

In [561]:
from mrp_library.models import ActionGeneralizer
from mrp_library.iterators import SameInstanceTypeFrameworkIterator

ImportError: cannot import name 'ActionGeneralizer' from 'mrp_library.models' (/data/proj29_ds1/home/slai/mrp2019/src/mrp_library/models/__init__.py)

In [527]:
ActionGeneralizer

mrp_library.models.generalizer.ActionGeneralizer

In [528]:
if torch.cuda.is_available():
    cuda_device = args.cuda_device
    model = ActionGeneralizer(
        cuda_device=cuda_device,
        vocab=vocab,
        field_type2embedder=field_type2embedder,
        field_type2seq2vec_encoder=field_type2seq2vec_encoder,
        field_type2seq2seq_encoder=field_type2seq2seq_encoder,
        classifier_feedforward=classifier_feedforward
    )
    model = model.cuda(cuda_device)
else:
    cuda_device = -1
    model = ActionGeneralizer(
        vocab=vocab,
        field_type2embedder=field_type2embedder,
        field_type2seq2vec_encoder=field_type2seq2vec_encoder,
        field_type2seq2seq_encoder=field_type2seq2seq_encoder,
        classifier_feedforward=classifier_feedforward
    )

iterator = SameRepresentationIterator(
    shuffle=True,
    batch_size=100, 
    sorting_keys=[("token_node_resolveds", "num_tokens")],
)
iterator.index_with(vocab)

optimizer = optim.SGD(model.parameters(), lr=0.1)

INFO     [allennlp.nn.initializers:293] Initializing parameters
INFO     [allennlp.nn.initializers:309] Done initializing parameters; the following parameters are using their default initialization from their code
INFO     [allennlp.nn.initializers:314]    classifier_feedforward._linear_layers.0.bias
INFO     [allennlp.nn.initializers:314]    classifier_feedforward._linear_layers.0.weight
INFO     [allennlp.nn.initializers:314]    classifier_feedforward._linear_layers.1.bias
INFO     [allennlp.nn.initializers:314]    classifier_feedforward._linear_layers.1.weight
INFO     [allennlp.nn.initializers:314]    field_type2embedder.pos.token_embedder_pos.weight
INFO     [allennlp.nn.initializers:314]    field_type2embedder.resolved.token_embedder_resolved.weight
INFO     [allennlp.nn.initializers:314]    field_type2embedder.token_node_label.token_embedder_token_node_label.weight
INFO     [allennlp.nn.initializers:314]    field_type2embedder.token_node_prev_action.token_embedder_token_node_pre

In [529]:
cuda_device

0

In [530]:
model.resolve_tensor

tensor(1, device='cuda:0')

In [531]:
# list(model.named_parameters())

In [552]:
trainer = Trainer(
    model=model,
    optimizer=optimizer,
    iterator=iterator,
    train_dataset=train_dataset,
    validation_dataset=test_dataset,
#     train_dataset=train_dataset,
#     validation_dataset=train_dataset,
    patience=10,
    num_epochs=20,
    cuda_device=cuda_device
)

In [553]:
action_logits = torch.tensor([[-2.2126,  2.6022, -1.1655],
        [ 4.7340, -1.9992, -3.4521],
        [-1.9665,  2.4100, -1.2047],
        [-2.1353,  2.4847, -1.1260],
        [ 4.7492, -2.0234, -3.4460],
        [-1.4369,  1.9822, -1.2885],
        [ 1.0337,  0.3599, -2.0420],
        [ 5.0974, -2.3380, -3.4647],
        [ 5.4187, -2.4720, -3.6469],
        [-3.4045,  2.4903,  0.0773],
        [ 0.6384,  0.6764, -1.9942],
        [-2.2904,  2.7170, -1.2016],
        [ 4.6333, -1.9474, -3.4113],
        [-2.0811,  2.5367, -1.2174],
        [ 5.1840, -2.7536, -3.1499],
        [ 4.7421, -2.0138, -3.4485],
        [ 5.1121, -2.2290, -3.5999],
        [ 0.1843,  0.8324, -1.6990],
        [ 5.3854, -2.4593, -3.6309],
        [ 0.0324,  0.6715, -1.4173]])

action_type = torch.tensor([0, 2, 2, 1, 1, 0, 1, 2, 1, 2, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0])

In [554]:
# action_probs, action_preds = action_logits.max(1)
# action_resolve_preds = action_preds.eq_(model.resolve_tensor)

In [555]:
iter([1, 2, 3, 4])

<list_iterator at 0x7fef6c93a748>

In [556]:
defaultdict(lambda: defaultdict(dict))

defaultdict(<function __main__.<lambda>()>, {})

In [557]:
(action_resolve_preds, action_type, action_resolve_preds.eq(action_type))

(tensor([1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1]),
 tensor([0, 2, 2, 1, 1, 0, 1, 2, 1, 2, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0]),
 tensor([0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0],
        dtype=torch.uint8))

In [558]:
action_resolve_preds.unsqueeze(-1).float() * action_logits

tensor([[-2.2126,  2.6022, -1.1655],
        [ 0.0000, -0.0000, -0.0000],
        [-1.9665,  2.4100, -1.2047],
        [-2.1353,  2.4847, -1.1260],
        [ 0.0000, -0.0000, -0.0000],
        [-1.4369,  1.9822, -1.2885],
        [ 0.0000,  0.0000, -0.0000],
        [ 0.0000, -0.0000, -0.0000],
        [ 0.0000, -0.0000, -0.0000],
        [-3.4045,  2.4903,  0.0773],
        [ 0.6384,  0.6764, -1.9942],
        [-2.2904,  2.7170, -1.2016],
        [ 0.0000, -0.0000, -0.0000],
        [-2.0811,  2.5367, -1.2174],
        [ 0.0000, -0.0000, -0.0000],
        [ 0.0000, -0.0000, -0.0000],
        [ 0.0000, -0.0000, -0.0000],
        [ 0.1843,  0.8324, -1.6990],
        [ 0.0000, -0.0000, -0.0000],
        [ 0.0324,  0.6715, -1.4173]])

In [None]:
trainer.train()

In [None]:
token_state = [[26, True, 'H', [[23, True, 'A', [[0, True, 'E', [[0, False, 'The', []]]], [1, True, 'C', [[1, False, 'Lakers', []]]]]], [2, True, 'P', [[2, False, 'advanced', []]]], [25, True, 'A', [[3, True, 'R', [[3, False, 'through', []]]], [4, True, 'E', [[4, False, 'the', []]]], [24, True, 'P', [[5, True, 'T', [[5, False, '1982', []]]], [6, True, 'C', [[6, False, 'playoffs', []]]]]]]]]], [7, True, 'L', [[7, False, 'and', []]]], [8, True, 'P', [[8, False, 'faced', []]]], [9, True, 'A', [[9, False, 'Philadelphia', []]]], [27, True, 'D', [[10, True, 'R', [[10, False, 'for', []]]], [11, True, 'E', [[11, False, 'the', []]]], [12, True, 'Q', [[12, False, 'second', []]]], [13, True, 'C', [[13, False, 'time', []]]]]], [14, True, 'R', [[14, False, 'in', []]]], [15, True, 'Q', [[15, False, 'three', []]]], [16, False, 'years', []]]

In [None]:
pprint.pprint(token_state)

In [None]:
vocab.get_token_from_index(0, namespace='labels')

In [None]:
vocab.get_token_index('RESOLVE')

In [None]:
vocab.get_token_from_index(3, namespace='resolved')

In [550]:
vocab.get_token_from_index(5, namespace='token_node_prev_action')

'<INITIAL-ACTION>'