In [1]:
try:
    __IPYTHON__
    USING_IPYTHON = True
    %load_ext autoreload
    %autoreload 2
except NameError:
    USING_IPYTHON = False

#### Argparse

In [2]:
import argparse
ap = argparse.ArgumentParser()
ap.add_argument('project_root', help='')
ap.add_argument('--mrp-data-dir', default='data', help='')
ap.add_argument('--graphviz-sub-dir', default='visualization/graphviz', help='')
ap.add_argument('--train-sub-dir', default='training', help='')
ap.add_argument('--companion-sub-dir', default='companion')
ap.add_argument('--jamr-alignment-file', default='jamr.mrp')


ap.add_argument('--mrp-file-extension', default='.mrp')
ap.add_argument('--companion-file-extension', default='.conllu')
ap.add_argument('--graphviz-file-template', default='http://localhost:8000/files/proj29_ds1/home/slai/mrp2019/visualization/graphviz/{}/{}.mrp/{}.png')
arg_string = """
    /data/proj29_ds1/home/slai/mrp2019
"""
arguments = [arg for arg_line in arg_string.split(r'\\n') for arg in arg_line.split()]

In [3]:
if USING_IPYTHON:
    args = ap.parse_args(arguments)
else:
    args = ap.parse_args()

In [4]:
args

Namespace(companion_file_extension='.conllu', companion_sub_dir='companion', graphviz_file_template='http://localhost:8000/files/proj29_ds1/home/slai/mrp2019/visualization/graphviz/{}/{}.mrp/{}.png', graphviz_sub_dir='visualization/graphviz', jamr_alignment_file='jamr.mrp', mrp_data_dir='data', mrp_file_extension='.mrp', project_root='/data/proj29_ds1/home/slai/mrp2019', train_sub_dir='training')

#### Library imports

In [5]:
import json
import logging
import os
import pprint
import re
import string
from collections import Counter, defaultdict, deque

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import plot_util
from preprocessing import CompanionParseDataset, MrpDataset, JamrAlignmentDataset
from action_state import mrp_json2parser_states, _generate_parser_action_states
                           
from tqdm import tqdm

#### ipython notebook specific imports

In [6]:
if USING_IPYTHON:
    # matplotlib config
    %matplotlib inline

In [7]:
sh = logging.StreamHandler()
formatter = logging.Formatter('%(name)s - %(levelname)s - %(message)s')
sh.setFormatter(formatter)
logging.basicConfig(level=logging.DEBUG, handlers=[sh])
logger = logging.getLogger(__name__)  # pylint: disable=invalid-name
logger.setLevel(logging.INFO)

### Constants

In [8]:
UNKWOWN = 'UNKWOWN'

### Load data

In [9]:
train_dir = os.path.join(args.project_root, args.mrp_data_dir, args.train_sub_dir)

In [10]:
mrp_dataset = MrpDataset()

In [11]:
frameworks, framework2dataset2mrp_jsons = mrp_dataset.load_mrp_json_dir(
    train_dir, args.mrp_file_extension)

frameworks:   0%|          | 0/5 [00:00<?, ?it/s]
dataset_name:   0%|          | 0/2 [00:00<?, ?it/s][A
dataset_name:  50%|█████     | 1/2 [00:00<00:00,  2.77it/s][A
frameworks:  20%|██        | 1/5 [00:00<00:02,  1.40it/s]s][A
dataset_name:   0%|          | 0/1 [00:00<?, ?it/s][A
frameworks:  40%|████      | 2/5 [00:04<00:04,  1.62s/it]t][A
dataset_name:   0%|          | 0/1 [00:00<?, ?it/s][A
frameworks:  60%|██████    | 3/5 [00:08<00:04,  2.50s/it]t][A
dataset_name:   0%|          | 0/1 [00:00<?, ?it/s][A
frameworks:  80%|████████  | 4/5 [00:15<00:03,  3.60s/it]t][A
dataset_name:   0%|          | 0/14 [00:00<?, ?it/s][A
dataset_name:  29%|██▊       | 4/14 [00:00<00:00, 39.39it/s][A
dataset_name:  43%|████▎     | 6/14 [00:00<00:00, 17.50it/s][A
dataset_name:  57%|█████▋    | 8/14 [00:00<00:00, 15.74it/s][A
dataset_name:  71%|███████▏  | 10/14 [00:01<00:00,  6.21it/s][A
dataset_name:  79%|███████▊  | 11/14 [00:01<00:00,  5.78it/s][A
frameworks: 100%|██████████| 5/5 [00:

### Data Preprocessing companion

In [12]:
companion_dir = os.path.join(args.project_root, args.mrp_data_dir, args.companion_sub_dir)

In [13]:
cparse_dataset = CompanionParseDataset()

In [14]:
dataset2cid2parse = cparse_dataset.load_companion_parse_dir(companion_dir, args.companion_file_extension)

preprocessing - INFO - framework amr found
dataset: 100%|██████████| 13/13 [00:03<00:00,  3.83it/s]
preprocessing - INFO - framework dm found
dataset: 100%|██████████| 5/5 [00:01<00:00,  4.22it/s]
preprocessing - INFO - framework ucca found
dataset: 100%|██████████| 6/6 [00:00<00:00, 24.79it/s]


In [15]:
dataset2cid2parse.keys()

dict_keys(['amr-guidelines', 'bolt', 'cctv', 'dfa', 'dfb', 'fables', 'lorelei', 'mt09sdl', 'proxy', 'rte', 'wb', 'wiki', 'xinhua', 'wsj', 'ewt'])

In [16]:
# Some data is missing
'20003001' in dataset2cid2parse['wsj']

False

### Load JAMR alignment data

In [17]:
jalignment_dataset = JamrAlignmentDataset()

In [18]:
cid2alignment = jalignment_dataset.load_jamr_alignment_file(os.path.join(
    args.project_root,
    args.mrp_data_dir,
    args.companion_sub_dir,
    args.jamr_alignment_file
))

### Define the state at each step

In [19]:
for framework in framework2dataset2mrp_jsons:
    logger.info(framework)
    logger.info(list(framework2dataset2mrp_jsons[framework].keys()))

__main__ - INFO - ucca
__main__ - INFO - ['wiki', 'ewt']
__main__ - INFO - psd
__main__ - INFO - ['wsj']
__main__ - INFO - eds
__main__ - INFO - ['wsj']
__main__ - INFO - dm
__main__ - INFO - ['wsj']
__main__ - INFO - amr
__main__ - INFO - ['xinhua', 'wsj', 'wiki', 'wb', 'rte', 'proxy', 'mt09sdl', 'lorelei', 'fables', 'dfb', 'dfa', 'cctv', 'bolt', 'amr-guidelines']


### Test module

In [135]:
from action_state import mrp_json2parser_states, _generate_parser_action_states

In [136]:
from action_state import sentence_spliter

In [190]:
framework, dataset = [
    ('dm', 'wsj'),
    ('psd', 'wsj'),
    ('eds', 'wsj'),
    ('ucca', 'wiki'),
    ('amr', 'wsj'),
    ('amr', 'wiki'),
][0]

mrp_jsons = framework2dataset2mrp_jsons[framework][dataset]
framework, dataset

('dm', 'wsj')

In [191]:
mrp_json = mrp_jsons[0]

In [192]:
# mrp_json = [mrp_json for mrp_json in mrp_jsons if mrp_json.get('id') == '20209013'][0]

In [193]:
mrp_json['input']

'Pierre Vinken, 61 years old, will join the board as a nonexecutive director Nov. 29.'

In [172]:
alignment = {}
if framework == 'amr':
    cid = mrp_json.get('id', '')
    alignment = cid2alignment[cid]

In [184]:
logger.info(args.graphviz_file_template.format(
    framework, dataset, mrp_json.get('id')))

__main__ - INFO - http://localhost:8000/files/proj29_ds1/home/slai/mrp2019/visualization/graphviz/ucca/wiki.mrp/502000.png


In [187]:
parser_states, meta_data = mrp_json2parser_states(mrp_json, alignment)

In [186]:
from allennlp.data.tokenizers import Tokenizer, WordTokenizer

In [177]:
wt = WordTokenizer()

In [180]:
tokenized_doc = wt.tokenize(mrp_json['input'])

In [183]:
type(tokenized_doc[0])

allennlp.data.tokenizers.token.Token

In [178]:
mrp_json['input']

'Bowie moved to Switzerland in 1976, purchasing a chalet in the hills to the north of Lake Geneva.'

In [194]:
(
    doc,
    nodes,
    node_id2node,
    edge_id2edge,
    top_oriented_edges,
    token_nodes,
    abstract_node_id_set,
    parent_id2indegree,
    parent_id2child_id_set,
    child_id2parent_id_set,
    child_id2edge_id_set,
    parent_id2edge_id_set,
    token_node_id_set,
    actions,
    anchor2token_id,
    parent_child_id2edge_id_set,
) = meta_data

In [195]:
abstract_node_id_set

{20, 21, 22, 23, 24, 25, 26, 27, 28}

In [165]:
parent_id2edge_id_set

defaultdict(set,
            {20: {0, 14, 24, 25},
             25: {1, 6, 11},
             21: {2, 8, 17},
             24: {3, 28},
             28: {4, 9, 12, 21, 23},
             22: {5, 27},
             23: {10, 20},
             27: {13, 19, 26},
             26: {15, 16, 18, 22}})

In [152]:
parent_id2child_id_set

defaultdict(set,
            {20: {0, 1, 22, 23},
             25: {8, 9, 26},
             21: {6, 20, 24},
             24: {7, 25},
             28: {16, 17, 18, 19, 27},
             22: {2, 3},
             23: {4, 5},
             27: {13, 14, 15},
             26: {10, 11, 12, 28}})

In [153]:
# node_id2node

In [160]:
top_oriented_edges[27]

{'source': 22, 'target': 2, 'label': 'R', 'id': 27, 'parent': 22, 'child': 2}

In [161]:
top_oriented_edges[5]

{'source': 22, 'target': 3, 'label': 'C', 'id': 5, 'parent': 22, 'child': 3}

In [189]:
tokenized_doc

[Bowie,
 moved,
 to,
 Switzerland,
 in,
 1976,
 ,,
 purchasing,
 a,
 chalet,
 in,
 the,
 hills,
 to,
 the,
 north,
 of,
 Lake,
 Geneva,
 .]

In [188]:
nodes

[{'id': 0, 'anchors': [{'from': 0, 'to': 5}], 'label': 'Bowie'},
 {'id': 1, 'anchors': [{'from': 6, 'to': 11}], 'label': 'moved'},
 {'id': 2, 'anchors': [{'from': 12, 'to': 14}], 'label': 'to'},
 {'id': 3, 'anchors': [{'from': 15, 'to': 26}], 'label': 'Switzerland'},
 {'id': 4, 'anchors': [{'from': 27, 'to': 29}], 'label': 'in'},
 {'id': 5, 'anchors': [{'from': 30, 'to': 34}], 'label': '1976'},
 {'id': 6, 'anchors': [{'from': 34, 'to': 35}], 'label': ','},
 {'id': 7, 'anchors': [{'from': 36, 'to': 46}], 'label': 'purchasing'},
 {'id': 8, 'anchors': [{'from': 47, 'to': 48}], 'label': 'a'},
 {'id': 9, 'anchors': [{'from': 49, 'to': 55}], 'label': 'chalet'},
 {'id': 10, 'anchors': [{'from': 56, 'to': 58}], 'label': 'in'},
 {'id': 11, 'anchors': [{'from': 59, 'to': 62}], 'label': 'the'},
 {'id': 12, 'anchors': [{'from': 63, 'to': 68}], 'label': 'hills'},
 {'id': 13, 'anchors': [{'from': 69, 'to': 71}], 'label': 'to'},
 {'id': 14, 'anchors': [{'from': 72, 'to': 75}], 'label': 'the'},
 {'id'

In [155]:
parser_node_id_set = set()
parser_edge_id_set = set()
for (node_id, actions, edge_state, abstract_node_state, 
     complete_node_state, node_state, token_stack, pending_token_stack) in parser_states:
    parser_node_id_set.add(node_id)
    for edge_id in edge_state:
        parser_edge_id_set.add(edge_id)
    
    node = node_id2node[node_id]
    node_edges = [edge_id2edge[edge_id] for edge_id in edge_state]
    pprint.pprint((
        node.get('id'),
        actions, 
        node.get('label'), 
        [edge.get('label') for edge in node_edges], 
#         abstract_node_state,
        complete_node_state,
        node_state,
        token_stack,
        pending_token_stack,
    ))
    
print({node.get('id', -1) for node in mrp_json.get('nodes')} - parser_node_id_set)
assert len(parser_node_id_set) == len(mrp_json.get('nodes'))
print({edge_id for edge_id, edge in enumerate(mrp_json.get('edges'))} - parser_edge_id_set)
assert len(parser_edge_id_set) == len(mrp_json.get('edges'))

(0,
 [(1, None),
  (2,
   (1,
    {'anchors': [{'from': 0, 'to': 5}], 'id': 0, 'label': 'Bowie'},
    [set()]))],
 'Bowie',
 [],
 [],
 [(0, 0, [(0, 0, None)])],
 [0],
 [])
(1,
 [(1, None),
  (2,
   (1,
    {'anchors': [{'from': 6, 'to': 11}], 'id': 1, 'label': 'moved'},
    [set()]))],
 'moved',
 [],
 [],
 [(0, 0, [(0, 0, None)]), (1, 1, [(1, 1, None)])],
 [0, 1],
 [])
(2,
 [(1, None),
  (2,
   (1,
    {'anchors': [{'from': 12, 'to': 14}], 'id': 2, 'label': 'to'},
    [set()]))],
 'to',
 [],
 [],
 [(0, 0, [(0, 0, None)]), (1, 1, [(1, 1, None)]), (2, 2, [(2, 2, None)])],
 [0, 1, 2],
 [])
(3,
 [(1, None),
  (2,
   (1,
    {'anchors': [{'from': 15, 'to': 26}], 'id': 3, 'label': 'Switzerland'},
    [set()]))],
 'Switzerland',
 ['C'],
 [22],
 [(0, 0, [(0, 0, None)]),
  (1, 1, [(1, 1, None)]),
  (2, 2, [(2, 2, None)]),
  (3, 3, [(3, 3, None)])],
 [0, 1, 2, 3],
 [])
(22,
 [(2, (2, {'id': 22}, [{27}, {5}]))],
 None,
 [],
 [],
 [(0, 0, [(0, 0, None)]),
  (1, 1, [(1, 1, None)]),
  (22, 22, [(2, 

AssertionError: 

In [709]:
action_states = [s[1] for s in parser_states]

In [696]:
token_poss = []
prev_token_pos = 0
for token in sentence_spliter(doc):
    token_poss.append((prev_token_pos, prev_token_pos + len(token)))
    prev_token_pos += len(token) + 1

In [697]:
list(sentence_spliter(doc))

['Pierre',
 'Vinken',
 ',',
 '61',
 'years',
 'old',
 ',',
 'will',
 'join',
 'the',
 'board',
 'as',
 'a',
 'nonexecutive',
 'director',
 'Nov.',
 '29.']

In [684]:
token_poss

[(0, 6),
 (7, 13),
 (14, 15),
 (16, 18),
 (19, 24),
 (25, 28),
 (29, 30),
 (31, 35),
 (36, 40),
 (41, 44),
 (45, 50),
 (51, 53),
 (54, 55),
 (56, 68),
 (69, 77),
 (78, 82),
 (83, 86)]

In [687]:
[n['anchors'] for n in nodes]

[[{'from': 0, 'to': 6}],
 [{'from': 7, 'to': 13}],
 [{'from': 15, 'to': 17}],
 [{'from': 18, 'to': 23}],
 [{'from': 24, 'to': 27}],
 [{'from': 34, 'to': 38}],
 [{'from': 39, 'to': 42}],
 [{'from': 43, 'to': 48}],
 [{'from': 49, 'to': 51}],
 [{'from': 52, 'to': 53}],
 [{'from': 54, 'to': 66}],
 [{'from': 67, 'to': 75}],
 [{'from': 76, 'to': 80}],
 [{'from': 81, 'to': 83}]]

In [683]:
nodes

[{'id': 0,
  'label': 'Pierre',
  'properties': ['pos', 'frame'],
  'values': ['NNP', 'named:x-c'],
  'anchors': [{'from': 0, 'to': 6}]},
 {'id': 1,
  'label': 'Vinken',
  'properties': ['pos', 'frame'],
  'values': ['NNP', 'named:x-c'],
  'anchors': [{'from': 7, 'to': 13}]},
 {'id': 3,
  'label': '61',
  'properties': ['pos', 'frame'],
  'values': ['CD', 'card:i-i-c'],
  'anchors': [{'from': 15, 'to': 17}]},
 {'id': 4,
  'label': 'year',
  'properties': ['pos', 'frame'],
  'values': ['NNS', 'n:x'],
  'anchors': [{'from': 18, 'to': 23}]},
 {'id': 5,
  'label': 'old',
  'properties': ['pos', 'frame'],
  'values': ['JJ', 'a:e-p'],
  'anchors': [{'from': 24, 'to': 27}]},
 {'id': 8,
  'label': 'join',
  'properties': ['pos', 'frame'],
  'values': ['VB', 'v:e-i-p'],
  'anchors': [{'from': 34, 'to': 38}]},
 {'id': 9,
  'label': 'the',
  'properties': ['pos', 'frame'],
  'values': ['DT', 'q:i-h-h'],
  'anchors': [{'from': 39, 'to': 42}]},
 {'id': 10,
  'label': 'board',
  'properties': ['pos'

In [669]:
nodes = mrp_json['nodes']

In [494]:
logger.setLevel(logging.INFO)

In [525]:
num_pops = []
error_num = 0
for i, mrp_json in tqdm(enumerate(mrp_jsons)):
#     print(i)
    parser_states, meta_data = mrp_json2parser_states(mrp_json, framework, alignment)
    if not parser_states:
        logger.info(i)
        error_num += 1
        continue
    action_states = [s[1] for s in parser_states]
    for action_state in action_states:
        for action in action_state:
            action_type, arg = action
            if action_type == RESOLVE:
                num_pop = arg
                num_pops.append(num_pop)
















0it [00:00, ?it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A














__main__ - INFO - 4
__main__ - INFO - 7
__main__ - INFO - 9















__main__ - INFO - 10
__main__ - INFO - 14















__main__ - INFO - 17
__main__ - INFO - 18
__main__ - INFO - 19















__main__ - INFO - 20
__main__ - INFO - 25















__main__ - INFO - 27
__main__ - INFO - 30















__main__ - INFO - 33















__main__ - INFO - 38
__main__ - INFO - 40
__main__ - INFO - 42















__main__ - INFO - 45
__main__ - INFO - 46
__main__ - INFO - 49















__main__ - INFO - 52
__main__ - INFO - 54
__main__ - INFO - 57
__main__ - INFO - 59















__main__ - INFO - 60
__main__ - INFO - 62
__main__ - INFO - 63
__main__ - INFO - 66















__main__ - INFO - 67
__main__ - INFO - 68
__main__ - INFO - 70
__main__ - INFO - 71
__main__ - INFO - 72
__main__ - INFO - 73
__main__ - INFO - 74
__main__ - INFO - 75
__main__ - INFO - 76






KeyboardInterrupt: 