In [1]:
# Detect whether the code is executing under IPython/Jupyter: evaluating the
# name `__IPYTHON__` raises NameError when it is not defined.
try:
    __IPYTHON__
    USING_IPYTHON = True
    # Load the autoreload extension and enable mode 2 so edits to imported
    # project modules are picked up without restarting the kernel.
    %load_ext autoreload
    %autoreload 2
except NameError:
    # `__IPYTHON__` is not defined: treat this as a non-IPython run.
    USING_IPYTHON = False

#### Argparse

In [2]:
import argparse

# Argument parser shared by the notebook and script modes. Under IPython the
# values come from `arg_string` below; otherwise they come from sys.argv.
ap = argparse.ArgumentParser()

# Directory layout: the notebook joins project_root / mrp-data-dir / <sub-dir>.
ap.add_argument('project_root', help='')
ap.add_argument('--mrp-data-dir', default='data', help='')
ap.add_argument('--graphviz-sub-dir', default='visualization/graphviz', help='')
ap.add_argument('--train-sub-dir', default='training', help='')
ap.add_argument('--companion-sub-dir', default='companion')
ap.add_argument('--jamr-alignment-file', default='jamr.mrp')

# File-name conventions and the URL template for rendered graph images
# (formatted with framework, dataset and graph id).
ap.add_argument('--mrp-file-extension', default='.mrp')
ap.add_argument('--companion-file-extension', default='.conllu')
ap.add_argument('--graphviz-file-template', default='http://localhost:8000/files/proj29_ds1/home/slai/mrp2019/visualization/graphviz/{}/{}.mrp/{}.png')

# Arguments used when running interactively; one or more whitespace-separated
# tokens, optionally spread over several lines.
arg_string = """
    /data/proj29_ds1/home/slai/mrp2019
"""
# str.split() with no separator splits on any whitespace run (newlines
# included) and drops empty tokens. The earlier split(r'\\n') searched for a
# literal backslash-backslash-n, which never occurs, so it was a no-op.
arguments = arg_string.split()

In [3]:
# Under IPython, parse the notebook-defined `arguments`; otherwise pass None,
# which makes argparse read sys.argv[1:] exactly like a bare parse_args().
args = ap.parse_args(arguments if USING_IPYTHON else None)

In [4]:
# Display the parsed namespace to confirm the effective configuration.
args

Namespace(companion_file_extension='.conllu', companion_sub_dir='companion', graphviz_file_template='http://localhost:8000/files/proj29_ds1/home/slai/mrp2019/visualization/graphviz/{}/{}.mrp/{}.png', graphviz_sub_dir='visualization/graphviz', jamr_alignment_file='jamr.mrp', mrp_data_dir='data', mrp_file_extension='.mrp', project_root='/data/proj29_ds1/home/slai/mrp2019', train_sub_dir='training')

#### Library imports

In [6]:
import json
import logging
import os
import pprint
import string
from collections import Counter, defaultdict, deque

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import plot_util
from preprocessing import CompanionParseDataset, MrpDataset, JamrAlignmentDataset
from action_state import mrp_json2parser_states
                           
from tqdm import tqdm

#### ipython notebook specific imports

In [7]:
if USING_IPYTHON:
    # matplotlib config: the magic only exists under IPython, hence the guard.
    %matplotlib inline

In [8]:
# Root logging: one stream handler emitting "<logger name> - <LEVEL> - <message>".
formatter = logging.Formatter(fmt='%(name)s - %(levelname)s - %(message)s')
sh = logging.StreamHandler()
sh.setFormatter(formatter)
logging.basicConfig(handlers=[sh], level=logging.INFO)

# Logger for this notebook/module, explicitly held at INFO.
logger = logging.getLogger(__name__)  # pylint: disable=invalid-name
logger.setLevel(logging.INFO)

### Constants

In [9]:
# Placeholder string constant.
# NOTE(review): both the name and the value look like a misspelling of
# "UNKNOWN". They are left unchanged because other code may reference this
# name or compare against this exact string -- confirm before renaming.
UNKWOWN = 'UNKWOWN'

### Load data

In [10]:
# Training data lives at <project_root>/<mrp_data_dir>/<train_sub_dir>.
train_dir = os.path.join(
    args.project_root,
    args.mrp_data_dir,
    args.train_sub_dir,
)

In [11]:
# Loader object from the project-local `preprocessing` module; used below to
# read the MRP JSON files.
mrp_dataset = MrpDataset()

In [12]:
# Load every file under `train_dir` with the configured MRP extension.
# The second return value is indexed as [framework][dataset] later in the
# notebook, and each entry is then indexed by position to get one graph.
frameworks, framework2dataset2mrp_jsons = mrp_dataset.load_mrp_json_dir(
    train_dir, args.mrp_file_extension)

frameworks:   0%|          | 0/5 [00:00<?, ?it/s]
dataset_name:   0%|          | 0/2 [00:00<?, ?it/s][A
dataset_name:  50%|█████     | 1/2 [00:00<00:00,  2.30it/s][A
frameworks:  20%|██        | 1/5 [00:00<00:03,  1.28it/s]s][A
dataset_name:   0%|          | 0/1 [00:00<?, ?it/s][A
frameworks:  40%|████      | 2/5 [00:04<00:05,  1.75s/it]t][A
dataset_name:   0%|          | 0/1 [00:00<?, ?it/s][A
frameworks:  60%|██████    | 3/5 [00:09<00:05,  2.75s/it]t][A
dataset_name:   0%|          | 0/1 [00:00<?, ?it/s][A
frameworks:  80%|████████  | 4/5 [00:14<00:03,  3.39s/it]t][A
dataset_name:   0%|          | 0/14 [00:00<?, ?it/s][A
dataset_name:  43%|████▎     | 6/14 [00:00<00:00, 19.38it/s][A
dataset_name:  57%|█████▋    | 8/14 [00:00<00:00, 15.18it/s][A
dataset_name:  71%|███████▏  | 10/14 [00:01<00:00,  5.80it/s][A
dataset_name:  79%|███████▊  | 11/14 [00:01<00:00,  5.17it/s][A
frameworks: 100%|██████████| 5/5 [00:16<00:00,  2.87s/it]t/s][A


### Data Preprocessing companion

In [13]:
# Companion parses live at <project_root>/<mrp_data_dir>/<companion_sub_dir>.
companion_dir = os.path.join(
    args.project_root,
    args.mrp_data_dir,
    args.companion_sub_dir,
)

In [14]:
# Loader object from the project-local `preprocessing` module; used below to
# read the companion parse files.
cparse_dataset = CompanionParseDataset()

In [15]:
# Load the companion parses with the configured file extension. The result is
# keyed by dataset name; each value supports `in` tests on graph ids.
dataset2cid2parse = cparse_dataset.load_companion_parse_dir(companion_dir, args.companion_file_extension)

preprocessing - INFO - framework amr found
dataset: 100%|██████████| 13/13 [00:03<00:00,  3.47it/s]
preprocessing - INFO - framework dm found
dataset: 100%|██████████| 5/5 [00:01<00:00,  4.70it/s]
preprocessing - INFO - framework ucca found
dataset: 100%|██████████| 6/6 [00:00<00:00, 35.33it/s]


In [16]:
# Dataset names for which companion parses were loaded.
dataset2cid2parse.keys()

dict_keys(['amr-guidelines', 'bolt', 'cctv', 'dfa', 'dfb', 'fables', 'lorelei', 'mt09sdl', 'proxy', 'rte', 'wb', 'wiki', 'xinhua', 'wsj', 'ewt'])

In [17]:
# Spot check: not every graph id has a companion parse. For example, id
# '20003001' is absent from the 'wsj' companion data, so this shows False.
'20003001' in dataset2cid2parse['wsj']

False

### Load JAMR alignment data

In [18]:
# Loader object from the project-local `preprocessing` module; used below to
# read the JAMR alignment file.
jalignment_dataset = JamrAlignmentDataset()

In [19]:
# The alignment file sits inside the companion directory:
# <project_root>/<mrp_data_dir>/<companion_sub_dir>/<jamr_alignment_file>.
# The result is later indexed by graph id.
cid2alignment = jalignment_dataset.load_jamr_alignment_file(
    os.path.join(args.project_root, args.mrp_data_dir,
                 args.companion_sub_dir, args.jamr_alignment_file)
)

### Define the state at each step

In [20]:
# Log each loaded framework followed by the names of its datasets.
for framework, dataset2mrp_jsons in framework2dataset2mrp_jsons.items():
    logger.info(framework)
    logger.info(list(dataset2mrp_jsons.keys()))

__main__ - INFO - ucca
__main__ - INFO - ['wiki', 'ewt']
__main__ - INFO - psd
__main__ - INFO - ['wsj']
__main__ - INFO - eds
__main__ - INFO - ['wsj']
__main__ - INFO - dm
__main__ - INFO - ['wsj']
__main__ - INFO - amr
__main__ - INFO - ['xinhua', 'wsj', 'wiki', 'wb', 'rte', 'proxy', 'mt09sdl', 'lorelei', 'fables', 'dfb', 'dfa', 'cctv', 'bolt', 'amr-guidelines']


### Test module

In [169]:
from action_state import mrp_json2parser_states

In [289]:
# Candidate (framework, dataset) pairs to test; change the trailing index to
# switch which pair the cells below operate on.
framework, dataset = (
    ('dm', 'wsj'),
    ('psd', 'wsj'),
    ('eds', 'wsj'),
    ('ucca', 'wiki'),
    ('amr', 'wsj'),
    ('amr', 'wiki'),
)[0]

# Graphs of the selected pair.
mrp_jsons = framework2dataset2mrp_jsons[framework][dataset]

# Echo the selection as the cell output.
(framework, dataset)

('dm', 'wsj')

In [290]:
# Pick a single graph to inspect. Index 9 is just the example used in this
# run (graph id '20003010' for dm/wsj, per the outputs below).
mrp_json = mrp_jsons[9]

In [291]:
# Alignments are looked up for AMR graphs only; every other framework gets an
# empty mapping.
if framework == 'amr':
    cid = mrp_json.get('id', '')
    alignment = cid2alignment[cid]
else:
    alignment = {}

In [292]:
# mrp_json

In [293]:
# Log the URL of the rendered graph image for the selected graph so it can be
# opened next to the parser output.
logger.info(
    args.graphviz_file_template.format(framework, dataset, mrp_json.get('id'))
)

__main__ - INFO - http://localhost:8000/files/proj29_ds1/home/slai/mrp2019/visualization/graphviz/dm/wsj.mrp/20003010.png


In [294]:
# Convert the selected graph into parser states (project-local `action_state`
# module). `parser_states` is iterated as 4-tuples further down; `meta_data`
# is unpacked into two lookup tables in the next cell.
parser_states, meta_data = mrp_json2parser_states(mrp_json, framework, alignment)

In [295]:
# Split the metadata into its two lookup tables: node id -> node, edge id -> edge.
node_id2node, edge_id2edge = meta_data

In [296]:
# Peek at characters 15..22 of the raw input sentence (slice end is exclusive).
mrp_json['input'][15:23]

' a team '

In [297]:
# Display the full MRP JSON record of the selected graph.
mrp_json

{'id': '20003010',
 'flavor': 0,
 'framework': 'dm',
 'version': 0.9,
 'time': '2019-04-10 (20:16)',
 'input': 'Dr. Talcott led a team of researchers from the National Cancer Institute and the medical schools of Harvard University and Boston University.',
 'tops': [2],
 'nodes': [{'id': 0,
   'label': 'Dr.',
   'properties': ['pos', 'frame'],
   'values': ['NNP', 'n:x'],
   'anchors': [{'from': 0, 'to': 3}]},
  {'id': 1,
   'label': 'Talcott',
   'properties': ['pos', 'frame'],
   'values': ['NNP', 'named:x-c'],
   'anchors': [{'from': 4, 'to': 11}]},
  {'id': 2,
   'label': 'lead',
   'properties': ['pos', 'frame'],
   'values': ['VBD', 'v:e-i-p'],
   'anchors': [{'from': 12, 'to': 15}]},
  {'id': 3,
   'label': 'a',
   'properties': ['pos', 'frame'],
   'values': ['DT', 'q:i-h-h'],
   'anchors': [{'from': 16, 'to': 17}]},
  {'id': 4,
   'label': 'team',
   'properties': ['pos', 'frame'],
   'values': ['NN', 'n_of:x-i'],
   'anchors': [{'from': 18, 'to': 22}]},
  {'id': 6,
   'label':

In [298]:
# Walk the parser states, printing one line per state, and collect every node
# id and edge id they mention so coverage of the graph can be checked.
parser_node_id_set = set()
parser_edge_id_set = set()
for node_id, actions, edge_state, abstract_node_state in parser_states:
    parser_node_id_set.add(node_id)
    parser_edge_id_set.update(edge_state)

    node = node_id2node[node_id]
    node_edges = [edge_id2edge[edge_id] for edge_id in edge_state]
    print(
        node.get('id'),
        actions,
        node.get('label'),
        [edge.get('label') for edge in node_edges],
        abstract_node_state,
    )

# dict.get returns None when the key is absent; `or []` keeps len() from
# raising TypeError on a graph that has no 'nodes' or no 'edges' entry.
expected_node_count = len(mrp_json.get('nodes') or [])
expected_edge_count = len(mrp_json.get('edges') or [])

# Every node and every edge of the graph must appear in the parser states.
assert len(parser_node_id_set) == expected_node_count, (
    'parser states cover {} nodes, graph has {}'.format(
        len(parser_node_id_set), expected_node_count))
assert len(parser_edge_id_set) == expected_edge_count, (
    'parser states cover {} edges, graph has {}'.format(
        len(parser_edge_id_set), expected_edge_count))

0 [0] Dr. [] []
1 [0] Talcott ['compound'] []
2 [0] lead ['ARG1'] []
3 [0] a [] []
4 [0] team ['ARG2', 'BV'] []
6 [0] researcher ['ARG1'] []
7 [0] from ['ARG1'] []
8 [0] the [] []
9 [0] National [] []
10 [0] Cancer [] []
11 [0] Institute ['BV', 'ARG2', 'compound', 'compound'] []
13 [0] the [] []
14 [0] medical [] []
15 [0] school ['BV', 'ARG1', '_and_c'] []
16 [0] of ['ARG1'] []
17 [0] Harvard [] []
18 [0] University ['ARG2', 'compound'] []
20 [0] Boston [] []
21 [0] University ['_and_c', 'compound'] []


In [198]:
# Node ids present in the graph but never visited by a parser state; an empty
# set means full coverage.
set(node.get('id') for node in mrp_json.get('nodes')).difference(parser_node_id_set)

set()