In [2]:
# Detect whether this code is executing under IPython: the bare name
# `__IPYTHON__` resolves there and raises NameError otherwise.
try:
    __IPYTHON__
except NameError:
    USING_IPYTHON = False
else:
    USING_IPYTHON = True

#### Argparse

In [3]:
import argparse

# Command-line interface shared by notebook runs and script runs.
ap = argparse.ArgumentParser()
ap.add_argument('mrp_data_dir',
                help='root directory of the MRP data')
ap.add_argument('--train-sub-dir', default='training',
                help='sub-directory of mrp_data_dir holding one directory per framework')
ap.add_argument('--companion-sub-dir', default='./mrp-companion/2019/companion',
                help='directory with the companion files, one sub-directory per '
                     'framework (used as given, not joined with mrp_data_dir)')
ap.add_argument('--mrp-file-extension', default='.mrp',
                help='file name suffix of the MRP graph files')
ap.add_argument('--companion-file-extension', default='.conllu',
                help='file name suffix of the companion files')

# Arguments used when running inside the notebook, written one per line.
arg_string = """
    ./data/
"""
# str.split() with no separator splits on any run of whitespace (including
# line breaks) and drops empty tokens. The previous separator r'\\n' was a
# literal backslash-backslash-n and never matched a line break.
arguments = arg_string.split()

In [4]:
# Under IPython the notebook's own argument list is parsed; otherwise None is
# passed, which makes argparse read the process command line (sys.argv[1:]).
args = ap.parse_args(arguments if USING_IPYTHON else None)

In [5]:
# Display the parsed arguments.
args

Namespace(companion_file_extension='.conllu', companion_sub_dir='./mrp-companion/2019/companion', mrp_data_dir='./data/', mrp_file_extension='.mrp', train_sub_dir='training')

#### Library imports

In [6]:
import json
import logging
import os
import string

from tqdm import tqdm
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np

#### IPython notebook-specific imports

In [7]:
if USING_IPYTHON:
    # Render matplotlib figures inline in the notebook output.
    # NOTE(review): `%matplotlib` is IPython magic syntax, so a plain Python
    # interpreter cannot parse this cell even though the branch is guarded;
    # presumably a notebook-to-script export rewrites it - confirm.
    %matplotlib inline

In [8]:
# Root logging: INFO and above, emitted through a single stream handler.
logging.basicConfig(
    handlers=[logging.StreamHandler()],
    level=logging.INFO,
)
# Logger for this notebook/module, also pinned to INFO.
logger = logging.getLogger(__name__)  # pylint: disable=invalid-name
logger.setLevel(logging.INFO)

### Constants

In [9]:
# String constant; where it is used is not visible in this part of the notebook.
# NOTE(review): the spelling looks like a typo for "UNKNOWN". Name and value are
# left unchanged because other cells may reference either - confirm before renaming.
UNKWOWN = 'UNKWOWN'

### Load data

In [10]:
# Training data lives under <mrp_data_dir>/<train_sub_dir>.
train_dir = os.path.join(args.mrp_data_dir, args.train_sub_dir)
# Every sub-directory of the training directory is treated as one framework;
# plain files are ignored.
frameworks = list(filter(
    lambda entry: os.path.isdir(os.path.join(train_dir, entry)),
    os.listdir(train_dir),
))
frameworks

['ucca', 'psd', 'eds', 'dm', 'amr']

In [11]:
# Load every MRP file into: framework -> dataset name -> list of parsed graphs.
# Each non-blank line of an MRP file is parsed as one JSON object.
framework2dataset2mrp_jsons = {}
for framework in tqdm(frameworks, desc='frameworks'):
    dataset2mrp_jsons = {}
    framework_dir = os.path.join(train_dir, framework)
    dataset_names = os.listdir(framework_dir)
    # Only 'ucca' graphs get node labels filled in from their anchored text;
    # the check does not depend on the line, so it is evaluated once.
    fill_labels_from_anchors = framework == 'ucca'

    for dataset_name in tqdm(dataset_names, desc='dataset_name'):
        if not dataset_name.endswith(args.mrp_file_extension):
            continue
        mrp_jsons = []
        # Explicit encoding: without it the platform default is used for
        # decoding, which is not UTF-8 everywhere.
        # NOTE(review): assumes the MRP files are UTF-8 encoded - confirm.
        with open(os.path.join(framework_dir, dataset_name), encoding='utf-8') as rf:
            for line in rf:
                line = line.strip()
                if not line:
                    # A blank line holds no graph and would make json.loads raise.
                    continue
                mrp_json = json.loads(line)
                if fill_labels_from_anchors and 'nodes' in mrp_json and 'input' in mrp_json:
                    input_text = mrp_json['input']
                    for node in mrp_json['nodes']:
                        if 'anchors' not in node:
                            continue
                        # The label becomes the concatenation of the input
                        # spans the node is anchored to (overwriting any
                        # existing label). A missing 'from'/'to' falls back
                        # to -1, as before.
                        node['label'] = ''.join(
                            input_text[anchor.get('from', -1): anchor.get('to', -1)]
                            for anchor in node['anchors']
                        )
                mrp_jsons.append(mrp_json)
        # Key the graphs by the file name up to its first dot.
        dataset2mrp_jsons[dataset_name.split('.')[0]] = mrp_jsons

    framework2dataset2mrp_jsons[framework] = dataset2mrp_jsons

frameworks:   0%|          | 0/5 [00:00<?, ?it/s]
dataset_name:   0%|          | 0/2 [00:00<?, ?it/s][A
dataset_name:  50%|█████     | 1/2 [00:00<00:00,  3.93it/s][A
frameworks:  20%|██        | 1/5 [00:00<00:02,  1.56it/s]s][A
dataset_name:   0%|          | 0/1 [00:00<?, ?it/s][A
frameworks:  40%|████      | 2/5 [00:04<00:05,  1.70s/it]t][A
dataset_name:   0%|          | 0/1 [00:00<?, ?it/s][A
frameworks:  60%|██████    | 3/5 [00:11<00:06,  3.10s/it]t][A
dataset_name:   0%|          | 0/1 [00:00<?, ?it/s][A
frameworks:  80%|████████  | 4/5 [00:18<00:04,  4.22s/it]t][A
dataset_name:   0%|          | 0/14 [00:00<?, ?it/s][A
dataset_name:  43%|████▎     | 6/14 [00:00<00:00, 21.87it/s][A
dataset_name:  57%|█████▋    | 8/14 [00:00<00:00, 18.54it/s][A
dataset_name:  71%|███████▏  | 10/14 [00:01<00:00,  6.65it/s][A
dataset_name:  79%|███████▊  | 11/14 [00:01<00:00,  6.02it/s][A
frameworks: 100%|██████████| 5/5 [00:19<00:00,  3.39s/it]t/s][A


In [12]:
# Log each framework followed by the names of the datasets loaded for it.
for framework, loaded_datasets in framework2dataset2mrp_jsons.items():
    logger.info(framework)
    logger.info(list(loaded_datasets))

INFO:__main__:ucca
INFO:__main__:['wiki', 'ewt']
INFO:__main__:psd
INFO:__main__:['wsj']
INFO:__main__:eds
INFO:__main__:['wsj']
INFO:__main__:dm
INFO:__main__:['wsj']
INFO:__main__:amr
INFO:__main__:['xinhua', 'wsj', 'wiki', 'wb', 'rte', 'proxy', 'mt09sdl', 'lorelei', 'fables', 'dfb', 'dfa', 'cctv', 'bolt', 'amr-guidelines']


### Data preprocessing: companion parses

In [15]:
# Load the companion files into: dataset name -> sentence id -> list of token
# rows, where each row is one tab-separated line of the file.
dataset2cid2parse = {}
for framework in os.listdir(args.companion_sub_dir):
    framework_dir = os.path.join(args.companion_sub_dir, framework)
    if not os.path.isdir(framework_dir):
        continue
    for dataset in tqdm(os.listdir(framework_dir), desc='dataset'):
        if not dataset.endswith(args.companion_file_extension):
            continue
        # Trailing digits are stripped, so numbered files map to the same key.
        dataset_name = dataset.split('.')[0].rstrip(string.digits)
        cid2parse = {}
        # Reset per file so an id from a previous file (or an unbound name)
        # can never be picked up by a leading blank line.
        cid = ''
        parse = []
        # Explicit encoding: without it the platform default is used for decoding.
        # NOTE(review): assumes the companion files are UTF-8 encoded - confirm.
        with open(os.path.join(framework_dir, dataset), encoding='utf-8') as rf:
            for line in rf:
                line = line.strip()
                if not line:
                    # A blank line ends the current sentence; repeated blank
                    # lines must not store an empty, id-less entry.
                    if cid or parse:
                        cid2parse[cid] = parse
                    parse = []
                    cid = ''
                elif line.startswith('#'):
                    # Everything after the '#' is taken as the sentence id.
                    cid = line[1:]
                else:
                    parse.append(line.split('\t'))
        # Keep the final sentence when the file does not end with a blank line.
        if cid or parse:
            cid2parse[cid] = parse
        # Merge instead of assigning: several files can share one dataset key,
        # and plain assignment kept only the parses of the last file read.
        dataset2cid2parse.setdefault(dataset_name, {}).update(cid2parse)

dataset: 100%|██████████| 13/13 [00:01<00:00,  6.88it/s]
dataset: 100%|██████████| 5/5 [00:04<00:00,  1.05s/it]
dataset: 100%|██████████| 6/6 [00:00<00:00, 22.10it/s]


In [16]:
# List the dataset keys for which companion parses were loaded.
dataset2cid2parse.keys()

dict_keys(['amr-guidelines', 'bolt', 'cctv', 'dfa', 'dfb', 'fables', 'lorelei', 'mt09sdl', 'proxy', 'rte', 'wb', 'wiki', 'xinhua', 'wsj', 'ewt'])

In [21]:
# Pick one example graph and look up its companion parse by graph id.
framework = 'amr'
dataset = 'xinhua'
example_id = framework2dataset2mrp_jsons[framework][dataset][1]['id']
dataset2cid2parse[dataset][example_id]

[['1',
  'According',
  'accord',
  'VERB',
  'VBG',
  '_',
  '10',
  'case',
  '_',
  'TokenRange=0:9'],
 ['2', 'to', 'to', 'ADP', 'TO', '_', '1', 'fixed', '_', 'TokenRange=10:12'],
 ['3',
  'Taiwan',
  'Taiwan',
  'PROPN',
  'NNP',
  '_',
  '10',
  'nmod:poss',
  '_',
  'TokenRange=13:19'],
 ['4', '’s', '’s', 'PART', 'POS', '_', '3', 'case', '_', 'TokenRange=20:22'],
 ['5', '“', '“', 'PUNCT', '``', '_', '10', 'punct', '_', 'TokenRange=23:24'],
 ['6',
  'Ministry',
  'Ministry',
  'PROPN',
  'NNP',
  '_',
  '10',
  'dep',
  '_',
  'TokenRange=25:33'],
 ['7', 'of', 'of', 'ADP', 'IN', '_', '8', 'case', '_', 'TokenRange=34:36'],
 ['8',
  'Economy',
  'Economy',
  'PROPN',
  'NNP',
  '_',
  '6',
  'obl',
  '_',
  'TokenRange=37:44'],
 ['9', '“', '“', 'PUNCT', '``', '_', '10', 'punct', '_', 'TokenRange=45:46'],
 ['10',
  'statistics',
  'statistics',
  'NOUN',
  'NNS',
  '_',
  '26',
  'obl',
  '_',
  'TokenRange=47:57'],
 ['11', ',', ',', 'PUNCT', ',', '_', '26', 'punct', '_', 'TokenRange

In [22]:
# Display the graph at index 1 of the currently selected framework and dataset.
framework2dataset2mrp_jsons[framework][dataset][1]

{'id': 'nw.chtb_0012.2',
 'flavor': 2,
 'framework': 'amr',
 'version': 0.9,
 'time': '2019-04-10 (20:11)',
 'input': 'According to Taiwan \'s " Ministry of Economy " statistics , the volume of trade between mainland and Taiwan last year was 20.9 billion US dollars .',
 'tops': [0],
 'nodes': [{'id': 0, 'label': 'say-01'},
  {'id': 1, 'label': 'statistic'},
  {'id': 2, 'label': 'government-organization'},
  {'id': 3,
   'label': 'name',
   'properties': ['op1', 'op2', 'op3'],
   'values': ['Ministry', 'of', 'Economy']},
  {'id': 4,
   'label': 'monetary-quantity',
   'properties': ['quant'],
   'values': ['20900000000']},
  {'id': 5, 'label': 'dollar'},
  {'id': 6, 'label': 'country'},
  {'id': 7, 'label': 'name', 'properties': ['op1'], 'values': ['US']},
  {'id': 8, 'label': 'volume'},
  {'id': 9, 'label': 'trade-01'},
  {'id': 10, 'label': 'mainland'},
  {'id': 11, 'label': 'country'},
  {'id': 12, 'label': 'name', 'properties': ['op1'], 'values': ['Taiwan']},
  {'id': 13, 'label': '