In [1]:
# Detect the execution environment: merely referencing the name `__IPYTHON__`
# raises NameError when it is not defined, in which case we are not in IPython.
try:
    __IPYTHON__
except NameError:
    USING_IPYTHON = False
else:
    USING_IPYTHON = True

#### Argparse

In [43]:
import argparse
# Command-line interface of the notebook/script.
ap = argparse.ArgumentParser()
ap.add_argument(
    'mrp_data_dir',
    help='root directory of the MRP data; the training sub-directory is resolved relative to it')
ap.add_argument(
    '--train-sub-dir', default='training',
    help='sub-directory of mrp_data_dir that holds the training data (default: %(default)s)')
ap.add_argument(
    '--companion-sub-dir', default='./mrp-companion/2019/companion',
    help='directory whose sub-directories hold the companion files; '
         'used as given, not joined with mrp_data_dir (default: %(default)s)')
ap.add_argument(
    '--mrp-file-extension', default='.mrp',
    help='only files ending with this extension are read as MRP data (default: %(default)s)')
ap.add_argument(
    '--companion-file-extension', default='.conllu',
    help='only files ending with this extension are read as companion data (default: %(default)s)')

# Stand-in for the command line when running inside IPython: write the
# arguments here, separated by any whitespace (spaces or newlines).
arg_string = """
    ./data/
"""
# str.split() with no separator splits on every run of whitespace, newlines
# included, so no separate per-line split is needed.
arguments = arg_string.split()

In [44]:
# Inside IPython parse the notebook-defined `arguments`; otherwise pass None so
# that argparse falls back to the real command line of the process.
args = ap.parse_args(arguments if USING_IPYTHON else None)

In [45]:
# Show the parsed argument namespace for inspection.
args

Namespace(companion_file_extension='.conllu', companion_sub_dir='./mrp-companion/2019/companion', mrp_data_dir='./data/', mrp_file_extension='.mrp', train_sub_dir='training')

#### Library imports

In [68]:
import json
import logging
import os
import string

from tqdm import tqdm
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np

#### ipython notebook specific imports

In [10]:
# Only configure the matplotlib backend when IPython was detected.
if USING_IPYTHON:
    # matplotlib config
    # NOTE(review): `%matplotlib` is IPython magic syntax, not plain Python;
    # presumably a script export rewrites this line - confirm before running
    # this file with a bare Python interpreter.
    %matplotlib inline

In [11]:
# Configure the root logger with a StreamHandler at INFO level, then create
# this module's own logger and pin it to INFO as well.
logging.basicConfig(
    handlers=[logging.StreamHandler()],
    level=logging.INFO,
)
logger = logging.getLogger(__name__)  # pylint: disable=invalid-name
logger.setLevel(logging.INFO)

### Constants

In [12]:
# Placeholder string constant.
# NOTE(review): both the name and the value look like a misspelling of
# 'UNKNOWN'; they are left untouched here because other cells or saved
# artefacts may depend on the exact spelling - confirm before renaming.
UNKWOWN = 'UNKWOWN'

### Load data

In [13]:
# Directory with the training data; each of its sub-directories is treated as
# one framework (plain files directly inside it are ignored).
train_dir = os.path.join(args.mrp_data_dir, args.train_sub_dir)
frameworks = list(filter(
    lambda entry: os.path.isdir(os.path.join(train_dir, entry)),
    os.listdir(train_dir),
))
frameworks

['ucca', 'psd', 'eds', 'dm', 'amr']

In [32]:
def _add_ucca_labels(mrp_json):
    """Attach a text 'label' to every anchored node of one MRP graph, in place.

    The label is the concatenation of the slices of ``mrp_json['input']``
    selected by the node's anchors. Graphs without a 'nodes' or 'input' key
    and nodes without 'anchors' are left unchanged.

    NOTE(review): segments of a multi-anchor node are joined without a
    separator (two anchors covering 'Sherilyn' and 'Fenn' yield
    'SherilynFenn'); kept as in the original - confirm this is intended.
    """
    if 'nodes' not in mrp_json or 'input' not in mrp_json:
        return
    input_text = mrp_json['input']
    for node in mrp_json['nodes']:
        if 'anchors' not in node:
            continue
        # A missing 'from'/'to' defaults to -1, which selects an empty slice.
        node['label'] = ''.join(
            input_text[anchor.get('from', -1): anchor.get('to', -1)]
            for anchor in node['anchors']
        )


# framework name -> dataset name -> list of MRP graphs (one parsed JSON object
# per non-empty line of the dataset file).
framework2dataset2mrp_jsons = {}
for framework in tqdm(frameworks, desc='frameworks'):
    dataset2mrp_jsons = {}
    framework_dir = os.path.join(train_dir, framework)
    dataset_names = os.listdir(framework_dir)

    for dataset_name in tqdm(dataset_names, desc='dataset_name'):
        # Ignore anything that is not an MRP file.
        if not dataset_name.endswith(args.mrp_file_extension):
            continue
        mrp_jsons = []
        # Decode explicitly as UTF-8: the inputs contain non-ASCII characters
        # and the platform default encoding is not guaranteed to handle them.
        with open(os.path.join(framework_dir, dataset_name), encoding='utf-8') as rf:
            for line in rf:
                line = line.strip()
                if not line:
                    # A blank line carries no graph and would crash json.loads.
                    continue
                mrp_json = json.loads(line)
                if framework == 'ucca':
                    _add_ucca_labels(mrp_json)
                mrp_jsons.append(mrp_json)
        # Key the graphs by the file name up to its first dot.
        dataset_name = dataset_name.split('.')[0]
        dataset2mrp_jsons[dataset_name] = mrp_jsons

    framework2dataset2mrp_jsons[framework] = dataset2mrp_jsons

frameworks:   0%|          | 0/5 [00:00<?, ?it/s]
dataset_name:   0%|          | 0/2 [00:00<?, ?it/s][A
dataset_name:  50%|█████     | 1/2 [00:00<00:00,  4.06it/s][A
frameworks:  20%|██        | 1/5 [00:00<00:01,  2.27it/s]s][A
dataset_name:   0%|          | 0/1 [00:00<?, ?it/s][A
frameworks:  40%|████      | 2/5 [00:07<00:07,  2.36s/it]t][A
dataset_name:   0%|          | 0/1 [00:00<?, ?it/s][A
frameworks:  60%|██████    | 3/5 [00:11<00:05,  2.90s/it]t][A
dataset_name:   0%|          | 0/1 [00:00<?, ?it/s][A
frameworks:  80%|████████  | 4/5 [00:16<00:03,  3.47s/it]t][A
dataset_name:   0%|          | 0/14 [00:00<?, ?it/s][A
dataset_name:  43%|████▎     | 6/14 [00:00<00:00, 21.92it/s][A
dataset_name:  57%|█████▋    | 8/14 [00:00<00:00, 18.67it/s][A
dataset_name:  71%|███████▏  | 10/14 [00:02<00:01,  2.37it/s][A
dataset_name:  79%|███████▊  | 11/14 [00:03<00:01,  2.81it/s][A
frameworks: 100%|██████████| 5/5 [00:19<00:00,  3.40s/it]t/s][A


In [33]:
# Log every framework followed by the names of the datasets loaded for it.
for framework, datasets in framework2dataset2mrp_jsons.items():
    logger.info(framework)
    logger.info(list(datasets))

INFO:__main__:ucca
INFO:__main__:['wiki', 'ewt']
INFO:__main__:psd
INFO:__main__:['wsj']
INFO:__main__:eds
INFO:__main__:['wsj']
INFO:__main__:dm
INFO:__main__:['wsj']
INFO:__main__:amr
INFO:__main__:['xinhua', 'wsj', 'wiki', 'wb', 'rte', 'proxy', 'mt09sdl', 'lorelei', 'fables', 'dfb', 'dfa', 'cctv', 'bolt', 'amr-guidelines']


### Data Preprocessing companion

In [73]:
def _read_companion_parses(path):
    """Read one companion file and return a dict mapping sentence id -> parse.

    Expected layout, as handled below: a line starting with '#' carries the
    sentence id (everything after the '#'), each following non-empty line is
    one token whose tab-separated fields become a list of strings, and an
    empty line ends the sentence.

    Token blocks without a preceding id line are dropped, and a final
    sentence that is not followed by an empty line is still stored.
    """
    cid2parse = {}
    cid, parse = '', []
    with open(path, encoding='utf-8') as rf:
        for line in rf:
            line = line.strip()
            if line.startswith('#'):
                cid = line[1:]
            elif line:
                parse.append(line.split('\t'))
            else:
                # End of sentence: store it only if an id was seen, so that
                # repeated blank lines do not create an entry under ''.
                if cid:
                    cid2parse[cid] = parse
                cid, parse = '', []
    # Flush the last sentence when the file does not end with a blank line.
    if cid:
        cid2parse[cid] = parse
    return cid2parse


# dataset name -> sentence id -> parse (list of token field lists).
dataset2cid2parse = {}
for framework in os.listdir(args.companion_sub_dir):
    framework_dir = os.path.join(args.companion_sub_dir, framework)
    if not os.path.isdir(framework_dir):
        continue
    for dataset in tqdm(os.listdir(framework_dir), desc='dataset'):
        if not dataset.endswith(args.companion_file_extension):
            continue
        # Trailing digits are stripped, so files whose names differ only by a
        # numeric suffix share one dataset name.
        dataset_name = dataset.split('.')[0].rstrip(string.digits)
        cid2parse = _read_companion_parses(os.path.join(framework_dir, dataset))
        # Merge rather than assign: several files can map to the same dataset
        # name, and plain assignment would keep only the last file read.
        dataset2cid2parse.setdefault(dataset_name, {}).update(cid2parse)

dataset: 100%|██████████| 13/13 [00:06<00:00,  1.49it/s]
dataset: 100%|██████████| 5/5 [00:01<00:00,  3.91it/s]
dataset: 100%|██████████| 6/6 [00:00<00:00, 23.34it/s]


In [74]:
# List the dataset names for which companion parses were loaded.
dataset2cid2parse.keys()

dict_keys(['amr-guidelines', 'bolt', 'cctv', 'dfa', 'dfb', 'fables', 'lorelei', 'mt09sdl', 'proxy', 'rte', 'wb', 'wiki', 'xinhua', 'wsj', 'ewt'])

In [85]:
# Spot check: look up the companion parse of the UCCA 'wiki' graph at index 2
# via that graph's 'id'.
dataset2cid2parse['wiki'][framework2dataset2mrp_jsons['ucca']['wiki'][2]['id']]

[['1',
  'Both',
  'both',
  'DET',
  'DT',
  '_',
  '2',
  'cc:preconj',
  '_',
  'TokenRange=0:4'],
 ['2',
  'Depp',
  'Depp',
  'PROPN',
  'NNP',
  '_',
  '9',
  'nsubj',
  '_',
  'TokenRange=5:9'],
 ['3', 'and', 'and', 'CONJ', 'CC', '_', '8', 'cc', '_', 'TokenRange=10:13'],
 ['4',
  'his',
  'he',
  'PRON',
  'PRP$',
  '_',
  '8',
  'nmod:poss',
  '_',
  'TokenRange=14:17'],
 ['5',
  'subsequent',
  'subsequent',
  'ADJ',
  'JJ',
  '_',
  '8',
  'amod',
  '_',
  'TokenRange=18:28'],
 ['6',
  'fiancé',
  'fiancé',
  'NOUN',
  'NN',
  '_',
  '8',
  'compound',
  '_',
  'TokenRange=29:35'],
 ['7',
  'Sherilyn',
  'Sherilyn',
  'PROPN',
  'NNP',
  '_',
  '8',
  'compound',
  '_',
  'TokenRange=36:44'],
 ['8',
  'Fenn',
  'Fenn',
  'PROPN',
  'NNP',
  '_',
  '2',
  'conj',
  '_',
  'TokenRange=45:49'],
 ['9',
  'auditioned',
  'audition',
  'VERB',
  'VBD',
  '_',
  '0',
  'root',
  '_',
  'TokenRange=50:60'],
 ['10', 'for', 'for', 'ADP', 'IN', '_', '13', 'case', '_', 'TokenRange=61:64'

In [86]:
# Spot check: show the UCCA 'wiki' graph at index 2, including the node labels
# added while loading.
framework2dataset2mrp_jsons['ucca']['wiki'][2]

{'id': '586010',
 'flavor': 1,
 'framework': 'ucca',
 'version': 0.9,
 'time': '2019-04-11 (22:04)',
 'input': "Both Depp and his subsequent fiancé Sherilyn Fenn auditioned for the 1986 film Thrashin' and they were both cast, with Depp being chosen by the film's director to star as the lead, which would have been Depp's second major role.",
 'tops': [47],
 'nodes': [{'id': 0, 'anchors': [{'from': 0, 'to': 4}], 'label': 'Both'},
  {'id': 1, 'anchors': [{'from': 5, 'to': 9}], 'label': 'Depp'},
  {'id': 2, 'anchors': [{'from': 10, 'to': 13}], 'label': 'and'},
  {'id': 3, 'anchors': [{'from': 14, 'to': 17}], 'label': 'his'},
  {'id': 4, 'anchors': [{'from': 18, 'to': 28}], 'label': 'subsequent'},
  {'id': 5, 'anchors': [{'from': 29, 'to': 35}], 'label': 'fiancé'},
  {'id': 6,
   'anchors': [{'from': 36, 'to': 44}, {'from': 45, 'to': 49}],
   'label': 'SherilynFenn'},
  {'id': 7, 'anchors': [{'from': 50, 'to': 60}], 'label': 'auditioned'},
  {'id': 8, 'anchors': [{'from': 61, 'to': 64}], 'l