In [1]:
try:
    __IPYTHON__
    USING_IPYTHON = True
except NameError:
    USING_IPYTHON = False

#### Argparse

In [12]:
import argparse
ap = argparse.ArgumentParser()
ap.add_argument('project_root', help='')
ap.add_argument('--mrp-data-dir', default='data', help='')
ap.add_argument('--graphviz-sub-dir', default='visualization/graphviz', help='')
ap.add_argument('--train-sub-dir', default='training', help='')
ap.add_argument('--companion-sub-dir', default='companion')

ap.add_argument('--mrp-file-extension', default='.mrp')
ap.add_argument('--companion-file-extension', default='.conllu')
ap.add_argument('--graphviz-file-template', default='http://localhost:8000/files/proj29_ds1/home/slai/mrp2019/visualization/graphviz/{}/{}.mrp/{}.png')
arg_string = """
    /data/proj29_ds1/home/slai/mrp2019
"""
arguments = [arg for arg_line in arg_string.split(r'\\n') for arg in arg_line.split()]

In [13]:
if USING_IPYTHON:
    args = ap.parse_args(arguments)
else:
    args = ap.parse_args()

In [14]:
args

Namespace(companion_file_extension='.conllu', companion_sub_dir='companion', graphviz_file_template='http://localhost:8000/files/proj29_ds1/home/slai/mrp2019/visualization/graphviz/{}/{}.mrp/{}.png', graphviz_sub_dir='visualization/graphviz', mrp_data_dir='data', mrp_file_extension='.mrp', project_root='/data/proj29_ds1/home/slai/mrp2019', train_sub_dir='training')

#### Library imports

In [15]:
import json
import logging
import os
import pprint
import string

from collections import Counter
from collections import defaultdict
from tqdm import tqdm
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import plot_util

#### ipython notebook specific imports

In [16]:
if USING_IPYTHON:
    # matplotlib config
    %matplotlib inline

In [17]:
logging.basicConfig(level=logging.INFO, handlers=[logging.StreamHandler()])
logger = logging.getLogger(__name__)  # pylint: disable=invalid-name
logger.setLevel(logging.INFO)

### Constants

In [18]:
UNKWOWN = 'UNKWOWN'

### Load data

In [19]:
train_dir = os.path.join(args.project_root, args.mrp_data_dir, args.train_sub_dir)
frameworks = [sub_dir for sub_dir in os.listdir(train_dir) if os.path.isdir(os.path.join(train_dir, sub_dir))]
frameworks

['ucca', 'psd', 'eds', 'dm', 'amr']

In [20]:
framework2dataset2mrp_jsons = {}
for framework in tqdm(frameworks, desc='frameworks'):
    dataset2mrp_jsons = {}
    framework_dir = os.path.join(train_dir, framework)
    dataset_names = os.listdir(framework_dir)
    
    for dataset_name in tqdm(dataset_names, desc='dataset_name'):
        mrp_jsons = []
        if not dataset_name.endswith(args.mrp_file_extension):
            continue
        with open(os.path.join(framework_dir, dataset_name)) as rf:
            for line in rf:
                mrp_json = json.loads(line.strip())
                if framework == 'ucca' and 'nodes' in mrp_json and 'input' in mrp_json:
                    input_text = mrp_json['input']
                    nodes = mrp_json['nodes']
                    for i, node in enumerate(nodes):
                        if 'anchors' not in node:
                            continue
                        text_segments = []
                        for anchor in node['anchors']:
                            text_segments.append(input_text[anchor.get('from', -1): anchor.get('to', -1)])
                        mrp_json['nodes'][i]['label'] = ''.join(text_segments)
                        
                mrp_jsons.append(mrp_json)
        dataset_name = dataset_name.split('.')[0]
        dataset2mrp_jsons[dataset_name] = mrp_jsons
                
    framework2dataset2mrp_jsons[framework] = dataset2mrp_jsons

frameworks:   0%|          | 0/5 [00:00<?, ?it/s]
dataset_name:   0%|          | 0/2 [00:00<?, ?it/s][A
dataset_name:  50%|█████     | 1/2 [00:00<00:00,  2.35it/s][A
frameworks:  20%|██        | 1/5 [00:00<00:03,  1.16it/s]s][A
dataset_name:   0%|          | 0/1 [00:00<?, ?it/s][A
frameworks:  40%|████      | 2/5 [00:04<00:05,  1.83s/it]t][A
dataset_name:   0%|          | 0/1 [00:00<?, ?it/s][A
frameworks:  60%|██████    | 3/5 [00:11<00:06,  3.11s/it]t][A
dataset_name:   0%|          | 0/1 [00:00<?, ?it/s][A
frameworks:  80%|████████  | 4/5 [00:16<00:03,  3.93s/it]t][A
dataset_name:   0%|          | 0/14 [00:00<?, ?it/s][A
dataset_name:  36%|███▌      | 5/14 [00:00<00:00, 42.18it/s][A
dataset_name:  50%|█████     | 7/14 [00:00<00:00, 17.33it/s][A
dataset_name:  64%|██████▍   | 9/14 [00:00<00:00, 16.08it/s][A
dataset_name:  79%|███████▊  | 11/14 [00:01<00:00,  5.14it/s][A
frameworks: 100%|██████████| 5/5 [00:18<00:00,  3.24s/it]t/s][A


In [21]:
for framework in framework2dataset2mrp_jsons:
    logger.info(framework)
    logger.info(list(framework2dataset2mrp_jsons[framework].keys()))

INFO:__main__:ucca
INFO:__main__:['wiki', 'ewt']
INFO:__main__:psd
INFO:__main__:['wsj']
INFO:__main__:eds
INFO:__main__:['wsj']
INFO:__main__:dm
INFO:__main__:['wsj']
INFO:__main__:amr
INFO:__main__:['xinhua', 'wsj', 'wiki', 'wb', 'rte', 'proxy', 'mt09sdl', 'lorelei', 'fables', 'dfb', 'dfa', 'cctv', 'bolt', 'amr-guidelines']


### Data Preprocessing companion

In [12]:
dataset2cid2parse = {}
for framework in os.listdir(args.companion_sub_dir):
    framework_dir = os.path.join(args.companion_sub_dir, framework)
    if not os.path.isdir(framework_dir):
        continue
    for dataset in tqdm(os.listdir(framework_dir), desc='dataset'):
        if not dataset.endswith(args.companion_file_extension):
            continue
        dataset_name = dataset.split('.')[0].rstrip(string.digits)
        cid2parse = {}
        with open(os.path.join(framework_dir, dataset)) as rf:
            parse = []
            for line in rf:
                line = line.strip()
                if not line:
                    cid2parse[cid] = parse
                    parse = []
                    cid = ''
                elif line.startswith('#'):
                    cid = line[1:]
                else:
                    parse.append(line.split('\t'))
        dataset2cid2parse[dataset_name] = cid2parse

dataset: 100%|██████████| 13/13 [00:04<00:00,  2.99it/s]
dataset: 100%|██████████| 5/5 [00:01<00:00,  3.75it/s]
dataset: 100%|██████████| 6/6 [00:00<00:00, 26.27it/s]


In [13]:
dataset2cid2parse.keys()

dict_keys(['amr-guidelines', 'bolt', 'cctv', 'dfa', 'dfb', 'fables', 'lorelei', 'mt09sdl', 'proxy', 'rte', 'wb', 'wiki', 'xinhua', 'wsj', 'ewt'])

In [14]:
'20003001' in dataset2cid2parse['wsj']

False

In [15]:
frameworks

['ucca', 'psd', 'eds', 'dm', 'amr']

### Plot companion data

In [43]:
dataset = 'wiki'
framework = 'dm'
framework = 'eds'
framework = 'psd'
framework = 'ucca'
# framework = 'amr'
mrp_index = 10

parse = None
while not parse:
    mrp_index += 1
    cid = framework2dataset2mrp_jsons[framework][dataset][mrp_index]['id']
    parse = dataset2cid2parse[dataset].get(cid)
    
# print sentence
print(mrp_index, ' '.join([word_record[1] for word_record in parse]))

# print corresponding mrp png link
for framework in frameworks:
    if dataset not in framework2dataset2mrp_jsons[framework]:
        continue
    if len(framework2dataset2mrp_jsons[framework][dataset]) <= mrp_index:
        continue
    if framework2dataset2mrp_jsons[framework][dataset][mrp_index]['id'] == cid:
        print(args.graphviz_file_template.format(framework, dataset, cid))

# 
def plot_parse(parse, args, dataset, dataset_dir, cid):
    """plot parse and print png filename"""
    parse_dg = plot_util.parse_to_directed_graph(parse)
    dataset_dir = os.path.join(args.graphviz_dir, dataset)
    image_path = os.path.join(dataset_dir, cid + '.png')
    os.makedirs(dataset_dir, exist_ok=True)
    plot_util.directed_graph_to_graphviz_image(parse_dg, image_path)
    return os.path.join('http://localhost:8000/files/proj29_ds1/home/slai/mrp2019', image_path)
    
parse_plot_filename = plot_parse(parse, args, dataset, dataset_dir, cid)
print(parse_plot_filename)

11 One of the most popular musicians of the 20th century , he is often referred to as the “ King of Rock and Roll ” or “ the King”. Born in Tupelo , Mississippi , Presley moved to Memphis , Tennessee , with his family at the age of 13 .
http://localhost:8000/files/proj29_ds1/home/slai/mrp2019/graphviz/ucca/wiki.mrp/553002.png
http://localhost:8000/files/proj29_ds1/home/slai/mrp2019/graphviz/wiki/553002.png
