In [1]:
try:
    __IPYTHON__
    USING_IPYTHON = True
except NameError:
    USING_IPYTHON = False

#### Argparse

In [2]:
import argparse
ap = argparse.ArgumentParser()
ap.add_argument('mrp_data_dir', help='')
ap.add_argument('--train-sub-dir', default='training', help='')
ap.add_argument('--graphviz-dir', default='graphviz', help='')
ap.add_argument('--companion-sub-dir', default='./mrp-companion/2019/companion')
ap.add_argument('--mrp-file-extension', default='.mrp')
ap.add_argument('--companion-file-extension', default='.conllu')
ap.add_argument('--graphviz-file-template', default='http://localhost:8000/files/proj29_ds1/home/slai/mrp2019/graphviz/{}/{}.mrp/{}.png')
arg_string = """
    ./data/
"""
arguments = [arg for arg_line in arg_string.split(r'\\n') for arg in arg_line.split()]

In [3]:
if USING_IPYTHON:
    args = ap.parse_args(arguments)
else:
    args = ap.parse_args()

In [4]:
args

Namespace(companion_file_extension='.conllu', companion_sub_dir='./mrp-companion/2019/companion', graphviz_dir='graphviz', graphviz_file_template='http://localhost:8000/files/proj29_ds1/home/slai/mrp2019/graphviz/{}/{}.mrp/{}.png', mrp_data_dir='./data/', mrp_file_extension='.mrp', train_sub_dir='training')

#### Library imports

In [5]:
import json
import logging
import os
import pprint
import string

from collections import Counter
from collections import defaultdict
from tqdm import tqdm
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import plot_util

#### ipython notebook specific imports

In [6]:
if USING_IPYTHON:
    # matplotlib config
    %matplotlib inline

In [7]:
logging.basicConfig(level=logging.INFO, handlers=[logging.StreamHandler()])
logger = logging.getLogger(__name__)  # pylint: disable=invalid-name
logger.setLevel(logging.INFO)

### Constants

In [8]:
UNKWOWN = 'UNKWOWN'

### Load data

In [9]:
train_dir = os.path.join(args.mrp_data_dir, args.train_sub_dir)
frameworks = [sub_dir for sub_dir in os.listdir(train_dir) if os.path.isdir(os.path.join(train_dir, sub_dir))]
frameworks

['ucca', 'psd', 'eds', 'dm', 'amr']

In [10]:
framework2dataset2mrp_jsons = {}
for framework in tqdm(frameworks, desc='frameworks'):
    dataset2mrp_jsons = {}
    framework_dir = os.path.join(train_dir, framework)
    dataset_names = os.listdir(framework_dir)
    
    for dataset_name in tqdm(dataset_names, desc='dataset_name'):
        mrp_jsons = []
        if not dataset_name.endswith(args.mrp_file_extension):
            continue
        with open(os.path.join(framework_dir, dataset_name)) as rf:
            for line in rf:
                mrp_json = json.loads(line.strip())
                if framework == 'ucca' and 'nodes' in mrp_json and 'input' in mrp_json:
                    input_text = mrp_json['input']
                    nodes = mrp_json['nodes']
                    for i, node in enumerate(nodes):
                        if 'anchors' not in node:
                            continue
                        text_segments = []
                        for anchor in node['anchors']:
                            text_segments.append(input_text[anchor.get('from', -1): anchor.get('to', -1)])
                        mrp_json['nodes'][i]['label'] = ''.join(text_segments)
                        
                mrp_jsons.append(mrp_json)
        dataset_name = dataset_name.split('.')[0]
        dataset2mrp_jsons[dataset_name] = mrp_jsons
                
    framework2dataset2mrp_jsons[framework] = dataset2mrp_jsons

frameworks:   0%|          | 0/5 [00:00<?, ?it/s]
dataset_name:   0%|          | 0/2 [00:00<?, ?it/s][A
dataset_name:  50%|█████     | 1/2 [00:00<00:00,  3.25it/s][A
frameworks:  20%|██        | 1/5 [00:00<00:02,  1.46it/s]s][A
dataset_name:   0%|          | 0/1 [00:00<?, ?it/s][A
frameworks:  40%|████      | 2/5 [00:04<00:04,  1.55s/it]t][A
dataset_name:   0%|          | 0/1 [00:00<?, ?it/s][A
frameworks:  60%|██████    | 3/5 [00:09<00:05,  2.55s/it]t][A
dataset_name:   0%|          | 0/1 [00:00<?, ?it/s][A
frameworks:  80%|████████  | 4/5 [00:13<00:03,  3.24s/it]t][A
dataset_name:   0%|          | 0/14 [00:00<?, ?it/s][A
dataset_name:  43%|████▎     | 6/14 [00:00<00:00, 20.91it/s][A
dataset_name:  57%|█████▋    | 8/14 [00:00<00:00, 17.69it/s][A
dataset_name:  71%|███████▏  | 10/14 [00:01<00:00,  6.37it/s][A
dataset_name:  79%|███████▊  | 11/14 [00:01<00:00,  5.71it/s][A
frameworks: 100%|██████████| 5/5 [00:15<00:00,  2.72s/it]t/s][A


In [11]:
for framework in framework2dataset2mrp_jsons:
    logger.info(framework)
    logger.info(list(framework2dataset2mrp_jsons[framework].keys()))

INFO:__main__:ucca
INFO:__main__:['wiki', 'ewt']
INFO:__main__:psd
INFO:__main__:['wsj']
INFO:__main__:eds
INFO:__main__:['wsj']
INFO:__main__:dm
INFO:__main__:['wsj']
INFO:__main__:amr
INFO:__main__:['xinhua', 'wsj', 'wiki', 'wb', 'rte', 'proxy', 'mt09sdl', 'lorelei', 'fables', 'dfb', 'dfa', 'cctv', 'bolt', 'amr-guidelines']


### Data Preprocessing companion

In [12]:
dataset2cid2parse = {}
for framework in os.listdir(args.companion_sub_dir):
    framework_dir = os.path.join(args.companion_sub_dir, framework)
    if not os.path.isdir(framework_dir):
        continue
    for dataset in tqdm(os.listdir(framework_dir), desc='dataset'):
        if not dataset.endswith(args.companion_file_extension):
            continue
        dataset_name = dataset.split('.')[0].rstrip(string.digits)
        cid2parse = {}
        with open(os.path.join(framework_dir, dataset)) as rf:
            parse = []
            for line in rf:
                line = line.strip()
                if not line:
                    cid2parse[cid] = parse
                    parse = []
                    cid = ''
                elif line.startswith('#'):
                    cid = line[1:]
                else:
                    parse.append(line.split('\t'))
        dataset2cid2parse[dataset_name] = cid2parse

dataset: 100%|██████████| 13/13 [00:03<00:00,  3.91it/s]
dataset: 100%|██████████| 5/5 [00:01<00:00,  4.31it/s]
dataset: 100%|██████████| 6/6 [00:00<00:00, 32.31it/s]


In [13]:
dataset2cid2parse.keys()

dict_keys(['amr-guidelines', 'bolt', 'cctv', 'dfa', 'dfb', 'fables', 'lorelei', 'mt09sdl', 'proxy', 'rte', 'wb', 'wiki', 'xinhua', 'wsj', 'ewt'])

In [14]:
'20003001' in dataset2cid2parse['wsj']

False

In [15]:
frameworks

['ucca', 'psd', 'eds', 'dm', 'amr']

In [19]:
dataset = 'wiki'
framework = 'dm'
framework = 'eds'
framework = 'psd'
framework = 'ucca'
# framework = 'amr'
mrp_index = 0
parse = None
while not parse:
    mrp_index += 1
    cid = framework2dataset2mrp_jsons[framework][dataset][mrp_index]['id']
    parse = dataset2cid2parse[dataset].get(cid)
    
print(mrp_index, ' '.join([word_record[1] for word_record in parse]))

for framework in frameworks:
    if dataset not in framework2dataset2mrp_jsons[framework]:
        continue
    if len(framework2dataset2mrp_jsons[framework][dataset]) <= mrp_index:
        continue
    if framework2dataset2mrp_jsons[framework][dataset][mrp_index]['id'] == cid:
        print(args.graphviz_file_template.format(framework, dataset, cid))

parse_dg = plot_util.parse_to_directed_graph(parse)
dataset_dir = os.path.join(args.graphviz_dir, dataset)
image_path = os.path.join(dataset_dir,  cid + '.png')
os.makedirs(dataset_dir, exist_ok=True)
plot_util.directed_graph_to_graphviz_image(parse_dg, image_path)
print(os.path.join('http://localhost:8000/files/proj29_ds1/home/slai/mrp2019', image_path))

1 Bowie and Visconti continued their collaboration , producing a new album of completely original songs instead : the result of the sessions was the 2002 album Heathen .
http://localhost:8000/files/proj29_ds1/home/slai/mrp2019/graphviz/ucca/wiki.mrp/509011.png
http://localhost:8000/files/proj29_ds1/home/slai/mrp2019/graphviz/wiki/509011.png


In [30]:
dataset = 'wsj'
framework = 'dm'
framework = 'eds'
# framework = 'psd'
# framework = 'ucca'
# framework = 'amr'
mrp_index = 170
parse = None
while not parse:
    mrp_index += 1
    cid = framework2dataset2mrp_jsons[framework][dataset][mrp_index]['id']
    parse = dataset2cid2parse[dataset].get(cid)
    
(mrp_index, ' '.join([word_record[1] for word_record in parse]), parse)

(177,
 'Mrs. Hills said many of the 25 countries that she placed under varying degrees of scrutiny have made “ genuine progress ” on this touchy issue .',
 [['1',
   'Mrs.',
   'Mrs.',
   'PROPN',
   'NNP',
   '_',
   '2',
   'compound',
   '_',
   'TokenRange=0:4'],
  ['2',
   'Hills',
   'Hills',
   'PROPN',
   'NNP',
   '_',
   '3',
   'nsubj',
   '_',
   'TokenRange=5:10'],
  ['3',
   'said',
   'say',
   'VERB',
   'VBD',
   '_',
   '0',
   'root',
   '_',
   'TokenRange=11:15'],
  ['4',
   'many',
   'many',
   'ADJ',
   'JJ',
   '_',
   '18',
   'nsubj',
   '_',
   'TokenRange=16:20'],
  ['5', 'of', 'of', 'ADP', 'IN', '_', '8', 'case', '_', 'TokenRange=21:23'],
  ['6', 'the', 'the', 'DET', 'DT', '_', '8', 'det', '_', 'TokenRange=24:27'],
  ['7', '25', '25', 'NUM', 'CD', '_', '8', 'nummod', '_', 'TokenRange=28:30'],
  ['8',
   'countries',
   'country',
   'NOUN',
   'NNS',
   '_',
   '4',
   'nmod',
   '_',
   'TokenRange=31:40'],
  ['9',
   'that',
   'that',
   'ADP',
   'IN',

In [32]:
framework2dataset2mrp_jsons[framework][dataset][mrp_index]

{'id': '20020004',
 'flavor': 1,
 'framework': 'eds',
 'version': 0.9,
 'time': '2019-04-10 (20:21)',
 'input': 'Mrs. Hills said many of the 25 countries that she placed under varying degrees of scrutiny have made "genuine progress" on this touchy issue.',
 'tops': [5],
 'nodes': [{'id': 0, 'label': 'proper_q', 'anchors': [{'from': 0, 'to': 10}]},
  {'id': 1, 'label': 'compound', 'anchors': [{'from': 0, 'to': 10}]},
  {'id': 2, 'label': 'udef_q', 'anchors': [{'from': 0, 'to': 4}]},
  {'id': 3, 'label': '_missus_n_1', 'anchors': [{'from': 0, 'to': 4}]},
  {'id': 4,
   'label': 'named',
   'properties': ['carg'],
   'values': ['Hills'],
   'anchors': [{'from': 5, 'to': 10}]},
  {'id': 5, 'label': '_say_v_to', 'anchors': [{'from': 11, 'to': 15}]},
  {'id': 6, 'label': 'part_of', 'anchors': [{'from': 16, 'to': 20}]},
  {'id': 7, 'label': 'udef_q', 'anchors': [{'from': 16, 'to': 20}]},
  {'id': 8, 'label': 'much-many_a', 'anchors': [{'from': 16, 'to': 20}]},
  {'id': 9, 'label': '_the_q', '

In [38]:
for framework in frameworks:
    if dataset not in framework2dataset2mrp_jsons[framework]:
        continue
    if len(framework2dataset2mrp_jsons[framework][dataset]) <= mrp_index:
        continue
    if framework2dataset2mrp_jsons[framework][dataset][mrp_index]['id'] == cid:
        print(args.graphviz_file_template.format(framework, dataset, cid))

parse_dg = plot_util.parse_to_directed_graph(parse)
dataset_dir = os.path.join(args.graphviz_dir, dataset)
image_path = os.path.join(dataset_dir,  cid + '.png')
os.makedirs(dataset_dir, exist_ok=True)
plot_util.directed_graph_to_graphviz_image(parse_dg, image_path)
print(os.path.join('http://localhost:8000/files/proj29_ds1/home/slai/mrp2019', image_path))

http://localhost:8000/files/proj29_ds1/home/slai/mrp2019/graphviz/psd/wsj.mrp/20020004.png
http://localhost:8000/files/proj29_ds1/home/slai/mrp2019/graphviz/eds/wsj.mrp/20020004.png
http://localhost:8000/files/proj29_ds1/home/slai/mrp2019/graphviz/dm/wsj.mrp/20020004.png
http://localhost:8000/files/proj29_ds1/home/slai/mrp2019/graphviz/wsj/20020004.png


### Generate NER data

In [None]:
dataset

In [None]:
cid = list(dataset2cid2parse[dataset])[0]

In [None]:
dataset2cid2parse[dataset][cid]

In [None]:
for dataset, cid2parse in dataset2cid2parse.items():
    for cid, parse in cid2parse.items():
        for word_record in parse:
            