In [1]:
try:
    __IPYTHON__
    USING_IPYTHON = True
except NameError:
    USING_IPYTHON = False

#### Argparse

In [2]:
import argparse
ap = argparse.ArgumentParser()
ap.add_argument('mrp_data_dir', help='')
ap.add_argument('--train-sub-dir', default='training', help='')
ap.add_argument('--companion-sub-dir', default='./mrp-companion/2019/companion')
ap.add_argument('--mrp-file-extension', default='.mrp')
ap.add_argument('--companion-file-extension', default='.conllu')
ap.add_argument('--graphviz-file-template', default='http://localhost:8000/files/proj29_ds1/home/slai/mrp/graphviz/{}/{}.mrp/{}.png')
arg_string = """
    ./data/
"""
arguments = [arg for arg_line in arg_string.split(r'\\n') for arg in arg_line.split()]

In [3]:
if USING_IPYTHON:
    args = ap.parse_args(arguments)
else:
    args = ap.parse_args()

In [4]:
args

Namespace(companion_file_extension='.conllu', companion_sub_dir='./mrp-companion/2019/companion', graphviz_file_template='http://localhost:8000/files/proj29_ds1/home/slai/mrp/graphviz/{}/{}.mrp/{}.png', mrp_data_dir='./data/', mrp_file_extension='.mrp', train_sub_dir='training')

#### Library imports

In [5]:
import json
import logging
import os
import pprint
import string

from collections import Counter
from collections import defaultdict
from tqdm import tqdm
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np

#### ipython notebook specific imports

In [6]:
if USING_IPYTHON:
    # matplotlib config
    %matplotlib inline

In [7]:
logging.basicConfig(level=logging.INFO, handlers=[logging.StreamHandler()])
logger = logging.getLogger(__name__)  # pylint: disable=invalid-name
logger.setLevel(logging.INFO)

### Constants

In [8]:
UNKWOWN = 'UNKWOWN'

### Load data

In [9]:
train_dir = os.path.join(args.mrp_data_dir, args.train_sub_dir)
frameworks = [sub_dir for sub_dir in os.listdir(train_dir) if os.path.isdir(os.path.join(train_dir, sub_dir))]
frameworks

['ucca', 'psd', 'eds', 'dm', 'amr']

In [10]:
framework2dataset2mrp_jsons = {}
for framework in tqdm(frameworks, desc='frameworks'):
    dataset2mrp_jsons = {}
    framework_dir = os.path.join(train_dir, framework)
    dataset_names = os.listdir(framework_dir)
    
    for dataset_name in tqdm(dataset_names, desc='dataset_name'):
        mrp_jsons = []
        if not dataset_name.endswith(args.mrp_file_extension):
            continue
        with open(os.path.join(framework_dir, dataset_name)) as rf:
            for line in rf:
                mrp_json = json.loads(line.strip())
                if framework == 'ucca' and 'nodes' in mrp_json and 'input' in mrp_json:
                    input_text = mrp_json['input']
                    nodes = mrp_json['nodes']
                    for i, node in enumerate(nodes):
                        if 'anchors' not in node:
                            continue
                        text_segments = []
                        for anchor in node['anchors']:
                            text_segments.append(input_text[anchor.get('from', -1): anchor.get('to', -1)])
                        mrp_json['nodes'][i]['label'] = ''.join(text_segments)
                        
                mrp_jsons.append(mrp_json)
        dataset_name = dataset_name.split('.')[0]
        dataset2mrp_jsons[dataset_name] = mrp_jsons
                
    framework2dataset2mrp_jsons[framework] = dataset2mrp_jsons

frameworks:   0%|          | 0/5 [00:00<?, ?it/s]
dataset_name:   0%|          | 0/2 [00:00<?, ?it/s][A
dataset_name:  50%|█████     | 1/2 [00:00<00:00,  2.37it/s][A
frameworks:  20%|██        | 1/5 [00:00<00:03,  1.02it/s]s][A
dataset_name:   0%|          | 0/1 [00:00<?, ?it/s][A
frameworks:  40%|████      | 2/5 [00:05<00:06,  2.05s/it]t][A
dataset_name:   0%|          | 0/1 [00:00<?, ?it/s][A
frameworks:  60%|██████    | 3/5 [00:11<00:06,  3.26s/it]t][A
dataset_name:   0%|          | 0/1 [00:00<?, ?it/s][A
frameworks:  80%|████████  | 4/5 [00:18<00:04,  4.22s/it]t][A
dataset_name:   0%|          | 0/14 [00:00<?, ?it/s][A
dataset_name:  21%|██▏       | 3/14 [00:00<00:00, 23.38it/s][A
dataset_name:  43%|████▎     | 6/14 [00:00<00:00, 15.21it/s][A
dataset_name:  57%|█████▋    | 8/14 [00:00<00:00, 11.23it/s][A
dataset_name:  71%|███████▏  | 10/14 [00:01<00:00,  5.06it/s][A
dataset_name:  79%|███████▊  | 11/14 [00:01<00:00,  4.59it/s][A
dataset_name:  93%|█████████▎| 13/14 

In [11]:
for framework in framework2dataset2mrp_jsons:
    logger.info(framework)
    logger.info(list(framework2dataset2mrp_jsons[framework].keys()))

INFO:__main__:ucca
INFO:__main__:['wiki', 'ewt']
INFO:__main__:psd
INFO:__main__:['wsj']
INFO:__main__:eds
INFO:__main__:['wsj']
INFO:__main__:dm
INFO:__main__:['wsj']
INFO:__main__:amr
INFO:__main__:['xinhua', 'wsj', 'wiki', 'wb', 'rte', 'proxy', 'mt09sdl', 'lorelei', 'fables', 'dfb', 'dfa', 'cctv', 'bolt', 'amr-guidelines']


### Data Preprocessing companion

In [12]:
dataset2cid2parse = {}
for framework in os.listdir(args.companion_sub_dir):
    framework_dir = os.path.join(args.companion_sub_dir, framework)
    if not os.path.isdir(framework_dir):
        continue
    for dataset in tqdm(os.listdir(framework_dir), desc='dataset'):
        if not dataset.endswith(args.companion_file_extension):
            continue
        dataset_name = dataset.split('.')[0].rstrip(string.digits)
        cid2parse = {}
        with open(os.path.join(framework_dir, dataset)) as rf:
            parse = []
            for line in rf:
                line = line.strip()
                if not line:
                    cid2parse[cid] = parse
                    parse = []
                    cid = ''
                elif line.startswith('#'):
                    cid = line[1:]
                else:
                    parse.append(line.split('\t'))
        dataset2cid2parse[dataset_name] = cid2parse

dataset: 100%|██████████| 13/13 [00:04<00:00,  2.73it/s]
dataset: 100%|██████████| 5/5 [00:01<00:00,  3.11it/s]
dataset: 100%|██████████| 6/6 [00:00<00:00, 18.58it/s]


In [13]:
dataset2cid2parse.keys()

dict_keys(['amr-guidelines', 'bolt', 'cctv', 'dfa', 'dfb', 'fables', 'lorelei', 'mt09sdl', 'proxy', 'rte', 'wb', 'wiki', 'xinhua', 'wsj', 'ewt'])

In [27]:
'20003001' in dataset2cid2parse['wsj']

False

In [26]:
framework2dataset2mrp_jsons[framework][dataset][2]['id']

'20003001'

In [66]:
dataset = 'wsj'
framework = 'dm'
mrp_index = 128
parse = None
while not parse:
    mrp_index += 1
    cid = framework2dataset2mrp_jsons[framework][dataset][mrp_index]['id']
    parse = dataset2cid2parse[dataset].get(cid)
    
(mrp_index, ' '.join([word_record[1] for word_record in parse]), parse)

(132,
 'The monthly sales have been setting records every month since March .',
 [['1', 'The', 'the', 'DET', 'DT', '_', '3', 'det', '_', 'TokenRange=0:3'],
  ['2',
   'monthly',
   'monthly',
   'ADJ',
   'JJ',
   '_',
   '3',
   'amod',
   '_',
   'TokenRange=4:11'],
  ['3',
   'sales',
   'sale',
   'NOUN',
   'NNS',
   '_',
   '6',
   'nsubj',
   '_',
   'TokenRange=12:17'],
  ['4',
   'have',
   'have',
   'AUX',
   'VBP',
   '_',
   '6',
   'aux',
   '_',
   'TokenRange=18:22'],
  ['5', 'been', 'be', 'AUX', 'VBN', '_', '6', 'aux', '_', 'TokenRange=23:27'],
  ['6',
   'setting',
   'set',
   'VERB',
   'VBG',
   '_',
   '0',
   'root',
   '_',
   'TokenRange=28:35'],
  ['7',
   'records',
   'record',
   'NOUN',
   'NNS',
   '_',
   '6',
   'obj',
   '_',
   'TokenRange=36:43'],
  ['8',
   'every',
   'every',
   'DET',
   'DT',
   '_',
   '9',
   'det',
   '_',
   'TokenRange=44:49'],
  ['9',
   'month',
   'month',
   'NOUN',
   'NN',
   '_',
   '6',
   'obl:tmod',
   '_',
   'To

In [67]:
cid

'20016003'

In [None]:
framework2dataset2mrp_jsons[framework][dataset][mrp_index]

In [69]:
print(' '.join([
    node.get('label', '') 
    for node in sorted(framework2dataset2mrp_jsons[framework][dataset][mrp_index]['nodes'], key=lambda x:x['id'])
]))
print('http://localhost:8000/tree/proj29_ds1/home/slai/mrp/graphviz/{}/{}.mrp/{}.png'.format(framework, dataset, cid))

the monthly sale set record every month since March
http://localhost:8000/tree/proj29_ds1/home/slai/mrp/graphviz/dm/wsj.mrp/20016003.png


### Generate NER data

In [16]:
dataset

'xinhua'

In [17]:
cid = list(dataset2cid2parse[dataset])[0]

In [18]:
dataset2cid2parse[dataset][cid]

[['1',
  'Xinhua',
  'Xinhua',
  'PROPN',
  'NNP',
  '_',
  '3',
  'compound',
  '_',
  'TokenRange=0:6'],
 ['2',
  'News',
  'News',
  'PROPN',
  'NNP',
  '_',
  '3',
  'compound',
  '_',
  'TokenRange=7:11'],
 ['3',
  'Agency',
  'Agency',
  'PROPN',
  'NNP',
  '_',
  '0',
  'root',
  '_',
  'TokenRange=12:18'],
 ['4', ',', ',', 'PUNCT', ',', '_', '3', 'punct', '_', 'TokenRange=19:20'],
 ['5',
  'Hong',
  'Hong',
  'PROPN',
  'NNP',
  '_',
  '6',
  'compound',
  '_',
  'TokenRange=21:25'],
 ['6',
  'Kong',
  'Kong',
  'PROPN',
  'NNP',
  '_',
  '3',
  'appos',
  '_',
  'TokenRange=26:30'],
 ['7', ',', ',', 'PUNCT', ',', '_', '3', 'punct', '_', 'TokenRange=31:32'],
 ['8',
  'February',
  'February',
  'PROPN',
  'NNP',
  '_',
  '9',
  'compound',
  '_',
  'TokenRange=33:41'],
 ['9',
  '23rd',
  '23rd',
  'NOUN',
  'NN',
  '_',
  '3',
  'appos',
  '_',
  'TokenRange=42:46']]

In [19]:
for dataset, cid2parse in dataset2cid2parse.items():
    for cid, parse in cid2parse.items():
        for word_record in parse:
            

SyntaxError: unexpected EOF while parsing (<ipython-input-19-fbca15c464ac>, line 4)