In [1]:
try:
    __IPYTHON__
    USING_IPYTHON = True
except NameError:
    USING_IPYTHON = False

#### Argparse

In [83]:
import argparse
ap = argparse.ArgumentParser()
ap.add_argument('mrp_data_dir', help='')
ap.add_argument('--train-sub-dir', default='training', help='')
ap.add_argument('--companion-sub-dir', default='./mrp-companion/2019/companion')
ap.add_argument('--mrp-file-extension', default='.mrp')
ap.add_argument('--companion-file-extension', default='.conllu')
ap.add_argument('--graphviz-file-template', default='http://localhost:8000/files/proj29_ds1/home/slai/mrp/graphviz/{}/{}.mrp/{}.png')
arg_string = """
    ./data/
"""
arguments = [arg for arg_line in arg_string.split(r'\\n') for arg in arg_line.split()]

In [84]:
if USING_IPYTHON:
    args = ap.parse_args(arguments)
else:
    args = ap.parse_args()

In [85]:
args

Namespace(companion_file_extension='.conllu', companion_sub_dir='./mrp-companion/2019/companion', graphviz_file_template='http://localhost:8000/files/proj29_ds1/home/slai/mrp/graphviz/{}/{}.mrp/{}.png', mrp_data_dir='./data/', mrp_file_extension='.mrp', train_sub_dir='training')

#### Library imports

In [102]:
import json
import logging
import os
import pprint
import string

from collections import Counter
from collections import defaultdict
from tqdm import tqdm
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np

#### ipython notebook specific imports

In [6]:
if USING_IPYTHON:
    # matplotlib config
    %matplotlib inline

In [7]:
logging.basicConfig(level=logging.INFO, handlers=[logging.StreamHandler()])
logger = logging.getLogger(__name__)  # pylint: disable=invalid-name
logger.setLevel(logging.INFO)

### Constants

In [8]:
UNKWOWN = 'UNKWOWN'

### Load data

In [9]:
train_dir = os.path.join(args.mrp_data_dir, args.train_sub_dir)
frameworks = [sub_dir for sub_dir in os.listdir(train_dir) if os.path.isdir(os.path.join(train_dir, sub_dir))]
frameworks

['ucca', 'psd', 'eds', 'dm', 'amr']

In [10]:
framework2dataset2mrp_jsons = {}
for framework in tqdm(frameworks, desc='frameworks'):
    dataset2mrp_jsons = {}
    framework_dir = os.path.join(train_dir, framework)
    dataset_names = os.listdir(framework_dir)
    
    for dataset_name in tqdm(dataset_names, desc='dataset_name'):
        mrp_jsons = []
        if not dataset_name.endswith(args.mrp_file_extension):
            continue
        with open(os.path.join(framework_dir, dataset_name)) as rf:
            for line in rf:
                mrp_json = json.loads(line.strip())
                if framework == 'ucca' and 'nodes' in mrp_json and 'input' in mrp_json:
                    input_text = mrp_json['input']
                    nodes = mrp_json['nodes']
                    for i, node in enumerate(nodes):
                        if 'anchors' not in node:
                            continue
                        text_segments = []
                        for anchor in node['anchors']:
                            text_segments.append(input_text[anchor.get('from', -1): anchor.get('to', -1)])
                        mrp_json['nodes'][i]['label'] = ''.join(text_segments)
                        
                mrp_jsons.append(mrp_json)
        dataset_name = dataset_name.split('.')[0]
        dataset2mrp_jsons[dataset_name] = mrp_jsons
                
    framework2dataset2mrp_jsons[framework] = dataset2mrp_jsons

frameworks:   0%|          | 0/5 [00:00<?, ?it/s]
dataset_name:   0%|          | 0/2 [00:00<?, ?it/s][A
dataset_name:  50%|█████     | 1/2 [00:00<00:00,  2.99it/s][A
frameworks:  20%|██        | 1/5 [00:00<00:03,  1.23it/s]s][A
dataset_name:   0%|          | 0/1 [00:00<?, ?it/s][A
frameworks:  40%|████      | 2/5 [00:04<00:05,  1.76s/it]t][A
dataset_name:   0%|          | 0/1 [00:00<?, ?it/s][A
frameworks:  60%|██████    | 3/5 [00:09<00:05,  2.77s/it]t][A
dataset_name:   0%|          | 0/1 [00:00<?, ?it/s][A
frameworks:  80%|████████  | 4/5 [00:15<00:03,  3.65s/it]t][A
dataset_name:   0%|          | 0/14 [00:00<?, ?it/s][A
dataset_name:  43%|████▎     | 6/14 [00:00<00:00, 21.00it/s][A
dataset_name:  57%|█████▋    | 8/14 [00:00<00:00, 17.69it/s][A
dataset_name:  71%|███████▏  | 10/14 [00:01<00:00,  6.14it/s][A
dataset_name:  79%|███████▊  | 11/14 [00:01<00:00,  5.44it/s][A
frameworks: 100%|██████████| 5/5 [00:17<00:00,  3.03s/it]t/s][A


In [11]:
for framework in framework2dataset2mrp_jsons:
    logger.info(framework)
    logger.info(list(framework2dataset2mrp_jsons[framework].keys()))

INFO:__main__:ucca
INFO:__main__:['wiki', 'ewt']
INFO:__main__:psd
INFO:__main__:['wsj']
INFO:__main__:eds
INFO:__main__:['wsj']
INFO:__main__:dm
INFO:__main__:['wsj']
INFO:__main__:amr
INFO:__main__:['xinhua', 'wsj', 'wiki', 'wb', 'rte', 'proxy', 'mt09sdl', 'lorelei', 'fables', 'dfb', 'dfa', 'cctv', 'bolt', 'amr-guidelines']


### Data Preprocessing companion

In [12]:
dataset2cid2parse = {}
for framework in os.listdir(args.companion_sub_dir):
    framework_dir = os.path.join(args.companion_sub_dir, framework)
    if not os.path.isdir(framework_dir):
        continue
    for dataset in tqdm(os.listdir(framework_dir), desc='dataset'):
        if not dataset.endswith(args.companion_file_extension):
            continue
        dataset_name = dataset.split('.')[0].rstrip(string.digits)
        cid2parse = {}
        with open(os.path.join(framework_dir, dataset)) as rf:
            parse = []
            for line in rf:
                line = line.strip()
                if not line:
                    cid2parse[cid] = parse
                    parse = []
                    cid = ''
                elif line.startswith('#'):
                    cid = line[1:]
                else:
                    parse.append(line.split('\t'))
        dataset2cid2parse[dataset_name] = cid2parse

dataset: 100%|██████████| 13/13 [00:03<00:00,  3.36it/s]
dataset: 100%|██████████| 5/5 [00:01<00:00,  3.57it/s]
dataset: 100%|██████████| 6/6 [00:00<00:00, 30.49it/s]


In [13]:
dataset2cid2parse.keys()

dict_keys(['amr-guidelines', 'bolt', 'cctv', 'dfa', 'dfb', 'fables', 'lorelei', 'mt09sdl', 'proxy', 'rte', 'wb', 'wiki', 'xinhua', 'wsj', 'ewt'])

In [14]:
dataset = 'xinhua'
framework = 'amr'
dataset2cid2parse[dataset][framework2dataset2mrp_jsons[framework][dataset][1]['id']]

[['1',
  'According',
  'accord',
  'VERB',
  'VBG',
  '_',
  '10',
  'case',
  '_',
  'TokenRange=0:9'],
 ['2', 'to', 'to', 'ADP', 'TO', '_', '1', 'fixed', '_', 'TokenRange=10:12'],
 ['3',
  'Taiwan',
  'Taiwan',
  'PROPN',
  'NNP',
  '_',
  '10',
  'nmod:poss',
  '_',
  'TokenRange=13:19'],
 ['4', '’s', '’s', 'PART', 'POS', '_', '3', 'case', '_', 'TokenRange=20:22'],
 ['5', '“', '“', 'PUNCT', '``', '_', '10', 'punct', '_', 'TokenRange=23:24'],
 ['6',
  'Ministry',
  'Ministry',
  'PROPN',
  'NNP',
  '_',
  '10',
  'dep',
  '_',
  'TokenRange=25:33'],
 ['7', 'of', 'of', 'ADP', 'IN', '_', '8', 'case', '_', 'TokenRange=34:36'],
 ['8',
  'Economy',
  'Economy',
  'PROPN',
  'NNP',
  '_',
  '6',
  'obl',
  '_',
  'TokenRange=37:44'],
 ['9', '“', '“', 'PUNCT', '``', '_', '10', 'punct', '_', 'TokenRange=45:46'],
 ['10',
  'statistics',
  'statistics',
  'NOUN',
  'NNS',
  '_',
  '26',
  'obl',
  '_',
  'TokenRange=47:57'],
 ['11', ',', ',', 'PUNCT', ',', '_', '26', 'punct', '_', 'TokenRange

In [15]:
framework2dataset2mrp_jsons[framework][dataset][1]

{'id': 'nw.chtb_0012.2',
 'flavor': 2,
 'framework': 'amr',
 'version': 0.9,
 'time': '2019-04-10 (20:11)',
 'input': 'According to Taiwan \'s " Ministry of Economy " statistics , the volume of trade between mainland and Taiwan last year was 20.9 billion US dollars .',
 'tops': [0],
 'nodes': [{'id': 0, 'label': 'say-01'},
  {'id': 1, 'label': 'statistic'},
  {'id': 2, 'label': 'government-organization'},
  {'id': 3,
   'label': 'name',
   'properties': ['op1', 'op2', 'op3'],
   'values': ['Ministry', 'of', 'Economy']},
  {'id': 4,
   'label': 'monetary-quantity',
   'properties': ['quant'],
   'values': ['20900000000']},
  {'id': 5, 'label': 'dollar'},
  {'id': 6, 'label': 'country'},
  {'id': 7, 'label': 'name', 'properties': ['op1'], 'values': ['US']},
  {'id': 8, 'label': 'volume'},
  {'id': 9, 'label': 'trade-01'},
  {'id': 10, 'label': 'mainland'},
  {'id': 11, 'label': 'country'},
  {'id': 12, 'label': 'name', 'properties': ['op1'], 'values': ['Taiwan']},
  {'id': 13, 'label': '

### Companion statistics

In [18]:
list(dataset2cid2parse['wsj'].keys())[:10]

['20988006',
 '20988012',
 '20949001',
 '20949006',
 '20984005',
 '20984010',
 '20984015',
 '20984021',
 '20984026',
 '20984033']

In [61]:
verb_lemma2org2dataset_cids = defaultdict(lambda: defaultdict(list))
for dataset, cid2parse in dataset2cid2parse.items():
    for cid, parse in cid2parse.items():
        for word_record in parse:
            word_index, org, lemma, pos, *_ = word_record
            if pos == 'VERB':
                verb_lemma2org2dataset_cids[lemma][org].append((dataset, cid))

In [62]:
logger.info('No. of unique verb lemma: {}'.format(len(verb_lemma2org2dataset_cids)))

INFO:__main__:No. of unique verb lemma: 4928


In [65]:
for verb_lemma, org2dataset_cids in verb_lemma2org2dataset_cids.items():
    sorted_org_count = sorted([
        (org, len(dataset_cids)) 
        for org, dataset_cids in org2dataset_cids.items()
    ], key=lambda x: x[1], reverse=True)
    print(verb_lemma, sorted_org_count[:10])

be [('is', 6239), ('was', 2727), ('are', 2308), ('be', 1888), ('’s', 1619), ('were', 901), ('been', 557), ('’re', 217), ('’m', 215), ('being', 205)]
bear [('born', 145), ('bear', 39), ('bore', 9), ('bearing', 7), ('Born', 7), ('bears', 5), ('Bearing', 4), ('Bear', 1)]
go [('going', 941), ('go', 876), ('went', 360), ('goes', 192), ('gone', 124), ('Go', 44), ('Going', 8), ('GO', 8), ('Went', 3), ('GOES', 1)]
present [('presented', 48), ('present', 25), ('presents', 11), ('presenting', 9), ('Present', 1), ('Presented', 1), ('Presenting', 1)]
ask [('asked', 231), ('ask', 185), ('asking', 98), ('asks', 18), ('Ask', 16), ('Asked', 12), ('Asking', 3), ('ASKED', 1), ('Asks', 1)]
accept [('accept', 100), ('accepted', 84), ('accepting', 16), ('accepts', 6)]
get [('get', 1400), ('got', 494), ('getting', 290), ('gets', 145), ('gotten', 47), ('Get', 41), ('Got', 19), ('Getting', 11), ('GET', 6), ('GOT', 4)]
cease [('cease', 21), ('ceased', 6), ('ceases', 3)]
want [('want', 952), ('wanted', 251), ('

infer [('infer', 2), ('inferred', 1)]
change. [('change.', 1)]
rejecte [('rejected.', 1)]
exist. [('exist.', 3)]
question [('question', 29), ('questioned', 25), ('questioning', 10), ('questions', 4), ('Question', 2), ('Questioned', 1)]
interpret [('interpreted', 10), ('interpret', 2), ('interpreting', 2)]
decern [('decern', 1)]
werent [('werent', 3)]
contemplate [('contemplated', 5), ('contemplate', 2), ('contemplating', 1)]
justify [('justify', 38), ('justified', 8), ('justifying', 3), ('justifies', 3)]
observes/define [('observes/defines', 1)]
presuppose [('presuppose', 1)]
encompass [('encompass', 3), ('encompassing', 3), ('encompassed', 1)]
interact [('interact', 7), ('interacting', 2), ('interacted', 1)]
define [('defined', 24), ('define', 11), ('defines', 9), ('defining', 5), ('Define', 3)]
rewire [('rewire', 1)]
hand-select [('hand-selected', 1)]
impel [('impelled', 1)]
propel [('propelled', 6), ('propel', 1)]
alter [('alter', 10), ('altered', 8), ('altering', 6), ('alters', 2)]

coincide [('coincided', 6), ('coinciding', 3), ('coincide', 2), ('coincides', 1), ('Coinciding', 1)]
dawn [('dawn', 2), ('Dawning', 1), ('dawns', 1)]
dont [('don\x92t', 3)]
it [('it\x92s', 5), ('It\x92s', 2)]
have [('\x91have', 1)]
juxtapose [('Juxtaposing', 1), ('juxtaposed', 1), ('juxtapose', 1)]
knot [('knot', 1)]
play. [('play.', 1)]
isnt [('isnt', 15)]
blather [('blather', 1), ('blathering', 1)]
boot [('booted', 2), ('boot', 1)]
holler [('holler', 1)]
fly. [('fly.', 1)]
weed [('weeding', 1), ('weed', 1)]
amend [('amended', 12), ('amend', 4), ('Amend', 1)]
socialize [('socializing', 2), ('socialized', 1), ('socialize', 1)]
stagnate [('stagnate', 3)]
want. [('want.', 2)]
generalize [('generalized', 1), ('generalize', 1)]
leach [('leached', 1)]
game [('game', 1)]
moan [('moaning', 7), ('moan', 6)]
soak [('soak', 2), ('soaked', 1)]
autograph [('autographed', 2), ('autograph', 1)]
’cut [('’cut', 1)]
hep [('hep', 1)]
unmask [('unmasking', 1)]
ache [('aches', 2), ('ached', 1), ('achin

harvest [('harvested', 3), ('harvest', 1)]
RAN [('RAN', 1)]
rolling [('rolling', 1)]
balooning [('balooning', 1)]
stave [('staved', 1), ('stave', 1)]
anticpate [('anticpated', 1)]
heck [('heck', 1)]
kettle [('kettle', 1)]
riote [('rioting', 1), ('rioted', 1)]
want/can [('want/can', 1)]
’get [('’get', 2)]
maintain/build [('maintain/build', 1)]
defend/prosecute [('defend/prosecute', 1)]
emigrate [('emigrate', 3), ('emigrated', 3)]
frick [('fricking', 1)]
retort [('retort', 2), ('retorted', 1)]
comprise [('comprised', 12), ('comprises', 5), ('comprise', 2), ('comprising', 2)]
ge [('gettng', 1)]
youneed [('youneed', 1)]
commite [('commited', 1)]
working [('working', 2)]
speek [('speeking', 1), ('speek', 1)]
assimilate [('assimilate', 5), ('assimilated', 1)]
heard- [('heard-', 1)]
mind- [('mind-', 1)]
-date [('-date', 1)]
reat [('reated', 1)]
scrimp [('scrimping', 1)]
feck [('fecked', 1)]
wasn;t [('wasn;t', 1)]
distorted [('distorted', 1)]
here [('heres', 1)]
instil [('instilled', 1)]
flour

In [107]:
verb_lemma = 'be'
org = 'is'
cid_index = 1

In [108]:
def view_parse(verb_lemma, org, cid_index):
    dataset, cid = verb_lemma2org2dataset_cids[verb_lemma][org][cid_index]
    for framework, dataset2mrp_jsons in framework2dataset2mrp_jsons.items():
        if dataset in dataset2mrp_jsons:
            logger.info(pprint.pformat(dataset2mrp_jsons[dataset][cid_index]))
            graphviz_file_name = dataset2mrp_jsons[dataset][cid_index].get('id')
            if graphviz_file_name:
                logger.info((framework, dataset, graphviz_file_name))
                logger.info(args.graphviz_file_template.format(framework, dataset, graphviz_file_name))
    return dataset2cid2parse[dataset][cid]

In [109]:
dataset

'ewt'

In [110]:
dataset

'ewt'

In [111]:
view_parse(verb_lemma, org, cid_index)

INFO:__main__:{'edges': [{'label': 'degree', 'source': 0, 'target': 4},
           {'label': 'ARG2', 'source': 0, 'target': 1},
           {'label': 'path', 'source': 1, 'target': 3},
           {'label': 'destination', 'source': 1, 'target': 2}],
 'flavor': 2,
 'framework': 'amr',
 'id': 'bc.cctv_0000.167',
 'input': "It 's extremely troublesome to get there via land .",
 'nodes': [{'id': 0, 'label': 'trouble-05'},
           {'id': 1, 'label': 'get-05'},
           {'id': 2, 'label': 'there'},
           {'id': 3, 'label': 'land'},
           {'id': 4, 'label': 'extreme'}],
 'time': '2019-04-10 (20:10)',
 'tops': [0],
 'version': 0.9}
INFO:__main__:('amr', 'amr-guidelines', 'bc.cctv_0000.167')
INFO:__main__:http://localhost:8000/files/proj29_ds1/home/slai/mrp/graphviz/amr/amr-guidelines.mrp/bc.cctv_0000.167.png


[['1',
  'There',
  'there',
  'PRON',
  'EX',
  '_',
  '2',
  'expl',
  '_',
  'TokenRange=0:5'],
 ['2', 'is', 'be', 'VERB', 'VBZ', '_', '0', 'root', '_', 'TokenRange=6:8'],
 ['3', 'no', 'no', 'DET', 'DT', '_', '4', 'det', '_', 'TokenRange=9:11'],
 ['4',
  'possibility',
  'possibility',
  'NOUN',
  'NN',
  '_',
  '2',
  'nsubj',
  '_',
  'TokenRange=12:23'],
 ['5',
  'that',
  'that',
  'SCONJ',
  'IN',
  '_',
  '9',
  'mark',
  '_',
  'TokenRange=24:28'],
 ['6', 'the', 'the', 'DET', 'DT', '_', '7', 'det', '_', 'TokenRange=29:32'],
 ['7', 'boy', 'boy', 'NOUN', 'NN', '_', '9', 'nsubj', '_', 'TokenRange=33:36'],
 ['8', 'will', 'will', 'AUX', 'MD', '_', '9', 'aux', '_', 'TokenRange=37:41'],
 ['9', 'go', 'go', 'VERB', 'VB', '_', '4', 'ccomp', '_', 'TokenRange=42:44'],
 ['10', '.', '.', 'PUNCT', '.', '_', '2', 'punct', '_', 'TokenRange=44:45']]