# Surface realization

In [1]:
from surface.grammar import Grammar
from surface import converter
from surface import utils
from collections import defaultdict
import ast
import pickle

First, we store the paths of the training and test files in variables; the files can be downloaded from the SRST 19 page.

In [2]:
# Path of the file the grammar is trained on; it is also used, together with
# TEST_FILE, to build the word/id dictionaries.
TRAIN_FILE = "data/en_tr_tr.conllu"
# TRAIN_FILE = "data/en_tr_sample.conllu"
# Path of the file whose sentences are converted, parsed and written out below.
TEST_FILE = "data/en_tr_dev.conllu"
# TEST_FILE = "data/foo"

Then, we train the two static grammars: the first corresponds to the subgraphs extracted from the UD trees, and the second is the fallback grammar, in which every rule is binary.

Later, the dynamic grammars are generated from these ones.

In [3]:
# Build the two lookup tables from both files, so that the dictionaries are
# derived from the training data and the test data alike.
# NOTE(review): presumably word_to_id maps word forms to integer ids and
# id_to_word is its inverse — confirm in surface.converter.
word_to_id, id_to_word = converter.build_dictionaries([TRAIN_FILE, TEST_FILE])

In [4]:
# Switch between the two following cells: True trains a new grammar and
# pickles it to grammar.bin; False loads the previously pickled grammar.bin.
do_train = True
# do_train = False

In [5]:
# Train a fresh Grammar on TRAIN_FILE and pickle it to grammar.bin, so that a
# later session can set do_train = False and load the file instead of retraining.
if do_train:
    grammar = Grammar()
    grammar.train_subgraphs(TRAIN_FILE, word_to_id)
    with open('grammar.bin', 'wb') as f:
        pickle.dump(grammar, f)

206595it [04:55, 699.53it/s] 


In [6]:
# When training is skipped, restore the grammar from grammar.bin, the file
# written by the training cell.
# NOTE: pickle.load can execute arbitrary code while unpickling; only load a
# grammar.bin that was produced locally by this notebook.
if not do_train:
    with open('grammar.bin', 'rb') as f:
        grammar = pickle.load(f)

We need to extract the graphs from the conll format (conversion from conll to isi), and the rules that use the <strong>lin</strong> feature.

The rules are for incorporating the <strong>lin</strong> feature, so we can dynamically delete every rule that contradicts the linearity.

In [7]:
# Rules extracted from the test file; rules[i] is later passed to gen_grammar
# for sentence i. The second return value is discarded.
rules, _ = converter.extract_rules(TEST_FILE, word_to_id)
# Convert the test file; the middle return value is discarded. id_graphs[i] is
# later handed to utils.set_parse for sentence i.
graphs, _, id_graphs= converter.convert(TEST_FILE, word_to_id)
#_, sentences, _ = converter.convert(TEST_FILE)

In [8]:
# Re-execute the already imported converter and utils modules so that edits to
# their source are picked up without restarting the kernel.
# NOTE(review): values computed earlier with the old module code (e.g. rules,
# id_graphs) are not recomputed by a reload; this looks like a development
# convenience — consider moving the import to the top import cell or dropping
# the cell from the final notebook.
from importlib import reload  
reload(converter)
reload(utils)

<module 'surface.utils' from '/home/grecski/projects/surface_realization/surface/utils.py'>

We iterate over the sentences and call the <strong>Alto</strong> parser to generate the derivation and map the UD representation to a string.

Alto can be downloaded from [bitbucket](https://bitbucket.org/tclup/alto/downloads/).

In [9]:
def gen_grammar(sen_rules, grammar_fn, sen, binary=False):
    """Write the grammar for a single sentence to ``grammar_fn``.

    The file is opened for writing (overwriting any existing content) and is
    filled by two calls on the module-level ``grammar`` object, in this order:
    ``generate_grammar`` with the sentence's rules, then
    ``generate_terminal_ids`` with the sentence itself.

    Args:
        sen_rules: rules of the sentence, forwarded to ``generate_grammar``.
        grammar_fn: path of the grammar file to create.
        sen: the sentence, forwarded to ``generate_terminal_ids``.
        binary: forwarded unchanged as the ``binary`` keyword of
            ``generate_grammar``.
    """
    with open(grammar_fn, 'w') as out_file:
        # Rules first, terminal ids second: both go into the same file.
        grammar.generate_grammar(sen_rules, out_file, binary=binary)
        grammar.generate_terminal_ids(sen, out_file)

In [10]:
# Sentences of the test file; conll[i] is indexed by the same sentence index
# as rules[i] and id_graphs[i] in the generation loop.
conll = utils.get_conll_from_file(TEST_FILE, word_to_id)
# Maps a sentence index to the ids returned by utils.get_ids_from_parse for it;
# filled by the generation loop and read when writing test.conllu.
pred_ids = {}

In [14]:
# Number of entries in rules; this is the upper bound of the generation loop.
len(rules)

500

In [23]:

# Generate a grammar for every sentence, run the Alto ParsingEvaluator on it
# and store the ids read back from Alto's output in pred_ids.
#
# NOTE(review): the loop starts at sentence 362, presumably to resume an
# interrupted run. With a fresh kernel pred_ids starts empty, so sentences
# 0-361 would get no entry — use the full range below for a clean run.
# for i in range(len(rules)):
# for i in range(0, 10):
for i in range(362, len(rules)):
    print(i)
    # All per-sentence files share the prefix gen/<i>.
    # NOTE(review): assumes the gen/ directory already exists — TODO confirm.
    prefix = f'gen/{i}'
    grammar_fn = f'{prefix}.irtg'
    input_fn = f'{prefix}.input'
    # NOTE(review): presumably writes the sentence's graph to input_fn as the
    # parser input — confirm in surface.utils.
    utils.set_parse(input_fn, id_graphs[i])
    output_fn = f'{prefix}.output'
    try:
        # First attempt: the non-binary grammar, with Alto killed by `timeout`
        # after 5 seconds.
        gen_grammar(rules[i], grammar_fn, conll[i])
        !timeout 5 java -Xmx32G -cp alto-2.3.6-all.jar de.up.ling.irtg.script.ParsingEvaluator $input_fn -g $grammar_fn -I ud -O string=toString -o $output_fn
        ids = utils.get_ids_from_parse(output_fn)
    except StopIteration:
        # NOTE(review): get_ids_from_parse apparently raises StopIteration when
        # the output file holds no result (e.g. Alto was killed by the timeout)
        # — verify in surface.utils.
        # Second attempt: regenerate the grammar with binary=True and allow 60
        # seconds. A StopIteration raised here is not caught and aborts the loop.
        print(f'sen {i} timed out, falling back to binary grammar')
        gen_grammar(rules[i], grammar_fn, conll[i], binary=True)
        !timeout 60 java -Xmx32G -cp alto-2.3.6-all.jar de.up.ling.irtg.script.ParsingEvaluator $input_fn -g $grammar_fn -I ud -O string=toString -o $output_fn
        ids = utils.get_ids_from_parse(output_fn)

        
    pred_ids[i] = ids

362
Processing gen/362.input (2 instances) ...
1 [[WORD2489_3/WORD2489_3 -det-> WORD30_53/WORD30_53;] sen 362 timed out, falling back to binary grammar
Processing gen/362.input (2 instances) ...
1 [[WORD2489_3/WORD2489_3 -det-> WORD30_53/WORD30_53;] 44.130s
2 [[dummy_0/dummy_0]                                 ] 2 ms
Done, total time: 44.190s
363
Processing gen/363.input (2 instances) ...
1 [[WORD279_1/WORD279_1 -nsubj-> WORD348_2/WORD348_2;] 2.072s
2 [[dummy_0/dummy_0]                                 ] 3 ms
Done, total time: 2.108s
364
Processing gen/364.input (2 instances) ...
1 [[WORD279_2/WORD279_2 -ccomp-> WORD2357_1/WORD2357_] 213 ms
2 [[dummy_0/dummy_0]                                 ] 2 ms
Done, total time: 247 ms
365
Processing gen/365.input (2 instances) ...
1 [[WORD13678_11/WORD13678_11 -punct-> WORD1_2/WORD1_] 354 ms
2 [[dummy_0/dummy_0]                                 ] 2 ms
Done, total time: 396 ms
366
Processing gen/366.input (2 instances) ...
1 [[WORD6164_2/WORD6164_2 -

1 [[WORD865_17/WORD865_17 -punct-> WORD23_3/WORD23_3;] 407 ms
2 [[dummy_0/dummy_0]                                 ] 1 ms
Done, total time: 444 ms
399
Processing gen/399.input (2 instances) ...
1 [[WORD229_1/WORD229_1 -advmod-> WORD151_2/WORD151_2] 603 ms
2 [[dummy_0/dummy_0]                                 ] 6 ms
Done, total time: 650 ms
400
Processing gen/400.input (2 instances) ...
1 [[WORD13691_3/WORD13691_3 -advmod-> WORD79_1/WORD79] 102 ms
2 [[dummy_0/dummy_0]                                 ] 1 ms
Done, total time: 131 ms
401
Processing gen/401.input (2 instances) ...
1 [[WORD1882_2/WORD1882_2 -case-> WORD642_4/WORD642_4] 1.933s
2 [[dummy_0/dummy_0]                                 ] 2 ms
Done, total time: 1.971s
402
Processing gen/402.input (2 instances) ...
1 [[WORD7905_5/WORD7905_5 -cop-> WORD39_1/WORD39_1; W] 161 ms
2 [[dummy_0/dummy_0]                                 ] 2 ms
Done, total time: 197 ms
403
Processing gen/403.input (2 instances) ...
1 [[WORD103_3/WORD103_3 -punct

1 [[WORD13708_14/WORD13708_14 -case-> WORD7_36/WORD7_] sen 434 timed out, falling back to binary grammar
Processing gen/434.input (2 instances) ...
1 [[WORD13708_14/WORD13708_14 -case-> WORD7_36/WORD7_] 1.518s
2 [[dummy_0/dummy_0]                                 ] 2 ms
Done, total time: 1.574s
435
Processing gen/435.input (2 instances) ...
1 [[WORD1124_3/WORD1124_3 -cop-> WORD39_1/WORD39_1; W] 628 ms
2 [[dummy_0/dummy_0]                                 ] 2 ms
Done, total time: 661 ms
436
Processing gen/436.input (2 instances) ...
1 [[WORD12900_1/WORD12900_1 -det-> WORD11_20/WORD11_2] sen 436 timed out, falling back to binary grammar
Processing gen/436.input (2 instances) ...
1 [[WORD12900_1/WORD12900_1 -det-> WORD11_20/WORD11_2] 389 ms
2 [[dummy_0/dummy_0]                                 ] 2 ms
Done, total time: 430 ms
437
Processing gen/437.input (2 instances) ...
1 [[WORD4938_3/WORD4938_3 -nummod-> WORD4985_7/WORD49] 2.322s
2 [[dummy_0/dummy_0]                                 ] 3 ms


1 [[WORD487_3/WORD487_3 -nmod_poss-> WORD95_1/WORD95_] 146 ms
2 [[dummy_0/dummy_0]                                 ] 2 ms
Done, total time: 178 ms
468
Processing gen/468.input (2 instances) ...
1 [[WORD7867_4/WORD7867_4 -amod-> WORD130_2/WORD130_2] 152 ms
2 [[dummy_0/dummy_0]                                 ] 2 ms
Done, total time: 192 ms
469
Processing gen/469.input (2 instances) ...
1 [[WORD117_4/WORD117_4 -punct-> WORD1_20/WORD1_20; W] 254 ms
2 [[dummy_0/dummy_0]                                 ] 4 ms
Done, total time: 303 ms
470
Processing gen/470.input (2 instances) ...
1 [[WORD3627_2/WORD3627_2 -advmod-> WORD117_6/WORD117] 3.692s
2 [[dummy_0/dummy_0]                                 ] 3 ms
Done, total time: 3.741s
471
Processing gen/471.input (2 instances) ...
1 [[WORD1821_3/WORD1821_3 -punct-> WORD55_1/WORD55_1;] 80 ms
2 [[dummy_0/dummy_0]                                 ] 723 ?s
Done, total time: 107 ms
472
Processing gen/472.input (2 instances) ...
1 [[WORD1821_7/WORD1821_7 -am

In [24]:
def get_orig_order(toks):
    """Return the tokens as a new list, sorted by the integer at the end of ``misc``.

    The sort key of a token is the text after the last ``=`` inside the last
    ``|``-separated field of its ``misc`` attribute, converted with ``int``.
    The input is left untouched and tokens with equal keys keep their
    relative order.
    """
    def trailing_number(tok):
        # Last '|' field, then the part after its last '='.
        last_field = tok.misc.rpartition('|')[2]
        return int(last_field.rpartition('=')[2])

    ordered = list(toks)
    ordered.sort(key=trailing_number)
    return ordered


In [25]:
# Write every predicted sentence to test.conllu with its tokens renumbered in
# the predicted order: one tab-separated line per token, one blank line after
# each sentence.
# The encoding is set explicitly so the output does not depend on the locale.
with open("test.conllu", "w", encoding="utf-8") as out_file:
    for sen_id, ids in pred_ids.items():
        sen = conll[sen_id]
        old_id_to_tok = {tok.id: tok for tok in sen}

        # Position of each token id in the predicted order, computed once per
        # sentence instead of calling ids.index() for every token. setdefault
        # keeps the first occurrence, which is what ids.index() returned.
        position_of = {}
        for position, tok_id in enumerate(ids):
            position_of.setdefault(tok_id, position)
        tok_to_new_id = {tok: position_of[tok.id] for tok in sen}

        for new_id, tok in enumerate(sorted(sen, key=lambda t: tok_to_new_id[t])):
            # Head 0 is kept as 0; any other head is remapped to the new
            # 1-based id of the token it pointed to.
            if tok.head == 0:
                head_id = 0
            else:
                head_tok = old_id_to_tok[tok.head]
                head_id = tok_to_new_id[head_tok] + 1
            new_tok = utils.Token(
                new_id+1, tok.lemma, tok.word, tok.pos, tok.tpos, tok.misc, head_id, tok.deprel, tok.comp_edge,
                tok.space_after, tok.word_id)
            # The loop variable is named `field` so it no longer shadows the
            # file handle, as the original `f` did.
            out_file.write("\t".join(str(field) for field in new_tok))
            out_file.write('\n')
        out_file.write("\n")

In [None]:
# NOTE(review): the first argument, test-results-inflected/, is not produced
# by any cell visible in this notebook — presumably the output of an external
# inflection step run on test.conllu; confirm before executing this cell.
converter.to_tokenized_output("test-results-inflected/", "tokenized_test_results/")