# Surface realization

In [5]:
import ast
import os
import pickle
from collections import defaultdict

from surface import converter
from surface import utils
from surface.grammar import Grammar

First, we store the paths of the training and test files in variables (here the development set serves as the test file). The files can be downloaded from the SRST 19 page.

In [58]:
# Paths of the CoNLL-U input files, relative to the working directory.
TRAIN_FILE = "data/T1-train/en_ewt-ud-train.conllu"
# NOTE: the "test" file used throughout this notebook is the development split.
TEST_FILE = "data/T1-dev/en_ewt-ud-dev.conllu"

Then, we train the two static grammars: the first corresponds to the subgraphs of the UD trees, and the second is the fallback grammar, in which every rule is binary.

Later, the dynamic grammars are generated from these static grammars.

In [59]:
# Build the two lookup tables over both the training and the test file.
# Judging by the names they map word -> id and id -> word
# (TODO confirm in surface.converter).
word_to_id, id_to_word = converter.build_dictionaries([TRAIN_FILE, TEST_FILE])

In [60]:
# Train the subgraph grammar on the training file.
# This is slow: the run recorded below took roughly 8.5 minutes.
grammar = Grammar()
grammar.train_subgraphs(TRAIN_FILE, word_to_id)

217128it [08:28, 427.27it/s] 


In [6]:
# Restore a previously trained grammar instead of retraining it.
# grammar.bin is written by the pickle.dump cell of this notebook, so it must
# already exist from an earlier run.
# NOTE(review): pickle.load can execute arbitrary code; only load a
# grammar.bin that you created yourself.
with open('grammar.bin', 'rb') as f:
    grammar = pickle.load(f)

In [61]:
# Cache the trained grammar on disk so later sessions can load it directly.
# (pickle is already imported in the first cell; this repeated import is
# redundant but harmless.)
import pickle
with open('grammar.bin', 'wb') as f:
    pickle.dump(grammar, f)

We need to extract the graphs from the CoNLL format (conversion from CoNLL to ISI), as well as the rules that use the <strong>lin</strong> feature.

These rules incorporate the <strong>lin</strong> feature, so we can dynamically delete every rule that contradicts the linearity.

In [62]:
# Extract the rules and the graphs of the test file. Only the first return
# value of extract_rules and the first and third return values of convert are
# kept; rules and id_graphs are indexed per sentence in the parsing loop.
rules, _ = converter.extract_rules(TEST_FILE, word_to_id)
graphs, _, id_graphs= converter.convert(TEST_FILE, word_to_id)
# Disabled alternative that would keep the second return value of convert:
#_, sentences, _ = converter.convert(TEST_FILE)

In [63]:
# CoNLL data of the test file; indexed per sentence in the parsing loop.
conll = converter.get_conll_from_file(TEST_FILE, word_to_id)
# Filled by the parsing loop: sentence index -> (text_parse, conll_parse).
id_to_parse = {}
# NOTE(review): not used by any cell visible in this notebook.
stops = []

We iterate over the sentences and call the <strong>alto</strong> parser to generate the derivation and map the UD representation to a string.

Alto can be downloaded from [Bitbucket](https://bitbucket.org/tclup/alto/downloads/).

In [64]:
def do_parse(id_graph, rules, conll, prefix, binary=False):
    grammar_fn = f'{prefix}.irtg'
    grammar_f = open(grammar_fn, 'w') 
    grammar.generate_grammar(rules, grammar_f, binary=binary)
    grammar.generate_terminal_ids(conll, grammar_f)
    grammar_f.close()
    input_fn = f'{prefix}.input'
    utils.set_parse(input_fn, id_graph)
    output_fn = f'{prefix}.output'
    !timeout 5 java -Xmx32G -cp alto-2.3.6-all.jar de.up.ling.irtg.script.ParsingEvaluator $input_fn -g $grammar_fn -I ud -O string=toString -o $output_fn
    text_parse, conll_parse = utils.get_parse(output_fn, conll)
    return text_parse, conll_parse

In [65]:
dirname = 'gen'
# do_parse writes its grammar/input/output files into this directory and
# open() fails when it is missing, so create it before parsing.
os.makedirs(dirname, exist_ok=True)
# Only the first 10 sentences are parsed here; to process the whole file use:
# for sen_id in range(0, len(rules)):
for i in range(0, 10):
    print(i)
    file_prefix = f"{dirname}/{i}"
    try:
        id_to_parse[i] = do_parse(id_graphs[i], rules[i], conll[i], file_prefix)
    except StopIteration:
        # The first attempt yielded no parse: retry with binary=True, which
        # do_parse forwards to grammar.generate_grammar.
        print("stop iteration")
        id_to_parse[i] = do_parse(id_graphs[i], rules[i], conll[i], file_prefix, binary=True)

0
Processing gen/0.input (2 instances) ...
1 [[WORD6879_1/WORD6879_1 -det-> WORD11_4/WORD11_4; W] 402 ms
2 [[dummy_0/dummy_0]                                 ] 2 ms
Done, total time: 541 ms
1
Processing gen/1.input (2 instances) ...
1 [[WORD2500_3/WORD2500_3 -case-> WORD158_9/WORD158_9] stop iteration
Processing gen/1.input (2 instances) ...
1 [[WORD2500_3/WORD2500_3 -case-> WORD158_9/WORD158_9] 323 ms
2 [[dummy_0/dummy_0]                                 ] 8 ms
Done, total time: 416 ms
2
Processing gen/2.input (2 instances) ...
1 [[WORD8990_11/WORD8990_11 -punct-> WORD23_2/WORD23_] 3.251s
2 [[dummy_0/dummy_0]                                 ] 7 ms
Done, total time: 3.322s
3
Processing gen/3.input (2 instances) ...
1 [[WORD2213_1/WORD2213_1]                           ] 97 ms
2 [[dummy_0/dummy_0]                                 ] 733 ?s
Done, total time: 132 ms
4
Processing gen/4.input (2 instances) ...
1 [[WORD8990_6/WORD8990_6 -punct-> WORD1_1/WORD1_1; W] stop iteration
Processing gen/

In [67]:
# Write all parsed sentences to test.conllu: one tab-separated line per entry
# of the sentence's conll_parse, followed by an empty line after each sentence.
# CoNLL-U files are UTF-8 by definition, so the encoding is set explicitly
# instead of relying on the platform default.
with open("test.conllu", "w", encoding="utf-8") as f:
    for parse in id_to_parse.values():
        conll_parse = parse[1]  # second element of do_parse's return value
        for line_id in conll_parse:
            fields = "\t".join(conll_parse[line_id])
            f.write(str(line_id) + "\t" + fields + "\n")
        f.write("\n")

In [17]:
# NOTE(review): "test-results-inflected/" is not produced by any cell visible
# in this notebook; presumably it has to exist before this call (confirm).
converter.to_tokenized_output("test-results-inflected/", "tokenized_test_results/")