# Surface realization

In [None]:
from surface.grammar import Grammar
from surface import converter
from surface import utils
from collections import defaultdict
import ast
import pickle

First, we store the paths of the training and test files in variables. The files can be downloaded from the SRST 19 page.

In [None]:
# Paths of the CoNLL-U input files used throughout the notebook.
# TRAIN_FILE = "data/en_tr_tr.conllu"
# NOTE(review): the active training path is a "sample" file — presumably a smaller
# subset of the commented-out path above; confirm before a full run.
TRAIN_FILE = "data/en_tr_sample.conllu"
# NOTE(review): the file bound to TEST_FILE is named "..._dev" — confirm this is intended.
TEST_FILE = "data/en_tr_dev.conllu"

Then, we train the two static grammars: the first corresponds to the subgraphs from the UD trees, and the second is the fallback grammar, in which every rule is binary.

Later, the dynamic grammars are generated from these ones.

In [None]:
# Build the two lookup tables from both the training and the test file.
word_to_id, id_to_word = converter.build_dictionaries([TRAIN_FILE, TEST_FILE])

In [None]:
# Toggle: True trains a new grammar and writes grammar.bin; False loads grammar.bin instead.
do_train = True

In [None]:
# Train a fresh grammar on the training file and cache it on disk.
if do_train:
    grammar = Grammar()
    grammar.train_subgraphs(TRAIN_FILE, word_to_id)
    # Persist the trained object so a later session can load it instead of retraining.
    with open('grammar.bin', 'wb') as f:
        pickle.dump(grammar, f)

In [None]:
# Reuse the grammar pickled by an earlier training run.
if not do_train:
    # NOTE: unpickling can execute arbitrary code — only load a grammar.bin you produced yourself.
    with open('grammar.bin', 'rb') as f:
        grammar = pickle.load(f)

We need to extract the graphs from the conll format (conversion from conll to isi), and the rules that use the <strong>lin</strong> feature.

The rules incorporate the <strong>lin</strong> feature, so we can dynamically delete every rule that contradicts the linearity.

In [None]:
# Rules extracted from the test file (indexed by sentence further below);
# the second return value is not used here.
rules, _ = converter.extract_rules(TEST_FILE, word_to_id)
# convert() returns three values; only the first (graphs) and the third (id_graphs) are kept.
graphs, _, id_graphs= converter.convert(TEST_FILE, word_to_id)
#_, sentences, _ = converter.convert(TEST_FILE)

In [None]:
# Development helper: re-import the project modules after editing them on disk.
# Names bound earlier with `from ... import` (e.g. Grammar) are not refreshed by reload().
from importlib import reload  
reload(converter)
reload(utils)

We run through the sentences and call the <strong>alto</strong> parser to generate the derivation and map the UD representation to a string.

Alto can be downloaded from [bitbucket](https://bitbucket.org/tclup/alto/downloads/).

In [None]:
def gen_grammar(sen_rules, grammar_fn, sen, binary=False):
    """Create the grammar file for a single sentence.

    Opens ``grammar_fn`` for writing (truncating any existing file) and hands
    the open handle to the notebook-level ``grammar`` object twice: first to
    ``generate_grammar`` and then to ``generate_terminal_ids``.

    Args:
        sen_rules: Rules of the sentence, forwarded to ``generate_grammar``.
        grammar_fn: Path of the grammar file to (over)write.
        sen: Sentence forwarded to ``generate_terminal_ids``.
        binary: Forwarded as the ``binary`` keyword of ``generate_grammar``.
    """
    with open(grammar_fn, 'w') as out:
        grammar.generate_grammar(sen_rules, out, binary=binary)
        grammar.generate_terminal_ids(sen, out)

In [None]:
def run_alto(timeout, input_fn, grammar_fn, output_fn):
    """Run Alto's ParsingEvaluator on one input file through the shell.

    Args:
        timeout: Duration handed to the shell's ``timeout`` command.
        input_fn: Path of the input file to parse.
        grammar_fn: Path of the grammar file (passed with ``-g``).
        output_fn: Path passed with ``-o`` for the evaluator's output.
    """
    # IPython shell escape: each `$name` is replaced by the Python variable's value.
    # The JVM is started with a 32 GB maximum heap (-Xmx32G) and expects
    # alto-2.3.6-all.jar in the current working directory.
    # NOTE(review): a shell escape does not raise a Python exception when the command
    # fails or is killed by `timeout` — confirm how callers detect a timeout.
    !timeout $timeout java -Xmx32G -cp alto-2.3.6-all.jar de.up.ling.irtg.script.ParsingEvaluator $input_fn -g $grammar_fn -I ud -O string=toString -o $output_fn


In [None]:
def do_parse(id_graph, sen_rules, sen, prefix, timeout=5):
    """Parse one sentence with Alto and return the ids read from its output.

    Three files are derived from ``prefix``: ``<prefix>.irtg`` (grammar),
    ``<prefix>.input`` (parser input) and ``<prefix>.output`` (parser output).
    If the first attempt raises ``StopIteration``, the grammar is regenerated
    with ``binary=True`` and Alto is run a second time.

    Args:
        id_graph: Graph of the sentence, written to the input file by
            ``utils.set_parse``.
        sen_rules: Rules of the sentence, forwarded to ``gen_grammar``.
        sen: Sentence, forwarded to ``gen_grammar``.
        prefix: Path prefix (without extension) of the three working files.
        timeout: Value forwarded to ``run_alto``.

    Returns:
        Whatever ``utils.get_ids_from_parse`` returns for the output file.
    """
    grammar_fn = f'{prefix}.irtg'
    input_fn = f'{prefix}.input'
    output_fn = f'{prefix}.output'
    utils.set_parse(input_fn, id_graph)
    try:
        gen_grammar(sen_rules, grammar_fn, sen)
        run_alto(timeout, input_fn, grammar_fn, output_fn)
    except StopIteration:
        # NOTE(review): it is not visible here which call raises StopIteration on a
        # timeout; the result is read outside this try block — confirm the fallback
        # is actually reachable.
        # Report the sentence via `prefix`: the message used to read the notebook-global
        # loop variable `i`, which is undefined (or stale) outside the driver loop.
        print(f'sen {prefix} timed out, falling back to binary grammar')
        gen_grammar(sen_rules, grammar_fn, sen, binary=True)
        run_alto(timeout, input_fn, grammar_fn, output_fn)

    return utils.get_ids_from_parse(output_fn)

In [None]:
# Parsed test file; indexed by sentence in the cells below.
conll = utils.get_conll_from_file(TEST_FILE, word_to_id)
# Collects one do_parse() result per processed sentence.
pred_ids = []

In [None]:
# Spot-check sentence 2 in the three parallel structures.
# In a notebook cell only the value of the last expression is displayed.
id_graphs[2]
rules[2]
conll[2]

In [None]:
import os

dirname = 'gen'
# The per-sentence working files are opened for writing under this directory,
# which fails with FileNotFoundError if the directory does not exist yet.
os.makedirs(dirname, exist_ok=True)

# NOTE(review): the loop starts at sentence 4, so pred_ids[k] belongs to
# sentence k + 4 — confirm this offset is intended.
# For a quick smoke run use: for i in range(0, 10):
for i in range(4, len(rules)):
    print(i)  # progress indicator
    pred_ids.append(do_parse(id_graphs[i], rules[i], conll[i], f"{dirname}/{i}"))

In [None]:
# Words of sentence 4, ordered by the integer after the last '=' in the last
# '|'-separated field of each token's misc column
# (presumably the token's original position — confirm against the data).
[t.word for t in sorted(conll[4], key=lambda tok: int(tok.misc.split('|')[-1].split('=')[-1]))]

In [None]:
# POS tags of sentence 4, ordered by the integer after the last '=' in the last
# '|'-separated field of each token's misc column
# (presumably the token's original position — confirm against the data).
[t.pos for t in sorted(conll[4], key=lambda tok: int(tok.misc.split('|')[-1].split('=')[-1]))]

In [None]:
# Inspect the rules extracted for sentence 4.
rules[4]

In [None]:
# Dump every entry of id_to_parse to test.conllu: one tab-separated line per key of
# the entry's second element, followed by a blank line after each entry.
# NOTE(review): id_to_parse is not defined in the visible cells — confirm where it is built.
# CoNLL-U files are UTF-8 with LF line endings, so both are set explicitly instead of
# relying on platform defaults.
with open("test.conllu", "w", encoding="utf-8", newline="\n") as f:
    for i in id_to_parse:
        conll_f = id_to_parse[i][1]
        for line in conll_f:
            fields = "\t".join(conll_f[line])
            f.write(f"{line}\t{fields}\n")
        f.write("\n")

In [None]:
# Convert the contents of "test-results-inflected/" into "tokenized_test_results/"
# (presumably input and output directories — verify against converter.to_tokenized_output).
converter.to_tokenized_output("test-results-inflected/", "tokenized_test_results/")

In [None]:
# Inspect the parsed tokens of sentence 2.
conll[2]