# Surface realization

In [None]:
from surface.grammar import Grammar
from surface import converter
from surface import utils
from collections import defaultdict
import ast

First, we store the paths of the training and test files in variables. The files can be downloaded from the SRST 19 page.

In [None]:
# Paths of the CoNLL-U files used for training and for testing.
TRAIN_FILE, TEST_FILE = "en_ewt-ud-train.conllu", "en_ewt-ud-test.conllu"

Then, we train the two static grammars: the first corresponds to the subgraphs from the UD trees, and the second is the fallback grammar, in which every rule is binary.

Later, the dynamic grammars are generated from these ones.

In [None]:
# Create the grammar object and train its subgraph rules from the data files.
# NOTE(review): TEST_FILE is passed to training alongside TRAIN_FILE; presumably
# train_subgraphs needs the test sentences as well — confirm against surface.grammar.
grammar = Grammar()
grammar.train_subgraphs(TRAIN_FILE, TEST_FILE)

We need to extract the graphs from the conll format (conversion from conll to isi), and the rules that use the <strong>lin</strong> feature.

The rules are for incorporating the <strong>lin</strong> feature, so we can dynamically delete every rule that contradicts the linearity.

In [None]:
# Rules extracted from the test file, indexed by sentence position; the second
# return value of extract_rules is not needed in this notebook.
rules, _ = converter.extract_rules(TEST_FILE)

# Convert the test file once and keep all three results, instead of running the
# same conversion twice and discarding part of the result each time.
graphs, sentences, id_graphs = converter.convert(TEST_FILE)

# CoNLL data of the test file, indexed by sentence position like `rules`.
conll = converter.get_conll_from_file(TEST_FILE)

# Sentence index -> (text_parse, conll_parse); filled in by the parsing loops.
id_to_parse = {}
# Indices of sentences whose first parsing attempt raised StopIteration.
stops = []

We run through the sentences and call the <strong>alto</strong> parser to generate the derivation and map the ud representation to string.

Alto can be downloaded from [bitbucket](https://bitbucket.org/tclup/alto/downloads/).

In [None]:
for sen_id in range(0, len(rules)):
    print(sen_id)
    try:
        grammar_fn = open('dep_grammar_spec.irtg', 'w') 
        grammar.generate_grammar(rules[sen_id], grammar_fn)
        grammar.generate_terminal_ids(conll[sen_id], grammar_fn)
        grammar_fn.close()
        utils.set_parse("ewt_ones", id_graphs[sen_id])
        !timeout 70 java -Xmx32G -cp alto-2.3.6-SNAPSHOT-all.jar de.up.ling.irtg.script.ParsingEvaluator ewt_ones -g dep_grammar_spec.irtg -I ud -O string=toString -o surface_eval_ewt
        text_parse, conll_parse = utils.get_parse("surface_eval_ewt", conll[sen_id])
        id_to_parse[sen_id] = (text_parse, conll_parse)
    except StopIteration:
        print("stop iteration")
        stops.append(sen_id)
        continue

We then iterate through the sentences that took too long to parse with the original grammar, and switch to the binary grammar for faster results.

In [None]:
for sen_id in stops:
    grammar_fn = open('dep_grammar_edges.irtg', 'w') 
    grammar.generate_grammar(rules[sen_id], grammar_fn, binary=True)
    grammar.generate_terminal_ids(conll[sen_id], grammar_fn)
    grammar_fn.close()
    set_parse("ewt_ones", id_graphs[sen_id])
    !java -Xmx32G -cp alto-2.3.6-SNAPSHOT-all.jar de.up.ling.irtg.script.ParsingEvaluator ewt_ones -g dep_grammar_edges.irtg -I ud -O string=toString -o surface_eval_ewt
    text_parse, conll_parse = get_parse("surface_eval_ewt", conll[sen_id])
    id_to_parse[sen_id] = (text_parse, conll_parse)

In [None]:
# Write the CoNLL parse of every sentence to the prediction file.
# NOTE(review): the output name says "pt_bosque" although the input files are
# the English EWT ones — confirm that this is the intended file name.
with open("pt_bosque-Pred-Stanford.conllu", "w") as out_file:
    # Iterate in sentence order. The dict is filled in insertion order, so the
    # sentences parsed by the fallback loop would otherwise end up at the end
    # of the file instead of at their original position.
    for sen_id in sorted(id_to_parse):
        conll_parse = id_to_parse[sen_id][1]
        for token_id in conll_parse:
            # One row per token: its key followed by the tab-separated fields.
            # NOTE(review): no newline is added per row; presumably the last
            # field already ends with one — verify against utils.get_parse.
            out_file.write(str(token_id) + "\t" + "\t".join(conll_parse[token_id]))
        # Newline written after each sentence's rows.
        out_file.write("\n")

In [None]:
# NOTE(review): presumably reads results from "test-results-inflected/" and writes
# tokenized output to "tokenized_test_results/". No cell visible in this notebook
# writes to the first directory — confirm it is populated before running this.
converter.to_tokenized_output("test-results-inflected/", "tokenized_test_results/")

In [5]:
# Display the rules extracted for the first sentence of the test file.
rules[0]

[{'graph': [{'dir': None, 'edge': 'case', 'to': 'IN'}], 'root': 'NNP'},
 {'graph': [{'dir': None, 'edge': 'advcl', 'to': 'VBD'}], 'root': 'WP'},
 {'graph': [], 'root': 'IN'},
 {'graph': [{'dir': None, 'edge': 'mark', 'to': 'IN'},
   {'dir': 'S', 'edge': 'punct', 'to': 'PERIOD'},
   {'dir': None, 'edge': 'obl', 'to': 'NNP'},
   {'dir': None, 'edge': 'nsubj', 'to': 'NNP'}],
  'root': 'VBD'},
 {'graph': [], 'root': 'IN'},
 {'graph': [], 'root': 'PERIOD'},
 {'graph': [], 'root': 'NNP'}]