# Q: Are there NPs/PPs in PCC that aren't markables although they should be?

In [1]:
import os
import glob

import discoursegraphs as dg

Couldn't import dot_parser, loading of dot files will not be possible.


In [2]:
# Root directories of the two Potsdam Commentary Corpus (PCC) annotation
# layers used below: MMAX2 coreference files and Tiger XML syntax files.
# The paths start with '~' so that os.path.expanduser() resolves them
# relative to the current user's home directory (same convention as the
# other corpus path in this notebook) instead of one hard-coded account.
MMAX_ROOT_DIR = os.path.expanduser('~/corpora/potsdam-commentary-corpus-2.0.0/coreference/')
TIGER_ROOT_DIR = os.path.expanduser('~/corpora/potsdam-commentary-corpus-2.0.0/syntax/')


# One document (maz-1423) in both annotation layers, for quick experiments.
MMAX_TEST_FILE = os.path.join(MMAX_ROOT_DIR, 'maz-1423.mmax')
TIGER_TEST_FILE = os.path.join(TIGER_ROOT_DIR, 'maz-1423.xml')

In [3]:
# for mmax_file in glob.glob(MMAX_ROOT_DIR+'*.mmax'):
#     dg.read_mmax2(mmax_file)

In [4]:
from discoursegraphs.readwrite.mmax2 import get_potential_markables

def get_unmarked_potential_markables(docgraph):
    """
    Collect the potential markables that are not covered by any markable.

    A node yielded by ``get_potential_markables()`` is kept only if none
    of the tokens in its span has an incoming edge whose ``layers``
    attribute contains ``'mmax:markable'``.

    Parameters
    ----------
    docgraph : DiscourseDocumentGraph
        a document graph that (at least) contains syntax trees
        (imported from Tiger XML files) and markables (imported from
        an MMAX2 file)

    Returns
    -------
    unmarked_potential_markables : list
        the kept nodes, in the order in which
        ``get_potential_markables()`` yields them
    """
    def touches_markable(cat_node):
        # True as soon as one token of the span is the target of an edge
        # that belongs to the 'mmax:markable' layer.
        for token_id in dg.get_span(docgraph, cat_node):
            for _src, _trg, edge_attrs in docgraph.in_edges(token_id, data=True):
                if 'mmax:markable' in edge_attrs['layers']:
                    return True
        return False

    return [cat_node for cat_node in get_potential_markables(docgraph)
            if not touches_markable(cat_node)]


In [5]:
from collections import defaultdict

def gen_unmarked_bracket_mappings(docgraph, unmarked_potential_markables):
    """
    Map tokens to the unmarked phrases that start or end at them.

    Parameters
    ----------
    docgraph : DiscourseDocumentGraph
        the document graph the phrase nodes belong to
    unmarked_potential_markables : iterable
        phrase nodes whose spans are looked up with ``dg.get_span()``

    Returns
    -------
    opening : defaultdict(list)
        maps the first token of a span to the phrases starting there
    closing : defaultdict(list)
        maps the last token of a span to the phrases ending there
    """
    # lists are used as values because one token can be the boundary of
    # more than one markable/span
    opening, closing = defaultdict(list), defaultdict(list)

    for phrase_id in unmarked_potential_markables:
        span_tokens = dg.get_span(docgraph, phrase_id)
        first_token, last_token = span_tokens[0], span_tokens[-1]
        opening[first_token].append(phrase_id)
        closing[last_token].append(phrase_id)
    return opening, closing

In [6]:
def gen_bracketed_output_for_unmarkables(docgraph):
    """
    Render the document's tokens with brackets around unmarked phrases.

    The first token of an unmarked potential markable is prefixed with
    ``[``; the last token is followed by ``]_{UNMARKED: <ids>}``, where
    ``<ids>`` are the space-separated phrases ending at that token.

    Parameters
    ----------
    docgraph : DiscourseDocumentGraph
        a document graph containing syntax trees and markables (see
        ``get_unmarked_potential_markables()``)

    Returns
    -------
    ret_str : unicode
        all tokens of the document, each followed by a single space
    """
    unmarked_potential_markables = get_unmarked_potential_markables(docgraph)
    opening, closing = gen_unmarked_bracket_mappings(docgraph, unmarked_potential_markables)

    pieces = []
    for tok_id in docgraph.tokens:
        token = docgraph.get_token(tok_id)
        # Opening and closing are checked independently: a token that is
        # both the first and the last token of a phrase (e.g. a one-token
        # phrase) must get the opening bracket AND the closing label.
        # An if/elif chain would drop the closing part in that case.
        prefix = u'[' if tok_id in opening else u''
        if tok_id in closing:
            suffix = u"]_{{{0}}}".format('UNMARKED: '+' '.join(closing[tok_id]))
        else:
            suffix = u''
        pieces.append(u"{0}{1}{2} ".format(prefix, token, suffix))
    return u''.join(pieces)


In [7]:
from discoursegraphs.readwrite.brackets import gen_bracket_mappings, gen_closing_string

def gen_bracketed_output(docgraph, layer='mmax'):
    '''
    Render the document's tokens with brackets around markables and with
    slashes around unmarked potential markables.

    Markables are printed as ``[tokens]_{label}`` (the closing part is
    produced by ``gen_closing_string()``); unmarked potential markables
    are printed as ``//tokens//_{node_id}``.

    TODO: the order of the opening brackets should be determined (e.g. if
    a token marks the beginning of two markables, we could check if the
    first markable subsumes the second markable or vice versa.)

    Parameters
    ----------
    docgraph : DiscourseDocumentGraph
        a document graph containing markables and syntax trees
    layer : str
        passed on to ``gen_bracket_mappings()`` (default: 'mmax')

    Returns
    -------
    ret_str : unicode
        all tokens of the document, each followed by a single space

    Example
    -------
    Die Diskussion , wie teuer [die neue [Wittstocker]_{markable_22}
    Stadthalle]_{markable_21} für Vereine und Veranstalter wird , hat
    einige Zeit in Anspruch genommen .
    Die Betriebskosten [für den schmucken Veranstaltungsort]_{markable_21}
    sind hoch . Jetzt wird es darum gehen , [die Halle]_{markable_21} so oft
    wie möglich zu füllen .
    Und [in der Region]_{markable_22} gibt es Konkurrenz .
    '''
    opening, closing, markable2chain = gen_bracket_mappings(docgraph, layer=layer)

    unmarked_potential_markables = get_unmarked_potential_markables(docgraph)
    unmark_opening, unmark_closing = gen_unmarked_bracket_mappings(docgraph, unmarked_potential_markables)

    pieces = []
    # IDs of all currently open markables / unmarked phrases; the closing
    # helpers receive this stack to work out which spans are being closed.
    stack = []
    for token_id in docgraph.tokens:
        token_str = docgraph.get_token(token_id)
        if token_id in opening:
            num_of_opening_brackets = len(opening[token_id])
            stack.extend(opening[token_id])
            opening_str = u'[' * num_of_opening_brackets

            if token_id in closing:
                # token is both the first and last element of 1+ markables
                closing_str = gen_closing_string(closing, markable2chain,
                                                 token_id, stack)
                pieces.append(u'{0}{1}{2} '.format(opening_str, token_str,
                                                   closing_str))
            else:  # token is the first element of 1+ markables
                pieces.append(u'{0}{1} '.format(opening_str, token_str))
        elif token_id in closing:
            closing_str = gen_closing_string(closing, markable2chain,
                                             token_id, stack)
            pieces.append(u'{0}{1} '.format(token_str, closing_str))

        # token is not part of a markable, but of an (unmarked) NP/PP
        elif token_id in unmark_opening:
            num_of_opening_slashes = len(unmark_opening[token_id])
            stack.extend(unmark_opening[token_id])
            opening_str = u'//' * num_of_opening_slashes

            if token_id in unmark_closing:
                # token is both the first and last element of 1+ unmarked
                # phrases: the opening '//' must be emitted here as well,
                # otherwise the phrase is printed as 'token//_{id}'
                pieces.append(
                    opening_str + token_str
                    + gen_unmark_closing_string(unmark_closing, token_id, stack))
            else:  # token is the first element of 1+ unmarked phrases
                pieces.append(u'{0}{1} '.format(opening_str, token_str))
        elif token_id in unmark_closing:
            # token is the last element of 1+ unmarked phrases
            pieces.append(
                token_str
                + gen_unmark_closing_string(unmark_closing, token_id, stack))

        else:  # token is not part of any span
            pieces.append(u'{} '.format(token_str))
    return u''.join(pieces)

In [8]:
def gen_unmark_closing_string(closing_dict, token_id, stack):
    """
    Build the closing string for the unmarked phrases ending at a token.

    One ID is popped from ``stack`` for every entry in
    ``closing_dict[token_id]``. Each popped ID is rendered as
    ``//_{<id>}`` in pop order, and a single trailing space is appended.

    Parameters
    ----------
    closing_dict : dict
        maps a token to the list of phrases ending at that token
    token_id : hashable
        the token whose closing string is generated
    stack : list
        IDs of the currently open spans; modified in place

    Returns
    -------
    closing_str : unicode
    """
    labels = []
    for _ in closing_dict[token_id]:
        labels.append(u'//_{{{}}}'.format(stack.pop()))
    return u''.join(labels) + u' '

In [9]:
import itertools

# NOTE(review): presumably the document that triggers the KeyError recorded
# in the traceback below -- confirm.
MMAX_BAD_FILE = os.path.expanduser('~/corpora/potsdam-commentary-corpus-2.0.0/coreference/maz-14172.mmax')

# Recorded failure of an earlier run: gen_bracketed_output(mdg) raised a
# KeyError inside gen_bracket_mappings() because a markable node had no
# 'mmax:span' attribute.
# Traceback (most recent call last):
#   File "<ipython-input-9-6e65fa629262>", line 11, in <module>
#     print gen_bracketed_output(mdg)
#   File "<ipython-input-7-c182c063225c>", line 20, in gen_bracketed_output
#     opening, closing, markable2chain = gen_bracket_mappings(docgraph)
#   File "/usr/local/lib/python2.7/dist-packages/discoursegraphs-0.1.2-py2.7.egg/discoursegraphs/readwrite/brackets.py", line 41, in gen_bracket_mappings
#     docgraph, docgraph.node[markable][docgraph.ns+':span'])
# KeyError: 'mmax:span'

# Only the MMAX2 layer is loaded here; no Tiger syntax graph is merged in.
mdg = dg.read_mmax2(MMAX_BAD_FILE)

# generate a file that contains all markables and potential markables

In [19]:


import sys
import traceback
import codecs

# For each MMAX2 file: read it, merge in the Tiger syntax graph of the same
# document and print the document name followed by its bracketed text.
# NOTE: the output file is opened for writing (and thereby truncated), but
# the outfile.write() call below is commented out, so the result currently
# only goes to stdout and the file stays empty.
with codecs.open('/tmp/unmarked_markables.txt', 'w', encoding='utf-8') as outfile:
    # files are processed in natural sort order; the [:5] slice limits the
    # run to the first five documents
    for mmax_file in sorted(glob.glob(MMAX_ROOT_DIR+'*.mmax'), key=dg.util.natural_sort_key)[:5]:
        mdg = dg.read_mmax2(mmax_file)
        # the Tiger file shares the document ID, i.e. the MMAX file name up
        # to the first '.'
        doc_id = os.path.basename(mmax_file).split('.')[0]
        tdg = dg.read_tiger(os.path.join(TIGER_ROOT_DIR, doc_id+'.xml'))
        mdg.merge_graphs(tdg)
#         outfile.write(u"{}\n{}\n\n".format(mdg.name, gen_bracketed_output(mdg, layer='mmax')))
        sys.stdout.write(u"{}\n{}\n\n".format(mdg.name, gen_bracketed_output(mdg, layer='mmax')))

maz-00001.mmax
//Auf Eis//_{s2165_500} gelegt [[Dagmar Ziegler]_{markable_23}]_{markable_23} sitzt [in der Schuldenfalle]_{markable_12} . Auf Grund [der dramatischen Kassenlage [in Brandenburg]_{markable_90}]_{markable_12} hat [[sie]_{markable_23}]_{markable_23} jetzt [[eine seit mehr als einem Jahr erarbeitete Kabinettsvorlage]_{markable_64}]_{markable_64} überraschend //auf Eis//_{s2167_502} gelegt und vorgeschlagen , //erst 2003//_{s2167_503} [[darüber]_{markable_64}]_{markable_64} zu entscheiden . Überraschend , weil [das Finanz-]_{markable_23} und [das Bildungsressort]_{markable_1000139} [[das Lehrerpersonalkonzept]_{markable_64}]_{markable_64} gemeinsam entwickelt hatten . Der Rückzieher [der Finanzministerin]_{markable_31} ist aber verständlich . Es dürfte derzeit schwer zu vermitteln sein , weshalb //ein Ressort//_{s2170_501} pauschal //von künftigen Einsparungen//_{s2170_502} ausgenommen werden soll auf Kosten [der anderen]_{markable_1000193} . [[Reiches]_{markable_1000193} Mi