In [1]:
from collections import defaultdict
from lxml import etree
from discoursegraphs import t
from discoursegraphs.readwrite.rst.rs3 import extract_relationtypes

In [2]:
RS3_EXAMPLE_FILE = '/usr/local/lib/python2.7/dist-packages/discoursegraphs-0.3.2-py2.7.egg/discoursegraphs/data/potsdam-commentary-corpus-2.0.0/rst/maz-1818.rs3'

In [3]:
def get_rs3_data(rs3_file):
    rs3_etree = etree.parse(rs3_file)
    reltypes = extract_relationtypes(rs3_etree)
    elements = defaultdict(lambda : defaultdict(str))
    children = defaultdict(list)

    for elem in rs3_etree.iter('segment', 'group'):
        elem_id = elem.attrib['id']
        parent_id = elem.attrib.get('parent')
        elements[elem_id]['parent'] = parent_id
        children[parent_id].append(elem_id)
        
        relname = elem.attrib.get('relname')
        elements[elem_id]['relname'] = relname
        if relname == None:
            # nodes without a parent have no relname attribute
            elements[elem_id]['nuclearity'] = 'root'
        else:
            reltype = reltypes.get(relname, 'span')
            elements[elem_id]['reltype'] = reltype
            if reltype == 'rst':
                # this elem is the S of an N-S relation, its parent is the N
                elements[elem_id]['nuclearity'] = 'satellite'
            elif reltype == 'multinuc':
                # this elem is one of several Ns of a multinuc relation.
                # its parent is the multinuc relation node.
                elements[elem_id]['nuclearity'] = 'nucleus'
            elif reltype == 'span':
                # this elem is the N of an N-S relation, its parent is a span
                elements[elem_id]['nuclearity'] = 'nucleus'
            else:
                raise NotImplementedError("Unknown reltype: {}".format(reltypes[relname]))
        
        elem_type = elem.tag
        elements[elem_id]['element_type'] = elem_type
        
        if elem_type == 'segment':
            elements[elem_id]['text'] = elem.text
        else:  # elem_type == 'group':
            elements[elem_id]['group_type'] = elem.attrib.get('type')
    return children, elements

In [4]:
cdict, edict = get_rs3_data(RS3_EXAMPLE_FILE)

In [5]:
def dt(child_dict, elem_dict, start_node=None):
    elem_id = start_node
    if elem_id not in elem_dict:
        return []
    
    elem = elem_dict[elem_id]
    elem_type = elem['element_type']
    
    if elem_type == 'segment':
        if elem['nuclearity'] == 'root':
            return t('elem_id', elem['text'])
        
        if elem['reltype'] == 'rst':
            parent = elem['parent']
            nuc_subtree = dt(child_dict, elem_dict, start_node=parent)
            return t(elem['relname'], [
                ('S ({})'.format(elem_id), elem['text']),
                ('N ({})'.format(parent), nuc_subtree)
            ])
        elif elem['reltype'] == 'multinuc':
            return dt(child_dict, elem_dict, start_node=elem['parent'])
        else:  # elem['reltype'] == 'span'
            assert len(child_dict[elem_id]) == 1
            child_node = child_dict[elem_id][0]
            return dt(child_dict, elem_dict, start_node=child_node)


    else:  # elem_type == 'group':
        if elem['reltype'] == 'rst':
            # --- copied from 'segment' section ---
            parent = elem['parent']
            nuc_subtree = dt(child_dict, elem_dict, start_node=parent)
            return t(elem['relname'], [
                ('S ({})'.format(elem_id), elem['text']),
                ('N ({})'.format(parent), nuc_subtree)
            ])
            # --- copied from 'segment' section ---
        else: # elem_type == 'group' and elem['reltype'] != 'rst'
            raise NotImplementedError

In [6]:
dt(cdict, edict, start_node='5')

RuntimeError: maximum recursion depth exceeded in cmp

In [None]:
dt(cdict, edict, start_node='3')

In [None]:
# import IPython

# for child in cdict:
#     print(child)
#     try:
#         IPython.core.display.display(dt(cdict, edict, start_node=child))
#     except NotImplementedError as e:
#         print(e.args, e.message)