In [1]:
import discoursegraphs as dg
from discoursegraphs.corpora import pcc
from discoursegraphs.readwrite.rst import rs3

## Task 1: Which syntactic subordinate clauses match an RST EDU?
## Task 2: Are these EDUs satellites or nuclei?
## Task 3: Does this correlate with certain RST relation types?


In [2]:
def get_subordinate_clauses(tiger_docgraph):
    """
    Return the IDs of the nodes that are treated as subordinate clause
    constituents in a TIGER syntax document graph.

    A node qualifies if it is the target of an edge whose 'tiger:label'
    is 'MO', 'RC' or 'SB', the source of that edge has the category 'S'
    and the target itself is not a token.

    Parameters
    ----------
    tiger_docgraph : document graph
        a document graph containing a TIGER syntax tree

    Returns
    -------
    list
        target node IDs of all qualifying edges, in the order in which
        the edges are returned by ``dg.select_edges_by_attribute``
    """
    candidate_edges = dg.select_edges_by_attribute(
        tiger_docgraph, attribute='tiger:label', value=['MO', 'RC', 'SB'])

    def is_clause_edge(parent_id, child_id):
        """True iff the parent is an 'S' node and the child is no token."""
        parent_cat = tiger_docgraph.node[parent_id].get('tiger:cat')
        if parent_cat != 'S':
            return False
        return not dg.istoken(tiger_docgraph, child_id)

    return [child_id
            for parent_id, child_id in candidate_edges
            if is_clause_edge(parent_id, child_id)]

In [3]:
from operator import itemgetter



In [4]:
from collections import defaultdict

from intervaltree import IntervalTree, Interval

import discoursegraphs as dg
from discoursegraphs.readwrite.rst import rs3


def find_overlapping_nodes(
    docgraph, local_nodes, other_nodes,
    overlap_threshold=95, debug=False, char_tolerance=2):
    """
    For each node in ``local_nodes``, find the node from ``other_nodes``
    whose character span overlaps best with it.

    The overlap of two spans is the length of their intersection
    relative to the longer of the two spans (in percent). A pair of spans
    counts as a match if this value reaches ``overlap_threshold`` OR if
    the intersection is at most ``char_tolerance`` characters shorter than
    the longer span. Note that the second condition applies even when
    ``overlap_threshold`` is 100.

    Parameters
    ----------
    docgraph : document graph
        the graph that contains both ``local_nodes`` and ``other_nodes``
    local_nodes : iterable
        IDs of the nodes for which a counterpart is wanted
    other_nodes : iterable
        IDs of the candidate counterpart nodes
    overlap_threshold : int or float
        minimum overlap (in percent) required for a match
    debug : bool
        if True, print the local span, the matched interval and the
        overlap percentage for every matching pair of spans
    char_tolerance : int
        number of characters by which the intersection may be shorter
        than the longer span and still count as a match (default: 2,
        the value that used to be hard-coded)

    Returns
    -------
    dict
        maps each local node ID that has at least one match to a dict
        with the keys 'node_id' (ID of the best matching other node),
        'overlap' (overlap in percent) and 'interval' ((onset, offset)
        of the other node's span). Local nodes without a match are
        not included.

    Errors raised by ``dg.get_span_offsets`` or by ``intervaltree`` are
    not handled here and propagate to the caller.
    """
    def max_overlap(overlapping_nodes):
        """
        given a list of nodes (and their overlap scores),
        return the best matching node (the first one in case of a tie).
        """
        return max(overlapping_nodes, key=lambda n: n['overlap'])

    def group_by_span(node_ids):
        """map each (onset, offset) span to the list of nodes covering it."""
        # there might be more than one node covering the same span
        span2nodes = defaultdict(list)
        for node_id in node_ids:
            span = dg.get_span_offsets(docgraph, node_id)
            span2nodes[span].append(node_id)
        return span2nodes

    local_span2nodes = group_by_span(local_nodes)
    other_span2nodes = group_by_span(other_nodes)

    other_tree = IntervalTree.from_tuples(other_span2nodes.keys())
    overlap_map = defaultdict(list)

    for local_span, local_span_nodes in local_span2nodes.items():
        local_on, local_off = local_span
        len_local = local_off - local_on

        # all the spans from ``other_nodes`` that overlap with this
        # ``local_span``; the slice form is the documented range query
        for overlap_interval in other_tree[local_on:local_off]:
            other_on, other_off = overlap_interval.begin, overlap_interval.end
            len_other = other_off - other_on
            len_longest_input = max(len_local, len_other)

            overlap_on = max(local_on, other_on)
            overlap_off = min(local_off, other_off)
            len_overlap = overlap_off - overlap_on

            # overlap in % between the input intervals
            overlap = len_overlap / float(len_longest_input) * 100

            is_match = (overlap >= overlap_threshold or
                        len_overlap + char_tolerance >= len_longest_input)
            if not is_match:
                continue

            # generate a mapping from a local node to all
            # the ``other_nodes`` it overlaps with (incl.
            # their overlap in % for finding the best match)
            for local_node in local_span_nodes:
                for other_node in other_span2nodes[(other_on, other_off)]:
                    overlap_map[local_node].append(
                        {'node_id': other_node, 'overlap': overlap,
                         'interval': (other_on, other_off)})

            if debug:
                # a single pre-formatted string prints identically with the
                # Python 2 print statement and the Python 3 print function
                print("{} {} {}".format(
                    local_span, overlap_interval, overlap))

    return {local_node: max_overlap(candidates)
            for local_node, candidates in overlap_map.items()}


In [5]:
# For the first ten PCC documents: match syntactic subordinate clauses
# against RST EDUs and look up, for every matched EDU, its segment type
# and the name of the RST relation it takes part in.
for doc_id in pcc.document_ids[:10]:
    docgraph = pcc[doc_id]
    print(docgraph.name)

    subord_nodes = get_subordinate_clauses(docgraph)
    edu_nodes = rs3.get_edus(docgraph)

    try:
        # only (near-)exact matches are wanted here
        overlapping_nodes_map = find_overlapping_nodes(
            docgraph, subord_nodes, edu_nodes, overlap_threshold=100)

        for subord_id, edu in overlapping_nodes_map.items():
            subord_clause_cat = docgraph.node[subord_id]['tiger:cat']

            edu_node_id = edu['node_id']
            segment_type = docgraph.node[edu_node_id]['rst:segment_type']

            rel_name = docgraph.node[edu_node_id].get('rst:rel_name')
            if not rel_name:
                # the EDU itself carries no relation name, so take the
                # one of the (single) node that dominates it
                in_edges = list(docgraph.in_edges(edu_node_id))
                assert len(in_edges) == 1, \
                    "There must be exactly one dominating node."
                dom_node_id = in_edges[0][0]
                rel_name = docgraph.node[dom_node_id]['rst:rel_name']

#             print subord_clause_cat, edu_node_id, segment_type, rel_name
#         print
    except ValueError as e:
        # keep going with the next document, but report the cause of the
        # failure instead of discarding it
        print("ValueError in {}: {}".format(doc_id, e))

maz-00001.rs3
ValueError in maz-00001
maz-00002.rs3
ValueError in maz-00002
maz-10110.rs3
maz-10175.rs3
maz-10205.rs3
maz-10207.rs3
ValueError in maz-10207
maz-10374.rs3
ValueError in maz-10374
maz-10423.rs3
maz-10503.rs3
maz-10575.rs3
ValueError in maz-10575


In [6]:
# ``docgraph`` is the variable assigned inside the loop of the previous
# cell, so this shows the name of the last document that loop reached.
docgraph.name

'maz-10575.rs3'

In [7]:
# Debugging: inspect what dg.get_span returns for node 'rst:2' of maz-10110.
# NOTE(review): the recorded output repeats the same token IDs many times
# -- confirm whether that is expected.
dg.get_span(pcc['maz-10110'], 'rst:2')

['rst:rst:2_0',
 'rst:rst:2_0',
 'rst:rst:2_0',
 'rst:rst:2_0',
 'rst:rst:2_0',
 'rst:rst:2_0',
 'rst:rst:2_0',
 'rst:rst:2_0',
 'rst:rst:2_0',
 'rst:rst:2_0',
 'rst:rst:2_0',
 'rst:rst:2_0',
 'rst:rst:2_0',
 'rst:rst:2_0',
 'rst:rst:2_0',
 'rst:rst:2_0',
 'rst:rst:2_0',
 'rst:rst:2_0',
 'rst:rst:2_0',
 'rst:rst:2_0',
 'rst:rst:2_0',
 'rst:rst:2_0',
 'rst:rst:2_0',
 'rst:rst:2_0',
 'rst:rst:2_0',
 'rst:rst:2_0',
 'rst:rst:2_1',
 'rst:rst:2_1',
 'rst:rst:2_1',
 'rst:rst:2_1',
 'rst:rst:2_1',
 'rst:rst:2_1',
 'rst:rst:2_1',
 'rst:rst:2_1',
 'rst:rst:2_1',
 'rst:rst:2_1',
 'rst:rst:2_1',
 'rst:rst:2_1',
 'rst:rst:2_1',
 'rst:rst:2_1',
 'rst:rst:2_1',
 'rst:rst:2_1',
 'rst:rst:2_1',
 'rst:rst:2_1',
 'rst:rst:2_1',
 'rst:rst:2_1',
 'rst:rst:2_1',
 'rst:rst:2_1',
 'rst:rst:2_1',
 'rst:rst:2_1',
 'rst:rst:2_1',
 'rst:rst:2_1',
 'rst:rst:2_1',
 'rst:rst:2_2',
 'rst:rst:2_2',
 'rst:rst:2_2',
 'rst:rst:2_2',
 'rst:rst:2_2',
 'rst:rst:2_2',
 'rst:rst:2_2',
 'rst:rst:2_2',
 'rst:rst:2_2',
 'rst:rs

In [8]:
# Debugging: character offsets of node 'rst:2' of maz-10110, obtained with
# the same function that find_overlapping_nodes calls for every node.
# The recorded run of this cell ended in a KeyboardInterrupt.
dg.get_span_offsets(pcc['maz-10110'], 'rst:2')

KeyboardInterrupt: 

In [None]:
# Debugging: result of dg.get_text for node 'rst:2' of maz-10110
# (this cell has no recorded run).
dg.get_text(pcc['maz-10110'], 'rst:2')

In [None]:
# Debugging: result of dg.get_text for node 'rst:10' of maz-10110
# (this cell has no recorded run).
dg.get_text(pcc['maz-10110'], 'rst:10')

In [None]:
# Debugging: dg.get_text without a node ID -- presumably the text of the
# whole document; confirm against the discoursegraphs documentation.
dg.get_text(pcc['maz-10110'])

In [None]:
from collections import OrderedDict

# the ``tokens`` attribute of the maz-10110 document graph; presumably the
# token node IDs in document order -- TODO confirm
token_list = pcc['maz-10110'].tokens

In [None]:
# Build the mapping from (key, value) pairs. Wrapping a plain dict instead
# would throw away the order of ``token_list`` before OrderedDict ever sees
# it, because plain dicts do not keep insertion order in Python 2.
od = OrderedDict((node_id, node_id) for node_id in token_list)

In [None]:
# display the token list for inspection
token_list

In [None]:
# look up one of the token IDs that dg.get_span reported for 'rst:2'
# (raises KeyError if that ID is not in the document's token list)
od['rst:rst:2_0']