In [1]:
from nltk.tree import Tree
from tree import MyTree
import codecs
import re

In [2]:
def recPaths(leaves, id_predicate):
    """Build one label path per leaf, relative to the predicate leaf.

    Each path climbs from a leaf's parent to the first node that is also an
    ancestor of ``leaves[id_predicate]``, then descends that ancestor chain
    down to the predicate's parent.

    Path format:
      * upward steps are written ``LABEL>``;
      * the shared ancestor is written as a bare ``LABEL`` (always emitted);
      * downward steps are written ``<LABEL``.
    On either leg a node is skipped when its label equals the label written
    immediately before it.

    Args:
        leaves: sequence of nodes exposing ``.parent`` and ``.label``.
        id_predicate: index into ``leaves`` of the predicate leaf.

    Returns:
        A list of path strings, one per element of ``leaves``, in order.
    """
    # Ancestors of the predicate, nearest first.
    ancestors = []
    node = leaves[id_predicate].parent
    while node:
        ancestors.append(node)
        node = node.parent

    paths = []
    for leaf in leaves:
        pieces = []
        last_label = ''
        node = leaf.parent
        # Upward leg: stop at the first node shared with the predicate.
        while node not in ancestors:
            if node.label != last_label:
                pieces.append(node.label + '>')
                last_label = node.label
            node = node.parent
        meet = ancestors.index(node)

        # The meeting point is written unconditionally.
        last_label = ancestors[meet].label
        pieces.append(last_label)

        # Downward leg: from just below the meeting point to the predicate's parent.
        for ancestor in reversed(ancestors[:meet]):
            if ancestor.label != last_label:
                pieces.append('<' + ancestor.label)
                last_label = ancestor.label
        paths.append(''.join(pieces))
    return paths


def predMacroFeature(predicate):
    """Describe the predicate's grandparent as ``LABEL=child|child|...``.

    The string starts with the grandparent's label and ``=``, followed by the
    labels of the grandparent's children separated by ``|``. The child that
    is the predicate's parent is wrapped in parentheses. Leading and trailing
    ``|`` characters are stripped from the final string.

    Args:
        predicate: node exposing ``.parent``; its parent exposes ``.parent``
            and ``.label``; the grandparent exposes ``.label`` and ``.children``.

    Returns:
        The macro-feature string.
    """
    father = predicate.parent
    grand = father.parent
    head = grand.label + '='
    pieces = [
        '(%s)|' % father.label if sibling == father else '%s|' % sibling.label
        for sibling in grand.children
    ]
    return (head + ''.join(pieces)).strip('|')


def fetch_label(s, current_label):
    """Convert one bracketed span tag into a B/I/E/S/O style label.

    The tag ``s`` is matched against three shapes, in this priority order:
      1. ``(X*...)`` -- opens and closes on this token: ``'S-X'``, carry ``'O'``.
      2. ``(X*``     -- opens here: ``'B-X'``, carry ``X``.
      3. ``*...)``   -- closes here: ``'E-' + current_label``, carry ``'O'``.
    If none match, the token is ``'O'`` when no span is open
    (``current_label == 'O'``), otherwise ``'I-' + current_label``.

    Args:
        s: the tag string for the current token.
        current_label: label of the span currently open, or ``'O'``.

    Returns:
        A 2-tuple ``(label_for_this_token, label_to_carry_to_next_token)``.
    """
    # Raw strings: '\(' , '\*' and '\)' are not valid *string* escapes, so the
    # non-raw form triggers invalid-escape warnings on current Python versions.
    single = re.search(r'\((.*)\*(.*)\)', s)
    if single is not None:
        return 'S-' + single.group(1), 'O'

    opening = re.search(r'\((.*)\*', s)
    if opening is not None:
        return 'B-' + opening.group(1), opening.group(1)

    closing = re.search(r'\*(.*)\)', s)
    if closing is not None:
        return 'E-' + current_label, 'O'

    if current_label == 'O':
        return 'O', 'O'
    return 'I-' + current_label, current_label




In [None]:
# load data
datastr = 'test'

# Parse file: split into lines; each line is later fed to Tree.fromstring.
# `with` guarantees the handle is closed (the original leaked both handles).
with codecs.open(datastr + ".textparsed", "r", "utf-8") as f:
    s_tree = f.read().strip().split('\n')

# Chunk file: sentences separated by blank lines, one tab-separated row per line.
with codecs.open(datastr + ".mg", "r", 'utf-8') as f:
    s_chunks = [[r.strip().split('\t') for r in t.strip().split('\n')] for t in f.read().strip().split('\n\n')]

In [19]:
# trans to trees
# Parse every line first, then wrap each parsed tree.
trees = [Tree.fromstring(line) for line in s_tree]

mytrees = []
myleaves = []
for parsed in trees:
    wrapped = MyTree(parsed)
    # Both MyTree passes run before the leaves are collected.
    wrapped.labelDependencyTree()
    wrapped.findDependencyParent()
    mytrees.append(wrapped)
    myleaves.append(wrapped.leaves())

In [20]:
# Build one sample (a list of per-token feature rows) for every predicate in
# every sentence; sampleIds records which sentence each sample came from.
samples = []
sampleIds = []
for i_sent, sent in enumerate(s_chunks):
    # A row marks a predicate when its third column is not '-'.
    # Keep (row index, second column, third column) for each such row.
    verb_rows = [(idx, m[1], m[2]) for idx, m in enumerate(sent) if m[2] != '-']

    for i_verb, (verb_pos, verb_tag, verb_word) in enumerate(verb_rows):
        sampleIds.append(i_sent)
        sent_leaves = myleaves[i_sent]
        macroFeature = predMacroFeature(sent_leaves[verb_pos])
        pathFeatures = recPaths(sent_leaves, verb_pos)

        current_target = 'O'
        sample_sent = []
        for i_token, s in enumerate(sent):
            # relv position features: signed offset from the predicate row
            pos_relv = i_token - verb_pos
            if pos_relv < 0:
                side = 'bf'
            elif pos_relv > 0:
                side = 'af'
            else:
                side = 'at'

            # dependency-parent features; dependencyParent is used as a
            # 1-based leaf index, and a falsy value yields 'ROOT' placeholders
            id_depParent = sent_leaves[i_token].dependencyParent
            if id_depParent:
                dep_head = sent_leaves[id_depParent - 1]
                head_feats = [dep_head.label, dep_head.parent.label]
            else:
                head_feats = ['ROOT', 'ROOT']
            pred_flag = 'isPred' if id_depParent == verb_pos + 1 else 'notPred'

            # fetch labels: column 3 + i_verb holds this predicate's span tags
            return_label, current_target = fetch_label(s[3 + i_verb], current_target)

            # basic features (first two columns), then verb features,
            # position, path, dependency head, predicate flag and label
            sample_token = (
                s[0:2]
                + [verb_word, verb_tag, macroFeature]
                + [str(pos_relv), side]
                + [pathFeatures[i_token]]
                + head_feats
                + [pred_flag, return_label]
            )
            sample_sent.append(sample_token)
        samples.append(sample_sent)
    

In [21]:
# Write every sample as tab-separated token rows, one row per line, with a
# blank line after each sample. `with` closes the file even if a write fails.
with codecs.open(datastr + '.samples', 'w', 'utf-8') as f:
    for i, sent in enumerate(samples):
        # Progress marker every 1000 samples.
        if not i % 1000:
            print(i)
        f.write('\n'.join('\t'.join(token) for token in sent) + '\n\n')

0
1000


In [22]:
# Persist the sentence index of every sample as the str() of the Python list.
# `with` closes the file even if the write fails.
with codecs.open(datastr + '.sampleids', 'w', 'utf-8') as f:
    f.write(str(sampleIds))