In [1]:
%run ../script/webnlg.py

import re

In [2]:
# Build the corpus object for the 'train' dataset.
# NOTE(review): WebNLGCorpus is not imported anywhere in this notebook;
# presumably it is defined by the %run'd ../script/webnlg.py -- confirm.
train = WebNLGCorpus(dataset='train')

In [3]:
# Fetch the entry with id '6_164' and display what `lexes()` returns for it
# (in the recorded output: a list holding one sentence).
e = train.sample(idx='6_164')
e.lexes()

['The Left Ecology Freedom are the leading party in Gubbio.']

In [4]:
# Display the entry's triples as dicts; the recorded output shows the keys
# 'm_subject', 'm_predicate' and 'm_object', which `simple_align` relies on.
e.triples(kind='dict')

[{'m_object': 'Left_Ecology_Freedom',
  'm_predicate': 'leaderParty',
  'm_subject': 'Gubbio'}]

In [13]:
from nltk.stem import WordNetLemmatizer

def split(spo):
    """Break `spo` into tokens on single space characters.

    The separator is passed explicitly, so consecutive spaces produce
    empty-string tokens and an empty input yields `['']`.
    """
    tokens = spo.split(' ')
    return tokens

# Single lemmatizer instance shared by every `lemmatize` call.
lemmatizer = WordNetLemmatizer()

def lemmatize(spo):
    """Lemmatize each space-separated token of `spo` and re-join with spaces.

    Tokens come from `split`, so the spacing of the input is preserved.
    """
    tokens = split(spo)
    return ' '.join(map(lemmatizer.lemmatize, tokens))

from nltk.stem.porter import PorterStemmer
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem import SnowballStemmer

# Name of the stemming algorithm to use: 'lancaster' or 'porter'; any other
# value falls back to the Snowball stemmer.
stemmer_ = 'lancaster'

# Every branch must test the *name* (`stemmer_`). The `stemmer` object does
# not exist yet on a fresh kernel, so comparing it would raise NameError (or
# silently be False on a stale kernel).
if stemmer_ == 'lancaster':
    stemmer = LancasterStemmer()
elif stemmer_ == 'porter':
    stemmer = PorterStemmer()
else:
    # SnowballStemmer requires the language as its first argument.
    stemmer = SnowballStemmer('english')

def stem(spo):
    """Stem each space-separated token of `spo` with the configured `stemmer`.

    Returns the stemmed tokens joined back together with single spaces.
    """
    stemmed_tokens = [stemmer.stem(token) for token in split(spo)]
    return ' '.join(stemmed_tokens)

# Matches an uppercase ASCII letter that directly follows a lowercase one.
c_detect_camelcase = re.compile(r'(?<=[a-z])([A-Z])')
def split_by_camelcase(spo):
    """Insert a space before every camelCase boundary in `spo`.

    Only a lowercase-to-uppercase transition counts as a boundary, so
    'leaderParty' becomes 'leader Party' while 'ABC' is left untouched.
    """
    def _prefix_with_space(boundary):
        # Keep the capital letter itself; just put a space in front of it.
        return ' ' + boundary.group(1)

    return c_detect_camelcase.sub(_prefix_with_space, spo)

# Character class of the symbols that get replaced by a space (only '_').
c_chars_to_remove = re.compile(r'[_]')
def remove_unwanted_char(spo):
    """Replace every unwanted character (currently '_') in `spo` with a space."""
    # Splitting on each unwanted character and re-joining with a space is
    # equivalent to substituting one space per occurrence.
    pieces = c_chars_to_remove.split(spo)
    return ' '.join(pieces)

def to_lower(spo):
    """Return a lowercased copy of `spo`."""
    lowered = spo.lower()
    return lowered

from functools import reduce

# Ordered steps applied to a subject/predicate/object string. The camelCase
# split runs before lowercasing because it relies on letter case.
spo_pipeline = [split_by_camelcase, remove_unwanted_char, to_lower, stem]
def preprocess_spo(spo):
    """Run `spo` through every step of `spo_pipeline`, in order.

    Each step receives the previous step's output; the final value is returned.
    """
    value = spo
    for step in spo_pipeline:
        value = step(value)
    return value
    
def preprocess_triple(triple):
    """Return a new dict with the same keys as `triple` and preprocessed values.

    Every value is passed through `preprocess_spo`; `triple` is not modified.
    """
    preprocessed = {}
    for key, value in triple.items():
        preprocessed[key] = preprocess_spo(value)
    return preprocessed

# Ordered steps applied to a sentence: lowercase it, then stem each token.
lexe_pipeline = [to_lower, stem]
def preprocess_lexe(lexe):
    """Run `lexe` through every step of `lexe_pipeline`, in order.

    Each step receives the previous step's output; the final value is returned.
    """
    value = lexe
    for step in lexe_pipeline:
        value = step(value)
    return value

In [14]:
def replace_sop(m):
    """Turn a regex match into a '<group-name>' tag.

    `m` is a match object with named groups. The name of the first named
    group that captured non-empty text is wrapped in angle brackets.
    Raises StopIteration when no named group captured anything.
    """
    matched_names = (name for name, text in m.groupdict().items() if text)
    first_name = next(matched_names)
    return "<{}>".format(first_name)

def simple_align(triple, lexe):
    """Replace mentions of a triple's parts in a sentence with role tags.

    Both inputs are normalised first (`preprocess_triple` / `preprocess_lexe`),
    then every occurrence of the preprocessed subject, predicate or object in
    the preprocessed sentence is replaced by '<subject>', '<predicate>' or
    '<object>'.

    Matching is plain substring matching on the preprocessed text, so a tag
    may be followed by leftover characters of a longer word.

    Parameters:
        triple: dict with the keys 'm_subject', 'm_predicate' and 'm_object'
            (a missing key raises KeyError).
        lexe: the sentence to tag.

    Returns:
        The preprocessed sentence with matched spans replaced by tags.
    """
    preprocessed_triple = preprocess_triple(triple)

    preprocessed_lexe = preprocess_lexe(lexe)

    # (group name, triple key), in the order the alternatives are tried.
    roles = (('subject', 'm_subject'),
             ('predicate', 'm_predicate'),
             ('object', 'm_object'))

    alternatives = []
    for group_name, key in roles:
        text = preprocessed_triple[key]
        # An empty alternative would match at every position and leave
        # `replace_sop` with no non-empty group to report, so skip it.
        if not text:
            continue
        # Escape the text so characters such as '(', ')', '.' or '+' are
        # matched literally instead of being read as regex syntax.
        alternatives.append('(?P<{}>{})'.format(group_name, re.escape(text)))

    if not alternatives:
        # Nothing to search for: the sentence is returned untagged.
        return preprocessed_lexe

    regex = '({})'.format('|'.join(alternatives))

    return re.compile(regex).sub(replace_sop, preprocessed_lexe)

In [15]:
# Align the entry's first triple against its first sentence and display the
# tagged, preprocessed sentence.
c = simple_align(e.triples(kind='dict')[0], e.lexes()[0])
c

'the <object> ar the <predicate> in <subject>.'

In [53]:
# Matches any one of the three role tags; the outer group captures whichever
# tag was found.
c_search_spo = re.compile(r'((?P<object><object>)|(?P<predicate><predicate>)|(?P<subject><subject>))')

def count_spo(s):
    """List the role tags found in `s`, in order of appearance.

    Despite the name, this returns the tags themselves (e.g.
    ['<subject>', '<object>']) rather than a number.
    """
    found_tags = []
    for tag_match in c_search_spo.finditer(s):
        # Group 1 is the outer group, i.e. the full tag text.
        found_tags.append(tag_match.group(1))
    return found_tags

In [63]:
# Repeat the alignment on another entry and show the tagged sentence, the
# tags found in it, and the source triple side by side.
# NOTE(review): sample() is called without an idx -- presumably it returns a
# random entry, so this output would change between runs; confirm.
e = train.sample()
c = simple_align(e.triples(kind='dict')[0], e.lexes()[0])
c, count_spo(c), e.triples()[0]

('the character, <subject>es, was <predicate> by <object>.',
 ['<subject>', '<predicate>', '<object>'],
 {'m_object': 'Len_Wein', 'm_predicate': 'creator', 'm_subject': 'Aurakles'})

In [43]:
# NOTE(review): the recorded output of this cell is stale. It was executed as
# In [43], before the In [63] cell above, so it shows an older value of `c`;
# on a top-to-bottom run it displays the `c` assigned in the cell above.
c

'<subject> is a <object>.'

In [44]:
# Use search() rather than match(): match() only succeeds when a tag sits at
# position 0, so a sentence such as 'the character, <subject>...' would give
# None and .groupdict() would raise AttributeError. For a string that starts
# with a tag, both calls return the same match.
c_search_spo.search(c).groupdict()

{'object': None, 'predicate': None, 'subject': '<subject>'}

In [52]:
# findall() yields one tuple per match, with one slot per capturing group in
# pattern order: (outer group, object, predicate, subject). Groups that did
# not take part in the match show up as ''.
c_search_spo.findall('<subject> <object> <predicate> <subject>')

[('<subject>', '', '', '<subject>'),
 ('<object>', '<object>', '', ''),
 ('<predicate>', '', '<predicate>', ''),
 ('<subject>', '', '', '<subject>')]