In [1]:
%run ../script/webnlg.py

import re
from pprint import pprint
import pandas as pd
pd.set_option('max_colwidth', 1000)

import logging

logger = logging.getLogger()

In [2]:
# Load the training split; WebNLGCorpus comes from the %run of ../script/webnlg.py above.
train = WebNLGCorpus.load(dataset='train')

# Simple Template Model

ntriples = 1

In [3]:
# Restrict the corpus to the ntriples=1 subset used by the simple template model.
train_1 = train.subset(ntriples=1)

In [4]:
import re
import spacy
from nltk.metrics.distance import edit_distance

nlp = spacy.load('en_core_web_lg')

In [7]:
# Toy sentence used by the next cells to explore noun chunks and the parse tree.
doc = nlp('The wizard of oz is killing people')

# Materialise the noun-chunk iterator so the cell displays it.
list(doc.noun_chunks)

[The wizard, oz, people]

In [67]:
doc[0:2].text

'The wizard'

# Make the distance metric pluggable

In [68]:
def get_span(node):
    """Return the span covering the whole syntactic subtree rooted at ``node``.

    The span runs from the subtree's leftmost token (``node.left_edge``) to
    its rightmost token (``node.right_edge``), inclusive.

    The slice is taken from the token's own document (``node.doc``) instead
    of the notebook-level ``doc`` variable: ``doc`` is reassigned by several
    other cells, so relying on it would silently slice the wrong document
    whenever the token was parsed before the latest reassignment.

    Parameters
    ----------
    node : token-like object exposing ``doc``, ``left_edge.i`` and
        ``right_edge.i``.

    Returns
    -------
    ``node.doc[node.left_edge.i : node.right_edge.i + 1]``
    """
    return node.doc[node.left_edge.i: node.right_edge.i + 1]


# Tokens that are their own head, i.e. the starting points of the walk below.
roots = [token for token in doc if token.head == token]

# Visited tokens, in visiting order; used as the row index of the distance table.
nodes = []

# Target strings each subtree span is compared against.
data = ['wizard of oz', 'people']

# One row per visited token, one column per entry of `data`.
distances = []

# NOTE: `roots` is extended while it is being iterated, so the loop keeps
# going over the children appended below until the whole tree has been
# visited, level by level.
for root in roots:
    
    nodes.append(root)
    
    distances_root = []
    
    for d in data:
        
        # Edit distance between the target string and the text of the
        # subtree span rooted at this token.
        distances_root.append(edit_distance(d, get_span(root).text))
        
    distances.append(distances_root)
    
    # Queue the token's children (left ones first) for later iterations.
    roots.extend(root.lefts)
    roots.extend(root.rights)

In [71]:
pd.DataFrame(distances, index=nodes, columns=data)

Unnamed: 0,wizard of oz,people
killing,22,28
wizard,4,14
is,11,6
people,11,0
The,12,5
of,7,6
oz,10,5


In [288]:
def replace_by_groupname(m):
    """Regex substitution callback: replace a match by ``{group_name}``.

    ``m`` is a match object; the name used is that of the first named group
    that captured a non-empty value. If no named group captured anything,
    the ``StopIteration`` raised by ``next`` propagates.
    """
    captured_names = (name for name, value in m.groupdict().items() if value)
    first_name = next(captured_names)
    return "{{{}}}".format(first_name)

# Edit distance above which a match is reported as suspicious (warning only,
# the match is still returned).
DISTANCE_THRESHOLD = 200

#TODO: parameterize distance metric
nnc_logger = logging.getLogger("nearest_noun_chunks")
def nearest_noun_chunks(doc, m_subject, m_object):
    """Pick the noun chunks of ``doc`` closest to the subject and object texts.

    Every (role, noun chunk) pair is scored with ``edit_distance`` between the
    role's text and the chunk's text. The globally closest pair is chosen
    first; the other role then gets the closest of the remaining chunks, so
    the two roles never share a chunk.

    Roles are tracked by name rather than by their text, so a triple whose
    subject and object texts are identical is still resolved to two chunks
    instead of collapsing into a single role. Noun chunks are materialised
    once and reused for both roles.

    Parameters
    ----------
    doc : parsed document exposing ``noun_chunks`` (chunks expose ``text``).
    m_subject : str, text of the triple's subject.
    m_object : str, text of the triple's object.

    Returns
    -------
    dict mapping ``'m_subject'`` and ``'m_object'`` to the selected chunks.

    Raises
    ------
    Exception
        If the document has no noun chunk, or no chunk is left for the
        second role.
    """
    noun_chunks = list(doc.noun_chunks)

    # (role, text, chunk, distance); subject candidates first so that ties are
    # broken the same way as when the two lists were concatenated.
    candidates = [(role, text, nc, edit_distance(text, nc.text))
                  for role, text in (('m_subject', m_subject), ('m_object', m_object))
                  for nc in noun_chunks]

    if not candidates:

        raise Exception("doc without sufficient noun chunks: {}".format(doc))

    # Log (text, chunk, distance) triples.
    nnc_logger.debug([c[1:] for c in candidates])

    first = min(candidates, key=lambda c: c[3])

    if first[3] > DISTANCE_THRESHOLD:
        nnc_logger.warning("distance threshold: {}".format(first[1:]))

    # Drop the chunk already taken and every candidate of the role already matched.
    remaining = [c for c in candidates if c[2] != first[2] and c[0] != first[0]]

    if not remaining:

        raise Exception("doc without sufficient noun chunks: {}".format(doc))

    nnc_logger.debug([c[1:] for c in remaining])

    second = min(remaining, key=lambda c: c[3])

    if second[3] > DISTANCE_THRESHOLD:
        nnc_logger.warning("distance threshold: {}".format(second[1:]))

    return {first[0]: first[2],
            second[0]: second[2]}


class TemplateExtractor(object):
    """Turn a lexicalisation into a template with ``{m_subject}`` / ``{m_object}`` slots."""

    def __init__(self):

        self.logger = logging.getLogger('TemplateExtractor')

    def extract_template(self, text, triple):
        """Replace the subject and object mentions in ``text`` by slot markers.

        The text is parsed with the notebook-level ``nlp`` pipeline, the noun
        chunks closest to ``triple['m_subject']`` and ``triple['m_object']``
        are selected with ``nearest_noun_chunks``, and every occurrence of
        their (regex-escaped) text is replaced by ``{m_subject}`` or
        ``{m_object}`` respectively.

        Parameters
        ----------
        text : str, the sentence to templatise.
        triple : mapping with at least the keys ``'m_subject'`` and ``'m_object'``.

        Returns
        -------
        str, the parsed document's text with the two mentions replaced.

        Raises
        ------
        Exception
            Propagated from ``nearest_noun_chunks`` when the sentence does not
            have enough noun chunks.
        """
        doc = nlp(text)

        ncc = nearest_noun_chunks(doc, triple['m_subject'], triple['m_object'])

        self.logger.debug(ncc)

        # Regex alternation takes the first alternative that matches at a
        # position, so the longer chunk text must come first: otherwise a chunk
        # such as "Texas" would claim the start of "Texas University".
        # sorted() is stable, so equal lengths keep the subject-first order.
        slot_names = sorted(('m_subject', 'm_object'),
                            key=lambda name: len(ncc[name].text),
                            reverse=True)
        alternatives = "|".join("(?P<{}>{})".format(name, re.escape(ncc[name].text))
                                for name in slot_names)

        # re.sub compiles and caches the pattern itself; no explicit compile needed.
        return re.sub("({})".format(alternatives), replace_by_groupname, doc.text)
    

#TODO: search python template libraries
class Template(object):
    """A sentence template stored as a ``str.format`` style string."""

    def __init__(self, template_string):
        # Format string with named placeholders, e.g. '{m_subject} comes from {m_object}.'
        self.template_string = template_string

    def __str__(self):
        """Return the raw, unfilled template string."""
        return self.template_string

    def fill(self, triple):
        """Return the template with its placeholders filled from the ``triple`` mapping."""
        filled = self.template_string.format(**triple)
        return filled

In [246]:
logger.setLevel(logging.WARN)

# Draw one random single-triple entry and keep its first lexicalisation and triple.
e = train_1.sample()
lexe, triple = e.lexes()[0], e.triples()[0]

te = TemplateExtractor()
t = te.extract_template(lexe, triple)

# Sentence, triple and extracted template, separated by blank lines.
print(lexe, triple, t, sep='\n\n')



Bakewell tart comes from the Derbyshire Dales region.

{'m_object': 'Derbyshire_Dales', 'm_predicate': 'region', 'm_subject': 'Bakewell_tart'}

{m_subject} comes from {m_object}.


# Problems

* some objects or subjects aren't fully captured
    * idx = 5_37

# Let's build a template database

In [310]:
%%time
from collections import defaultdict

logger.setLevel(logging.ERROR)

# predicate -> set of distinct templates extracted for it
template_db = defaultdict(set)

# (row index, error message) for every lexicalisation that could not be turned
# into a template, so that missing predicates can be explained afterwards.
template_failures = []

# One row per (lexicalisation, triple) pair; presumably joined on the columns
# the two frames share -- TODO confirm against webnlg.py.
lexes_triples = pd.merge(train_1.ldf, train_1.mdf)

te = TemplateExtractor()

for ix, row in lexes_triples.iterrows():
    lexe = row['ltext']
    triple = {'m_subject': row['m_subject'],
              'm_object': row['m_object'],
              'm_predicate': row['m_predicate']
             }
    try:
        t = te.extract_template(lexe, triple)
    except Exception as ex:
        # Extraction is best effort: keep going, but record the failure
        # instead of discarding it silently.
        template_failures.append((ix, str(ex)))
        logger.debug("no template for row %s: %s", ix, ex)
    else:
        template_db[row['m_predicate']].add(t)

CPU times: user 2min 17s, sys: 2.2 s, total: 2min 20s
Wall time: 38.3 s


### how many predicates did we get templates for?

In [311]:
len(template_db)

225

### how many predicates are in the train_1 dataset?

In [312]:
train_1.mdf.m_predicate.unique().shape

(227,)

### how many templates per predicate?

In [320]:
# (predicate, number of distinct templates) pairs, in template_db order.
data_index = [(predicate, len(found)) for predicate, found in template_db.items()]
index = [predicate for predicate, _ in data_index]
data = [n_templates for _, n_templates in data_index]

# Template counts indexed by predicate, then their summary statistics.
stats_on_templates = pd.Series(data, index=index)
stats_on_templates.describe()

count    225.000000
mean      13.013333
std       21.639390
min        1.000000
25%        3.000000
50%        5.000000
75%       14.000000
max      170.000000
dtype: float64

### which predicates have the most templates?

In [322]:
stats_on_templates.nlargest(10)

country         170
isPartOf        164
runwayLength     96
location         93
leaderName       83
language         75
club             72
runwayName       66
leaderTitle      62
creator          54
dtype: int64

# Data Alignment

In [13]:
from nltk.stem import WordNetLemmatizer

def split(spo):
    """Split ``spo`` on single space characters and return the list of pieces.

    Consecutive spaces yield empty strings, as with ``str.split(' ')``.
    """
    pieces = spo.split(' ')
    return pieces

lemmatizer = WordNetLemmatizer()

def lemmatize(spo):
    """Lemmatize each space-separated piece of ``spo`` and join them back with spaces."""
    lemmas = map(lemmatizer.lemmatize, split(spo))
    return ' '.join(lemmas)

from nltk.stem.porter import PorterStemmer
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem import SnowballStemmer

# Name of the stemming algorithm to use: 'lancaster', 'porter', or anything
# else for the Snowball stemmer.
stemmer_ = 'lancaster'

# The selection compares the *name* (`stemmer_`); `stemmer` itself is the
# object being created and does not exist yet when the branches are evaluated.
if stemmer_ == 'lancaster':
    stemmer = LancasterStemmer()
elif stemmer_ == 'porter':
    stemmer = PorterStemmer()
else:
    # SnowballStemmer requires the language to be given explicitly.
    stemmer = SnowballStemmer('english')

def stem(spo):
    """Stem each space-separated piece of ``spo`` with the configured ``stemmer``.

    The stemmed pieces are joined back with single spaces.
    """
    stemmed = [stemmer.stem(piece) for piece in split(spo)]
    return ' '.join(stemmed)

# An upper-case ASCII letter directly preceded by a lower-case ASCII letter.
c_detect_camelcase = re.compile(r'(?<=[a-z])([A-Z])')
def split_by_camelcase(spo):
    """Insert a space before every upper-case letter that follows a lower-case one.

    For example ``'leaderParty'`` becomes ``'leader Party'``.
    """
    def _prefix_with_space(match):
        return ' ' + match.group(1)

    return c_detect_camelcase.sub(_prefix_with_space, spo)

# Characters that are replaced by a space (currently only the underscore).
c_chars_to_remove = re.compile(r'[_]')
def remove_unwanted_char(spo):
    """Return ``spo`` with every unwanted character replaced by a single space."""
    cleaned = re.sub(c_chars_to_remove, ' ', spo)
    return cleaned

def to_lower(spo):
    """Return a lower-cased copy of ``spo``."""
    lowered = spo.lower()
    return lowered

from functools import reduce

# Steps applied, in order, to a subject / predicate / object string.
spo_pipeline = [split_by_camelcase, remove_unwanted_char, to_lower, stem]
def preprocess_spo(spo):
    """Run ``spo`` through every step of ``spo_pipeline`` and return the result."""
    result = spo
    for step in spo_pipeline:
        result = step(result)
    return result
    
def preprocess_triple(triple):
    """Return a new dict with the same keys as ``triple`` and preprocessed values."""
    return dict((key, preprocess_spo(value)) for key, value in triple.items())

# Steps applied, in order, to a lexicalisation (sentence).
lexe_pipeline = [to_lower, stem]
def preprocess_lexe(lexe):
    """Run ``lexe`` through every step of ``lexe_pipeline`` and return the result."""
    result = lexe
    for step in lexe_pipeline:
        result = step(result)
    return result

In [14]:
def replace_sop(m):
    """Regex substitution callback: replace a match by ``<group_name>``.

    The name used is that of the first named group of ``m`` that captured a
    non-empty value; ``StopIteration`` propagates if there is none.
    """
    captured_names = (name for name, value in m.groupdict().items() if value)
    first_name = next(captured_names)
    return "<{}>".format(first_name)

def simple_align(triple, lexe):
    """Mark the triple's subject, predicate and object inside a lexicalisation.

    Both the triple values and the sentence are normalised first
    (``preprocess_triple`` / ``preprocess_lexe``); every occurrence of a
    normalised triple value in the normalised sentence is then replaced by
    ``<subject>``, ``<predicate>`` or ``<object>``.

    The triple values are literal text, so they are passed through
    ``re.escape`` before being placed in the pattern: characters such as
    ``.``, ``(`` or ``+`` in an entity name would otherwise be interpreted as
    regex syntax and either match the wrong text or make the pattern invalid.

    Parameters
    ----------
    triple : mapping with the keys ``'m_subject'``, ``'m_predicate'`` and
        ``'m_object'`` (string values).
    lexe : str, the sentence to align.

    Returns
    -------
    str, the normalised sentence with the matched parts replaced by markers.
    """
    preprocessed_triple = preprocess_triple(triple)

    preprocessed_lexe = preprocess_lexe(lexe)

    escaped_triple = {k: re.escape(v) for k, v in preprocessed_triple.items()}

    regex = '((?P<subject>{m_subject})|(?P<predicate>{m_predicate})|(?P<object>{m_object}))'.format(**escaped_triple)

    return re.compile(regex).sub(replace_sop, preprocessed_lexe)

In [15]:
c = simple_align(e.triples(kind='dict')[0], e.lexes()[0])
c

'the <object> ar the <predicate> in <subject>.'

In [53]:
# Matches any of the three markers produced by the alignment step.
c_search_spo = re.compile(r'((?P<object><object>)|(?P<predicate><predicate>)|(?P<subject><subject>))')

def count_spo(s):
    """Return the ``<subject>`` / ``<predicate>`` / ``<object>`` markers found in ``s``.

    Markers are returned as a list, in order of appearance and with repeats.
    """
    return [match.group(1) for match in c_search_spo.finditer(s)]

In [63]:
e = train.sample()
c = simple_align(e.triples(kind='dict')[0], e.lexes()[0])
c, count_spo(c), e.triples()[0]

('the character, <subject>es, was <predicate> by <object>.',
 ['<subject>', '<predicate>', '<object>'],
 {'m_object': 'Len_Wein', 'm_predicate': 'creator', 'm_subject': 'Aurakles'})

In [43]:
c

'<subject> is a <object>.'

In [44]:
c_search_spo.match(c).groupdict()

{'object': None, 'predicate': None, 'subject': '<subject>'}

In [52]:
c_search_spo.findall('<subject> <object> <predicate> <subject>')

[('<subject>', '', '', '<subject>'),
 ('<object>', '<object>', '', ''),
 ('<predicate>', '', '<predicate>', ''),
 ('<subject>', '', '', '<subject>')]

# Spacy

In [51]:
import spacy
%run ../script/spacy_util.py

nlp = spacy.load('en_core_web_lg')

In [15]:
e.triples()

[{'m_object': 'Left_Ecology_Freedom',
  'm_predicate': 'leaderParty',
  'm_subject': 'Gubbio'}]

In [63]:
# Draw a random entry, show its triples and its first lexicalisation.
e = train.sample()
pprint(e.triples())

print(e.lexes()[0])
doc = nlp(e.lexes()[0])

# Render the noun chunks once; the second, identical call only duplicated the output.
displacy_noun_chunks(doc)

[{'m_object': 'Spain', 'm_predicate': 'country', 'm_subject': 'Arròs_negre'},
 {'m_object': 'White_rice',
  'm_predicate': 'ingredient',
  'm_subject': 'Arròs_negre'}]
White rice is an ingredient of Arros negre which is a traditional dish from Spain.


In [24]:
doc_to_df(doc)

Unnamed: 0,Text,POS,Dep,Tag,Shape,Alpha,Stop,Head,Left,Right,Entity,EntIOB,Lemma
0,The,determiner,det,DT,Xxx,X,--,Freedom,The,The,,O,the
1,Left,proper noun,compound,NNP,Xxxx,X,--,Freedom,Left,Left,,O,left
2,Ecology,proper noun,compound,NNP,Xxxxx,X,--,Freedom,Ecology,Ecology,,O,ecology
3,Freedom,proper noun,nsubj,NNP,Xxxxx,X,--,are,The,Freedom,,O,freedom
4,are,verb,ROOT,VBP,xxx,X,--,are,The,.,,O,be
5,the,determiner,det,DT,xxx,X,--,party,the,the,,O,the
6,leading,verb,amod,VBG,xxxx,X,--,party,leading,leading,,O,lead
7,party,noun,attr,NN,xxxx,X,--,are,the,Gubbio,,O,party
8,in,adposition,prep,IN,xx,X,--,party,in,Gubbio,,O,in
9,Gubbio,proper noun,pobj,NNP,Xxxxx,X,--,in,Gubbio,Gubbio,GPE,B,gubbio


In [172]:
def mask_noun_chunks(doc):
    """Return ``doc.text`` with every noun chunk overwritten by ``#`` characters.

    Each chunk's character range ``[start_char, end_char)`` is replaced by
    exactly as many ``#`` as characters it covers. The mask length is taken
    from that range rather than from the chunk's string form, so the result
    always has the same length as the original text and later chunks keep
    their offsets.

    Parameters
    ----------
    doc : object exposing ``text`` and ``noun_chunks``; each chunk exposes
        ``start_char`` and ``end_char``.

    Returns
    -------
    str, the masked text.
    """
    masked_chars = list(doc.text)

    for nc in doc.noun_chunks:
        start, end = nc.start_char, nc.end_char
        masked_chars[start:end] = '#' * (end - start)

    return ''.join(masked_chars)

In [197]:
# One record per lexicalisation of 10 randomly sampled entries.
values = []

for _ in range(10):
    
    e = train.sample()
    
    for lexe in e.lexes():
        
        doc = nlp(lexe)
        
        # Keep the sentence, its noun chunks, the sentence with the chunks
        # masked out, and the triples of the entry it belongs to.
        values.append({
            'lexe': lexe,
            'noun_chunks': list(doc.noun_chunks),
            'masked': mask_noun_chunks(doc),
            'triples': e.triples()
        })

In [207]:
# Tabulate the collected records and write them to an HTML file for inspection.
df = pd.DataFrame(values)
df.to_html('oi.html')

In [205]:
train.sample().triples()

[{'m_object': 'Switzerland',
  'm_predicate': 'country',
  'm_subject': 'Accademia_di_Architettura_di_Mendrisio'},
 {'m_object': 'Meride',
  'm_predicate': 'neighboringMunicipality',
  'm_subject': 'Mendrisio'},
 {'m_object': 'Mendrisio',
  'm_predicate': 'city',
  'm_subject': 'Accademia_di_Architettura_di_Mendrisio'},
 {'m_object': 'Johann_Schneider-Ammann',
  'm_predicate': 'leaderName',
  'm_subject': 'Switzerland'}]