In [1]:
%run ../script/webnlg.py

import pandas as pd
pd.set_option('max_colwidth', 1000)

In [2]:
import spacy
from textacy import similarity

nlp = spacy.load('en_core_web_lg')

In [57]:
def get_span(doc, node):
    
    return doc[node.left_edge.i: node.right_edge.i + 1]

def get_left_span(doc, node):
    
    return doc[node.left_edge.i: node.i + 1]

def get_right_span(doc, node):
    
    return doc[node.i: node.right_edge.i + 1]


def as_span(doc, node):
    
    return doc[node.i: node.i + 1]


def get_distances(doc, data, distance_metric):
    
    distances, nodes = [], []

    roots = [token for token in doc if token.head == token]

    for root in roots:
        
        root_span = get_span(doc, root)
        root_left_span = get_left_span(doc, root)
        root_right_span = get_right_span(doc, root)
        root = as_span(doc, root)

        # test agains the node and its subtree
        for node in set((root, root_span, root_left_span, root_right_span)):
            
            nodes.append(node)
            
            distances_node = []

            for d in data.values():

                distances_node.append(distance_metric(d, node.text))

            distances.append(distances_node)

        roots.extend(root.lefts)
        roots.extend(root.rights)
        
    return pd.DataFrame(distances, index=nodes, columns=data.keys())

In [110]:
def generate_template(text, data):
    
    doc = nlp(text)

    df = get_distances(doc, data, similarity.token_sort_ratio)
    
    text_char = list(text)
    base = 0

    for idx, span in sorted(df.idxmax().iteritems(), key=lambda v: v[1].start_char):
        
        text_char[base + span.start_char: base + span.end_char] = idx

        base -= span.end_char - span.start_char - len(idx)
        
    return ''.join(text_char)

In [111]:
text = 'Eleanor Rigby picks up the rice in the church'
data = {'{subject}': 'Eleanor Rigby', 
        '{predicate}': 'pick up', 
        '{object}': 'rice'}
template = generate_template(text, data)

print(template)

template.format(**{'subject': 'Abelardo Vieira Mota',
          'predicate': 'drive',
          'object': 'car'})

{subject} {predicate} up the {object} in the church


'Abelardo Vieira Mota drive up the car in the church'

# WebNLG

In [11]:
%run ../script/webnlg.py

train = WebNLGCorpus.load('train')

train_1 = train.subset(ntriples=1)

In [122]:
s = train_1.sample()

text = s.ldf.ltext.values.tolist()[0]
data = s.mdf[['m_subject', 'm_predicate', 'm_object']].to_dict(orient='records')[0]

template = generate_template(text, data)

print(text)
print()
print(data)
print()
print(template)
print()
template.format(**{'subject': 'Abelardo Vieira Mota',
          'predicate': 'drive',
          'object': 'car'})

The comic character, Bolt, was created by comic book writer Gary Cohn.

{'m_subject': 'Bolt_(comicsCharacter)', 'm_predicate': 'creator', 'm_object': 'Gary_Cohn_(comics)'}

The m_object m_subject was m_predicate by comic book writer Gary Cohn.



'The m_object m_subject was m_predicate by comic book writer Gary Cohn.'