In [1]:
%run ../script/webnlg.py

import pandas as pd
import spacy
from textacy import similarity

pd.set_option('max_colwidth', 1000)

nlp = spacy.load('en_core_web_lg')

# Data Alignment

In [158]:
class RootDataAlignmentModel:
    
    def __init__(self, similarity_metric):
        
        self.similarity_metric = similarity_metric
        

    @staticmethod
    def get_span(doc, node):

        return doc[node.left_edge.i: node.right_edge.i + 1]
    @staticmethod
    def get_left_span(doc, node):

        return doc[node.left_edge.i: node.i + 1]
    @staticmethod
    def get_right_span(doc, node):

        return doc[node.i: node.right_edge.i + 1]
    @staticmethod
    def as_span(doc, node):

        return doc[node.i: node.i + 1]

    def get_distances(self, doc, data):

        distances, nodes = [], []

        roots = [token for token in doc if token.head == token]

        for root in roots:

            # root subtree
            root_span = get_span(doc, root)
            # root left subtree
            root_left_span = get_left_span(doc, root)
            # root right subtree
            root_right_span = get_right_span(doc, root)
            # root node
            root = as_span(doc, root)

            # test agains the node and its subtree
            for node in set((root, root_span, root_left_span, root_right_span)):

                nodes.append(node)

                distances_node = []

                for d in data.values():

                    distances_node.append(self.similarity_metric(d, node.text))

                distances.append(distances_node)

            roots.extend(root.lefts)
            roots.extend(root.rights)

        return pd.DataFrame(distances, index=nodes, columns=data.keys())

# Template Extraction

In [159]:
def overlaps(x1, x2, y1, y2):

    return max(0, min(x2, y2) - max(x1, y1)) > 0


class TemplateExtractionModel:
    
    def __init__(self, data_alignment_model):
    
        self.data_alignment_model = data_alignment_model
        

    def extract_template(self, text, data):

        doc = nlp(text)

        df = self.data_alignment_model.get_distances(doc, data)

        text_char = list(text)

        # subject
        #! BIAS: subject wins priority over distances tie
        m_subject_span = df.m_subject.nlargest(1).index.values[0]

        # replaces subject text by m_subject placeholder
        text_char[m_subject_span.start_char: m_subject_span.end_char] = 'm_subject'

        # object
        for span in df.m_object.sort_values(ascending=False).index.values:

            if overlaps(span.start_char, span.end_char,
                        m_subject_span.start_char, m_subject_span.end_char):

                continue

            # tests if the object occurs after the subject >
            #    if its the case, you have to adjust the indexes accordingly
            if m_subject_span.start_char > span.end_char:

                base = 0
            else:
                len_subject_text = m_subject_span.end_char - m_subject_span.start_char
                base = len('m_subject') - len_subject_text

            # replaces object text by m_object placeholder
            text_char[base + span.start_char: base + span.end_char] = 'm_object'

            break

        return ''.join(text_char)

# WebNLG

In [5]:
train = WebNLGCorpus.load('train')

train_1 = train.subset(ntriples=1)

In [160]:
e = train_1.sample(idx='0_210')

da = RootDataAlignmentModel(similarity.token_sort_ratio)
te = TemplateExtractionModel(da)
    
te.extract_template(e.ldf.ltext.values.tolist()[0], e.mdf[['m_subject', 'm_object']].to_dict(orient='records')[0])

'm_object is spoken in m_subject.'

In [7]:
e.ldf

Unnamed: 0,comment,idx,lid,ltext
524,good,0_210,Id1,The Faroese language is spoken in Denmark.
525,good,0_210,Id2,Denmark's language is Faroese.


In [8]:
e.mdf

Unnamed: 0,idx,mtext,m_subject,m_predicate,m_object
210,0_210,Denmark | language | Faroese_language,Denmark,language,Faroese_language


In [161]:
text = 'Eleanor Rigby picks up the rice in the church'
data = {'m_subject': 'Eleanor Rigby', 
        'm_object': 'rice'}

da = RootDataAlignmentModel(similarity.token_sort_ratio)
te = TemplateExtractionModel(da)

template = te.extract_template(text, data)

print(template)

template.format(**{'subject': 'Abelardo Vieira Mota',
          'object': 'car'})

m_subject picks up the m_object in the church


'm_subject picks up the m_object in the church'

# Train

In [10]:
from collections import defaultdict
import re

# replaces m_subject -> {m_subject} to be used as a python string template
c = re.compile('(m_subject|m_object)')

In [162]:
%%time

from collections import Counter

da = RootDataAlignmentModel(similarity.token_sort_ratio)
te = TemplateExtractionModel(da)

template_db = defaultdict(list)

#! BIAS: using only train_1 sentences
# for each sentence, extracts template
for entry in train_1:
    
    for text in entry.ldf.ltext.tolist():
        # to dictionary of s, o; [0] because to_dict returns a list of dicts(and, in this case, there
        #    will be only one element)
        data = entry.mdf[['m_subject', 'm_object']].to_dict(orient='records')[0]
        predicate = entry.mdf.m_predicate.values[0]

        template = te.extract_template(text, data)

        # puts placeholders
        template = c.sub(r'{\1}', template)

        # add to db
        template_db[predicate].append((entry.edf.idx.values[0], template))
    
# most frequent template
for k, templates in template_db.items():
    
    template_db[k] = Counter([v[1] for v in templates]).most_common(1)[0][0]
    
predicates_in_db = list(template_db.keys())

CPU times: user 3min 3s, sys: 8.24 s, total: 3min 11s
Wall time: 50.2 s


In [147]:
len(template_db)

227

In [163]:
template_db['language']

'{m_object} is spoken in {m_subject}.'

## Do I have one predicate for each predicate in test set?

In [19]:
test = WebNLGCorpus.load('test_no_lex')

In [21]:
predicates_in_test = set(test.mdf.m_predicate.tolist())
predicates_w_template = template_db.keys()

len("There are {} predicates in test which don't have a template".format(len(predicates_in_test ^ predicates_w_template)))

60

# So, let them fall back to the nearest predicate and then to baseline model

In [164]:
import logging

logger = logging.Logger('TemplateBasedModel')

unwanted_separators = re.compile(r'(\||_)')
unwanted_multiple_empty = re.compile(r'\s+')

def preprocess_triple(s):
    
    sep_changed = unwanted_separators.sub(' ', s)
    mult_empty_removed = unwanted_multiple_empty.sub(' ', sep_changed)
    
    return mult_empty_removed.replace('"', '')

def get_nearest_predicate(predicate):
    
    distances = [(in_db, similarity.jaro_winkler(predicate, in_db)) for in_db in predicates_in_db]
    
    return min(distances, key=lambda v: v[1])


def generate_sentences(entry):
    
    texts = []
    
    for i, triple in entry.mdf.iterrows():

        m_predicate = triple.m_predicate

        if m_predicate in template_db:
            
            template = template_db[m_predicate]

            preprocessed_triple = triple[['m_subject', 'm_object']].apply(preprocess_triple)

            text = template.format(**preprocessed_triple.to_dict())

        else:
            
            nearest_predicate, similarity = get_nearest_predicate(m_predicate)
            
            if similarity > .4:
                
                logger.warning("Fallback nearest predicate for predicate %s", m_predicate)
                
                template = template_db[nearest_predicate]

                preprocessed_triple = triple[['m_subject', 'm_object']].apply(preprocess_triple)

                text = template.format(**preprocessed_triple.to_dict())
                
            else:
                
               # logger.warning("Fallback baseline for predicate %s", m_predicate)

                text = preprocess_triple(triple.mtext)

        texts.append(text)
    
    return texts

In [165]:
test_sample = test.sample(idx='0_419')

test_sample.mdf

Unnamed: 0,idx,mtext,m_subject,m_predicate,m_object
639,0_419,"Acta_Palaeontologica_Polonica | ISSN_number | ""0567-7920""",Acta_Palaeontologica_Polonica,ISSN_number,"""0567-7920"""
640,0_419,Acta_Palaeontologica_Polonica | LCCN_number | 60040714,Acta_Palaeontologica_Polonica,LCCN_number,60040714
641,0_419,"Acta_Palaeontologica_Polonica | abbreviation | ""Acta Palaeontol. Pol.""",Acta_Palaeontologica_Polonica,abbreviation,"""Acta Palaeontol. Pol."""


In [166]:
logger.setLevel(logging.WARN)
generate_sentences(test_sample)

['The ISSN number of Acta Palaeontologica Polonica is 0567-7920.',
 'The LCCN number of Acta Palaeontologica Polonica is 60040714.',
 'Acta Palaeontologica Polonica is abbreviated Acta Palaeontol. Pol.. Pol.']

# Generating texts for test set

In [167]:
import codecs

logger.setLevel(logging.WARN)

with codecs.open('models/template_based/output.txt', 'w', 'utf-8') as f:
    
    for entry in test:
        
        entry_sentences = generate_sentences(entry)
        
        entry_text = ' '.join(entry_sentences)
        
        f.write(entry_text)
        f.write('\n')

In [168]:
!head -100 models/template_based/output.txt | tail -10

English language is spoken in Castle (novel).
Eric Flint was born in Burbank, California.
Farrar, Straus and Giroux is the parent company of the Macmillan Publishers Press.
One of John Cowper Powys notable works is Oliver A Glastonbury Romance.
Soho Press is located in United States.
Faber and Faber is the publisher of The Secret Scripture.
Asian Americans are an ethnic group in the United United States.
English language is spoken in United States.
Weymouth Sands by 1634: The A Glastonbury Romance Affair.
Rolando Maran manages A.C. Chievo Verona.


In [169]:
!python ../evaluation/webnlg2017/webnlg-automatic-evaluation-v2/evaluation_v2.py --team_name template_based --team_filepath models/template_based/output.txt --outdir models/template_based/

Files creating finished for:  template_based


In [170]:
!../evaluation/webnlg2017/webnlg-baseline-master/multi-bleu.perl -lc ../evaluation/webnlg2017/webnlg-automatic-evaluation/references/gold-all-cat-reference0.lex ../evaluation/webnlg2017/webnlg-automatic-evaluation/references/gold-all-cat-reference1.lex ../evaluation/webnlg2017/webnlg-automatic-evaluation/references/gold-all-cat-reference2.lex < models/template_based/template_based_all-cat.txt

BLEU = 37.01, 65.0/44.8/30.7/21.0 (BP=1.000, ratio=1.156, hyp_len=52216, ref_len=45189)
