### gets notebook name and commit hash

In [4]:
%%javascript
IPython.notebook.kernel.execute('nb_name = "' + IPython.notebook.notebook_name + '"')

<IPython.core.display.Javascript object>

In [5]:
nb_name = nb_name.rsplit('.')[0]

commit = !git rev-parse HEAD
commit = commit[0]

model_name = "{}_{}".format(nb_name, commit)

In [6]:
import os

# file names derived from the model identifier
output_filename = "{}.txt".format(model_name)
log_filename = "{}.log".format(model_name)

# generated texts and the run log live side by side under ../data/models
output_filepath = os.path.join('../data/models', output_filename)
log_filepath = os.path.join('../data/models', log_filename)

# scratch directory handed to the evaluation script
model_temp_dir = os.path.join('../tmp/', model_name)

# file later fed to multi-bleu.perl
bleu_all_cat = os.path.join(model_temp_dir, "{}_all-cat.txt".format(model_name))

# makedirs also creates a missing '../tmp' parent, and exist_ok makes the cell safe to re-run
os.makedirs(model_temp_dir, exist_ok=True)

### logs to file

In [90]:
import logging

# Route records of level DEBUG and above to the per-model log file.
# filemode='w' truncates the file when this configuration takes effect.
# NOTE(review): logging.basicConfig does nothing when the root logger already has
# handlers, so re-running this cell in a live kernel may not switch files — confirm.
logging.basicConfig(filename=log_filepath, 
                    level=logging.DEBUG, 
                    format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
                    filemode='w')

# Let's generate the template based model

In [8]:
# Execute the project scripts inside this kernel.
# NOTE(review): names used below but never imported in this notebook (pd, spacy,
# WebNLGCorpus, RootDataAlignmentModel, similarity, preprocess_triple_text) presumably
# come from these two scripts — confirm.
%run ../script/webnlg.py
%run ../script/data_alignment.py

# let pandas show up to 1000 characters per column instead of truncating long texts
pd.set_option('max_colwidth', 1000)

# spaCy pipeline used as a notebook-wide global by the cells below
nlp = spacy.load('en_core_web_lg')

# Template Extraction

In [9]:
class TemplateExtractionModel:
    """Turns a reference text into a template with `{m_subject}` / `{m_object}` slots.

    Parameters
    ----------
    data_alignment_model : object
        Must expose `align_data(doc, data)` returning `(subject_span, object_span)`.
        Each span must provide `start_char` and `end_char`; the object span may be
        falsy (e.g. None) when no object was aligned.
    nlp_model : callable, optional
        Callable that parses a text into the `doc` handed to `align_data`.
        When omitted, the notebook-level `nlp` is looked up at call time.
    """

    SUBJECT_PLACEHOLDER = '{m_subject}'
    OBJECT_PLACEHOLDER = '{m_object}'

    def __init__(self, data_alignment_model, nlp_model=None):

        self.data_alignment_model = data_alignment_model
        self.nlp_model = nlp_model

    def extract_template(self, text, data):
        """Return `text` with the aligned subject/object spans replaced by placeholders.

        Parameters
        ----------
        text : str
            Reference text to turn into a template.
        data : object
            Passed through unchanged to `align_data`.

        Returns
        -------
        str
            The template. `{m_object}` is absent when no object span was aligned.
        """
        parse = self.nlp_model if self.nlp_model is not None else nlp
        doc = parse(text)

        # TODO: monitor success of subj/obje alignment
        m_subject_span, m_object_span = self.data_alignment_model.align_data(doc, data)

        replacements = [(m_subject_span.start_char,
                         m_subject_span.end_char,
                         self.SUBJECT_PLACEHOLDER)]

        if m_object_span:
            replacements.append((m_object_span.start_char,
                                 m_object_span.end_char,
                                 self.OBJECT_PLACEHOLDER))

        # Substitute from the rightmost span to the leftmost one: a replacement then never
        # shifts the offsets of a span still to be processed, so no index correction is
        # needed (including when the two spans touch each other).
        template = text
        for start, end, placeholder in sorted(replacements, key=lambda r: r[0], reverse=True):
            template = template[:start] + placeholder + template[end:]

        return template

# WebNLG

In [17]:
# load the 'train' and 'dev' splits of the corpus
train = WebNLGCorpus.load('train')
dev = WebNLGCorpus.load('dev')

# BIAS: use only 1 tripleset size dataset
# (subsets selected with ntriples=1; presumably entries holding a single triple — confirm)
train_1 = train.subset(ntriples=1)
dev_1 = dev.subset(ntriples=1)

### How many m_predicates exist in train but not in train_1?

In [18]:
# predicates that occur in the full train split but never in its ntriples=1 subset
in_train_ni_train_1 = set(train.mdf.m_predicate.unique()).difference(train_1.mdf.m_predicate.unique())
in_train_ni_train_1

{'5th_runway_SurfaceType',
 'affiliations',
 'architecture',
 'broadcastedBy',
 'campus',
 'chief',
 'child',
 'firstAired',
 'gemstone',
 'has to its northwest',
 'has to its southeast',
 'mascot',
 'neighboringMunicipality',
 'numberOfRooms',
 'patronSaint',
 'protein',
 'series',
 'served',
 'servingSize'}

# Example

In [19]:
# pick the entry with idx '0_210' from the single-triple training subset
e = train_1.sample(idx='0_210')

# data alignment model with token_sort_ratio similarity metric
# PARAM/BIAS: similarity_metric
# TODO: train changing similarity_metric
da = RootDataAlignmentModel(similarity.token_sort_ratio)
# template extraction model built on top of the alignment model
te = TemplateExtractionModel(da)
    
# uses the first reference text
# (an entry can have more than 1 reference text)
first_lexicalization = e.ldf.ltext.values.tolist()[0]
# uses the first triple
first_triple = e.preprocessed_so()[0]

# display the alignment of the triple against the parsed reference text
da.render_aligned(nlp(first_lexicalization), first_triple)

In [20]:
# first m_predicate value of the sampled entry
e.mdf.m_predicate.values[0]

'language'

In [21]:
# extracts the template from the first reference text, using the first triple for alignment
template = te.extract_template(first_lexicalization, first_triple)
template

'{m_object} is spoken in {m_subject}.'

In [22]:
# fill the extracted template with a different subject/object pair
template.format(**{'m_subject': 'Brazil', 'm_object': 'Portuguese'})

'Portuguese is spoken in Brazil.'

In [23]:
# TODO: remove lexicalized information not present in data
# hand-written example: the text mentions "the church", which the data does not contain
text = 'Eleanor Rigby picks up the rice in the church'
data = {'m_subject': 'Eleanor Rigby', 
        'm_object': 'rice'}

da = RootDataAlignmentModel(similarity.token_sort_ratio)
te = TemplateExtractionModel(da)

template = te.extract_template(text, data)

print(template)

# fill the template with new values; the unaligned "in the church" stays in the output
template.format(**{'m_subject': 'Abelardo Vieira Mota', 'm_object': 'car'})

{m_subject} picks up the {m_object} in the church


'Abelardo Vieira Mota picks up the car in the church'

# Train

In [24]:
%%time

from collections import Counter, defaultdict
from itertools import chain

da = RootDataAlignmentModel(similarity.token_sort_ratio)
te = TemplateExtractionModel(da)

template_db = defaultdict(list)

#! BIAS: using only train_1 sentences
# for each sentence, extracts template
for entry in chain(train_1, dev_1):
    
    for text in entry.ldf.ltext.tolist():
        # to dictionary of s, o; [0] because to_dict returns a list of dicts(and, in this case, there
        #    will be only one element)
        data = entry.preprocessed_so()[0]
        predicate = data['m_predicate']

        template = te.extract_template(text, data)

        # add to db
        template_db[predicate].append(template)
    
# most frequent template
# BIAS: not necessarily are the better ones -> they can be a flawed one, like the ones without m_object
for k, templates in template_db.items():
    
    template_db[k] = Counter(templates).most_common(1)[0][0]
    
predicates_in_db = list(template_db.keys())

CPU times: user 3min 32s, sys: 3.11 s, total: 3min 35s
Wall time: 59.7 s


In [25]:
# number of predicates that received a template
len(template_db)

237

In [26]:
# template selected for the 'language' predicate
template_db['language']

'{m_object} is spoken in {m_subject}.'

## Do I have one template for each predicate in the test set?

In [27]:
# load the 'test_no_lex' split
test = WebNLGCorpus.load('test_no_lex')

In [28]:
# every distinct m_predicate value seen in either the train or the dev split
predicates_in_train_dev = set(train.mdf.m_predicate.tolist()) | set(dev.mdf.m_predicate.tolist())

In [29]:
# number of distinct predicates across train and dev
len(predicates_in_train_dev)

246

In [30]:
# TODO: how to deal with?
# test predicates are preprocessed before being compared with the template_db keys
predicates_in_test = set(test.mdf.m_predicate.apply(preprocess_triple_text).tolist())
predicates_w_template = template_db.keys()

# test predicates for which no template was extracted
predicates_without_template = predicates_in_test.difference(predicates_w_template)

"There are {} predicates in test, from {}, which don't have a template".format(
    len(predicates_without_template),
    len(predicates_in_test))

"There are 117 predicates in test, from 300, which don't have a template"

117 predicates in the test set have no extracted template. Why?

# So, let them fall back to the nearest predicate and then to baseline model

In [None]:
# Obtain the logger here instead of relying on a cell further down having been run first:
# on a fresh kernel `logger` would otherwise be undefined at this point.
# logging.getLogger returns the same object for the same name, so later lookups of
# "Generate Sentence" share this instance.
logger = logging.getLogger("Generate Sentence")
logger.setLevel(logging.INFO)

from spacy.lang.en.stop_words import STOP_WORDS

def remove_stops(t):
    """Return `t` without the whitespace-separated tokens found in STOP_WORDS.

    The kept tokens are re-joined with single spaces; the membership test is an exact
    (case-sensitive) match against STOP_WORDS.
    """
    kept_tokens = (token for token in t.split() if token not in STOP_WORDS)

    return ' '.join(kept_tokens)

# raw test predicates that never occur in train_1 ...
predicates_not_in_train_1 = set(test.mdf.m_predicate.unique()).difference(train_1.mdf.m_predicate.unique())
# ... then preprocessed; note the name is rebound from a set to a list here
predicates_not_in_train_1 = [preprocess_triple_text(p) for p in predicates_not_in_train_1]

def get_nearest_predicate(predicate):
    """Return the `(predicate_in_db, similarity)` pair most similar to `predicate`.

    Both sides have their stop words removed and are parsed with `nlp` before being
    compared with `similarity.word2vec`. Candidates come from `predicates_in_db`.

    Raises
    ------
    ValueError
        If `predicates_in_db` is empty (raised by `max`).

    Note: a later cell defines another function with this same name and a different
    signature; whichever cell ran last wins.
    """
    # the query side does not depend on the candidate, so clean and parse it only once
    doc_predicate = nlp(remove_stops(predicate))

    similarities = [
        (in_db, similarity.word2vec(doc_predicate, nlp(remove_stops(in_db))))
        for in_db in predicates_in_db
    ]

    return max(similarities, key=lambda v: v[1])

# test predicate (preprocessed) -> (nearest predicate that has a template, similarity)
template_nearest_db = {}

for m_predicate in predicates_not_in_train_1:
    
    # progress message, emitted through the root logger ("Processando" = "Processing")
    logging.info("Processando {}".format(m_predicate))
    
    nearest, sim = get_nearest_predicate(m_predicate)

    template_nearest_db[m_predicate] = (nearest, sim)
    
    # completion message ("Processado" = "Processed")
    logging.info("Processado {}".format(m_predicate))

In [149]:
# logger used by generate_sentences to report baseline fallbacks
logger = logging.getLogger("Generate Sentence")

# a nearest predicate's template is reused only when its similarity is strictly above this value
PREDICATE_NEAREST_SIMILARITY_THRESHOLD = .9

# gets the nearest predicate in template_db
# BIAS: similarity metric for predicate fallback
# TODO: parameterize
def get_nearest_predicate(predicate, similarity_metric=None):
    """Return the `(predicate_in_db, similarity)` pair most similar to `predicate`.

    Parameters
    ----------
    predicate : str
        Predicate to look up.
    similarity_metric : callable, optional
        Called as `similarity_metric(predicate, in_db)` for every candidate in
        `predicates_in_db`. When omitted, both sides are stripped of stop words, parsed
        with `nlp` and compared with `similarity.word2vec`, i.e. the behaviour of the
        one-argument function of the same name defined in an earlier cell. This default
        keeps one-argument callers working after this redefinition has replaced that
        earlier function.

    Raises
    ------
    ValueError
        If `predicates_in_db` is empty (raised by `max`).
    """
    if similarity_metric is None:
        doc_predicate = nlp(remove_stops(predicate))
        distances = [(in_db, similarity.word2vec(doc_predicate, nlp(remove_stops(in_db))))
                     for in_db in predicates_in_db]
    else:
        distances = [(in_db, similarity_metric(predicate, in_db)) for in_db in predicates_in_db]

    return max(distances, key=lambda v: v[1])

# generates the sentences
def generate_sentences(entry, similarity_metric):
    """Generate one sentence per triple of `entry`.

    For each triple the template is resolved in this order:
      1. the template stored in `template_db` for the triple's predicate;
      2. the template of the nearest predicate recorded in `template_nearest_db`, when
         its similarity is strictly above PREDICATE_NEAREST_SIMILARITY_THRESHOLD;
      3. the baseline '{m_subject} {m_predicate} {m_object}' (a warning is logged).
    A predicate missing from both dictionaries goes to the baseline instead of raising
    KeyError.

    For every triple after the first, a template-based sentence is rendered with ',' in
    place of the subject; the baseline always uses the real subject. The triple dicts
    returned by `entry.preprocessed_so()` are not modified.

    Parameters
    ----------
    entry : object
        Must expose `preprocessed_so()` returning dicts with the keys 'm_subject',
        'm_predicate' and 'm_object'.
    similarity_metric : object
        Currently unused; kept so existing callers keep working.

    Returns
    -------
    list of str
        One sentence per triple, in triple order.
    """
    texts = []

    for i, triple in enumerate(entry.preprocessed_so()):

        m_predicate = triple['m_predicate']

        # `in` + indexing (rather than a bare lookup) never inserts into a defaultdict
        template = template_db[m_predicate] if m_predicate in template_db else None

        if template is None:

            nearest_predicate, sim = template_nearest_db.get(m_predicate, (None, float('-inf')))

            if nearest_predicate is not None and sim > PREDICATE_NEAREST_SIMILARITY_THRESHOLD:
                template = template_db[nearest_predicate]

        if template is None:

            logger.warning("Fallback baseline for predicate %s", m_predicate)

            text = '{m_subject} {m_predicate} {m_object}'.format(**triple)

        else:

            # work on a copy so the caller's triple keeps its real subject
            slots = dict(triple, m_subject=',') if i > 0 else triple

            text = template.format(**slots)

        texts.append(text)

    return texts

In [111]:
# pick the test entry with idx '0_419' and display its mdf table
test_sample = test.sample(idx='0_419')

test_sample.mdf

Unnamed: 0,idx,mtext,m_subject,m_predicate,m_object
639,0_419,"Acta_Palaeontologica_Polonica | ISSN_number | ""0567-7920""",Acta_Palaeontologica_Polonica,ISSN_number,"""0567-7920"""
640,0_419,Acta_Palaeontologica_Polonica | LCCN_number | 60040714,Acta_Palaeontologica_Polonica,LCCN_number,60040714
641,0_419,"Acta_Palaeontologica_Polonica | abbreviation | ""Acta Palaeontol. Pol.""",Acta_Palaeontologica_Polonica,abbreviation,"""Acta Palaeontol. Pol."""


In [112]:
# only warnings and above (e.g. the baseline-fallback message) are reported from here on
logger.setLevel(logging.WARNING)
# NOTE(review): generate_sentences as defined above does not read similarity_metric
generate_sentences(test_sample, similarity_metric=similarity.jaro_winkler)

['The ISSN number of Acta Palaeontologica Polonica is 0567-7920.',
 'The LCCN number of Acta Palaeontologica Polonica is 60040714.',
 'Acta Palaeontologica Polonica is abbreviated to Acta Palaeontol. Pol..']

# Generating texts for test set

In [129]:
# number of test predicates with a precomputed nearest-predicate fallback
len(template_nearest_db)

122

In [150]:
%%time 

import codecs

with codecs.open(output_filepath, 'w', 'utf-8') as f:
    
    for entry in test:
        
        # generates one sentence per triple
        entry_sentences = generate_sentences(entry, similarity.jaro_winkler)
        
        # BIAS: aggregation
        # TODO: how to deal with?
        entry_text = ' '.join(entry_sentences)
        
        f.write(entry_text)
        f.write('\n')

CPU times: user 7.62 s, sys: 172 ms, total: 7.8 s
Wall time: 7.9 s


In [151]:
# show lines 91-100 of the generated output file
!head -100 "$output_filepath" | tail -10

English language is spoken in Castle (novel).
Eric Flint was born in Burbank, California.
Macmillan Publishers is the parent company of Farrar, Straus and Giroux.
One of John Cowper Powys notable works is A Glastonbury Romance.
Soho Press is located in United States.
The Secret Scripture is published by Faber and Faber.
Asian Americans are an ethnic group in United States.
English language is spoken in United States.
Weymouth Sands is preceded by A Glastonbury Romance.
The manager of A.C. Chievo Verona is Rolando Maran.


In [152]:
# run the WebNLG 2017 evaluation script on the generated file, using model_temp_dir as its output directory
# NOTE(review): presumably this is what creates the "$bleu_all_cat" file scored in the next cell — confirm
!python ../evaluation/webnlg2017/webnlg-automatic-evaluation-v2/evaluation_v2.py --team_name "$model_name" --team_filepath "$output_filepath" --outdir "$model_temp_dir"

Files creating finished for:  5 - Model - Template Based - roots_604049f5126c8e7723be2bcc42a5b3ba96fb29e8


In [153]:
# score the all-categories hypothesis file (read from stdin) against the three gold reference files
# NOTE(review): -lc is presumably multi-bleu.perl's lowercase option — confirm
!../evaluation/webnlg2017/webnlg-baseline-master/multi-bleu.perl -lc ../evaluation/webnlg2017/webnlg-automatic-evaluation/references/gold-all-cat-reference0.lex ../evaluation/webnlg2017/webnlg-automatic-evaluation/references/gold-all-cat-reference1.lex ../evaluation/webnlg2017/webnlg-automatic-evaluation/references/gold-all-cat-reference2.lex < "$bleu_all_cat"

BLEU = 38.75, 73.3/47.6/31.4/20.6 (BP=1.000, ratio=1.059, hyp_len=47216, ref_len=44580)
