In [1]:
import os
import sys

# Make the project modules under ../script importable from this notebook.
# Use the public `sys.path` rather than `os.sys.path`: the latter only works
# because the `os` module happens to import `sys` internally.
# The membership check keeps the cell idempotent when it is re-run.
if '../script' not in sys.path:
    sys.path.insert(0, '../script')

# Gets notebook name and commit hash

In [2]:
%%javascript
IPython.notebook.kernel.execute('nb_name = "' + IPython.notebook.notebook_name + '"')

<IPython.core.display.Javascript object>

In [3]:
nb_name = nb_name.rsplit('.')[0]

commit = !git rev-parse HEAD
commit = commit[0]

model_name = "{}_{}".format(nb_name, commit)

In [4]:
import os

# File names of this run's artefacts, all derived from the model name.
output_filename = "{}.txt".format(model_name)
model_filename = "{}".format(model_name)
log_filename = "{}.log".format(model_name)

# Generated texts, the saved model and the log all live in ../data/models.
output_filepath = os.path.join('../data/models', output_filename)
model_filepath = os.path.join('../data/models', model_filename)
log_filepath = os.path.join('../data/models', log_filename)

# Per-model scratch directory, passed to the evaluation script as its output dir.
model_temp_dir = os.path.join('../tmp/', model_name)

bleu_all_cat = os.path.join(model_temp_dir, "{}_all-cat.txt".format(model_name))

# makedirs(..., exist_ok=True) creates any missing parent (so ../tmp too) and
# is a no-op when the directory already exists, replacing the check-then-mkdir
# pairs. ../data/models is created as well: the log, model and output files
# are written there and opening them fails if the directory is missing.
os.makedirs('../data/models', exist_ok=True)
os.makedirs(model_temp_dir, exist_ok=True)

In [5]:
# Display the resolved model name (notebook name + commit hash) as a sanity check.
model_name

'5 - Model - Template Based - roots_4e9245cbbc05e4fdf0de404fbada20d8294127b2'

# Logs to file

In [6]:
import logging

# Route all records (DEBUG and above) to this run's log file; file mode 'w'
# truncates any log left over from a previous run of the same model name.
logging.basicConfig(
    filename=log_filepath,
    filemode='w',
    level=logging.DEBUG,
    format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
)

# Data

In [7]:
import spacy
from webnlg import WebNLGCorpus

# spaCy pipeline used later by the data alignment model.
nlp = spacy.load('en_core_web_lg')

# Load the three corpus splits, in the order train, dev, test.
train, dev, test = [WebNLGCorpus.load(split) for split in ('train', 'dev', 'test_no_lex')]

# BIAS: keep only the entries whose tripleset has exactly one triple.
train_1, dev_1 = train.subset(ntriples=1), dev.subset(ntriples=1)

### How many m_predicates exist in train+dev but not in train_1+dev_1?

In [8]:
# Predicates seen in the single-triple subsets vs. in the full train+dev data.
train_dev_1_predicates = set(train_1.mdf.m_predicate.unique()).union(dev_1.mdf.m_predicate.unique())
train_dev_predicates = set(train.mdf.m_predicate.unique()).union(dev.mdf.m_predicate.unique())

# Predicates that only ever occur in multi-triple entries.
all_not_in_1 = train_dev_predicates.difference(train_dev_1_predicates)

# A set has no stable iteration order, so sort before printing: the listing is
# then identical from one run of the notebook to the next.
print("There are {} m_predicates in train+dev not present in train_1+dev_1.\nThey are:\n\n{}".format(
      len(all_not_in_1), '\n'.join(sorted(all_not_in_1))))

There are 9 m_predicates in train+dev not present in train_1+dev_1.
They are:

has to its northwest
served
gemstone
5th_runway_SurfaceType
architecture
servingSize
neighboringMunicipality
numberOfRooms
has to its southeast


### How many m_predicates exist in test but not in train_1+dev_1?

In [9]:
# Distinct predicates in the test split, and those among them that never occur
# in the single-triple train/dev subsets.
predicates_in_test = set(test.mdf.m_predicate.unique())
test_not_in_1 = predicates_in_test - train_dev_1_predicates

"There are {} predicates in test, from {}, which don't have a template".format(
    len(test_not_in_1), len(predicates_in_test))

"There are 117 predicates in test, from 300, which don't have a template"

# If the predicate doesn't exist, fall back to baseline

In [10]:
%%time
from sentence_generation import FallBackPipelineSentenceGenerator, NearestPredicateTemplateSentenceGenerator
from sentence_aggregation import JustJoinSentencesSentenceAggregator
from sentence_generation import JustJoinTripleSentenceGenerator, MostFrequentTemplateSentenceGenerator
from data_alignment import RootDataAlignmentModel
from template_extraction import TemplateExtractor
from text_generation import IfAfterNthProcessPipelineTextGenerator
from webnlg import preprocess_triple_text
from collections import Counter
from itertools import chain
from textacy import similarity


# Data alignment model built from textacy's token_sort_ratio similarity and the
# spaCy pipeline `nlp`; it is handed to TemplateExtractor in the next cell.
da = RootDataAlignmentModel(similarity.token_sort_ratio, nlp)

CPU times: user 844 ms, sys: 375 ms, total: 1.22 s
Wall time: 1.52 s


In [11]:
%%time
te = TemplateExtractor(da)

#! BIAS: using only train_1 sentences
# for each sentence, extracts template
texts = chain.from_iterable((entry.ldf.ltext.tolist() for entry in chain(train_1, dev_1)))
# to dictionary of s, o; [0] because to_dict returns a list of dicts(and, in this case, there
#    will be only one element)
datas = chain.from_iterable(([entry.get_data(preprocessor=preprocess_triple_text)[0]] * entry.ldf.shape[0] for entry in chain(train_1, dev_1)))

te.fit(texts, datas)

TemplateExtractor.save(te, model_filepath)

CPU times: user 2min, sys: 1min 16s, total: 3min 16s
Wall time: 1min 35s


In [12]:
# Reload the extractor that was just saved to model_filepath.
# NOTE(review): the loaded copy is bound to `tte`, while the following cells
# keep using the in-memory `te` - confirm whether `tte` was meant to be used.
tte = TemplateExtractor.load(model_filepath)

In [23]:
# Template-based generator backed by the fitted extractor `te`.
mft = MostFrequentTemplateSentenceGenerator(te, preprocessor=preprocess_triple_text)

# Wraps `mft`, matching predicates by Levenshtein similarity with a 0.5 threshold.
# NOTE(review): the candidate predicate list is taken from the *test* split -
# confirm that this is the list the generator is meant to search.
npt = NearestPredicateTemplateSentenceGenerator(
    template_sentence_generator=mft,
    predicates=test.mdf.m_predicate.unique().tolist(),
    similarity_metric=similarity.levenshtein,
    threshold=0.5,
    preprocessor=preprocess_triple_text,
)

# Baseline generator, last in the fallback list.
jjt = JustJoinTripleSentenceGenerator(preprocessor=preprocess_triple_text)

# Generators are listed in fallback order; sentences are joined with a single space.
sent_pipe = FallBackPipelineSentenceGenerator([mft, npt, jjt])
text_agg = JustJoinSentencesSentenceAggregator(sep=' ')

def replace_subject(d):
    """Overwrite the 'm_subject' entry of `d` with a bare comma.

    The mapping is modified in place and that same object is returned.
    """
    placeholder = ','
    d['m_subject'] = placeholder
    return d

# Full text generator: sentences come from sent_pipe and are merged by text_agg.
# NOTE(review): presumably `processor` is applied only to triples after
# position `nth` (the example output below keeps the subject in the first
# sentence only) - confirm in text_generation.
pipe = IfAfterNthProcessPipelineTextGenerator(sent_pipe, text_agg, processor=replace_subject, nth=0)

# Example

In [24]:
# Pick one training entry (idx '32_39') to illustrate generation; the bare `e`
# on the last line displays it.
e = train.sample(idx='32_39')
e

Triple info: {'category': 'Astronaut', 'eid': 'Id40', 'idx': '32_39', 'ntriples': 4}

	Modified triples:

Buzz_Aldrin | birthPlace | Glen_Ridge,_New_Jersey
Buzz_Aldrin | was a crew member of | Apollo_11
Buzz_Aldrin | almaMater | "Massachusetts Institute of Technology, Sc.D. 1963"
Buzz_Aldrin | birthDate | "1930-01-20"


	Lexicalizations:

Buzz Aldrin was born in Glen Ridge, New Jersey on 1930-01-20. He attended the Massachusetts Institute of Technology obtaining a Sc.D in 1963. He was a crew member on Apollo 11.
Buzz Aldrin was born in Glen Ridge, New Jersey on 20 January 1930. He graduated from MIT ScD in 1963 and was a crew member of Apollo 11.

In [25]:
# Generate text for the example entry: it is wrapped in a one-element list and
# the single result is taken back out with [0].
pipe.generate([e.get_data()])[0]

'Buzz Aldrin was born in Glen Ridge, New Jersey. , served as a crew member of Apollo 11. Massachusetts Institute of Technology, Sc.D. 1963 is , almaMater. , was born on 1930-01-20.'

# Test evaluation

In [26]:
%%time 
import codecs

# Generate one text per test entry and write them, one per line and UTF-8
# encoded, to output_filepath; entries are fed to the pipeline lazily.
with codecs.open(output_filepath, 'w', 'utf-8') as f:
    
    for text in pipe.generate((entry.get_data() for entry in test)):
        
        f.write("{}\n".format(text))

CPU times: user 5.78 s, sys: 344 ms, total: 6.12 s
Wall time: 6.2 s


In [27]:
# Peek at lines 91-100 of the generated output file.
!head -100 "$output_filepath" | tail -10

English language is spoken in Castle (novel).
Eric Flint was born in Burbank, California.
Macmillan Publishers is the parent company of Farrar, Straus and Giroux.
A Glastonbury Romance is a notable work by John Cowper Powys.
Soho Press is located in United States.
The Secret Scripture is published by Faber and Faber.
Asian Americans are an ethnic group in United States.
English language is spoken in United States.
Weymouth Sands is preceded by A Glastonbury Romance.
The manager of A.C. Chievo Verona is Rolando Maran.


In [28]:
# Run the WebNLG 2017 evaluation script on the generated texts, passing the
# model name as the team name and model_temp_dir as the output directory.
# NOTE(review): `!python` uses whichever interpreter is first on PATH, which
# may differ from this kernel's - confirm.
!python ../evaluation/webnlg2017/webnlg-automatic-evaluation-v2/evaluation_v2.py --team_name "$model_name" --team_filepath "$output_filepath" --outdir "$model_temp_dir"

Files creating finished for:  5 - Model - Template Based - roots_4e9245cbbc05e4fdf0de404fbada20d8294127b2


In [29]:
# Score the file at bleu_all_cat (fed on stdin) with multi-bleu.perl against
# the three gold reference files; -lc is passed to the script.
# NOTE(review): presumably bleu_all_cat is written by the evaluation script in
# the previous cell - confirm. The references are read from
# `webnlg-automatic-evaluation`, while that cell runs the script from
# `webnlg-automatic-evaluation-v2` - confirm both directories are intended.
!../evaluation/webnlg2017/webnlg-baseline-master/multi-bleu.perl -lc ../evaluation/webnlg2017/webnlg-automatic-evaluation/references/gold-all-cat-reference0.lex ../evaluation/webnlg2017/webnlg-automatic-evaluation/references/gold-all-cat-reference1.lex ../evaluation/webnlg2017/webnlg-automatic-evaluation/references/gold-all-cat-reference2.lex < "$bleu_all_cat"

BLEU = 40.12, 75.6/49.0/32.5/21.5 (BP=1.000, ratio=1.037, hyp_len=45726, ref_len=44096)
