In [1]:
import os
import sys

# Put ../script at the front of the module search path so that modules stored
# there can be imported by the cells below. `sys` is imported directly instead
# of being reached through `os.sys`, which is not a documented attribute of `os`.
sys.path.insert(0, '../script')

# Gets notebook name and commit hash

In [2]:
%%javascript
IPython.notebook.kernel.execute('nb_name = "' + IPython.notebook.notebook_name + '"')

<IPython.core.display.Javascript object>

In [3]:
nb_name = nb_name.rsplit('.')[0]

commit = !git rev-parse HEAD
commit = commit[0]

model_name = "{}_{}".format(nb_name, commit)

In [4]:
import os

# File names of the artefacts produced by this run, all derived from model_name.
output_filename = "{}.txt".format(model_name)
model_filename = "{}".format(model_name)
log_filename = "{}.log".format(model_name)

# Generated texts, the saved model and the log all live under ../data/models.
output_filepath = os.path.join('../data/models', output_filename)
model_filepath = os.path.join('../data/models', model_filename)
log_filepath = os.path.join('../data/models', log_filename)

# Per-model scratch directory, passed to the evaluation script as its output dir.
model_temp_dir = os.path.join('../tmp/', model_name)

# File fed to the BLEU script in the last cell.
bleu_all_cat = os.path.join(model_temp_dir, "{}_all-cat.txt".format(model_name))

# Create every directory written to later. makedirs(exist_ok=True) replaces the
# check-then-mkdir pairs (no race, parents created too) and also covers
# ../data/models, which was previously assumed to exist.
os.makedirs('../data/models', exist_ok=True)
os.makedirs(model_temp_dir, exist_ok=True)

In [5]:
# Display the resolved model name (notebook name + commit) for the record.
model_name

'5 - Model - Template Based - roots_9c17c33ef5d01be49d733d80c2fca2e8708bc9e2'

# Logs to file

In [6]:
import logging

# Send log records of level DEBUG and above to this run's log file.
# filemode='w' opens the file for writing, so each configuration starts it afresh.
log_format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s'
logging.basicConfig(
    filename=log_filepath,
    filemode='w',
    level=logging.DEBUG,
    format=log_format,
)

# Data

In [7]:
import spacy
from webnlg import WebNLGCorpus

# spaCy pipeline 'en_core_web_lg'; it is handed to the data-alignment models further down.
nlp = spacy.load('en_core_web_lg')

# 'train' and 'dev' are loaded together as one corpus; the test corpus is the
# 'test_no_lex' variant.
train_dev = WebNLGCorpus.load(['train', 'dev'])
test = WebNLGCorpus.load('test_no_lex')

# BIAS: templates are learned only from the ntriples=[1] subset
# (presumably the entries whose tripleset has exactly one triple — confirm
# against WebNLGCorpus.subset).
train_dev_1 = train_dev.subset(ntriples=[1])

### How many m_predicates exist in train+dev but not in train_1+dev_1?

In [8]:
# Distinct m_predicate values of the ntriples=[1] subset and of the full train+dev corpus.
train_dev_1_predicates = set(train_dev_1.mdf.m_predicate.unique())
train_dev_predicates = set(train_dev.mdf.m_predicate.unique())

# Predicates present in train+dev but absent from the ntriples=[1] subset.
all_not_in_1 = train_dev_predicates - train_dev_1_predicates

report = "There are {} m_predicates in train+dev not present in train_1+dev_1.\nThey are:\n\n{}"
print(report.format(len(all_not_in_1), '\n'.join(all_not_in_1)))

There are 9 m_predicates in train+dev not present in train_1+dev_1.
They are:

gemstone
served
has to its southeast
has to its northwest
numberOfRooms
5th_runway_SurfaceType
architecture
neighboringMunicipality
servingSize


### How many m_predicates exist in test but not in train_1+dev_1?

In [9]:
# Distinct m_predicate values of the test corpus.
predicates_in_test = set(test.mdf.m_predicate.unique())

# Test predicates that do not occur in the ntriples=[1] training subset.
test_not_in_1 = predicates_in_test - train_dev_1_predicates

summary = "There are {} predicates in test, from {}, which don't have a template"
summary.format(len(test_not_in_1), len(predicates_in_test))

"There are 117 predicates in test, from 300, which don't have a template"

# If the predicate doesn't exist, fall back to baseline

In [10]:
from sentence_generation import FallBackPipelineSentenceGenerator, NearestPredicateTemplateSentenceGenerator
from sentence_aggregation import JustJoinSentencesSentenceAggregator
from sentence_generation import JustJoinTripleSentenceGenerator, MostFrequentTemplateSentenceGenerator
from discourse_structuring import MostFrequentFirstDiscourseStructuring, ChainDiscourseStructuring
from data_alignment import RootDataAlignmentModel, NGramDataAlignmentModel, SPODataAlignmentModel, FallBackDataAlignmentModel
from template_extraction import TemplateExtractor
from text_generation import IfAfterNthProcessPipelineTextGenerator
from lexicalization import LexicalizeAsAligned,LexicalizePreprocessed
from webnlg import preprocess_triple_text
from collections import Counter
from itertools import chain
from textacy import similarity

## Data Alignment model

In [11]:
# Three alignment models, each receiving the shared spaCy pipeline `nlp`.
rda = RootDataAlignmentModel(similarity.token_sort_ratio, nlp)
ngramda = NGramDataAlignmentModel(4, similarity.levenshtein, nlp)
spoda = SPODataAlignmentModel(nlp)

# Fallback chain; the list below is ordered
#   n-gram  >  Subject Predicate Object  >  dependency tree root
# (presumably the models are tried in list order — confirm against FallBackDataAlignmentModel).
# NOTE(review): this comment used to list SPO before n-gram, which did not match
# the list passed below — confirm which priority is intended.
da = FallBackDataAlignmentModel(models=[ngramda, spoda, rda])

In [12]:
%%time


if os.path.exists(model_filepath):
    te = TemplateExtractor.load(model_filepath)
else:
    te = TemplateExtractor(da)

    # texts from entries
    texts = train_dev_1.ldf.ltext.tolist()

    # to dictionary of s, o; [0] because to_dict returns a list of dicts(and, in this case, there
    #    will be only one element)

    # for each entry you have one or more verbalizations
    #    you have to repeat the tripleset N times, N = number of verbalizations
    datas = chain.from_iterable([\
        # data for entry
        [entry.get_data()[0]] \
             * \
        # number of verbalizations
        entry.ldf.shape[0] for entry in train_dev_1])

    te.fit(texts, datas)

    TemplateExtractor.save(te, model_filepath)

CPU times: user 0 ns, sys: 31.2 ms, total: 31.2 ms
Wall time: 32.4 ms


## Sentence generation model

In [13]:
# uses the most frequent template
mft = MostFrequentTemplateSentenceGenerator(te, preprocessor=preprocess_triple_text)
# uses the templates of the nearest predicate
#    nearest predicates are precalculated for test_not_in_1
#    NOTE(review): threshold=.7 is presumably a minimum levenshtein similarity — confirm
npt = NearestPredicateTemplateSentenceGenerator(template_sentence_generator=mft,
                                                similarity_metric=similarity.levenshtein,
                                                predicates=test_not_in_1,
                                                preprocessor=preprocess_triple_text,
                                                threshold=.7)
# baseline
jjt = JustJoinTripleSentenceGenerator(preprocessor=preprocess_triple_text)

# Fallback pipeline over the three generators
# (presumably tried in list order: mft, then npt, then jjt — confirm).
sent_pipe = FallBackPipelineSentenceGenerator([mft, npt, jjt])

## Sentence aggregation model

In [14]:
# Aggregator configured with a single space as separator
# (presumably it joins the generated sentences with it — confirm against the class).
text_agg = JustJoinSentencesSentenceAggregator(sep=' ')

## Discourse structuring model

In [15]:
# Alternative strategy, currently disabled:
#mff = MostFrequentFirstDiscourseStructuring(template_model=te)
# Starts from a node without incoming edges
#    and then does a breadth-first search
cds = ChainDiscourseStructuring()

## Lexicalization model

In [16]:
# le: lexicalizer constructed with the data-alignment model `da`.
le = LexicalizeAsAligned(da)
# lp: lexicalizer constructed without any model.
lp = LexicalizePreprocessed()

## Final model

In [17]:
def replace_subject(d):
    """Overwrite the subject of a triple with a comma placeholder.

    The mapping is modified in place and the very same object is returned.

    Args:
        d: mutable mapping holding the triple data; its 'm_subject' entry is
            set (or created) with the value ','.

    Returns:
        The same object `d`, now with d['m_subject'] == ','.
    """
    placeholder = ','
    d['m_subject'] = placeholder
    return d

# Starting from the second sentence, apply replace_subject to the data.
# NOTE(review): presumably nth=0 means "after sentence index 0" — confirm against
# IfAfterNthProcessPipelineTextGenerator.

# Variant using the alignment-based lexicalizer `le` (disabled):
#pipe = IfAfterNthProcessPipelineTextGenerator(sent_pipe, text_agg, cds, le, processor=replace_subject, nth=0)
pipe = IfAfterNthProcessPipelineTextGenerator(sent_pipe, text_agg, cds, lp, processor=replace_subject, nth=0)

# Example 1

In [18]:
# Fetch the test entry with idx '0_3' and display it.
e = test.sample(idx='0_3')
e

Triple info: {'category': 'Airport', 'eid': 'Id4', 'idx': '0_3', 'ntriples': 1}

	Modified triples:

Afonso_Pena_International_Airport | ICAO_Location_Identifier | "SBCT"


In [19]:
# generate() is given a one-element list of entry data; [0] picks the single result.
pipe.generate([e.get_data()])[0]

'The ICAO Location Identifier of Afonso Pena International Airport is SBCT.'

In [20]:
# Output of the alignment-based lexicalizer `le` for the first element of the entry's data
# (presumably its only triple, since this entry has ntriples == 1).
le.lexicalize(e.get_data()[0])

{'m_subject': 'Afonso_Pena_International_Airport',
 'm_predicate': 'ICAO_Location_Identifier',
 'm_object': '"SBCT"'}

# Example 2

In [21]:
# Take an entry from the test set without specifying an idx and generate its text.
# NOTE(review): presumably a random draw; no seed is set in this notebook, so the
# output may differ between runs — confirm against WebNLGCorpus.sample.
sample = test.sample()
pipe.generate([sample.get_data()])[0]

'Aaron Bertram associated Band/associated Musical Artist Suburban Legends Alcatraz , is in the genre of Ska punk. , stylistic Origin Ska'

In [22]:
# Display the entry used in the previous cell.
sample

Triple info: {'category': 'Artist', 'eid': 'Id1488', 'idx': '0_1487', 'ntriples': 3}

	Modified triples:

Ska_punk | stylisticOrigin | Ska
Aaron_Bertram | associatedBand/associatedMusicalArtist | Suburban_Legends
Aaron_Bertram | genre | Ska_punk


# Test evaluation

In [23]:
%%time 
import codecs

with codecs.open(output_filepath, 'w', 'utf-8') as f:
    
    for text in pipe.generate((entry.get_data() for entry in test)):
        
        f.write("{}\n".format(text))

CPU times: user 6 s, sys: 344 ms, total: 6.34 s
Wall time: 6.39 s


In [24]:
# Show lines 91-100 of the generated output file.
!head -100 "$output_filepath" | tail -10

The English language is spoken in novel Castle.
Eric Flint was born in Burbank, California.
Macmillan Publishers is the parent company of Farrar, Straus and Giroux.
One of John Cowper Powys notable works is Oliver A Glastonbury Romance.
Soho Press is in the United States.
The Secret Scripture was published by Faber and Faber.
Asian Americans are an ethnic group in the United States.
The English language is spoken in United States.
Weymouth Sands was preceded by A Glastonbury Romance.
The manager of A.C. Chievo Verona is Rolando Maran.


In [25]:
# Run the WebNLG 2017 evaluation script (v2) on the generated file, passing the
# model name as team name and model_temp_dir as output directory.
# NOTE(review): presumably this writes the "*_all-cat.txt" file that the BLEU step
# reads through bleu_all_cat — confirm.
!python ../evaluation/webnlg2017/webnlg-automatic-evaluation-v2/evaluation_v2.py --team_name "$model_name" --team_filepath "$output_filepath" --outdir "$model_temp_dir"

Files creating finished for:  5 - Model - Template Based - roots_9c17c33ef5d01be49d733d80c2fca2e8708bc9e2


In [26]:
# Score the all-category file (read from stdin) with multi-bleu.perl against three gold reference files.
# NOTE(review): -lc presumably requests lowercased scoring; the references are read from
# `webnlg-automatic-evaluation/` while the evaluation script run earlier lives in
# `webnlg-automatic-evaluation-v2/` — confirm both paths are intended.
!../evaluation/webnlg2017/webnlg-baseline-master/multi-bleu.perl -lc ../evaluation/webnlg2017/webnlg-automatic-evaluation/references/gold-all-cat-reference0.lex ../evaluation/webnlg2017/webnlg-automatic-evaluation/references/gold-all-cat-reference1.lex ../evaluation/webnlg2017/webnlg-automatic-evaluation/references/gold-all-cat-reference2.lex < "$bleu_all_cat"

BLEU = 39.89, 75.3/49.0/32.3/21.2 (BP=1.000, ratio=1.068, hyp_len=47712, ref_len=44677)
