# Data

In [47]:
import os

os.sys.path.insert(0, '../script')

from evaluation import evaluate_model

from content_selection import SelectAllContentSelector
from discourse_structuring import DoesntSortDiscourseStructurer
from sentence_aggregation import OneSentenceSentenceAggregator
from lexicalization import PreprocessLexicalizer, preprocess_so
from sentence_generation import MostFrequentTemplateSentenceGenerator, FallBackPipelineSentenceGenerator, JustJoinTripleSentenceGenerator
from text_generation import TextGenerator
from data_alignment import NGramDataAlignmentModel
from template_extraction import TemplateExtractor

import spacy
from textacy import similarity

# spaCy pipeline loaded through the 'en' shortcut; this single `nlp` object is
# passed to every data-alignment model built in this notebook.
nlp = spacy.load('en')

In [15]:
from webnlg_corpus import webnlg

# Load the corpus registered under the name 'webnlg_challenge_2017'.
corpus = webnlg.load('webnlg_challenge_2017')

# Keep only the 'train' and 'dev' datasets.
train_dev = corpus.subset(datasets=['train', 'dev'])

# Further restrict to ntriples=[1] (presumably entries made of a single triple
# -- confirm against webnlg_corpus.subset).
train_dev_1 = train_dev.subset(ntriples=[1])

In [61]:
TEMPLATES_FILE = '../model/ngram_3_levenshtein_templates'

# Reuse the cached template database when the file exists; otherwise extract
# the templates from `train_dev_1` and cache them for the next run.
if os.path.isfile(TEMPLATES_FILE):
    template_db = TemplateExtractor.load(TEMPLATES_FILE)
else:
    te = TemplateExtractor(
            data_alignment_model=NGramDataAlignmentModel(3, similarity.levenshtein, nlp))

    template_db = te.extract(train_dev_1)

    # The previous call passed only a path, so nothing was handed over to be
    # saved, and it repeated the path literal instead of using TEMPLATES_FILE.
    # NOTE(review): assumes `save(obj, path)` is the counterpart of
    # `load(path)` and that the object to persist is the template database
    # that `load` returns above -- confirm against template_extraction.
    TemplateExtractor.save(template_db, TEMPLATES_FILE)

# Assemble the text generator from one model per pipeline stage. Sentence
# generation is a fallback pipeline: the template generator fitted on
# `template_db` is listed first, the triple-joining generator second.
model = TextGenerator(
    content_selection_model=SelectAllContentSelector(),
    discourse_structuring_model=DoesntSortDiscourseStructurer(),
    sentence_aggregation_model=OneSentenceSentenceAggregator(),
    lexicalization_model=PreprocessLexicalizer(preprocess=preprocess_so),
    sentence_generation_model=FallBackPipelineSentenceGenerator([
                                MostFrequentTemplateSentenceGenerator().fit(template_db),
                                JustJoinTripleSentenceGenerator()])
)

In [62]:
# Evaluate the assembled generator; the second argument is presumably the
# label under which this run is reported -- confirm in evaluation.py.
evaluate_model(model, '1-predicate-template')

{'bleu': 31.07, 'meteor': 0.303327810176281, 'ter': 0.6234476919820661}

In [16]:
import os

os.sys.path.insert(0, '../script')

from sentence_generation import FallBackPipelineSentenceGenerator, NearestPredicateTemplateSentenceGenerator
from sentence_aggregation import OneSentenceAggregator
from sentence_generation import JustJoinTripleSentenceGenerator, MostFrequentTemplateSentenceGenerator
from discourse_structuring import MostFrequentFirstDiscourseStructuring, ChainDiscourseStructuring
from data_alignment import RootDataAlignmentModel, NGramDataAlignmentModel, SPODataAlignmentModel, FallBackDataAlignmentModel
from template_extraction import TemplateExtractor
from text_generation import IfAfterNthProcessPipelineTextGenerator
from lexicalization import LexicalizeAsAligned,LexicalizePreprocessed
from webnlg import preprocess_triple_text
from collections import Counter
from itertools import chain
from textacy import similarity

ModuleNotFoundError: No module named 'networkx'

## Data Alignment model

In [34]:
%%time

from itertools import product

rda = RootDataAlignmentModel(similarity.levenshtein, nlp)
ngramda = NGramDataAlignmentModel(3, similarity.levenshtein, nlp)

# Subject Predicate Object
# > Ngram 
# > > Dependency tree Root
da1 = FallBackDataAlignmentModel(models=[ngramda, rda])
da2 = FallBackDataAlignmentModel(models=[rda, ngramda])


das = [da1, da2]

tes = []

for i, (da, threshold) in enumerate(product(das, [.3, .5, .8])):
    
    filepath = f'{MODEL_DIR}{i}'

    if os.path.exists(filepath):
        te = TemplateExtractor.load(filepath)
    else:
        te = TemplateExtractor(da)

        # texts from entries
        texts = train_dev_1.ldf.ltext.tolist()

        # to dictionary of s, o; [0] because to_dict returns a list of dicts(and, in this case, there
        #    will be only one element)

        # for each entry you have one or more verbalizations
        #    you have to repeat the tripleset N times, N = number of verbalizations
        datas = chain.from_iterable([\
            # data for entry
            [entry.get_data()[0]] \
                 * \
            # number of verbalizations
            entry.ldf.shape[0] for entry in train_dev_1])

        te.fit(texts, datas)

        TemplateExtractor.save(te, filepath)
        
    tes.append(te)
    
    
    # uses the most frequente template
    mft = MostFrequentTemplateSentenceGenerator(te, preprocessor=preprocess_triple_text)
    # uses the nearest predicate templates
    #    precalculate nearests for test_not_in_1
    npt = NearestPredicateTemplateSentenceGenerator(template_sentence_generator=mft,
                                                    similarity_metric=similarity.levenshtein,
                                                    predicates=test_not_in_1,
                                                    preprocessor=preprocess_triple_text,
                                                    threshold=threshold)
    # baseline
    jjt = JustJoinTripleSentenceGenerator(preprocessor=preprocess_triple_text)

    sent_pipe = FallBackPipelineSentenceGenerator([mft, npt, jjt])

    text_agg = JustJoinSentencesSentenceAggregator(sep=' ')

    #mff = MostFrequentFirstDiscourseStructuring(template_model=te)
    # Starts from a node without incoming vertices
    #    and then do a Breadth first search
    cds = ChainDiscourseStructuring()

    le = LexicalizeAsAligned(da)
    lp = LexicalizePreprocessed()

    def replace_subject(d):

        d['m_subject'] = ','

        return d

    # starting from second sentence, apply replace_subject function over data :)

    #pipe = IfAfterNthProcessPipelineTextGenerator(sent_pipe, text_agg, cds, le, processor=replace_subject, nth=0)
    pipe = IfAfterNthProcessPipelineTextGenerator(sent_pipe, text_agg, cds, lp, processor=replace_subject, nth=0)

    import codecs

    with codecs.open(filepath, 'w', 'utf-8') as f:

        for text in pipe.generate((entry.get_data() for entry in test)):

            f.write("{}\n".format(text))

CPU times: user 19min 16s, sys: 8min 14s, total: 27min 31s
Wall time: 10min 37s


In [35]:
for i, (da, threshold) in enumerate(product(das, [.3, .5, .8])):
    
    filepath = f'{MODEL_DIR}{i}'
    model_namee = f'{model_name}{i}'

    !python ../evaluation/webnlg2017/webnlg-automatic-evaluation-v2/evaluation_v2.py --team_name "$model_namee" --team_filepath "$filepath" --outdir "$model_temp_dir"

Files creating finished for:  5 - Model - Template Based - roots_310dd152c335dfb79d9d63de09095d7f70ce7c7c0
Files creating finished for:  5 - Model - Template Based - roots_310dd152c335dfb79d9d63de09095d7f70ce7c7c1
Files creating finished for:  5 - Model - Template Based - roots_310dd152c335dfb79d9d63de09095d7f70ce7c7c2
Files creating finished for:  5 - Model - Template Based - roots_310dd152c335dfb79d9d63de09095d7f70ce7c7c3
Files creating finished for:  5 - Model - Template Based - roots_310dd152c335dfb79d9d63de09095d7f70ce7c7c4
Files creating finished for:  5 - Model - Template Based - roots_310dd152c335dfb79d9d63de09095d7f70ce7c7c5
Files creating finished for:  5 - Model - Template Based - roots_310dd152c335dfb79d9d63de09095d7f70ce7c7c6
Files creating finished for:  5 - Model - Template Based - roots_310dd152c335dfb79d9d63de09095d7f70ce7c7c7
Files creating finished for:  5 - Model - Template Based - roots_310dd152c335dfb79d9d63de09095d7f70ce7c7c8


In [36]:
for i, (da, threshold) in enumerate(product(das, [.3, .5, .8])):
    
    bleu_all_cat_filepath = os.path.join(model_temp_dir, f"{model_name}{i}_all-cat.txt")

    !../evaluation/webnlg2017/webnlg-baseline-master/multi-bleu.perl -lc ../evaluation/webnlg2017/webnlg-automatic-evaluation/references/gold-all-cat-reference0.lex ../evaluation/webnlg2017/webnlg-automatic-evaluation/references/gold-all-cat-reference1.lex ../evaluation/webnlg2017/webnlg-automatic-evaluation/references/gold-all-cat-reference2.lex < "$bleu_all_cat_filepath"

BLEU = 36.97, 70.8/45.1/29.9/19.6 (BP=1.000, ratio=1.138, hyp_len=52032, ref_len=45719)
BLEU = 38.57, 73.4/47.3/31.2/20.5 (BP=1.000, ratio=1.097, hyp_len=49259, ref_len=44910)
BLEU = 38.94, 74.1/47.8/31.4/20.7 (BP=1.000, ratio=1.085, hyp_len=48519, ref_len=44714)
BLEU = 32.02, 65.7/39.7/25.3/16.0 (BP=1.000, ratio=1.175, hyp_len=53718, ref_len=45703)
BLEU = 33.33, 68.4/41.6/26.2/16.6 (BP=1.000, ratio=1.129, hyp_len=50663, ref_len=44889)
BLEU = 33.80, 69.2/42.2/26.5/16.9 (BP=1.000, ratio=1.113, hyp_len=49718, ref_len=44672)
BLEU = 39.40, 74.6/48.1/32.1/20.9 (BP=1.000, ratio=1.061, hyp_len=47444, ref_len=44702)
BLEU = 40.52, 76.6/49.8/32.9/21.5 (BP=1.000, ratio=1.030, hyp_len=45505, ref_len=44163)
BLEU = 40.84, 77.2/50.2/33.1/21.7 (BP=1.000, ratio=1.021, hyp_len=44946, ref_len=44016)


In [18]:
# Individual alignment models. The root model is given
# `similarity.token_sort_ratio`, the n-gram model `4` and
# `similarity.levenshtein`; all three share the spaCy pipeline `nlp`.
rda = RootDataAlignmentModel(similarity.token_sort_ratio, nlp)
ngramda = NGramDataAlignmentModel(4, similarity.levenshtein, nlp)
spoda = SPODataAlignmentModel(nlp)

# Fallback chain, in the order passed to `models`:
#   n-gram > subject-predicate-object > dependency-tree root
# (presumably tried in list order -- confirm in FallBackDataAlignmentModel)
da = FallBackDataAlignmentModel(models=[ngramda, spoda, rda])

In [19]:
%%time


if os.path.exists(MODEL_DIR):
    te = TemplateExtractor.load(MODEL_DIR)
else:
    te = TemplateExtractor(da)

    # texts from entries
    texts = train_dev_1.ldf.ltext.tolist()

    # to dictionary of s, o; [0] because to_dict returns a list of dicts(and, in this case, there
    #    will be only one element)

    # for each entry you have one or more verbalizations
    #    you have to repeat the tripleset N times, N = number of verbalizations
    datas = chain.from_iterable([\
        # data for entry
        [entry.get_data()[0]] \
             * \
        # number of verbalizations
        entry.ldf.shape[0] for entry in train_dev_1])

    te.fit(texts, datas)

    TemplateExtractor.save(te, MODEL_DIR)

CPU times: user 2min 6s, sys: 1min 14s, total: 3min 21s
Wall time: 1min 3s


## Sentence generation model

In [20]:
# Baseline generator, listed last in the fallback pipeline.
jjt = JustJoinTripleSentenceGenerator(preprocessor=preprocess_triple_text)

# Uses the most frequent template from the fitted extractor `te`.
mft = MostFrequentTemplateSentenceGenerator(te, preprocessor=preprocess_triple_text)

# Uses the templates of the nearest predicate; nearest predicates are
# precalculated for `test_not_in_1`.
npt = NearestPredicateTemplateSentenceGenerator(
    template_sentence_generator=mft,
    predicates=test_not_in_1,
    similarity_metric=similarity.levenshtein,
    threshold=.7,
    preprocessor=preprocess_triple_text,
)

# Generators are listed as: most-frequent template, nearest predicate, baseline.
sent_pipe = FallBackPipelineSentenceGenerator([mft, npt, jjt])

## Sentence aggregation model

In [21]:
# Sentence aggregator configured with a single space as separator.
# NOTE(review): JustJoinSentencesSentenceAggregator is not among the imports
# visible in this notebook -- confirm it is in scope.
text_agg = JustJoinSentencesSentenceAggregator(sep=' ')

## Discourse structuring model

In [22]:
# Alternative discourse structurer, currently disabled:
#mff = MostFrequentFirstDiscourseStructuring(template_model=te)
# Per the original note, the chain structurer starts from a node without
# incoming edges and then does a breadth-first search.
cds = ChainDiscourseStructuring()

## Lexicalization model

In [23]:
# Two lexicalizers: `le` is built on the alignment model `da`, `lp` takes no
# arguments. The final pipe in this notebook is constructed with `lp`; `le` is
# only called directly for inspection.
le = LexicalizeAsAligned(da)
lp = LexicalizePreprocessed()

## Final model

In [24]:
def replace_subject(d):
    """Replace the subject slot of ``d`` with a comma.

    Sets ``d['m_subject']`` to ``','`` on the dict that was passed in (no copy
    is made) and returns that same dict.
    """
    d.update(m_subject=',')
    return d

# Final text generator built from `sent_pipe`, `text_agg`, `cds` and a
# lexicalizer, with `replace_subject` as the processor. Per the original note
# the processor is applied to the data starting from the second sentence --
# NOTE(review): confirm how `nth=0` is interpreted by
# IfAfterNthProcessPipelineTextGenerator.

# Disabled alternative that uses the aligned lexicalizer `le` instead of `lp`:
#pipe = IfAfterNthProcessPipelineTextGenerator(sent_pipe, text_agg, cds, le, processor=replace_subject, nth=0)
pipe = IfAfterNthProcessPipelineTextGenerator(sent_pipe, text_agg, cds, lp, processor=replace_subject, nth=0)

# Example 1

In [25]:
# Pick the test entry with idx '0_3' and display it.
e = test.sample(idx='0_3')
e

Triple info: {'category': 'Airport', 'eid': 'Id4', 'idx': '0_3', 'ntriples': 1}

	Modified triples:

Afonso_Pena_International_Airport | ICAO_Location_Identifier | "SBCT"


In [26]:
# `generate` is called with a one-element list of entry data; show the first
# (only) generated text.
pipe.generate([e.get_data()])[0]

'The ICAO Location Identifier of Afonso Pena International Airport is SBCT.'

In [27]:
# Inspect what the aligned lexicalizer `le` produces for the first element of
# the entry's data.
le.lexicalize(e.get_data()[0])

{'m_subject': 'Afonso Pena International Airport',
 'm_predicate': 'ICAO_Location_Identifier',
 'm_object': '"SBCT"'}

# Example 2

In [28]:
# Draw an entry with no idx given (presumably a random one -- confirm in
# webnlg_corpus) and show the text generated for it.
sample = test.sample()
pipe.generate([sample.get_data()])[0]

'The total area of Atlantic City, New Jersey is square kilometres 44.125.'

In [29]:
# Display the sampled entry itself.
sample

Triple info: {'category': 'City', 'eid': 'Id172', 'idx': '0_171', 'ntriples': 1}

	Modified triples:

Atlantic_City,_New_Jersey | areaTotal | 44.125 (square kilometres)


# Test evaluation

In [30]:
%%time 
import codecs

with codecs.open(output_filepath, 'w', 'utf-8') as f:
    
    for text in pipe.generate((entry.get_data() for entry in test)):
        
        f.write("{}\n".format(text))

CPU times: user 6.3 s, sys: 266 ms, total: 6.56 s
Wall time: 6.68 s


In [31]:
# Show lines 91-100 of the generated output file.
!head -100 "$output_filepath" | tail -10

The English language is spoken in novel Castle.
Eric Flint was born in Burbank, California.
Macmillan Publishers is the parent company of Farrar, Straus and Giroux.
One of John Cowper Powys notable works is Oliver A Glastonbury Romance.
Soho Press is in the United States.
The Secret Scripture was published by Faber and Faber.
Asian Americans are an ethnic group in the United States.
The English language is spoken in United States.
Weymouth Sands was preceded by A Glastonbury Romance.
The manager of A.C. Chievo Verona is Rolando Maran.


In [32]:
# Run the evaluation script on the generated file, passing the model name as
# team name and `model_temp_dir` as the output directory.
!python ../evaluation/webnlg2017/webnlg-automatic-evaluation-v2/evaluation_v2.py --team_name "$model_name" --team_filepath "$output_filepath" --outdir "$model_temp_dir"

Files creating finished for:  5 - Model - Template Based - roots_c952c4774651e01f72450b3ac8ebcde98ab9f157


In [33]:
# BLEU against the three gold reference files; the hypothesis is read from
# stdin. `-lc` presumably lowercases before scoring -- see multi-bleu.perl usage.
# NOTE(review): `bleu_all_cat` is not defined in the visible cells -- confirm
# it is set before this cell runs.
!../evaluation/webnlg2017/webnlg-baseline-master/multi-bleu.perl -lc ../evaluation/webnlg2017/webnlg-automatic-evaluation/references/gold-all-cat-reference0.lex ../evaluation/webnlg2017/webnlg-automatic-evaluation/references/gold-all-cat-reference1.lex ../evaluation/webnlg2017/webnlg-automatic-evaluation/references/gold-all-cat-reference2.lex < "$bleu_all_cat"

BLEU = 39.89, 75.3/49.0/32.3/21.2 (BP=1.000, ratio=1.068, hyp_len=47712, ref_len=44677)
