In [1]:
import logging
logger = logging.getLogger()
logger.setLevel(logging.CRITICAL)

from sklearn.base import BaseEstimator, RegressorMixin
from itertools import repeat
import numpy as np
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import make_scorer
from nltk.translate.bleu_score import corpus_bleu, sentence_bleu
from nltk.tokenize import WhitespaceTokenizer

import os

os.sys.path.insert(0, '../script')
from webnlg import WebNLGCorpus
from lexicalization import preprocess_so

# score -> 
#    -> [OK] NLTK BLEU
#    -> Competition BLEU

# fit -> 
#    -> my model

# predict -> 
#    -> my model



In [2]:
class NLGBaseline(BaseEstimator, RegressorMixin):
    
    def __init__(self, sep=None, preprocess_data=None):
        
        self.sep = sep
        self.preprocess_data = preprocess_data

            
    def fit(self, X, y=None):
        pass
    
    
    def predict_entry(self, x):
        
        sens = []
        
        for data in x:
                
            m_s = self.preprocess_data(data['m_subject'])
            m_p = self.preprocess_data(data['m_predicate'])
            m_o = self.preprocess_data(data['m_object'])
        
            sens.append(f'{m_s} {m_p} {m_o}')
        
        return self.sep.join(sens)
    
    
    def predict(self, X, y=None):
        
        return [self.predict_entry(x) for x in X]

In [13]:
# otimiza esses iterators maxo!
test = WebNLGCorpus.load("test_with_lex")
train = WebNLGCorpus.load(['train', 'dev'])

X = [t.get_data() for t in train]
y = [t.lexes() for t in train]

X_test = [t.get_data() for t in test]
y_test = [t.lexes() for t in test]

In [264]:
wt = WhitespaceTokenizer()

def bleu_(y_true, y_pred):
    
    y_true_ = [[wt.tokenize(ref) for ref in refs] for refs in y_true]
    y_pred_ = [wt.tokenize(hypothesi) for hypothesi in y_pred]
    
    return corpus_bleu(y_true_, y_pred_)


bleu = make_scorer(bleu_)
param_grid = {'sep': [' '],
              'preprocess_data': [lambda x: x, preprocess_so]}

cv = GridSearchCV(NLG(), param_grid, scoring=bleu, cv=2)

cv.fit(X, y)

GridSearchCV(cv=2, error_score='raise',
       estimator=NLG(preprocess_data=None, sep=None), fit_params={},
       iid=True, n_jobs=1,
       param_grid={'sep': [' ', ','], 'preprocess_data': [<function <lambda> at 0x7f53a676de18>, <function preprocess_so at 0x7f53a73ed510>]},
       pre_dispatch='2*n_jobs', refit=True, scoring=make_scorer(bleu_),
       verbose=0)

In [267]:
cv.grid_scores_

[mean: 0.03541, std: 0.00875, params: {'preprocess_data': <function <lambda> at 0x7f53a676de18>, 'sep': ' '},
 mean: 0.02821, std: 0.00709, params: {'preprocess_data': <function <lambda> at 0x7f53a676de18>, 'sep': ','},
 mean: 0.14677, std: 0.02632, params: {'preprocess_data': <function preprocess_so at 0x7f53a73ed510>, 'sep': ' '},
 mean: 0.12454, std: 0.02600, params: {'preprocess_data': <function preprocess_so at 0x7f53a73ed510>, 'sep': ','}]

# Template-Based

In [26]:
from template_extraction import TemplateExtractor
from itertools import chain

class NLGTemplateBased(BaseEstimator, RegressorMixin):
    
    def __init__(self, data_alignment=None, discourse_structurer=None, processor=lambda x: x, sentence_generator=None, lexicalizer=None, sentence_aggregator=None, nth=-1):
        
        self.data_alignment = data_alignment
        self.discourse_structurer = discourse_structurer
        self.sentence_generator = sentence_generator
        self.lexicalizer = lexicalizer
        self.sentence_aggregator = sentence_aggregator
        self.nth = nth
        

    def fit(self, X, y=None):
        
        self.template_extractor = TemplateExtractor(self.data_alignment)
        
        # 1-triple size
        X_1, y_1 = zip(*[(x[0], y_) for (x, y_) in zip(X, y) if len(x) == 1])
        
        X_1 = list(chain.from_iterable([x_1]*len(y_1_) for (x_1, y_1_) in zip(X_1, y_1)))
        y_1 = list(chain.from_iterable(y_1))
        
        self.template_extractor.fit(y_1, X_1)
        self.sentence_generator.fit(self.template_extractor)
        self.lexicalizer.fit(self.data_alignment)
        self.discourse_structurer.fit(self.template_extractor)
    
    
    def predict_entry(self, x):
        
        sens = []
        
        for data in x:
                
            m_s = self.preprocess_data(data['m_subject'])
            m_p = self.preprocess_data(data['m_predicate'])
            m_o = self.preprocess_data(data['m_object'])
        
            sens.append(f'{m_s} {m_p} {m_o}')
        
        return self.sep.join(sens)
    
    
    def predict(self, X, y=None):
        
        generated_texts = []

        for entry in X:

            sorted_data = self.discourse_structurer.sort(entry)
            
            sentences = []
            for i, d in enumerate(sorted_data):

                if i > self.nth:

                    d = self.processor(d)
                
                sentence = self.sentence_generator.generate(self.lexicalizer.lexicalize(d))
                
                sentences.append(sentence)
                
            text = self.sentence_aggregator.aggregate(sentences)

            generated_texts.append(text)

        return generated_texts

In [4]:
from sentence_aggregation import JustJoinSentencesSentenceAggregator
from sentence_generation import JustJoinTripleSentenceGenerator, MostFrequentTemplateSentenceGenerator, FallBackPipelineSentenceGenerator, NearestPredicateTemplateSentenceGenerator
from discourse_structuring import MostFrequentFirstDiscourseStructuring, ChainDiscourseStructuring
from data_alignment import RootDataAlignmentModel, NGramDataAlignmentModel, SPODataAlignmentModel, FallBackDataAlignmentModel
from template_extraction import TemplateExtractor
from text_generation import IfAfterNthProcessPipelineTextGenerator
from lexicalization import LexicalizeAsAligned, LexicalizePreprocessed
from webnlg import preprocess_triple_text
from textacy import similarity
import spacy

nlp = spacy.load('en_core_web_lg')

In [24]:
from importlib import reload

import discourse_structuring

reload(discourse_structuring)

from discourse_structuring import MostFrequentFirstDiscourseStructuring

In [64]:
rda1 = RootDataAlignmentModel(similarity.token_sort_ratio, nlp)
rda2 = RootDataAlignmentModel(similarity.levenshtein, nlp)
rda3 = RootDataAlignmentModel(similarity.jaro_winkler, nlp)
ngramda41 = NGramDataAlignmentModel(4, similarity.levenshtein, nlp)
ngramda42 = NGramDataAlignmentModel(4, similarity.token_sort_ratio, nlp)
ngramda43 = NGramDataAlignmentModel(4, similarity.jaro_winkler, nlp)
spoda = SPODataAlignmentModel(nlp)

# Subject Predicate Object
# > Ngram 
# > > Dependency tree Root
da111 = FallBackDataAlignmentModel(models=[ngramda41, spoda, rda1])
da112 = FallBackDataAlignmentModel(models=[ngramda41, spoda, rda2])
da113 = FallBackDataAlignmentModel(models=[ngramda41, spoda, rda3])
da121 = FallBackDataAlignmentModel(models=[ngramda42, spoda, rda1])
da122 = FallBackDataAlignmentModel(models=[ngramda42, spoda, rda2])
da123 = FallBackDataAlignmentModel(models=[ngramda42, spoda, rda3])
da131 = FallBackDataAlignmentModel(models=[ngramda43, spoda, rda1])
da132 = FallBackDataAlignmentModel(models=[ngramda43, spoda, rda2])
da133 = FallBackDataAlignmentModel(models=[ngramda43, spoda, rda3])

# uses the most frequente template
mft = MostFrequentTemplateSentenceGenerator(preprocessor=preprocess_triple_text)
jjt = JustJoinTripleSentenceGenerator(preprocessor=preprocess_triple_text)
sent_pipe = FallBackPipelineSentenceGenerator([mft, jjt])

# baseline
jjt = JustJoinTripleSentenceGenerator(preprocessor=preprocess_triple_text)

text_agg = JustJoinSentencesSentenceAggregator(sep=' ')

#cds = ChainDiscourseStructuring()
cds = MostFrequentFirstDiscourseStructuring()

le = LexicalizeAsAligned()
lp = LexicalizePreprocessed()

def replace_subject(d):
    
    d['m_subject'] = ','
    
    return d

nlg = NLGTemplateBased(data_alignment=da111,
                       discourse_structurer=cds,
                       processor=replace_subject,
                       sentence_generator=sent_pipe,
                       sentence_aggregator=text_agg,
                       lexicalizer=lp,
                       nth=0)

nlg.fit(X[:100], y[:100])

nlg.predict([[{'m_subject': 'Amsterdam_Airport_Schiphol',
   'm_predicate': '5th_runway_SurfaceType',
   'm_object': '"Asphalt"'}]])

['Amsterdam Airport Schiphol 5th runway Surface Type Asphalt']

In [65]:
%%time

wt = WhitespaceTokenizer()

def bleu_(y_true, y_pred):
    
    y_true_ = [[wt.tokenize(ref) for ref in refs] for refs in y_true]
    y_pred_ = [wt.tokenize(hypothesi) for hypothesi in y_pred]
    
    return corpus_bleu(y_true_, y_pred_)


bleu = make_scorer(bleu_)
param_grid = {'data_alignment': [da111, da112, da113, da121, da122, da123, da131, da132, da133],
              'discourse_structurer': [cds],
              'processor': [replace_subject],
              'sentence_generator': [sent_pipe],
              'sentence_aggregator': [text_agg],
              'lexicalizer': [lp, le],
              'nth': [1]}

cv = GridSearchCV(NLGTemplateBased(), param_grid, scoring=bleu, cv=2)

cv.fit(X, y)

CPU times: user 1h 8min 44s, sys: 31min 29s, total: 1h 40min 14s
Wall time: 41min 14s


In [66]:
cv.grid_scores_

[mean: 0.27262, std: 0.00005, params: {'data_alignment': FallBackDataAlignmentModel(models=[NGramDataAlignmentModel(max_n=4,
             nlp=<spacy.lang.en.English object at 0x7f3a198e5f60>,
             similarity_metric=<function levenshtein at 0x7f3a19567d90>), SPODataAlignmentModel(nlp=<spacy.lang.en.English object at 0x7f3a198e5f60>), RootDataAlignmentModel(nlp=<spacy.lang.en.English object at 0x7f3a198e5f60>,
             similarity_metric=<function token_sort_ratio at 0x7f3a19567ea0>)]), 'discourse_structurer': MostFrequentFirstDiscourseStructuring(template_model=None), 'lexicalizer': LexicalizePreprocessed(), 'nth': 0, 'processor': <function replace_subject at 0x7f39a3e73d08>, 'sentence_aggregator': JustJoinSentencesSentenceAggregator(sep=' '), 'sentence_generator': <sentence_generation.FallBackPipelineSentenceGenerator object at 0x7f39a8757780>},
 mean: 0.29154, std: 0.01887, params: {'data_alignment': FallBackDataAlignmentModel(models=[NGramDataAlignmentModel(max_n=4,
      

In [67]:
cv.best_estimator_

NLGTemplateBased(data_alignment=FallBackDataAlignmentModel(models=[NGramDataAlignmentModel(max_n=4,
            nlp=<spacy.lang.en.English object at 0x7f3a198e5f60>,
            similarity_metric=<function levenshtein at 0x7f3a19567d90>), SPODataAlignmentModel(nlp=<spacy.lang.en.English object at 0x7f3a198e5f60>), RootDataAlignmentModel(nlp=<spacy.lang.en.English object at 0x7f3a198e5f60>,
            similarity_metric=<function token_sort_ratio at 0x7f3a19567ea0>)]),
         discourse_structurer=MostFrequentFirstDiscourseStructuring(template_model=None),
         lexicalizer=LexicalizePreprocessed(), nth=1,
         processor=<function replace_subject at 0x7f39a3e73d08>,
         sentence_aggregator=JustJoinSentencesSentenceAggregator(sep=' '),
         sentence_generator=<sentence_generation.FallBackPipelineSentenceGenerator object at 0x7f39a8757780>)

In [68]:
model = cv.best_estimator_

In [69]:
model.fit(X, y)

In [70]:
bleu(model, X_test, y_test)

0.2666463479271896

In [71]:
texts = model.predict(X_test)

In [72]:
import codecs

with codecs.open('../data/models/scikit-learn', 'w', 'utf-8') as f:
    
    for text in texts:
        
        f.write("{}\n".format(text))

In [73]:
!head -100 ../data/models/scikit-learn | tail -10

English language is spoken in novel Castle.
Eric Flint was born in Burbank, California.
Farrar, Straus and Giroux University is the parent company of the Macmillan Publishers University Press.
One of John Cowper Powys Powys notable works is A Glastonbury Romance.
Soho Press is located in United States.
Faber and Faber is the publisher of The Secret Scripture Science Quarterly.
Asian Americans are an ethnic group United States U.S.
English language is spoken in United States.
DeMarce short stories in A Glastonbury Romance Gazettes preceded 1634: Weymouth Sands Crisis.
The manager of A.C. Chievo Verona is Rolando Maran.


In [74]:
!mkdir ../tmp/scikit-learn 

mkdir: cannot create directory ‘../tmp/scikit-learn’: File exists


In [75]:
!python ../evaluation/webnlg2017/webnlg-automatic-evaluation-v2/evaluation_v2.py --team_name scikit-learn --team_filepath ../data/models/scikit-learn --outdir ../tmp/scikit-learn 

Files creating finished for:  scikit-learn


In [76]:
bleu_all_cat = "../tmp/scikit-learn/scikit-learn_all-cat.txt"

!../evaluation/webnlg2017/webnlg-baseline-master/multi-bleu.perl -lc ../evaluation/webnlg2017/webnlg-automatic-evaluation/references/gold-all-cat-reference0.lex ../evaluation/webnlg2017/webnlg-automatic-evaluation/references/gold-all-cat-reference1.lex ../evaluation/webnlg2017/webnlg-automatic-evaluation/references/gold-all-cat-reference2.lex < "$bleu_all_cat"

BLEU = 35.54, 68.0/44.0/28.7/18.6 (BP=1.000, ratio=1.155, hyp_len=52192, ref_len=45198)
