In [1]:
import logging
logger = logging.getLogger()
logger.setLevel(logging.CRITICAL)

from sklearn.base import BaseEstimator, RegressorMixin
from itertools import repeat, chain
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from nltk.translate.bleu_score import corpus_bleu, sentence_bleu
from nltk.tokenize import WhitespaceTokenizer

import os

os.sys.path.insert(0, '../script')
from webnlg import WebNLGCorpus
from lexicalization import preprocess_so

from sentence_aggregation import JustJoinSentencesSentenceAggregator
from sentence_generation import JustJoinTripleSentenceGenerator, MostFrequentTemplateSentenceGenerator, FallBackPipelineSentenceGenerator, NearestPredicateTemplateSentenceGenerator
from discourse_structuring import MostFrequentFirstDiscourseStructuring, ChainDiscourseStructuring
from data_alignment import RootDataAlignmentModel, NGramDataAlignmentModel, SPODataAlignmentModel, FallBackDataAlignmentModel
from template_extraction import TemplateExtractor
from text_generation import IfAfterNthProcessPipelineTextGenerator
from lexicalization import LexicalizeAsAligned, LexicalizePreprocessed, FallBackLexicalize
from webnlg import preprocess_triple_text
from textacy import similarity
import spacy

nlp = spacy.load('en_core_web_lg')

In [42]:
class NLGBaseline(BaseEstimator, RegressorMixin):
    
    def __init__(self, sep=None, preprocess_data=None):
        
        self.sep = sep
        self.preprocess_data = preprocess_data

            
    def fit(self, X, y=None):
        pass
    
    
    def predict_entry(self, x):
        
        sens = []
        
        for data in x:
                
            m_s = self.preprocess_data(data['subject'])
            m_p = self.preprocess_data(data['predicate'])
            m_o = self.preprocess_data(data['object'])
        
            sens.append(f'{m_s} {m_p} {m_o}')
        
        return self.sep.join(sens)
    
    
    def predict(self, X, y=None):
        
        return [self.predict_entry(x) for x in X]

In [2]:
test = WebNLGCorpus.load("test_with_lex")
train = WebNLGCorpus.load(['train', 'dev'])

X = np.array([t.get_data() for t in train])
y = np.array([t.lexes() for t in train])

X_test = np.array([t.get_data() for t in test])
y_test = np.array([t.lexes() for t in test])

In [43]:
wt = WhitespaceTokenizer()

def bleu_(y_true, y_pred):
    
    y_true_ = [[wt.tokenize(ref) for ref in refs] for refs in y_true]
    y_pred_ = [wt.tokenize(hypothesi) for hypothesi in y_pred]
    
    return corpus_bleu(y_true_, y_pred_)


bleu = make_scorer(bleu_)
param_grid = {'sep': [' '],
              'preprocess_data': [preprocess_so]}

cv = GridSearchCV(NLGBaseline(), param_grid, scoring=bleu, cv=4)

cv.fit(X, y)

GridSearchCV(cv=4, error_score='raise',
       estimator=NLGBaseline(preprocess_data=None, sep=None),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'sep': [' '], 'preprocess_data': [<function preprocess_so at 0x7f4be2e73048>]},
       pre_dispatch='2*n_jobs', refit=True, scoring=make_scorer(bleu_),
       verbose=0)

In [44]:
cv.grid_scores_

[mean: 0.17828, std: 0.02259, params: {'preprocess_data': <function preprocess_so at 0x7f4be2e73048>, 'sep': ' '}]

# Template-Based

In [3]:
class NLGTemplateBased(BaseEstimator, RegressorMixin):
    
    def __init__(self, data_alignment=None, discourse_structurer=None, processor=lambda x: x, sentence_generator=None, lexicalizer=None, sentence_aggregator=None, nth=-1):
        
        self.data_alignment = data_alignment
        self.discourse_structurer = discourse_structurer
        self.sentence_generator = sentence_generator
        self.lexicalizer = lexicalizer
        self.sentence_aggregator = sentence_aggregator
        self.nth = nth
        self.processor = processor
        

    def fit(self, X, y=None):
        
        self.template_extractor = TemplateExtractor(self.data_alignment)
        
        # 1-triple size
        X_1, y_1 = zip(*[(x[0], y_) for (x, y_) in zip(X, y) if len(x) == 1])
        
        X_1 = list(chain.from_iterable([x_1]*len(y_1_) for (x_1, y_1_) in zip(X_1, y_1)))
        y_1 = list(chain.from_iterable(y_1))
        
        self.template_extractor.fit(y_1, X_1)
        self.sentence_generator.fit(self.template_extractor)
        self.lexicalizer.fit(self.data_alignment)
        self.discourse_structurer.fit(self.template_extractor)
    

    def predict(self, X, y=None):
        
        generated_texts = []

        for entry in X:

            sorted_data = self.discourse_structurer.sort(entry)
            
            sentences = []
            for i, d in enumerate(sorted_data):

                if i > self.nth:

                    d = self.processor(d)
                    
                lexicalized = self.lexicalizer.lexicalize(d)
                
                sentence = self.sentence_generator.generate(lexicalized)
                
                sentences.append(sentence)
                
            text = self.sentence_aggregator.aggregate(sentences)

            generated_texts.append(text)

        return generated_texts

In [23]:
# Data Alignment
rda = RootDataAlignmentModel(similarity.token_sort_ratio, nlp)
ngramda = NGramDataAlignmentModel(3, similarity.jaro_winkler, nlp)
spoda = SPODataAlignmentModel(nlp)

da = FallBackDataAlignmentModel(models=[ngramda, spoda, rda])

# Sentence Generation
mft = MostFrequentTemplateSentenceGenerator()
# npt = NearestPredicateTemplateSentenceGenerator(mft, similarity.token_sort_ratio, .6)
jjt = JustJoinTripleSentenceGenerator()
sent_pipe = FallBackPipelineSentenceGenerator([mft, jjt])

text_agg = JustJoinSentencesSentenceAggregator(sep=' ')

#cds = ChainDiscourseStructuring()
cds = MostFrequentFirstDiscourseStructuring()

le = LexicalizeAsAligned()
lp = LexicalizePreprocessed(preprocess_triple_text)
fle = FallBackLexicalize([le, lp])

def replace_subject(d):
    
    d['subject'] = ','
    
    return d

nlg = NLGTemplateBased(data_alignment=da,
                       discourse_structurer=cds,
                       processor=replace_subject,
                       sentence_generator=sent_pipe,
                       sentence_aggregator=text_agg,
                       lexicalizer=fle,
                       nth=0)

nlg.fit(X[:200], y[:200])

nlg.predict([[{'idx': '11_19',
  'mtext': 'Balder_(comicsCharacter) | alternativeName | "Balder Odinson"',
  'subject': 'Balder_(comicsCharacter) ',
  'predicate': ' alternativeName ',
  'object': ' "Balder Odinson"'},
 {'idx': '11_19',
  'mtext': 'Balder_(comicsCharacter) | creator | Stan_Lee',
  'subject': 'Balder_(comicsCharacter) ',
  'predicate': 'creator',
  'object': ' Stan_Lee'}]])

['Balder (comics Character)  alternativeName  Balder Odinson , creator Stan Lee']

In [38]:
from importlib import reload

import discourse_structuring
reload(discourse_structuring)
from discourse_structuring import ChainDiscourseStructuring

In [45]:
%%time

from itertools import product

rdas = [RootDataAlignmentModel(sim, nlp) for sim in [similarity.token_sort_ratio, similarity.levenshtein]]
ngramdas = [NGramDataAlignmentModel(3, sim, nlp) for sim in [similarity.token_sort_ratio, similarity.levenshtein]]
das = [FallBackDataAlignmentModel(models) for models in product(rdas, ngramdas)]

# Sentence Generation
mft = MostFrequentTemplateSentenceGenerator()
# npt = NearestPredicateTemplateSentenceGenerator(mft, similarity.token_sort_ratio, .6)
jjt = JustJoinTripleSentenceGenerator()
sent_pipes = [FallBackPipelineSentenceGenerator([mft, jjt]), FallBackPipelineSentenceGenerator([jjt, mft])]

text_agg = JustJoinSentencesSentenceAggregator(sep=' ')

# poor results ChainDiscourseStructuring()
cdss = [MostFrequentFirstDiscourseStructuring()]

la = LexicalizeAsAligned()
lp = LexicalizePreprocessed(preprocess_triple_text)
les = [FallBackLexicalize([la, lp]), FallBackLexicalize([lp, la])]

def replace_subject(d):
    
    d['subject'] = ','
    
    return d

# scorer
wt = WhitespaceTokenizer()

def bleu_(y_true, y_pred):
    
    y_true_ = [[wt.tokenize(ref) for ref in refs] for refs in y_true]
    y_pred_ = [wt.tokenize(hypothesi) for hypothesi in y_pred]
    
    return corpus_bleu(y_true_, y_pred_)

bleu = make_scorer(bleu_)

# grid
param_grid = {'data_alignment': das,
              'discourse_structurer': cdss,
              'sentence_generator': sent_pipes,
              'sentence_aggregator': [text_agg],
              'lexicalizer': les,
              'nth': [0],
              'processor': [replace_subject]}

# data
from sklearn.utils import shuffle
X_train, y_train = shuffle(X, y, random_state=100)

cv = GridSearchCV(NLGTemplateBased(), param_grid, scoring=bleu, cv=2, return_train_score=True)

cv.fit(X_train, y_train)

CPU times: user 49min 28s, sys: 2.29 s, total: 49min 30s
Wall time: 1d 15h 37min 30s


In [46]:
cv.grid_scores_



[mean: 0.30081, std: 0.00178, params: {'data_alignment': FallBackDataAlignmentModel(models=(RootDataAlignmentModel(nlp=<spacy.lang.en.English object at 0x7f6d829087b8>,
             similarity_metric=<function token_sort_ratio at 0x7f6d826a1bf8>), NGramDataAlignmentModel(max_n=3,
             nlp=<spacy.lang.en.English object at 0x7f6d829087b8>,
             similarity_metric=<function token_sort_ratio at 0x7f6d826a1bf8>))), 'discourse_structurer': MostFrequentFirstDiscourseStructuring(template_model=None), 'lexicalizer': FallBackLexicalize(models=[LexicalizeAsAligned(data_alignment=FallBackDataAlignmentModel(models=(RootDataAlignmentModel(nlp=<spacy.lang.en.English object at 0x7f6d829087b8>,
             similarity_metric=<function token_sort_ratio at 0x7f6d826a1bf8>), NGramDataAlignmentModel(max_n=3,
             nlp=<spacy.lang.en.Eng...1bf8>)))), LexicalizePreprocessed(preprocessor=<function preprocess_triple_text at 0x7f6d8ee92950>)]), 'nth': 0, 'processor': <function replace_subj

In [47]:
bleu(model, X_test, y_test)

NameError: name 'model' is not defined

In [None]:
texts = model.predict(X_test)

In [None]:
import codecs

with codecs.open('../data/models/scikit-learn', 'w', 'utf-8') as f:
    
    for text in texts:
        
        f.write("{}\n".format(text))

In [None]:
!head -100 ../data/models/scikit-learn | tail -10

In [None]:
!mkdir ../tmp/scikit-learn 

In [None]:
!python ../evaluation/webnlg2017/webnlg-automatic-evaluation-v2/evaluation_v2.py --team_name scikit-learn --team_filepath ../data/models/scikit-learn --outdir ../tmp/scikit-learn 

In [None]:
bleu_all_cat = "../tmp/scikit-learn/scikit-learn_all-cat.txt"

!../evaluation/webnlg2017/webnlg-baseline-master/multi-bleu.perl -lc ../evaluation/webnlg2017/webnlg-automatic-evaluation/references/gold-all-cat-reference0.lex ../evaluation/webnlg2017/webnlg-automatic-evaluation/references/gold-all-cat-reference1.lex ../evaluation/webnlg2017/webnlg-automatic-evaluation/references/gold-all-cat-reference2.lex < "$bleu_all_cat"