In [3]:
import logging
logger = logging.getLogger()
logger.setLevel(logging.CRITICAL)

from sklearn.base import BaseEstimator, RegressorMixin
from itertools import repeat, chain
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from nltk.translate.bleu_score import corpus_bleu, sentence_bleu
from nltk.tokenize import WhitespaceTokenizer

import os

os.sys.path.insert(0, '../script')
from webnlg import WebNLGCorpus
from lexicalization import preprocess_so

from content_selection import SelectAllContentSelection, SelectFirstNContentSelection
from sentence_aggregation import JustJoinSentencesSentenceAggregator
from sentence_generation import JustJoinTripleSentenceGenerator, MostFrequentTemplateSentenceGenerator, FallBackPipelineSentenceGenerator, NearestPredicateTemplateSentenceGenerator
from discourse_structuring import MostFrequentFirstDiscourseStructuring, ChainDiscourseStructuring, DoesntSortDiscourseStructuring
from data_alignment import RootDataAlignmentModel, NGramDataAlignmentModel, SPODataAlignmentModel, FallBackDataAlignmentModel
from template_extraction import TemplateExtractor
from text_generation import IfAfterNthProcessPipelineTextGenerator
from lexicalization import LexicalizeAsAligned, LexicalizePreprocessed, FallBackLexicalize
from webnlg import preprocess_triple_text
from textacy import similarity
import spacy

nlp = spacy.load('en_core_web_lg')

In [42]:
class NLGBaseline(BaseEstimator, RegressorMixin):
    """Baseline generator: verbalise every triple as "subject predicate object".

    Parameters
    ----------
    sep : str, optional
        String placed between the verbalised triples of one entry. ``None``
        (the default) is treated as a single space at prediction time.
    preprocess_data : callable, optional
        Applied to each of the ``'subject'``, ``'predicate'`` and ``'object'``
        values of a triple before they are joined. ``None`` (the default)
        leaves the values untouched.
    """

    def __init__(self, sep=None, preprocess_data=None):
        # Parameters are stored verbatim (no validation or defaulting here) so
        # that get_params / set_params see exactly what the caller passed.
        self.sep = sep
        self.preprocess_data = preprocess_data

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self``.

        Returning the estimator (rather than ``None``) follows the
        scikit-learn estimator convention and allows ``est.fit(X).predict(X)``.
        """
        return self

    def predict_entry(self, x):
        """Return the text for one entry.

        ``x`` is an iterable of dict-like triples with ``'subject'``,
        ``'predicate'`` and ``'object'`` keys. Each triple becomes
        ``"<subject> <predicate> <object>"`` and the pieces are joined with
        ``sep``.
        """
        # Resolve the None defaults here, not in __init__, so that the stored
        # parameters stay untouched while a default-constructed estimator is
        # still usable.
        sep = ' ' if self.sep is None else self.sep
        preprocess = self.preprocess_data
        if preprocess is None:
            def preprocess(value):
                return value

        sens = []
        for data in x:
            m_s = preprocess(data['subject'])
            m_p = preprocess(data['predicate'])
            m_o = preprocess(data['object'])
            sens.append(f'{m_s} {m_p} {m_o}')

        return sep.join(sens)

    def predict(self, X, y=None):
        """Return one generated text per entry of ``X``; ``y`` is ignored."""
        return [self.predict_entry(x) for x in X]

In [4]:
def to_object_array(items):
    """Return a 1-D ``dtype=object`` array whose i-th element is the i-th item.

    The entries handled here are Python lists of varying length. Passing such
    ragged nested lists straight to ``np.array`` raises ``ValueError`` on
    recent NumPy releases, and silently yields a 2-D array when all lists
    happen to share a length. Filling a pre-allocated object array element by
    element always gives one list per slot.
    """
    items = list(items)
    arr = np.empty(len(items), dtype=object)
    for i, item in enumerate(items):
        arr[i] = item
    return arr


test = WebNLGCorpus.load("test_with_lex")
train = WebNLGCorpus.load(['train', 'dev'])

# X: one list of triples per entry; y: the reference texts of the same entry.
X = to_object_array(t.get_data() for t in train)
y = to_object_array(t.lexes() for t in train)

X_test = to_object_array(t.get_data() for t in test)
y_test = to_object_array(t.lexes() for t in test)

In [43]:
# Shared tokenizer used to split references and hypotheses into tokens.
wt = WhitespaceTokenizer()


def bleu_(y_true, y_pred):
    """Corpus-level BLEU of the predictions against their references.

    y_true: for each entry, an iterable of reference strings.
    y_pred: one hypothesis string per entry, in the same order as ``y_true``.

    Both sides are tokenized with ``wt`` and scored by NLTK's ``corpus_bleu``
    called with its default settings.
    """
    tokenize = wt.tokenize

    references = []
    for refs in y_true:
        references.append([tokenize(ref) for ref in refs])

    hypotheses = list(map(tokenize, y_pred))

    return corpus_bleu(references, hypotheses)


# Turn the BLEU metric into a scorer object that GridSearchCV can call.
bleu = make_scorer(bleu_)

# Single-candidate grid: the search is used here only to get a
# cross-validated BLEU estimate for the baseline.
param_grid = dict(
    sep=[' '],
    preprocess_data=[preprocess_so],
)

cv = GridSearchCV(
    estimator=NLGBaseline(),
    param_grid=param_grid,
    scoring=bleu,
    cv=4,
)

cv.fit(X, y)

GridSearchCV(cv=4, error_score='raise',
       estimator=NLGBaseline(preprocess_data=None, sep=None),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'sep': [' '], 'preprocess_data': [<function preprocess_so at 0x7f4be2e73048>]},
       pre_dispatch='2*n_jobs', refit=True, scoring=make_scorer(bleu_),
       verbose=0)

In [44]:
# `grid_scores_` was deprecated in scikit-learn 0.18 and removed in 0.20;
# `cv_results_` carries the same information. Show one
# (mean test score, std of test score, params) tuple per candidate.
list(zip(cv.cv_results_['mean_test_score'],
         cv.cv_results_['std_test_score'],
         cv.cv_results_['params']))

[mean: 0.17828, std: 0.02259, params: {'preprocess_data': <function preprocess_so at 0x7f4be2e73048>, 'sep': ' '}]

# Template-Based

In [7]:
from textacy import preprocess

class NLGTemplateBased(BaseEstimator, RegressorMixin):
    """Template-based generation pipeline exposed as a scikit-learn estimator.

    Every constructor argument is a pipeline component (or a setting) that is
    stored as given and only used in ``fit`` / ``predict``:

    content_selection    -- object whose ``select(entry)`` returns the triples to verbalise.
    data_alignment       -- alignment model handed to ``TemplateExtractor`` and to the lexicalizer.
    discourse_structurer -- object whose ``sort(triples)`` orders the triples.
    processor            -- callable applied to every triple whose position in
                            the sorted order is greater than ``nth``.
    sentence_generator   -- object whose ``generate(lexicalized)`` returns a sentence.
    lexicalizer          -- object whose ``lexicalize(triple)`` prepares a triple for generation.
    sentence_aggregator  -- object whose ``aggregate(sentences)`` returns the final text.
    nth                  -- threshold index for ``processor``; with the default
                            ``-1`` the processor is applied to every triple.
    """

    def __init__(self, content_selection=None, data_alignment=None, discourse_structurer=None, processor=lambda x: x, sentence_generator=None, lexicalizer=None, sentence_aggregator=None, nth=-1):

        # Stored verbatim so that get_params / set_params / GridSearchCV see
        # exactly what the caller passed.
        self.content_selection = content_selection
        self.data_alignment = data_alignment
        self.discourse_structurer = discourse_structurer
        self.sentence_generator = sentence_generator
        self.lexicalizer = lexicalizer
        self.sentence_aggregator = sentence_aggregator
        self.nth = nth
        self.processor = processor

    def fit(self, X, y=None):
        """Extract templates from the single-triple entries and fit the components.

        X: sequence of entries, each a sequence of triples.
        y: for each entry, a sequence of reference texts.

        Only entries with exactly one triple are used: their triple is paired
        with each of their reference texts and the pairs are given to the
        template extractor. The sentence generator and the discourse
        structurer are then fitted on the extractor, and the lexicalizer on
        the data alignment model.

        Returns ``self``. Raises ``ValueError`` when ``X`` contains no
        single-triple entry.
        """
        self.template_extractor = TemplateExtractor(self.data_alignment)

        # 1-triple size
        single_triple_entries = [(x[0], refs) for (x, refs) in zip(X, y) if len(x) == 1]

        if not single_triple_entries:
            raise ValueError("NLGTemplateBased.fit needs at least one entry "
                             "with exactly one triple to extract templates from")

        # One (text, triple) pair per reference text of each entry.
        triples, texts = [], []
        for triple, refs in single_triple_entries:
            for ref in refs:
                triples.append(triple)
                texts.append(ref)

        self.template_extractor.fit(texts, triples)
        self.sentence_generator.fit(self.template_extractor)
        self.lexicalizer.fit(self.data_alignment)
        self.discourse_structurer.fit(self.template_extractor)

        return self

    def predict(self, X, y=None):
        """Return one generated text per entry of ``X``; ``y`` is ignored."""
        generated_texts = []

        for entry in X:

            selected_data = self.content_selection.select(entry)

            # Work on cleaned copies of the triples: the listed punctuation
            # marks are removed and surrounding whitespace is stripped from
            # every value, so the caller's dicts are never modified.
            preprocessed_data = [{k: preprocess.remove_punct(v, marks='""').strip() for k, v in data.items()} for data in selected_data]

            sorted_data = self.discourse_structurer.sort(preprocessed_data)

            sentences = []
            for i, d in enumerate(sorted_data):

                # Triples after position `nth` go through the processor
                # before being lexicalized.
                if i > self.nth:
                    d = self.processor(d)

                lexicalized = self.lexicalizer.lexicalize(d)
                sentences.append(self.sentence_generator.generate(lexicalized))

            generated_texts.append(self.sentence_aggregator.aggregate(sentences))

        return generated_texts

In [8]:
# Content Selection
# Content selection component passed to NLGTemplateBased below.
csall = SelectAllContentSelection()

# Data Alignment
# Three alignment models sharing the same spaCy pipeline (`nlp`); the first
# two are additionally given a textacy string-similarity function.
rda = RootDataAlignmentModel(similarity.token_sort_ratio, nlp)
ngramda = NGramDataAlignmentModel(3, similarity.jaro_winkler, nlp)
spoda = SPODataAlignmentModel(nlp)

# Combined model. NOTE(review): presumably the models are tried in list
# order until one succeeds -- confirm in data_alignment.py.
da = FallBackDataAlignmentModel(models=[spoda, ngramda, rda])

# Sentence Generation
mft = MostFrequentTemplateSentenceGenerator()
# Disabled alternative generator, kept for reference:
# npt = NearestPredicateTemplateSentenceGenerator(mft, similarity.token_sort_ratio, .6)
jjt = JustJoinTripleSentenceGenerator()
sent_pipe = FallBackPipelineSentenceGenerator([mft, jjt])

# Sentences of one entry are aggregated with a single-space separator.
text_agg = JustJoinSentencesSentenceAggregator(sep=' ')

# Discourse structuring; the chain-based variant is disabled here:
#cds = ChainDiscourseStructuring()
cds = MostFrequentFirstDiscourseStructuring()

# Lexicalization: alignment-based lexicalizer combined with one that applies
# `preprocess_triple_text`.
le = LexicalizeAsAligned()
lp = LexicalizePreprocessed(preprocess_triple_text)
fle = FallBackLexicalize([le, lp])

def replace_subject(d):
    """Return a copy of triple ``d`` whose ``'subject'`` value is ``','``.

    The input mapping is left untouched, so applying this processor never
    alters data that the caller still holds a reference to.
    """
    replaced = dict(d)
    replaced['subject'] = ','
    return replaced

# Assemble the template-based pipeline from the components defined above.
# With nth=0 the processor is applied to every triple after the first one.
nlg = NLGTemplateBased(
    content_selection=csall,
    data_alignment=da,
    discourse_structurer=cds,
    sentence_generator=sent_pipe,
    lexicalizer=fle,
    sentence_aggregator=text_agg,
    processor=replace_subject,
    nth=0,
)

# Quick smoke run: fit on the first 200 training entries only.
nlg.fit(X[:200], y[:200])

# Generate text for one hand-written entry made of two triples.
nlg.predict([[
    dict(idx='11_19',
         mtext='Balder_(comicsCharacter) | alternativeName | "Balder Odinson"',
         subject='Balder_(comicsCharacter) ',
         predicate=' alternativeName ',
         object=' "Balder Odinson"'),
    dict(idx='11_19',
         mtext='Balder_(comicsCharacter) | creator | Stan_Lee',
         subject='Balder_(comicsCharacter) ',
         predicate='creator',
         object=' Stan_Lee'),
]])

['Balder (comics Character) alternativeName Balder Odinson , creator Stan Lee']

In [24]:
%%time

from itertools import product

# Candidate values for each NLGTemplateBased parameter explored by the grid
# search further down in this cell.
css = [SelectAllContentSelection()]

# Data alignment candidates: every combination of one n-gram model (n = 3, 4),
# the root model and the SPO model, wrapped in a fallback model.
rdas = [RootDataAlignmentModel(sim, nlp) for sim in [similarity.token_sort_ratio]]
ngramdas = [NGramDataAlignmentModel(n, similarity.token_sort_ratio, nlp) for n in range(3, 5)]
spoda = SPODataAlignmentModel(nlp)
das = [FallBackDataAlignmentModel(models) for models in product(ngramdas, rdas, [spoda])]

# Sentence Generation
mft = MostFrequentTemplateSentenceGenerator()
# Disabled alternative generator, kept for reference:
# npt = NearestPredicateTemplateSentenceGenerator(mft, similarity.token_sort_ratio, .6)
jjt = JustJoinTripleSentenceGenerator()
# Both orderings of the two generators are tried.
sent_pipes = [FallBackPipelineSentenceGenerator([mft, jjt]), FallBackPipelineSentenceGenerator([jjt, mft])]

text_agg = JustJoinSentencesSentenceAggregator(sep=' ')

# All three discourse structurers are candidates.
# NOTE(review): an earlier remark here said ChainDiscourseStructuring gave poor
# results, yet the `cv.best_params_` output shown later in this notebook lists
# ChainDiscourseStructuring() as the selected structurer -- re-check.
cdss = [MostFrequentFirstDiscourseStructuring(), ChainDiscourseStructuring(), DoesntSortDiscourseStructuring()]

la = LexicalizeAsAligned()
lp = LexicalizePreprocessed(preprocess_triple_text)
les = [FallBackLexicalize([la, lp])]

def replace_subject(d):
    """Return a copy of triple ``d`` whose ``'subject'`` value is ``','``.

    The input mapping is left untouched, so applying this processor never
    alters data that the caller still holds a reference to.
    """
    replaced = dict(d)
    replaced['subject'] = ','
    return replaced

# scorer
# Shared tokenizer used to split references and hypotheses into tokens.
wt = WhitespaceTokenizer()


def bleu_(y_true, y_pred):
    """Corpus-level BLEU of the predictions against their references.

    y_true: for each entry, an iterable of reference strings.
    y_pred: one hypothesis string per entry, in the same order as ``y_true``.

    Both sides are tokenized with ``wt`` and scored by NLTK's ``corpus_bleu``
    called with its default settings.
    """
    tokenize = wt.tokenize

    references = []
    for refs in y_true:
        references.append([tokenize(ref) for ref in refs])

    hypotheses = list(map(tokenize, y_pred))

    return corpus_bleu(references, hypotheses)


# Scorer object wrapping the metric, as expected by GridSearchCV's `scoring`.
bleu = make_scorer(bleu_)

# grid
# One list of candidate values per NLGTemplateBased constructor parameter.
param_grid = dict(
    content_selection=css,
    data_alignment=das,
    discourse_structurer=cdss,
    sentence_generator=sent_pipes,
    sentence_aggregator=[text_agg],
    lexicalizer=les,
    nth=[0],
    processor=[replace_subject],
)

# data: shuffle the training entries with a fixed seed before splitting
from sklearn.utils import shuffle
X_train, y_train = shuffle(X, y, random_state=200)

# 2-fold search scored with BLEU; train scores are recorded as well.
cv = GridSearchCV(
    estimator=NLGTemplateBased(),
    param_grid=param_grid,
    scoring=bleu,
    cv=2,
    return_train_score=True,
)

cv.fit(X_train, y_train)



AttributeError: 'NoneType' object has no attribute 'end_char'

In [12]:
# Per-candidate results of the fitted grid search.
# NOTE(review): the saved output of the search cell above ends in an
# AttributeError and this cell has a lower execution count, so `cv` here
# comes from an earlier run -- re-run the notebook top to bottom to confirm.
results = cv.cv_results_

In [13]:
# List the columns available in the search results.
results.keys()

dict_keys(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time', 'param_content_selection', 'param_data_alignment', 'param_discourse_structurer', 'param_lexicalizer', 'param_nth', 'param_processor', 'param_sentence_aggregator', 'param_sentence_generator', 'params', 'split0_test_score', 'split1_test_score', 'mean_test_score', 'std_test_score', 'rank_test_score', 'split0_train_score', 'split1_train_score', 'mean_train_score', 'std_train_score'])

In [15]:
# Positions of every candidate ranked first (ties give several positions).
# Rank 1 is the best score, so argmax over the ranks would pick the worst
# candidate rather than the best one.
pos = np.where(results['rank_test_score'] == 1)

# (heading, cv_results_ key) pairs, in the order they are printed.
report = [
    ("Test BLEU mean", 'mean_test_score'),
    ("Content Selection", 'param_content_selection'),
    ("Data Alignment", 'param_data_alignment'),
    ("Discourse Structuring", 'param_discourse_structurer'),
    ("Lexicalizer", 'param_lexicalizer'),
    ("Nth", 'param_nth'),
    ("Processor", 'param_processor'),
    ("Sentence Aggregator", 'param_sentence_aggregator'),
    ("Sentence Generator", 'param_sentence_generator'),
]

for i, (heading, key) in enumerate(report):
    # Every section except the last is followed by an empty line.
    trailer = "\n" if i < len(report) - 1 else ""
    print(f"{heading}:\n{results[key][pos]}{trailer}")

Test BLEU mean:
[0.14555327 0.14555327 0.14555327 0.14555327]

Content Selection:
[SelectAllContentSelection() SelectAllContentSelection()
 SelectAllContentSelection() SelectAllContentSelection()]

Data Alignment:
[FallBackDataAlignmentModel(models=(NGramDataAlignmentModel(max_n=3,
            nlp=<spacy.lang.en.English object at 0x7f98dc7ae6d8>,
            similarity_metric=<function token_sort_ratio at 0x7f98dc7bf510>), RootDataAlignmentModel(nlp=<spacy.lang.en.English object at 0x7f98dc7ae6d8>,
            similarity_metric=<function token_sort_ratio at 0x7f98dc7bf510>), SPODataAlignmentModel(nlp=<spacy.lang.en.English object at 0x7f98dc7ae6d8>)))
 FallBackDataAlignmentModel(models=(NGramDataAlignmentModel(max_n=3,
            nlp=<spacy.lang.en.English object at 0x7f98dc7ae6d8>,
            similarity_metric=<function token_sort_ratio at 0x7f98dc7bf510>), RootDataAlignmentModel(nlp=<spacy.lang.en.English object at 0x7f98dc7ae6d8>,
            similarity_metric=<function token_sort

In [16]:
# Parameter combination selected by the grid search.
cv.best_params_

{'content_selection': SelectAllContentSelection(),
 'data_alignment': FallBackDataAlignmentModel(models=(NGramDataAlignmentModel(max_n=3,
             nlp=<spacy.lang.en.English object at 0x7f98dc7ae6d8>,
             similarity_metric=<function token_sort_ratio at 0x7f98dc7bf510>), RootDataAlignmentModel(nlp=<spacy.lang.en.English object at 0x7f98dc7ae6d8>,
             similarity_metric=<function token_sort_ratio at 0x7f98dc7bf510>), SPODataAlignmentModel(nlp=<spacy.lang.en.English object at 0x7f98dc7ae6d8>))),
 'discourse_structurer': ChainDiscourseStructuring(),
 'lexicalizer': FallBackLexicalize(models=[LexicalizeAsAligned(data_alignment=FallBackDataAlignmentModel(models=(NGramDataAlignmentModel(max_n=3,
             nlp=<spacy.lang.en.English object at 0x7f98dc7ae6d8>,
             similarity_metric=<function token_sort_ratio at 0x7f98dc7bf510>), RootDataAlignmentModel(nlp=<spacy.lang.en.Eng...e6d8>)))), LexicalizePreprocessed(preprocessor=<function preprocess_triple_text at 0x7f

In [17]:
# Score the fitted search object on the held-out test set with the BLEU scorer
# (a scorer is called as scorer(estimator, X, y)).
bleu(cv, X_test, y_test)

0.14681765955986625

In [18]:
# Generate one text per test entry with the fitted search object.
texts = cv.predict(X_test)

In [19]:
import codecs

# Write the generated texts to disk as UTF-8, one text per line.
with codecs.open('../data/models/scikit-learn', 'w', 'utf-8') as f:
    f.writelines(map("{}\n".format, texts))

In [20]:
# Show lines 91-100 of the predictions file as a quick sanity check.
!head -100 ../data/models/scikit-learn | tail -10

Castle (novel) language English language
Eric Flint birthPlace Burbank, California
Farrar, Straus and Giroux parentCompany Macmillan Publishers
John Cowper Powys notableWork A Glastonbury Romance
Soho Press country United States
The Secret Scripture publisher Faber and Faber
United States ethnicGroup Asian Americans
United States language English language
Weymouth Sands precededBy A Glastonbury Romance
A.C. Chievo Verona manager Rolando Maran


In [21]:
!mkdir ../tmp/scikit-learn 

mkdir: cannot create directory ‘../tmp/scikit-learn’: File exists


In [22]:
# Run the WebNLG 2017 evaluation script on the predictions file; its result
# files are written to the --outdir directory under the given team name.
!python ../evaluation/webnlg2017/webnlg-automatic-evaluation-v2/evaluation_v2.py --team_name scikit-learn --team_filepath ../data/models/scikit-learn --outdir ../tmp/scikit-learn 

Files creating finished for:  scikit-learn


In [23]:
# Hypothesis file fed to multi-bleu below (located in the evaluation --outdir).
bleu_all_cat = "../tmp/scikit-learn/scikit-learn_all-cat.txt"

# Score the hypotheses against the three gold reference files with
# multi-bleu.perl; IPython expands $bleu_all_cat from the Python variable above.
# NOTE(review): this BLEU is on a 0-100 scale and comes from a different
# tokenization/reference setup than the NLTK `bleu` scorer used earlier, so
# the two figures are not directly comparable.
!../evaluation/webnlg2017/webnlg-baseline-master/multi-bleu.perl -lc ../evaluation/webnlg2017/webnlg-automatic-evaluation/references/gold-all-cat-reference0.lex ../evaluation/webnlg2017/webnlg-automatic-evaluation/references/gold-all-cat-reference1.lex ../evaluation/webnlg2017/webnlg-automatic-evaluation/references/gold-all-cat-reference2.lex < "$bleu_all_cat"

BLEU = 25.45, 74.0/44.0/25.4/15.0 (BP=0.763, ratio=0.787, hyp_len=30926, ref_len=39274)
