In [20]:
from sklearn.base import BaseEstimator, RegressorMixin

import numpy as np

import os

os.sys.path.insert(0, '../script')
from webnlg import WebNLGCorpus

In [67]:
# Identifier of this baseline; the evaluation cells further down interpolate it
# into the ../data/models/ and ../tmp/ paths and pass it as --team_name.
model_name = 'just-join-baseline'

# Algorithm

In [71]:
# The algorithm using the scikit-learn framework
class NLGBaseline(BaseEstimator, RegressorMixin):
    """Rule-based baseline that verbalizes an entry by joining its triples.

    Every triple is rendered as its subject, predicate and object joined by
    ``sop_sep``; the rendered triples of one entry are then joined by
    ``sentence_sep``.

    Parameters
    ----------
    sop_sep : str, default=' '
        Separator placed between the subject, predicate and object of a triple.
    sentence_sep : str, default=','
        Separator placed between the rendered triples of the same entry.
    """

    def __init__(self, sop_sep=' ', sentence_sep=','):
        self.sop_sep = sop_sep
        self.sentence_sep = sentence_sep

    def fit(self, X, y=None):
        """Do nothing: there is no training step, as it is all rule-based.

        Returns
        -------
        self : NLGBaseline
            The estimator itself, as the scikit-learn estimator API expects,
            so that calls such as ``model.fit(X).predict(X)`` can be chained.
        """
        return self

    def predict_entry(self, x):
        """Generate the text for a single entry.

        Parameters
        ----------
        x : iterable of mapping
            The triples of the entry; each one is indexed with the keys
            'subject', 'predicate' and 'object', whose values must be strings.

        Returns
        -------
        str
            The rendered triples joined by ``sentence_sep`` (an empty string
            when the entry has no triples).
        """
        # first, render each triple as subject, predicate and object
        # separated by `sop_sep`
        triple_texts = [
            self.sop_sep.join([data['subject'], data['predicate'], data['object']])
            for data in x
        ]

        # secondly, join the rendered triples with `sentence_sep` between them
        return self.sentence_sep.join(triple_texts)

    def predict(self, X, y=None):
        """Generate one text per entry of ``X``.

        Parameters
        ----------
        X : iterable
            Entries, each in the format accepted by ``predict_entry``.
        y : ignored
            Accepted only for signature compatibility.

        Returns
        -------
        list of str
            The generated texts, in the same order as ``X``.
        """
        return [self.predict_entry(x) for x in X]

# Loading dataset

In [76]:
# Load the "test_with_lex" corpus
test = WebNLGCorpus.load("test_with_lex")

# Entries do not all carry the same number of triples (the samples further down
# have 1 and 3), so the nested lists are ragged. NumPy >= 1.24 raises a
# ValueError when asked to build an array from ragged sequences unless
# dtype=object is requested explicitly; older versions only emitted a warning
# and produced the same object array.
X_test = np.array([t.get_data() for t in test], dtype=object)
y_test = np.array([t.lexes() for t in test], dtype=object)

# Generating texts

In [59]:
%%time

texts = model.predict(X_test)

import codecs

with codecs.open('../data/models/just-join-baseline', 'w', 'utf-8') as f:
    
    for text in texts:
        
        f.write("{}\n".format(text))

CPU times: user 15.6 ms, sys: 0 ns, total: 15.6 ms
Wall time: 14.7 ms


## Generated texts sample

In [60]:
!head -100 ../data/models/just-join-baseline | tail -10

Castle_(novel) language English_language
Eric_Flint birthPlace Burbank,_California
Farrar,_Straus_and_Giroux parentCompany Macmillan_Publishers
John_Cowper_Powys notableWork A_Glastonbury_Romance
Soho_Press country United_States
The_Secret_Scripture publisher Faber_and_Faber
United_States ethnicGroup Asian_Americans
United_States language English_language
Weymouth_Sands precededBy A_Glastonbury_Romance
A.C._Chievo_Verona manager Rolando_Maran


## Individual samples

### Sample with ntriples = 1

In [96]:
# Look up the test entry with eid 'Id10' and display it
# (the recorded output shows a single triple and its lexicalizations).
sample = test.sample(eid='Id10')

sample

Triple info: category=Airport eid=Id10

	Modified triples:

Amsterdam_Airport_Schiphol | 1st_runway_Number | 18


	Lexicalizations:

The first runway at Amsterdam's Schiphol Airport is known as Number 18.
The Amsterdam Airport Schiphol's 1st runway number is 18.
The number of the 1st runway at Amsterdam Airport Schiphol is 18.

In [97]:
# predict() takes a collection of entries, so wrap the single sample in a list
# and take the only generated text.
model.predict([sample.get_data()])[0]

'Amsterdam_Airport_Schiphol 1st_runway_Number 18'

### Sample with ntriples = 3

In [88]:
# Look up the test entry with eid 'Id397' and display it
# (the recorded output shows three triples and their lexicalizations).
sample = test.sample(eid='Id397')

sample

Triple info: category=Astronaut eid=Id397

	Modified triples:

Alan_Bean | nationality | United_States
Alan_Bean | birthPlace | Wheeler,_Texas
Alan_Bean | status | "Retired"


	Lexicalizations:

American retiree Alan Bean was born in Wheeler, Texas.
Retired US National Alan Bean was born in Wheeler, Texas.
Alan Bean was born in the United States in Wheeler, Texas and has retired.

In [89]:
# predict() takes a collection of entries, so wrap the single sample in a list
# and take the only generated text.
model.predict([sample.get_data()])[0]

'Alan_Bean nationality United_States, Alan_Bean birthPlace Wheeler,_Texas, Alan_Bean status "Retired"'

# Evaluation

## Generating the files needed to evaluate with BLEU and METEOR

In [68]:
# Create the output directory ../tmp/<model_name> (no error if it already exists)
!mkdir -p ../tmp/"$model_name"

# Run the WebNLG 2017 evaluation script on the generated texts; its output goes to --outdir
!python ../evaluation/webnlg2017/webnlg-automatic-evaluation-v2/evaluation_v2.py --team_name "$model_name" --team_filepath ../data/models/"$model_name" --outdir ../tmp/"$model_name" 

# Hypothesis file fed to the BLEU and METEOR commands below
# (presumably one of the files written by the script above -- verify)
all_cat = f"../tmp/{model_name}/{model_name}_all-cat.txt"

Files creating finished for:  just-join-baseline


## BLEU

In [69]:
# Score the hypothesis file (read from stdin) with multi-bleu.perl against the
# three gold reference files.
# NOTE(review): the references come from webnlg-automatic-evaluation/ while the
# hypothesis file was generated by the script under
# webnlg-automatic-evaluation-v2/ -- confirm these are meant to be paired.
!../evaluation/webnlg2017/webnlg-baseline-master/multi-bleu.perl -lc ../evaluation/webnlg2017/webnlg-automatic-evaluation/references/gold-all-cat-reference0.lex ../evaluation/webnlg2017/webnlg-automatic-evaluation/references/gold-all-cat-reference1.lex ../evaluation/webnlg2017/webnlg-automatic-evaluation/references/gold-all-cat-reference2.lex < "$all_cat"

BLEU = 11.62, 40.3/16.4/10.0/6.8 (BP=0.800, ratio=0.817, hyp_len=30962, ref_len=37882)


## METEOR

In [70]:
# Run the METEOR 1.5 jar on the same hypothesis file against the 3-reference
# gold file; only the last 10 lines of its output are kept (tail -10).
!java -Xmx2G -jar ../evaluation/webnlg2017/meteor-1.5/meteor-1.5.jar "$all_cat" ../evaluation/webnlg2017/webnlg-automatic-evaluation/references/gold-all-cat-reference-3ref.meteor -l en -norm -r 3 -a ../evaluation/webnlg2017/meteor-1.5/data/paraphrase-en.gz | tail -10

Test words:             53267
Reference words:        42542
Chunks:                 17826
Precision:              0.4226531779768414
Recall:                 0.6629579252811026
f1:                     0.5162093002624989
fMean:                  0.6108610246271051
Fragmentation penalty:  0.5691478919224137

Final score:            0.2631907602030226
