In [1]:
import os

os.sys.path.insert(0, '../script')
from webnlg import WebNLGCorpus
from just_join import *
from evaluation import *

# Model

In [2]:
model = JustJoinGenerator()

In [3]:
evaluate_model(model, 'just-join')

{'bleu': 11.62, 'meteor': 0.2631907602030226, 'ter': 0.7995305073983557}

## Individual samples

### Sample with ntriples = 1

In [17]:
sample = test.sample(ntriples=1, seed=200)

sample

Triple info: category=Airport eid=Id24

	Modified triples:

Atlantic_City_International_Airport | runwayName | "13/31"


	Lexicalizations:

The name of the runway at Atlantic City International Airport is "13/31".
13/31 is the runway name for Atlantic City International airport.
Atlantic City International Airport's runway name is "13/31".

In [18]:
model.predict([sample.get_data()])[0]

'Atlantic_City_International_Airport runwayName "13/31"'

### Sample with ntriples = 5

In [7]:
sample = test.sample(ntriples=5, seed=200)
sample

Triple info: category=City eid=Id806

	Modified triples:

Austin,_Texas | isPartOf | Texas
Texas | language | English_language
Austin,_Texas | isPartOf | Williamson_County,_Texas
Williamson_County,_Texas | largestCity | Round_Rock,_Texas
Williamson_County,_Texas | countySeat | Georgetown,_Texas


	Lexicalizations:

Austin is part of Williamson County, in Texas, where English is a language spoken. Round Rock is the largest city in Williamson County and Georgetown is the county seat.
Austin is located in Texas which is where English is spoken. Austin is part of Williamson County, Texas and the largest city is Round Rock while Georgetown is the county seat.

In [8]:
model.predict([sample.get_data()])[0]

'Austin,_Texas isPartOf Texas,Texas language English_language,Austin,_Texas isPartOf Williamson_County,_Texas,Williamson_County,_Texas largestCity Round_Rock,_Texas,Williamson_County,_Texas countySeat Georgetown,_Texas'

# Variantions

In [26]:
model = JustJoinGenerator()

model.predict([sample.get_data()])[0]

'Austin,_Texas isPartOf Texas,Texas language English_language,Austin,_Texas isPartOf Williamson_County,_Texas,Williamson_County,_Texas largestCity Round_Rock,_Texas,Williamson_County,_Texas countySeat Georgetown,_Texas'

In [30]:
model = JustJoinGenerator(sen_sort=reversed)

model.predict([sample.get_data()])[0]

'Williamson_County,_Texas countySeat Georgetown,_Texas,Williamson_County,_Texas largestCity Round_Rock,_Texas,Austin,_Texas isPartOf Williamson_County,_Texas,Texas language English_language,Austin,_Texas isPartOf Texas'

In [35]:
model = JustJoinGenerator(preprocess_subject=parenthesis_underline_camelcase,
                          preprocess_predicate=parenthesis_underline_camelcase,
                          preprocess_object=parenthesis_underline_camelcase,
                          sen_sep=', ')

model.predict([sample.get_data()])[0]

'Austin, Texas is Part Of Texas, Texas language English language, Austin, Texas is Part Of Williamson County, Texas, Williamson County, Texas largest City Round Rock, Texas, Williamson County, Texas county Seat Georgetown, Texas'

# Evaluation

### Models

In [2]:
%%time


param_grid = {'spo_sep': [' '],
              'sen_sep': [','],
              'spo_order': [SPO_ORDER, OPS_ORDER],
              'sen_sort': [as_is, reversed],
              'preprocess_subject': [as_is, remove_and_invert_parenthesis, remove_underline, camelcase_to_normal, parenthesis_underline, parenthesis_underline_camelcase],
              'preprocess_predicate': [as_is, remove_and_invert_parenthesis, remove_underline, camelcase_to_normal, parenthesis_underline, parenthesis_underline_camelcase],
              'preprocess_object': [as_is, remove_and_invert_parenthesis, remove_underline, camelcase_to_normal, parenthesis_underline, parenthesis_underline_camelcase]
             }

results = evaluate_grid(JustJoinGenerator(), param_grid)
results.to_csv('just_join_baseline_evaluation.csv')

CPU times: user 1min 3s, sys: 2min 22s, total: 3min 26s
Wall time: 3h 36min 27s


# Analysis results

In [36]:
df = pd.read_csv('just_join_baseline_evaluation.csv')

df.shape

(1728, 10)

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,bleu,meteor,preprocess_object,preprocess_predicate,preprocess_subject,sen_sep,sen_sort,spo_order,spo_sep
0,0,11.62,0.263191,<function as_is at 0x7f2c41d17a60>,<function as_is at 0x7f2c41d17a60>,<function as_is at 0x7f2c41d17a60>,",",<function as_is at 0x7f2c41d17a60>,"['subject', 'predicate', 'object']",
1,1,11.13,0.262815,<function remove_and_invert_parenthesis at 0x7...,<function as_is at 0x7f2c41d17a60>,<function as_is at 0x7f2c41d17a60>,",",<function as_is at 0x7f2c41d17a60>,"['subject', 'predicate', 'object']",
2,2,20.86,0.28708,<function remove_underline at 0x7f2c41a46510>,<function as_is at 0x7f2c41d17a60>,<function as_is at 0x7f2c41d17a60>,",",<function as_is at 0x7f2c41d17a60>,"['subject', 'predicate', 'object']",
3,3,11.59,0.26347,<function camelcase_to_normal at 0x7f2c42760e18>,<function as_is at 0x7f2c41d17a60>,<function as_is at 0x7f2c41d17a60>,",",<function as_is at 0x7f2c41d17a60>,"['subject', 'predicate', 'object']",
4,4,20.48,0.286698,<function pipeline.<locals>.f at 0x7f2c4276f048>,<function as_is at 0x7f2c41d17a60>,<function as_is at 0x7f2c41d17a60>,",",<function as_is at 0x7f2c41d17a60>,"['subject', 'predicate', 'object']",


In [11]:
df.nlargest(3, 'bleu')

Unnamed: 0.1,Unnamed: 0,bleu,meteor,preprocess_object,preprocess_predicate,preprocess_subject,sen_sep,sen_sort,spo_order,spo_sep
176,176,26.36,0.342489,<function remove_underline at 0x7f2c41a46510>,<function pipeline.<locals>.f at 0x7f2c41d17bf8>,<function pipeline.<locals>.f at 0x7f2c4276f048>,",",<function as_is at 0x7f2c41d17a60>,"['subject', 'predicate', 'object']",
212,212,26.33,0.342301,<function remove_underline at 0x7f2c41a46510>,<function pipeline.<locals>.f at 0x7f2c41d17bf8>,<function pipeline.<locals>.f at 0x7f2c41d17bf8>,",",<function as_is at 0x7f2c41d17a60>,"['subject', 'predicate', 'object']",
178,178,26.28,0.341974,<function pipeline.<locals>.f at 0x7f2c4276f048>,<function pipeline.<locals>.f at 0x7f2c41d17bf8>,<function pipeline.<locals>.f at 0x7f2c4276f048>,",",<function as_is at 0x7f2c41d17a60>,"['subject', 'predicate', 'object']",


In [27]:
df.nsmallest(3, 'bleu')

Unnamed: 0.1,Unnamed: 0,bleu,meteor,preprocess_object,preprocess_predicate,preprocess_subject,sen_sep,sen_sort,spo_order,spo_sep
1339,1339,10.62,0.256942,<function remove_and_invert_parenthesis at 0x7...,<function remove_and_invert_parenthesis at 0x7...,<function remove_and_invert_parenthesis at 0x7...,,<function as_is at 0x7f2c41d17a60>,"['object', 'predicate', 'subject']",
1555,1555,10.62,0.256912,<function remove_and_invert_parenthesis at 0x7...,<function remove_and_invert_parenthesis at 0x7...,<function remove_and_invert_parenthesis at 0x7...,,<class 'reversed'>,"['object', 'predicate', 'subject']",
1333,1333,10.63,0.256946,<function remove_and_invert_parenthesis at 0x7...,<function as_is at 0x7f2c41d17a60>,<function remove_and_invert_parenthesis at 0x7...,,<function as_is at 0x7f2c41d17a60>,"['object', 'predicate', 'subject']",


In [34]:
!head -3 ../data/models/176

Abilene Regional Airport city Served Abilene, Texas
Adolfo Suárez Madrid–Barajas Airport location "Madrid, Paracuellos de Jarama, San Sebastián de los Reyes and Alcobendas"
Adolfo Suárez Madrid–Barajas Airport runway Name "18L/36R"


In [37]:
!head -3 ../data/models/1339

Abilene,_Texas cityServed Abilene_Regional_Airport
"Madrid, Paracuellos de Jarama, San Sebastián de los Reyes and Alcobendas" location Adolfo_Suárez_Madrid–Barajas_Airport
"18L/36R" runwayName Adolfo_Suárez_Madrid–Barajas_Airport


In [10]:
df.nlargest(3, 'meteor')

Unnamed: 0.1,Unnamed: 0,bleu,meteor,preprocess_object,preprocess_predicate,preprocess_subject,sen_sep,sen_sort,spo_order,spo_sep
104,104,26.24,0.342777,<function remove_underline at 0x7f2c41a46510>,<function pipeline.<locals>.f at 0x7f2c41d17bf8>,<function remove_underline at 0x7f2c41a46510>,",",<function as_is at 0x7f2c41d17a60>,"['subject', 'predicate', 'object']",
107,107,26.16,0.34272,<function pipeline.<locals>.f at 0x7f2c41d17bf8>,<function pipeline.<locals>.f at 0x7f2c41d17bf8>,<function remove_underline at 0x7f2c41a46510>,",",<function as_is at 0x7f2c41d17a60>,"['subject', 'predicate', 'object']",
106,106,26.27,0.342496,<function pipeline.<locals>.f at 0x7f2c4276f048>,<function pipeline.<locals>.f at 0x7f2c41d17bf8>,<function remove_underline at 0x7f2c41a46510>,",",<function as_is at 0x7f2c41d17a60>,"['subject', 'predicate', 'object']",


In [20]:
df.groupby('preprocess_object')[['bleu', 'meteor']].agg(['max', 'mean']).sort_values(('bleu', 'max'), ascending=False)

Unnamed: 0_level_0,bleu,bleu,meteor,meteor
Unnamed: 0_level_1,max,mean,max,mean
preprocess_object,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
<function remove_underline at 0x7f2c41a46510>,26.36,22.354549,0.342777,0.306842
<function pipeline.<locals>.f at 0x7f2c4276f048>,26.28,21.945486,0.342496,0.306384
<function pipeline.<locals>.f at 0x7f2c41d17bf8>,26.17,21.872951,0.34272,0.306622
<function as_is at 0x7f2c41d17a60>,20.25,15.124549,0.317702,0.28263
<function camelcase_to_normal at 0x7f2c42760e18>,20.21,15.089826,0.317967,0.282899
<function remove_and_invert_parenthesis at 0x7f2c41d17598>,19.85,14.645208,0.317334,0.282193


In [21]:
df.groupby('preprocess_subject')[['bleu', 'meteor']].agg(['max', 'mean']).sort_values(('bleu', 'max'), ascending=False)

Unnamed: 0_level_0,bleu,bleu,meteor,meteor
Unnamed: 0_level_1,max,mean,max,mean
preprocess_subject,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
<function pipeline.<locals>.f at 0x7f2c4276f048>,26.36,21.166076,0.342489,0.306935
<function pipeline.<locals>.f at 0x7f2c41d17bf8>,26.33,21.146215,0.342301,0.306742
<function remove_underline at 0x7f2c41a46510>,26.27,21.370312,0.342777,0.307332
<function as_is at 0x7f2c41d17a60>,22.73,15.799236,0.317673,0.282307
<function camelcase_to_normal at 0x7f2c42760e18>,22.73,15.796632,0.317542,0.282177
<function remove_and_invert_parenthesis at 0x7f2c41d17598>,22.7,15.754097,0.31748,0.282077


In [22]:
df.groupby('preprocess_predicate')[['bleu', 'meteor']].agg(['max', 'mean']).sort_values(('bleu', 'max'), ascending=False)

Unnamed: 0_level_0,bleu,bleu,meteor,meteor
Unnamed: 0_level_1,max,mean,max,mean
preprocess_predicate,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
<function pipeline.<locals>.f at 0x7f2c41d17bf8>,26.36,19.330938,0.342777,0.314603
<function camelcase_to_normal at 0x7f2c42760e18>,26.17,19.199792,0.342076,0.314047
<function pipeline.<locals>.f at 0x7f2c4276f048>,25.3,18.181111,0.312803,0.285011
<function remove_underline at 0x7f2c41a46510>,25.26,18.168507,0.312736,0.284971
<function remove_and_invert_parenthesis at 0x7f2c41d17598>,25.18,18.080208,0.312181,0.284488
<function as_is at 0x7f2c41d17a60>,25.15,18.072014,0.312115,0.284449


In [23]:
df.groupby('sen_sep')[['bleu', 'meteor']].agg(['max', 'mean']).sort_values(('bleu', 'max'), ascending=False)

Unnamed: 0_level_0,bleu,bleu,meteor,meteor
Unnamed: 0_level_1,max,mean,max,mean
sen_sep,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
",",26.36,19.024699,0.342777,0.296994
,25.37,17.986157,0.338456,0.292196


In [24]:
df.groupby('sen_sort')[['bleu', 'meteor']].agg(['max', 'mean']).sort_values(('bleu', 'max'), ascending=False)

Unnamed: 0_level_0,bleu,bleu,meteor,meteor
Unnamed: 0_level_1,max,mean,max,mean
sen_sort,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
<function as_is at 0x7f2c41d17a60>,26.36,18.514155,0.342777,0.294612
<class 'reversed'>,26.18,18.496701,0.34249,0.294578


In [25]:
df.groupby('spo_order')[['bleu', 'meteor']].agg(['max', 'mean']).sort_values(('bleu', 'max'), ascending=False)

Unnamed: 0_level_0,bleu,bleu,meteor,meteor
Unnamed: 0_level_1,max,mean,max,mean
spo_order,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
"['subject', 'predicate', 'object']",26.36,18.653275,0.342777,0.294837
"['object', 'predicate', 'subject']",25.3,18.357581,0.340966,0.294353
