In [1]:
%load_ext autoreload
%autoreload 2
import os
import pandas as pd

os.sys.path.insert(0, '../script')

from evaluation import evaluate_model, evaluate_texts, EVALUATION_SETS, preprocess_to_evaluate
from collections import ChainMap, defaultdict, Counter
from template_based import *
import re

In [2]:
# Matches "<head>(<inside>)": group 1 is the (lazily matched) text before the
# opening parenthesis, group 2 is the text between the parentheses.
PARENTHESIS_RE = re.compile(r'(.*?)\((.*?)\)')
# Matches a lowercase letter immediately followed by an uppercase letter,
# i.e. a camelCase word boundary.
CAMELCASE_RE = re.compile(r'([a-z])([A-Z])')

def preprocess_so(so):
    """Turn a raw subject/object identifier into plain readable text.

    Steps, in order:
      1. every ``head(inside)`` occurrence is rewritten as ``inside head``;
      2. underscores become spaces;
      3. a space is inserted at each lowercase->uppercase boundary;
      4. leading/trailing double quotes and spaces are stripped.

    Args:
        so: the identifier string to clean up.

    Returns:
        The cleaned-up string.
    """

    # Raw strings are required here: '\g' is not a valid Python string escape,
    # so the non-raw form emits a DeprecationWarning/SyntaxWarning on modern
    # Python versions even though the regex template itself is correct.
    parenthesis_preprocessed = PARENTHESIS_RE.sub(r'\g<2> \g<1>', so)
    underline_removed = parenthesis_preprocessed.replace('_', ' ')
    camelcase_preprocessed = CAMELCASE_RE.sub(r'\g<1> \g<2>', underline_removed)

    return camelcase_preprocessed.strip('" ')

In [3]:
class TemplateBasedModel:
    """Text generator that chains three steps: structure -> select template -> make text.

    The three collaborators (`StructureData`, `SelectTemplate`, `MakeText`) come
    from the project-local `template_based` module.
    """

    def __init__(self, template_db, lexicalization_f):
        """Wire up the three pipeline stages.

        Args:
            template_db: template database handed to `StructureData`.
            lexicalization_f: callable forwarded to `MakeText` as its
                `lexicalization_f` argument.
        """
        # Second StructureData argument: a Counter holding one JustJoinTemplate.
        # NOTE(review): presumably the template used when the db has no match — confirm.
        join_template_counts = Counter([JustJoinTemplate()])

        self.ss = StructureData(template_db, join_template_counts)
        self.st = SelectTemplate()
        self.mt = MakeText(lexicalization_f=lexicalization_f)

    def predict(self, X):
        """Generate one text per entry of `X`.

        Args:
            X: iterable of entries; each entry must expose an `eid` attribute
                (printed when generation fails for that entry).

        Returns:
            A list with the generated text for every entry, in input order.

        Raises:
            Exception: whatever a pipeline stage raised, re-raised after the
                offending entry's `eid` has been printed.
        """
        texts = []

        for entry in X:
            try:
                texts.append(
                    self.mt.make_text(
                        self.st.select_template(
                            self.ss.structure(entry))))
            except Exception as error:
                # Show which entry broke the pipeline before propagating the error.
                print(entry.eid)
                raise error

        return texts

In [None]:
import pickle

# Load the two template databases from pickle files in the working directory.
# SECURITY: pickle.load executes arbitrary code from the file; only load
# pickles you created yourself or otherwise trust.
# NOTE(review): `template_db` is rebound by the later cell that loads
# 'thiago_template_db_category', so the value read here does not survive a
# top-to-bottom run; `template_enhanced_db` is only assigned in this cell.
with open('thiago_template_db', 'rb') as f:
    template_db = pickle.load(f)
    
with open('thiago_enhanced_template_db', 'rb') as f:
    template_enhanced_db = pickle.load(f)
    

In [4]:
import pickle

# Load the template database stored in 'thiago_template_db_category' and bind
# it to `template_db` (the name the model construction cell passes on).
# SECURITY: pickle.load executes arbitrary code from the file; only load
# pickles you created yourself or otherwise trust.
with open('thiago_template_db_category', 'rb') as f:
    template_db = pickle.load(f)

In [5]:
# Load the lexicalization lookup used by `lexicalize`: it is indexed by entity
# and each value must offer `.most_common()` (presumably a Counter — confirm).
# SECURITY: pickle.load executes arbitrary code from the file; only load
# pickles you created yourself or otherwise trust.
with open('thiago_lexicalization_db', 'rb') as f:
    lexicalization_db = pickle.load(f)

In [6]:
def lexicalize(s, ctx):
    """Return the text to use for entity `s`, or '' if it was already mentioned.

    Args:
        s: the entity to lexicalize; used as a key into `lexicalization_db`
            and, when no entry exists, passed to `preprocess_so`.
        ctx: mapping whose 'referred' entry is a set of entities already
            lexicalized; it is updated in place.

    Returns:
        '' when `s` is already in ctx['referred']; otherwise the most frequent
        lexicalization recorded for `s`, or `preprocess_so(s)` when the
        database has no (non-empty) entry for it.
    """

    referred = ctx['referred']

    # An entity that has already been mentioned is rendered as an empty string.
    if s in referred:
        return ''

    referred.add(s)

    if s in lexicalization_db:
        # most_common(1) picks the top entry without sorting the whole counter
        # and yields an empty list (instead of an IndexError further down) when
        # the counter holds no lexicalization at all.
        top = lexicalization_db[s].most_common(1)

        if top:
            return top[0][0]

    # No usable database entry: derive readable text from the identifier itself.
    return preprocess_so(s)

In [7]:
# Build the model from the currently bound `template_db`, using the notebook's
# `lexicalize` function to turn entities into text.
tbm = TemplateBasedModel(template_db=template_db, lexicalization_f=lexicalize)

In [8]:
import codecs
from webnlg_corpus import webnlg

# Load the 'webnlg_challenge_2017' corpus and keep only its 'test' dataset.
corpus = webnlg.load('webnlg_challenge_2017')
test = corpus.subset(datasets=['test'])

In [10]:
# Destination of the generated texts for the test set, one text per line.
# (Plain string: the original f-string had no placeholders.)
texts_filepath = '../../masters/data/models/abe-2/abe-2.txt'

# Generate everything BEFORE opening the file: mode 'w' truncates it, so an
# exception raised inside predict() would otherwise leave an empty file behind
# and clobber any earlier output.
predicted_texts = tbm.predict(test)

with codecs.open(texts_filepath, 'w', 'utf-8') as f:

    for text in predicted_texts:

        f.write("{}\n".format(text))

In [9]:
# Take one entry from the test set and show the text generated for it.
# NOTE(review): `sample()` presumably picks at random and no seed is set in the
# visible cells, so the displayed output may differ between runs — confirm.
s = test.sample()
tbm.predict([s])

['a wizard of mars was written by diane duane.']

In [12]:
# Inspect the output of the first pipeline stage (structuring) for the entry
# currently bound to `s`.
tbm.ss.structure(s)

[([Alfa_Romeo_164, 
  
  	<relatedMeanOfTransportation, [Fiat_Croma]>],
  Counter({template {s} {p} {o}.: 1})),
 ([Alfa_Romeo_164, 
  
  	<assembly, [Arese]>], Counter({template {s} {p} {o}.: 1})),
 ([Alfa_Romeo_164, 
  
  	<relatedMeanOfTransportation, [Lancia_Thema]>],
  Counter({template {s} {p} {o}.: 1}))]

In [None]:
%%time
# Run the project-local `evaluate_model` on the model under the name
# 'template-based-model-thiago'.
# NOTE(review): the next cell reads
# '../model/template-based-model-thiago_{eval_set}.txt', presumably written by
# this call — confirm against the `evaluation` module.
evaluate_model(tbm, 'template-based-model-thiago')

In [None]:
%%time
results = {}

for eval_set in EVALUATION_SETS:
    
    results[eval_set] = evaluate_texts(f'../model/template-based-model-thiago_{eval_set}.txt', eval_set)
    
df = pd.DataFrame(results)
df

In [None]:
# Display the most recently built results table again.
df

In [None]:
# Rebuild the model on top of the enhanced template database (rebinds `tbm`).
# NOTE(review): `template_enhanced_db` is only assigned in the cell that loads
# 'thiago_enhanced_template_db'; that cell has no execution count in this
# export, so run it first or this line raises NameError on a fresh kernel.
tbm = TemplateBasedModel(template_db=template_enhanced_db, lexicalization_f=lexicalize)

In [None]:
%%time
# Run the project-local `evaluate_model` on the enhanced model under the name
# 'template-based-model-enhanced-thiago'.
# NOTE(review): the next cell reads
# '../model/template-based-model-enhanced-thiago_{eval_set}.txt', presumably
# written by this call — confirm against the `evaluation` module.
evaluate_model(tbm, 'template-based-model-enhanced-thiago')

In [None]:
%%time
results = {}

for eval_set in EVALUATION_SETS:
    
    results[eval_set] = evaluate_texts(f'../model/template-based-model-enhanced-thiago_{eval_set}.txt', eval_set)
    
df = pd.DataFrame(results)
df

In [None]:
# Shell escape: show the last five lines of the base model's output file.
!tail -5 ../model/template-based-model-thiago.txt

In [None]:
# Shell escape: show the last five lines of the enhanced model's output file.
!tail -5 ../model/template-based-model-enhanced-thiago.txt

# For which entries do I have good templates?

In [None]:
from webnlg_corpus import webnlg

# Load the 'webnlg_challenge_2017' corpus and keep only its 'test' dataset.
# NOTE(review): this repeats the import/load/subset already done in an earlier
# cell of this notebook; it rebinds `corpus` and `test` to freshly loaded objects.
corpus = webnlg.load('webnlg_challenge_2017')

test = corpus.subset(datasets=['test'])

In [None]:
# Keep the test entries whose triple structure (built with
# Structure.from_triples from the entry's `data`) is a key of `template_db`.
e_w_template = [
    e
    for e in test
    if Structure.from_triples(e.data) in template_db
]

In [None]:
# Number of test entries whose structure was found in `template_db`.
len(e_w_template)

In [None]:
# Drop the first two characters of the first entry's eid, parse the rest as an
# integer and subtract one.
# NOTE(review): presumably eids look like 'Id<N>' and this yields a 0-based
# index — confirm against the corpus.
int(e_w_template[0].eid[2:]) - 1

In [None]:
# Write one number per line: the numeric part of each selected entry's eid
# (everything after its first two characters) minus one.
with open('../../masters/evaluation/subsets/abe-1-w-templates.txt', 'w') as subset_file:
    subset_file.writelines(
        '{}\n'.format(int(e.eid[2:]) - 1) for e in e_w_template
    )