In [1]:
%load_ext autoreload
%autoreload 2
import os
import sys

# Put ../script at the front of the module search path so that modules
# located there can be resolved by the imports that follow.
# `sys` is imported directly: `os.sys` only works because the `os` module
# happens to import `sys` internally, which is not a documented attribute.
sys.path.insert(0, '../script')

from evaluation import evaluate_model
from collections import ChainMap, defaultdict, Counter
from template_based import *
import re

In [2]:
# Lazily captures the text before a parenthesised group (group 1) and the
# text inside the parentheses (group 2).
PARENTHESIS_RE = re.compile(r'(.*?)\((.*?)\)')
# Captures a lowercase letter (group 1) directly followed by an uppercase
# letter (group 2), i.e. a camelCase boundary.
CAMELCASE_RE = re.compile(r'([a-z])([A-Z])')


def preprocess_so(so):
    """Turn a raw identifier-like string into a space-separated phrase.

    Steps, applied in this order:
      1. every ``head(inner)`` occurrence is rewritten as ``inner head``;
      2. underscores are replaced by spaces;
      3. a space is inserted at each lowercase/uppercase boundary;
      4. leading and trailing double quotes and spaces are stripped.

    Args:
        so: the string to normalise.

    Returns:
        The normalised string.
    """
    # Raw strings are required for the replacement templates: in a normal
    # string literal the sequence backslash-g is an invalid escape, which
    # emits a warning on current Python versions.
    parenthesis_preprocessed = PARENTHESIS_RE.sub(r'\g<2> \g<1>', so)
    underline_removed = parenthesis_preprocessed.replace('_', ' ')
    camelcase_preprocessed = CAMELCASE_RE.sub(r'\g<1> \g<2>', underline_removed)

    return camelcase_preprocessed.strip('" ')

In [3]:
class TemplateBasedModel:
    """Generate one text per input by structuring, selecting templates and realising.

    The three stages are delegated to ``StructureData``, ``SelectTemplate``
    and ``MakeText`` instances created in ``__init__``.
    """

    def __init__(self, template_db, lexicalization_f):
        """Wire the pipeline stages together.

        Args:
            template_db: mapping consulted first when looking up templates.
            lexicalization_f: callable handed to ``MakeText`` as its
                ``lexicalization_f`` argument.
        """
        # Keys missing from `template_db` fall through to this defaultdict,
        # which yields a Counter holding a single JustJoinTemplate.
        fallback_template_db = defaultdict(lambda: Counter([JustJoinTemplate()]))

        self.template_db = ChainMap(template_db, fallback_template_db)
        self.ss = StructureData(self.template_db)
        self.st = SelectTemplate()
        self.mt = MakeText(lexicalization_f=lexicalization_f)

    def predict(self, X):
        """Run every item of ``X`` through the pipeline.

        Args:
            X: iterable of items exposing a ``data`` attribute.

        Returns:
            A list with one generated text per item, in input order.

        Raises:
            Exception: any error raised by a stage is propagated unchanged,
                after the offending item has been printed.
        """
        result = []

        for x in X:
            try:
                structured_data = self.ss.structure(x.data)
                selected_templates = self.st.select_template(structured_data)
                text = self.mt.make_text(selected_templates)

                result.append(text)

            except Exception:
                # Show which input failed, then re-raise with the original
                # traceback intact (bare `raise` instead of `raise ex`).
                print(x)
                raise

        return result

In [4]:
import pickle


def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``.

    Args:
        path: location of the pickle file, opened in binary mode.

    Returns:
        Whatever object ``pickle.load`` reconstructs from the file.
    """
    # NOTE(security): unpickling can execute arbitrary code. Only load files
    # that were produced by a trusted source.
    with open(path, 'rb') as f:
        return pickle.load(f)


template_db = _load_pickle('thiago_template_db')
lexicalization_db = _load_pickle('thiago_lexicalization_db')

In [5]:
def lexicalize(s):
    """Return the preferred surface string for ``s``.

    If ``s`` is a key of ``lexicalization_db``, the most frequent entry of
    the stored counter is returned. Otherwise, or when that counter is
    empty, the result of ``preprocess_so(s)`` is returned.

    Args:
        s: the string to lexicalize.

    Returns:
        The chosen lexicalization.
    """
    if s in lexicalization_db:
        # Ask only for the top entry instead of sorting the whole counter.
        # Assumes the stored values are collections.Counter-like, as implied
        # by the original most_common() call - TODO confirm.
        top = lexicalization_db[s].most_common(1)

        # An empty counter has no preferred form; use the fallback rather
        # than failing with IndexError.
        if top:
            return top[0][0]

    return preprocess_so(s)

    
# Build the model from the loaded template database, with `lexicalize` as
# the lexicalization function.
tbm = TemplateBasedModel(template_db=template_db, lexicalization_f=lexicalize)

In [6]:
%%time
# Evaluate the model; the cell magic above reports how long the run takes.
# The second argument names this run - presumably also the stem of the
# output file written under ../model/ (confirm in evaluation.py).
evaluate_model(tbm, 'template-based-model-thiago')

CPU times: user 203 ms, sys: 219 ms, total: 422 ms
Wall time: 22.5 s


{'bleu': 42.27, 'meteor': 0.4007386633351888, 'ter': 0.5742664178099317}

In [103]:
!head -10 ../model/template-based-model-thiago.txt

Abilene Regional Airport serves the city of Abilene, Texas.
Adolfo Suárez Madrid–Barajas Airport is located in Madrid, Paracuellos de Jarama, San Sebastián de los Reyes and Alcobendas.
18L/36R is the runway name of Adolfo Suárez Madrid–Barajas Airport.
The ICAO Location Identifier of Afonso Pena International Airport is SBCT.
Afonso Pena International Airport serves the city of Curitiba.
Al-Taqaddum Air Base serves the city of Fallujah.
The runway length of Al-Taqaddum Air Base is 3684.0.
14/32 is the runway name of Alderney Airport.
The runway length of Allama Iqbal International Airport is 3360.12.
Amsterdam Airport Schiphol 1st runway is Number 18.


In [104]:
!head -10 ../model/template-based-model-just-join.txt

Abilene Regional Airport city Served Abilene, Texas.
Adolfo Suárez Madrid–Barajas Airport location Madrid, Paracuellos de Jarama, San Sebastián de los Reyes and Alcobendas.
Adolfo Suárez Madrid–Barajas Airport runway Name 18L/36R.
Afonso Pena International Airport ICAO Location Identifier SBCT.
Afonso Pena International Airport city Served Curitiba.
Al-Taqaddum Air Base city Served Fallujah.
Al-Taqaddum Air Base runway Length 3684.0.
Alderney Airport runway Name 14/32.
Allama Iqbal International Airport runway Length 3360.12.
Amsterdam Airport Schiphol 1st runway Number 18.
