In [13]:
%load_ext autoreload
%autoreload 2
import os

os.sys.path.insert(0, '../script')

from evaluation import evaluate_model, evaluate_texts, EVALUATION_SETS
from collections import ChainMap, defaultdict, Counter
from template_based import *
import re

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# Captures "<prefix>(<inside>)" so the parenthesised part can be moved in front of the prefix.
PARENTHESIS_RE = re.compile(r'(.*?)\((.*?)\)')
# Captures a lowercase letter immediately followed by an uppercase one (a camelCase boundary).
CAMELCASE_RE = re.compile(r'([a-z])([A-Z])')

def preprocess_so(so):
    """Turn a raw identifier-like string into a more readable surface form.

    `so` is presumably a subject/object string from the data set (TODO confirm
    against callers). The steps, in order:

      1. every ``prefix(inside)`` span is rewritten as ``inside prefix``;
      2. underscores are replaced by spaces;
      3. a space is inserted at each lowercase->uppercase boundary;
      4. leading/trailing double quotes and spaces are stripped.

    Example: ``'Andrew_White_(musician)'`` -> ``'musician Andrew White'``.
    """
    # Raw strings are required here: '\g' is not a valid Python string escape, so the
    # non-raw form only worked by accident and emits an invalid-escape warning.
    parenthesis_preprocessed = PARENTHESIS_RE.sub(r'\g<2> \g<1>', so)
    underline_removed = parenthesis_preprocessed.replace('_', ' ')
    camelcase_preprocessed = CAMELCASE_RE.sub(r'\g<1> \g<2>', underline_removed)

    return camelcase_preprocessed.strip('" ')

In [52]:
class TemplateBasedModel:
    """Pipeline that structures input data, selects templates and makes text from them.

    The three stages are delegated to `StructureData`, `SelectTemplate` and
    `MakeText`, which are created in `__init__` and stored as `ss`, `st` and `mt`.
    """

    def __init__(self, template_db, lexicalization_f):
        """Wire up the pipeline stages.

        Args:
            template_db: mapping handed (wrapped in a fallback chain) to `StructureData`.
            lexicalization_f: callable forwarded to `MakeText` as `lexicalization_f`.
        """
        # Subscript lookups (`db[key]`) that miss in `template_db` fall through to this
        # defaultdict, which yields a Counter holding a single JustJoinTemplate.
        # Note: `key in db` and `db.get(key)` do NOT trigger the fallback.
        fallback_template_db = defaultdict(lambda: Counter([JustJoinTemplate()]))

        self.template_db = ChainMap(template_db, fallback_template_db)
        self.ss = StructureData(self.template_db)
        self.st = SelectTemplate()
        self.mt = MakeText(lexicalization_f=lexicalization_f)

    def predict(self, X):
        """Generate one text per element of `X`.

        Args:
            X: iterable of objects exposing a `.data` attribute.

        Returns:
            list with the generated text of each element, in input order.

        Raises:
            Whatever any pipeline stage raises; errors are propagated unchanged
            (the previous `except Exception as ex: raise ex` wrapper was a no-op).
        """
        result = []

        for x in X:
            structured_data = self.ss.structure(x.data)
            selected_templates = self.st.select_template(structured_data)
            text = self.mt.make_text(selected_templates)

            result.append(text)

        return result

In [3]:
import pickle

def _load_pickle(path):
    """Deserialize and return the object stored in the pickle file at `path`.

    NOTE(security): unpickling can execute arbitrary code. Only load files that
    were produced by this project; never point this at untrusted input.
    """
    with open(path, 'rb') as f:
        return pickle.load(f)


# Databases are read from pickle files located relative to the working directory.
# No `= None` pre-assignment: if a file is missing, the name stays undefined instead
# of silently holding None for later cells.
template_db = _load_pickle('thiago_template_db')
template_enhanced_db = _load_pickle('thiago_enhanced_template_db')
lexicalization_db = _load_pickle('thiago_lexicalization_db')

In [54]:
def lexicalize(s):
    """Return the surface form to use for `s`.

    If `s` has an entry in `lexicalization_db`, the most common item of that
    entry is returned. Otherwise (unknown key, or an entry with no items) `s`
    is cleaned up with `preprocess_so`.
    """
    if s in lexicalization_db:
        # most_common(1) reads the top entry without sorting the whole counter,
        # and yields an empty list (instead of an IndexError) for an empty entry.
        top = lexicalization_db[s].most_common(1)
        if top:
            return top[0][0]

    return preprocess_so(s)

In [55]:
# Model backed by the base template DB; `lexicalize` is passed as the lexicalization function.
tbm = TemplateBasedModel(template_db=template_db, lexicalization_f=lexicalize)

In [56]:
%%time
# Evaluate `tbm`; the string is the name given to this run.
# NOTE(review): presumably this also writes the ../model/template-based-model-thiago*.txt
# files read by later cells — confirm in the evaluation module.
evaluate_model(tbm, 'template-based-model-thiago')

CPU times: user 203 ms, sys: 375 ms, total: 578 ms
Wall time: 23.3 s


{'bleu': 42.33, 'meteor': 0.4009372192995391, 'ter': 0.5736199034076689}

In [14]:
%%time
# One evaluate_texts result per evaluation set, keyed by the set's name.
results = {
    eval_set: evaluate_texts(f'../model/template-based-model-thiago_{eval_set}.txt', eval_set)
    for eval_set in EVALUATION_SETS
}

# Tabulate the results: one column per evaluation set.
df = pd.DataFrame(results)
df

CPU times: user 78.1 ms, sys: 1.23 s, total: 1.31 s
Wall time: 2min 15s


In [22]:
# Rebind `tbm` to a model backed by the enhanced template DB (same lexicalization function).
tbm = TemplateBasedModel(template_db=template_enhanced_db, lexicalization_f=lexicalize)

In [23]:
%%time
# Evaluate the enhanced-DB `tbm`; the string is the name given to this run.
# NOTE(review): presumably this also writes the ../model/template-based-model-enhanced-thiago*.txt
# files read by later cells — confirm in the evaluation module.
evaluate_model(tbm, 'template-based-model-enhanced-thiago')

CPU times: user 234 ms, sys: 328 ms, total: 562 ms
Wall time: 21 s


{'bleu': 42.39, 'meteor': 0.40095348475510767, 'ter': 0.5716341806007189}

In [16]:
%%time
# One evaluate_texts result per evaluation set, keyed by the set's name.
results = {
    eval_set: evaluate_texts(f'../model/template-based-model-enhanced-thiago_{eval_set}.txt', eval_set)
    for eval_set in EVALUATION_SETS
}

# Tabulate the results: one column per evaluation set.
df = pd.DataFrame(results)
df

CPU times: user 31.2 ms, sys: 1.16 s, total: 1.19 s
Wall time: 2min 28s


In [28]:
!tail -5 ../model/template-based-model-thiago.txt

Polydor Records is located in london. Alternative rock music Fusion Genre Nu metal. musician Andrew White record Label Polydor Records. musician Andrew White record Label defunct record label Universal Records. musician Andrew White is Alternative rock.
Rock music music Fusion Genre music Bhangra. NRBQ band Al Anderson instrument Guitar. NRBQ band Al Anderson is Rock music. Rock music stylistic Origin Country music. Country music stylistic Origin Blues.
Rock music stylistic Origin Blues. Rock music music Fusion Genre music Bhangra. Country music instrument Banjo. NRBQ band Al Anderson is Rock music. Rock music stylistic Origin Country music.
band Twilight is Black metal. Aaron Turner associated Band/associated Musical Artist band Twilight. Aaron Turner associated Band/associated Musical Artist Old Man Gloom. Aaron Turner instrument Electric guitar. Black metal music Fusion Genre Death metal.
Raúl Fernando Sendic Rodríguez is the leader of Uruguay. Alfredo Zitarrosa died in Montevid

In [27]:
!tail -5 ../model/template-based-model-enhanced-thiago.txt

Polydor Records is located in london. Alternative rock music Fusion Genre Nu metal. musician Andrew White record Label Polydor Records. musician Andrew White record Label defunct record label Universal Records. musician Andrew White is Alternative rock.
Rock music music Fusion Genre music Bhangra. NRBQ band Al Anderson instrument Guitar. NRBQ band Al Anderson is Rock music. Rock music stylistic Origin Country music. Country music stylistic Origin Blues.
Rock music stylistic Origin Blues. Rock music music Fusion Genre music Bhangra. Country music instrument Banjo. NRBQ band Al Anderson is Rock music. Rock music stylistic Origin Country music.
band Twilight is Black metal. Aaron Turner associated Band/associated Musical Artist band Twilight. Aaron Turner associated Band/associated Musical Artist Old Man Gloom. Aaron Turner instrument Electric guitar. Black metal music Fusion Genre Death metal.
Raúl Fernando Sendic Rodríguez is the leader of Uruguay. Alfredo Zitarrosa died in Montevid