In [1]:
import os
import pandas as pd

os.sys.path.extend(['../template_model/', '../evaluation'])

from evaluate import preprocess_model_to_evaluate
from util import preprocess_so, clear_dir
from collections import ChainMap, defaultdict, Counter
from template_based2 import JustJoinTemplate, StructureData, MakeText
import pickle

In [2]:
class TemplateBasedModel:
    """Two-stage text generator: structure the triples, then make text.

    Holds a ``StructureData`` (built with a ``JustJoinTemplate``) and a
    ``MakeText`` instance, both imported from ``template_based2``.
    """

    def __init__(self, template_db, lexicalization_f):
        """Create the structuring and text-making stages.

        Args:
            template_db: template database handed to ``StructureData``.
            lexicalization_f: callable handed to ``MakeText`` as its
                ``lexicalization_f`` argument.
        """
        self.ss = StructureData(template_db, JustJoinTemplate())
        self.mt = MakeText(lexicalization_f=lexicalization_f)

    def predict(self, X):
        """Generate text for every entry of ``X``.

        Args:
            X: iterable of mappings, each providing a ``'triples'`` key.

        Returns:
            list: one ``make_text`` result per entry, in input order.
        """
        return [
            self.mt.make_text(self.ss.structure(entry['triples']))
            for entry in X
        ]
    
    
def _load_pickle(path):
    """Return the object unpickled from the binary file at ``path``."""
    with open(path, 'rb') as pickled_file:
        return pickle.load(pickled_file)


# NOTE(review): pickle.load can execute arbitrary code while loading; only
# point these paths at files produced by this project.
template_db = _load_pickle('../data/templates/template_db/thiago_template_db2')
name_db = _load_pickle('../data/templates/lexicalization/thiago_name_db')
pronoun_db = _load_pickle('../data/templates/lexicalization/thiago_pronoun_db')
test = _load_pickle('../evaluation/test.pkl')

In [3]:
def lexicalize(s, ctx):
    """Pick the surface form for entity ``s`` given what was already mentioned.

    First mention: ``s`` is recorded in ``ctx['seen']`` and the most common
    entry of ``name_db[s]`` is returned, or ``preprocess_so(s)`` when ``s`` is
    not in ``name_db``.
    Later mentions: the most common entry of ``pronoun_db[s]`` is returned, or
    an empty string when ``s`` is not in ``pronoun_db``.

    Args:
        s: entity key looked up in ``name_db`` / ``pronoun_db``.
        ctx: mapping whose ``'seen'`` entry supports ``in`` and ``.add``.

    Returns:
        The chosen text for ``s`` (possibly ``''``).
    """
    first_mention = s not in ctx['seen']

    if first_mention:
        ctx['seen'].add(s)

        if s not in name_db:
            return preprocess_so(s)

        ranked_names = name_db[s].most_common()
        return ranked_names[0][0]

    # Repeated mention: fall back to nothing when no pronoun is known.
    if s not in pronoun_db:
        return ''

    ranked_pronouns = pronoun_db[s].most_common()
    return ranked_pronouns[0][0]
    
# Model wired to the name/pronoun lexicalizer defined in this cell.
tbm = TemplateBasedModel(template_db, lexicalize)

In [5]:
# Keep the predictions for the whole test set in their own list.
generated_texts = list(tbm.predict(test))

In [6]:
MODEL_NAME = 'abe-2'

# Build each path once so the directory and the output file cannot drift apart.
model_dir = f'../data/models/{MODEL_NAME}/'
model_txt = f'{model_dir}{MODEL_NAME}.txt'

if os.path.isdir(model_dir):
    # Reuse the existing directory; clear_dir comes from util and presumably
    # empties it -- confirm against util.clear_dir.
    clear_dir(model_dir)
else:
    # makedirs (unlike mkdir) also creates '../data/models' when it is missing.
    os.makedirs(model_dir)

with open(model_txt, 'w', encoding='utf-8') as f:

    # One line per test entry: only the first generated text is written.
    for texts in generated_texts:

        f.write("{}\n".format(texts[0]))

preprocess_model_to_evaluate(model_txt)

In [18]:
# Persist every generated text (the .txt file only keeps the first one per
# entry) as a pickle, for later use by the model.
# The directory follows MODEL_NAME so it stays in sync with the cell that
# creates it instead of repeating the 'abe-2' literal.
with open(f'../data/models/{MODEL_NAME}/milhoes_de_textos', 'wb') as f:
    pickle.dump(generated_texts, f)

# Templates with rule 1

In [7]:
# Swap in the rule-1 template database and rebuild the model around it.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open('../data/templates/template_db/thiago_template_db_rule_1', 'rb') as rule_1_file:
    template_db = pickle.load(rule_1_file)

tbm = TemplateBasedModel(template_db, lexicalize)

In [8]:
%%time
# Paths are spelled out once so the directory and the file stay in sync.
model_dir = '../data/models/abe-2/'
model_txt = '../data/models/abe-2/abe-2.txt'

if os.path.isdir(model_dir):
    clear_dir(model_dir)
else:
    # makedirs (unlike mkdir) also creates '../data/models' when it is missing.
    os.makedirs(model_dir)

with open(model_txt, 'w', encoding='utf-8') as f:

    # NOTE(review): the MODEL_NAME cell writes texts[0] for each prediction,
    # while this loop formats the whole prediction object -- confirm which one
    # matches what MakeText.make_text returns.
    for text in tbm.predict(test):

        f.write("{}\n".format(text))

preprocess_model_to_evaluate(model_txt)

CPU times: user 344 ms, sys: 31.2 ms, total: 375 ms
Wall time: 533 ms


# Adding a rule -> if an entity has already appeared, use '' instead of lexicalizing it

In [4]:
def lexicalize_seen_to_it(s, category, ctx):
    """Lexicalize entity ``s`` once; every later mention becomes ``''``.

    On the first mention ``s`` is recorded in ``ctx['seen']`` and the text is
    chosen from ``lexicalization_db[s]``:

    * the most common entry for ``category`` when that category is present;
    * otherwise the most common entry of the first category stored for ``s``;
    * ``preprocess_so(s)`` when ``s`` is unknown or has no categories at all
      (the latter used to raise ``IndexError``).

    Args:
        s: entity key looked up in ``lexicalization_db``.
        category: key selecting one counter inside ``lexicalization_db[s]``.
        ctx: mapping whose ``'seen'`` entry supports ``in`` and ``.add``.

    Returns:
        The chosen text for ``s``, or ``''`` if ``s`` was already seen.
    """
    if s in ctx['seen']:
        return ''

    ctx['seen'].add(s)

    # NOTE(review): lexicalization_db is not assigned in any visible cell of
    # this notebook -- it must be loaded before this function is called.
    if s not in lexicalization_db:
        return preprocess_so(s)

    by_category = lexicalization_db[s]

    if category in by_category:
        lexis = by_category[category]
    else:
        # Unknown category: fall back to the first stored one, if any.
        lexis = next(iter(by_category.values()), None)
        if lexis is None:
            return preprocess_so(s)

    # most_common(1) avoids sorting the whole counter just to read the top entry.
    return lexis.most_common(1)[0][0]
    
# NOTE(review): lexicalize_seen_to_it takes (s, category, ctx) whereas
# lexicalize takes (s, ctx) -- confirm MakeText calls its lexicalization_f
# with the three-argument form before running this cell.
tbm = TemplateBasedModel(template_db=template_db, lexicalization_f=lexicalize_seen_to_it)

# Paths are spelled out once so the directory and the file stay in sync.
model_dir = '../data/models/abe-2/'
model_txt = '../data/models/abe-2/abe-2.txt'

if os.path.isdir(model_dir):
    clear_dir(model_dir)
else:
    # makedirs (unlike mkdir) also creates '../data/models' when it is missing.
    os.makedirs(model_dir)

with open(model_txt, 'w', encoding='utf-8') as f:

    # NOTE(review): the MODEL_NAME cell writes texts[0] for each prediction,
    # while this loop formats the whole prediction object -- confirm which one
    # matches what MakeText.make_text returns.
    for text in tbm.predict(test):

        f.write("{}\n".format(text))

preprocess_model_to_evaluate(model_txt)