In [1]:
import os
import pandas as pd

os.sys.path.extend(['../template_model/', '../evaluation'])

from evaluate import preprocess_model_to_evaluate
from util import preprocess_so, clear_dir
from collections import ChainMap, defaultdict, Counter
from template_based import JustJoinTemplate, StructureData, SelectTemplate, MakeText
import pickle

In [2]:
class TemplateBasedModel:
    """Text generator that chains three stages from `template_based`.

    Every input entry goes through ``StructureData.structure``, then
    ``SelectTemplate.select_template`` and finally ``MakeText.make_text``.
    """

    def __init__(self, template_db, lexicalization_f):
        """Create the three pipeline stages.

        Args:
            template_db: template database handed to ``StructureData``.
            lexicalization_f: callable forwarded to ``MakeText`` as its
                ``lexicalization_f`` argument.
        """
        # A Counter holding a single JustJoinTemplate (count 1) is the second
        # argument of StructureData -- presumably the fallback used when the
        # database has no matching template; confirm in template_based.
        join_templates = Counter([JustJoinTemplate()])

        self.ss = StructureData(template_db, join_templates)
        self.st = SelectTemplate()
        self.mt = MakeText(lexicalization_f=lexicalization_f)

    def _predict_one(self, entry):
        """Run one entry through structure -> select template -> make text."""
        structured = self.ss.structure(entry)
        chosen = self.st.select_template(structured)
        return self.mt.make_text(chosen, entry)

    def predict(self, X):
        """Return a list with the generated text of every entry in `X`, in order.

        Args:
            X: iterable of entries accepted by ``StructureData.structure``.

        Returns:
            list: one ``make_text`` result per entry.
        """
        return [self._predict_one(entry) for entry in X]
    
    
def _load_pickle(path):
    """Open `path` in binary mode and return the object unpickled from it."""
    # pickle.load can execute arbitrary code while deserializing, so this must
    # only ever be pointed at files produced by this project.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Template database, lexicalization database and test set used by the cells below.
template_db = _load_pickle('../data/templates/template_db/thiago_template_db')
lexicalization_db = _load_pickle('../data/templates/lexicalization/thiago_lexicalization_db')
test = _load_pickle('../evaluation/test.pkl')

In [3]:
def lexicalize(s, category, ctx):
    """Return the most frequent lexicalization recorded for entity `s`.

    Reads the module-level ``lexicalization_db``, which is indexed as
    ``lexicalization_db[s][category]`` and must yield objects exposing
    ``most_common()`` (e.g. ``collections.Counter``).

    Args:
        s: entity identifier to lexicalize.
        category: preferred category key inside ``lexicalization_db[s]``.
        ctx: accepted for interface compatibility; not used here.

    Returns:
        The top ``most_common()`` entry for `category` when that category is
        present, otherwise the top entry of the first category stored for
        `s`. When `s` is not in the database, ``preprocess_so(s)`` is returned.
    """
    from itertools import islice

    # Unknown entity: defer entirely to preprocess_so.
    if s not in lexicalization_db:
        return preprocess_so(s)

    per_category = lexicalization_db[s]

    if category in per_category:
        counts = per_category[category]
    else:
        # Category not recorded for this entity: use the first stored one.
        counts = list(islice(per_category.values(), 0, 1))[0]

    ranked = counts.most_common()
    return ranked[0][0]
    
# Model using the loaded template database and the plain `lexicalize` function.
tbm = TemplateBasedModel(template_db=template_db, lexicalization_f=lexicalize)

In [4]:
# Prepare the output directory: create it when missing, otherwise pass it
# through clear_dir (presumably emptying it -- see util.clear_dir).
if not os.path.isdir('../data/models/abe-1'):
    os.mkdir('../data/models/abe-1/')
else:
    clear_dir('../data/models/abe-1/')

# One generated text per line, in test-set order. The file is opened before
# predict() runs, exactly as a plain write loop would do.
with open('../data/models/abe-1/abe-1.txt', 'w', encoding='utf-8') as out_file:
    out_file.writelines("{}\n".format(text) for text in tbm.predict(test))

# Hand the written file to the evaluation preprocessing step.
preprocess_model_to_evaluate('../data/models/abe-1/abe-1.txt')

In [14]:
# Spot-check: generate the text for the single test entry at index 549.
tbm.predict([test[549]])

['bacon explosion comes from the united states, whose capital is washington dc and one of whose leaders is john roberts.']

In [7]:
from template_based import Structure

In [15]:
# Build the Structure for the triples of test entry 549.
Structure.from_triples(test[549]['triples'])

[Bacon_Explosion, 

	<country, [
		[United_States, 

			<leaderName, [John_Roberts]>,
			<capital, [Washington,_D.C.]>]]>]

In [17]:
# Look up that structure in template_db and show what is stored under the
# 'Bacon_Explosion' key (per the cell output, a Counter of templates).
template_db[Structure.from_triples(test[549]['triples'])]['Bacon_Explosion']

Counter({Structure: [AGENT-1, 
         
         	<country, [
         		[BRIDGE-1, 
         
         			<leaderName, [PATIENT-1]>,
         			<capital, [PATIENT-2]>]]>]
         Text: {AGENT-1} comes from {BRIDGE-1}, whose capital is {PATIENT-2} and one of whose leaders is {PATIENT-1}.: 1,
         Structure: [AGENT-1, 
         
         	<country, [
         		[BRIDGE-1, 
         
         			<leaderName, [PATIENT-1]>,
         			<capital, [PATIENT-2]>]]>]
         Text: {AGENT-1} comes from {BRIDGE-1} where the leader is {PATIENT-1} and the capital city is {PATIENT-2}.: 1,
         Structure: [AGENT-1, 
         
         	<country, [
         		[BRIDGE-1, 
         
         			<leaderName, [PATIENT-1]>,
         			<capital, [PATIENT-2]>]]>]
         Text: {AGENT-1} is a dish from {BRIDGE-1} where {PATIENT-1} is a political leader and the capital city is {PATIENT-2}.: 1,
         Structure: [AGENT-1, 
         
         	<country, [
         		[BRIDGE-1, 
         
        

In [9]:
# Inspect what the model's structuring stage (StructureData.structure)
# returns for test entry 431.
tbm.ss.structure(test[431])

[([A_Wizard_of_Mars, 
  
  	<country, [
  		[United_States, 
  
  			<ethnicGroup, [Asian_Americans]>,
  			<capital, [Washington,_D.C.]>]]>], Counter({Structure: [AGENT-1, 
           
           	<country, [
           		[BRIDGE-1, 
           
           			<ethnicGroup, [PATIENT-1]>,
           			<capital, [PATIENT-2]>]]>]
           Text: One of the ethnic Groups of {BRIDGE-1} (where the capital city is {PATIENT-2}) is {PATIENT-1}. Additionally , {AGENT-1} originates from {BRIDGE-1}.: 1,
           Structure: [AGENT-1, 
           
           	<country, [
           		[BRIDGE-1, 
           
           			<ethnicGroup, [PATIENT-1]>,
           			<capital, [PATIENT-2]>]]>]
           Text: {AGENT-1} was published in {BRIDGE-1}, where the capital is {PATIENT-2}, {PATIENT-1} are one of the ethnic groups there .: 1,
           Structure: [AGENT-1, 
           
           	<country, [
           		[BRIDGE-1, 
           
           			<ethnicGroup, [PATIENT-1]>,
           			<capita

# Templates with rule 1

In [7]:
# NOTE(review): this rebinds `template_db`, replacing the database loaded
# earlier in the notebook; cells run after this one see the rule-1 templates.
with open('../data/templates/template_db/thiago_template_db_rule_1', 'rb') as db_file:
    template_db = pickle.load(db_file)

# Rebuild the model on top of the rule-1 template database.
tbm = TemplateBasedModel(lexicalization_f=lexicalize, template_db=template_db)

In [8]:
%%time
# Prepare the output directory: create it when missing, otherwise pass it
# through clear_dir (presumably emptying it -- see util.clear_dir).
if not os.path.isdir('../data/models/abe-2'):
    os.mkdir('../data/models/abe-2/')
else:
    clear_dir('../data/models/abe-2/')

# One generated text per line, in test-set order. The file is opened before
# predict() runs, exactly as a plain write loop would do.
with open('../data/models/abe-2/abe-2.txt', 'w', encoding='utf-8') as out_file:
    out_file.writelines("{}\n".format(text) for text in tbm.predict(test))

# Hand the written file to the evaluation preprocessing step.
preprocess_model_to_evaluate('../data/models/abe-2/abe-2.txt')

CPU times: user 344 ms, sys: 31.2 ms, total: 375 ms
Wall time: 533 ms


# Adicionando uma regra -> se uma entidade já apareceu, usar '' no lugar de lexicalizar ela

In [4]:
def lexicalize_seen_to_it(s, category, ctx):
    """Lexicalize `s` on first sight; return '' for any repeated entity.

    ``ctx['seen']`` is the set of entities already lexicalized. It is read
    and updated in place, so `ctx` must contain a ``'seen'`` entry supporting
    ``in`` and ``add``.

    Args:
        s: entity identifier to lexicalize.
        category: preferred category key inside ``lexicalization_db[s]``.
        ctx: mapping carrying the ``'seen'`` collection.

    Returns:
        ``''`` when `s` is already in ``ctx['seen']``. Otherwise `s` is added
        to ``ctx['seen']`` and the result is the top ``most_common()`` entry
        for `category`, or of the first category stored for `s` when
        `category` is absent, or ``preprocess_so(s)`` when `s` is not in
        ``lexicalization_db``.
    """
    already_seen = ctx['seen']

    if s in already_seen:
        return ''

    already_seen.add(s)

    from itertools import islice

    # Unknown entity: defer entirely to preprocess_so.
    if s not in lexicalization_db:
        return preprocess_so(s)

    per_category = lexicalization_db[s]

    if category in per_category:
        counts = per_category[category]
    else:
        # Category not recorded for this entity: use the first stored one.
        counts = list(islice(per_category.values(), 0, 1))[0]

    ranked = counts.most_common()
    return ranked[0][0]
    
# Rebuild the model so repeated entities are lexicalized as ''.
tbm = TemplateBasedModel(lexicalization_f=lexicalize_seen_to_it, template_db=template_db)

# NOTE(review): these are the same abe-2 paths used by the rule-1 run earlier
# in the notebook, so that run's output is overwritten here -- confirm whether
# a separate model directory was intended.
if not os.path.isdir('../data/models/abe-2'):
    os.mkdir('../data/models/abe-2/')
else:
    clear_dir('../data/models/abe-2/')

# One generated text per line, in test-set order. The file is opened before
# predict() runs, exactly as a plain write loop would do.
with open('../data/models/abe-2/abe-2.txt', 'w', encoding='utf-8') as out_file:
    out_file.writelines("{}\n".format(text) for text in tbm.predict(test))

# Hand the written file to the evaluation preprocessing step.
preprocess_model_to_evaluate('../data/models/abe-2/abe-2.txt')