In [12]:
%load_ext autoreload
%autoreload 2

import os
import sys

# Make the project's helper modules in ../script importable from this notebook.
# Use the sys module directly: `os.sys` only works because os happens to import
# sys internally, which is not part of os's documented interface.
sys.path.insert(0, '../script')

from reading_thiagos_templates import read_thiagos_xml_entries, make_template, get_lexicalizations
from collections import ChainMap, defaultdict, Counter
import glob

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [13]:
# Gather every delexicalized WebNLG v1.4 XML file from the train and dev splits.
# glob yields paths in filesystem-dependent order, so each split is sorted to keep
# the databases' insertion order (and therefore Counter.most_common() tie-breaking)
# reproducible across runs and machines. Train files still come before dev files.
filepaths = sorted(glob.glob('../../webnlg/data/delexicalized/v1.4/train/**/*.xml', recursive=True))
filepaths.extend(sorted(glob.glob('../../webnlg/data/delexicalized/v1.4/dev/**/*.xml', recursive=True)))

# make_template failures carrying exactly one of these messages are expected and
# skipped; any other exception propagates.
IGNORED_TEMPLATE_ERRORS = ('triples must contain only one root',
                           'structure and template_text must match slots')

# template structure -> Counter of the templates built with that structure
template_db = defaultdict(Counter)
# key returned by get_lexicalizations -> Counter fed with the values returned for it
lexicalization_db = defaultdict(Counter)

# filepath -> entries for which get_lexicalizations returned nothing
# (an entry is appended once per failing lexe)
failed_lexicalizations = defaultdict(list)

for filepath in filepaths:

    entries = read_thiagos_xml_entries(filepath)

    for entry in entries:

        for lexe in entry['lexes']:

            # Only lexes marked 'good', on entries with a non-empty entity map, are used.
            if lexe['comment'] != 'good' or not entry['entity_map']:
                continue

            try:
                t = make_template(entry['triples'],
                                  lexe['text'],
                                  lexe['template'],
                                  entry['r_entity_map'],
                                  metadata={'filepath': filepath})
                template_db[t.structure][t] += 1
            except Exception as ex:
                if str(ex) in IGNORED_TEMPLATE_ERRORS:
                    # A lexe whose template is rejected is skipped entirely,
                    # so it contributes no lexicalizations either.
                    continue
                # Bare raise re-raises the active exception unchanged.
                raise

            lexicals = get_lexicalizations(lexe['text'], lexe['template'], entry['entity_map'])

            if not lexicals:
                failed_lexicalizations[filepath].append(entry)
                continue

            for lex_key, lex_value in lexicals.items():
                lexicalization_db[lex_key].update(lex_value)


# Convert to plain dicts so that looking up a missing key raises KeyError
# instead of silently inserting an empty Counter.
template_db = dict(template_db)
lexicalization_db = dict(lexicalization_db)

In [20]:
from template_based import Structure

# Three triples: AGENT-1 points at BRIDGE-1, which in turn is the subject of
# two further triples (capital and language).
triples = [
    dict(subject='AGENT-1', predicate='country', object='BRIDGE-1'),
    dict(subject='BRIDGE-1', predicate='capital', object='PATIENT-1'),
    dict(subject='BRIDGE-1', predicate='language', object='PATIENT-2'),
]

# Look up the Counter of templates stored under the structure of these triples.
templates = template_db[Structure.from_triples(triples)]

# Print each template's text with its count, most frequent first.
for ranked_template, occurrences in templates.most_common():
    print(ranked_template.template_text, occurrences)

{AGENT-1} is located in {BRIDGE-1}, {PATIENT-1} and the language is {PATIENT-2}. 1
{PATIENT-2} is the language of {BRIDGE-1} where {AGENT-1} is located and the capital city is {PATIENT-1}. 1
{AGENT-1} is located in {BRIDGE-1} which has the capital city of {PATIENT-1}. The language spoken in {BRIDGE-1} is {PATIENT-2}. 1
{AGENT-1} is from {BRIDGE-1}, where the capital is {PATIENT-1} and {PATIENT-2} is the language . 1
{AGENT-1} can be found in {BRIDGE-1} which has the capital city of {PATIENT-1} and uses {PATIENT-2}. 1
{BRIDGE-1} is home to {PATIENT-2}, the capital {PATIENT-1} and {AGENT-1}. 1


In [16]:
# Display the Counter stored in lexicalization_db under a single key.
lexicalization_db['Alcatraz_Versus_the_Evil_Librarians']

Counter({'Alcatraz Versus the Evil Librarians': 69,
         'Alcatrz Versus the Evil Librarians': 1,
         'Alcatraz Versus The Evil Librarian': 3,
         'Alcatraz versus the Evil Librarians': 2,
         'The book Alcatraz Versus the Evil Librarians': 20,
         "The book 'Alcatraz Versus the Evil Librarian'": 1,
         'its': 2,
         'the book Alcatraz Versus the Evil Librarians': 7,
         'it': 4,
         'Alcatraz versus the Evil LIbrarians': 1,
         'Alcatraz Versus the Evil Libraries': 1,
         'The book': 3,
         'Alcatraz Versus the Evil Librarian': 1,
         'D.C. Alcatraz Versus the Evil Librarians': 1,
         'a book Alcatraz Versus the Evil Librarians': 1,
         'This book': 1})

In [17]:
import pickle

# Persist both databases to files in the current working directory,
# the template database first, then the lexicalization database.
for db_filename, db in (('thiago_template_db', template_db),
                        ('thiago_lexicalization_db', lexicalization_db)):
    with open(db_filename, 'wb') as out_file:
        pickle.dump(db, out_file)

In [18]:
import pickle

# Reload both databases from the pickle files in the current working directory.
# Each name is reset to None right before its file is opened, so a failed load
# leaves None behind rather than a previously held in-memory value.
# NOTE: pickle.load can execute arbitrary code; only load files you produced yourself.
template_db = None
with open('thiago_template_db', 'rb') as template_file:
    template_db = pickle.load(template_file)

lexicalization_db = None
with open('thiago_lexicalization_db', 'rb') as lexicalization_file:
    lexicalization_db = pickle.load(lexicalization_file)