In [1]:
%load_ext autoreload
%autoreload 2

import os
os.sys.path.insert(0, '../script')

from reading_thiagos_templates import read_thiagos_xml_entries, make_template, get_lexicalizations
from collections import ChainMap, defaultdict, Counter
import glob

In [2]:
filepaths = glob.glob('../../webnlg/data/delexicalized/v1.4/train/**/*.xml', recursive=True)
filepaths.extend(glob.glob('../../webnlg/data/delexicalized/v1.4/dev/**/*.xml', recursive=True))

template_db = defaultdict(lambda: defaultdict(Counter))
#lexicalization_db = defaultdict(Counter)

#failed_lexicalizations = defaultdict(list)

for filepath in filepaths:
    
    entries = read_thiagos_xml_entries(filepath)
    
    for entry in entries:
        
        for lexe in entry['lexes']:
            
            if lexe['comment'] == 'good':
                
                if entry['entity_map']:
            
                    try:
                        t = make_template(entry['triples'], 
                                          lexe['text'], 
                                          lexe['template'], 
                                          entry['r_entity_map'], 
                                          metadata={'filepath': filepath})
                        template_db[t.structure][entry['category']][t] += 1
                    except Exception as ex:
                        if str(ex) in ['triples must contain only one root', 'structure and template_text must match slots']:
                            continue
                        raise ex

                    #lexicals = get_lexicalizations(lexe['text'], lexe['template'], entry['entity_map'])
                    
                    #if lexicals:

                        #for lex_key, lex_value in lexicals.items():

                            #lexicalization_db[lex_key].update(lex_value)
                    #else:
                        #failed_lexicalizations[filepath].append(entry)

                        
template_db = dict(template_db)
#lexicalization_db = dict(lexicalization_db)

In [9]:
from template_based import *

triples = [{'subject': 'AGENT-1', 'predicate': 'country', 'object': 'BRIDGE-1'},
           {'subject': 'BRIDGE-1', 'predicate': 'capital', 'object': 'PATIENT-1'},
           {'subject': 'BRIDGE-1', 'predicate': 'language', 'object': 'PATIENT-2'}]

templates = template_db[Structure.from_triples(triples)]['Building']

for template, count in templates.most_common():
    
    print(template.template_text, count)

{AGENT-1} is located in {BRIDGE-1}, {PATIENT-1} and the language is {PATIENT-2}. 1
{PATIENT-2} is the language of {BRIDGE-1} where {AGENT-1} is located and the capital city is {PATIENT-1}. 1
{AGENT-1} is located in {BRIDGE-1} which has the capital city of {PATIENT-1}. The language spoken in {BRIDGE-1} is {PATIENT-2}. 1


In [10]:
lexicalization_db['Alcatraz_Versus_the_Evil_Librarians']

Counter({'alcatraz versus the evil librarians': 72,
         'alcatrz versus the evil librarians': 1,
         'alcatraz versus the evil librarian': 4,
         'the book alcatraz versus the evil librarians': 27,
         "the book 'alcatraz versus the evil librarian'": 1,
         'its': 2,
         'it': 4,
         'alcatraz versus the evil libraries': 1,
         'the book': 3,
         'd.c. alcatraz versus the evil librarians': 1,
         'a book alcatraz versus the evil librarians': 1,
         'this book': 1})

In [3]:
import pickle

with open('thiago_template_db_category', 'wb') as f:
    pickle.dump(template_db, f)

In [11]:
import pickle

with open('thiago_template_db', 'wb') as f:
    pickle.dump(template_db, f)
    
with open('thiago_lexicalization_db', 'wb') as f:
    pickle.dump(lexicalization_db, f)

# Enhancing

In [7]:
import pickle

with open('thiago_template_db_category', 'rb') as f:
    template_enhanced_db = pickle.load(f)

In [8]:
import re
from template_based import *

RE_REMOVE_FINAL_DOT = re.compile(r'\.$')
# assumes the sentence is in active voice
RE_REMOVE_AGENT_1 = re.compile(r'^.*?{AGENT-1}')

def make_text(t1, t2):
    
    t1_ = RE_REMOVE_FINAL_DOT.sub('', t1)
    
    t2_ = RE_REMOVE_AGENT_1.sub('', t2).replace('{PATIENT-1}', '{PATIENT-2}')
    
    return '{} and {}'.format(t1_, t2_)

def make_structure(h1, h2):
    
    o1 = Slot('PATIENT-1', [])
    p1 = Predicate(h1.predicates[0].value, [o1])
    o2 = Slot('PATIENT-2', [])
    p2 = Predicate(h2.predicates[0].value, [o2])
    
    s = Slot('AGENT-1', [p1, p2])
    
    return Structure(s)

def make_new_template(t1, t2):
    
    template_text = make_text(t1.template_text, t2.template_text)
    structure = make_structure(t1.structure.head, t2.structure.head)
    
    return Template(structure, template_text, None)

In [9]:
from itertools import combinations

templates_w_1_size = [t for s, t in template_enhanced_db.items() if len(s) == 1]

#probably passive voice
w_error = []

for t1_c, t2_c in combinations(templates_w_1_size, 2):
    
    t1 = t1_c.most_common(1)[0][0]
    t2 = t2_c.most_common(1)[0][0]
    
    try:
        t12 = make_new_template(t1, t2)
        template_enhanced_db[t12.structure][t12] +=1
    except:
        w_error.append((t1, t2))
    
    try:
        t21 = make_new_template(t2, t1)
        template_enhanced_db[t21.structure][t21] +=1
    except:
        w_error.append((t2, t1))

AttributeError: 'collections.defaultdict' object has no attribute 'most_common'

In [6]:
with open('thiago_enhanced_template_db', 'wb') as f:
    pickle.dump(dict(template_enhanced_db), f)

# Testing spacy string dependency tree parsing

In [3]:
import spacy

nlp = spacy.load('en')

In [28]:
t = list(template_db.values())[400].most_common()[0][0]

t

Structure: [AGENT-1, 

	<tenant, [
		[BRIDGE-1, 

			<foundationPlace, [PATIENT-1]>]]>]
Text: {BRIDGE-1} which was founded in {PATIENT-1} is the tenant of {AGENT-1}.

In [35]:
from spacy import displacy

# weird, spacy doesn't respect special cases if they appear in the end of the string, preceeded by a dot...
import re
c = re.compile(r'\.$')

for s in ['{AGENT-1}', '{PATIENT-1}', '{BRIDGE-1}']:
    special_case = [{'ORTH': s, 'TAG': 'NN'}]
    nlp.tokenizer.add_special_case(s, special_case)

doc = nlp(c.sub('', t.template_text))

displacy.render(doc, jupyter=True)