In [1]:
import os
os.sys.path.append('../template_model')

from reading_thiagos_templates import read_thiagos_xml_entries, make_template, get_lexicalizations, StructureDoesntMatchTemplate
from template_based import MoreThanOneRootException, Structure
from collections import ChainMap, defaultdict, Counter
import glob
import csv
import pickle

In [2]:
# train+dev filepaths
# glob returns matches in arbitrary (filesystem-dependent) order; sorting each
# split keeps the order of the entries, and of everything built from them,
# the same from one run to the next.
filepaths = sorted(glob.glob('../data/templates/v1.4/train/**/*.xml', recursive=True))
filepaths.extend(sorted(glob.glob('../data/templates/v1.4/dev/**/*.xml', recursive=True)))

# every entry read from the train files, followed by every entry from the dev files
train_dev_entries = []

for filepath in filepaths:

    train_dev_entries.extend(read_thiagos_xml_entries(filepath))

# Templates

In [3]:
# structure of the template -> head value of the triples' structure -> Counter of templates
template_db = defaultdict(lambda: defaultdict(Counter))
# (entry, lexe, filepath) triples that raised MoreThanOneRootException
entries_lexe_template_error = []
# (entry, lexe, filepath) triples that raised StructureDoesntMatchTemplate
entries_lexe_structure_d_match_template = []

# The entries are read again file by file, instead of walking the flat
# train_dev_entries list, so that each entry is paired with the file it really
# came from. Relying on a `filepath` variable left over from the loading loop
# would tag every template and every error with the last file that was read.
for filepath in filepaths:

    for entry in read_thiagos_xml_entries(filepath):

        for lexe in entry['lexes']:

            if lexe['comment'] == 'good' and entry['entity_map'] and lexe['template']:

                try:
                    t = make_template(entry['triples'],
                                      lexe['text'],
                                      lexe['template'],
                                      entry['r_entity_map'],
                                      metadata={'filepath': filepath})

                    s = Structure.from_triples(entry['triples'])

                    template_db[t.structure][s.head.value][t] += 1
                except MoreThanOneRootException:

                    entries_lexe_template_error.append((entry, lexe, filepath))
                except StructureDoesntMatchTemplate:

                    entries_lexe_structure_d_match_template.append((entry, lexe, filepath))


# drop the outer defaultdict (its lambda factory can't be pickled)
template_db = dict(template_db)

In [4]:
# (lexicalizations skipped by MoreThanOneRootException, lexicalizations skipped by StructureDoesntMatchTemplate)
len(entries_lexe_template_error), len(entries_lexe_structure_d_match_template)

(57, 0)

In [5]:
# Dump the template database to a CSV file so it can be inspected by hand.

template_csv_columns = ['structure', 'template', 'key', 'n']

with open('../data/templates/template_db/thiago_template_db.csv', 'w', encoding='utf-8', newline='') as csv_file:

    template_writer = csv.DictWriter(csv_file, fieldnames=template_csv_columns)
    template_writer.writeheader()

    # one row per (key, template) pair, together with how often it was counted
    for templates_by_key in template_db.values():
        for key, template_counts in templates_by_key.items():
            template_writer.writerows(
                {'key': key, 'structure': t.structure, 'template': t.template_text, 'n': n}
                for t, n in template_counts.items()
            )
                
# Pickle the same database, for the model to load.

with open('../data/templates/template_db/thiago_template_db', 'wb') as template_db_file:
    pickle.dump(template_db, template_db_file)

# Lexicalizations

In [21]:
# lexicalization key -> entry category -> Counter of the lexical values seen for it
lexicalization_db = defaultdict(lambda: defaultdict(Counter))

for entry in train_dev_entries:
    for lexe in entry['lexes']:

        # only lexicalizations marked as good, from entries that have an entity map
        if lexe['comment'] != 'good' or not entry['entity_map']:
            continue

        lexicals = get_lexicalizations(lexe['text'], lexe['template'], entry['entity_map'])

        # nothing was extracted for this lexicalization
        if not lexicals:
            continue

        for lex_key, lex_values in lexicals.items():
            for lex_value in lex_values:
                lexicalization_db[lex_key][entry['category']][lex_value] += 1

# drop the outer defaultdict (its lambda factory can't be pickled)
lexicalization_db = dict(lexicalization_db)

In [31]:
# Dump the lexicalization database to a CSV file, one row per counted value.
lexicalization_csv_columns = ['category', 'lex_key', 'lex_value', 'n']

with open('../data/templates/lexicalization/thiago_lexicalization_db.csv', 'w', encoding='utf-8', newline='') as csv_file:

    lexicalization_writer = csv.DictWriter(csv_file, fieldnames=lexicalization_csv_columns)
    lexicalization_writer.writeheader()

    for lex_key, values_by_category in lexicalization_db.items():
        for category, value_counts in values_by_category.items():
            lexicalization_writer.writerows(
                {'category': category, 'lex_key': lex_key, 'lex_value': lex_value, 'n': n}
                for lex_value, n in value_counts.items()
            )
                
# Pickle the lexicalization database next to its CSV dump.
with open('../data/templates/lexicalization/thiago_lexicalization_db', 'wb') as lexicalization_db_file:
    pickle.dump(lexicalization_db, lexicalization_db_file)

# Enhancing

In [3]:
import pickle

# Reload the pickled template database, so this section can start from the saved file.
with open('../data/templates/template_db/thiago_template_db', 'rb') as template_db_file:
    template_db = pickle.load(template_db_file)

## <s, p, o> + <s, p', o'> = <s, [<p, o>, <p', o'>]>

In [3]:
import re
from template_based import *

# the dot that closes a template text
RE_REMOVE_FINAL_DOT = re.compile(r'\.$')
# everything up to, and including, the first {AGENT-1} placeholder;
# assumes the sentence is in active voice
RE_REMOVE_AGENT_1 = re.compile(r'^.*?{AGENT-1}')

def make_text(t1, t2):
    """Joins two template texts about the same agent into a single sentence.

    The final dot of `t1` is dropped. In `t2`, everything up to and including
    its first `{AGENT-1}` is dropped and `{PATIENT-1}` is renamed to
    `{PATIENT-2}`. Both pieces are then joined with ' and '.

    Args:
        t1: template text that opens the sentence.
        t2: template text whose remainder is appended.

    Returns:
        The combined template text.
    """

    t1_ = RE_REMOVE_FINAL_DOT.sub('', t1)

    t2_ = RE_REMOVE_AGENT_1.sub('', t2).replace('{PATIENT-1}', '{PATIENT-2}')

    # What is left of t2 usually starts with the blank that followed {AGENT-1};
    # without stripping it the result would read "... and  died in ..." (two spaces).
    return '{} and {}'.format(t1_, t2_.lstrip())

def make_structure(h1, h2):
    """Builds the structure <AGENT-1, [<p1, PATIENT-1>, <p2, PATIENT-2>]>.

    `p1` and `p2` are the values of the first predicate of `h1` and `h2`.

    Args:
        h1: head whose first predicate is attached to PATIENT-1.
        h2: head whose first predicate is attached to PATIENT-2.

    Returns:
        A Structure whose head is an AGENT-1 slot holding both predicates.
    """

    predicates = [
        Predicate(head.predicates[0].value, [Slot(patient_name, [])])
        for head, patient_name in ((h1, 'PATIENT-1'), (h2, 'PATIENT-2'))
    ]

    return Structure(Slot('AGENT-1', predicates))

def make_new_template(t1, t2):
    """Combines two templates into a new one covering both of their predicates.

    Args:
        t1: template providing the first half of the text and the first predicate.
        t2: template providing the second half of the text and the second predicate.

    Returns:
        A Template built from the combined structure and text, with None as
        its third argument.
    """

    combined_text = make_text(t1.template_text, t2.template_text)
    combined_structure = make_structure(t1.structure.head, t2.structure.head)

    return Template(combined_structure, combined_text, None)

In [30]:
from itertools import combinations

# every template whose structure has length 1
templates_w_1_size = []

# The enhanced database starts as an independent copy of template_db.
# Passing template_db straight to defaultdict() would only copy the outer
# mapping: the inner Counters would be shared, and every count added to the
# enhanced database for an already known structure would silently change
# template_db as well.
template_enhanced_db = defaultdict(lambda: defaultdict(Counter))

for s, cc in template_db.items():

    # accessed up front so that structures with no templates are kept too
    enhanced_cc = template_enhanced_db[s]

    for key, tc in cc.items():

        enhanced_cc[key] = Counter(tc)

        if len(s) == 1:
            templates_w_1_size.extend(tc.keys())

In [31]:
%%time
# pairs that fail are collected in w_error; they are probably passive-voice templates
# (first, second) template pairs that could not be combined
w_error = []

for t1, t2 in combinations(templates_w_1_size, 2):

    # each unordered pair is combined in both orders, since the text differs
    for first, second in ((t1, t2), (t2, t1)):

        try:
            combined = make_new_template(first, second)
            template_enhanced_db[combined.structure][first.structure.head.value][combined] += 1
        except Exception:
            # `Exception`, not a bare `except`: interrupting this long-running
            # cell (KeyboardInterrupt) must stop it instead of being recorded
            # as one more failed pair.
            w_error.append((first, second))

CPU times: user 7min 19s, sys: 54.9 s, total: 8min 14s
Wall time: 8min 23s


In [34]:
# number of ordered template pairs that raised while being combined
len(w_error)

0

In [35]:
# Pickle the enhanced database as a plain dict (the lambda factory of the outer defaultdict can't be pickled).
enhanced_db_snapshot = dict(template_enhanced_db)

with open('../data/templates/template_db/thiago_template_db_rule_1', 'wb') as enhanced_db_file:
    pickle.dump(enhanced_db_snapshot, enhanced_db_file)

In [36]:
# number of distinct structures (outer keys) in the enhanced database
len(template_enhanced_db)

58642

# Testing spaCy dependency parsing on template strings

In [3]:
import spacy

# NOTE(review): 'en' is a shortcut name rather than a model package name; presumably it
# only resolves where a model link with that name exists — confirm against the installed spaCy version.
nlp = spacy.load('en')

In [28]:
# Pick one template to experiment with: the most frequent template of the
# 401st structure in the database (an arbitrary choice).
# Each value of template_db maps a head value to a Counter of templates, so the
# per-head Counters are merged before asking for the most common template
# (the mapping itself has no most_common()).
example_template_counts = Counter()

for head_template_counts in list(template_db.values())[400].values():
    example_template_counts.update(head_template_counts)

t = example_template_counts.most_common()[0][0]

t

Structure: [AGENT-1, 

	<tenant, [
		[BRIDGE-1, 

			<foundationPlace, [PATIENT-1]>]]>]
Text: {BRIDGE-1} which was founded in {PATIENT-1} is the tenant of {AGENT-1}.

In [35]:
from spacy import displacy

# Weird: spaCy doesn't respect a special case when it appears at the very end of
# the string next to the final dot, so that dot is removed before parsing.
import re
final_dot = re.compile(r'\.$')

# register each slot placeholder as a special case: one token, tagged 'NN'
for placeholder in ('{AGENT-1}', '{PATIENT-1}', '{BRIDGE-1}'):
    nlp.tokenizer.add_special_case(placeholder, [{'ORTH': placeholder, 'TAG': 'NN'}])

doc = nlp(final_dot.sub('', t.template_text))

displacy.render(doc, jupyter=True)

# Creating a subset containing only the entries that have a template

In [4]:
# Load the pickled test entries. Unpickling can run arbitrary code, so only open files produced by this project.
with open('../evaluation/test.pkl', 'rb') as test_file:
    test = pickle.load(test_file)

In [6]:
# positions, within `test`, of the entries whose structure is a key of template_db
entries_w_template = []

for index, test_entry in enumerate(test):

    try:
        structure = Structure.from_triples(test_entry['triples'])

        if structure in template_db:
            entries_w_template.append(index)
    except MoreThanOneRootException:
        # entries whose triples raise MoreThanOneRootException are left out
        continue

In [10]:
# Write the selected entry indices to the subset file, one per line.
with open('../evaluation/subsets/with-template.txt', 'w', encoding='utf-8') as subset_file:

    for entry_index in entries_w_template:
        subset_file.write(f'{entry_index}\n')