In [1]:
import os
os.sys.path.append('../template_model')

from reading_thiagos_templates import make_template, get_lexicalizations
from collections import ChainMap, defaultdict, Counter
import glob
import csv
import pickle
import pandas as pd
from util import load_train_dev, Entry
from more_itertools import flatten
from template_based import abstract_triples

In [2]:
# Entries returned by load_train_dev(); the cells below read their entity_map,
# r_entity_map, lexes, category and triples attributes.
td = load_train_dev()

# Templates

In [3]:
# Maps (category, template_triples) to the set of templates seen for that
# combination.
template_db = defaultdict(set)

# One filled-in text per template occurrence; a later cell writes these to
# ../data/kenlm/ts_texts.txt.
template_model_texts = []

for entry in td:

    # entries without an entity map are ignored
    if not entry.entity_map:
        continue

    for lexe in entry.lexes:

        # keep only lexicalizations marked 'good' that carry a template
        if lexe['comment'] != 'good' or not lexe['template']:
            continue

        lexe_templates = make_template(lexe['sorted_triples'],
                                       lexe['template'],
                                       entry.r_entity_map)

        # make_template returned None for this lexicalization: skip only this
        # one. (A `break` here used to drop every remaining lexicalization of
        # the entry as well.)
        if lexe_templates is None:
            continue

        # each template takes the next len(template_triples) triples of the
        # entry, in order
        start = 0
        for template in lexe_templates:

            template_db[(entry.category, template.template_triples)].add(template)

            stop = start + len(template.template_triples)

            # NOTE(review): the slice is taken from entry.triples although the
            # templates were built from lexe['sorted_triples'] - confirm that
            # both sequences share the same order.
            text = template.fill(entry.triples[start:stop], lambda x, ctx: x, None)

            template_model_texts.append(text)

            start = stop


# plain dict from here on: a missing key raises KeyError instead of silently
# creating an empty set
template_db = dict(template_db)

In [4]:
# Persist the template database built in the previous cell.
with open('../data/templates/template_db/tdb', 'wb') as tdb_file:
    pickle.dump(template_db, tdb_file)

In [5]:
# Write the filled template texts, one per line.
with open('../data/kenlm/ts_texts.txt', 'w', encoding='utf-8') as corpus_file:
    corpus_file.writelines(f'{text}\n' for text in template_model_texts)

In [7]:
# Occurrence count of every (template, template_triples, category) combination.
dados = Counter()
# Lower-cased reference texts indexed by triple, kept apart for entries with
# exactly one triple and entries with more than one.
triple_to_lex_1 = defaultdict(list)
triple_to_lex_gt1 = defaultdict(list)

for entry in td:

    # entries without an entity map are ignored
    if not entry.entity_map:
        continue

    for lexe in entry.lexes:

        # keep only lexicalizations marked 'good' that carry a template
        if lexe['comment'] != 'good' or not lexe['template']:
            continue

        lexe_templates = make_template(lexe['sorted_triples'],
                                       lexe['template'],
                                       entry.r_entity_map)

        # make_template can return None (the template-building cell guards
        # against it); iterating None would raise TypeError, so skip this
        # lexicalization
        if lexe_templates is None:
            continue

        # texts of multi-triple entries and single-triple entries go to
        # different indexes
        if len(entry.triples) > 1:
            triple_to_lex = triple_to_lex_gt1
        else:
            triple_to_lex = triple_to_lex_1

        for template in lexe_templates:

            # guards against badly defined templates that have no triples
            if len(template.template_triples) > 0:

                dados[(template, template.template_triples, entry.category)] += 1

            # NOTE(review): this runs once per template, so a lexicalization
            # split into k templates stores its text k times for every
            # triple - confirm that the duplication is intended.
            for triple in entry.triples:

                triple_to_lex[triple].append(lexe['text'].lower())

# one column per component of the counter key, plus the raw count
templates, templates_triples, categories = zip(*dados.keys())
df = pd.DataFrame({'feature_template_cnt_in_category': list(dados.values()),
                   'template': templates,
                   'template_triples': templates_triples,
                   'feature_template_category': categories})

# for each triple structure and category, the total number of template occurrences
g = df.groupby(['template_triples', 'feature_template_category'])['feature_template_cnt_in_category'].sum()
g.name = 'template_triples_and_category_cnt'
g = g.reset_index()

# relative frequency of each template among all templates that share its
# triple structure and category; the helper total column is dropped afterwards
df = pd.merge(df, g, on=['template_triples', 'feature_template_category'])
df['feature_template_freq_in_category'] = df['feature_template_cnt_in_category'] / df['template_triples_and_category_cnt']
del df['template_triples_and_category_cnt']

df.to_pickle('../data/templates/template_db/template_db')
df.to_csv('../data/templates/template_db/template_db.csv', index=False)

with open('../data/templates/template_db/triple_to_lex_1', 'wb') as f:
    pickle.dump(triple_to_lex_1, f)

with open('../data/templates/template_db/triple_to_lex_gt1', 'wb') as f:
    pickle.dump(triple_to_lex_gt1, f)

# Lexicalizations

In [11]:
import spacy

# spaCy pipeline used below to tokenize and POS-tag every lexicalization value.
nlp = spacy.load('en_core_web_lg')

In [36]:
# lex_key -> Counter of the surface forms seen for that key, split into
# single-token pronouns and everything else.
name_db = defaultdict(Counter)
pronoun_db = defaultdict(Counter)

# NOTE(review): `train_dev_entries` is not defined in any cell visible here
# (the first cells load `td` and use attribute access instead of dict access);
# confirm where it comes from before running on a fresh kernel.
for entry in train_dev_entries:

    for lexe in entry['lexes']:

        # only lexicalizations marked 'good', and only for entries that have
        # an entity map
        if lexe['comment'] != 'good' or not entry['entity_map']:
            continue

        lexicals = get_lexicalizations(lexe['text'], lexe['template'], entry['entity_map'])

        if not lexicals:
            continue

        for lex_key, lex_values in lexicals.items():

            for lex_value in lex_values:

                doc = nlp(lex_value)

                # a value made of exactly one token tagged PRON is a pronoun;
                # anything else is counted as a name
                is_pronoun = len(doc) == 1 and doc[0].pos_ == 'PRON'
                target_db = pronoun_db if is_pronoun else name_db
                target_db[lex_key][lex_value] += 1

# freeze both into plain dicts
name_db = dict(name_db)
pronoun_db = dict(pronoun_db)

In [37]:
# Dump both lexicalization counters as flat CSV tables: one row per
# (lex_key, lex_value) pair together with its count n.
# Both files share one code path instead of two copy-pasted blocks. The rows
# come from a generator expression, so its loop variables (notably `n`) no
# longer leak into the notebook namespace, where `n` is also the n-gram order
# read by the language-model cells.
for csv_path, lex_db in (
    ('../data/templates/lexicalization/thiago_name_db.csv', name_db),
    ('../data/templates/lexicalization/thiago_pronoun_db.csv', pronoun_db),
):
    with open(csv_path, 'w', encoding='utf-8', newline='') as f:

        writer = csv.DictWriter(f, fieldnames=['lex_key', 'lex_value', 'n'])

        writer.writeheader()

        writer.writerows(
            {'lex_key': lex_key, 'lex_value': lex_value, 'n': n}
            for lex_key, cc in lex_db.items()
            for lex_value, n in cc.items()
        )

In [38]:
# Pickle both lexicalization counters next to their CSV versions.
for pickle_path, lex_db in (
    ('../data/templates/lexicalization/thiago_name_db', name_db),
    ('../data/templates/lexicalization/thiago_pronoun_db', pronoun_db),
):
    with open(pickle_path, 'wb') as f:
        pickle.dump(lex_db, f)

In [7]:
# NOTE(review): `lexicalization_db` is not defined in any cell visible here, so
# this cell raises NameError on a fresh kernel - confirm which object was
# meant to be saved (the cells above build name_db and pronoun_db).
with open('../data/templates/lexicalization/thiago_lexicalization_db', 'wb') as f:
    pickle.dump(lexicalization_db, f)

# Language Model

In [54]:
import re

# Tokenizer regex: splits on every single non-word character, so consecutive
# separators yield empty-string tokens (they are kept as-is).
c = re.compile(r'\W')

# https://www.kaggle.com/alvations/n-gram-language-model-with-nltk
from nltk.lm import MLE
from nltk.lm.preprocessing import padded_everygram_pipeline, padded_everygrams

# Lower-cased, tokenized text of every non-empty lexicalization marked 'good'.
# NOTE(review): `train_dev_entries` is not defined in any cell visible here -
# confirm where it comes from before running on a fresh kernel.
tokenized_texts = [
    c.split(lexe['text'].lower())
    for e in train_dev_entries
    for lexe in e['lexes']
    if lexe['comment'] == 'good' and lexe['text']
]

# highest n-gram order of the model; also read by outro_score
n = 2
train_data, padded_sents = padded_everygram_pipeline(n, tokenized_texts)

model = MLE(n)
model.fit(train_data, padded_sents)

In [60]:
from functools import reduce

def score_sentence(s):
    """Multiply the context-free model scores of the tokens of `s`.

    `s` is lower-cased and split with the global regex `c`; each token is
    scored with the global `model` without any context. Scores equal to zero
    are left out of the product, so the result is 1 when no token has a
    non-zero score.
    """
    product = 1

    for token in c.split(s.lower()):

        token_score = model.score(token)

        # zero scores are skipped so that they do not zero the whole product
        if token_score != 0:
            product = product * token_score

    return product

def outro_score(s):
    """Alternative score: the entropy the global `model` assigns to `s`.

    `s` is lower-cased and split with the global regex `c`; the padded
    everygrams up to the global order `n` are passed to `model.entropy`.
    """
    tokens = c.split(s.lower())

    ngrams = padded_everygrams(n, tokens)

    return model.entropy(ngrams)

# Enhancing

In [3]:
import pickle

# Rebinds template_db to a previously pickled database.
# NOTE(review): this reads 'thiago_template_db', whereas the Templates section
# saves its database to '../data/templates/template_db/tdb' - confirm which
# file is the intended input. pickle.load must only be used on trusted files.
with open('../data/templates/template_db/thiago_template_db', 'rb') as db_file:
    template_db = pickle.load(db_file)

## <s, p, o> + <s, p', o'> = <s, [<p, o>, <p', o'>]>

In [3]:
import re
from template_based import *

# Matches the dot that closes a template text.
RE_REMOVE_FINAL_DOT = re.compile(r'\.$')
# Matches everything up to and including the first {AGENT-1} placeholder;
# assumes the sentence is in active voice.
RE_REMOVE_AGENT_1 = re.compile(r'^.*?{AGENT-1}')

def make_text(t1, t2):
    """Join two single-triple template texts that share the same agent.

    The final dot of `t1` is dropped. In `t2`, the text up to and including
    the first {AGENT-1} is dropped and {PATIENT-1} is renamed to {PATIENT-2}.
    The two fragments are then joined with ' and '.

    Whitespace left at the joint by those removals is trimmed; without that,
    the result contained double spaces, e.g. '... and  died in ...'.

    Parameters
    ----------
    t1, t2 : str
        Template texts using the {AGENT-1} / {PATIENT-1} placeholders.

    Returns
    -------
    str
        The combined template text.
    """
    # removing the final dot can leave a trailing space ('... {PATIENT-1} .')
    t1_ = RE_REMOVE_FINAL_DOT.sub('', t1).rstrip()

    # removing the agent leaves the space that followed it
    t2_ = RE_REMOVE_AGENT_1.sub('', t2).replace('{PATIENT-1}', '{PATIENT-2}').lstrip()

    return '{} and {}'.format(t1_, t2_)

def make_structure(h1, h2):
    """Build the structure of a template that merges two heads under one agent.

    For each head, its first predicate's value is paired with a fresh patient
    slot (PATIENT-1 for `h1`, PATIENT-2 for `h2`, each created with an empty
    list as second argument). Both predicates are attached to a single
    AGENT-1 slot, which is wrapped in a Structure.
    """
    merged_predicates = []

    for head, patient_name in ((h1, 'PATIENT-1'), (h2, 'PATIENT-2')):
        patient = Slot(patient_name, [])
        merged_predicates.append(Predicate(head.predicates[0].value, [patient]))

    return Structure(Slot('AGENT-1', merged_predicates))

def make_new_template(t1, t2):
    """Combine templates `t1` and `t2` into one two-predicate template.

    The text comes from make_text applied to both template texts and the
    structure from make_structure applied to both structure heads. The third
    Template argument is passed as None.
    """
    combined_text = make_text(t1.template_text, t2.template_text)
    combined_structure = make_structure(t1.structure.head, t2.structure.head)

    return Template(combined_structure, combined_text, None)

In [30]:
from itertools import combinations

# Starts with the same top-level entries as template_db. The inner mappings
# are shared with it, not copied, so counts added later through
# template_enhanced_db to an existing key also show up in template_db.
template_enhanced_db = defaultdict(lambda: defaultdict(Counter), template_db)

# every template stored under a template_db key whose len() is 1
templates_w_1_size = [
    template
    for structure, by_head in template_db.items()
    if len(structure) == 1
    for counter in by_head.values()
    for template in counter.keys()
]

In [31]:
%%time
# Ordered pairs that could not be combined; the original note suggests these
# are probably templates in passive voice.
w_error = []

for t1, t2 in combinations(templates_w_1_size, 2):

    # each unordered pair is combined in both orders, since the order decides
    # which template supplies the first half of the text
    for first, second in ((t1, t2), (t2, t1)):

        try:
            combined = make_new_template(first, second)
            template_enhanced_db[combined.structure][first.structure.head.value][combined] += 1
        # Exception rather than a bare except: a bare except also swallows
        # KeyboardInterrupt/SystemExit, which would make this multi-minute
        # loop impossible to interrupt.
        except Exception:
            w_error.append((first, second))

CPU times: user 7min 19s, sys: 54.9 s, total: 8min 14s
Wall time: 8min 23s


In [34]:
# number of ordered template pairs that failed to combine
len(w_error)

0

In [35]:
# Saved as a plain dict: the defaultdict's lambda default factory cannot be
# pickled.
with open('../data/templates/template_db/thiago_template_db_rule_1', 'wb') as rule_1_file:
    pickle.dump(dict(template_enhanced_db), rule_1_file)

In [36]:
# number of top-level keys after adding the combined templates
len(template_enhanced_db)

58642

# Testing spacy string dependency tree parsing

In [3]:
import spacy

# The 'en' shortcut link is deprecated and no longer resolves in spaCy 3;
# load the same explicitly named model that the Lexicalizations section uses.
nlp = spacy.load('en_core_web_lg')

In [28]:
# Example template for the parsing experiment: the first item returned by
# most_common() for the value at position 400 of template_db.
# NOTE(review): 400 is a hard-coded position, and this cell expects the values
# of template_db to offer most_common() directly, which differs from how
# other cells use template_db - confirm which version of the database this
# was run against.
t = list(template_db.values())[400].most_common()[0][0]

t

Structure: [AGENT-1, 

	<tenant, [
		[BRIDGE-1, 

			<foundationPlace, [PATIENT-1]>]]>]
Text: {BRIDGE-1} which was founded in {PATIENT-1} is the tenant of {AGENT-1}.

In [35]:
from spacy import displacy

# Observed by the author: spaCy does not respect tokenizer special cases when
# the placeholder sits at the end of the string next to the final dot, so the
# final dot is stripped before parsing.
import re
# Deliberately not named `c`: rebinding the global `c` here would replace the
# tokenizer regex that score_sentence and outro_score look up at call time.
final_dot_re = re.compile(r'\.$')

# register each placeholder as a single token tagged NN so that the tokenizer
# does not split it apart
for placeholder in ['{AGENT-1}', '{PATIENT-1}', '{BRIDGE-1}']:
    special_case = [{'ORTH': placeholder, 'TAG': 'NN'}]
    nlp.tokenizer.add_special_case(placeholder, special_case)

doc = nlp(final_dot_re.sub('', t.template_text))

displacy.render(doc, jupyter=True)

# Creating a subset containing only the entries with template

In [4]:
# Load the pickled test entries (pickle.load must only be used on trusted files).
with open('../evaluation/test.pkl', 'rb') as test_file:
    test = pickle.load(test_file)

In [6]:
# Positions of the test entries whose triple structure is a key of template_db.
entries_w_template = []

for entry_index, test_entry in enumerate(test):

    try:
        if Structure.from_triples(test_entry['triples']) in template_db:
            entries_w_template.append(entry_index)
    except MoreThanOneRootException:
        # entries that raise MoreThanOneRootException are left out
        continue

In [10]:
# One entry position per line.
with open('../evaluation/subsets/with-template.txt', 'w', encoding='utf-8') as subset_file:

    for entry_index in entries_w_template:
        subset_file.write(f'{entry_index}\n')

# Testing the idea of computing a template's priority from its parts

In [13]:
# Inspect one template by position (4950 is a hard-coded index).
# presumably `templates` is the tuple unpacked from the counter keys in the
# template-statistics cell - verify, since execution order is not linear
t1 = templates[4950]
t1

Structure: (Triple(subject='slot0', predicate='deathPlace', object='slot1'), Triple(subject='slot1', predicate='isPartOf', object='slot2'))
Text: {slot0} died in {slot1} (part of {slot2}) .

## Now let's see how many times deathPlace is verbalized as 'died in'

In [14]:
# templates that have a deathPlace triple and contain ' died in ' in their text
ts = [
    template
    for template in templates
    if any(tr.predicate == 'deathPlace' for tr in template.template_triples)
    and ' died in ' in template.template_text
]
len(ts)

307

In [16]:
# templates that have a dishVariation triple
tdish = [
    template
    for template in templates
    if any(tr.predicate == 'dishVariation' for tr in template.template_triples)
]

len(tdish)

593

In [17]:
# of those, the templates whose text contains 'can be varied by using'
ttdish = list(filter(lambda template: 'can be varied by using' in template.template_text, tdish))

len(ttdish)

13