In [1]:
# Stretch the notebook container to the full browser width.
# `display` and `HTML` come from the public `IPython.display` module; importing
# `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
# Sentence-level BLEU and its smoothing functions (used by `get_score`).
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import pandas as pd
# Raise the column width limit so long text cells are not cut at pandas' default width.
pd.set_option('display.max_colwidth', 1000)
import os
import sys
# Put the sibling `evaluation` directory on the import path; the `evaluate`
# import on the next line depends on this having run first.
sys.path.extend(['../evaluation'])
from evaluate import model_preprocessed_filepath, normalize_text
import glob
from matplotlib import pyplot as plt
import math
%matplotlib inline

In [3]:
# NLTK smoothing function (`SmoothingFunction.method2`) passed to `sentence_bleu` by `get_score`.
chencherry = SmoothingFunction().method2

def get_score(refs, hyp, weights=(.25, .25, .25, .25)):
    """Compute the smoothed sentence-level BLEU of one hypothesis.

    Parameters
    ----------
    refs : iterable of str
        Reference texts. Each one is tokenized by splitting on whitespace.
    hyp : str
        Hypothesis text, tokenized the same way.
    weights : sequence of float, optional
        N-gram weights forwarded unchanged to ``sentence_bleu``. The default
        is an immutable tuple (rather than a list) so the shared default value
        can never be modified between calls.

    Returns
    -------
    The score returned by ``sentence_bleu``, computed with the module-level
    ``chencherry`` smoothing function.
    """
    tokenized_refs = [ref.split() for ref in refs]
    return sentence_bleu(tokenized_refs,
                         hyp.split(),
                         weights=weights,
                         smoothing_function=chencherry)

def get_scores(filepath, subset='all-cat'):
    """Score each line of a hypothesis file against the three dev references.

    The hypothesis file and the three reference files
    ``../evaluation/references/dev/{subset}_reference{0,1,2}.lex`` are read in
    parallel, one text per line. Every line is passed through
    ``normalize_text`` and the hypothesis is scored with ``get_score``.

    Parameters
    ----------
    filepath : str
        Path of the hypothesis file (UTF-8, one text per line).
    subset : str, optional
        Prefix of the reference file names.

    Returns
    -------
    tuple
        ``(scores, hypothesis, references)``: three parallel lists holding the
        score, the normalized hypothesis, and the 3-tuple of normalized
        references for each line.

    Notes
    -----
    The files are combined with ``zip``, so reading stops at the shortest
    file; extra lines in longer files are ignored.
    """
    references_files = [f'../evaluation/references/dev/{subset}_reference{i}.lex' for i in range(3)]
    hypothesis_file = filepath

    def _clean(line):
        # Remove only the line terminator. Slicing with [:-1] would also eat
        # the last real character of a final line that has no trailing newline.
        return normalize_text(line.rstrip('\n'))

    scores = []
    hypothesis = []
    references = []

    with open(references_files[0], 'r', encoding='utf-8') as ref0,\
         open(references_files[1], 'r', encoding='utf-8') as ref1,\
         open(references_files[2], 'r', encoding='utf-8') as ref2,\
         open(hypothesis_file, 'r', encoding='utf-8') as hyp:

        for h, r0, r1, r2 in zip(hyp, ref0, ref1, ref2):
            refs = (_clean(r0), _clean(r1), _clean(r2))
            text = _clean(h)

            references.append(refs)
            hypothesis.append(text)
            scores.append(get_score(refs, text))

    return scores, hypothesis, references

# Menores scores

In [39]:
# Gather per-sentence BLEU scores and generated texts for every model directory
# under ../data/models/dev. Scores are cached in each model's sentence_bleu.txt
# and are only computed (and then written) when that file does not exist yet.
models = [os.path.basename(s) for s in glob.glob('../data/models/dev/*')]

models_data = {}

for model in models:

    sentence_bleu_filepath = f'../data/models/dev/{model}/sentence_bleu.txt'
    try:
        model_texts_filepath = model_preprocessed_filepath(model, 'dev', 'all-cat')
    except Exception:
        # Best-effort: a model whose preprocessed file cannot be resolved is
        # left out of `models_data`. Catching `Exception` instead of using a
        # bare `except` lets KeyboardInterrupt and SystemExit propagate.
        continue

    if os.path.isfile(sentence_bleu_filepath):
        with open(sentence_bleu_filepath, 'r', encoding='utf-8') as f:
            # float() ignores surrounding whitespace, so no manual slicing is
            # needed (slicing would corrupt a last line without a newline).
            scores = [float(line) for line in f]
        with open(model_texts_filepath, 'r', encoding='utf-8') as f:
            # Strip only the line terminator, never a real character.
            # NOTE(review): these cached-path texts are not passed through
            # `normalize_text`, whereas `get_scores` returns normalized texts
            # - confirm the preprocessed file is already normalized.
            hypothesis = [line.rstrip('\n') for line in f]
    else:
        scores, hypothesis, _ = get_scores(model_texts_filepath, 'all-cat')

        with open(sentence_bleu_filepath, 'w', encoding='utf-8') as f:
            f.writelines(f'{score}\n' for score in scores)

    models_data[model] = (scores, hypothesis)

In [95]:
# add references
sys.path.append('../template_model')
from reading_thiagos_templates import load_dev, Entry

import pickle

# Read each model's pickled parameters, in the same order as `models`.
params = []
for m in models:
    params_path = f'../data/models/dev/{m}/params.pkl'
    with open(params_path, 'rb') as f:
        params.append(pickle.load(f))

# One row per model (indexed by model name), keeping only the three
# parameter columns that are shown next to the scores.
all_params_df = pd.DataFrame(params, index=models)
params_df = all_params_df[['max_dp', 'max_sa', 'max_tems']]

# Development-set entries; the cells below look them up by position.
dev = load_dev()
def get_texts(ix, models):
    """Show dev entry ``ix`` and tabulate the selected models' outputs for it.

    Prints the index, the normalized text of every item in ``dev[ix].lexes``
    and every triple in ``dev[ix].triples``, then builds one table row per
    selected model from the module-level ``models_data`` and ``params_df``.

    Parameters
    ----------
    ix : int
        Position of the entry in the module-level ``dev``; also the position
        of the matching score and text in each model's lists.
    models : dict
        Maps model identifiers (keys of ``models_data``) to the label shown
        in the table's ``model`` index. Models absent from this mapping are
        left out of the table.

    Returns
    -------
    pandas.DataFrame
        Indexed by display label (index name ``model``), with the columns
        ``bleu`` and ``text`` followed by the columns of ``params_df``,
        sorted by ``bleu`` in descending order.
    """
    print(f'ix = {ix}')

    entry = dev[ix]
    for l in entry.lexes:
        print(normalize_text(l['text']))
    for t in entry.triples:
        print(f"<'{t.subject}', '{t.predicate}', '{t.object}'>")

    # Rows are keyed by model identifier, because that is how `params_df` is
    # indexed. Keying them by display label made the inner join below come
    # out empty whenever the labels differed from the identifiers.
    ix_data = [(model, scores[ix], hypothesis[ix])
               for model, (scores, hypothesis) in models_data.items()
               if model in models]

    df = pd.DataFrame(ix_data, columns=['model', 'bleu', 'text']).set_index('model')
    df = pd.merge(df, params_df, left_index=True, right_index=True)
    # Swap identifiers for display labels only after the parameters are joined.
    df = df.rename(index=models)
    return df.sort_values('bleu', ascending=False)

In [41]:
from collections import defaultdict

# Bucket the positions of the dev entries by how many triples each entry has.
i_dev = defaultdict(list)

for i, e in enumerate(dev):
    bucket = i_dev[len(e.triples)]
    bucket.append(i)

In [54]:
from random import Random
# Fixed seed so the same entries are drawn on every run.
rnd = Random(135)

# Up to three dev-entry positions per triple count. `Random.sample` raises
# ValueError when asked for more items than the population holds, so the
# sample size is capped at the size of each group.
sample = {n: rnd.sample(iss, min(3, len(iss))) for n, iss in i_dev.items()}

In [108]:
# Select the sampled dev positions whose entries have `n` triples.
# `i` starts at -1 because the next cell increments it before each lookup.
n = 7
iss = sample[n]
i = -1

In [111]:
# Advance to the next sampled entry and show it for every model, each labelled
# with its own identifier. Re-running this cell advances `i` again.
i += 1
get_texts(iss[i], {m: m for m in models})

ix = 858
buzz aldrin was born in glen ridge , new jersey and graduated from massachusetts institute of technology in 1963 with a doctorate in science . he was a member of the apollo 11 crew after being selected by nasa in 1963 with william anders as a backup pilot on apollo 11 .
buzz aldrin was born in glen ridge , new jersey . he graduated from mit in 1963 with a doctorate in science . he was a fighter pilot and began working for nasa in 1963 . aldrin was a member of apollo 11 , which was run by nasa and william anders was the backup pilot .
buzz aldrin was born in glen ridge , new jersey and obtained a doctorate in science from mit in 1963 . he served as a fighter pilot before being hired by nasa in 1963 and serving as a crew member on apollo 11 with backup pilot william anders .
<'Buzz_Aldrin', 'birthPlace', 'Glen_Ridge,_New_Jersey'>
<'Buzz_Aldrin', 'was a crew member of', 'Apollo_11'>
<'Buzz_Aldrin', 'was selected by NASA', '1963'>
<'Buzz_Aldrin', 'occupation', 'Fighter_pilot'>
<'B

Unnamed: 0_level_0,bleu,text,max_dp,max_sa,max_tems
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
-4381119412256435544,0.746226,"buzz aldrin was born in glen ridge , new jersey . he joined nasa in 1963 . he served as fighter pilot . he graduated from massachusetts institute of technology in 1963 with a doctorate in science . he was a member of apollo 11 operated by nasa , where william anders was a backup pilot .",4,8,4
2834878568166934838,0.746226,"buzz aldrin was born in glen ridge , new jersey . he joined nasa in 1963 . he served as fighter pilot . he graduated from massachusetts institute of technology in 1963 with a doctorate in science . he was a member of apollo 11 operated by nasa , where william anders was a backup pilot .",2,8,4
-2557963126006473324,0.715181,"buzz aldrin was born in glen ridge , new jersey . he joined nasa in 1963 . he served as fighter pilot . he graduated from massachusetts institute of technology in 1963 with a doctorate in science . he was a crew member on apollo 11 , operated by nasa . william anders was an its backup pilot .",4,4,4
1389612785202597354,0.715181,"buzz aldrin was born in glen ridge , new jersey . he joined nasa in 1963 . he served as fighter pilot . he graduated from massachusetts institute of technology in 1963 with a doctorate in science . he was a crew member on apollo 11 , operated by nasa . william anders was an its backup pilot .",2,4,4
-3733669480039627796,0.704463,"buzz aldrin was born in glen ridge , new jersey . he was selected by nasa in 1963 . he served as fighter pilot . he obtained massachusetts institute of technology in 1963 with a doctorate in science . he was a member of nasa operated apollo 11 on which william anders was a backup pilot .",2,8,2
7659442398283943410,0.692416,"buzz aldrin was born in glen ridge , new jersey . he was selected by nasa in 1963 . he obtained massachusetts institute of technology in 1963 with a doctorate in science . he served as fighter pilot . he was a member of nasa operated apollo 11 on which william anders was a backup pilot .",4,8,2
-4257805808715618848,0.662309,"buzz aldrin was born in glen ridge , new jersey . he was selected by nasa in 1963 . he served as fighter pilot . he obtained massachusetts institute of technology in 1963 with a doctorate in science . he was a crew member of nasa operated apollo 11 . its also included william anders .",2,4,2
6601739722906416414,0.650151,"buzz aldrin was born in glen ridge , new jersey . he was selected by nasa in 1963 . he obtained massachusetts institute of technology in 1963 with a doctorate in science . he served as fighter pilot . he was a crew member of nasa operated apollo 11 . its also included william anders .",4,4,2


In [61]:
# Hand-written example: two reference texts and two candidate texts to score.
refs = ['alan shephard died on the 21st of july , 1998 in california ( whose senator was dianne feinstein ) .', 'dianne feinstein is a senator in california where alan shepard died on july 21 1998 .']
hyp1 = 'alan shepard died in california . alan shepard died on 1998 - 07 - 21 . the senator representing california was dianne feinstein .'
hyp2 = 'alan shepard died in california on 1998 - 07 - 21 . the senator representing california was dianne feinstein .'

# Weights [0, 0, 1.0] put all the weight on the third n-gram order.
get_score(refs, hyp1, [0, 0, 1.0])

0.17391304347826086

In [72]:
# Inspect the `sorted_triples` field of the second item in `lexes` of dev entry 403.
dev[403].lexes[1]['sorted_triples']

[(Triple(subject='Alan_Shepard', predicate='deathDate', object='"1998-07-21"'),
  Triple(subject='Alan_Shepard', predicate='deathPlace', object='California'),
  Triple(subject='California', predicate='senators', object='Dianne_Feinstein'))]

In [71]:
# Category of dev entry 403.
dev[403].category

'Astronaut'

In [66]:
# Same third-order-only weighting, applied to the second candidate.
get_score(refs, hyp2, [0, 0, 1.0])

0.15789473684210523

In [29]:
# Second candidate scored with the default weights of `get_score`.
get_score(refs, hyp2)

0.2082198320914845

In [14]:
# Display labels for four selected model identifiers; passed as the `models`
# argument of `get_texts` in the cells below.
models_names = {'-5304774152169464302': 'gold', '8562436464462099980': 'best', '-4478499061069106480': 'worst', '-7111541801870592395': '2nd worst'}

In [9]:
# Compare the labelled models on dev entry 279.
get_texts(279, models_names)

Unnamed: 0,0,1,2
0,reference,1.0,"Doug Moench was the creator of the comic character Ballistic, who has the alternative name Kelvin Mao."
1,reference,1.0,"Ballistic, ( also known as Kelvin Mao ), is a fictional comic superhero created by Doug Moench."
2,reference,1.0,"Doug Moench created the character Ballistic, the fictional superhero whose alter ego is Kelvin Mao."


In [46]:
# Compare the labelled models on dev entry 200.
get_texts(200, models_names)

Unnamed: 0,0,1,2
0,reference,1.0,1634: The Ram Rebellion was followed by 1635: The Cannon Law.
1,reference,1.0,1634: The Ram Rebellion is followed by 1635: The Cannon Law.
2,worst,0.568285,d . c . 1634 the ram rebellion was followed up by 1635 : the cannon law .
4,2nd worst,0.568285,d . c . 1634 the ram rebellion was followed up by 1635 : the cannon law .
3,gold,0.553341,1635 : the cannon law is the sequel to 1634 : the ram rebellion .
5,best,0.553341,1635 : the cannon law is the sequel to 1634 : the ram rebellion .


# Análise qualitativa

In [20]:
# Compare the labelled models on dev entry 100.
get_texts(100, models_names)

<'Georgia_(U.S._state)', 'country', 'United_States'>


Unnamed: 0,0,1,2
0,reference,1.0,The state of Georgia is in the U.S.
1,reference,1.0,Georgia is in the country of United States.
2,reference,1.0,The state of Georgia is located within the United States.
4,gold,0.428882,georgia is the united states .
6,best,0.428882,georgia is the united states .
3,worst,0.344607,"georgia , united states is located within the country of a united states national ."
5,2nd worst,0.344607,"georgia , united states is located within the country of a united states national ."


In [88]:
# Starting position for stepping through dev entries; the next cell increments
# `i` before each lookup.
i = 500

In [94]:
# Advance to the next dev entry and show only the model labelled 'best'.
# Re-running this cell advances `i` again.
i += 1
get_texts(i, {'8562436464462099980': 'best'})

<'Akron_Summit_Assault', 'ground', 'St._Vincent–St._Mary_High_School'>
<'Akron_Summit_Assault', 'league', 'Premier_Development_League'>
<'Premier_Development_League', 'champions', 'K-W_United_FC'>


Unnamed: 0,0,1,2
0,reference,1.0,"St Vincent-St Mary High School is the ground of Akron Summit Assault who play in the Premier Development League, of which K-W United FC have been champions."
1,reference,1.0,"Akron Summit Assault who play in the Premier Development League, won by K-W United FC, have their home ground at St. Vincent-St. Mary High School."
2,reference,1.0,St Vincent-St Mary High School is the ground of Akron Summit Assault who play in the Premier Development League which K-W United FC have been champions of.
3,best,0.748615,"st . vincent - st . mary high school is the ground of akron summit assault , that play in the premier development league , which champions were k - w united fc ."


In [95]:
import pickle

# Load the pickled object stored in `template_db_train` into `tdb`.
# NOTE(review): `pickle.load` can execute arbitrary code while unpickling;
# only load this file if it comes from a trusted source.
with open('../data/pretrained_models/template_db_train', 'rb') as f:
    tdb = pickle.load(f)

In [96]:
# Number of entries in `tdb`.
len(tdb)

4633

In [99]:
# Materialize the keys of `tdb` as a list so they can be looked up by position.
keys = list(tdb.keys())

In [100]:
# Look at the first key to see what the keys of `tdb` look like.
keys[0]

('Airport', (Triple(subject='slot0', predicate='cityServed', object='slot1'),))

In [197]:
# Starting position in `keys`; the next cell increments `i` before each lookup.
i = 455

In [208]:
# Advance to the next key and show the `tdb` value stored under it.
# Re-running this cell advances `i` again.
i = i + 1
tdb[keys[i]]

{Structure: (Triple(subject='slot0', predicate='status', object='slot1'), Triple(subject='slot0', predicate='nationality', object='slot2'))
 Text: {slot0-0-N}, now {slot1-0-N}, was born in {slot2-0-D}.}

In [170]:
# Print the triple structure of the selected key, then the text of every
# template stored under it. The header is built from `keys[i]`, whose second
# element is the tuple of triples, so it always matches the entry being listed
# instead of a hard-coded predicate that goes stale when `i` changes.
structure = keys[i][1]
for t in structure:
    print(f'<{t.subject}, {t.predicate}, {t.object}>')
for t in tdb[keys[i]]:
    print(t.template_text)

<slot0, demonym, slot1>
{slot1-0-N} are from {slot0-0-N}.
{slot1-0-N} are the people who reside in {slot0-0-N}.
{slot1-0-N} is the name given to people from {slot0-0-N}.
{slot1-0-N} is the name for the people of {slot0-0-N}.
