In [1]:
# Stretch the notebook's `.container` element to the full browser width.
# `display` and `HTML` are imported from the public `IPython.display` module;
# importing `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import pandas as pd
pd.set_option('display.max_colwidth', 1000)
import os
import sys
sys.path.extend(['../evaluation'])
from evaluate import model_preprocessed_filepath
import glob
from matplotlib import pyplot as plt
import math
%matplotlib inline

# Smoothing function (method2 of NLTK's `SmoothingFunction`) that `get_scores`
# passes to `sentence_bleu` as `smoothing_function`.
chencherry = SmoothingFunction().method2

In [3]:
def get_scores(filepath, subset='all-cat'):
    """Compute a smoothed sentence-level BLEU score for every hypothesis line.

    The hypothesis file is read in parallel with the three reference files
    ``../evaluation/references/dev/{subset}_reference{0,1,2}.lex``; line *k* of
    the hypothesis file is scored against line *k* of each reference file.
    Reading stops at the end of the shortest of the four files.

    Parameters
    ----------
    filepath : str
        Path to the hypothesis file (one text per line, UTF-8).
    subset : str
        Name used to build the reference file paths.

    Returns
    -------
    (scores, hypothesis, references)
        ``scores`` is a list of BLEU scores, ``hypothesis`` the list of
        hypothesis lines and ``references`` a list of 3-tuples with the
        matching reference lines. All lines are returned without their
        trailing newline.
    """
    references_files = [f'../evaluation/references/dev/{subset}_reference{i}.lex' for i in range(3)]

    hypothesis = []
    references = []

    with open(references_files[0], 'r', encoding='utf-8') as ref0,\
         open(references_files[1], 'r', encoding='utf-8') as ref1,\
         open(references_files[2], 'r', encoding='utf-8') as ref2,\
         open(filepath, 'r', encoding='utf-8') as hyp_file:

        for h, r0, r1, r2 in zip(hyp_file, ref0, ref1, ref2):
            # Strip only the line terminator: slicing with [:-1] would eat the
            # last real character of a final line that has no trailing newline.
            references.append((r0.rstrip('\n'), r1.rstrip('\n'), r2.rstrip('\n')))
            hypothesis.append(h.rstrip('\n'))

    # Whitespace tokenisation for both hypothesis and references.
    scores = [
        sentence_bleu([r.split() for r in refs],
                      hyp.split(),
                      smoothing_function=chencherry)
        for hyp, refs in zip(hypothesis, references)
    ]

    return scores, hypothesis, references
    

def plot_hist_bleu(filepath, ax, subset='all-cat'):
    """Draw a 40-bin histogram of the sentence BLEU scores of `filepath` on `ax`.

    The scores come from `get_scores(filepath, subset)`; the hypothesis and
    reference texts it also returns are ignored.
    """
    bleu_scores, _hyps, _refs = get_scores(filepath, subset)
    pd.Series(bleu_scores).hist(bins=40, ax=ax)

In [4]:
def plot_all_models_hist_bleu(subset):
    """Plot one sentence-BLEU histogram per model found in ``../data/models/dev``.

    Models are laid out alphabetically on a grid that is six columns wide, with
    shared x and y axes. A model whose preprocessed file cannot be found
    (``model_preprocessed_filepath`` raises ``FileNotFoundError``) keeps an
    empty, labelled panel instead of being plotted.

    Parameters
    ----------
    subset : str
        Subset name forwarded to `model_preprocessed_filepath` and
        `plot_hist_bleu`.
    """
    models = sorted(os.path.basename(s) for s in glob.glob('../data/models/dev/*'))

    if not models:
        # Nothing to plot; this also avoids requesting a grid with zero rows.
        return

    n_columns = 6
    # Derive the row count from n_columns so the two cannot drift apart.
    n_rows = math.ceil(len(models) / n_columns)

    fig, axes = plt.subplots(n_rows, n_columns, figsize=(30, 5*n_rows), sharey=True, sharex=True)

    for model, ax in zip(models, axes.ravel()):

        try:
            filepath = model_preprocessed_filepath(model, 'dev', subset)
        except FileNotFoundError:
            # Skip this model. Falling through would either fail on an unbound
            # `filepath` or plot the previous model's scores under this title.
            ax.set_title(f'{model} (missing)')
            continue

        plot_hist_bleu(filepath, ax, subset)
        ax.set_title(model)

In [None]:
plot_all_models_hist_bleu('all-cat')

# Menores scores

In [10]:
# Build `models_data`: model directory name -> (sentence BLEU scores, hypothesis texts).
# Scores are cached per model in `sentence_bleu.txt` (one score per line) so
# that re-running this cell does not recompute BLEU.
models = [os.path.basename(s) for s in glob.glob('../data/models/dev_apresentacao/*')]

models_data = {}

for model in models:

    sentence_bleu_filepath = f'../data/models/dev_apresentacao/{model}/sentence_bleu.txt'
    try:
        model_texts_filepath = model_preprocessed_filepath(model, 'dev_apresentacao', 'all-cat')
    except FileNotFoundError:
        # No preprocessed output for this model: leave it out of `models_data`.
        continue

    if os.path.isfile(sentence_bleu_filepath):
        with open(sentence_bleu_filepath, 'r', encoding='utf-8') as f:
            # float() ignores the trailing newline. Slicing with [:-1] would
            # drop the last digit of a final line that has no newline.
            scores = [float(line) for line in f]
        with open(model_texts_filepath, 'r', encoding='utf-8') as f:
            hypothesis = [line.rstrip('\n') for line in f]
    else:
        scores, hypothesis, _ = get_scores(model_texts_filepath, 'all-cat')

        # Write the cache to the same path that is checked above.
        with open(sentence_bleu_filepath, 'w', encoding='utf-8') as f:
            for score in scores:
                f.write(f'{score}\n')

    models_data[model] = (scores, hypothesis)

In [19]:
# add references
# Put '../template_model' on the import path before the import below
# (presumably where `reading_thiagos_templates` lives - confirm).
sys.path.append('../template_model')
from reading_thiagos_templates import load_dev, Entry

# `get_texts` indexes `dev` by position and reads `.lexes` and `.triples`
# from the selected item.
dev = load_dev()
def get_texts(ix, models):
    """Show the references and selected model outputs for dev entry `ix`.

    Prints every triple of `dev[ix]` and returns a DataFrame with one row per
    text, sorted by score (column 1) in descending order. Columns are:
    0 - the label (``'reference'`` or the display name from `models`),
    1 - the score (``1.0`` for references, the sentence score for models),
    2 - the text.

    Parameters
    ----------
    ix : int
        Position of the entry in `dev`, and of the score/hypothesis in each
        model's lists in the global `models_data`.
    models : dict
        Maps a model key of `models_data` to the label to display; models
        not present in this mapping are left out.
    """
    entry = dev[ix]

    rows = [('reference', 1.0, lex['text']) for lex in entry.lexes]

    for triple in entry.triples:
        print(f"<'{triple.subject}', '{triple.predicate}', '{triple.object}'>")

    rows.extend(
        (models[name], model_scores[ix], model_texts[ix])
        for name, (model_scores, model_texts) in models_data.items()
        if name in models
    )

    table = pd.DataFrame(rows)
    return table.sort_values(1, ascending=False)

In [14]:
models_names = {'-5304774152169464302': 'gold', '8562436464462099980': 'best', '-4478499061069106480': 'worst', '-7111541801870592395': '2nd worst'}

In [9]:
get_texts(279, models_names)

Unnamed: 0,0,1,2
0,reference,1.0,"Doug Moench was the creator of the comic character Ballistic, who has the alternative name Kelvin Mao."
1,reference,1.0,"Ballistic, ( also known as Kelvin Mao ), is a fictional comic superhero created by Doug Moench."
2,reference,1.0,"Doug Moench created the character Ballistic, the fictional superhero whose alter ego is Kelvin Mao."


In [46]:
get_texts(200, models_names)

Unnamed: 0,0,1,2
0,reference,1.0,1634: The Ram Rebellion was followed by 1635: The Cannon Law.
1,reference,1.0,1634: The Ram Rebellion is followed by 1635: The Cannon Law.
2,worst,0.568285,d . c . 1634 the ram rebellion was followed up by 1635 : the cannon law .
4,2nd worst,0.568285,d . c . 1634 the ram rebellion was followed up by 1635 : the cannon law .
3,gold,0.553341,1635 : the cannon law is the sequel to 1634 : the ram rebellion .
5,best,0.553341,1635 : the cannon law is the sequel to 1634 : the ram rebellion .


# Análise qualitativa

In [20]:
get_texts(100, models_names)

<'Georgia_(U.S._state)', 'country', 'United_States'>


Unnamed: 0,0,1,2
0,reference,1.0,The state of Georgia is in the U.S.
1,reference,1.0,Georgia is in the country of United States.
2,reference,1.0,The state of Georgia is located within the United States.
4,gold,0.428882,georgia is the united states .
6,best,0.428882,georgia is the united states .
3,worst,0.344607,"georgia , united states is located within the country of a united states national ."
5,2nd worst,0.344607,"georgia , united states is located within the country of a united states national ."


In [88]:
i = 500

In [94]:
i += 1
get_texts(i, {'8562436464462099980': 'best'})

<'Akron_Summit_Assault', 'ground', 'St._Vincent–St._Mary_High_School'>
<'Akron_Summit_Assault', 'league', 'Premier_Development_League'>
<'Premier_Development_League', 'champions', 'K-W_United_FC'>


Unnamed: 0,0,1,2
0,reference,1.0,"St Vincent-St Mary High School is the ground of Akron Summit Assault who play in the Premier Development League, of which K-W United FC have been champions."
1,reference,1.0,"Akron Summit Assault who play in the Premier Development League, won by K-W United FC, have their home ground at St. Vincent-St. Mary High School."
2,reference,1.0,St Vincent-St Mary High School is the ground of Akron Summit Assault who play in the Premier Development League which K-W United FC have been champions of.
3,best,0.748615,"st . vincent - st . mary high school is the ground of akron summit assault , that play in the premier development league , which champions were k - w united fc ."


In [95]:
import pickle

# Load the pickled object stored at this path into `tdb`.
# NOTE(review): `pickle.load` can execute arbitrary code while unpickling;
# only load this file if it comes from a trusted source.
with open('../data/pretrained_models/template_db_train', 'rb') as f:
    tdb = pickle.load(f)

In [96]:
len(tdb)

4633

In [99]:
keys = list(tdb.keys())

In [100]:
keys[0]

('Airport', (Triple(subject='slot0', predicate='cityServed', object='slot1'),))

In [197]:
i = 455

In [208]:
i = i + 1
tdb[keys[i]]

{Structure: (Triple(subject='slot0', predicate='status', object='slot1'), Triple(subject='slot0', predicate='nationality', object='slot2'))
 Text: {slot0-0-N}, now {slot1-0-N}, was born in {slot2-0-D}.}

In [170]:
# NOTE(review): this header is a hard-coded literal, not derived from
# `keys[i]`; it only matches the output when `i` points at that entry.
print('<slot0, demonym, slot1>')
# Print the `template_text` of every item stored under the current key.
for t in tdb[keys[i]]:
    print(t.template_text)

<slot0, demonym, slot1>
{slot1-0-N} are from {slot0-0-N}.
{slot1-0-N} are the people who reside in {slot0-0-N}.
{slot1-0-N} is the name given to people from {slot0-0-N}.
{slot1-0-N} is the name for the people of {slot0-0-N}.
