In [4]:
from textacy.preprocess import preprocess_text

def preprocess(ref):
    """Normalize one piece of text before it is used for BLEU scoring.

    The work is delegated to textacy's ``preprocess_text`` with three
    switches turned on: accent stripping, lowercasing and punctuation
    removal.

    Args:
        ref: the text to normalize (a submission line or a reference text).

    Returns:
        Whatever ``preprocess_text`` returns for ``ref`` with these options.

    NOTE(review): the reference dump shown later in this notebook contains
    tokens such as ``3684 0`` and ``amsterdam s``, which suggests punctuation
    ends up as whitespace rather than being deleted outright -- confirm
    against the installed textacy version.
    """
    normalization_flags = {
        'no_accents': True,  # accented characters -> unaccented versions
        'lowercase': True,   # fold everything to lower case
        'no_punct': True,    # strip punctuation marks
    }
    return preprocess_text(ref, **normalization_flags)

In [2]:
# Raw WebNLG 2017 submission files, one per system. The preprocessing cell
# below writes a '<path>_preprocessed' copy next to each of them.
# The trailing labels are the section names used for each system in the
# multi-bleu.perl part of this notebook.
SUBMISSION_FILEPATHS = [
    "../data/webnlg2017/submissions/melbourne/final_result.txt",  # Melbourne
    "../data/webnlg2017/submissions/tilburg/smt_test.out.ordered",  # Tilb-SMT
    "../data/webnlg2017/submissions/pkuwriter/PKUWriter_results.txt",  # PKUWriter
    "../data/webnlg2017/submissions/upf/UPF_All_sent_final.txt",  # UPF-FORGe
    "../data/webnlg2017/submissions/tilburg/template_test.out.ordered",  # Tilb-Pipeline
    "../data/webnlg2017/submissions/tilburg/nmt_test.out.ordered",  # Tilb-NMT
    "../data/webnlg2017/submissions/baseline_sorted.txt",  # Baseline
    "../data/webnlg2017/submissions/adaptCenter/ADAPTcentreWebNLGsubmission.txt",  # Adapt
    "../data/webnlg2017/submissions/uit-danglt-clnlp/Submission-UIT-DANGNT-CLNLP.txt"  # UIT-VNU
]

# BLEU

## Preprocessing submitted files

In [13]:
# For each submitted file, write a preprocessed copy next to the original,
# under the same name with a '_preprocessed' suffix.
for submission_path in SUBMISSION_FILEPATHS:
    preprocessed_path = submission_path + '_preprocessed'
    with open(submission_path) as raw_file:
        with open(preprocessed_path, 'w') as clean_file:
            # One output line per input line, in the same order.
            clean_file.writelines(
                preprocess(raw_line) + '\n' for raw_line in raw_file
            )

## Let's have a look at reference texts

In [15]:
# Load the reference texts: every <entry> of the test-set XML contributes one
# list holding the preprocessed text of each of its <lex> children.

import xml.etree.ElementTree as ET

tree = ET.parse("../data/webnlg2017/testdata_with_lex.xml")
root = tree.getroot()

# references[k] -> preprocessed reference strings of the k-th entry,
# in document order.
references = [
    [preprocess(lex.text) for lex in entry.findall('lex')]
    for entry in root.iter('entry')
]

In [8]:
# Peek at the preprocessed reference texts of the first entry.
references[0]

['abilene texas is served by the abilene regional airport',
 'abilene regional airport serves the city of abilene in texas']

## Let's calculate BLEU with NLTK

In [50]:
from nltk.translate.bleu_score import corpus_bleu

# Candidates: the preprocessed Melbourne submission, one sentence per line.
with open('../data/webnlg2017/submissions/melbourne/final_result.txt_preprocessed') as candidates_file:
    candidates = candidates_file.readlines()

# corpus_bleu is given whitespace-tokenized text: one token list per
# candidate, and for every entry a list with one token list per reference.
split_candidates = list(map(str.split, candidates))
split_references = [
    list(map(str.split, entry_references)) for entry_references in references
]

# Last expression of the cell, so the corpus-level score is displayed.
corpus_bleu(split_references, split_candidates)

0.4303379633651451

# Let's calculate BLEU with multi-bleu.perl

https://github.com/moses-smt/mosesdecoder/blob/master/scripts/generic/multi-bleu.perl

## First we need to generate reference files

In [9]:
import xml.etree.ElementTree as ET

# Rebuild `references` from the copy of the test set stored under
# test_reference_files_bleu/ (a different path from the one parsed earlier
# in this notebook).
tree = ET.parse('test_reference_files_bleu/testdata_with_lex.xml')
root = tree.getroot()

# One inner list per <entry>, holding the preprocessed text of each <lex>.
references = [
    [preprocess(lex.text) for lex in entry.findall('lex')]
    for entry in root.iter('entry')
]

In [10]:
# Transpose the per-entry reference lists into 8 "columns":
# ref_per_id[i][k] is the i-th reference of entry k, or '' when entry k has
# no i-th reference, so every column has exactly one line per entry.
# The column count is fixed at 8, so any reference beyond an entry's eighth
# is left out.
ref_per_id = [
    [entry_refs[i] if i < len(entry_refs) else '' for entry_refs in references]
    for i in range(8)
]

In [11]:
# Dump each reference column to its own file (ref_0.txt, ref_1.txt, ...),
# one reference per line, in entry order.
for column_index, column in enumerate(ref_per_id):
    target_path = 'test_reference_files_bleu/ref_{}.txt'.format(column_index)
    with open(target_path, 'w') as ref_file:
        for text in column:
            ref_file.write('{}\n'.format(text))

In [6]:
# Show the first 20 lines of the Melbourne submission's '_ascii' copy.
# NOTE(review): no cell visible in this notebook writes the '_ascii' files
# (the preprocessing cell writes '_preprocessed') -- confirm how they were produced.
!head -n 20 ../data/webnlg2017/submissions/melbourne/final_result.txt_ascii

abilene regional airport serves the city of abilene texas
adolfo suarez madrid barajas airport is located in madrid paracuellos de jarama san sebastian de los reyes and alcobendas
18l 36r is the runway name of adolfo suarez madrid barajas airport new zealand
the icao location identifier of afonso pena international airport is sbct
afonso pena international airport serves the city of curitiba
al taqaddum air base serves the city of fallujah
the runway length of al taqaddum air base is 3684 0
the runway name of alderney airport is 14 32
the runway length of allama iqbal international airport is 3360 12
the number of the first runway at amsterdam airport schiphol is 18
the 1st runway at amsterdam airport schiphol is made from asphalt
the runway name of amsterdam airport schiphol is 06 24 kaagbaan
andrews county airport is 973 0 metres above sea level
andrews county texas is the owner of andrews county airport
the runway length of andrews county airport is 896 0
the 1st runway at angola in

In [13]:
# Show the first 20 lines of the first generated reference file, to compare
# line by line with the submission excerpt above.
!head -n 20 test_reference_files_bleu/ref_0.txt

abilene texas is served by the abilene regional airport
adolfo suarez madrid barajas airport can be found in madrid paracuellos de jarama san sebastian de los reyes and alcobendas
the runway name of adolfo suarez madrid barajas airport is 18l 36r
afonso pena international airport icao location idenitifier is sbct
afonso pena international airport serves the city of curitiba
the al taqaddum air base serves the city of fallujah
the runway length of al taqaddum air base is 3684 0
alderney airport runway name is 14 32
the runway length at allama iqbal international airport is 3 360 12
the first runway at amsterdam s schiphol airport is known as number 18
the 5th runway at amsterdam airport schiphol has an asphalt surfacing
amsterdam airport schiphol runway name is 06 24 kaagbaan
andrews county airport is 973 metres above sea level
the andrews county airport is owned by andrews county texas
the runway length of andrews county airport is 896
angola international airport is the first runway m

# A realidade é que ainda não consegui replicar a métrica BLEU
* é necessário gerar arquivos pré-processados com o script evaluation.py
* ainda não entendi como gerá-los para arquivos novos, além dos submitted files

## And then let's try multi-bleu.perl

## Melbourne

In [56]:
# Melbourne: unlike the other team cells in this section, this run passes -lc
# and uses the gold-all-cat-reference{0,1,2}.lex files and the team file from
# webnlg-automatic-evaluation, not the ref_*.txt / '_ascii' files.
!../evaluation/webnlg2017/webnlg-baseline-master/multi-bleu.perl -lc ../evaluation/webnlg2017/webnlg-automatic-evaluation/references/gold-all-cat-reference0.lex ../evaluation/webnlg2017/webnlg-automatic-evaluation/references/gold-all-cat-reference1.lex ../evaluation/webnlg2017/webnlg-automatic-evaluation/references/gold-all-cat-reference2.lex < ../evaluation/webnlg2017/webnlg-automatic-evaluation/teams/GKB_Unimelb_all-cat.txt

BLEU = 45.13, 79.9/55.4/38.8/27.4 (BP=0.969, ratio=0.969, hyp_len=40642, ref_len=41921)


In [52]:
# Show the BLEU result stored in the evaluation repo's eval/ folder for the
# same team file, for comparison with the score computed in the previous cell.
!head ../evaluation/webnlg2017/webnlg-automatic-evaluation/eval/bleu3ref-GKB_Unimelb_all-cat.txt

BLEU = 45.13, 79.9/55.4/38.8/27.4 (BP=0.969, ratio=0.969, hyp_len=40642, ref_len=41921)
BLEU = 45.13, 79.9/55.4/38.8/27.4 (BP=0.969, ratio=0.969, hyp_len=40642, ref_len=41921)


## Tilb-SMT

In [15]:
# Tilb-SMT: score the '_ascii' copy of the submission against ref_0..ref_2
# (only the first three generated reference files are passed; no -lc flag).
!../evaluation/webnlg2017/webnlg-baseline-master/multi-bleu.perl test_reference_files_bleu/ref_0.txt test_reference_files_bleu/ref_1.txt test_reference_files_bleu/ref_2.txt < ../data/webnlg2017/submissions/tilburg/smt_test.out.ordered_ascii

BLEU = 44.34, 76.0/54.3/37.3/26.1 (BP=0.991, ratio=0.991, hyp_len=36180, ref_len=36518)


## PKUWriter

In [16]:
# PKUWriter: score the '_ascii' copy of the submission against ref_0..ref_2
# (only the first three generated reference files are passed; no -lc flag).
!../evaluation/webnlg2017/webnlg-baseline-master/multi-bleu.perl test_reference_files_bleu/ref_0.txt test_reference_files_bleu/ref_1.txt test_reference_files_bleu/ref_2.txt < ../data/webnlg2017/submissions/pkuwriter/PKUWriter_results.txt_ascii

BLEU = 37.93, 73.6/50.2/34.5/24.2 (BP=0.906, ratio=0.910, hyp_len=31950, ref_len=35120)


## UPF-FORGe

In [17]:
# UPF-FORGe: score the '_ascii' copy of the submission against ref_0..ref_2
# (only the first three generated reference files are passed; no -lc flag).
!../evaluation/webnlg2017/webnlg-baseline-master/multi-bleu.perl test_reference_files_bleu/ref_0.txt test_reference_files_bleu/ref_1.txt test_reference_files_bleu/ref_2.txt < ../data/webnlg2017/submissions/upf/UPF_All_sent_final.txt_ascii

BLEU = 39.07, 77.6/50.7/31.2/19.0 (BP=1.000, ratio=1.012, hyp_len=37497, ref_len=37038)


## Tilb-Pipeline

In [18]:
# Tilb-Pipeline: score the '_ascii' copy of the submission against ref_0..ref_2
# (only the first three generated reference files are passed; no -lc flag).
!../evaluation/webnlg2017/webnlg-baseline-master/multi-bleu.perl test_reference_files_bleu/ref_0.txt test_reference_files_bleu/ref_1.txt test_reference_files_bleu/ref_2.txt < ../data/webnlg2017/submissions/tilburg/template_test.out.ordered_ascii

BLEU = 33.71, 82.1/56.9/37.6/24.9 (BP=0.737, ratio=0.766, hyp_len=25481, ref_len=33256)


## Tilb-NMT

In [19]:
# Tilb-NMT: score the '_ascii' copy of the submission against ref_0..ref_2
# (only the first three generated reference files are passed; no -lc flag).
!../evaluation/webnlg2017/webnlg-baseline-master/multi-bleu.perl test_reference_files_bleu/ref_0.txt test_reference_files_bleu/ref_1.txt test_reference_files_bleu/ref_2.txt < ../data/webnlg2017/submissions/tilburg/nmt_test.out.ordered_ascii

BLEU = 33.00, 68.4/42.4/25.8/15.8 (BP=1.000, ratio=1.036, hyp_len=38328, ref_len=37014)


## Baseline

In [20]:
# Baseline: score the '_ascii' copy of the submission against ref_0..ref_2
# (only the first three generated reference files are passed; no -lc flag).
!../evaluation/webnlg2017/webnlg-baseline-master/multi-bleu.perl test_reference_files_bleu/ref_0.txt test_reference_files_bleu/ref_1.txt test_reference_files_bleu/ref_2.txt < ../data/webnlg2017/submissions/baseline_sorted.txt_ascii

BLEU = 32.61, 62.8/41.9/30.9/23.8 (BP=0.874, ratio=0.882, hyp_len=30830, ref_len=34974)


## Adapt

In [21]:
# Adapt: score the '_ascii' copy of the submission against ref_0..ref_2
# (only the first three generated reference files are passed; no -lc flag).
!../evaluation/webnlg2017/webnlg-baseline-master/multi-bleu.perl test_reference_files_bleu/ref_0.txt test_reference_files_bleu/ref_1.txt test_reference_files_bleu/ref_2.txt < ../data/webnlg2017/submissions/adaptCenter/ADAPTcentreWebNLGsubmission.txt_ascii

BLEU = 30.31, 53.2/35.3/24.8/18.2 (BP=1.000, ratio=1.333, hyp_len=51161, ref_len=38388)


## UIT-VNU

In [22]:
# UIT-VNU: score the '_ascii' copy of the submission against ref_0..ref_2
# (only the first three generated reference files are passed; no -lc flag).
!../evaluation/webnlg2017/webnlg-baseline-master/multi-bleu.perl test_reference_files_bleu/ref_0.txt test_reference_files_bleu/ref_1.txt test_reference_files_bleu/ref_2.txt < ../data/webnlg2017/submissions/uit-danglt-clnlp/Submission-UIT-DANGNT-CLNLP.txt_ascii

BLEU = 7.09, 65.1/53.1/37.3/26.7 (BP=0.165, ratio=0.357, hyp_len=9793, ref_len=27457)
