# BLEU

In [23]:
def preprocess_reference(ref):
    
    return bytes.decode(str.encode(ref, 'utf-8'), 'ascii', 'ignore').lower()

## Let's have a look at one of submissions

In [4]:
SUBMISSION_FILEPATH = "../data/webnlg2017/submissions/melbourne/final_result.txt"

In [5]:
!head $SUBMISSION_FILEPATH

abilene regional airport serves the city of abilene, texas .
adolfo suárez madrid–barajas airport is located in madrid, paracuellos de jarama, san sebastián de los reyes and alcobendas .
18l/36r is the runway name of adolfo suárez madrid–barajas airport , new zealand .
the icao location identifier of afonso pena international airport is sbct .
afonso pena international airport serves the city of curitiba .
al-taqaddum air base serves the city of fallujah .
the runway length of al-taqaddum air base is 3684.0 .
the runway name of alderney airport is 14/32 .
the runway length of allama iqbal international airport is 3360.12 .
the number of the first runway at amsterdam airport schiphol is 18 .


## Let's preprocess submited file converting non ascii characters to ascii

In [31]:
with open(SUBMISSION_FILEPATH) as f, open(SUBMISSION_FILEPATH + '_ascii', 'w') as f_out:
    
    candidates = []
    for ref in f:
        
        candidates.append(ref)
        f_out.write(preprocess_reference(ref))

## Let's have a look at reference texts

In [32]:
import xml.etree.ElementTree as ET
from spacy.lang.en import English

nlp = English()

tree = ET.parse("../data/webnlg2017/testdata_with_lex.xml")
root = tree.getroot()

references = []

for entry in root.iter('entry'):
    
    references_of_entry = []
    for ref in entry.findall('lex'):
        
        ref_lower = preprocess_reference(ref.text)
        doc = nlp(ref_lower)
        ref_final = ' '.join((token.text for token in doc))
        
        references_of_entry.append(ref_final)
    
    references.append(references_of_entry)

In [33]:
references[0]

['abilene , texas is served by the abilene regional airport .',
 'abilene regional airport serves the city of abilene in texas .']

## Let's calculate BLEU with NLTK

In [34]:
from nltk.translate.bleu_score import corpus_bleu

split_candidates = [candidate.split() for candidate in candidates]
split_references = [[reference_text.split() for reference_text in reference] for reference in references]

corpus_bleu(split_references, split_candidates)

0.35313600222360175

In [35]:
from nltk.translate.bleu_score import sentence_bleu

In [36]:
sentence_bleu(split_references[0], split_candidates[0])

0.6398166741645538

# Let's calculate BLEU with multi-bleu.perl

https://github.com/moses-smt/mosesdecoder/blob/master/scripts/generic/multi-bleu.perl

## First we need to generate reference files

In [37]:
import xml.etree.ElementTree as ET
from spacy.lang.en import English

nlp = English()

tree = ET.parse('test_reference_files_bleu/testdata_with_lex.xml')
root = tree.getroot()

references = []
for entry in root.iter('entry'):
    
    references_of_entry = []
    for ref in entry.findall('lex'):
        
        ref_lower = preprocess_reference(ref.text)
        doc = nlp(ref_lower)
        ref_final = ' '.join((token.text for token in doc))
        
        references_of_entry.append(ref_final)
    
    references.append(references_of_entry)

In [38]:
ref_per_id = [[] for _ in range(8)]

for i in range(8):
    
    for reference_list in references:
        
        if len(reference_list) > i:
            
            ref_per_id[i].append(reference_list[i])
        else:
            ref_per_id[i].append('')

In [39]:
for i, refs in enumerate(ref_per_id):
    
    with open('test_reference_files_bleu/ref_{}.txt'.format(i), 'w') as f:
        
        f.writelines(('{}\n'.format(ref) for ref in refs))

In [41]:
!head -n 20 ../data/webnlg2017/submissions/melbourne/final_result.txt_ascii

abilene regional airport serves the city of abilene, texas .
adolfo surez madridbarajas airport is located in madrid, paracuellos de jarama, san sebastin de los reyes and alcobendas .
18l/36r is the runway name of adolfo surez madridbarajas airport , new zealand .
the icao location identifier of afonso pena international airport is sbct .
afonso pena international airport serves the city of curitiba .
al-taqaddum air base serves the city of fallujah .
the runway length of al-taqaddum air base is 3684.0 .
the runway name of alderney airport is 14/32 .
the runway length of allama iqbal international airport is 3360.12 .
the number of the first runway at amsterdam airport schiphol is 18 .
the 1st runway at amsterdam airport schiphol is made from asphalt .
the runway name of amsterdam airport schiphol is 06/24 'kaagbaan' .
andrews county airport is 973.0 metres above sea level .
andrews county, texas is the owner of andrews county airport .
the runway length of andrews county

In [42]:
!head -n 20 test_reference_files_bleu/ref_0.txt

abilene , texas is served by the abilene regional airport .
adolfo surez madridbarajas airport can be found in madrid , paracuellos de jarama , san sebastin de los reyes and alcobendas .
the runway name of adolfo surez madridbarajas airport is 18l/36r .
afonso pena international airport icao location idenitifier is sbct .
afonso pena international airport serves the city of curitiba .
the al taqaddum air base serves the city of fallujah .
the runway length of al - taqaddum air base is 3684.0 .
alderney airport runway name is 14/32 .
the runway length at allama iqbal international airport is 3,360.12 .
the first runway at amsterdam 's schiphol airport is known as number 18 .
the 5th runway at amsterdam airport schiphol has an asphalt surfacing .
amsterdam airport schiphol runway name is 06/24 kaagbaan .
andrews county airport is 973 metres above sea level .
the andrews county airport is owned by andrews county , texas .
the runway length of andrews county airport is 896 .

## And then let's try multi-bleu.perl

## Melbourne

In [44]:
!../evaluation/webnlg2017/webnlg-baseline-master/multi-bleu.perl test_reference_files_bleu/ref_0.txt test_reference_files_bleu/ref_1.txt test_reference_files_bleu/ref_2.txt < ../data/webnlg2017/submissions/melbourne/final_result.txt_ascii

BLEU = 35.91, 76.4/48.7/30.6/19.6 (BP=0.929, ratio=0.932, hyp_len=36970, ref_len=39684)


## Tilb-SMT

In [89]:
!../evaluation/webnlg2017/webnlg-baseline-master/multi-bleu.perl test_reference_files_bleu/ref_0.txt test_reference_files_bleu/ref_1.txt test_reference_files_bleu/ref_2.txt < ../data/webnlg2017/submissions/tilburg/smt_test.out.ordered

BLEU = 40.66, 73.2/50.6/34.6/24.4 (BP=0.967, ratio=0.968, hyp_len=38994, ref_len=40298)


## PKUWriter

In [90]:
!../evaluation/webnlg2017/webnlg-baseline-master/multi-bleu.perl test_reference_files_bleu/ref_0.txt test_reference_files_bleu/ref_1.txt test_reference_files_bleu/ref_2.txt < ../data/webnlg2017/submissions/pkuwriter/PKUWriter_results.txt

BLEU = 36.38, 69.8/45.0/30.8/21.8 (BP=0.954, ratio=0.955, hyp_len=38247, ref_len=40033)


## UPF-FORGe

In [87]:
!../evaluation/webnlg2017/webnlg-baseline-master/multi-bleu.perl test_reference_files_bleu/ref_0.txt test_reference_files_bleu/ref_1.txt test_reference_files_bleu/ref_2.txt < ../data/webnlg2017/submissions/upf/UPF_All_sent_final.txt

BLEU = 35.05, 74.1/45.8/27.6/16.1 (BP=1.000, ratio=1.051, hyp_len=44700, ref_len=42550)


## Tilb-Pipeline

In [91]:
!../evaluation/webnlg2017/webnlg-baseline-master/multi-bleu.perl test_reference_files_bleu/ref_0.txt test_reference_files_bleu/ref_1.txt test_reference_files_bleu/ref_2.txt < ../data/webnlg2017/submissions/tilburg/template_test.out.ordered

BLEU = 32.30, 80.0/53.3/35.1/23.1 (BP=0.749, ratio=0.776, hyp_len=29103, ref_len=37505)


## Tilb-NMT

In [92]:
!../evaluation/webnlg2017/webnlg-baseline-master/multi-bleu.perl test_reference_files_bleu/ref_0.txt test_reference_files_bleu/ref_1.txt test_reference_files_bleu/ref_2.txt < ../data/webnlg2017/submissions/tilburg/nmt_test.out.ordered

BLEU = 32.13, 68.5/40.6/24.9/15.4 (BP=1.000, ratio=1.028, hyp_len=42741, ref_len=41573)


## Baseline

In [93]:
!../evaluation/webnlg2017/webnlg-baseline-master/multi-bleu.perl test_reference_files_bleu/ref_0.txt test_reference_files_bleu/ref_1.txt test_reference_files_bleu/ref_2.txt < ../data/webnlg2017/submissions/baseline_sorted.txt

BLEU = 31.70, 63.4/39.6/28.9/21.9 (BP=0.892, ratio=0.898, hyp_len=35312, ref_len=39332)


## Adapt

In [94]:
!../evaluation/webnlg2017/webnlg-baseline-master/multi-bleu.perl test_reference_files_bleu/ref_0.txt test_reference_files_bleu/ref_1.txt test_reference_files_bleu/ref_2.txt < ../data/webnlg2017/submissions/adaptCenter/ADAPTcentreWebNLGsubmission.txt

BLEU = 29.84, 54.3/34.6/24.0/17.5 (BP=1.000, ratio=1.325, hyp_len=57002, ref_len=43021)


## UIT-VNU

In [95]:
!../evaluation/webnlg2017/webnlg-baseline-master/multi-bleu.perl test_reference_files_bleu/ref_0.txt test_reference_files_bleu/ref_1.txt test_reference_files_bleu/ref_2.txt < ../data/webnlg2017/submissions/uit-danglt-clnlp/Submission-UIT-DANGNT-CLNLP.txt

BLEU = 6.35, 63.8/48.3/32.6/22.8 (BP=0.163, ratio=0.356, hyp_len=10982, ref_len=30883)
