In [3]:
from spacy.lang.en import English

# Shared spaCy English pipeline object; preprocess_reference calls it to tokenise text.
nlp = English()

def preprocess_reference(ref):
    """Tokenise ``ref`` and return it space-joined, ASCII-only and lower-cased.

    The text is run through the module-level ``nlp`` pipeline and the token
    texts are joined with single spaces. Characters that are not ASCII are
    then dropped (they are removed, not transliterated) and the result is
    lower-cased.
    """
    token_texts = [token.text for token in nlp(ref)]
    spaced = ' '.join(token_texts)

    # Encode to UTF-8 bytes and decode them as ASCII while ignoring errors:
    # every byte belonging to a non-ASCII character is silently discarded.
    ascii_only = spaced.encode('utf-8').decode('ascii', 'ignore')

    return ascii_only.lower()

In [4]:
# Submission output files that get preprocessed and scored in this notebook.
# The trailing comment on each path is the name of the section heading under
# which that file is scored with multi-bleu.perl further down.
SUBMISSION_FILEPATHS = [
    "../data/webnlg2017/submissions/melbourne/final_result.txt",  # Melbourne
    "../data/webnlg2017/submissions/tilburg/smt_test.out.ordered",  # Tilb-SMT
    "../data/webnlg2017/submissions/pkuwriter/PKUWriter_results.txt",  # PKUWriter
    "../data/webnlg2017/submissions/upf/UPF_All_sent_final.txt",  # UPF-FORGe
    "../data/webnlg2017/submissions/tilburg/template_test.out.ordered",  # Tilb-Pipeline
    "../data/webnlg2017/submissions/tilburg/nmt_test.out.ordered",  # Tilb-NMT
    "../data/webnlg2017/submissions/baseline_sorted.txt",  # Baseline
    "../data/webnlg2017/submissions/adaptCenter/ADAPTcentreWebNLGsubmission.txt",  # Adapt
    "../data/webnlg2017/submissions/uit-danglt-clnlp/Submission-UIT-DANGNT-CLNLP.txt"  # UIT-VNU
]

# BLEU

## Preprocessing submitted files

In [11]:
for filepath in SUBMISSION_FILEPATHS:

    # The normalised copy is written next to the original with an '_ascii'
    # suffix. The encoding is passed explicitly so that reading does not depend
    # on the platform's locale default. The output only ever contains ASCII
    # characters, because preprocess_reference drops everything else.
    # NOTE(review): assumes the submission files are UTF-8 encoded -- confirm.
    with open(filepath, encoding='utf-8') as f, \
            open(filepath + '_ascii', 'w', encoding='utf-8') as f_out:

        # Each line is written exactly as preprocess_reference returns it;
        # no newline is added here.
        for ref in f:

            f_out.write(preprocess_reference(ref))

## Let's have a look at reference texts

In [8]:
import xml.etree.ElementTree as ET

tree = ET.parse("../data/webnlg2017/testdata_with_lex.xml")
root = tree.getroot()

# One inner list per <entry> element, holding the preprocessed text of each of
# its direct <lex> children in document order.
references = [
    [preprocess_reference(lex.text) for lex in entry.findall('lex')]
    for entry in root.iter('entry')
]

In [9]:
# Show the preprocessed reference texts collected for the first entry.
references[0]

['abilene , texas is served by the abilene regional airport .',
 'abilene regional airport serves the city of abilene in texas .']

## Let's calculate BLEU with NLTK

In [12]:
from nltk.translate.bleu_score import corpus_bleu

# Candidates: the '_ascii' copy of the Melbourne submission, one text per line.
with open('../data/webnlg2017/submissions/melbourne/final_result.txt_ascii') as f:
    candidates = list(f)

# corpus_bleu receives token lists: each candidate split on whitespace, and for
# every entry the list of its whitespace-split reference texts.
split_candidates = list(map(str.split, candidates))
split_references = [
    [text.split() for text in entry_references]
    for entry_references in references
]

corpus_bleu(split_references, split_candidates)

0.41672566017154156

In [13]:
from nltk.translate.bleu_score import sentence_bleu

In [14]:
# Sentence-level BLEU of the first candidate against the references of the first entry.
sentence_bleu(split_references[0], split_candidates[0])

0.8349950232057651

# Let's calculate BLEU with multi-bleu.perl

https://github.com/moses-smt/mosesdecoder/blob/master/scripts/generic/multi-bleu.perl

## First we need to generate reference files

In [15]:
import xml.etree.ElementTree as ET
from spacy.lang.en import English

nlp = English()

tree = ET.parse('test_reference_files_bleu/testdata_with_lex.xml')
root = tree.getroot()

# One inner list per <entry>, with one string per direct <lex> child.
# Each text goes through preprocess_reference (which already tokenises,
# strips non-ASCII characters and lower-cases) and is then tokenised and
# space-joined a second time.
# NOTE(review): the submission files are only passed through
# preprocess_reference once -- confirm the extra tokenisation pass is intended.
references = [
    [
        ' '.join(token.text for token in nlp(preprocess_reference(lex.text)))
        for lex in entry.findall('lex')
    ]
    for entry in root.iter('entry')
]

In [16]:
# Transpose the per-entry reference lists into 8 parallel columns:
# ref_per_id[i][k] is the i-th reference of entry k, or '' when that entry has
# fewer than i + 1 references. References beyond the 8th of an entry are not
# included.
ref_per_id = [
    [entry_refs[slot] if slot < len(entry_refs) else '' for entry_refs in references]
    for slot in range(8)
]

In [17]:
# Write one file per reference slot (ref_0.txt, ref_1.txt, ...), one reference
# text per line, so that line k of every file belongs to entry k.
for slot, slot_refs in enumerate(ref_per_id):
    out_path = 'test_reference_files_bleu/ref_{}.txt'.format(slot)

    with open(out_path, 'w') as out_file:
        for text in slot_refs:
            out_file.write('{}\n'.format(text))

In [18]:
# Peek at the first 20 lines of the preprocessed ('_ascii') Melbourne submission.
!head -n 20 ../data/webnlg2017/submissions/melbourne/final_result.txt_ascii

abilene regional airport serves the city of abilene , texas . 
adolfo surez madrid  barajas airport is located in madrid , paracuellos de jarama , san sebastin de los reyes and alcobendas . 
18l/36r is the runway name of adolfo surez madrid  barajas airport , new zealand . 
the icao location identifier of afonso pena international airport is sbct . 
afonso pena international airport serves the city of curitiba . 
al - taqaddum air base serves the city of fallujah . 
the runway length of al - taqaddum air base is 3684.0 . 
the runway name of alderney airport is 14/32 . 
the runway length of allama iqbal international airport is 3360.12 . 
the number of the first runway at amsterdam airport schiphol is 18 . 
the 1st runway at amsterdam airport schiphol is made from asphalt . 
the runway name of amsterdam airport schiphol is 06/24 ' kaagbaan ' . 
andrews county airport is 973.0 metres above sea level . 
andrews county , texas is the owner of andrews county airport . 
the run

In [19]:
# Peek at the first 20 lines of the first generated reference file.
!head -n 20 test_reference_files_bleu/ref_0.txt

abilene , texas is served by the abilene regional airport .
adolfo surez madrid   barajas airport can be found in madrid , paracuellos de jarama , san sebastin de los reyes and alcobendas .
the runway name of adolfo surez madrid   barajas airport is 18l/36r .
afonso pena international airport icao location idenitifier is sbct .
afonso pena international airport serves the city of curitiba .
the al taqaddum air base serves the city of fallujah .
the runway length of al - taqaddum air base is 3684.0 .
alderney airport runway name is 14/32 .
the runway length at allama iqbal international airport is 3,360.12 .
the first runway at amsterdam 's schiphol airport is known as number 18 .
the 5th runway at amsterdam airport schiphol has an asphalt surfacing .
amsterdam airport schiphol runway name is 06/24 kaagbaan .
andrews county airport is 973 metres above sea level .
the andrews county airport is owned by andrews county , texas .
the runway length of andrews county airport is 

## And then let's try multi-bleu.perl

## Melbourne

In [20]:
# Score the preprocessed Melbourne submission (read from stdin) with multi-bleu.perl,
# passing ref_0.txt, ref_1.txt and ref_2.txt as reference files.
# NOTE(review): ref_3.txt ... ref_7.txt are also generated above but are not passed
# here -- confirm that using only three reference files is intended.
!../evaluation/webnlg2017/webnlg-baseline-master/multi-bleu.perl test_reference_files_bleu/ref_0.txt test_reference_files_bleu/ref_1.txt test_reference_files_bleu/ref_2.txt < ../data/webnlg2017/submissions/melbourne/final_result.txt_ascii

BLEU = 42.75, 79.0/53.3/36.3/25.1 (BP=0.966, ratio=0.967, hyp_len=39095, ref_len=40449)


## Tilb-SMT

In [21]:
# Score the preprocessed Tilburg SMT output (read from stdin) with multi-bleu.perl,
# passing ref_0.txt, ref_1.txt and ref_2.txt as reference files.
!../evaluation/webnlg2017/webnlg-baseline-master/multi-bleu.perl test_reference_files_bleu/ref_0.txt test_reference_files_bleu/ref_1.txt test_reference_files_bleu/ref_2.txt < ../data/webnlg2017/submissions/tilburg/smt_test.out.ordered_ascii

BLEU = 42.62, 73.6/51.5/35.8/25.5 (BP=0.989, ratio=0.989, hyp_len=40525, ref_len=40990)


## PKUWriter

In [22]:
# Score the preprocessed PKUWriter output (read from stdin) with multi-bleu.perl,
# passing ref_0.txt, ref_1.txt and ref_2.txt as reference files.
!../evaluation/webnlg2017/webnlg-baseline-master/multi-bleu.perl test_reference_files_bleu/ref_0.txt test_reference_files_bleu/ref_1.txt test_reference_files_bleu/ref_2.txt < ../data/webnlg2017/submissions/pkuwriter/PKUWriter_results.txt_ascii

BLEU = 36.43, 69.9/45.1/30.9/21.9 (BP=0.953, ratio=0.954, hyp_len=38220, ref_len=40059)


## UPF-FORGe

In [23]:
    # Score the preprocessed UPF output (read from stdin) with multi-bleu.perl,
    # passing ref_0.txt, ref_1.txt and ref_2.txt as reference files.
    !../evaluation/webnlg2017/webnlg-baseline-master/multi-bleu.perl test_reference_files_bleu/ref_0.txt test_reference_files_bleu/ref_1.txt test_reference_files_bleu/ref_2.txt < ../data/webnlg2017/submissions/upf/UPF_All_sent_final.txt_ascii

BLEU = 36.97, 75.1/47.6/29.4/17.8 (BP=1.000, ratio=1.063, hyp_len=45511, ref_len=42822)


## Tilb-Pipeline

In [26]:
# Score the preprocessed Tilburg template output (read from stdin) with multi-bleu.perl,
# passing ref_0.txt, ref_1.txt and ref_2.txt as reference files.
!../evaluation/webnlg2017/webnlg-baseline-master/multi-bleu.perl test_reference_files_bleu/ref_0.txt test_reference_files_bleu/ref_1.txt test_reference_files_bleu/ref_2.txt < ../data/webnlg2017/submissions/tilburg/template_test.out.ordered_ascii

BLEU = 34.26, 81.3/55.3/37.1/24.8 (BP=0.760, ratio=0.784, hyp_len=29516, ref_len=37632)


## Tilb-NMT

In [27]:
# Score the preprocessed Tilburg NMT output (read from stdin) with multi-bleu.perl,
# passing ref_0.txt, ref_1.txt and ref_2.txt as reference files.
!../evaluation/webnlg2017/webnlg-baseline-master/multi-bleu.perl test_reference_files_bleu/ref_0.txt test_reference_files_bleu/ref_1.txt test_reference_files_bleu/ref_2.txt < ../data/webnlg2017/submissions/tilburg/nmt_test.out.ordered_ascii

BLEU = 33.67, 69.4/42.1/26.4/16.7 (BP=1.000, ratio=1.036, hyp_len=43295, ref_len=41787)


## Baseline

In [28]:
# Score the preprocessed baseline output (read from stdin) with multi-bleu.perl,
# passing ref_0.txt, ref_1.txt and ref_2.txt as reference files.
!../evaluation/webnlg2017/webnlg-baseline-master/multi-bleu.perl test_reference_files_bleu/ref_0.txt test_reference_files_bleu/ref_1.txt test_reference_files_bleu/ref_2.txt < ../data/webnlg2017/submissions/baseline_sorted.txt_ascii

BLEU = 31.78, 63.5/39.7/29.1/22.0 (BP=0.891, ratio=0.897, hyp_len=35312, ref_len=39377)


## Adapt

In [29]:
# Score the preprocessed ADAPT Centre output (read from stdin) with multi-bleu.perl,
# passing ref_0.txt, ref_1.txt and ref_2.txt as reference files.
!../evaluation/webnlg2017/webnlg-baseline-master/multi-bleu.perl test_reference_files_bleu/ref_0.txt test_reference_files_bleu/ref_1.txt test_reference_files_bleu/ref_2.txt < ../data/webnlg2017/submissions/adaptCenter/ADAPTcentreWebNLGsubmission.txt_ascii

BLEU = 30.19, 54.4/34.9/24.4/17.9 (BP=1.000, ratio=1.341, hyp_len=57893, ref_len=43182)


## UIT-VNU

In [30]:
# Score the preprocessed UIT-DANGNT-CLNLP output (read from stdin) with multi-bleu.perl,
# passing ref_0.txt, ref_1.txt and ref_2.txt as reference files.
!../evaluation/webnlg2017/webnlg-baseline-master/multi-bleu.perl test_reference_files_bleu/ref_0.txt test_reference_files_bleu/ref_1.txt test_reference_files_bleu/ref_2.txt < ../data/webnlg2017/submissions/uit-danglt-clnlp/Submission-UIT-DANGNT-CLNLP.txt_ascii

BLEU = 6.61, 64.5/49.2/33.5/23.7 (BP=0.166, ratio=0.358, hyp_len=11072, ref_len=30966)
