In [2]:
%run ../script/webnlg.py

lex = WebNLGDataset("../data/webnlg2017/testdata_with_lex.xml")

s_multiwordexpressions = lex.mdf.m_subject.str.lower().str.replace('_', ' ').str.split().tolist()
o_multiwordexpressions = lex.mdf.m_object.str.lower().str.replace('_', ' ').str.split().tolist()

all_multiwordexpressions = s_multiwordexpressions + o_multiwordexpressions
all_multiwordexpressions = list(set([tuple(s) for s in all_multiwordexpressions]))

from nltk.tokenize.mwe import MWETokenizer

t = MWETokenizer(all_multiwordexpressions)

print(t.tokenize('al-taqaddum air base serves the city of fallujah .\n'.split()))

['al-taqaddum_air_base', 'serves', 'the', 'city', 'of', 'fallujah', '.']


In [95]:
from nltk.translate.bleu_score import corpus_bleu
from textacy.preprocess import preprocess_text
import xml.etree.ElementTree as ET

def preprocess(ref):
    
    ref = preprocess_text(ref, no_accents=True, lowercase=True)
    
    return t.tokenize(ref.split())

tree = ET.parse("../data/webnlg2017/testdata_with_lex.xml")
root = tree.getroot()

references_original = []
references_preprocessed = []

for entry in root.iter('entry'):
    
    references_of_entry, references_of_entry_preprocessed = [], []
    for ref in entry.findall('lex'):
        
        references_of_entry.append(ref.text)
        references_of_entry_preprocessed.append(preprocess(ref.text))
    
    references_preprocessed.append(references_of_entry_preprocessed)
    references_original.append(references_of_entry)

In [13]:
SUBMISSION_FILEPATHS = [
    "../data/webnlg2017/submissions/melbourne/final_result.txt",
    "../data/webnlg2017/submissions/tilburg/smt_test.out.ordered",
    "../data/webnlg2017/submissions/pkuwriter/PKUWriter_results.txt",
    "../data/webnlg2017/submissions/upf/UPF_All_sent_final.txt",
    "../data/webnlg2017/submissions/tilburg/template_test.out.ordered",
    "../data/webnlg2017/submissions/tilburg/nmt_test.out.ordered",
    "../data/webnlg2017/submissions/baseline_sorted.txt",
    "../data/webnlg2017/submissions/adaptCenter/ADAPTcentreWebNLGsubmission.txt",
    "../data/webnlg2017/submissions/uit-danglt-clnlp/Submission-UIT-DANGNT-CLNLP.txt"
]

In [14]:
submissions = {}

for filepath in SUBMISSION_FILEPATHS:

    submissions[filepath] = {}
    
    with open(filepath) as f:

        lines = f.readlines()
        submissions[filepath]['candidates_original'] = lines
        submissions[filepath]['candidates_preprocessed'] = [preprocess(line) for line in lines]

In [15]:
submissions['../data/webnlg2017/submissions/melbourne/final_result.txt']['candidates_original'][5]

'al-taqaddum air base serves the city of fallujah .\n'

In [16]:
submissions['../data/webnlg2017/submissions/melbourne/final_result.txt']['candidates_preprocessed'][5]

['al-taqaddum_air_base', 'serves', 'the', 'city', 'of', 'fallujah', '.']

In [17]:
references_original[5]

['The Al Taqaddum Air Base serves the city of Fallujah.',
 'Al-Taqaddum Air Base serves the city of Fallujah.']

In [18]:
references_preprocessed[5]

[['the',
  'al',
  'taqaddum',
  'air',
  'base',
  'serves',
  'the',
  'city',
  'of',
  'fallujah.'],
 ['al-taqaddum_air_base', 'serves', 'the', 'city', 'of', 'fallujah.']]

In [19]:
corpus_bleu(references_preprocessed, submissions['../data/webnlg2017/submissions/melbourne/final_result.txt']['candidates_preprocessed'])

0.238959236874247

In [20]:
for filepath, data in submissions.items():
    
    print(filepath)
    
    print(corpus_bleu(references_preprocessed, data['candidates_preprocessed']))
    
    print()

../data/webnlg2017/submissions/melbourne/final_result.txt
0.238959236874247

../data/webnlg2017/submissions/tilburg/smt_test.out.ordered
0.2434789580589374

../data/webnlg2017/submissions/pkuwriter/PKUWriter_results.txt
0.18480246526227984

../data/webnlg2017/submissions/upf/UPF_All_sent_final.txt
0.15576518171470138

../data/webnlg2017/submissions/tilburg/template_test.out.ordered
0.16512476933647605

../data/webnlg2017/submissions/tilburg/nmt_test.out.ordered
0.15104097109989256

../data/webnlg2017/submissions/baseline_sorted.txt
0.19296743398818347

../data/webnlg2017/submissions/adaptCenter/ADAPTcentreWebNLGsubmission.txt
0.16253063000877843

../data/webnlg2017/submissions/uit-danglt-clnlp/Submission-UIT-DANGNT-CLNLP.txt
0.020733098629933357

