In [1]:
import os
import re

os.sys.path.insert(0, '../script')

import webnlg

import pandas as pd

In [2]:
# Load the 'train', 'dev' and 'test_with_lex' splits through the project-local
# `webnlg` helper, requesting its pandas-based structure.
corpus = webnlg.load(
    dataset=['train', 'dev', 'test_with_lex'],
    structure='pandas',
)

In [3]:
# Headline figures for the loaded corpus, collected as (label, value) pairs so
# the display order of the resulting Series is explicit.
# `edf`, `mdf` and `ldf` are frames exposed by `corpus`; presumably entries,
# modified triples and lexicalisations respectively -- TODO confirm.
stat_rows = [
    ('number of entries', len(corpus.edf)),
    ('number of distinct triples', corpus.mdf.mtext.nunique()),
    ('number of distinct generated texts', corpus.ldf.ltext.nunique()),
    ('number of reference texts', len(corpus.ldf)),
    ('number of triples', len(corpus.mdf)),
    ('number of characters in reference texts', corpus.ldf.ltext.str.len().sum()),
]

datasets_stats = pd.Series(dict(stat_rows))

datasets_stats

number of entries                             9674
number of distinct triples                    3221
number of distinct generated texts           25214
number of reference texts                    25298
number of triples                            28399
number of characters in reference texts    2956533
dtype: int64

# One entry per category and triple-set size

In [11]:
def _sample_one_entry(group):
    """Return a single sampled row of `group`, using a fixed seed so reruns agree."""
    return group.sample(random_state=10)


# One sampled entry for every (category, ntriples) combination.
entries_by_category_and_size = corpus.edf.groupby(['category', 'ntriples'])
random_entries = entries_by_category_and_size.apply(_sample_one_entry)

len(random_entries)

81

In [12]:
# Join the sampled entries with `corpus.ldf` on their shared `idx` column, so
# each sampled entry is repeated once per matching row of `ldf`.
random_entries_with_lexes = random_entries.merge(corpus.ldf, on='idx')

len(random_entries_with_lexes)

222

In [16]:
# Placeholder column (all None); presumably filled in later with the
# Google Translate output -- TODO confirm.
random_entries_with_lexes['Google Translate'] = None

# Export the sample ordered by category, then by number of triples.
sample_sheet = random_entries_with_lexes.sort_values(['category', 'ntriples'])
sample_sheet.to_excel('sample_translation_evaluation.xlsx', index=False)

# All texts

In [52]:
# Same preparation as for the sample, but over the whole corpus: join entries
# with `corpus.ldf` on `idx`, add the empty 'Google Translate' column and order
# the rows by category and number of triples.
all_texts = (
    corpus.edf
    .merge(corpus.ldf, on='idx')
    .assign(**{'Google Translate': None})
    .sort_values(['category', 'ntriples'])
)

all_texts.to_excel('all_texts_translation_evaluation.xlsx', index=False)

## To use Google Translate, the texts have to be split into buckets of at most 30k characters

In [66]:
import os

def _bucket_start_positions(lengths, max_chars):
    """Return the positional indices at which a new bucket has to start.

    Texts are packed greedily and in order: a text that would push the running
    total of the current bucket past `max_chars` opens a new bucket instead, so
    no bucket exceeds the limit. The only exception is a single text that is
    itself longer than `max_chars`; it gets a bucket of its own.

    Parameters
    ----------
    lengths : iterable of int
        Character count of each text, in the order the texts will be written.
    max_chars : int
        Maximum number of characters allowed per bucket.

    Returns
    -------
    list of int
        Start position of every bucket except the first (which starts at 0).
    """
    starts = []
    running_total = 0

    for position, length in enumerate(lengths):

        # Cut *before* the text that overflows, and restart the count so that
        # this text is the first one accounted for in the new bucket.
        if running_total and running_total + length > max_chars:
            starts.append(position)
            running_total = 0

        running_total += length

    return starts


buckets_dir = './all_texts_buckets'

os.makedirs(buckets_dir, exist_ok=True)

n_characters = 30000

# Iterate over the Series values directly: `Series.iteritems()` is not
# available in pandas 2.x, and the index labels are not needed here.
text_lengths = all_texts['ltext'].str.len()

positions = _bucket_start_positions(text_lengths, n_characters)

# Pair each start with the next start; None lets the last slice run to the end.
for bucket_number, (i_begin, i_end) in enumerate(zip([0] + positions, positions + [None])):

    bucket = all_texts.iloc[i_begin:i_end, :]

    bucket_path = os.path.join(buckets_dir, f'all_texts_bucket_{bucket_number}.txt')

    # NOTE(review): only the text lengths are counted towards the limit; the
    # line breaks (and any quoting) added by `to_csv` are not -- confirm the
    # translator's limit leaves room for them.
    bucket['ltext'].to_csv(bucket_path, index=False)

# After translating all the texts, let's bring them together into a single file

In [66]:
# Spreadsheet the translations are attached to below.
# NOTE(review): presumably a copy of the exported all-texts sheet, with rows in
# the same order as the bucket files -- TODO confirm.
all_texts_sheet_path = 'all_texts_buckets/Todos textos.xlsx'

df = pd.read_excel(all_texts_sheet_path)

In [67]:
from glob import glob

def bucket_range_key(path):
    """Sort key that orders translation files by the numbers in their file name.

    The numbers are compared as integers, so 'translations_100_110.txt' sorts
    after 'translations_21_30.txt' (a plain string sort would put it first).
    The path itself is the tie-breaker, which keeps the order deterministic.
    """
    numbers = [int(digits) for digits in re.findall(r'\d+', os.path.basename(path))]
    return numbers, path


# The translations are later matched to the rows purely by position, so the
# files must be read in bucket order.
translation_files = sorted(glob('all_texts_buckets/translations_*.txt'), key=bucket_range_key)

translation_files

['all_texts_buckets/translations_0_21.txt',
 'all_texts_buckets/translations_21_30.txt',
 'all_texts_buckets/translations_31_40.txt',
 'all_texts_buckets/translations_41_50.txt',
 'all_texts_buckets/translations_51_60.txt',
 'all_texts_buckets/translations_61_70.txt',
 'all_texts_buckets/translations_71_80.txt',
 'all_texts_buckets/translations_81_90.txt',
 'all_texts_buckets/translations_91_98.txt']

In [72]:
# Read every translated bucket, in order, into one flat list holding one
# stripped line per translated text.
translations = []

for translation_file in translation_files:

    # Explicit UTF-8: the platform default encoding is not guaranteed to handle
    # the accented characters of the Portuguese translations.
    with open(translation_file, encoding='utf-8') as f:

        translations.extend(line.strip() for line in f)

# The translations are matched to the rows by position only, so a length
# mismatch means the files and the sheet are out of sync; fail with a message
# that says so before touching `df`.
if len(translations) != len(df):

    raise ValueError(
        f'{len(translations)} translated lines were read, '
        f'but the sheet has {len(df)} rows'
    )

df['Google Translate'] = translations

In [73]:
# Map each value of the `dataset` column to the sub-frame of its rows.
df_by_dataset = dict(tuple(df.groupby('dataset')))

In [90]:
# Output location: one '<dataset>.xml' file inside CORPUS_FOLDER.
CORPUS_FOLDER = 'corpus_pt'
OUTFILE_TEMPLATE = '{}.xml'

# `str.format` template for one <lex> element; the fields are filled from a row
# dict, so they have to match the column names `lid` and `Google Translate`.
LEX_XML_ELEMENT_TEMPLATE = '<lex comment="none" lid="{lid}">{Google Translate}</lex>'

# Greedy and DOTALL on purpose: a single match spans from the first '<lex' to
# the last '</lex>', i.e. the whole run of <lex> elements of an entry.
C_LEX_ELEMENT = re.compile(r'<lex.*</lex>', flags=re.DOTALL)

def replace_lex_by_slot(xml):
    """Turn an entry's XML into a `str.format` template with a `{lexes}` slot.

    Everything from the first '<lex' to the last '</lex>' is replaced by the
    `{lexes}` placeholder. Literal braces elsewhere in `xml` are doubled so that
    a later `.format(lexes=...)` reproduces them unchanged instead of treating
    them as replacement fields.

    Parameters
    ----------
    xml : str
        XML text of one entry.

    Returns
    -------
    str
        Template to be completed with `.format(lexes=...)`.
    """
    escaped_xml = xml.replace('{', '{{').replace('}', '}}')

    return C_LEX_ELEMENT.sub('{lexes}', escaped_xml)

# Write one XML file per dataset into CORPUS_FOLDER. For every entry, the
# original <lex> elements are replaced by new ones built from the translations.
# NOTE(review): the visible part of this cell never writes the closing
# '</entries>' / '</benchmark>' tags -- confirm they are written further down.
os.makedirs(CORPUS_FOLDER, exist_ok=True)

# The loop variable is deliberately not called `df`, so the notebook-level `df`
# keeps pointing at the full sheet and the earlier cells can be re-run.
for dataset, dataset_df in df_by_dataset.items():

    outfile = os.path.join(CORPUS_FOLDER, OUTFILE_TEMPLATE.format(dataset))

    # No XML declaration is written, so parsers will assume UTF-8; encode the
    # file accordingly instead of relying on the platform default.
    with open(outfile, 'w', encoding='utf-8') as f:

        f.write('<benchmark>\n')
        f.write('\t<entries>\n\t\t')

        for idx, df_rows in dataset_df.groupby('idx'):

            # The template is taken from the first row's `content`; presumably
            # it is identical for all rows of an entry -- TODO confirm.
            xml = replace_lex_by_slot(df_rows.iloc[0]['content'])

            # One <lex> element per row, filled from the row's own columns.
            # NOTE(review): the translation is inserted verbatim; if it can
            # contain '&' or '<' it needs XML escaping -- verify against the
            # data before relying on the output being well-formed.
            lex_xml_elements = [
                LEX_XML_ELEMENT_TEMPLATE.format(**row.to_dict())
                for _, row in df_rows.iterrows()
            ]

            xml = xml.format(lexes='\n\t\t\t'.join(lex_xml_elements))

            f.write(xml)