In [1]:
import pandas as pd
import os
import re
import numpy as np
from sklearn.model_selection import train_test_split
from pathlib import Path

In [2]:
RANDOM_SEED = 1234
source_data_path = '../../hu-nmt/data/ftp.mokk.bme.hu/Hunglish2/modern.lit/bi'

In [3]:
!ls "$source_data_path"

hunglish1.lit.bi  hunglish2.lit.bi


## Load the data

In [4]:
hunglish1_hun = []
hunglish1_eng = []
with open(os.path.join(source_data_path, 'hunglish1.lit.bi'), 'r', encoding='iso-8859-1') as input_file:
    for i, line in enumerate(input_file):
        line_parts = [part.strip() for part in line.split('\t')]
        if len(line_parts) == 2:
            hunglish1_hun.append(line_parts[0])
            hunglish1_eng.append(line_parts[1])
        else:
            print(line_parts)

In [5]:
data = {
    'hun': hunglish1_hun,
    'eng': hunglish1_eng
}
df1 = pd.DataFrame(data)

In [6]:
df1

Unnamed: 0,hun,eng
0,- -,-
1,- -,-
2,- -,-
3,- ...,. .
4,-,-
...,...,...
449393,"Zsuppogott, cuppant és micsongott, ez utóbbit ...","It flolloped, gupped and willomied, doing this..."
449394,- Zsupsz be! - mondta.,"""Nabbed!"" said he."
449395,- Zsupszkulccsá varázsoltam a serleget.,"""Turned it into a Portkey."
449396,"Zsûritagként tökéletes ürügye lenne rá, hogy r...",It would be an ideal excuse to make regular vi...


In [7]:
df1['hun_has_letters'] = df1['hun'].apply(lambda sent: any(c.isalpha() for c in sent))
df1['eng_has_letters'] = df1['eng'].apply(lambda sent: any(c.isalpha() for c in sent))

In [8]:
df1

Unnamed: 0,hun,eng,hun_has_letters,eng_has_letters
0,- -,-,False,False
1,- -,-,False,False
2,- -,-,False,False
3,- ...,. .,False,False
4,-,-,False,False
...,...,...,...,...
449393,"Zsuppogott, cuppant és micsongott, ez utóbbit ...","It flolloped, gupped and willomied, doing this...",True,True
449394,- Zsupsz be! - mondta.,"""Nabbed!"" said he.",True,True
449395,- Zsupszkulccsá varázsoltam a serleget.,"""Turned it into a Portkey.",True,True
449396,"Zsûritagként tökéletes ürügye lenne rá, hogy r...",It would be an ideal excuse to make regular vi...,True,True


In [9]:
df1 = df1[df1['hun_has_letters'] & df1['eng_has_letters']]

In [10]:
df1

Unnamed: 0,hun,eng,hun_has_letters,eng_has_letters
444,"0,1 g ötven órán keresztül, majd 0,2 az áttéré...",'0.1 gee for fifty hours; then 0.2 until turna...,True,True
445,"07,00h-kor hivatalosan leváltotta Poole-t a ve...",At 0700 he would officially relieve Poole on t...,True,True
446,0903-at mutattak.,They read: 0903.,True,True
463,10. A CLAVIUS-BÁZIS,10 - Clavius Base,True,True
464,10. A HÁBORÚ VÉGE,10. THE WAR ENDS,True,True
...,...,...,...,...
449393,"Zsuppogott, cuppant és micsongott, ez utóbbit ...","It flolloped, gupped and willomied, doing this...",True,True
449394,- Zsupsz be! - mondta.,"""Nabbed!"" said he.",True,True
449395,- Zsupszkulccsá varázsoltam a serleget.,"""Turned it into a Portkey.",True,True
449396,"Zsûritagként tökéletes ürügye lenne rá, hogy r...",It would be an ideal excuse to make regular vi...,True,True


In [11]:
hunglish2_hun = []
hunglish2_eng = []
with open(os.path.join(source_data_path, 'hunglish2.lit.bi'), 'r', encoding='iso-8859-1') as input_file:
    for i, line in enumerate(input_file):
        line_parts = [part.strip() for part in line.split('\t')]
        if len(line_parts) == 2:
            hunglish2_hun.append(line_parts[0])
            hunglish2_eng.append(line_parts[1])
        else:
            print(line_parts)

In [12]:
data = {
    'hun': hunglish2_hun,
    'eng': hunglish2_eng
}
df2 = pd.DataFrame(data)
df2

Unnamed: 0,hun,eng
0,"0004-Lima volt már, vagyis éjjel 12 óra 04 per...","It was now 0004-Lima, or 12:04 A.M., local time."
1,"- 0, boldogan!","""Oh, I want to,"" he said."
2,"0, de milyen igaza volt Akasának!","Oh, was she ever right, how I had wanted to do..."
3,"- 0, ne légy már ilyen bolond! - kérlelt szelí...","""Oh, don't be such a fool,"" he begged gently."
4,"0, nem, soha, egyetlen élet sem az!","Oh, no, never, not any life!"
...,...,...
1220726,- Zúzott ajka mosolyra húzódott.,Seth's mangled lips formed a smile.
1220727,Zúzott jég vérmocskos vermébõl kapaszkodott fö...,Marius was climbing up out of a bloodstained p...
1220728,Zúzott kaméliák hullottak a porhanyós földre.,"Camellia blossoms, bruised and falling on the ..."
1220729,"zz, Vi, egy ujj mászik kifelé a mosdó lefolyój...","Watch out, Vi, there's a finger coming out of ..."


In [13]:
df = pd.concat([df1, df2], ignore_index=True, join='inner')
df = df.drop_duplicates()
df

Unnamed: 0,hun,eng
0,"0,1 g ötven órán keresztül, majd 0,2 az áttéré...",'0.1 gee for fifty hours; then 0.2 until turna...
1,"07,00h-kor hivatalosan leváltotta Poole-t a ve...",At 0700 he would officially relieve Poole on t...
2,0903-at mutattak.,They read: 0903.
3,10. A CLAVIUS-BÁZIS,10 - Clavius Base
4,10. A HÁBORÚ VÉGE,10. THE WAR ENDS
...,...,...
1668962,- Zúzott ajka mosolyra húzódott.,Seth's mangled lips formed a smile.
1668963,Zúzott jég vérmocskos vermébõl kapaszkodott fö...,Marius was climbing up out of a bloodstained p...
1668964,Zúzott kaméliák hullottak a porhanyós földre.,"Camellia blossoms, bruised and falling on the ..."
1668965,"zz, Vi, egy ujj mászik kifelé a mosdó lefolyój...","Watch out, Vi, there's a finger coming out of ..."


## Filter translations

In [14]:
def filter_by_length(df):
    df.loc[:, 'hun_word_count'] = df['hun'].str.split().apply(len)
    df.loc[:, 'eng_word_count'] = df['eng'].str.split().apply(len)
    return df[(df['hun_word_count'] > 0) &
              (df['hun_word_count'] <= 32) &
              (df['eng_word_count'] > 0) &
              (df['eng_word_count'] <= 32) &
              (
                  ((df['hun_word_count'] - df['eng_word_count']).abs() < 7) |
                  (
                      (df['hun_word_count'] / df['eng_word_count'] < 1.2) &
                      (df['eng_word_count'] / df['hun_word_count'] < 1.2)
                  )
              )
             ]

In [15]:
df = filter_by_length(df)

In [16]:
df

Unnamed: 0,hun,eng,hun_word_count,eng_word_count
0,"0,1 g ötven órán keresztül, majd 0,2 az áttéré...",'0.1 gee for fifty hours; then 0.2 until turna...,14,17
1,"07,00h-kor hivatalosan leváltotta Poole-t a ve...",At 0700 he would officially relieve Poole on t...,14,20
2,0903-at mutattak.,They read: 0903.,2,3
3,10. A CLAVIUS-BÁZIS,10 - Clavius Base,3,4
4,10. A HÁBORÚ VÉGE,10. THE WAR ENDS,4,4
...,...,...,...,...
1668960,"Zúzmara lepte az arcát, a szemhéját.","Frost covered her face, her eyelids.",6,6
1668961,Zúzni.,Blast.,1,1
1668962,- Zúzott ajka mosolyra húzódott.,Seth's mangled lips formed a smile.,5,6
1668964,Zúzott kaméliák hullottak a porhanyós földre.,"Camellia blossoms, bruised and falling on the ...",6,9


## Remove translations that are in other test sets

In [17]:
main_test_set_path_hun = '../../data/Hunglish2/combined-32-simple/hunglish2-test.hu'
main_test_set_path_eng = '../../data/Hunglish2/combined-32-simple/hunglish2-test.en'

In [18]:
hun_sentences = []
eng_sentences = []
with open(main_test_set_path_hun, 'r') as hun_file:
    with open(main_test_set_path_eng, 'r') as eng_file:
        for hun_line, eng_line in zip(hun_file, eng_file):
            hun_sentences.append(hun_line.strip())
            eng_sentences.append(eng_line.strip())
            
data = {
    'hun': hun_sentences,
    'eng': eng_sentences
}
df_main_test = pd.DataFrame(data)

In [19]:
df_main_test

Unnamed: 0,hun,eng
0,"Ha azt hiszik, hogy ők használnak téged.",Only if they think theyre using you.
1,"Nem várta őket kocsi, semmi.",No carriage was waiting.
2,Miért nem próbálkozik más bánásmóddal?,Why not give them a change of treatment?
3,"(Amint Henry lehajol, hogy megcsókolja, Higgin...","[As he bends to kiss her, she takes his hat of..."
4,A víztől mindenképpen zárlatos lesz majd...,Water should short it out.
...,...,...
21725,Ne keverd a sört a borral!,"Dont mix beer and wine, ever!"
21726,-Nemsokára meglátod.,-Youll see it.
21727,Göringnek dolgozik?,For Göring?
21728,"Frankón, a lúzer-tüsszentés már hivatalosan ha...",Im pretty sure the loser sneeze is officially ...


In [20]:
df_ones_in_main_test = pd.merge(df, df_main_test, indicator=True, how='outer').query('_merge=="both"').drop('_merge', axis=1)
df_ones_in_main_test

Unnamed: 0,hun,eng,hun_word_count,eng_word_count
151,1832-ben a párizsi szennycsatorna még korántse...,The sewers of Paris in 1832 were far from bein...,10.0,14.0
504,8. fejezet,Chapter 8,2.0,2.0
598,Á!,Ah.,1.0,1.0
1253,"A báró még mindig hátat fordított a grófnak, b...","The Baron kept his back to the Count, nodding.",9.0,9.0
1296,"A baronet keze hideg volt, mint a márvány.",It was as cold as a block of marble.,8.0,9.0
...,...,...,...,...
1410998,"Will is, Lyra is reszketett, félelem, kimerült...",Will and Lyra were both trembling and weak wit...,19.0,19.0
1412616,"Zajcev elhagyta az irodát, és a liftek felé in...",Zaitzev walked out and down the corridor to th...,9.0,10.0
1413215,"Zenobia nem nézett a szemembe, a semmibe bámult.","She stared before her, but not into my eyes.",8.0,9.0
1413317,"Zihálva ültem fel, veríték csorgott az arcomon...","I sat, panting, the sweat pouring down my face...",19.0,22.0


In [21]:
df_ones_in_main_test.shape[0] / df.shape[0]

0.0020322860531010747

In [22]:
df_without_main_test = pd.merge(df, df_main_test, indicator=True, how='outer').query('_merge=="left_only"').drop('_merge', axis=1)
df_without_main_test

Unnamed: 0,hun,eng,hun_word_count,eng_word_count
0,"0,1 g ötven órán keresztül, majd 0,2 az áttéré...",'0.1 gee for fifty hours; then 0.2 until turna...,14.0,17.0
1,"07,00h-kor hivatalosan leváltotta Poole-t a ve...",At 0700 he would officially relieve Poole on t...,14.0,20.0
2,0903-at mutattak.,They read: 0903.,2.0,3.0
3,10. A CLAVIUS-BÁZIS,10 - Clavius Base,3.0,4.0
4,10. A HÁBORÚ VÉGE,10. THE WAR ENDS,4.0,4.0
...,...,...,...,...
1414414,"Zúzmara lepte az arcát, a szemhéját.","Frost covered her face, her eyelids.",6.0,6.0
1414415,Zúzni.,Blast.,1.0,1.0
1414416,- Zúzott ajka mosolyra húzódott.,Seth's mangled lips formed a smile.,5.0,6.0
1414417,Zúzott kaméliák hullottak a porhanyós földre.,"Camellia blossoms, bruised and falling on the ...",6.0,9.0


In [23]:
def get_basic_stats(df, col):
    metric_dict = {}
    metric_dict['q0.25'] = str(df[col].quantile(q=0.25))
    metric_dict['q0.5'] = str(df[col].quantile(q=0.5))
    metric_dict['q0.75'] = str(df[col].quantile(q=0.75))
    metric_dict['q0.99'] = str(df[col].quantile(q=0.99))
    metric_dict['q0.999'] = str(df[col].quantile(q=0.999))
    metric_dict['min'] = str(df[col].min())
    metric_dict['max'] = str(df[col].max())
    metric_dict['mean'] = str(df[col].mean())
    metric_dict['stdev'] = str(df[col].std())
    
    return metric_dict

In [24]:
get_basic_stats(df_without_main_test, 'hun_word_count')

{'q0.25': '4.0',
 'q0.5': '7.0',
 'q0.75': '11.0',
 'q0.99': '26.0',
 'q0.999': '31.0',
 'min': '1.0',
 'max': '32.0',
 'mean': '8.620396090808299',
 'stdev': '5.6186232364479'}

In [25]:
get_basic_stats(df_without_main_test, 'eng_word_count')

{'q0.25': '5.0',
 'q0.5': '9.0',
 'q0.75': '14.0',
 'q0.99': '29.0',
 'q0.999': '32.0',
 'min': '1.0',
 'max': '32.0',
 'mean': '10.047593948474898',
 'stdev': '6.251214670365017'}

## Sample data

In [26]:
def sample(df, size, output_path, output_prefix):
    valid_size = size * 0.15
    
    np.random.seed(RANDOM_SEED)
    nums = np.random.choice(np.arange(df_without_main_test.shape[0]), size)
    sampled_df = df.iloc[nums].reset_index()
    
    train_idxs, valid_idxs = train_test_split(np.arange(size), test_size=int(valid_size), random_state=RANDOM_SEED)
    indices = {
        'train': train_idxs,
        'valid': valid_idxs
    }
    
    Path(output_path).mkdir(parents=True, exist_ok=True)
    for set_type in ['train', 'valid']:
        with open(os.path.join(output_path, f'{output_prefix}-{set_type}.hu'), 'w') as hun_file:
            with open(os.path.join(output_path, f'{output_prefix}-{set_type}.en'), 'w') as eng_file:
                for row in sampled_df.iloc[indices[set_type]].iterrows():
                    hun_file.write(row[1][1])
                    hun_file.write('\n')
                    eng_file.write(row[1][2])
                    eng_file.write('\n')

In [27]:
low_resource_path = '../../data/Hunglish2/low-resource'

In [28]:
for sample_count in [75, 100, 200, 300, 400, 500]:
    sample(df_without_main_test, sample_count * 1000, low_resource_path, f'hunglish2-{sample_count}k')

## Save test data

In [29]:
with open(os.path.join(low_resource_path, 'hunglish2-test.hu'), 'w') as hun_file:
    with open(os.path.join(low_resource_path, 'hunglish2-test.en'), 'w') as eng_file:
        for row in df_ones_in_main_test.iterrows():
            hun_file.write(row[1][0])
            hun_file.write('\n')
            eng_file.write(row[1][1])
            eng_file.write('\n')