In [1]:
import pandas as pd
from os import listdir
from os.path import isfile, join
from sklearn.model_selection import train_test_split


In [2]:
RANDOM_SEED = 500

In [27]:
class HunglishSampler:
    def __init__(self, base_data_dir, sample_from_domains, seed, train_set_ratio=0.98, valid_set_ratio=0.15, test_set_ratio=0.02):
        self.domains = [
            'classic.lit',
            'law',
            'modern.lit',
            'softwaredocs',
            'subtitles'
        ]
        self.base_data_dir = base_data_dir
        self.sample_from_domains = sample_from_domains
        self.train_set_ratio = train_set_ratio
        self.valid_set_ratio = valid_set_ratio
        self.test_set_ratio = test_set_ratio
        self.RANDOM_SEED = seed
        self.train_set = pd.DataFrame()
        self.validation_set = pd.DataFrame()
        self.test_set = pd.DataFrame()

    def sample(self):
        data = {
            'train': {
                'hun': [],
                'eng': [],
                'source_file': []
            },
            'valid': {
                'hun': [],
                'eng': [],
                'source_file': []
            },
            'test': {
                'hun': [],
                'eng': [],
                'source_file': []
            }
        }
        for domain in self.sample_from_domains:
            if domain not in self.domains:
                raise ValueError(f'Cannot sample from domain {domain}')
            domain_path = f'{self.base_data_dir}/{domain}/bi'
            files = [f for f in listdir(f'{domain_path}') if isfile(join(f'{domain_path}', f))]
            for file in files:
                with open(f'{domain_path}/{file}', 'r', encoding='latin2') as f:
                    # Train-test split file-wise
                    hun_sentences = []
                    eng_sentences = []
                    malformed_lines = {}
                    for line in f:
                        try:
                            hun_sentence, eng_sentence = line.rstrip('\n').split('\t')
                            hun_sentences.append(hun_sentence)
                            eng_sentences.append(eng_sentence)
                            if len(hun_sentences) != eng_sentences:
                                raise ValueError(f'Hun-eng sentence pair has bad formatting')
                        except:
                            if domain not in malformed_lines:
                                malformed_lines[domain] = []
                            malformed_lines[domain].append((f'line: {line}', f'file: {file}'))
                    try:
                        x_train, x_test, y_train, y_test = train_test_split(hun_sentences, eng_sentences,
                                                                           train_size=self.train_set_ratio,
                                                                           test_size=self.test_set_ratio,
                                                                           random_state=self.RANDOM_SEED)

                        x_train, x_valid, y_train, y_valid = train_test_split(x_train, y_train,
                                                                              train_size=1-self.valid_set_ratio,
                                                                              test_size=self.valid_set_ratio,
                                                                              random_state=self.RANDOM_SEED)
                    except:
                        print(f'There might not be well formatted lines in file: {file}') # look into these files
                        
                    data['train']['hun'].extend(x_train)
                    data['train']['eng'].extend(y_train)
                    data['train']['source_file'].extend([file for _ in range(len(x_train))])
                    
                    data['valid']['hun'].extend(x_valid)
                    data['valid']['eng'].extend(y_valid)
                    data['valid']['source_file'].extend([file for _ in range(len(x_valid))])
                    
                    data['test']['hun'].extend(x_test)
                    data['test']['eng'].extend(y_test)
                    data['test']['source_file'].extend([file for _ in range(len(x_test))])
                    
        print('Train set length: {}'.format(len(data['train']['hun'])))
        print('Validation set length: {}'.format(len(data['valid']['hun'])))
        print('Test set length: {}'.format(len(data['test']['hun'])))
        print('--------TRAIN--------')
        print(data['train']['hun'][0:3])
        print(data['train']['eng'][0:3])
        print('--------VALID--------')
        print(data['valid']['hun'][0:3])
        print(data['valid']['eng'][0:3])
        print('--------TEST--------')
        print(data['test']['hun'][0:3])
        print(data['test']['eng'][0:3])

        # Dump splits to dataframes
        self.train_set = pd.DataFrame(data['train'])
        self.validation_set = pd.DataFrame(data['valid'])
        self.test_set = pd.DataFrame(data['test'])
        
    def save_splits_to_csv(self):
        self.train_set.to_csv('./train_set.csv')
        self.validation_set.to_csv('./validation_set.csv')
        self.test_set.to_csv('./test_set.csv')
        


In [28]:
sampler = HunglishSampler(
                base_data_dir='/Users/attilanagy/Personal/hu-nmt/data/ftp.mokk.bme.hu/Hunglish2',
                sample_from_domains= ['modern.lit'],
                seed=RANDOM_SEED
                
)
sampler.sample()

Train set length: 1391216
Validation set length: 245510
Test set length: 33403
--------TRAIN--------
['Hat csapszeg tartja a pallót, három-három mindkét oldalon!', '- Három.', 'Fogalmam sincs, Leek.']
['"There are six pins that hold the supports of the catwalk-three on each side!', '"Three.', '"Beats me, Chief."']
--------VALID--------
['Imádkoznom kell!', 'De sohasem lesz ő a Hordozó.', '- Bármikor elköltözhetek.']
['I must pray."', 'He will never be the bearer.', '"I\'ll be moving on soon.']
--------TEST--------
['Ez ugyan jót tett a betyárbecsületüknek", ám erre az időre használhatatlanná tette őket.', 'Egy hetembe telt, hogy az összes rozsdát lekaparjam róla.', 'Tudja, én... ööö ... jobban ismerem az érdekelteket, mint ő.']
['That both established their bonafides and made them temporarily useless.', "'it took me a week to get the rust off.", 'You see, I - er - know the parties concerned better than he does.']


In [24]:
sampler.train_set

Unnamed: 0,hun,eng,source_file
0,"Hat csapszeg tartja a pallót, három-három mind...","""There are six pins that hold the supports of ...",hunglish2.lit.bi
1,- Három.,"""Three.",hunglish2.lit.bi
2,"Fogalmam sincs, Leek.","""Beats me, Chief.""",hunglish2.lit.bi
3,Felfedezte Pete Kocsmáját a fogadótól egysarok...,He'd discovered Pete's Bar and Grill a block a...,hunglish2.lit.bi
4,Semmit.,Nothing.',hunglish2.lit.bi
...,...,...,...
1391211,"- Igen, gyűlölöm - mondotta Rosa -, mert ő az ...","""I do hate him,"" said Rosa, ""as he is the caus...",hunglish1.lit.bi
1391212,"Azt mondják, nagyon nehéz befogni őket!","They're supposed to be really hard to catch!""",hunglish1.lit.bi
1391213,"Mark elvigyorodott, és megrázta a fejét.",Mark grinned and shook his head.,hunglish1.lit.bi
1391214,"- Gondold meg - ajánlom -, aztán ha vízre talá...","""Why don't you think about it,"" I say, ""and th...",hunglish1.lit.bi


In [25]:
sampler.validation_set

Unnamed: 0,hun,eng,source_file
0,Imádkoznom kell!,"I must pray.""",hunglish2.lit.bi
1,De sohasem lesz ő a Hordozó.,He will never be the bearer.,hunglish2.lit.bi
2,- Bármikor elköltözhetek.,"""I'll be moving on soon.",hunglish2.lit.bi
3,"Ott ültem, ahol Tally néhány órával korábban, ...","I was sitting where Tally had been, just hours...",hunglish2.lit.bi
4,"- Azt mondta visszajön, és elmondja nekem, ha ...","""He said he'd come back and tell me, if we all...",hunglish2.lit.bi
...,...,...,...
245505,MacDufffal együtt jártam Cambridgebe.,"MacDuff I knew at Cambridge.""",hunglish1.lit.bi
245506,"Ergo.""","Ergo.""",hunglish1.lit.bi
245507,"Jamis olyan ember, Sayyadina, aki nem tud megb...","""Jamis is one to hold a grudge, Sayyadina.",hunglish1.lit.bi
245508,Mivel a világ Minőség nélkül nem forog a szoká...,Since the world obviously doesn't function nor...,hunglish1.lit.bi


In [26]:
sampler.test_set

Unnamed: 0,hun,eng,source_file
0,"Ez ugyan jót tett a betyárbecsületüknek"", ám e...",That both established their bonafides and made...,hunglish2.lit.bi
1,"Egy hetembe telt, hogy az összes rozsdát lekap...",'it took me a week to get the rust off.,hunglish2.lit.bi
2,"Tudja, én... ööö ... jobban ismerem az érdekel...","You see, I - er - know the parties concerned b...",hunglish2.lit.bi
3,"mlékezetében, milyen sógorság-komaság is volt ...",But you had to remember to think when you had ...,hunglish2.lit.bi
4,"Megmondtam a kis játszópajtásának is, hogy elő...",I told that punk of yours that you'd have to t...,hunglish2.lit.bi
...,...,...,...
33398,Aztán...,Then --,hunglish1.lit.bi
33399,- Nem.,"""Wrong.",hunglish1.lit.bi
33400,"Mordaunt futó pillantást vetett a generálisra,...",Mordaunt tried for a moment to read in the gen...,hunglish1.lit.bi
33401,Ki ültette az első fát?,Who planted the first tree?,hunglish1.lit.bi


In [29]:
sampler.save_splits_to_csv()