In [1]:
import pandas as pd
from os import listdir
from os.path import isfile, join
from sklearn.model_selection import train_test_split


In [2]:
RANDOM_SEED = 500

In [3]:
class HunglishSampler:
    def __init__(self, base_data_dir, sample_from_domains, seed, samples_per_domain_in_valid, samples_per_domain_in_test):
        self.domains = [
            'classic.lit',
            'law',
            'modern.lit',
            'softwaredocs',
            'subtitles'
        ]
        self.base_data_dir = base_data_dir
        self.sample_from_domains = sample_from_domains
        self.RANDOM_SEED = seed
        self.train_set = pd.DataFrame()
        self.validation_set = pd.DataFrame()
        self.test_set = pd.DataFrame()
        self.CORPUS_LENGTH = 2974471 # Need to know in advance to calculate train/valid/test ratios
        self.valid_set_ratio = samples_per_domain_in_valid * len(self.domains) / self.CORPUS_LENGTH
        self.test_set_ratio = samples_per_domain_in_test * len(self.domains) / self.CORPUS_LENGTH
        self.train_set_ratio = 1 - self.valid_set_ratio - self.test_set_ratio
        
        print(f'Valid set ratio: {self.valid_set_ratio}')
        print(f'Test set ratio: {self.test_set_ratio}')
        print(f'Train set ratio: {self.train_set_ratio}')
        
        
    def sample(self):
        data = {
            'train': {
                'hun': [],
                'eng': [],
                'source_file': [],
                'domain': []
            },
            'valid': {
                'hun': [],
                'eng': [],
                'source_file': [],
                'domain': []
            },
            'test': {
                'hun': [],
                'eng': [],
                'source_file': [],
                'domain': []
                
            }
        }
        for domain in self.sample_from_domains:
            if domain not in self.domains:
                raise ValueError(f'Cannot sample from domain {domain}')
            domain_path = f'{self.base_data_dir}/{domain}/bi'
            files = [f for f in listdir(f'{domain_path}') if isfile(join(f'{domain_path}', f))]
            for file in files:
                with open(f'{domain_path}/{file}', 'r', encoding='latin2') as f:
                    # Train-test split file-wise
                    hun_sentences = []
                    eng_sentences = []
                    malformed_lines = {}
                    for line in f:
                        try:
                            hun_sentence, eng_sentence = line.rstrip('\n').split('\t')
                            hun_sentences.append(hun_sentence)
                            eng_sentences.append(eng_sentence)
                            if len(hun_sentences) != eng_sentences:
                                raise ValueError(f'Hun-eng sentence pair has bad formatting')
                        except:
                            if domain not in malformed_lines:
                                malformed_lines[domain] = []
                            malformed_lines[domain].append((f'line: {line}', f'file: {file}'))
                    try:
                        x_train, x_test, y_train, y_test = train_test_split(hun_sentences, eng_sentences,
                                                                           train_size=self.train_set_ratio,
                                                                           test_size=self.test_set_ratio,
                                                                           random_state=self.RANDOM_SEED)

                        x_train, x_valid, y_train, y_valid = train_test_split(x_train, y_train,
                                                                              train_size=1-self.valid_set_ratio,
                                                                              test_size=self.valid_set_ratio,
                                                                              random_state=self.RANDOM_SEED)
                    except:
                        print(f'There might not be well formatted lines in file: {file}') # look into these files
                        
                    data['train']['hun'].extend(x_train)
                    data['train']['eng'].extend(y_train)
                    data['train']['source_file'].extend([file for _ in range(len(x_train))])
                    data['train']['domain'].extend([domain for _ in range(len(x_train))])

                    
                    data['valid']['hun'].extend(x_valid)
                    data['valid']['eng'].extend(y_valid)
                    data['valid']['source_file'].extend([file for _ in range(len(x_valid))])
                    data['valid']['domain'].extend([domain for _ in range(len(x_valid))])

                    data['test']['domain'].extend([domain for _ in range(len(x_test))])                    
                    data['test']['hun'].extend(x_test)
                    data['test']['eng'].extend(y_test)
                    data['test']['source_file'].extend([file for _ in range(len(x_test))])
                    
        print('Train set length: {}'.format(len(data['train']['hun'])))
        print('Validation set length: {}'.format(len(data['valid']['hun'])))
        print('Test set length: {}'.format(len(data['test']['hun'])))
        print('--------TRAIN--------')
        print(data['train']['hun'][0:3])
        print(data['train']['eng'][0:3])
        print('--------VALID--------')
        print(data['valid']['hun'][0:3])
        print(data['valid']['eng'][0:3])
        print('--------TEST--------')
        print(data['test']['hun'][0:3])
        print(data['test']['eng'][0:3])

        # Dump splits to dataframes
        self.train_set = pd.DataFrame(data['train'])
        self.validation_set = pd.DataFrame(data['valid'])
        self.test_set = pd.DataFrame(data['test'])
        
    def save_splits_to_csv(self):
        self.train_set.to_csv('./train_set.csv')
        self.validation_set.to_csv('./validation_set.csv')
        self.test_set.to_csv('./test_set.csv')
        


In [4]:
sampler = HunglishSampler(
                base_data_dir='/Users/attilanagy/Personal/hu-nmt/data/ftp.mokk.bme.hu/Hunglish2',
                sample_from_domains= [
                        'classic.lit',
                        'law',
                        'modern.lit',
                        'softwaredocs',
                        'subtitles'
                        ],
                samples_per_domain_in_valid=5000,
                samples_per_domain_in_test=5000,
                seed=RANDOM_SEED
                
)
sampler.sample()

Valid set ratio: 0.008404855855041115
Test set ratio: 0.008404855855041115
Train set ratio: 0.9831902882899177
There might not be well formatted lines in file: Shakespeare_8.bi
There might not be well formatted lines in file: Shakespeare_9.bi
There might not be well formatted lines in file: Shakespeare_5.bi
Train set length: 2738594
Validation set length: 23259
Test set length: 23660
--------TRAIN--------
['Te, akit a föld határairól hoztalak elő, és a világ végéről hívtalak meg; te, akihez így szóltam: Szolgám vagy, kiválasztottalak, ezért nem vetlek el: Ne félj, mert veled vagyok, ne csüggedj, mert én vagyok a te Istened!', 'Ha majd végzel a pusztítással, elpusztítanak, és ha beteltél a fosztogatással, téged is kifosztanak.', 'Az álnok szívűektől iszonyodik az Úr, akik tisztességesek, elnyerik tetszését.']
['In whom I have taken thee from the ends of the earth, and from the remote parts thereof have called thee, and said to thee: Thou art my servant, I have chosen thee, and have not 

In [15]:
sampler.train_set

Unnamed: 0,hun,eng,source_file
0,"Te, akit a föld határairól hoztalak elő, és a ...",In whom I have taken thee from the ends of the...,Bible_2.bi
1,"Ha majd végzel a pusztítással, elpusztítanak, ...","O Lord, have mercy on us: for we have waited f...",Bible_2.bi
2,"Az álnok szívűektől iszonyodik az Úr, akik tis...",A perverse heart is abominable to the Lord: an...,Bible_2.bi
3,Báruk könyve,THE PROPHECY OF BARUCH,Bible_2.bi
4,"Vajon nem így van, Izrael fiai? - mondja az Úr.","Is it not so, O ye children of Israel, saith t...",Bible_2.bi
...,...,...,...
2918002,"- Igen, uram.","- Yes, sir.",subtitles.bi
2918003,"Gondolja meg jól, második lehetőséget kap!","Come on, that's a second chance, kiddo.",subtitles.bi
2918004,Igen.,Yes.,subtitles.bi
2918005,Rendben.,All right.,subtitles.bi


In [16]:
sampler.validation_set

Unnamed: 0,hun,eng,source_file
0,"Mert elküldi angyalait hozzád, hogy védelmezze...",For he hath given his angels charge over thee;...,Bible_2.bi
1,"Nemcsak hogy utánoztad az útjaikat, és elkövet...","But neither hast thou walked in their ways, no...",Bible_2.bi
2,"Nemde még a nap is megállt intésére, s egy nap...","Was not the sun stopped in his anger, and one ...",Bible_2.bi
3,"A félelem ugyanis nem egyéb, mint lemondás a s...",For fear is nothing else but a yielding up of ...,Bible_2.bi
4,"Aztán így szólt Jonatánhoz: ""Miért fárasztotta...",And he said to Jonathan: Why hast thou trouble...,Bible_2.bi
...,...,...,...
27964,- Ennek a nadrágnak annyi.,- These pants are ruined.,subtitles.bi
27965,Tízre akkor érted jövök.,I'll pick you up here at 10.,subtitles.bi
27966,Kell a segítséged!,I need you!,subtitles.bi
27967,De van.,Of course.,subtitles.bi


In [17]:
sampler.test_set

Unnamed: 0,hun,eng,source_file
0,"Hozzá is láttam, hogy legeltessem a leölésre s...",And I will feed the flock of slaughter for thi...,Bible_2.bi
1,"Ha kinyitja a száját, bölcsen beszél, jóságos ...","She hath opened her mouth to wisdom, and the l...",Bible_2.bi
2,"Kinyilvánította az Úr üdvösségét, igazságosság...",The Lord hath made known his salvation: he hat...,Bible_2.bi
3,"Akkor megrendül a föld és megremeg, mert telje...","And the land shall be in a commotion, and shal...",Bible_2.bi
4,"Szemem azért mindig az Úrra néz, ő megszabadít...",My eyes are ever towards the Lord: for he shal...,Bible_2.bi
...,...,...,...
28396,Helló.,How are you.,subtitles.bi
28397,"Pakoljuk a szállítmányokat, készülünk a költöz...",...getting ready for the move stateside.,subtitles.bi
28398,Mit akarsz tenni?,What do you want to do?,subtitles.bi
28399,Oww!,Oww!,subtitles.bi


In [None]:
sampler.save_splits_to_csv()

In [5]:
df = sampler.train_set

In [6]:
df

Unnamed: 0,hun,eng,source_file,domain
0,"Te, akit a föld határairól hoztalak elő, és a ...",In whom I have taken thee from the ends of the...,Bible_2.bi,classic.lit
1,"Ha majd végzel a pusztítással, elpusztítanak, ...","O Lord, have mercy on us: for we have waited f...",Bible_2.bi,classic.lit
2,"Az álnok szívűektől iszonyodik az Úr, akik tis...",A perverse heart is abominable to the Lord: an...,Bible_2.bi,classic.lit
3,Báruk könyve,THE PROPHECY OF BARUCH,Bible_2.bi,classic.lit
4,"Vajon nem így van, Izrael fiai? - mondja az Úr.","Is it not so, O ye children of Israel, saith t...",Bible_2.bi,classic.lit
...,...,...,...,...
2738589,"- Igen, uram.","- Yes, sir.",subtitles.bi,subtitles
2738590,"Gondolja meg jól, második lehetőséget kap!","Come on, that's a second chance, kiddo.",subtitles.bi,subtitles
2738591,Igen.,Yes.,subtitles.bi,subtitles
2738592,Rendben.,All right.,subtitles.bi,subtitles


In [10]:

df['domain'].value_counts()

modern.lit      1628251
classic.lit      643555
subtitles        334721
softwaredocs     132067
Name: domain, dtype: int64