In [1]:
import pandas as pd
from os import listdir, mkdir
from os.path import isfile, join, exists, getsize
from sklearn.model_selection import train_test_split
import random
from math import ceil

In [2]:
RANDOM_SEED = 500
hunglish_path = '../data/ftp.mokk.bme.hu/Hunglish2'

In [38]:
combined_path = join(hunglish_path, 'combined')
if not exists(combined_path):
    mkdir(combined_path)

In [48]:
class HunglishSampler:
    def __init__(self, base_data_dir, sample_from_domains, seed, samples_per_domain_in_valid, samples_per_domain_in_test):
        self.domains = [
            'classic.lit',
            'law',
            'modern.lit',
            'softwaredocs',
            'subtitles'
        ]
        self.base_data_dir = base_data_dir
        self.sample_from_domains = sample_from_domains
        self.RANDOM_SEED = seed
        random.seed(self.RANDOM_SEED)
        self.train_set = pd.DataFrame()
        self.validation_set = pd.DataFrame()
        self.test_set = pd.DataFrame()
        self.CORPUS_LENGTH = 2974471 # Need to know in advance to calculate train/valid/test ratios
        self.valid_set_ratio = samples_per_domain_in_valid * len(self.domains) / self.CORPUS_LENGTH
        self.test_set_ratio = samples_per_domain_in_test * len(self.domains) / self.CORPUS_LENGTH
        self.train_set_ratio = 1 - self.valid_set_ratio - self.test_set_ratio
        
        print(f'Valid set ratio: {self.valid_set_ratio}')
        print(f'Test set ratio: {self.test_set_ratio}')
        print(f'Train set ratio: {self.train_set_ratio}')
        
        
    def sample(self, max_number_of_tokens=-1, sample_ratio=1.0):
        data = {
            'train': {
                'hun': [],
                'eng': [],
                'source_file': [],
                'domain': []
            },
            'valid': {
                'hun': [],
                'eng': [],
                'source_file': [],
                'domain': []
            },
            'test': {
                'hun': [],
                'eng': [],
                'source_file': [],
                'domain': []
                
            }
        }
        sentences_dropped = 0
        for domain in self.sample_from_domains:
            if domain not in self.domains:
                raise ValueError(f'Cannot sample from domain {domain}')
            domain_path = f'{self.base_data_dir}/{domain}/bi'
            files = [f for f in listdir(f'{domain_path}') if isfile(join(f'{domain_path}', f))]
            for file in files:
                file_path = f'{domain_path}/{file}'
                with open(file_path, 'r', encoding='latin2') as f:
                    # Train-test split file-wise
                    hun_sentences = []
                    eng_sentences = []
                    malformed_lines = {}
                    for line in f:
                        try:
                            hun_sentence, eng_sentence = line.rstrip('\n').split('\t')
                            if max_number_of_tokens == -1 or (max_number_of_tokens != -1 and len(hun_sentence.split()) < max_number_of_tokens and len(eng_sentence.split()) < max_number_of_tokens):
                                hun_sentences.append(hun_sentence)
                                eng_sentences.append(eng_sentence)
                            else:
                                sentences_dropped += 1
                            if len(hun_sentences) != eng_sentences:
                                raise ValueError(f'Hun-eng sentence pair has bad formatting')
                        except:
                            if domain not in malformed_lines:
                                malformed_lines[domain] = []
                            malformed_lines[domain].append((f'line: {line}', f'file: {file}'))
                            
                    if sample_ratio != 1.0:
                        n_sentences = len(hun_sentences)
                        ids = list(range(n_sentences))
                        sampled_ids = random.sample(ids, int(n_sentences * sample_ratio))
                        hun_sentences = [hun_sentences[i] for i in sampled_ids]
                        eng_sentences = [eng_sentences[i] for i in sampled_ids]
                    
                    try:
                        x_train, x_test, y_train, y_test = train_test_split(hun_sentences, eng_sentences,
                                                                           train_size=self.train_set_ratio,
                                                                           test_size=self.test_set_ratio,
                                                                           random_state=self.RANDOM_SEED)

                        x_train, x_valid, y_train, y_valid = train_test_split(x_train, y_train,
                                                                              train_size=1-self.valid_set_ratio,
                                                                              test_size=self.valid_set_ratio,
                                                                              random_state=self.RANDOM_SEED)
                    except Exception as e:
                        if 'the resulting train set will be empty' not in str(e):
                            print(e)
                            print(f'There might not be well formatted lines in file: {file}, size: {getsize(file_path)} bytes') # look into these files
                        
                    data['train']['hun'].extend(x_train)
                    data['train']['eng'].extend(y_train)
                    data['train']['source_file'].extend([file for _ in range(len(x_train))])
                    data['train']['domain'].extend([domain for _ in range(len(x_train))])

                    data['valid']['hun'].extend(x_valid)
                    data['valid']['eng'].extend(y_valid)
                    data['valid']['source_file'].extend([file for _ in range(len(x_valid))])
                    data['valid']['domain'].extend([domain for _ in range(len(x_valid))])

                    data['test']['domain'].extend([domain for _ in range(len(x_test))])                    
                    data['test']['hun'].extend(x_test)
                    data['test']['eng'].extend(y_test)
                    data['test']['source_file'].extend([file for _ in range(len(x_test))])
        
        if max_number_of_tokens != -1:
            print('Sentences dropped: ', sentences_dropped)
        print('Train set length: {}'.format(len(data['train']['hun'])))
        print('Validation set length: {}'.format(len(data['valid']['hun'])))
        print('Test set length: {}'.format(len(data['test']['hun'])))
        print('--------TRAIN--------')
        print(data['train']['hun'][0:3])
        print(data['train']['eng'][0:3])
        print('--------VALID--------')
        print(data['valid']['hun'][0:3])
        print(data['valid']['eng'][0:3])
        print('--------TEST--------')
        print(data['test']['hun'][0:3])
        print(data['test']['eng'][0:3])

        # Dump splits to dataframes
        self.train_set = pd.DataFrame(data['train'])
        self.validation_set = pd.DataFrame(data['valid'])
        self.test_set = pd.DataFrame(data['test'])
        
    def save_splits_to_csv(self):
        self.train_set.to_csv('./train_set.csv')
        self.validation_set.to_csv('./validation_set.csv')
        self.test_set.to_csv('./test_set.csv')
        
    def create_data_set_files(self, path, base_file_name):
        file_name_beginning = join(path, base_file_name + '-')
        f = lambda set_name, language: file_name_beginning + set_name + '.' + language

        self.train_set['hun'].to_csv(f('train', 'hu'), header=None, index=None, sep=' ')
        self.train_set['eng'].to_csv(f('train', 'en'), header=None, index=None, sep=' ')
        self.validation_set['hun'].to_csv(f('valid', 'hu'), header=None, index=None, sep=' ')
        self.validation_set['eng'].to_csv(f('valid', 'en'), header=None, index=None, sep=' ')
        self.test_set['hun'].to_csv(f('test', 'hu'), header=None, index=None, sep=' ')
        self.test_set['eng'].to_csv(f('test', 'en'), header=None, index=None, sep=' ')

In [49]:
sampler = HunglishSampler(
                base_data_dir=hunglish_path,
                sample_from_domains= [
                        'classic.lit',
                        'law',
                        'modern.lit',
                        'softwaredocs',
                        'subtitles'
                        ],
                samples_per_domain_in_valid=5000,
                samples_per_domain_in_test=5000,
                seed=RANDOM_SEED
                
)

Valid set ratio: 0.008404855855041115
Test set ratio: 0.008404855855041115
Train set ratio: 0.9831902882899177


In [37]:
sampler.sample(max_number_of_tokens=512)

Sentences dropped:  130
Train set length: 4051549
Validation set length: 50240
Test set length: 50800
--------TRAIN--------
['Büszke ez ének, minden szava, célja, Átfogni tér s idő roppant birodalmait, A fejlődést, a feltüremlőt, a növekvőt és a nemzedékeket.', 'Elmegyek, mint a levegő, fehér fürtjeimet rázom a szökevény napra, Örvényekbe árasztom húsomat és rostos sávokban hömpölyögtetem.', 'Legfőbb érdememet megtagadom tőletek, nem vetem le magamról igazi lényem, Világokat mérjetek meg, de sohase próbáljatok megmérni engem, A legügyesebbet, legjobbat is megdöbbentem, közületek, ha csak rátok nézek.']
['Haughty this song, its words and scope, To span vast realms of space and time, Evolution - the cumulative - growths and generations.', 'I depart as air, I shake my white locks at the runaway sun, I effuse my flesh in eddies, and drift it in lacy jags.', 'My final merit I refuse you, I refuse putting from me what I really am, Encompass worlds, but never try to encompass me, I crowd your

In [39]:
sampler.create_data_set_files(hunglish_path, 'combined/hunglish2')

In [52]:
sampler.sample(max_number_of_tokens=512, sample_ratio=0.1)

Sentences dropped:  130
Train set length: 472662
Validation set length: 23310
Test set length: 23266
--------TRAIN--------
['A mozgóvilág csírái ártatlan játékukból hallgatva felkelnek, frissen felrügyeznek, Ferdén szökellnek a magasba s a mélybe.', "Ott fenn van az ég - de nálad, vagy a szomszédnál vagy a túlsó oldalon szembe'?", 'Gyilkos vagy féltékeny volt irántad az emberi nem, testvérem, nővérem?']
['Hefts of the moving world at innocent gambols silently rising freshly exuding, Scooting obliquely high and low.', 'The sky up there - yet here or next door, or across the way?', 'Were mankind murderous or jealous upon you, my brother, my sister?']
--------VALID--------
['Jól sikerültek a fotók - de barátod s karodba zárt biztos feleséged?', '- Hozzá lehet adni mindenféle vacsorához ezen a napon? - kérdezte Scrooge.', 'City-beli üzletbarátai valóban csodálkoztak volna, ha hallják, amint Scrooge természetének egész komolyságával beszél ilyen dolgokról, igen különös, síró-nevető hangon, 

In [40]:
sampler.create_data_set_files(hunglish_path, 'combined/hunglish2small')

((440300, 4), (23237, 4), (23195, 4))