In [1]:
import pandas as pd
from os import listdir, mkdir
from os.path import isfile, join, exists
from sklearn.model_selection import train_test_split

In [2]:
RANDOM_SEED = 500
hunglish_path = '../data/ftp.mokk.bme.hu/Hunglish2'

In [3]:
class HunglishSampler:
    def __init__(self, base_data_dir, sample_from_domains, seed, samples_per_domain_in_valid, samples_per_domain_in_test):
        self.domains = [
            'classic.lit',
            'law',
            'modern.lit',
            'softwaredocs',
            'subtitles'
        ]
        self.base_data_dir = base_data_dir
        self.sample_from_domains = sample_from_domains
        self.RANDOM_SEED = seed
        self.train_set = pd.DataFrame()
        self.validation_set = pd.DataFrame()
        self.test_set = pd.DataFrame()
        self.CORPUS_LENGTH = 2974471 # Need to know in advance to calculate train/valid/test ratios
        self.valid_set_ratio = samples_per_domain_in_valid * len(self.domains) / self.CORPUS_LENGTH
        self.test_set_ratio = samples_per_domain_in_test * len(self.domains) / self.CORPUS_LENGTH
        self.train_set_ratio = 1 - self.valid_set_ratio - self.test_set_ratio
        
        print(f'Valid set ratio: {self.valid_set_ratio}')
        print(f'Test set ratio: {self.test_set_ratio}')
        print(f'Train set ratio: {self.train_set_ratio}')
        
        
    def sample(self):
        data = {
            'train': {
                'hun': [],
                'eng': [],
                'source_file': [],
                'domain': []
            },
            'valid': {
                'hun': [],
                'eng': [],
                'source_file': [],
                'domain': []
            },
            'test': {
                'hun': [],
                'eng': [],
                'source_file': [],
                'domain': []
                
            }
        }
        for domain in self.sample_from_domains:
            if domain not in self.domains:
                raise ValueError(f'Cannot sample from domain {domain}')
            domain_path = f'{self.base_data_dir}/{domain}/bi'
            files = [f for f in listdir(f'{domain_path}') if isfile(join(f'{domain_path}', f))]
            for file in files:
                with open(f'{domain_path}/{file}', 'r', encoding='latin2') as f:
                    # Train-test split file-wise
                    hun_sentences = []
                    eng_sentences = []
                    malformed_lines = {}
                    for line in f:
                        try:
                            hun_sentence, eng_sentence = line.rstrip('\n').split('\t')
                            hun_sentences.append(hun_sentence)
                            eng_sentences.append(eng_sentence)
                            if len(hun_sentences) != eng_sentences:
                                raise ValueError(f'Hun-eng sentence pair has bad formatting')
                        except:
                            if domain not in malformed_lines:
                                malformed_lines[domain] = []
                            malformed_lines[domain].append((f'line: {line}', f'file: {file}'))
                    try:
                        x_train, x_test, y_train, y_test = train_test_split(hun_sentences, eng_sentences,
                                                                           train_size=self.train_set_ratio,
                                                                           test_size=self.test_set_ratio,
                                                                           random_state=self.RANDOM_SEED)

                        x_train, x_valid, y_train, y_valid = train_test_split(x_train, y_train,
                                                                              train_size=1-self.valid_set_ratio,
                                                                              test_size=self.valid_set_ratio,
                                                                              random_state=self.RANDOM_SEED)
                    except:
                        print(f'There might not be well formatted lines in file: {file}') # look into these files
                        
                    data['train']['hun'].extend(x_train)
                    data['train']['eng'].extend(y_train)
                    data['train']['source_file'].extend([file for _ in range(len(x_train))])
                    data['train']['domain'].extend([domain for _ in range(len(x_train))])

                    
                    data['valid']['hun'].extend(x_valid)
                    data['valid']['eng'].extend(y_valid)
                    data['valid']['source_file'].extend([file for _ in range(len(x_valid))])
                    data['valid']['domain'].extend([domain for _ in range(len(x_valid))])

                    data['test']['domain'].extend([domain for _ in range(len(x_test))])                    
                    data['test']['hun'].extend(x_test)
                    data['test']['eng'].extend(y_test)
                    data['test']['source_file'].extend([file for _ in range(len(x_test))])
                    
        print('Train set length: {}'.format(len(data['train']['hun'])))
        print('Validation set length: {}'.format(len(data['valid']['hun'])))
        print('Test set length: {}'.format(len(data['test']['hun'])))
        print('--------TRAIN--------')
        print(data['train']['hun'][0:3])
        print(data['train']['eng'][0:3])
        print('--------VALID--------')
        print(data['valid']['hun'][0:3])
        print(data['valid']['eng'][0:3])
        print('--------TEST--------')
        print(data['test']['hun'][0:3])
        print(data['test']['eng'][0:3])

        # Dump splits to dataframes
        self.train_set = pd.DataFrame(data['train'])
        self.validation_set = pd.DataFrame(data['valid'])
        self.test_set = pd.DataFrame(data['test'])
        
    def save_splits_to_csv(self):
        self.train_set.to_csv('./train_set.csv')
        self.validation_set.to_csv('./validation_set.csv')
        self.test_set.to_csv('./test_set.csv')
        


In [4]:
sampler = HunglishSampler(
                base_data_dir=hunglish_path,
                sample_from_domains= [
                        'classic.lit',
                        'law',
                        'modern.lit',
                        'softwaredocs',
                        'subtitles'
                        ],
                samples_per_domain_in_valid=5000,
                samples_per_domain_in_test=5000,
                seed=RANDOM_SEED
                
)
sampler.sample()

Valid set ratio: 0.008404855855041115
Test set ratio: 0.008404855855041115
Train set ratio: 0.9831902882899177
There might not be well formatted lines in file: Shakespeare_5.bi
There might not be well formatted lines in file: Shakespeare_9.bi
There might not be well formatted lines in file: Shakespeare_8.bi
There might not be well formatted lines in file: 32002R1605.bi
There might not be well formatted lines in file: 31996L0086.bi
There might not be well formatted lines in file: 31970R1253.bi
There might not be well formatted lines in file: 32010R0564.bi
There might not be well formatted lines in file: 32000R2724.bi
There might not be well formatted lines in file: 32000L0003.bi
There might not be well formatted lines in file: 31986Y0627_01.bi
There might not be well formatted lines in file: 32010R0395.bi
There might not be well formatted lines in file: 32002R1408.bi
There might not be well formatted lines in file: 31998L0014.bi
There might not be well formatted lines in file: 31980R339

There might not be well formatted lines in file: 31982D0461.bi
There might not be well formatted lines in file: 31990D0036.bi
There might not be well formatted lines in file: 32007R0665.bi
There might not be well formatted lines in file: 32002D0971.bi
There might not be well formatted lines in file: 32006R1919.bi
There might not be well formatted lines in file: 31976R0692.bi
There might not be well formatted lines in file: 31970L0032.bi
There might not be well formatted lines in file: 32006R1066.bi
There might not be well formatted lines in file: 31989R3794.bi
There might not be well formatted lines in file: 31979R1979.bi
There might not be well formatted lines in file: 31994L0055.bi
There might not be well formatted lines in file: 32010R0944.bi
There might not be well formatted lines in file: 32010L0067.bi
There might not be well formatted lines in file: 31994D0800.bi
There might not be well formatted lines in file: 31984D0358.bi
There might not be well formatted lines in file: 31983R

There might not be well formatted lines in file: 32007R0129.bi
There might not be well formatted lines in file: 31992R0601.bi
There might not be well formatted lines in file: 32006R0638.bi
There might not be well formatted lines in file: 32005R1561.bi
There might not be well formatted lines in file: 32006R0947.bi
There might not be well formatted lines in file: 32009R0564.bi
There might not be well formatted lines in file: 32008R0839.bi
There might not be well formatted lines in file: 31963D0032.bi
There might not be well formatted lines in file: 31996D0337.bi
There might not be well formatted lines in file: 31988D0355.bi
There might not be well formatted lines in file: 32009R1179.bi
There might not be well formatted lines in file: 32009L0002.bi
There might not be well formatted lines in file: 31988S4104.bi
There might not be well formatted lines in file: 32007R1165.bi
There might not be well formatted lines in file: 32008R0811.bi
There might not be well formatted lines in file: 32006R

In [5]:
sampler.train_set

Unnamed: 0,hun,eng,source_file,domain
0,Szunnyadó és elfolyó fák földje!,Earth of the slumbering and liquid trees!,Whitman_1.bi,classic.lit
1,"Egy szemérmes kéz szorítása ez, a haj lengése ...","This is the press of a bashful hand, this the ...",Whitman_1.bi,classic.lit
2,"Férfi és nő, elmondhatnám, hogy szeretlek tite...","Man or woman, I might tell how I like you, but...",Whitman_1.bi,classic.lit
3,"Egy sem hajtott térdet a parancsra, Volt aki ő...","None obey'd the command to kneel, Some made a ...",Whitman_1.bi,classic.lit
4,"Sarjak vernek gyökeret és sokasodnak, állnak a...","Sprouts take and accumulate, stand by the curb...",Whitman_1.bi,classic.lit
...,...,...,...,...
4051676,"- Igen, uram.","- Yes, sir.",subtitles.bi,subtitles
4051677,"Gondolja meg jól, második lehetőséget kap!","Come on, that's a second chance, kiddo.",subtitles.bi,subtitles
4051678,Igen.,Yes.,subtitles.bi,subtitles
4051679,Rendben.,All right.,subtitles.bi,subtitles


In [6]:
sampler.validation_set

Unnamed: 0,hun,eng,source_file,domain
0,"Ha ki mást lealáz, engem aláz le, Ha mit teszn...","Whoever degrades another degrades me, And what...",Whitman_1.bi,classic.lit
1,"Csak kis időre vet horgonyt hajóm, Hírnökeim f...","I anchor my ship for a little while only, My m...",Whitman_1.bi,classic.lit
2,Léptem felriasztja a vadrucát és párját: e nap...,My tread scares the wood-drake and wood-duck o...,Whitman_1.bi,classic.lit
3,Igenis - mégpedig tulajdon ágyának oszlopa volt.,Yes! and the bedpost was his own.,Dickens_3.bi,classic.lit
4,Az írnok kényszeredetten mosolygott.,The clerk smiled faintly.,Dickens_3.bi,classic.lit
...,...,...,...,...
50235,- Ennek a nadrágnak annyi.,- These pants are ruined.,subtitles.bi,subtitles
50236,Tízre akkor érted jövök.,I'll pick you up here at 10.,subtitles.bi,subtitles
50237,Kell a segítséged!,I need you!,subtitles.bi,subtitles
50238,De van.,Of course.,subtitles.bi,subtitles


In [7]:
sampler.test_set

Unnamed: 0,hun,eng,source_file,domain
0,"Én elfogadom a valóságot, és nem merem kétségb...","I accept Reality and dare not question it, Mat...",Whitman_1.bi,classic.lit
1,Ennyire fájt neked elhagyni engem?,"Did it make you ache so, leaving me?",Whitman_1.bi,classic.lit
2,"Minden előbbre és kifelé halad, semmi sem hull...","All goes onward and outward, nothing collapses...",Whitman_1.bi,classic.lit
3,A szellem ismét felkiáltott e szavak hallatára...,"The Ghost, on hearing this, set up another cry...",Dickens_3.bi,classic.lit
4,"- Nem jött meg - szólt Bob, hirtelen engedve m...","`Not coming!' said Bob, with a sudden declensi...",Dickens_3.bi,classic.lit
...,...,...,...,...
50796,Helló.,How are you.,subtitles.bi,subtitles
50797,"Pakoljuk a szállítmányokat, készülünk a költöz...",...getting ready for the move stateside.,subtitles.bi,subtitles
50798,Mit akarsz tenni?,What do you want to do?,subtitles.bi,subtitles
50799,Oww!,Oww!,subtitles.bi,subtitles


In [None]:
sampler.save_splits_to_csv()

In [8]:
df = sampler.train_set

In [9]:
df

Unnamed: 0,hun,eng,source_file,domain
0,Szunnyadó és elfolyó fák földje!,Earth of the slumbering and liquid trees!,Whitman_1.bi,classic.lit
1,"Egy szemérmes kéz szorítása ez, a haj lengése ...","This is the press of a bashful hand, this the ...",Whitman_1.bi,classic.lit
2,"Férfi és nő, elmondhatnám, hogy szeretlek tite...","Man or woman, I might tell how I like you, but...",Whitman_1.bi,classic.lit
3,"Egy sem hajtott térdet a parancsra, Volt aki ő...","None obey'd the command to kneel, Some made a ...",Whitman_1.bi,classic.lit
4,"Sarjak vernek gyökeret és sokasodnak, állnak a...","Sprouts take and accumulate, stand by the curb...",Whitman_1.bi,classic.lit
...,...,...,...,...
4051676,"- Igen, uram.","- Yes, sir.",subtitles.bi,subtitles
4051677,"Gondolja meg jól, második lehetőséget kap!","Come on, that's a second chance, kiddo.",subtitles.bi,subtitles
4051678,Igen.,Yes.,subtitles.bi,subtitles
4051679,Rendben.,All right.,subtitles.bi,subtitles


In [10]:
df['domain'].value_counts()

modern.lit      1628251
law             1314873
classic.lit      641769
subtitles        334721
softwaredocs     132067
Name: domain, dtype: int64

In [11]:
sampler.validation_set['domain'].value_counts()

law             26996
modern.lit      13802
classic.lit      5481
subtitles        2838
softwaredocs     1123
Name: domain, dtype: int64

In [12]:
sampler.test_set['domain'].value_counts()

law             27156
modern.lit      14039
classic.lit      5577
subtitles        2886
softwaredocs     1143
Name: domain, dtype: int64

In [20]:
def create_data_set_files(sampler, path, base_file_name):
    file_name_beginning = join(path, base_file_name + '-')
    
    f = lambda set_name, language: file_name_beginning + set_name + '.' + language
    
    sampler.train_set['hun'][:400000].to_csv(f('train', 'hu'), header=None, index=None, sep=' ')
    sampler.train_set['eng'][:400000].to_csv(f('train', 'en'), header=None, index=None, sep=' ')
    
    sampler.validation_set['hun'][:5000].to_csv(f('valid', 'hu'), header=None, index=None, sep=' ')
    sampler.validation_set['eng'][:5000].to_csv(f('valid', 'en'), header=None, index=None, sep=' ')
    
    sampler.test_set['hun'][:5000].to_csv(f('test', 'hu'), header=None, index=None, sep=' ')
    sampler.test_set['eng'][:5000].to_csv(f('test', 'en'), header=None, index=None, sep=' ')

In [21]:
combined_path = join(hunglish_path, 'combined')
if not exists(combined_path):
    mkdir(combined_path)
create_data_set_files(sampler, hunglish_path, 'combined/hunglish2small')