In [1]:
import os
import re
import pandas as pd
import numpy as np

In [2]:
path = '../data/model_ready/csv/'
csv_files = os.listdir(path)
csv_files = [file for file in csv_files if re.search(r'csv', file)]

In [3]:
def get_splits(df, seed=None):
    """
    helper function for randomly shuffling and splitting data.
    :param df: DataFrame object with raw data
    :param seed: int for determining random seed to use.
    """
    np.random.seed(seed)
    shuffled_idx = np.random.permutation(df.index)
    # determine indices for partitioning df
    rows = len(df.index)
    # 0.8 corresponds to 80% allocated for training set
    train_idx = int(0.8 * rows)
    # 0.1 corresponds to 10% allocated for validation. 
    # remainder is for test set
    validate_idx = int(0.1 * rows) + train_idx
    train = df.loc[shuffled_idx[:train_idx]]
    validate = df.loc[shuffled_idx[train_idx:validate_idx]]
    test = df.loc[shuffled_idx[validate_idx:]]
    return train, validate, test

In [4]:
for file in csv_files:
    full_df = pd.read_csv(path+file)
    train, val, test = get_splits(full_df, 100)
    print('Creating splits for: {}'.format(file))
    print('train set size: {}'.format(train.shape))
    print('validation set size: {}'.format(val.shape))
    print('test set size: {}'.format(test.shape))
    print('-' * 60)
    print('\n')
    
    train.to_csv(path + file[:-4] + '_train.csv', index=False)
    val.to_csv(path + file[:-4] + '_val.csv', index=False)
    test.to_csv(path + file[:-4] + '_test.csv', index=False)

Creating splits for: processed_dutch.csv
train set size: (206609, 3)
validation set size: (25826, 3)
test set size: (25827, 3)
------------------------------------------------------------


Creating splits for: processed_spanish.csv
train set size: (372970, 3)
validation set size: (46621, 3)
test set size: (46622, 3)
------------------------------------------------------------


Creating splits for: processed_italian.csv
train set size: (107843, 3)
validation set size: (13480, 3)
test set size: (13481, 3)
------------------------------------------------------------


Creating splits for: processed_icelandic.csv
train set size: (56051, 3)
validation set size: (7006, 3)
test set size: (7007, 3)
------------------------------------------------------------


Creating splits for: processed_polish.csv
train set size: (416164, 3)
validation set size: (52020, 3)
test set size: (52022, 3)
------------------------------------------------------------


Creating splits for: processed_croatian.csv
