In [None]:
import os
import tensorflow as tf
import numpy as np
import glob
import multiprocessing as mp
from tqdm import tqdm
from datetime import datetime

### LOAD FILES FROM MULTIPLE DIRECTORIES IN PARALLEL

# window sizes to load; each entry is handed to reader(), which reads the
# matching W<size>/ sub-directory
W = list(range(2, 11, 2))

# empty accumulators (not referenced again in the cells visible here)
samples = []
labels = []

# reader function for parallelized text file reading
def reader(window_size, base_dir=None):
    """Load every ``*.txt`` file found in one window-size sub-directory.

    Parameters
    ----------
    window_size : int
        Selects the ``W{window_size}`` sub-directory to read.
    base_dir : str, optional
        Directory that contains the ``W*`` sub-directories. Defaults to the
        module-level ``data_dir``, looked up when the function is called.

    Returns
    -------
    list of numpy.ndarray
        One 2-D array per file, in sorted path order. Column 0 holds the
        numeric ID parsed from the file path (repeated on every row); the
        remaining columns are the values loaded from the file.

    Raises
    ------
    ValueError
        If the three ID characters of a file path are not an integer.
    """
    if base_dir is None:
        base_dir = data_dir
    # os.path.join works whether or not base_dir ends with a separator
    pattern = os.path.join(base_dir, f'W{window_size}', '*.txt')

    data = []
    # glob returns paths in arbitrary order; sort so repeated runs agree
    for filename in tqdm(sorted(glob.glob(pattern))):
        # ndmin=2 keeps a single-row file 2-D so the hstack below still works
        d = np.loadtxt(filename, ndmin=2)
        try:
            # ID is read from a fixed offset counted from the end of the path
            # (assumes a fixed-length file name -- TODO confirm naming scheme)
            file_id = int(filename[-12:-9])
        except ValueError as err:
            raise ValueError(
                f'cannot parse numeric ID from file name: {filename!r}'
            ) from err
        f = np.full((len(d), 1), file_id)
        data.append(np.hstack((f, d)))
    return data
    
data_dir = '/home/apatyk/DailySamples/'
out_dir = '1M-samples'


def _to_array(rows):
    """Pack per-file rows into a single array suitable for ``np.save``.

    Rows of equal length become a regular 2-D array. Rows of differing
    length become a 1-D object array holding one array per file, because
    NumPy >= 1.24 raises instead of building ragged arrays implicitly.
    """
    if len({len(r) for r in rows}) <= 1:
        return np.asarray(rows)
    packed = np.empty(len(rows), dtype=object)
    for i, r in enumerate(rows):
        packed[i] = r
    return packed


start_time = datetime.now()

print('Loading data in parallel...', flush=True) # buffer flush needed with tqdm
# read text files in parallel, one worker per window size; the context
# manager releases the worker processes once the results are back
with mp.Pool(len(W)) as pool:
    data_ls = pool.map(reader, W)

end_time = datetime.now()
print(f'Duration: {end_time - start_time}', flush=True) # buffer flush needed with tqdm

# arrange data into samples, labels and filenames for saving
# data_ls holds one list of per-file arrays per window size; flatten it to a
# single list of per-file arrays (np.hstack would instead join equally-shaped
# files along axis 1, or fail on ragged input with recent NumPy)
all_data = [file_arr for per_window in data_ls for file_arr in per_window]
all_samples = [x[:, 2] for x in all_data]
all_labels = [x[:, 1] for x in all_data]
all_filenames = [x[0, 0] for x in all_data]

# save data to numpy arrays
os.makedirs(out_dir, exist_ok=True)  # np.save does not create directories
np.save(os.path.join(out_dir, 'daily-samples.npy'), _to_array(all_samples))
np.save(os.path.join(out_dir, 'daily-labels.npy'), _to_array(all_labels))
np.save(os.path.join(out_dir, 'daily-filenames.npy'), all_filenames)

print('Done.')

In [None]:
import os
import numpy as np
import glob
from tqdm import tqdm

### LOAD FILES FROM SINGLE DIRECTORY (SERIAL)

data_dir = '/home/apatyk/DailySamples/W6/'
out_dir = '200k-samples-W6'

# read every text file of the directory into one 2-D array per file
all_data  = []
# glob returns paths in arbitrary order; sort so repeated runs agree
for filename in tqdm(sorted(glob.glob(os.path.join(data_dir, '*.txt')))):
    # ndmin=2 keeps a single-row file 2-D so the hstack below still works
    d = np.loadtxt(filename, ndmin=2)
    # numeric ID taken from a fixed offset counted from the end of the path
    # (assumes a fixed-length file name -- TODO confirm naming scheme)
    f = np.full((len(d), 1), int(filename[-12:-9]))
    # column 0 = ID repeated per row, remaining columns = file contents
    all_data.append(np.hstack((f, d)))

# split the per-file arrays into the columns that get saved
all_samples = [x[:, 2] for x in all_data]
all_labels = [x[:, 1] for x in all_data]
all_filenames = [x[0, 0] for x in all_data]

os.makedirs(out_dir, exist_ok=True)  # np.save does not create directories
np.save(os.path.join(out_dir, 'daily-samples.npy'), all_samples)
np.save(os.path.join(out_dir, 'daily-labels.npy'), all_labels)
np.save(os.path.join(out_dir, 'daily-filenames.npy'), all_filenames)