In [1]:
import pickle, numpy as np, glob, os, gzip
from numpy import linalg as LA
import os

In [2]:
def make_arrays(num_samples, num_features):
    """Allocate uninitialised float32 buffers for a dataset.

    Returns a ``(num_samples, num_features)`` feature matrix and a
    ``(num_samples,)`` target vector. Their contents are arbitrary until the
    caller fills them in. When ``num_samples`` is falsy (e.g. 0 or None)
    nothing is allocated and ``(None, None)`` is returned.
    """
    if not num_samples:
        return None, None

    features = np.ndarray(shape=(num_samples, num_features), dtype=np.float32)
    targets = np.ndarray(shape=num_samples, dtype=np.float32)
    return features, targets

def merge_datasets(pickle_files):
    """Load per-sequence feature files and concatenate them into one dataset.

    Every file is a gzip-compressed pickle holding a ``(pairs, feat, dts)``
    triple. The sequence id is the integer parsed from the part of the file
    name before the first ``_``; it is prepended to each pair. Each entry of
    ``dts`` is reduced to a scalar with ``numpy.linalg.norm``.

    Parameters
    ----------
    pickle_files : sequence of str
        Paths whose base name starts with ``<int>_``.

    Returns
    -------
    X : ndarray or None
        ``feat`` of all files stacked row-wise (``np.vstack``).
    y : ndarray or None
        Norms of the ``dts`` entries of all files (``np.hstack``).
    Pairs : ndarray or None
        Rows ``(seq, pair[0], pair[1])`` of all files (``np.vstack``).
        All three are None when ``pickle_files`` is empty; for a single file
        the per-file arrays are returned as loaded, without stacking.

    Raises
    ------
    ValueError
        If a file name has no integer prefix, or the per-file arrays cannot
        be stacked.
    Exception
        Anything raised while reading or converting a file is reported on
        stdout and re-raised unchanged.
    """
    feat_parts, scale_parts, pair_parts = [], [], []

    for pickle_file in pickle_files:
        seq = int(os.path.split(pickle_file)[1].split('_')[0])
        try:
            print('reading', pickle_file)
            # SECURITY: pickle.load can execute arbitrary code; only feed this
            # function files that were produced by a trusted pipeline.
            with gzip.open(pickle_file, 'rb') as f:
                pairs, feat, dts = pickle.load(f)
            pair_parts.append(np.array([(seq, pair[0], pair[1]) for pair in pairs]))
            feat_parts.append(np.array(feat))
            scale_parts.append(np.array([LA.norm(val) for val in dts]))
        except Exception as e:
            print('Unable to process data from', pickle_file, ':', e)
            raise

    if not feat_parts:
        return None, None, None
    if len(feat_parts) == 1:
        # Hand the arrays back untouched so their shapes are exactly what the
        # file contained (vstack would promote a 1-D array to 2-D).
        return feat_parts[0], scale_parts[0], pair_parts[0]

    # Stack once at the end: re-stacking inside the loop copies the whole
    # accumulated dataset for every file, which is quadratic in total size.
    return np.vstack(feat_parts), np.hstack(scale_parts), np.vstack(pair_parts)

In [3]:
def shuffle_in_unison(X, y, pairs, rng=None):
    """Apply one random row permutation to ``X``, ``y`` and ``pairs``.

    Parameters
    ----------
    X : 2-D ndarray
        Indexed as ``X[p, :]``.
    y : ndarray
        Indexed as ``y[p]``.
    pairs : 2-D ndarray
        Indexed as ``pairs[p, :]``.
    rng : object with a ``permutation(n)`` method, optional
        For example ``np.random.RandomState(seed)`` or
        ``np.random.default_rng(seed)``. When omitted the global
        ``np.random`` state is used, as before, so results are only
        reproducible if the caller seeded it.

    Returns
    -------
    tuple
        ``(X[p, :], y[p], pairs[p, :])`` for a single permutation ``p``, so
        row ``i`` of each output comes from the same input row.

    Raises
    ------
    AssertionError
        If the three inputs do not have the same length.
    """
    assert len(X) == len(y)
    # Without this check a shorter `pairs` fails later with an opaque
    # IndexError and a longer one silently loses its trailing rows.
    assert len(X) == len(pairs)
    permute = np.random.permutation if rng is None else rng.permutation
    p = permute(len(X))
    return X[p, :], y[p], pairs[p, :]

def split_data(X, y, pairs, train_size, valid_size, test_size):
    """Cut three consecutive row ranges out of ``X``, ``y`` and ``pairs``.

    The first ``train_size`` rows form the training part, the following
    ``valid_size`` rows the validation part and the ``test_size`` rows after
    that the test part. Rows beyond the last range are not returned.

    Returns a 9-tuple ordered as
    ``(train_X, train_y, train_pairs, valid_X, valid_y, valid_pairs,
    test_X, test_y, test_pairs)``.
    """
    valid_end = train_size + valid_size
    test_end = valid_end + test_size
    windows = (
        slice(None, train_size),
        slice(train_size, valid_end),
        slice(valid_end, test_end),
    )

    parts = []
    for window in windows:
        parts.extend((X[window, :], y[window], pairs[window, :]))
    return tuple(parts)

# Two-digit ids of the sequences to read: 00-06 with 04 left out.
train_sequences = ['{:02d}'.format(seq) for seq in range(7) if seq != 4]

print('sequences to process:', train_sequences)

# Parameter values that are combined below into feature-file tags of the form
# '<prefix><stride>_<grid[0]>_<grid[1]>_<nbins>'.
grids = [(6, 4), (5, 3), (4, 2)]
nbins = [100, 150, 200]
strides = [1, 2]
prefixes = ['un_', '']

# One entry per (prefix, grid, nbins) combination. Each entry is the list of
# tags for every stride, so the files of all strides end up in one dataset.
# The processing loop also accepts plain string entries (one tag each), which
# is what extending `labels` with the individual tags would produce.
labels = []
for prefix in prefixes:
    for grid in grids:
        for nb in nbins:
            stride_tags = [
                '{}{:d}_{:d}_{:d}_{:d}'.format(prefix, stride, grid[0], grid[1], nb)
                for stride in strides
            ]
            labels.append(stride_tags)

print('labels that will be processed:')
for label in labels:
    print(label)

def _dump_gzip_pickle(obj, path):
    """Pickle ``obj`` into a gzip file at ``path`` and close the file."""
    # The context manager guarantees the gzip stream is flushed and closed;
    # passing a bare gzip.open(...) to pickle.dump leaves that to the
    # garbage collector.
    with gzip.open(path, 'wb') as f:
        pickle.dump(obj, f)


features_dir = os.path.join('data', 'features')
datasets_dir = os.path.join('data', 'datasets')
# Fail-safe: create the output directory up front instead of crashing on the
# first dump after all input files have already been read.
os.makedirs(datasets_dir, exist_ok=True)

for label in labels:
    print('processing', label)
    # A label is either a single tag (str) or a list of tags whose files are
    # merged into one dataset; the output tag joins the sub-tags with '_'.
    sub_labels = [label] if isinstance(label, str) else label
    tag = '_'.join(sub_labels)
    files = [os.path.join(features_dir, '%s_%s.pklz' % (seq, sub_label))
             for sub_label in sub_labels
             for seq in train_sequences]

    print('files to be read:')
    for file in files:
        print(file)

    X, y, pairs = merge_datasets(files)
    X, y, pairs = shuffle_in_unison(X, y, pairs)

    # Full shuffled dataset, saved before it is split.
    _dump_gzip_pickle({'X': X, 'y': y, 'pairs': pairs},
                      os.path.join(datasets_dir, 'data_%s.pklz' % tag))

    # 70 % train, 15 % validation, remaining rows test.
    size = len(X)
    train_size = int(.7*size)
    valid_size = int(.15*size)
    test_size = size-train_size-valid_size

    (train_X, train_y, train_pairs,
     valid_X, valid_y, valid_pairs,
     test_X, test_y, test_pairs) = split_data(X, y, pairs, train_size, valid_size, test_size)
    # NOTE(review): this prints the test size twice and never the validation
    # size; kept as-is because the intended output is not evident from here.
    print(len(train_X), len(test_X), len(test_y))

    _dump_gzip_pickle({'X': train_X, 'y': train_y, 'pairs': train_pairs},
                      os.path.join(datasets_dir, 'train_%s.pklz' % tag))
    _dump_gzip_pickle({'X': valid_X, 'y': valid_y, 'pairs': valid_pairs},
                      os.path.join(datasets_dir, 'valid_%s.pklz' % tag))
    _dump_gzip_pickle({'X': test_X, 'y': test_y, 'pairs': test_pairs},
                      os.path.join(datasets_dir, 'test_%s.pklz' % tag))
    

sequences to process: ['00', '01', '02', '03', '05', '06']
labels that will be processed:
['un_1_6_4_100', 'un_2_6_4_100']
['un_1_6_4_150', 'un_2_6_4_150']
['un_1_6_4_200', 'un_2_6_4_200']
['un_1_5_3_100', 'un_2_5_3_100']
['un_1_5_3_150', 'un_2_5_3_150']
['un_1_5_3_200', 'un_2_5_3_200']
['un_1_4_2_100', 'un_2_4_2_100']
['un_1_4_2_150', 'un_2_4_2_150']
['un_1_4_2_200', 'un_2_4_2_200']
['1_6_4_100', '2_6_4_100']
['1_6_4_150', '2_6_4_150']
['1_6_4_200', '2_6_4_200']
['1_5_3_100', '2_5_3_100']
['1_5_3_150', '2_5_3_150']
['1_5_3_200', '2_5_3_200']
['1_4_2_100', '2_4_2_100']
['1_4_2_150', '2_4_2_150']
['1_4_2_200', '2_4_2_200']
processing ['un_1_6_4_100', 'un_2_6_4_100']
files to be read:
data/features/00_un_1_6_4_100.pklz
data/features/01_un_1_6_4_100.pklz
data/features/02_un_1_6_4_100.pklz
data/features/03_un_1_6_4_100.pklz
data/features/05_un_1_6_4_100.pklz
data/features/06_un_1_6_4_100.pklz
data/features/00_un_2_6_4_100.pklz
data/features/01_un_2_6_4_100.pklz
data/features/02_un_2_6_4_10