In [3]:
from scipy.io import loadmat
import numpy as np

In [4]:
X_raw = loadmat("R2198_20ms.mat")
y_raw = np.loadtxt("R2198_locations.dat")

X_raw = X_raw['mm'].T

In [2]:
def generate_datasets(window=50, step=25, test_set_sampling='chunk', test_set_size=0.2, test_chunk_location=0.8):
    print('generating data')
    print('window {}, step {}'.format(window,step))
    dataset_length = int(X_raw.shape[0]/step)
    X = np.zeros((dataset_length, 2*window*33))
    y = np.zeros((dataset_length, 2))

    for filtered_i, data_i in enumerate(range(window, X_raw.shape[0]-window, step)):
        X[filtered_i] = np.real(np.fft.fft(X_raw[data_i-window:data_i+window].T)).flatten()
        y[filtered_i] = y_raw[data_i]

    if test_set_sampling == 'uniform_chunks':
        print('uniform chunks')
        # refer to report for how this is calculated.         
        test_set_size = 0.1
        chunk_size = int(2*window/step)
        test_set_length = int(dataset_length*test_set_size)
        chunks_count = int(test_set_length/chunk_size)
        chunk_step = int((dataset_length-chunk_size)/chunks_count)
        test_chunks = [list(range(i*chunk_step, i*chunk_step + chunk_size)) for i in range(chunks_count)]
        discard_chunks = [list(range(i*chunk_step - chunk_size, i*chunk_step)) for i in range(chunks_count)][1:]
        # flatten chunk lists
        test_chunks = sum(test_chunks, [])
        discard_chunks = sum(discard_chunks, [])
        test_and_discard_chunks = test_chunks + discard_chunks
        X_test = X[test_chunks]
        X_train = np.delete(X, test_and_discard_chunks, axis=0)
        y_test = y[test_chunks]
        y_train = np.delete(y, test_and_discard_chunks, axis=0)
        print(test_chunks)
        print(discard_chunks)
#         print(test_and_discard_chunks)
#         print(sum(chunks, []))
#         return
    if test_set_sampling == 'chunk':
        print('sampling with a chunk, size {}, location {}'.format(test_set_size, test_chunk_location))
        test_set_start = int(dataset_length*test_chunk_location)
        test_set_end = test_set_start + int(dataset_length*test_set_size)
        print('indexes {}:{}'.format(test_set_start, test_set_end))
        test_chunk = np.arange(test_set_start, test_set_end)
        X_test = X[test_chunk]
        X_train = np.delete(X, test_chunk, axis=0)
        y_test = y[test_chunk]
        y_train = np.delete(y, test_chunk, axis=0)
    if test_set_sampling == 'random':
        print('random test set, probably not a good option here')
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_set_size, random_state=0)
    print(X_test.shape, X_train.shape, y_test.shape, y_train.shape)
    return X_test, X_train, y_test, y_train

# X_test, X_train, y_test, y_train = generate_datasets(test_set_sampling='uniform_chunks')

In [5]:
X_test, X_train, y_test, y_train = generate_datasets(window=50, step=1, test_set_sampling='uniform_chunks')

generating data
window 50, step 1
uniform chunks
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015, 1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023, 1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031, 1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039, 1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047, 1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055, 1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063, 1064, 1065, 1066, 1067, 1068, 1069, 1070, 1071, 1072, 1073, 1074, 1075, 1076, 1077, 1078, 1079, 1080, 1081, 1082, 1083, 1084, 1085, 1086, 1087, 1088, 1089, 1090, 1091, 1092, 10