In [1]:
import os

import numpy as np
import pandas as pd

from sklearn.utils import resample

# Display NumPy floats with 6 digits of precision and suppress scientific
# notation for small values, so printed arrays are easier to read.
np.set_printoptions(precision=6, suppress=True)

In [2]:
# Number of consecutive rows in each window sliced from a file.
# The RNN section reassigns this to 100 before building its datasets.
SCREEN_SIZE = 50
# Block-loss probabilities applied to the 'Val*' files; one dataset file is
# written per value.
LOSS_RATES = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95]
# Root folder prepended to every relative CSV path passed to pd.read_csv.
DIRECTORY = '../data/'

## Training & test data for various loss rates

### U-Net

In [None]:
# Gather the training CSVs (every '.csv' whose name does not start with 'Val')
# from the tom folder first and the pap folder second, each sorted by name.
TOM_DIRECTORY = '../data/tom/'
PAP_DIRECTORY = '../data/pap/'

temp = []
for prefix, folder in (('tom/', TOM_DIRECTORY), ('pap/', PAP_DIRECTORY)):
    file_list = os.listdir(folder)
    temp.extend(sorted(
        prefix + name
        for name in file_list
        if name.endswith('.csv') and not name.startswith('Val')
    ))
dataset_list = temp

# Per-column minimum and maximum over all training files; computed first
# because the dataset-building cell normalises with them.
temp_df = []
for FILENAME in dataset_list:
    env_df = pd.read_csv(DIRECTORY + FILENAME, index_col=['MEAS_DATE'])
    env_df = env_df.dropna(how='all')
    # Drop rows that are zero in every column, then fill remaining gaps.
    all_zero_rows = (env_df == 0).all(axis=1)
    env_df = env_df[~all_zero_rows].interpolate().values
    temp_df.append(env_df)

stacked = np.concatenate(temp_df, axis=0)
MAXS = stacked.max(axis=0)
MINS = stacked.min(axis=0)

In [None]:
# Build the U-Net datasets: one .npz file per validation loss rate.
#
# Each file holds sliding-window inputs/labels for the training CSVs
# (raw_input / raw_label) and for the 'Val*' CSVs (test_input / test_label),
# together with the normalisation constants and the window length.
TOM_DIRECTORY = '../data/tom/'
PAP_DIRECTORY = '../data/pap/'


def _unet_csv_paths(validation):
    """Return CSV paths relative to DIRECTORY, tom files first, then pap.

    validation=True selects files whose name starts with 'Val';
    validation=False selects every other '.csv' file. Files are sorted by
    name within each folder.
    """
    paths = []
    for prefix, folder in (('tom/', TOM_DIRECTORY), ('pap/', PAP_DIRECTORY)):
        paths.extend(sorted(
            prefix + name
            for name in os.listdir(folder)
            if name.endswith('.csv') and name.startswith('Val') == validation
        ))
    return paths


def _unet_windows(validation, block_rate, screen_size):
    """Build masked sliding-window arrays from the selected CSV files.

    Parameters
    ----------
    validation : bool
        Which files to read (see _unet_csv_paths).
    block_rate : float
        Fraction of 48-row blocks that are masked out completely.
    screen_size : int
        Number of rows per window.

    Returns
    -------
    inputs : ndarray, shape (windows, screen_size, columns * repeats, 4)
        Last axis: current window, mask (1 = kept, 0 = removed), previous
        window, next window. Removed values are stored as -1.
    labels : ndarray, shape (windows, screen_size, columns * repeats, 1)
        Normalised, unmasked values of the current window.

    ``repeats`` is int(screen_size / columns), using the column count of the
    last file read.

    Raises
    ------
    ValueError
        If no window could be built (np.stack on an empty list) or if
        ``repeats`` is 0.
    """
    null_prob = 0.3   # probability that a single value is masked
    block_len = 48    # rows removed together as one contiguous block

    prev_windows, cur_windows, next_windows = [], [], []
    mask_windows, label_windows = [], []
    n_columns = None
    for filename in _unet_csv_paths(validation):
        frame = pd.read_csv(DIRECTORY + filename, index_col=['MEAS_DATE']).dropna(how='all')
        values = frame[~(frame == 0).all(axis=1)].interpolate().values
        n_rows, n_columns = values.shape

        # Point-wise mask; the seed is reset per file so it is reproducible.
        np.random.seed(3101)
        mask = np.random.choice(2, values.size, p=[null_prob, 1-null_prob]).reshape(values.shape)

        # Block-wise mask: draw distinct block starts, then expand each
        # start to block_len consecutive row indices.
        np.random.seed(4564)
        candidates = np.arange(int(n_rows/block_len) - 1)
        # Never request more blocks than exist; np.random.choice would raise
        # ValueError with replace=False otherwise.
        n_blocks = min(int(n_rows/block_len * block_rate), candidates.size)
        starts = np.random.choice(candidates, replace=False, size=n_blocks) * block_len
        # Broadcasting keeps an integer dtype even when no block is drawn,
        # so the index assignment below cannot fail on short files.
        block_rows = (starts[:, np.newaxis] + np.arange(block_len)).ravel()
        block_rows = block_rows[block_rows < n_rows]

        scaled = (values - MINS)/(MAXS - MINS)

        mask[block_rows, :] = 0
        observed = np.ma.array(scaled, mask=1-mask, fill_value=-1).filled()

        for index in range(screen_size*2, n_rows - screen_size):
            prev_windows.append(observed[(index - screen_size*2):(index - screen_size), :])
            cur_windows.append(observed[(index - screen_size):index, :])
            next_windows.append(observed[index:index + screen_size, :])

            mask_windows.append(mask[(index - screen_size):index, :])
            label_windows.append(scaled[(index - screen_size):index, :])

    inputs = np.stack([np.stack(cur_windows), np.stack(mask_windows),
                       np.stack(prev_windows), np.stack(next_windows)], axis=-1)
    labels = np.stack(label_windows)[..., np.newaxis]

    # Repeat the column axis so its length approaches screen_size.
    repeats = int(screen_size/n_columns)
    inputs = np.concatenate([inputs]*repeats, axis=2)
    labels = np.concatenate([labels]*repeats, axis=2)
    return inputs, labels


# The training arrays use fixed seeds and a fixed 0.3 block rate, so they are
# the same for every LOSS_RATE; build them once instead of once per rate.
raw_input, raw_label = _unet_windows(False, 0.3, SCREEN_SIZE)

for LOSS_RATE in LOSS_RATES:
    test_input, test_label = _unet_windows(True, LOSS_RATE, SCREEN_SIZE)

    # NOTE(review): output goes to './data/' while the CSVs are read from
    # '../data/' -- confirm this is intended.
    with open('./data/tot_dataset_mtr_loss_%.2f.npz' % LOSS_RATE, 'wb') as f:
        np.savez(f,
                 raw_input=raw_input,
                 raw_label=raw_label,
                 test_input=test_input,
                 test_label=test_label,
                 MAXS=MAXS,
                 MINS=MINS,
                 SCREEN_SIZE=SCREEN_SIZE
                 )
    print('Loss rate %.2f dataset saved.' % LOSS_RATE)

### RNN

In [3]:
# The RNN datasets use longer windows than the default set in the config cell.
SCREEN_SIZE = 100

# Training CSVs: every '.csv' not starting with 'Val', tom folder first and
# pap folder second, each sorted by name.
TOM_DIRECTORY = '../data/tom/'
PAP_DIRECTORY = '../data/pap/'

temp = []
for prefix, folder in (('tom/', TOM_DIRECTORY), ('pap/', PAP_DIRECTORY)):
    file_list = os.listdir(folder)
    kept = [name for name in file_list
            if name.endswith('.csv') and not name.startswith('Val')]
    kept.sort()
    temp.extend(prefix + name for name in kept)
dataset_list = temp

# Per-column min/max across all training files, needed for normalisation.
temp_df = []
for FILENAME in dataset_list:
    env_df = pd.read_csv(DIRECTORY + FILENAME, index_col=['MEAS_DATE'])
    env_df = env_df.dropna(how='all')
    # Remove rows that are zero everywhere, then interpolate remaining gaps.
    env_df = env_df[~(env_df == 0).all(axis=1)]
    env_df = env_df.interpolate().values
    temp_df.append(env_df)

all_rows = np.concatenate(temp_df, axis=0)
MAXS = all_rows.max(axis=0)
MINS = all_rows.min(axis=0)

In [4]:
# Build the RNN datasets: one .npz file per validation loss rate.
#
# Each file holds sliding-window inputs/labels for the training CSVs
# (raw_input / raw_label) and for the 'Val*' CSVs (test_input / test_label),
# together with the normalisation constants and the window length.
TOM_DIRECTORY = '../data/tom/'
PAP_DIRECTORY = '../data/pap/'


def _rnn_csv_paths(validation):
    """Return CSV paths relative to DIRECTORY, tom files first, then pap.

    validation=True selects files whose name starts with 'Val';
    validation=False selects every other '.csv' file. Files are sorted by
    name within each folder.
    """
    paths = []
    for prefix, folder in (('tom/', TOM_DIRECTORY), ('pap/', PAP_DIRECTORY)):
        paths.extend(sorted(
            prefix + name
            for name in os.listdir(folder)
            if name.endswith('.csv') and name.startswith('Val') == validation
        ))
    return paths


def _rnn_windows(validation, block_rate, screen_size):
    """Build masked sliding-window arrays from the selected CSV files.

    Parameters
    ----------
    validation : bool
        Which files to read (see _rnn_csv_paths).
    block_rate : float
        Fraction of 48-row blocks that are masked out completely.
    screen_size : int
        Number of rows per window.

    Returns
    -------
    inputs : ndarray, shape (windows, screen_size, columns * repeats, 4)
        Last axis: current window, mask (1 = kept, 0 = removed), previous
        window, next window. Removed values are stored as -1.
    labels : ndarray, shape (windows, screen_size, columns * repeats, 1)
        Normalised, unmasked values of the current window.

    ``repeats`` is int(screen_size / columns), using the column count of the
    last file read.

    Raises
    ------
    ValueError
        If no window could be built (np.stack on an empty list) or if
        ``repeats`` is 0.
    """
    null_prob = 0.3   # probability that a single value is masked
    block_len = 48    # rows removed together as one contiguous block

    prev_windows, cur_windows, next_windows = [], [], []
    mask_windows, label_windows = [], []
    n_columns = None
    for filename in _rnn_csv_paths(validation):
        frame = pd.read_csv(DIRECTORY + filename, index_col=['MEAS_DATE']).dropna(how='all')
        values = frame[~(frame == 0).all(axis=1)].interpolate().values
        n_rows, n_columns = values.shape

        # Point-wise mask; the seed is reset per file so it is reproducible.
        np.random.seed(3101)
        mask = np.random.choice(2, values.size, p=[null_prob, 1-null_prob]).reshape(values.shape)

        # Block-wise mask: draw distinct block starts, then expand each
        # start to block_len consecutive row indices.
        np.random.seed(4564)
        candidates = np.arange(int(n_rows/block_len) - 1)
        # Never request more blocks than exist; np.random.choice would raise
        # ValueError with replace=False otherwise.
        n_blocks = min(int(n_rows/block_len * block_rate), candidates.size)
        starts = np.random.choice(candidates, replace=False, size=n_blocks) * block_len
        # Broadcasting keeps an integer dtype even when no block is drawn,
        # so the index assignment below cannot fail on short files.
        block_rows = (starts[:, np.newaxis] + np.arange(block_len)).ravel()
        block_rows = block_rows[block_rows < n_rows]

        scaled = (values - MINS)/(MAXS - MINS)

        mask[block_rows, :] = 0
        observed = np.ma.array(scaled, mask=1-mask, fill_value=-1).filled()

        for index in range(screen_size*2, n_rows - screen_size):
            prev_windows.append(observed[(index - screen_size*2):(index - screen_size), :])
            cur_windows.append(observed[(index - screen_size):index, :])
            next_windows.append(observed[index:index + screen_size, :])

            mask_windows.append(mask[(index - screen_size):index, :])
            label_windows.append(scaled[(index - screen_size):index, :])

    inputs = np.stack([np.stack(cur_windows), np.stack(mask_windows),
                       np.stack(prev_windows), np.stack(next_windows)], axis=-1)
    labels = np.stack(label_windows)[..., np.newaxis]

    # Repeat the column axis so its length approaches screen_size.
    repeats = int(screen_size/n_columns)
    inputs = np.concatenate([inputs]*repeats, axis=2)
    labels = np.concatenate([labels]*repeats, axis=2)
    return inputs, labels


# The training arrays use fixed seeds and a fixed 0.3 block rate, so they are
# the same for every LOSS_RATE; build them once instead of once per rate.
raw_input, raw_label = _rnn_windows(False, 0.3, SCREEN_SIZE)

for LOSS_RATE in LOSS_RATES:
    test_input, test_label = _rnn_windows(True, LOSS_RATE, SCREEN_SIZE)

    # NOTE(review): output goes to './data/' while the CSVs are read from
    # '../data/' -- confirm this is intended.
    with open('./data/tot_dataset_RNN_mtr_loss_%.2f.npz' % LOSS_RATE, 'wb') as f:
        np.savez(f,
                 raw_input=raw_input,
                 raw_label=raw_label,
                 test_input=test_input,
                 test_label=test_label,
                 MAXS=MAXS,
                 MINS=MINS,
                 SCREEN_SIZE=SCREEN_SIZE
                 )
    print('Loss rate %.2f dataset saved.' % LOSS_RATE)

Loss rate 0.10 dataset saved.
Loss rate 0.20 dataset saved.
Loss rate 0.30 dataset saved.
Loss rate 0.40 dataset saved.
Loss rate 0.50 dataset saved.
Loss rate 0.60 dataset saved.
Loss rate 0.70 dataset saved.
Loss rate 0.80 dataset saved.
Loss rate 0.90 dataset saved.
Loss rate 0.95 dataset saved.


### FFNN & Linear interpolation

In [None]:
# List the training CSVs ('.csv' files not starting with 'Val'): tom folder
# first, pap folder second, each sorted by name.
TOM_DIRECTORY = '../data/tom/'
PAP_DIRECTORY = '../data/pap/'

temp = []
for prefix, folder in (('tom/', TOM_DIRECTORY), ('pap/', PAP_DIRECTORY)):
    file_list = os.listdir(folder)
    temp += sorted(
        prefix + name
        for name in file_list
        if name.endswith('.csv') and not name.startswith('Val')
    )
dataset_list = temp

# Column-wise min/max over all training rows, used later for scaling.
temp_df = []
for FILENAME in dataset_list:
    env_df = pd.read_csv(DIRECTORY + FILENAME, index_col=['MEAS_DATE'])
    env_df = env_df.dropna(how='all')
    # Discard rows where every column is zero, then interpolate the gaps.
    nonzero_rows = ~(env_df == 0).all(axis=1)
    env_df = env_df[nonzero_rows].interpolate().values
    temp_df.append(env_df)

combined = np.concatenate(temp_df, axis=0)
MAXS = combined.max(axis=0)
MINS = combined.min(axis=0)

In [None]:
# Build the FFNN / linear-interpolation datasets: one .npz file per
# validation loss rate.
#
# Each sample is a single row together with its previous and next row;
# masked values are stored as 0 (values are multiplied by the mask).
TOM_DIRECTORY = '../data/tom/'
PAP_DIRECTORY = '../data/pap/'


def _ffnn_csv_paths(validation):
    """Return CSV paths relative to DIRECTORY, tom files first, then pap.

    validation=True selects files whose name starts with 'Val';
    validation=False selects every other '.csv' file. Files are sorted by
    name within each folder.
    """
    paths = []
    for prefix, folder in (('tom/', TOM_DIRECTORY), ('pap/', PAP_DIRECTORY)):
        paths.extend(sorted(
            prefix + name
            for name in os.listdir(folder)
            if name.endswith('.csv') and name.startswith('Val') == validation
        ))
    return paths


def _ffnn_samples(validation, block_rate):
    """Build flat per-row samples from the selected CSV files.

    Parameters
    ----------
    validation : bool
        Which files to read (see _ffnn_csv_paths).
    block_rate : float
        Fraction of 48-row blocks that are masked out completely.

    Returns
    -------
    inputs : ndarray, shape (samples, 4 * columns)
        Column groups in order: current row, mask (1 = kept, 0 = removed),
        previous row, next row.
    labels : ndarray, shape (samples, columns)
        Normalised, unmasked values of the current row.

    Raises
    ------
    ValueError
        If no file was selected (np.concatenate on an empty list).
    """
    null_prob = 0.3   # probability that a single value is masked
    block_len = 48    # rows removed together as one contiguous block

    input_parts, label_parts = [], []
    for filename in _ffnn_csv_paths(validation):
        frame = pd.read_csv(DIRECTORY + filename, index_col=['MEAS_DATE']).dropna(how='all')
        values = frame[~(frame == 0).all(axis=1)].interpolate().values
        n_rows = values.shape[0]

        # Point-wise mask; the seed is reset per file so it is reproducible.
        np.random.seed(3101)
        mask = np.random.choice(2, values.size, p=[null_prob, 1-null_prob]).reshape(values.shape)

        # Block-wise mask: draw distinct block starts, then expand each
        # start to block_len consecutive row indices.
        np.random.seed(4564)
        candidates = np.arange(int(n_rows/block_len) - 1)
        # Never request more blocks than exist; np.random.choice would raise
        # ValueError with replace=False otherwise.
        n_blocks = min(int(n_rows/block_len * block_rate), candidates.size)
        starts = np.random.choice(candidates, replace=False, size=n_blocks) * block_len
        # Broadcasting keeps an integer dtype even when no block is drawn,
        # so the index assignment below cannot fail on short files.
        block_rows = (starts[:, np.newaxis] + np.arange(block_len)).ravel()
        block_rows = block_rows[block_rows < n_rows]

        scaled = (values - MINS)/(MAXS - MINS)

        mask[block_rows, :] = 0
        missing = scaled*mask

        # Rows 1 .. n_rows-3 are the "current" rows; each has a previous row
        # and a next row. Three shifted slices give the same samples as a
        # row-by-row loop over range(2, n_rows - 1).
        n_samples = max(n_rows - 3, 0)
        current = slice(1, 1 + n_samples)
        input_parts.append(np.concatenate([missing[current],
                                           mask[current],
                                           missing[0:n_samples],
                                           missing[2:2 + n_samples]], axis=1))
        label_parts.append(scaled[current])

    return np.concatenate(input_parts, axis=0), np.concatenate(label_parts, axis=0)


# The training arrays use fixed seeds and a fixed 0.3 block rate, so they are
# the same for every LOSS_RATE; build them once instead of once per rate.
raw_input, raw_label = _ffnn_samples(False, 0.3)

for LOSS_RATE in LOSS_RATES:
    test_input, test_label = _ffnn_samples(True, LOSS_RATE)

    # NOTE(review): output goes to './data/' while the CSVs are read from
    # '../data/' -- confirm this is intended.
    with open('./data/tot_dataset_FFNN_mtr_loss_%.2f.npz' % LOSS_RATE, 'wb') as f:
        np.savez(f,
                 raw_input=raw_input,
                 raw_label=raw_label,
                 test_input=test_input,
                 test_label=test_label,
                 MAXS=MAXS,
                 MINS=MINS
                 )
    print('Loss rate %.2f dataset saved.' % LOSS_RATE)