In [1]:
import os

import numpy as np
import pandas as pd

from sklearn.utils import resample

# Show arrays with 6 decimal places and suppress scientific notation for small values.
np.set_printoptions(precision=6, suppress=True)

In [2]:
# Number of rows in each sliding window; the feature axis is later tiled up to this width as well.
SCREEN_SIZE = 50
# Base path that the 'tom/...' and 'pap/...' file names are appended to when reading CSVs.
DIRECTORY = '../data/'

## Training & test data

### U-Net & RNN

In [3]:
TOM_DIRECTORY = '../data/tom/'
file_list = os.listdir(TOM_DIRECTORY)
# Training split: every CSV in the directory whose name does not start with 'Val',
# in sorted order and prefixed with its sub-directory.
dataset_list = sorted(
    'tom/' + file
    for file in file_list
    if file.endswith('.csv') and not file.startswith('Val')
)
# Keep a handle on the TOM list; dataset_list is rebound by the next listing.
temp = dataset_list

In [4]:
PAP_DIRECTORY = '../data/pap/'
file_list = os.listdir(PAP_DIRECTORY)
# Training split: every CSV in the directory whose name does not start with 'Val',
# in sorted order and prefixed with its sub-directory.
dataset_list = sorted(
    'pap/' + file
    for file in file_list
    if file.endswith('.csv') and not file.startswith('Val')
)

In [5]:
# Append the current dataset_list to temp in place, then make dataset_list
# refer to that same combined list.
temp += dataset_list
dataset_list = temp

In [6]:
# min, max first
# MAXS / MINS hold the per-column maximum / minimum over all files in dataset_list.
temp_df = []
for FILENAME in dataset_list:
    env_df = pd.read_csv(DIRECTORY + FILENAME, index_col=['MEAS_DATE']).dropna(how='all')
    # Drop rows in which every column equals 0, interpolate the remaining gaps,
    # and continue with the plain NumPy values.
    env_df = env_df[~(env_df == 0).all(axis=1)].interpolate().values

    temp_df.append(env_df)

# Concatenate once and reuse the result for both reductions instead of
# building the full array twice.
all_rows = np.concatenate(temp_df, axis=0)
MAXS = all_rows.max(axis=0)
MINS = all_rows.min(axis=0)
del all_rows  # do not leave an extra copy of the data in the kernel namespace

In [7]:
# Build the U-Net / RNN training arrays: for every training file, hide values
# with a seeded random mask and cut sliding windows of SCREEN_SIZE rows
# (previous / current / next), plus the mask and the unmasked target of the
# current window. Reads DIRECTORY, SCREEN_SIZE, MINS and MAXS from earlier cells.
temp_current = []
temp_prev = []
temp_next = []
temp_mask = []
temp_label = []

# Re-list the training files (CSV files whose name does not start with 'Val').
TOM_DIRECTORY = '../data/tom/'
file_list = os.listdir(TOM_DIRECTORY)
dataset_list = sorted(
    'tom/' + file
    for file in file_list
    if file.endswith('.csv') and not file.startswith('Val')
)
temp = dataset_list

PAP_DIRECTORY = '../data/pap/'
file_list = os.listdir(PAP_DIRECTORY)
dataset_list = sorted(
    'pap/' + file
    for file in file_list
    if file.endswith('.csv') and not file.startswith('Val')
)

temp.extend(dataset_list)
dataset_list = temp

for FILENAME in dataset_list:
    env_df = pd.read_csv(DIRECTORY + FILENAME, index_col=['MEAS_DATE']).dropna(how='all')
    # Drop rows in which every column equals 0, interpolate the remaining gaps.
    env_df = env_df[~(env_df == 0).all(axis=1)].interpolate().values

    # Element-wise mask: 1 = value kept (probability 1 - null_prob), 0 = value hidden.
    # The seed is reset for every file, so the pattern is reproducible.
    np.random.seed(3101)
    null_prob = 0.3
    mask = np.random.choice(2, env_df.size, p=[null_prob, 1-null_prob]).reshape(env_df.shape)

    # Block mask: draw 48-row blocks without replacement and hide all columns
    # for every row of each drawn block.
    np.random.seed(4564)
    raw_null_prob = 0.3
    block_starts = np.random.choice(np.arange(int(env_df.shape[0]/48)-1),
                                    replace=False, size=int(env_df.shape[0]/48 * raw_null_prob)) * 48
    # Expand each block start into its 48 consecutive row indices by broadcasting.
    # This keeps an integer dtype even when no block was drawn (a Python-list
    # round trip would give an empty float array, which cannot be used as an
    # index) and avoids rebinding `_`, which IPython uses for the last output.
    raw_indices = (block_starts[:, np.newaxis] + np.arange(48)).ravel()
    raw_indices = np.unique(raw_indices[raw_indices < env_df.shape[0]])

    # Min-max scale every column with the precomputed extrema.
    env_df = (env_df - MINS)/(MAXS - MINS)

    mask[raw_indices, :] = 0
    # Hidden entries (mask == 0) are replaced by the sentinel value -1.
    missing_df = np.ma.array(env_df, mask=1-mask, fill_value=-1).filled()

    for INDEX in range(SCREEN_SIZE*2, env_df.shape[0]-SCREEN_SIZE):
        prev_start = INDEX - SCREEN_SIZE*2
        cur_start = INDEX - SCREEN_SIZE
        temp_prev.append(missing_df[prev_start:cur_start, :])
        temp_current.append(missing_df[cur_start:INDEX, :])
        temp_next.append(missing_df[INDEX:INDEX+SCREEN_SIZE, :])

        temp_mask.append(mask[cur_start:INDEX, :])
        temp_label.append(env_df[cur_start:INDEX, :])

temp_prev = np.stack(temp_prev)
temp_current = np.stack(temp_current)
temp_next = np.stack(temp_next)
temp_mask = np.stack(temp_mask)
temp_label = np.stack(temp_label)

# Channel order on the last axis: 0 = current, 1 = mask, 2 = previous, 3 = next.
raw_input = np.stack([temp_current, temp_mask, temp_prev, temp_next], axis = -1)
raw_label = temp_label[..., np.newaxis]

In [8]:
# Repeat the feature columns along axis 2 so the width approaches SCREEN_SIZE.
# The repeat count comes from each array's own current width rather than from
# a loop variable left over from another cell, so re-running this cell after
# the arrays have been tiled leaves them unchanged (the count becomes 1).
raw_input = np.concatenate([raw_input]*(SCREEN_SIZE // raw_input.shape[2]), axis=2)
raw_label = np.concatenate([raw_label]*(SCREEN_SIZE // raw_label.shape[2]), axis=2)

In [9]:
# Shapes of the training input and label arrays, one per line.
print(raw_input.shape, raw_label.shape, sep='\n')

(134993, 50, 50, 4)
(134993, 50, 50, 1)


In [10]:
i = 5
# Print the last-axis channels of sample i in the order 2, 0, 3, 1,
# then channel 0 of the matching label.
for channel in (2, 0, 3, 1):
    print(raw_input[i, ..., channel])
print(raw_label[i, ..., 0])

[[ 0.245125  0.376592  0.833015 ...  0.833015  0.173458  0.      ]
 [ 0.253482  0.372346 -1.       ... -1.        0.177036  0.      ]
 [ 0.269824 -1.        0.819099 ...  0.819099  0.183511  0.001335]
 ...
 [-1.       -1.       -1.       ... -1.       -1.       -1.      ]
 [-1.       -1.       -1.       ... -1.       -1.       -1.      ]
 [-1.       -1.       -1.       ... -1.       -1.       -1.      ]]
[[-1.       -1.       -1.       ... -1.       -1.       -1.      ]
 [-1.       -1.       -1.       ... -1.       -1.       -1.      ]
 [-1.       -1.       -1.       ... -1.       -1.       -1.      ]
 ...
 [-1.       -1.        0.877098 ...  0.877098  0.182771  0.      ]
 [ 0.272052  0.352641  0.870193 ...  0.870193  0.190664  0.001904]
 [ 0.290065  0.360625  0.854897 ...  0.854897 -1.        0.033487]]
[[ 0.303993  0.383727 -1.       ... -1.        0.210764  0.082926]
 [ 0.343918 -1.        0.883684 ...  0.883684  0.22125   0.146342]
 [ 0.353389  0.448616  0.892394 ...  0.892394  0.2

In [11]:
# Per-column scaling constants: maxima first, then minima.
print(MAXS, MINS, sep='\n')

[  54.88   37.95  100.   2999.   1669.92]
[  1.03 -20.92   5.86   0.     0.  ]


### Test data

In [12]:
TOM_DIRECTORY = '../data/tom/'
file_list = os.listdir(TOM_DIRECTORY)
# Test split: only the CSV files whose name starts with 'Val',
# in sorted order and prefixed with their sub-directory.
dataset_list = sorted(
    'tom/' + file
    for file in file_list
    if file.endswith('.csv') and file.startswith('Val')
)
# Keep a handle on the TOM list; dataset_list is rebound by the next listing.
temp = dataset_list

In [13]:
PAP_DIRECTORY = '../data/pap/'
file_list = os.listdir(PAP_DIRECTORY)
# Test split: only the CSV files whose name starts with 'Val',
# in sorted order and prefixed with their sub-directory.
dataset_list = sorted(
    'pap/' + file
    for file in file_list
    if file.endswith('.csv') and file.startswith('Val')
)

In [14]:
# Append the current dataset_list to temp in place, then make dataset_list
# refer to that same combined list.
temp += dataset_list
dataset_list = temp

In [15]:
# Build the U-Net / RNN test arrays from the files currently in dataset_list:
# hide values with a seeded random mask and cut sliding windows of SCREEN_SIZE
# rows (previous / current / next), plus the mask and the unmasked target of
# the current window. Reads DIRECTORY, SCREEN_SIZE, MINS and MAXS from earlier cells.
temp_current = []
temp_prev = []
temp_next = []
temp_mask = []
temp_label = []
for FILENAME in dataset_list:
    env_df = pd.read_csv(DIRECTORY + FILENAME, index_col=['MEAS_DATE']).dropna(how='all')
    # Drop rows in which every column equals 0, interpolate the remaining gaps.
    env_df = env_df[~(env_df == 0).all(axis=1)].interpolate().values

    # Element-wise mask: 1 = value kept (probability 1 - null_prob), 0 = value hidden.
    # The seed is reset for every file, so the pattern is reproducible.
    np.random.seed(3101)
    null_prob = 0.3
    mask = np.random.choice(2, env_df.size, p=[null_prob, 1-null_prob]).reshape(env_df.shape)

    # Block mask: draw 48-row blocks without replacement and hide all columns
    # for every row of each drawn block.
    np.random.seed(4564)
    raw_null_prob = 0.3
    block_starts = np.random.choice(np.arange(int(env_df.shape[0]/48)-1),
                                    replace=False, size=int(env_df.shape[0]/48 * raw_null_prob)) * 48
    # Expand each block start into its 48 consecutive row indices by broadcasting.
    # This keeps an integer dtype even when no block was drawn (a Python-list
    # round trip would give an empty float array, which cannot be used as an
    # index) and avoids rebinding `_`, which IPython uses for the last output.
    raw_indices = (block_starts[:, np.newaxis] + np.arange(48)).ravel()
    raw_indices = np.unique(raw_indices[raw_indices < env_df.shape[0]])

    # Min-max scale every column with the precomputed extrema.
    env_df = (env_df - MINS)/(MAXS - MINS)

    mask[raw_indices, :] = 0
    # Hidden entries (mask == 0) are replaced by the sentinel value -1.
    missing_df = np.ma.array(env_df, mask=1-mask, fill_value=-1).filled()

    for INDEX in range(SCREEN_SIZE*2, env_df.shape[0]-SCREEN_SIZE):
        prev_start = INDEX - SCREEN_SIZE*2
        cur_start = INDEX - SCREEN_SIZE
        temp_prev.append(missing_df[prev_start:cur_start, :])
        temp_current.append(missing_df[cur_start:INDEX, :])
        temp_next.append(missing_df[INDEX:INDEX+SCREEN_SIZE, :])

        temp_mask.append(mask[cur_start:INDEX, :])
        temp_label.append(env_df[cur_start:INDEX, :])

temp_prev = np.stack(temp_prev)
temp_current = np.stack(temp_current)
temp_next = np.stack(temp_next)
temp_mask = np.stack(temp_mask)
temp_label = np.stack(temp_label)

# Channel order on the last axis: 0 = current, 1 = mask, 2 = previous, 3 = next.
test_input = np.stack([temp_current, temp_mask, temp_prev, temp_next], axis = -1)
test_label = temp_label[..., np.newaxis]

In [16]:
# Repeat the feature columns along axis 2 so the width approaches SCREEN_SIZE.
# The repeat count comes from each array's own current width rather than from
# a loop variable left over from another cell, so re-running this cell after
# the arrays have been tiled leaves them unchanged (the count becomes 1).
test_input = np.concatenate([test_input]*(SCREEN_SIZE // test_input.shape[2]), axis=2)
test_label = np.concatenate([test_label]*(SCREEN_SIZE // test_label.shape[2]), axis=2)

In [17]:
# Shapes of the test input and label arrays, one per line.
print(test_input.shape, test_label.shape, sep='\n')

(78263, 50, 50, 4)
(78263, 50, 50, 1)


In [18]:
# Write the train/test arrays, the scaling constants and the window size to a
# single .npz archive. The context manager closes the file even if np.savez raises.
with open('./data/tot_dataset_%s_UNet.npz' % str(SCREEN_SIZE), 'wb') as f:
    np.savez(f,
             raw_input = raw_input,
             raw_label = raw_label,
             test_input = test_input,
             test_label = test_label,
             MAXS = MAXS,
             MINS = MINS,
             SCREEN_SIZE = SCREEN_SIZE
            )

### FFNN & Linear interpolation

# Training split for the FFNN / linear-interpolation data: CSV files whose
# name does not start with 'Val', TOM files first, then PAP files.
TOM_DIRECTORY = '../data/tom/'
file_list = os.listdir(TOM_DIRECTORY)
dataset_list = sorted(
    'tom/' + file
    for file in file_list
    if file.endswith('.csv') and not file.startswith('Val')
)
temp = dataset_list

PAP_DIRECTORY = '../data/pap/'
file_list = os.listdir(PAP_DIRECTORY)
dataset_list = sorted(
    'pap/' + file
    for file in file_list
    if file.endswith('.csv') and not file.startswith('Val')
)

# Merge in place; both names end up referring to the same combined list.
temp += dataset_list
dataset_list = temp

# min, max first
# MAXS / MINS hold the per-column maximum / minimum over all files in dataset_list.
temp_df = []
for FILENAME in dataset_list:
    env_df = pd.read_csv(DIRECTORY + FILENAME, index_col=['MEAS_DATE']).dropna(how='all')
    # Drop rows in which every column equals 0, interpolate the remaining gaps,
    # and continue with the plain NumPy values.
    env_df = env_df[~(env_df == 0).all(axis=1)].interpolate().values

    temp_df.append(env_df)

# Concatenate once and reuse the result for both reductions instead of
# building the full array twice.
all_rows = np.concatenate(temp_df, axis=0)
MAXS = all_rows.max(axis=0)
MINS = all_rows.min(axis=0)
del all_rows  # do not leave an extra copy of the data in the kernel namespace

# Build the FFNN / linear-interpolation training arrays from the files in
# dataset_list: hide values with a seeded random mask and emit, for every row,
# the previous / current / next masked rows plus the mask and the unmasked
# target of the current row. Reads DIRECTORY, MINS and MAXS from the notebook state.
temp_current = []
temp_prev = []
temp_next = []
temp_mask = []
temp_label = []
for FILENAME in dataset_list:
    env_df = pd.read_csv(DIRECTORY + FILENAME, index_col=['MEAS_DATE']).dropna(how='all')
    # Drop rows in which every column equals 0, interpolate the remaining gaps.
    env_df = env_df[~(env_df == 0).all(axis=1)].interpolate().values

    # Element-wise mask: 1 = value kept (probability 1 - null_prob), 0 = value hidden.
    # The seed is reset for every file, so the pattern is reproducible.
    np.random.seed(3101)
    null_prob = 0.3
    mask = np.random.choice(2, env_df.size, p=[null_prob, 1-null_prob]).reshape(env_df.shape)

    # Block mask: draw 48-row blocks without replacement and hide all columns
    # for every row of each drawn block.
    np.random.seed(4564)
    raw_null_prob = 0.3
    block_starts = np.random.choice(np.arange(int(env_df.shape[0]/48)-1),
                                    replace=False, size=int(env_df.shape[0]/48 * raw_null_prob)) * 48
    # Expand each block start into its 48 consecutive row indices by broadcasting.
    # This keeps an integer dtype even when no block was drawn (a Python-list
    # round trip would give an empty float array, which cannot be used as an
    # index) and avoids rebinding `_`, which IPython uses for the last output.
    raw_indices = (block_starts[:, np.newaxis] + np.arange(48)).ravel()
    raw_indices = np.unique(raw_indices[raw_indices < env_df.shape[0]])

    # Min-max scale every column with the precomputed extrema.
    env_df = (env_df - MINS)/(MAXS - MINS)

    mask[raw_indices, :] = 0
    # Hidden entries become 0 here (multiplication by the 0/1 mask).
    missing_df = env_df*mask

    for INDEX in range(2, env_df.shape[0]-1):
        cur_start = INDEX - 1
        temp_prev.append(missing_df[(INDEX-2):cur_start, :])
        temp_current.append(missing_df[cur_start:INDEX, :])
        temp_next.append(missing_df[INDEX:INDEX+1, :])

        temp_mask.append(mask[cur_start:INDEX, :])
        temp_label.append(env_df[cur_start:INDEX, :])

temp_prev = np.stack(temp_prev)
temp_current = np.stack(temp_current)
temp_next = np.stack(temp_next)
temp_mask = np.stack(temp_mask)
temp_label = np.stack(temp_label)

# Channel order on the last axis: 0 = current, 1 = mask, 2 = previous, 3 = next.
raw_input = np.stack([temp_current, temp_mask, temp_prev, temp_next], axis = -1)
raw_label = temp_label[..., np.newaxis]

print(raw_input.shape, raw_label.shape, sep='\n')

# Remove the length-1 axis 1 and place the four last-axis channels next to
# each other along axis 1, giving one flat feature vector per sample.
raw_input = np.concatenate(
    [raw_input.squeeze(axis=1)[..., channel] for channel in range(4)],
    axis=1,
)
raw_label = raw_label.squeeze(axis=1)[..., 0]

print(raw_input.shape, raw_label.shape, sep='\n')

# Test split for the FFNN / linear-interpolation data: only CSV files whose
# name starts with 'Val', TOM files first, then PAP files.
TOM_DIRECTORY = '../data/tom/'
file_list = os.listdir(TOM_DIRECTORY)
dataset_list = sorted(
    'tom/' + file
    for file in file_list
    if file.endswith('.csv') and file.startswith('Val')
)
temp = dataset_list

PAP_DIRECTORY = '../data/pap/'
file_list = os.listdir(PAP_DIRECTORY)
dataset_list = sorted(
    'pap/' + file
    for file in file_list
    if file.endswith('.csv') and file.startswith('Val')
)

# Merge in place; both names end up referring to the same combined list.
temp += dataset_list
dataset_list = temp

# Build the FFNN / linear-interpolation test arrays from the files in
# dataset_list: hide values with a seeded random mask and emit, for every row,
# the previous / current / next masked rows plus the mask and the unmasked
# target of the current row. Reads DIRECTORY, MINS and MAXS from the notebook state.
temp_current = []
temp_prev = []
temp_next = []
temp_mask = []
temp_label = []
for FILENAME in dataset_list:
    env_df = pd.read_csv(DIRECTORY + FILENAME, index_col=['MEAS_DATE']).dropna(how='all')
    # Drop rows in which every column equals 0, interpolate the remaining gaps.
    env_df = env_df[~(env_df == 0).all(axis=1)].interpolate().values

    # Element-wise mask: 1 = value kept (probability 1 - null_prob), 0 = value hidden.
    # The seed is reset for every file, so the pattern is reproducible.
    np.random.seed(3101)
    null_prob = 0.3
    mask = np.random.choice(2, env_df.size, p=[null_prob, 1-null_prob]).reshape(env_df.shape)

    # Block mask: draw 48-row blocks without replacement and hide all columns
    # for every row of each drawn block.
    np.random.seed(4564)
    raw_null_prob = 0.3
    block_starts = np.random.choice(np.arange(int(env_df.shape[0]/48)-1),
                                    replace=False, size=int(env_df.shape[0]/48 * raw_null_prob)) * 48
    # Expand each block start into its 48 consecutive row indices by broadcasting.
    # This keeps an integer dtype even when no block was drawn (a Python-list
    # round trip would give an empty float array, which cannot be used as an
    # index) and avoids rebinding `_`, which IPython uses for the last output.
    raw_indices = (block_starts[:, np.newaxis] + np.arange(48)).ravel()
    raw_indices = np.unique(raw_indices[raw_indices < env_df.shape[0]])

    # Min-max scale every column with the precomputed extrema.
    env_df = (env_df - MINS)/(MAXS - MINS)

    mask[raw_indices, :] = 0
    # Hidden entries become 0 here (multiplication by the 0/1 mask).
    missing_df = env_df*mask

    for INDEX in range(2, env_df.shape[0]-1):
        cur_start = INDEX - 1
        temp_prev.append(missing_df[(INDEX-2):cur_start, :])
        temp_current.append(missing_df[cur_start:INDEX, :])
        temp_next.append(missing_df[INDEX:INDEX+1, :])

        temp_mask.append(mask[cur_start:INDEX, :])
        temp_label.append(env_df[cur_start:INDEX, :])

temp_prev = np.stack(temp_prev)
temp_current = np.stack(temp_current)
temp_next = np.stack(temp_next)
temp_mask = np.stack(temp_mask)
temp_label = np.stack(temp_label)

# Channel order on the last axis: 0 = current, 1 = mask, 2 = previous, 3 = next.
test_input = np.stack([temp_current, temp_mask, temp_prev, temp_next], axis = -1)
test_label = temp_label[..., np.newaxis]

print(test_input.shape, test_label.shape, sep='\n')

# Remove the length-1 axis 1 and place the four last-axis channels next to
# each other along axis 1, giving one flat feature vector per sample.
test_input = np.concatenate(
    [test_input.squeeze(axis=1)[..., channel] for channel in range(4)],
    axis=1,
)
test_label = test_label.squeeze(axis=1)[..., 0]

print(test_input.shape, test_label.shape, sep='\n')

# Write the flattened train/test arrays and the scaling constants to a single
# .npz archive. The context manager closes the file even if np.savez raises.
with open('./data/tot_dataset_ffnn.npz', 'wb') as f:
    np.savez(f,
             raw_input = raw_input,
             raw_label = raw_label,
             test_input = test_input,
             test_label = test_label,
             MAXS = MAXS,
             MINS = MINS
            )