In [1]:
import os

import numpy as np
import pandas as pd

In [2]:
# Number of consecutive rows taken as one input window.
TIME_STEPS = 24
# Number of consecutive rows, directly after the input window, taken as its label.
OUTPUT_SIZE = 24

## Training data

In [3]:
# Collect the CSV files of this section: everything in DIRECTORY that ends in
# '.csv' and is not prefixed with 'Val', in sorted order.
DIRECTORY = './env_set/'
file_list = os.listdir(DIRECTORY)
dataset_list = sorted(
    name for name in file_list
    if name.endswith('.csv') and not name.startswith('Val')
)

In [4]:
# Display the sorted file names that the following cells iterate over.
dataset_list

['PF_0000336_pap_env.csv',
 'PF_0000587_pap_env.csv',
 'PF_0001122_pap_env.csv',
 'PF_0001284_pap_env.csv',
 'PF_0001394_pap_env.csv',
 'PF_0001399_pap_env.csv',
 'PF_0001401_pap_env.csv',
 'PF_0001403_pap_env.csv',
 'PF_0002322_pap_env.csv',
 'PF_0002633_pap_env.csv']

In [7]:
# Count how many 24-row chunks each cleaned file contains, then total them.
rows_per_file = []
for FILENAME in dataset_list:
    raw_df = pd.read_csv(DIRECTORY + FILENAME, index_col=['MEAS_DATE'])
    # Drop rows that are entirely NaN, then rows that are entirely zero.
    raw_df = raw_df.dropna(how='all')
    all_zero_rows = (raw_df == 0).all(axis=1)
    # Fill the remaining gaps by interpolation and switch to a plain array.
    env_df = raw_df[~all_zero_rows].interpolate().values
    rows_per_file.append(env_df.shape[0])
num_data = np.array([rows / 24 for rows in rows_per_file])
num_data.sum()

2461.0

In [5]:
# Build sliding (input, label) windows from every file and split them into
# train and test sets.
#
# Per file, the usable index range is cut into 10 equal segments of `slicer`
# rows; 3 of them are drawn as test segments. A window is identified by INDEX,
# the first row of its label: the input is the TIME_STEPS rows before INDEX and
# the label is the OUTPUT_SIZE rows starting at INDEX.
#   * INDEX outside every test segment            -> train window
#   * INDEX inside a test segment, at least TIME_STEPS rows after its start
#     and more than OUTPUT_SIZE rows before its end -> test window
#   * any other INDEX inside a test segment       -> discarded
#
# NOTE(review): train windows anchored just outside a test segment still reach
# into that segment's rows, so train and test windows can share rows at the
# segment borders. Behaviour is kept as-is; confirm this overlap is acceptable.
temp_train_df = []  # not used by the visible cells; kept for compatibility
temp_train_input = []
temp_train_label = []
temp_test_df = []  # not used by the visible cells; kept for compatibility
temp_test_input = []
temp_test_label = []
for FILENAME in dataset_list:
    env_df = pd.read_csv(DIRECTORY + FILENAME, index_col=['MEAS_DATE']).dropna(how='all')
    # Drop all-zero rows, interpolate the remaining gaps, keep a plain array.
    env_df = env_df[~(env_df == 0).all(axis=1)].interpolate().values
    # Re-seeding for every file makes the same segment numbers get drawn each time.
    np.random.seed(3101)
    slicer = int((env_df.shape[0]-OUTPUT_SIZE - TIME_STEPS)/10)
    test_index_start = (np.random.choice(10, 3, replace=False)*slicer).astype('int')
    test_index_bound = np.concatenate([np.arange(_+TIME_STEPS, _+slicer-OUTPUT_SIZE) for _ in test_index_start])
    test_index_start = np.concatenate([np.arange(_, _+slicer) for _ in test_index_start])
    # `INDEX in ndarray` scans the whole array on every iteration; sets give the
    # same membership answers in constant time.
    test_segment_rows = set(test_index_start.tolist())
    test_anchor_rows = set(test_index_bound.tolist())
    for INDEX in range(TIME_STEPS, env_df.shape[0]-OUTPUT_SIZE):
        if INDEX in test_segment_rows:
            if INDEX not in test_anchor_rows:
                # Too close to the segment edge: belongs to neither split.
                continue
            temp_test_input.append(env_df[(INDEX-TIME_STEPS):INDEX, :])
            temp_test_label.append(env_df[INDEX:(INDEX+OUTPUT_SIZE), :])
        else:
            temp_train_input.append(env_df[(INDEX-TIME_STEPS):INDEX, :])
            temp_train_label.append(env_df[INDEX:(INDEX+OUTPUT_SIZE), :])
# Windows are stacked row-wise here (2-D: rows x features); a later cell
# reshapes them back into separate windows.
train_input = np.concatenate(temp_train_input)
train_label = np.concatenate(temp_train_label)
test_input = np.concatenate(temp_test_input)
test_label = np.concatenate(temp_test_label)

In [6]:
# Per-feature extremes of the training inputs. `train_input` is still the 2-D
# row-stacked array at this point, so reducing over axis 0 leaves one value per
# feature column.
MINS = np.min(train_input, axis=0)
MAXS = np.max(train_input, axis=0)

In [7]:
# Fold the row-stacked arrays back into (windows, rows per window, features).
# The feature count is read from each array's own last axis rather than from
# the leftover loop variable `env_df`, so the cell does not depend on which
# file happened to be read last.
train_input = train_input.reshape(-1, TIME_STEPS, train_input.shape[-1])
train_label = train_label.reshape(-1, OUTPUT_SIZE, train_label.shape[-1])
test_input = test_input.reshape(-1, TIME_STEPS, test_input.shape[-1])
test_label = test_label.reshape(-1, OUTPUT_SIZE, test_label.shape[-1])

In [8]:
# Shapes after reshaping: (number of windows, rows per window, features).
print(train_input.shape)
print(train_label.shape)
print(test_input.shape)
print(test_label.shape)

(41025, 24, 5)
(41025, 24, 5)
(16119, 24, 5)
(16119, 24, 5)


## Normalization

In [9]:
# Per-feature maxima and minima that the min-max scaling below is based on.
print(MAXS)
print(MINS)

[  42.4    37.35  100.   2999.    969.27]
[  8.15 -20.92  27.87   1.67   0.  ]


In [10]:
# Min-max scale all four arrays with the per-feature MINS/MAXS; the trailing
# feature axis broadcasts against the length-F vectors.
# Note: the arrays are overwritten, so running this cell twice scales twice.
feature_range = MAXS - MINS

train_input = (train_input - MINS) / feature_range
train_label = (train_label - MINS) / feature_range

test_input = (test_input - MINS) / feature_range
test_label = (test_label - MINS) / feature_range

In [11]:
# Scaling is element-wise, so the shape is the same as before normalization.
train_input.shape

(41025, 24, 5)

In [12]:
# Largest scaled value per feature (reduced over windows, then over rows).
# MAXS/MINS come from the training inputs only, so the other arrays can
# exceed 1 wherever they contain values above the training-input maximum.
print(train_input.max(axis=0).max(axis=0))
print(train_label.max(axis=0).max(axis=0))
print()
print(test_input.max(axis=0).max(axis=0))
print(test_label.max(axis=0).max(axis=0))

[1. 1. 1. 1. 1.]
[1.         1.         1.         1.         1.06028248]

[1.36437956 0.9689377  1.         0.9964802  0.98788779]
[1.36437956 0.9689377  1.         0.9964802  0.98788779]


In [13]:
# Final shapes of the training arrays that get saved below.
print(train_input.shape)
print(train_label.shape)

(41025, 24, 5)
(41025, 24, 5)


In [14]:
# Final shapes of the test arrays that get saved below.
print(test_input.shape)
print(test_label.shape)

(16119, 24, 5)
(16119, 24, 5)


In [15]:
# Persist the scaled arrays together with the scaling constants and window
# sizes. The context manager closes the file even if np.savez raises.
with open('./env_set/dataset.npz', 'wb') as f:
    np.savez(f,
             train_input = train_input,
             train_label = train_label,
             test_input = test_input,
             test_label = test_label,
             MAXS = MAXS,
             MINS = MINS,
             TIME_STEPS = TIME_STEPS,
             OUTPUT_SIZE = OUTPUT_SIZE
            )

## Test data (held-out `Val_*` files)

In [8]:
# Collect the CSV files of this section: only those in DIRECTORY whose names
# start with 'Val' and end in '.csv', in sorted order.
DIRECTORY = './env_set/'
file_list = os.listdir(DIRECTORY)
dataset_list = sorted(
    name for name in file_list
    if name.endswith('.csv') and name.startswith('Val')
)

In [9]:
# Display the sorted 'Val'-prefixed file names that the following cells iterate over.
dataset_list

['Val_PF_0001288_pap_env.csv',
 'Val_PF_0001393_pap_env.csv',
 'Val_PF_0001400_pap_env.csv',
 'Val_PF_0002537_pap_env.csv']

In [10]:
# Count how many 24-row chunks each cleaned file contains, then total them.
rows_per_file = []
for FILENAME in dataset_list:
    raw_df = pd.read_csv(DIRECTORY + FILENAME, index_col=['MEAS_DATE'])
    # Drop rows that are entirely NaN, then rows that are entirely zero.
    raw_df = raw_df.dropna(how='all')
    all_zero_rows = (raw_df == 0).all(axis=1)
    # Fill the remaining gaps by interpolation and switch to a plain array.
    env_df = raw_df[~all_zero_rows].interpolate().values
    rows_per_file.append(env_df.shape[0])
num_data = np.array([rows / 24 for rows in rows_per_file])
num_data.sum()

1343.9583333333335

In [18]:
# Build sliding (input, label) windows from every file and split them into
# the "train" and "test" lists.
#
# Per file, the usable index range is cut into 10 equal segments of `slicer`
# rows; 7 of them are drawn as test segments. A window is identified by INDEX,
# the first row of its label: the input is the TIME_STEPS rows before INDEX and
# the label is the OUTPUT_SIZE rows starting at INDEX.
#   * INDEX outside every test segment            -> train window
#   * INDEX inside a test segment, at least TIME_STEPS rows after its start
#     and more than OUTPUT_SIZE rows before its end -> test window
#   * any other INDEX inside a test segment       -> discarded
#
# NOTE(review): train windows anchored just outside a test segment still reach
# into that segment's rows, so train and test windows can share rows at the
# segment borders. Behaviour is kept as-is; confirm this overlap is acceptable.
temp_train_df = []  # not used by the visible cells; kept for compatibility
temp_train_input = []
temp_train_label = []
temp_test_df = []  # not used by the visible cells; kept for compatibility
temp_test_input = []
temp_test_label = []
for FILENAME in dataset_list:
    env_df = pd.read_csv(DIRECTORY + FILENAME, index_col=['MEAS_DATE']).dropna(how='all')
    # Drop all-zero rows, interpolate the remaining gaps, keep a plain array.
    env_df = env_df[~(env_df == 0).all(axis=1)].interpolate().values
    # Re-seeding for every file makes the same segment numbers get drawn each time.
    np.random.seed(3101)
    slicer = int((env_df.shape[0]-OUTPUT_SIZE - TIME_STEPS)/10)
    test_index_start = (np.random.choice(10, 7, replace=False)*slicer).astype('int')
    test_index_bound = np.concatenate([np.arange(_+TIME_STEPS, _+slicer-OUTPUT_SIZE) for _ in test_index_start])
    test_index_start = np.concatenate([np.arange(_, _+slicer) for _ in test_index_start])
    # `INDEX in ndarray` scans the whole array on every iteration; sets give the
    # same membership answers in constant time.
    test_segment_rows = set(test_index_start.tolist())
    test_anchor_rows = set(test_index_bound.tolist())
    for INDEX in range(TIME_STEPS, env_df.shape[0]-OUTPUT_SIZE):
        if INDEX in test_segment_rows:
            if INDEX not in test_anchor_rows:
                # Too close to the segment edge: belongs to neither split.
                continue
            temp_test_input.append(env_df[(INDEX-TIME_STEPS):INDEX, :])
            temp_test_label.append(env_df[INDEX:(INDEX+OUTPUT_SIZE), :])
        else:
            temp_train_input.append(env_df[(INDEX-TIME_STEPS):INDEX, :])
            temp_train_label.append(env_df[INDEX:(INDEX+OUTPUT_SIZE), :])
# Windows are stacked row-wise here (2-D: rows x features); a later cell
# reshapes them back into separate windows.
train_input = np.concatenate(temp_train_input)
train_label = np.concatenate(temp_train_label)
test_input = np.concatenate(temp_test_input)
test_label = np.concatenate(temp_test_label)

In [19]:
# Fold the row-stacked arrays back into (windows, rows per window, features).
# The feature count is read from each array's own last axis rather than from
# the leftover loop variable `env_df`, so the cell does not depend on which
# file happened to be read last.
train_input = train_input.reshape(-1, TIME_STEPS, train_input.shape[-1])
train_label = train_label.reshape(-1, OUTPUT_SIZE, train_label.shape[-1])
test_input = test_input.reshape(-1, TIME_STEPS, test_input.shape[-1])
test_label = test_label.reshape(-1, OUTPUT_SIZE, test_label.shape[-1])

In [20]:
# Shapes after reshaping: (number of windows, rows per window, features).
print(train_input.shape)
print(train_label.shape)
print(test_input.shape)
print(test_label.shape)

(9731, 24, 5)
(9731, 24, 5)
(21084, 24, 5)
(21084, 24, 5)


## Normalization

In [21]:
# MAXS/MINS are not recomputed in this section; these are the values already
# in the kernel, and the scaling below uses them as-is.
print(MAXS)
print(MINS)

[  42.4    37.35  100.   2999.    969.27]
[  8.15 -20.92  27.87   1.67   0.  ]


In [22]:
# Min-max scale all four arrays with the existing per-feature MINS/MAXS (no
# new extremes are derived from this section's data); the trailing feature
# axis broadcasts against the length-F vectors.
# Note: the arrays are overwritten, so running this cell twice scales twice.
feature_range = MAXS - MINS

train_input = (train_input - MINS) / feature_range
train_label = (train_label - MINS) / feature_range

test_input = (test_input - MINS) / feature_range
test_label = (test_label - MINS) / feature_range

In [23]:
# Scaling is element-wise, so the shape is the same as before normalization.
train_input.shape

(9731, 24, 5)

In [24]:
# Largest scaled value per feature (reduced over windows, then over rows).
# A value above 1 means this data exceeds the MAXS used for scaling.
print(train_input.max(axis=0).max(axis=0))
print(train_label.max(axis=0).max(axis=0))
print()
print(test_input.max(axis=0).max(axis=0))
print(test_label.max(axis=0).max(axis=0))

[1.34540146 0.99502317 1.0176071  0.99899911 0.98769177]
[1.34540146 0.99502317 1.0176071  0.99899911 0.98769177]

[1.09138686 0.99038957 1.         0.75523216 1.01530017]
[1.09138686 0.99038957 1.         0.75523216 1.01530017]


In [25]:
# Final shapes of the "train" arrays that get saved below.
print(train_input.shape)
print(train_label.shape)

(9731, 24, 5)
(9731, 24, 5)


In [26]:
# Final shapes of the "test" arrays that get saved below.
print(test_input.shape)
print(test_label.shape)

(21084, 24, 5)
(21084, 24, 5)


In [27]:
# Persist the scaled arrays together with the scaling constants and window
# sizes. MAXS/MINS are the values used for scaling above; this section does
# not recompute them. The context manager closes the file even if np.savez
# raises.
with open('./env_set/val_dataset.npz', 'wb') as f:
    np.savez(f,
             train_input = train_input,
             train_label = train_label,
             test_input = test_input,
             test_label = test_label,
             MAXS = MAXS,
             MINS = MINS,
             TIME_STEPS = TIME_STEPS,
             OUTPUT_SIZE = OUTPUT_SIZE
            )

## Test data - Tomato

In [11]:
# Collect the CSV files of this section: every '.csv' file in DIRECTORY
# (no prefix filter), in sorted order.
DIRECTORY = './env_set/tom_env/'
file_list = os.listdir(DIRECTORY)
dataset_list = sorted(name for name in file_list if name.endswith('.csv'))

In [12]:
# Display the sorted file names that the following cells iterate over.
dataset_list

['PFS_0000008_tom_env.csv',
 'PF_0000227_tom_env.csv',
 'PF_0000304_tom_env.csv',
 'PF_0001405_tom_env.csv',
 'PF_0002525_tom_env.csv',
 'PF_0002526_tom_env.csv',
 'PF_0002527_tom_env.csv',
 'PF_0002529_tom_env.csv',
 'PF_0002533_tom_env.csv',
 'Val_PF_0001396_tom_env.csv',
 'Val_PF_0002528_tom_env.csv',
 'Val_PF_0002531_tom_env.csv',
 'Val_PF_0002532_tom_env.csv']

In [13]:
# Count how many 24-row chunks each cleaned file contains, then total them.
rows_per_file = []
for FILENAME in dataset_list:
    raw_df = pd.read_csv(DIRECTORY + FILENAME, index_col=['MEAS_DATE'])
    # Drop rows that are entirely NaN, then rows that are entirely zero.
    raw_df = raw_df.dropna(how='all')
    all_zero_rows = (raw_df == 0).all(axis=1)
    # Fill the remaining gaps by interpolation and switch to a plain array.
    env_df = raw_df[~all_zero_rows].interpolate().values
    rows_per_file.append(env_df.shape[0])
num_data = np.array([rows / 24 for rows in rows_per_file])
num_data.sum()

5249.458333333333

In [29]:
# Build sliding (input, label) windows from every file and split them into
# the "train" and "test" lists.
#
# Per file, the usable index range is cut into 10 equal segments of `slicer`
# rows; 7 of them are drawn as test segments. A window is identified by INDEX,
# the first row of its label: the input is the TIME_STEPS rows before INDEX and
# the label is the OUTPUT_SIZE rows starting at INDEX.
#   * INDEX outside every test segment            -> train window
#   * INDEX inside a test segment, at least TIME_STEPS rows after its start
#     and more than OUTPUT_SIZE rows before its end -> test window
#   * any other INDEX inside a test segment       -> discarded
#
# NOTE(review): train windows anchored just outside a test segment still reach
# into that segment's rows, so train and test windows can share rows at the
# segment borders. Behaviour is kept as-is; confirm this overlap is acceptable.
temp_train_df = []  # not used by the visible cells; kept for compatibility
temp_train_input = []
temp_train_label = []
temp_test_df = []  # not used by the visible cells; kept for compatibility
temp_test_input = []
temp_test_label = []
for FILENAME in dataset_list:
    env_df = pd.read_csv(DIRECTORY + FILENAME, index_col=['MEAS_DATE']).dropna(how='all')
    # Drop all-zero rows, interpolate the remaining gaps, keep a plain array.
    env_df = env_df[~(env_df == 0).all(axis=1)].interpolate().values
    # Re-seeding for every file makes the same segment numbers get drawn each time.
    np.random.seed(3101)
    slicer = int((env_df.shape[0]-OUTPUT_SIZE - TIME_STEPS)/10)
    test_index_start = (np.random.choice(10, 7, replace=False)*slicer).astype('int')
    test_index_bound = np.concatenate([np.arange(_+TIME_STEPS, _+slicer-OUTPUT_SIZE) for _ in test_index_start])
    test_index_start = np.concatenate([np.arange(_, _+slicer) for _ in test_index_start])
    # `INDEX in ndarray` scans the whole array on every iteration; sets give the
    # same membership answers in constant time.
    test_segment_rows = set(test_index_start.tolist())
    test_anchor_rows = set(test_index_bound.tolist())
    for INDEX in range(TIME_STEPS, env_df.shape[0]-OUTPUT_SIZE):
        if INDEX in test_segment_rows:
            if INDEX not in test_anchor_rows:
                # Too close to the segment edge: belongs to neither split.
                continue
            temp_test_input.append(env_df[(INDEX-TIME_STEPS):INDEX, :])
            temp_test_label.append(env_df[INDEX:(INDEX+OUTPUT_SIZE), :])
        else:
            temp_train_input.append(env_df[(INDEX-TIME_STEPS):INDEX, :])
            temp_train_label.append(env_df[INDEX:(INDEX+OUTPUT_SIZE), :])
# Windows are stacked row-wise here (2-D: rows x features); a later cell
# reshapes them back into separate windows.
train_input = np.concatenate(temp_train_input)
train_label = np.concatenate(temp_train_label)
test_input = np.concatenate(temp_test_input)
test_label = np.concatenate(temp_test_label)

In [30]:
# Fold the row-stacked arrays back into (windows, rows per window, features).
# The feature count is read from each array's own last axis rather than from
# the leftover loop variable `env_df`, so the cell does not depend on which
# file happened to be read last.
train_input = train_input.reshape(-1, TIME_STEPS, train_input.shape[-1])
train_label = train_label.reshape(-1, OUTPUT_SIZE, train_label.shape[-1])
test_input = test_input.reshape(-1, TIME_STEPS, test_input.shape[-1])
test_label = test_label.reshape(-1, OUTPUT_SIZE, test_label.shape[-1])

In [31]:
# Shapes after reshaping: (number of windows, rows per window, features).
print(train_input.shape)
print(train_label.shape)
print(test_input.shape)
print(test_label.shape)

(37951, 24, 5)
(37951, 24, 5)
(83356, 24, 5)
(83356, 24, 5)


## Normalization

In [32]:
# MAXS/MINS are not recomputed in this section; these are the values already
# in the kernel, and the scaling below uses them as-is.
print(MAXS)
print(MINS)

[  42.4    37.35  100.   2999.    969.27]
[  8.15 -20.92  27.87   1.67   0.  ]


In [33]:
# Min-max scale all four arrays with the existing per-feature MINS/MAXS (no
# new extremes are derived from this section's data); the trailing feature
# axis broadcasts against the length-F vectors.
# Note: the arrays are overwritten, so running this cell twice scales twice.
feature_range = MAXS - MINS

train_input = (train_input - MINS) / feature_range
train_label = (train_label - MINS) / feature_range

test_input = (test_input - MINS) / feature_range
test_label = (test_label - MINS) / feature_range

In [34]:
# Scaling is element-wise, so the shape is the same as before normalization.
train_input.shape

(37951, 24, 5)

In [35]:
# Largest scaled value per feature (reduced over windows, then over rows).
# A value above 1 means this data exceeds the MAXS used for scaling.
print(train_input.max(axis=0).max(axis=0))
print(train_label.max(axis=0).max(axis=0))
print()
print(test_input.max(axis=0).max(axis=0))
print(test_label.max(axis=0).max(axis=0))

[1.40905109 1.00154453 1.         0.62308121 1.7228636 ]
[1.40905109 1.00154453 1.         0.62308121 1.7228636 ]

[1.52116788 1.01029689 1.         0.79417682 1.45896396]
[1.52116788 1.01029689 1.         0.79417682 1.45896396]


In [36]:
# Final shapes of the "train" arrays that get saved below.
print(train_input.shape)
print(train_label.shape)

(37951, 24, 5)
(37951, 24, 5)


In [37]:
# Final shapes of the "test" arrays that get saved below.
print(test_input.shape)
print(test_label.shape)

(83356, 24, 5)
(83356, 24, 5)


In [38]:
# Persist the scaled arrays together with the scaling constants and window
# sizes. MAXS/MINS are the values used for scaling above; this section does
# not recompute them. The context manager closes the file even if np.savez
# raises.
with open('./env_set/val_tom_dataset.npz', 'wb') as f:
    np.savez(f,
             train_input = train_input,
             train_label = train_label,
             test_input = test_input,
             test_label = test_label,
             MAXS = MAXS,
             MINS = MINS,
             TIME_STEPS = TIME_STEPS,
             OUTPUT_SIZE = OUTPUT_SIZE
            )