In [1]:
import pandas as pd
import numpy as np

# Keep printed arrays compact: fixed-point notation with six decimals
# (suppress=True avoids scientific notation for small values).
np.set_printoptions(suppress=True, precision=6)

In [2]:
# Load the train / validation / test splits; the first CSV column is used
# as the index of each frame.
df_train, df_valid, df_test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        './data/second-order/Centar_train',
        './data/second-order/Centar_validation',
        './data/second-order/Centar_test',
    )
)

In [3]:
# Preview the first five rows of the training split.
df_train.head(n=5)

Unnamed: 0_level_0,PM10,PM25,hour_sin,hour_cos,weekday_sin,weekday_cos,month_sin,month_cos,weekend,holiday,...,precip_intensity,precip_probability,temperature,uv_index,visibility,wind_speed,wind_bearing_sin,wind_bearing_cos,PM10_missing,PM25_missing
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2011-09-13 18:00:00,-0.330611,0.856226,0.0,0.5,0.900969,0.801938,0.900969,0.801938,0.0,0.0,...,0.0,0.0,2.009019,0.0,0.6206,0.225955,0.620961,0.985148,0.0,0.0
2011-09-13 19:00:00,-0.408947,0.890166,0.017037,0.62941,0.900969,0.801938,0.900969,0.801938,0.0,0.0,...,0.0,0.0,1.804491,0.0,0.6206,0.264156,0.569587,0.995134,0.0,0.0
2011-09-13 20:00:00,-0.468705,0.821216,0.066987,0.75,0.900969,0.801938,0.900969,0.801938,0.0,0.0,...,0.0,0.0,1.521727,0.0,0.6206,0.21864,0.719186,0.949397,0.0,0.0
2011-09-13 21:00:00,0.390181,1.59406,0.146447,0.853553,0.900969,0.801938,0.900969,0.801938,0.0,0.0,...,0.0,0.0,1.337316,0.0,0.6206,0.091574,0.32899,0.969846,0.0,0.0
2011-09-13 22:00:00,0.616925,1.795114,0.25,0.933013,0.900969,0.801938,0.900969,0.801938,0.0,0.0,...,0.0,0.0,1.09367,0.0,0.6206,0.087781,0.430413,0.995134,0.0,0.0


In [6]:
def build_seq2seq_datasets(dataset, mode, history = 24, target_size = 12):
    """Slice a time-indexed frame into encoder/decoder windows for a seq2seq model.

    For every row position ``i`` in ``range(history, len(dataset) - target_size)``
    one sample is built from three windows:

    * encoder input  -- rows ``[i - history, i)`` of all columns;
    * decoder input  -- rows ``[i, i + target_size)`` of all columns except
      ``PM10_missing`` and ``PM25_missing``;
    * decoder target -- rows ``[i + 1, i + 1 + target_size)`` of the columns
      ``PM10`` and ``PM10_missing`` (i.e. shifted one step ahead of the
      decoder input).

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain the columns ``PM10``, ``PM10_missing`` and ``PM25_missing``.
    mode : str
        When equal to ``'evaluating'``, samples whose target window contains
        a row with ``PM10_missing == 1`` (an imputed target) are discarded.
        Any other value keeps every sample.
    history : int
        Number of rows in each encoder window.
    target_size : int
        Number of rows in each decoder input / target window.

    Returns
    -------
    tuple of numpy.ndarray
        ``(encoder_input_data, decoder_input_data, decoder_target_data)`` with
        shapes ``(n, history, n_cols)``, ``(n, target_size, n_cols - 2)`` and
        ``(n, target_size, 2)``, where ``n`` is the number of kept samples.
    """
    # Column selections, copied so the windows never alias the frame's data.
    encoder_source = dataset.values.copy()
    decoder_source = dataset.drop(['PM10_missing', 'PM25_missing'], axis=1).values.copy()
    target_source = dataset[['PM10', 'PM10_missing']].values.copy()

    # Only test/validation sets drop samples with imputed targets.
    skip_imputed = mode == 'evaluating'

    encoder_windows, decoder_windows, target_windows = [], [], []
    for anchor in range(history, len(dataset) - target_size):
        target_window = target_source[anchor + 1:anchor + 1 + target_size]

        # Column 1 of the target window is the PM10_missing flag.
        if skip_imputed and np.any(target_window[:, 1] == 1):
            continue

        encoder_windows.append(encoder_source[anchor - history:anchor])
        decoder_windows.append(decoder_source[anchor:anchor + target_size])
        target_windows.append(target_window)

    def _stack(windows, steps, source):
        # The explicit reshape keeps a 3-D result even when no window was kept.
        return np.array(windows).reshape(-1, steps, source.shape[1])

    return (
        _stack(encoder_windows, history, encoder_source),
        _stack(decoder_windows, target_size, decoder_source),
        _stack(target_windows, target_size, target_source),
    )

In [7]:
# Training split: every window is kept.
(train_encoder_input_data,
 train_decoder_input_data,
 train_decoder_target_data) = build_seq2seq_datasets(df_train, mode='training')

# Validation and test splits: windows with imputed targets are discarded.
(valid_encoder_input_data,
 valid_decoder_input_data,
 valid_decoder_target_data) = build_seq2seq_datasets(df_valid, mode='evaluating')

(test_encoder_input_data,
 test_decoder_input_data,
 test_decoder_target_data) = build_seq2seq_datasets(df_test, mode='evaluating')

In [11]:
# Persist every split's encoder input, decoder input and decoder target
# arrays as .npy files, in the order train -> valid -> test.
for _npy_path, _npy_array in (
    ('./data/third-order/seq2seq/train_encoder_input_data.npy', train_encoder_input_data),
    ('./data/third-order/seq2seq/train_decoder_input_data.npy', train_decoder_input_data),
    ('./data/third-order/seq2seq/train_decoder_target_data.npy', train_decoder_target_data),
    ('./data/third-order/seq2seq/valid_encoder_input_data.npy', valid_encoder_input_data),
    ('./data/third-order/seq2seq/valid_decoder_input_data.npy', valid_decoder_input_data),
    ('./data/third-order/seq2seq/valid_decoder_target_data.npy', valid_decoder_target_data),
    ('./data/third-order/seq2seq/test_encoder_input_data.npy', test_encoder_input_data),
    ('./data/third-order/seq2seq/test_decoder_input_data.npy', test_decoder_input_data),
    ('./data/third-order/seq2seq/test_decoder_target_data.npy', test_decoder_target_data),
):
    np.save(_npy_path, _npy_array)

# Drop the loop temporaries so the cell leaves no extra names behind.
del _npy_path, _npy_array