# Build train and test matrices

The first thing to notice is that X and Y now come from different dataframes, since we want to use the GP image as a feature but predict the value at each sensor. Maybe it could also be used as a supplementary feature, combining both.

In order to avoid repeating the GP matrix when creating df_roll, we apply df_shift to df_idx, the dataframe connecting the datetime to the index of the GP array.

In [156]:
import pandas as pd
import numpy as np
import sys
import os
%load_ext autoreload
%autoreload 2

sys.path.append('../src/')
from utils.build_matrix import df_shift

# Root folder holding the pickled datasets, relative to this notebook.
PATH_DATA = '../../data'
# Only a warning is printed when the folder is missing; execution continues.
data_dir_found = os.path.isdir(PATH_DATA)
if not data_dir_found:
    print('The folder {} does not exist!'.format(PATH_DATA))

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [286]:
def _read_pickle_or_report(file_name):
    """Load ``file_name`` from ``PATH_DATA`` with ``pd.read_pickle``.

    Only a missing file is intercepted: the message is printed and the error
    is re-raised, so the cell stops instead of silently continuing with
    undefined (or stale, left over in the kernel) variables.

    NOTE(review): unpickling can execute arbitrary code - only load trusted files.
    """
    path = os.path.join(PATH_DATA, file_name)
    try:
        return pd.read_pickle(path)
    except FileNotFoundError:
        print('File {} does not exist.'.format(path))
        raise


df_gp = _read_pickle_or_report('oahu_GP.pkl')
df_sensors = _read_pickle_or_report('oahu_min_final.pkl')

# Both frames must be aligned row by row, because integer row positions are
# later used to address both of them. The length check avoids a raw
# ValueError from the element-wise comparison when the sizes differ.
assert (len(df_sensors.index) == len(df_gp.index)
        and (df_sensors.index == df_gp.index).all()), 'The index is not the same'
# One integer per timestamp: the row position inside df_gp.
df_idx = pd.DataFrame(data=np.arange(len(df_gp)), index=df_gp.index, columns=['idx'], dtype='int')

# We load the info of the sensors to extract the longitude information
info = _read_pickle_or_report('info.pkl')

# Coordinates sorted in descending order, without the sensor 'AP3'
lon = info['Longitude'].sort_values(ascending=False).drop('AP3')
lat = info['Latitude'].sort_values(ascending=False).drop('AP3')
# NOTE(review): the original cell evaluated `df_sensors[lon.index]` here and
# discarded the result, so df_sensors keeps its own column order. Assign the
# result explicitly if the longitude ordering is really wanted downstream.

lon_list = df_gp.columns.levels[0].to_numpy()
lat_list = df_gp.columns.levels[1].to_numpy()
sensors_list = df_sensors.columns.to_numpy()
print('There are {} lons: {}'.format(len(lon_list),list(lon_list)))
print('There are {} lats: {}'.format(len(lat_list),list(lat_list)))
print('There are {} sensors: {}'.format(len(sensors_list),list(sensors_list)))

There are 12 lons: [-158.088, -158.087, -158.086, -158.085, -158.084, -158.083, -158.082, -158.081, -158.08, -158.079, -158.078, -158.077]
There are 9 lats: [21.308, 21.309, 21.31, 21.311, 21.312, 21.313, 21.314, 21.315, 21.316]
There are 16 sensors: ['AP1', 'AP4', 'AP5', 'AP6', 'AP7', 'DH1', 'DH10', 'DH11', 'DH2', 'DH3', 'DH4', 'DH5', 'DH6', 'DH7', 'DH8', 'DH9']


In [287]:
# sort_index returns a NEW frame; the original cell discarded it, so the
# reshape below silently relied on the columns already being ordered.
# Sorting the column MultiIndex first makes the flat column axis match the
# level-by-level layout that the reshape assumes.
array_gp = (
    df_gp.sort_index(axis=1)
    .to_numpy()
    .reshape([-1] + list(map(len, df_gp.columns.levels)))
)
print(array_gp.shape)

# One row per timestamp, one column per sensor.
array_sensors = df_sensors.to_numpy().reshape([-1] + [len(df_sensors.columns)])
print(array_sensors.shape)

(355387, 12, 9)
(355387, 16)


In [288]:
def df_shift(df, periods=1):
    """Stack ``df`` with time-shifted copies of itself, one per lag.

    NOTE(review): a function with the same name is imported from
    ``utils.build_matrix`` earlier in the notebook; this definition rebinds it.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame indexed by a DatetimeIndex.
    periods : int
        Number of one-minute lags to add.

    Returns
    -------
    pandas.DataFrame
        Columns gain an outer level named ``'time'`` with keys ``'t'``,
        ``'t-1'``, ..., ``'t-<periods>'``. Rows for which any lag is missing
        are dropped.
    """
    lags = range(1, periods + 1)
    # DataFrame.tshift was deprecated and later removed from pandas;
    # shift(..., freq=...) moves the index by the same amount.
    shifted = [df.shift(lag, freq='1min') for lag in lags]
    keys = ['t'] + ['t-{:d}'.format(lag) for lag in lags]
    stacked = pd.concat([df] + shifted, axis=1, keys=keys,
                        names=['time'] + list(df.columns.names))
    return stacked.dropna()

In [289]:
# Lag the row-position table rather than the GP frame itself: each row of
# df_roll holds the position at time t plus the positions of the 3 previous minutes.
df_roll = df_shift(df_idx, periods=3)

In [293]:
# Chronological hold-out: everything up to 2011-07-31 is used for training,
# everything from 2011-08-01 onwards for testing.
df_train = df_roll[:'2011-07-31']
df_test = df_roll['2011-08-01':]

# Targets: row position at time t, as a flat integer vector.
y_idx_train, y_idx_test = (
    part['t'].to_numpy(dtype='int').flatten()
    for part in (df_train, df_test)
)

# Features: row positions of the lagged steps. The column order is reversed
# (time index reversed for the X, in case we use a recurrent NN).
X_idx_train, X_idx_test = (
    part.drop(labels='t', axis=1, level='time').to_numpy(dtype='int')[:, ::-1]
    for part in (df_train, df_test)
)

Now, we use as features the GP array and as labels the sensors values.

In [294]:
# Features: GP images gathered at the lagged row positions.
X_train, X_test = array_gp[X_idx_train], array_gp[X_idx_test]
# Labels: sensor readings gathered at the row position of time t.
y_train, y_test = array_sensors[y_idx_train], array_sensors[y_idx_test]

print(X_train.shape, y_train.shape, sep='\n')

(298595, 3, 12, 9)
(298595, 16)


In [337]:
def to_array(df_gp, df_sensors, periods, train_end='2011-07-31', test_start='2011-08-01'):
    """Turn the GP and sensor frames into train/test arrays.

    Parameters
    ----------
    df_gp : pandas.DataFrame
        GP values with a two-level column MultiIndex; reshaped to
        ``(rows, len(level 0), len(level 1))``.
    df_sensors : pandas.DataFrame
        Sensor readings; must share its index with ``df_gp``.
    periods : int
        Number of lagged steps used as features (passed to ``df_shift``).
    train_end, test_start : str
        Date labels delimiting the chronological split. The defaults
        reproduce the previously hard-coded split.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, y_test)``: X holds the lagged GP images,
        y holds the sensor readings at time t.
    """
    # Length check first: comparing indexes of different sizes raises a ValueError.
    assert (len(df_sensors.index) == len(df_gp.index)
            and (df_sensors.index == df_gp.index).all()), 'The index is not the same'
    # Row position of every timestamp; lagging this small table avoids
    # replicating the whole GP matrix once per lag.
    df_idx = pd.DataFrame(data=np.arange(len(df_gp)), index=df_gp.index, columns=['idx'], dtype='int')

    # sort_index returns a new frame (its result used to be discarded), so it
    # is chained here to guarantee the column order the reshape relies on.
    array_gp = (
        df_gp.sort_index(axis=1)
        .to_numpy()
        .reshape([-1] + list(map(len, df_gp.columns.levels)))
    )
    print(array_gp.shape)

    array_sensors = df_sensors.to_numpy().reshape([-1] + [len(df_sensors.columns)])
    print(array_sensors.shape)

    df_roll = df_shift(df_idx, periods=periods)

    def _index_arrays(df_part):
        # Target positions (time t) and lagged feature positions; the lag
        # columns are reversed (time index reversed, in case a recurrent NN is used).
        y_idx = df_part['t'].to_numpy(dtype='int').flatten()
        X_idx = df_part.drop(labels='t', axis=1, level='time').to_numpy(dtype='int')[:, ::-1]
        return X_idx, y_idx

    X_idx_train, y_idx_train = _index_arrays(df_roll[:train_end])
    X_idx_test, y_idx_test = _index_arrays(df_roll[test_start:])

    X_train = array_gp[X_idx_train]
    y_train = array_sensors[y_idx_train]

    X_test = array_gp[X_idx_test]
    y_test = array_sensors[y_idx_test]

    return X_train, y_train, X_test, y_test

In [300]:
# For row 10, print for every sensor reading the mask of GP cells whose
# value matches it within an absolute tolerance of 0.01.
gp_frame, readings = array_gp[10], array_sensors[10]
for sensor_value in readings:
    print(np.isclose(gp_frame, sensor_value, atol=0.01))

[[False False False False False False False False False]
 [False False False False False False False False False]
 [False False False False  True False False False False]
 [False False False False False  True False False False]
 [False False False False False False False False False]
 [False False False False False False False False False]
 [False False False False False False False False False]
 [False False False False False False False False False]
 [False False False False False False False False False]
 [False False False False False False False False False]
 [False False False False False False False False False]
 [False False False False False False False False False]]
[[False False False False False False False False False]
 [False False False False  True  True False False False]
 [False False False False False  True False False False]
 [False False False False  True False False False False]
 [False False False False False False False False False]
 [False False False False Fals

# Convolutional predictor

First we preprocess the dataset (for the moment, we'll just use as features the t-1 values at each sensor)

Now, in order to use a 1D convolution, we are going to sort the sensors. For the initial test, we'll just sort them by longitude (from East to West). That way, nearer sensors are in close positions in the tensor, so the 1D convolution may extract useful correlations.

Note: many other possible orderings of the sensors could be added as new channels in the input tensor

Now we specify which sensor we want to predict and test.

(In the future, we need to discuss how we are going to predict: either by looping over each sensor, or by giving a single vector-valued prediction)

In [399]:
import keras
from keras.preprocessing.image import ImageDataGenerator
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Activation, Flatten, Input, Lambda, Reshape, Add, Multiply, Subtract, Dropout
from keras.layers import Conv2D, MaxPooling2D, LocallyConnected1D, Conv1D, UpSampling1D, MaxPooling1D, Dot, Concatenate
from keras.layers import TimeDistributed, LocallyConnected2D
from keras.layers import AveragePooling2D, LSTM

from keras import backend as K

Model architecture is defined below.

Some highlights:
* Locally connected works better than pure convolutional at the first layers (probably because the sensors are not located in a uniform grid)
* Trick to improve accuracy: add a final layer combining the convolutional prediction with the persistence prediction, so in case the input is "strange", the model could learn to output the persistence prediction (i.e., the previous time-step), which is somewhat reasonable

In [315]:
# Display the shape of the label array.
y_train.shape

(298595, 16)

In [395]:
def make_model_base(input_shape, n_sensors=16):
    """Build the locally-connected baseline network (uncompiled).

    Parameters
    ----------
    input_shape : tuple
        Shape of one sample, without the batch axis. Every spatial layer uses
        ``data_format='channels_first'``, so the first axis of ``input_shape``
        is treated as the channel axis.
    n_sensors : int
        Number of filters of the locally-connected layers and number of
        output units.

    Returns
    -------
    keras Model with input ``'GP_input'`` and ``n_sensors`` ReLU outputs.
    """
    # Arguments shared by both locally-connected layers.
    loc_kwargs = dict(data_format='channels_first', padding='valid', activation='relu')

    gp_input = Input(shape=input_shape, name='GP_input')

    features = LocallyConnected2D(n_sensors, 3, name='LocCon1', **loc_kwargs)(gp_input)
    features = AveragePooling2D(data_format='channels_first')(features)
    features = LocallyConnected2D(n_sensors, 3, name='LocCon2', **loc_kwargs)(features)

    head = TimeDistributed(Dense(2 * n_sensors, name='TDDense', activation='relu'))(features)
    head = Flatten(data_format='channels_first')(head)
    head = Dense(2 * n_sensors, name='Dense1', activation='relu')(head)
    head = Dense(n_sensors, name='Dense2', activation='relu')(head)

    # Final ReLU kept as its own named layer, as in the original graph.
    prediction = Activation('relu', name='FinalRelu')(head)

    return Model(inputs=[gp_input], outputs=[prediction])

In [406]:
def make_model_rnn(input_shape, n_sensors=16):
    """Build the recurrent network (uncompiled).

    A trailing axis is appended to the input with ``K.expand_dims``; a
    locally-connected layer, a flatten and a small dense layer are then
    applied through ``TimeDistributed`` before an LSTM and two dense layers.

    Parameters
    ----------
    input_shape : tuple
        Shape of one sample, without the batch axis.
    n_sensors : int
        Number of output units.

    Returns
    -------
    keras Model with input ``'GP_input'`` and ``n_sensors`` ReLU outputs.
    """
    gp_input = Input(shape=input_shape, name='GP_input')

    # Append a trailing axis so every step can be fed to a 'channels_last' layer.
    with_channel = Lambda(K.expand_dims)(gp_input)

    per_step_layer = LocallyConnected2D(
        5, 3, data_format='channels_last', padding='valid', activation='relu'
    )
    per_step = TimeDistributed(per_step_layer)(with_channel)
    per_step = TimeDistributed(Flatten())(per_step)
    per_step = TimeDistributed(Dense(5))(per_step)

    summary_state = LSTM(30)(per_step)

    head = Dense(2 * n_sensors, name='Dense1', activation='relu')(summary_state)
    head = Dense(n_sensors, name='Dense2', activation='relu')(head)

    # Final ReLU kept as its own named layer, as in the original graph.
    prediction = Activation('relu', name='FinalRelu')(head)

    return Model(inputs=[gp_input], outputs=[prediction])

In [407]:
# Instantiate the recurrent variant for the current arrays and inspect its layers.
model = make_model_rnn(input_shape=X_train.shape[1:], n_sensors=y_train.shape[1])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
GP_input (InputLayer)        (None, 3, 12, 9)          0         
_________________________________________________________________
lambda_4 (Lambda)            (None, 3, 12, 9, 1)       0         
_________________________________________________________________
time_distributed_46 (TimeDis (None, 3, 10, 7, 5)       3500      
_________________________________________________________________
time_distributed_47 (TimeDis (None, 3, 350)            0         
_________________________________________________________________
time_distributed_48 (TimeDis (None, 3, 5)              1755      
_________________________________________________________________
lstm_3 (LSTM)                (None, 30)                4320      
_________________________________________________________________
Dense1 (Dense)               (None, 32)                992       
__________

In [416]:
def train_and_tess(df_gp, df_sensors, make_model_base):
    """Cross-validate and then test a model produced by ``make_model_base``.

    The arrays come from ``to_array(df_gp, df_sensors, periods=3)``. A 5-fold
    ``TimeSeriesSplit`` over the training arrays gives one validation loss per
    fold (no hyper-parameter search). A fresh model is then fitted on the whole
    training set with the test set as validation data.

    Relies on notebook-level names: ``to_array``, ``TimeSeriesSplit``, ``opt``,
    ``batch_size``, ``epochs`` and the callbacks ``c2`` / ``c3``.
    NOTE(review): ``c3`` is not defined in the visible part of the notebook -
    confirm where it comes from.
    NOTE(review): the same ``opt`` and ``c2`` objects are reused for every
    model; confirm that sharing them across fits is intended.

    Parameters
    ----------
    df_gp, df_sensors : pandas.DataFrame
        Forwarded to ``to_array``.
    make_model_base : callable
        Model factory called as ``make_model_base(input_shape, n_sensors=...)``.
        Inside this function the parameter shadows the notebook-level function
        of the same name.

    Returns
    -------
    tuple
        ``(test_loss, cv_loss, mae_val_list, model)``: last-epoch validation
        loss of the final fit, list of last-epoch validation losses per fold,
        per-output MAE on the test arrays, and the final fitted model.
    """
    def _fresh_model():
        # Build and compile a new network so that no weights carry over between fits.
        net = make_model_base(input_shape, n_sensors=n_sensors)
        net.compile(opt,
                    loss='mean_absolute_error',
                    metrics=['mean_absolute_percentage_error'])
        return net

    X_tr1, y_tr1, X_te1, y_te1 = to_array(df_gp, df_sensors, periods=3)
    input_shape = X_tr1.shape[1:]
    n_sensors = y_tr1.shape[1]

    # Validation using TS split (just to obtain different MAE estimations)
    splitter = TimeSeriesSplit(n_splits=5)
    cv_loss = []
    for fold_no, (tr_idx, va_idx) in enumerate(splitter.split(X_tr1), start=1):
        fold_history = _fresh_model().fit(
            X_tr1[tr_idx], y_tr1[tr_idx],
            batch_size=batch_size,
            epochs=epochs,
            validation_data=(np.atleast_3d(X_tr1[va_idx]), y_tr1[va_idx]),
            callbacks=[c2],
            shuffle=False,
            verbose=0)
        cv_loss.append(fold_history.history['val_loss'][-1])
        print('Fold {}/{} ended!'.format(fold_no, splitter.get_n_splits()))

    # Testing: refit from scratch on the full training set.
    model = _fresh_model()
    history_test = model.fit(
        X_tr1, y_tr1,
        batch_size=batch_size,
        epochs=epochs,
        validation_data=(X_te1, y_te1),
        callbacks=[c2, c3],
        shuffle=False,
        verbose=0)

    # Mean absolute error per output column on the test arrays.
    abs_errors = np.abs(model.predict(X_te1) - y_te1)
    mae_val_list = np.mean(abs_errors, axis=0)
    test_loss = history_test.history['val_loss'][-1]

    print('MAE_val ', cv_loss)
    print('MAE_test ', test_loss)
    print('MAE_test dissagregated', mae_val_list)

    return test_loss, cv_loss, mae_val_list, model

In [413]:
# X_te1 / y_te1 only exist as locals of train_and_tess, so referencing them
# here raises NameError on a fresh kernel. X_test / y_test are the
# notebook-level arrays built with the same 3-lag pipeline.
pred = model.predict(X_test)
# Mean absolute error per output column.
mae_val_list = np.mean(np.abs(pred - y_test), axis=0)

In [414]:
# Display the per-output MAE vector.
mae_val_list

array([0.60197124, 0.60372911, 0.60158409, 0.60919865, 0.61403072,
       0.60375531, 0.6004798 , 0.60187092, 0.60143636, 0.61015682,
       0.59612057, 0.60851704, 0.62237687, 0.61428493, 0.59714013,
       0.61135669])

Now we are ready to train. The configuration below should take 2 minutes on a 16-core CPU
(no GPU needed). We are using a huge batch size to speed things up.

In [397]:
from sklearn.model_selection import TimeSeriesSplit
# see clr.py in this same folder
from utils.clr import CyclicLR

# Optimisation settings
lr = 0.0001
batch_size = 2048   # as big as possible so we can explore many models
epochs = 32

opt = keras.optimizers.Adam(lr=lr)

# One callback to log metrics and another one to schedule the learning rate.
# NOTE(review): train_and_tess also passes a callback named `c3`, which is
# not created in this cell.
c1 = keras.callbacks.BaseLogger(stateful_metrics=None)
c2 = CyclicLR(step_size=250, base_lr=lr)

In [398]:
# Cross-validate and test the locally-connected baseline.
test_loss, cv_loss, mae_val_list, model = train_and_tess(
    df_gp, df_sensors, make_model_base=make_model_base
)

(355387, 12, 9)
(355387, 16)


  % delta_t_median)


Fold 1/5 ended!
Fold 2/5 ended!
Fold 3/5 ended!
Fold 4/5 ended!
Fold 5/5 ended!
MAE_val  [0.15324023766971814, 0.08265787162912991, 0.048722806831921546, 0.0703014814328467, 0.11200692595132306]
MAE_test  0.0848339474533158


In [417]:
# Cross-validate and test the recurrent variant (same pipeline, different factory).
test_loss, cv_loss, mae_val_list, model = train_and_tess(
    df_gp, df_sensors, make_model_base=make_model_rnn
)

(355387, 12, 9)
(355387, 16)
Fold 1/5 ended!
Fold 2/5 ended!
Fold 3/5 ended!
Fold 4/5 ended!
Fold 5/5 ended!
MAE_val  [0.11688826329759841, 0.08202466341479453, 0.03480555661577358, 0.06845635730182958, 0.1053334079000874]
MAE_test  0.08313409182283271
MAE_test dissagregated [0.07594877 0.08945843 0.07794931 0.1075724  0.09771928 0.09725696
 0.07316568 0.07443318 0.08763584 0.07830296 0.07806862 0.07983921
 0.07595031 0.08333987 0.07637616 0.07712847]


In [419]:
# Pair every sensor name with its test MAE, one line each.
for sensor, mae in zip(sensors_list, mae_val_list):
    print(sensor, mae, sep=' : ')

AP1 : 0.07594876622300296
AP4 : 0.08945843413185926
AP5 : 0.07794931063452094
AP6 : 0.1075724044997297
AP7 : 0.09771928025793097
DH1 : 0.09725695730749398
DH10 : 0.07316567883830735
DH11 : 0.07443318272480763
DH2 : 0.08763583841571153
DH3 : 0.07830296454216458
DH4 : 0.07806861531233503
DH5 : 0.07983920661425122
DH6 : 0.07595031261948751
DH7 : 0.08333986833906305
DH8 : 0.07637615707183532
DH9 : 0.07712846554004894


In [412]:
# Print the layer-by-layer summary of the model currently bound to `model`.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
GP_input (InputLayer)        (None, 3, 12, 9)          0         
_________________________________________________________________
lambda_4 (Lambda)            (None, 3, 12, 9, 1)       0         
_________________________________________________________________
time_distributed_46 (TimeDis (None, 3, 10, 7, 5)       3500      
_________________________________________________________________
time_distributed_47 (TimeDis (None, 3, 350)            0         
_________________________________________________________________
time_distributed_48 (TimeDis (None, 3, 5)              1755      
_________________________________________________________________
lstm_3 (LSTM)                (None, 30)                4320      
_________________________________________________________________
Dense1 (Dense)               (None, 32)                992       
__________

In [411]:
# X_tr1 only exists as a local of train_and_tess; X_train is the
# notebook-level array built with the same 3-lag pipeline.
model.predict(X_train[100:110])

array([[0.        , 0.        , 0.00476906, 0.        , 0.00265758,
        0.        , 0.00912051, 0.        , 0.00425053, 0.        ,
        0.00807438, 0.00012891, 0.        , 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.00474056, 0.        , 0.00264126,
        0.        , 0.00907155, 0.        , 0.00421449, 0.        ,
        0.00802038, 0.00012716, 0.        , 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.00472278, 0.        , 0.00263032,
        0.        , 0.00903426, 0.        , 0.00419413, 0.        ,
        0.00798569, 0.00012277, 0.        , 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.00470619, 0.        , 0.00262324,
        0.        , 0.0090066 , 0.        , 0.00418057, 0.        ,
        0.00796086, 0.00012267, 0.        , 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.0046875 , 0.        , 0.00260448,
        0.        , 0.00896043, 

In [380]:
# Rows in which at least one sensor reading is exactly zero.
has_zero_reading = (df_sensors == 0).any(axis=1)
df_sensors[has_zero_reading]

Location,AP1,AP4,AP5,AP6,AP7,DH1,DH10,DH11,DH2,DH3,DH4,DH5,DH6,DH7,DH8,DH9
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2011-10-31 14:57:00-10:00,0.0,0.303491,0.199010,0.222799,0.730146,0.235378,0.206958,0.195953,0.222458,0.197924,0.207120,0.215793,0.213093,0.218725,0.197835,0.200003
2011-10-31 14:58:00-10:00,0.0,0.210922,0.195017,0.240965,0.730983,0.246627,0.191409,0.182255,0.241759,0.182671,0.224530,0.220245,0.189771,0.202259,0.181770,0.179073
2011-10-31 14:59:00-10:00,0.0,0.300267,0.199470,0.277964,0.731833,0.351331,0.192025,0.182467,0.484857,0.176684,0.184388,0.193702,0.190857,0.305415,0.178547,0.259879
2011-10-31 15:00:00-10:00,0.0,0.684157,0.202666,0.219297,0.732695,0.724320,0.186316,0.186065,0.572306,0.177926,0.185465,0.192396,0.193652,0.310382,0.182195,0.187766
2011-10-31 15:01:00-10:00,0.0,0.625959,0.199096,0.202010,0.733570,0.678754,0.180199,0.179512,0.193857,0.171924,0.177501,0.182649,0.183730,0.200460,0.176388,0.172972
2011-10-31 15:02:00-10:00,0.0,0.197745,0.184069,0.203540,0.734458,0.192645,0.174070,0.178034,0.177689,0.170059,0.171246,0.179412,0.173742,0.178450,0.167124,0.161774
2011-10-31 15:03:00-10:00,0.0,0.205087,0.189390,0.368534,0.735360,0.172765,0.314819,0.258466,0.177907,0.671207,0.319159,0.686979,0.181494,0.171523,0.236762,0.160934
2011-10-31 15:04:00-10:00,0.0,0.757395,0.211734,0.673544,0.736275,0.614117,0.177279,0.181024,0.558267,0.170480,0.174264,0.183318,0.247375,0.182682,0.180489,0.184005
2011-10-31 15:05:00-10:00,0.0,0.340428,0.245633,0.584497,0.737205,0.691210,0.179094,0.188061,0.435888,0.173817,0.174916,0.181619,0.175070,0.452647,0.175527,0.163418
2011-10-31 15:06:00-10:00,0.0,0.237139,0.322246,0.539593,0.738149,0.444239,0.182908,0.190431,0.173872,0.183419,0.179909,0.204628,0.176905,0.179772,0.178349,0.162583


In [18]:
# Collect one MAE per sensor for two per-sensor training routines.
# NOTE(review): train_and_test_sensor and train_and_test_sensor_2D are not
# defined in the visible part of this notebook - this cell presumably
# belongs to an earlier version; confirm before re-running.
maes1 = {}
maes2 = {}
for idx_sensor, id_sensor in enumerate(lon.index.values):
    print(idx_sensor, id_sensor)
    # Only the first element of each returned pair is kept.
    maes1[id_sensor], _ = train_and_test_sensor(idx_sensor, id_sensor, n_sensors=16)
    maes2[id_sensor], _ = train_and_test_sensor_2D(idx_sensor, id_sensor, n_sensors=16)
    # Stops after the first sensor in lon.index, so each dict ends up with one entry.
    break

0 AP7
MAE_val  [0.1303799112687743, 0.08445334024805945, 0.03708531506906786, 0.07641364477457299, 0.1326083803546976]
MAE_test  0.09443650227616966
MAE_val  [0.131003138528665, 0.08435243353192248, 0.036877147991262245, 0.07672260898763986, 0.13359125043987616]
MAE_test  0.09414855431306496


In [19]:
# Rebinds maes1 from the dict filled in the loop cell to a Series named 'MAE',
# sorted by value, and displays it.
maes1 = pd.Series(maes1, name='MAE').sort_values()
maes1

AP7    0.094437
Name: MAE, dtype: float64

In [20]:
# Rebinds maes2 from the dict filled in the loop cell to a Series named 'MAE',
# sorted by value, and displays it.
# NOTE(review): the stored output of this cell is a plain dict, so it looks
# stale relative to this code - re-run to confirm.
maes2 = pd.Series(maes2, name='MAE').sort_values()
maes2

{'AP7': 0.09414855431306496}