# Build train and test matrices

The first thing to notice is that now, X and Y come from different dataframes. Since we want to use the GP image as a feature, but predict the value at each sensor. Maybe it can be used as a suplementary feature and combine both.

In order to avoid repeating the GP matrix when creating df_roll, we apply df_shift to df_idx, the dataframe connecting the datetime to the index of the GP array.

In [156]:
import pandas as pd
import numpy as np
import sys
import os
%load_ext autoreload
%autoreload 2

sys.path.append('../src/')
from utils.build_matrix import df_shift

PATH_DATA = '../../data'
if not os.path.isdir(PATH_DATA):
    print('The folder {} does not exist!'.format(PATH_DATA))

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [286]:
try:
    df_gp = pd.read_pickle(os.path.join(PATH_DATA,'oahu_GP.pkl')) 
#     df.colums = data.colums.drop()
except:
    print('File {} does not exist.'.format(os.path.join(PATH_DATA,'oahu_GP.pkl')))

try:
    df_sensors = pd.read_pickle(os.path.join(PATH_DATA,'oahu_min_final.pkl')) 
#     df.colums = data.colums.drop()
except:
    print('File {} does not exist.'.format(os.path.join(PATH_DATA,'oahu_min_final.pkl')))

assert (df_sensors.index == df_gp.index).all(), 'The index is not the same'
df_idx = pd.DataFrame(data = list(range(len(df_gp))),index=df_gp.index,columns=['idx'],dtype='int')

# We load the info of the sensors to extract the longitude information
try:
    info = pd.read_pickle(os.path.join(PATH_DATA,'info.pkl')) 
#     df.colums = data.colums.drop()
except:
    print('File {} does not exist.'.format(os.path.join(PATH_DATA,'info.pkl')))

# Sorted longitudes
lon = info['Longitude'].sort_values(ascending=False).drop('AP3')
lat = info['Latitude'].sort_values(ascending=False).drop('AP3')
# Sort by lon
df_sensors[lon.index]


lon_list = df_gp.columns.levels[0].to_numpy()
lat_list = df_gp.columns.levels[1].to_numpy()
sensors_list = df_sensors.columns.to_numpy()
print('There are {} lons: {}'.format(len(lon_list),list(lon_list)))
print('There are {} lats: {}'.format(len(lat_list),list(lat_list)))
print('There are {} sensors: {}'.format(len(sensors_list),list(sensors_list)))

There are 12 lons: [-158.088, -158.087, -158.086, -158.085, -158.084, -158.083, -158.082, -158.081, -158.08, -158.079, -158.078, -158.077]
There are 9 lats: [21.308, 21.309, 21.31, 21.311, 21.312, 21.313, 21.314, 21.315, 21.316]
There are 16 sensors: ['AP1', 'AP4', 'AP5', 'AP6', 'AP7', 'DH1', 'DH10', 'DH11', 'DH2', 'DH3', 'DH4', 'DH5', 'DH6', 'DH7', 'DH8', 'DH9']


In [287]:
df_gp.sort_index(axis=1)
array_gp = df_gp.to_numpy().reshape(  [-1]+list(map(len,df_gp.columns.levels))   )
print(array_gp.shape)

array_sensors = df_sensors.to_numpy().reshape(  [-1]+[len(df_sensors.columns)]   )
print(array_sensors.shape)

(355387, 12, 9)
(355387, 16)


In [288]:
def df_shift(df, periods=1):
    return (pd.concat([df] + [ df.tshift(t+1, freq='1min') for t in range(periods) ], axis=1, 
                      keys=['t'] + [ 't-{:d}'.format(t+1) for t in range(periods) ],
                     names = ['time']+df.columns.names)
.dropna())

In [289]:
df_roll = df_shift(df_idx, periods=3)

In [290]:
# # Split target (time t) and variables (times t-1 to t-width+1)
# y = df_roll['t']
# X = df_roll.drop(columns='t', level='time')
df_train = df_roll[:'2011-07-31']
df_test = df_roll['2011-08-01':]

y_idx_train = df_train['t'].to_numpy(dtype='int')
# Reverse the time index for the X, in case we use Recursive NN
X_idx_train = df_train.drop(labels='t',axis=1,level='time').to_numpy(dtype='int')[:,::-1]

y_idx_test = df_test['t'].to_numpy(dtype='int')
# Reverse the time index for the X
X_idx_test = df_test.drop(labels='t',axis=1,level='time').to_numpy(dtype='int')[:,::-1]

Now, we use as features the GP array and as labels the sensors values.

In [291]:
X_train = array_gp[X_idx_train]
y_train = array_sensors[y_idx_train]

X_test = array_gp[X_idx_test]
y_test = array_sensors[y_idx_test]

# Convolutional predictor

First we preprocess the dataset (for the moment, we'll just use as features the t-1 values at each sensor)

Now, in order to use a 1D convolution, we are going to sort the sensors. For the initial test, we'll just sort them by longitude (from East to West). That way, nearer sensors are in close positions in the tensor, so the 1D convolution may extract useful correlations.

Note: many other possible ordenations of the sensors could be added as new channels in the input tensor

Now we specify which sensor do we want to predict and test.

(In the future, we need to discuss how are we going to predict, if just by looping over each sensor, or just give a vectorial prediction)

In [11]:
import keras
from keras.preprocessing.image import ImageDataGenerator
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Activation, Flatten, Input, Lambda, Reshape, Add, Multiply, Subtract, Dropout
from keras.layers import Conv2D, MaxPooling2D, LocallyConnected1D, Conv1D, UpSampling1D, MaxPooling1D, Dot, Concatenate

from keras import backend as K

Using TensorFlow backend.


Model architecture is defined below.

Some highlights:
* Locally connected works better than pure convolutional at the first layers (probably because the sensors at not located in a uniform grid)
* Trick to improve acc: add a final layer combining the convolutional prediction with the persistance prediction, so in case the input is "strange", the model could learn to output the persistance prediction (i.e., the previous time-step), which is somewhat reasonable

In [12]:
def make_model_sensor(idx_sensor, n_sensors=16):
    ''' Returns a model using all the sensors to predict index_sensor '''
    xin = Input(shape=(n_sensors, 1), name='main_input')
    x = LocallyConnected1D(8, 7, data_format = 'channels_last', padding='valid')(xin)
    x = Activation('relu')(x)
    x = LocallyConnected1D(16, 5, data_format = 'channels_last', padding='valid')(x)
    x = Activation('relu')(x)
    x = Conv1D(32, 3, data_format = 'channels_last', padding='causal')(x)
    xl = Flatten()(x)
    xl = Dropout(0.2)(xl)
    xo = Dense(1)(xl)

    # use date info here?
    xinf = Flatten()(xin)
    s  = Dense(5)(xinf)
    s = Activation('tanh')(s)
    s = Dense(2)(s)
    s = Activation('softmax')(s)

    # sort of residual connection
    xin_0 = Activation('relu')(xin)
    xin_1 = Lambda(lambda x : x[:, idx_sensor, :])(xin_0)
    xo_m = Dot(axes=1)([Concatenate()([xo, xin_1]), s])
    xo_m = Activation('relu')(xo_m)

    model = Model(inputs=[xin], outputs=[xo_m])
    return model

In [13]:
def make_model_sensor_2D(idx_sensor, n_sensors=16):
    ''' Returns a model using all the sensors to predict index_sensor '''
    xin = Input(shape=(n_sensors, 1), name='lon_input')
    x = LocallyConnected1D(8, 7, data_format = 'channels_last', padding='valid')(xin)
    x = Activation('relu')(x)
    x = LocallyConnected1D(16, 5, data_format = 'channels_last', padding='valid')(x)
    x = Activation('relu')(x)
    x = Conv1D(32, 3, data_format = 'channels_last', padding='causal')(x)
    xl = Flatten()(x) 
    
    yin = Input(shape=(n_sensors, 1), name='lat_input')
    y = LocallyConnected1D(8, 7, data_format = 'channels_last', padding='valid')(xin)
    y = Activation('relu')(x)
    y = LocallyConnected1D(16, 5, data_format = 'channels_last', padding='valid')(x)
    y = Activation('relu')(x)
    y = Conv1D(32, 3, data_format = 'channels_last', padding='causal')(x)
    yl = Flatten()(y)
    
    xc = Concatenate()([xl, yl])
    xc = Dropout(0.2)(xc)
    xo = Dense(1)(xc)

    # use date info here?
    xinf = Flatten()(xin)
    s  = Dense(5)(xinf)
    s = Activation('tanh')(s)
    s = Dense(2)(s)
    s = Activation('softmax')(s)

    # sort of residual connection
    xin_0 = Activation('relu')(xin)
    xin_1 = Lambda(lambda x : x[:, idx_sensor, :])(xin_0)
    xo_m = Dot(axes=1)([Concatenate()([xo, xin_1]), s])
    xo_m = Activation('relu')(xo_m)

    model = Model(inputs=[xin, yin], outputs=[xo_m])
    return model

Now we are ready to train. The below configuration should take 2 minutes in a 16 core CPU
(no GPU needed). We are using a huge batch-size to speed up things

In [14]:
def to_array(X_train, y_train, X_test, y_test, id_sensor='AP5', val=0.1):
    ''' Converts dataframe to numpy array for predicting any given sensor. val specifies the fraction
    of training samples to be used as validation. '''
    X_tr1_1_np = X_train.values
    y_tr1_1_np = y_train[id_sensor].values
    
    #val_idx = int((1 - val)*len(y_tr1_1_np))

    X_te1_1_np = X_test.values
    y_te1_1_np = y_test[id_sensor].values
    
    #return X_tr1_1_np[:val_idx], y_tr1_1_np[:val_idx], X_tr1_1_np[val_idx:], y_tr1_1_np[val_idx:], X_te1_1_np, y_te1_1_np
    return X_tr1_1_np, y_tr1_1_np, X_te1_1_np, y_te1_1_np

In [15]:
from sklearn.model_selection import TimeSeriesSplit

lr = 0.0001
lr = 0.0001
opt = keras.optimizers.Adam(lr=lr)

# We add a callback to log metrics and another one to schedule the learning rate

#see clr.py in this same folder
from utils.clr import CyclicLR

c1 = keras.callbacks.BaseLogger(stateful_metrics=None)
c2 = CyclicLR(step_size=250, base_lr=lr)
c3 = keras.callbacks.History()

batch_size = 2048   # as big as possible so we can explore many models
epochs = 32

In [16]:
def train_and_test_sensor(idx_sensor, id_sensor, n_sensors):
    X_tr1, y_tr1, X_te1, y_te1 = to_array(X_tr_lon, y_tr_lon, X_te_lon, y_te_lon, id_sensor=id_sensor)
    
    # Validation using TS split (just to obtain different MAE estimations, no hyperoptimization for the moment)
    cv_loss = []
    for tr_idx, va_idx in TimeSeriesSplit(n_splits=5).split(X_tr1):
        model = make_model_sensor(idx_sensor, n_sensors=n_sensors)
        model.compile(opt, loss='mean_absolute_error')
        model.fit(np.atleast_3d(X_tr1[tr_idx]), y_tr1[tr_idx], 
                  batch_size=batch_size, 
                  epochs=epochs, 
                  validation_data=(np.atleast_3d(X_tr1[va_idx]), y_tr1[va_idx]), 
                  callbacks=[c2, c3], 
                  verbose=0)
        cv_loss.append(c3.history['val_loss'][-1])
    
    # Testing
    model = make_model_sensor(idx_sensor, n_sensors=n_sensors)
    model.compile(opt, loss='mean_absolute_error')
    model.fit(np.atleast_3d(X_tr1), y_tr1, 
              batch_size=batch_size, 
              epochs=epochs, 
              validation_data=(np.atleast_3d(X_te1), y_te1), 
              callbacks=[c2, c3], 
              verbose=0)
    test_loss = c3.history['val_loss'][-1]
    
    print('MAE_val ', cv_loss)
    print('MAE_test ', test_loss)
    
    return test_loss, cv_loss

In [17]:
def train_and_test_sensor_2D(idx_sensor, id_sensor, n_sensors):
    X_tr1, y_tr1, X_te1, y_te1 = to_array(X_tr_lon, y_tr_lon, X_te_lon, y_te_lon, id_sensor=id_sensor)
    X_tr2, y_tr2, X_te2, y_te2 = to_array(X_tr_lat, y_tr_lat, X_te_lat, y_te_lat, id_sensor=id_sensor)
    
    # Validation using TS split (just to obtain different MAE estimations, no hyperoptimization for the moment)
    cv_loss = []
    for tr_idx, va_idx in TimeSeriesSplit(n_splits=5).split(X_tr1):
        model = make_model_sensor_2D(idx_sensor, n_sensors=n_sensors)
        model.compile(opt, loss='mean_absolute_error')
        model.fit([np.atleast_3d(X_tr1[tr_idx]), np.atleast_3d(X_tr2[tr_idx])],
                  y_tr1[tr_idx], 
                  batch_size=batch_size, 
                  epochs=epochs, 
                  validation_data=([np.atleast_3d(X_tr1[va_idx]), np.atleast_3d(X_tr2[va_idx])], 
                                   y_tr1[va_idx]), 
                  callbacks=[c2, c3], 
                  verbose=0)
        cv_loss.append(c3.history['val_loss'][-1])
    
    # Testing
    model = make_model_sensor_2D(idx_sensor, n_sensors=n_sensors)
    model.compile(opt, loss='mean_absolute_error')
    model.fit([np.atleast_3d(X_tr1), np.atleast_3d(X_tr2)], 
              y_tr1, 
              batch_size=batch_size, 
              epochs=epochs, 
              validation_data=([np.atleast_3d(X_te1), np.atleast_3d(X_te2)], 
                                y_te1), 
              callbacks=[c2, c3], 
              verbose=0)
    test_loss = c3.history['val_loss'][-1]
    
    print('MAE_val ', cv_loss)
    print('MAE_test ', test_loss)
    
    return test_loss, cv_loss

In [18]:
maes1 = {}
maes2 = {}
for idx_sensor, id_sensor in enumerate(lon.index.values):
    print(idx_sensor, id_sensor)
    maes1[id_sensor], _ = train_and_test_sensor(idx_sensor, id_sensor, n_sensors=16)
    maes2[id_sensor], _ = train_and_test_sensor_2D(idx_sensor, id_sensor, n_sensors=16)
    break

0 AP7
MAE_val  [0.1303799112687743, 0.08445334024805945, 0.03708531506906786, 0.07641364477457299, 0.1326083803546976]
MAE_test  0.09443650227616966
MAE_val  [0.131003138528665, 0.08435243353192248, 0.036877147991262245, 0.07672260898763986, 0.13359125043987616]
MAE_test  0.09414855431306496


In [19]:
maes1 = pd.Series(maes1, name='MAE').sort_values()
maes1

AP7    0.094437
Name: MAE, dtype: float64

In [20]:
maes2 = pd.Series(maes2, name='MAE').sort_values()
maes2

{'AP7': 0.09414855431306496}