# Build train and test matrices

In [1]:
import pandas as pd
import numpy as np
import sys

%load_ext autoreload
%autoreload 2

sys.path.append('../src/')
from utils.build_matrix import df_shift

In [2]:
df = pd.read_pickle('/home/SHARED/SOLAR/data/oahu_min_final.pkl')  

In [3]:
df_roll = df_shift(df, periods=3)

In [4]:
# Split target (time t) and variables (times t-1 to t-width+1)
y = df_roll['t']
X = df_roll.drop(columns='t', level='time')

In [5]:
# Split train-test, approximately 12 and 4 months respectively
X_train, X_test = X[:'2011-07-31'], X['2011-08-01':]
y_train, y_test = y[:'2011-07-31'], y['2011-08-01':]

In [6]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(298595, 48)
(55016, 48)
(298595, 16)
(55016, 16)


# Convolutional predictor

First we preprocess the dataset (for the moment, we'll just use as features the t-1 values at each sensor)

In [7]:
# We only use the previous timestep as features
X_tr1 = X_train['t-1']
y_tr1 = y_train

X_te1 = X_test['t-1']
y_te1 = y_test

Now, in order to use a 1D convolution, we are going to sort the sensors. For the initial test, we'll just sort them by longitude (from East to West). That way, nearer sensors are in close positions in the tensor, so the 1D convolution may extract useful correlations.

Note: many other possible ordenations of the sensors could be added as new channels in the input tensor

In [8]:
# We load the info of the sensors to extract the longitude information
info = pd.read_pickle('/home/SHARED/SOLAR/data/info.pkl')

# Sorted longitudes
lon = info['Longitude'].sort_values(ascending=False).drop('AP3')
lat = info['Latitude'].sort_values(ascending=False).drop('AP3')

In [9]:
# Finally, we sort the data according to sensor's longitude
X_tr_lon = X_tr1[lon.index]
y_tr_lon = y_tr1[lon.index]
X_te_lon = X_te1[lon.index]
y_te_lon = y_te1[lon.index]

In [10]:
X_tr_lat = X_tr1[lat.index]
y_tr_lat = y_tr1[lat.index]
X_te_lat = X_te1[lat.index]
y_te_lat = y_te1[lat.index]

Now we specify which sensor do we want to predict and test.

(In the future, we need to discuss how are we going to predict, if just by looping over each sensor, or just give a vectorial prediction)

In [11]:
import keras
from keras.datasets import cifar10
from keras.preprocessing.image import ImageDataGenerator
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Activation, Flatten, Input, Lambda, Reshape, Add, Multiply, Subtract, Dropout
from keras.layers import Conv2D, MaxPooling2D, LocallyConnected1D, Conv1D, UpSampling1D, MaxPooling1D, Dot, Concatenate

from keras import backend as K

Using TensorFlow backend.


Model architecture is defined below.

Some highlights:
* Locally connected works better than pure convolutional at the first layers (probably because the sensors at not located in a uniform grid)
* Trick to improve acc: add a final layer combining the convolutional prediction with the persistance prediction, so in case the input is "strange", the model could learn to output the persistance prediction (i.e., the previous time-step), which is somewhat reasonable

In [12]:
def make_model_sensor(idx_sensor, n_sensors=16):
    ''' Returns a model using all the sensors to predict index_sensor '''
    xin = Input(shape=(n_sensors, 1), name='main_input')
    x = LocallyConnected1D(8, 7, data_format = 'channels_last', padding='valid')(xin)
    x = Activation('relu')(x)
    x = LocallyConnected1D(16, 5, data_format = 'channels_last', padding='valid')(x)
    x = Activation('relu')(x)
    x = Conv1D(32, 3, data_format = 'channels_last', padding='causal')(x)
    xl = Flatten()(x)
    xl = Dropout(0.2)(xl)
    xo = Dense(1)(xl)

    # use date info here?
    xinf = Flatten()(xin)
    s  = Dense(5)(xinf)
    s = Activation('tanh')(s)
    s = Dense(2)(s)
    s = Activation('softmax')(s)

    # sort of residual connection
    xin_0 = Activation('relu')(xin)
    xin_1 = Lambda(lambda x : x[:, idx_sensor, :])(xin_0)
    xo_m = Dot(axes=1)([Concatenate()([xo, xin_1]), s])
    xo_m = Activation('relu')(xo_m)

    model = Model(inputs=[xin], outputs=[xo_m])
    return model

In [13]:
def make_model_sensor_2D(idx_sensor, n_sensors=16):
    ''' Returns a model using all the sensors to predict index_sensor '''
    xin = Input(shape=(n_sensors, 1), name='lon_input')
    x = LocallyConnected1D(8, 7, data_format = 'channels_last', padding='valid')(xin)
    x = Activation('relu')(x)
    x = LocallyConnected1D(16, 5, data_format = 'channels_last', padding='valid')(x)
    x = Activation('relu')(x)
    x = Conv1D(32, 3, data_format = 'channels_last', padding='causal')(x)
    xl = Flatten()(x) 
    
    yin = Input(shape=(n_sensors, 1), name='lat_input')
    y = LocallyConnected1D(8, 7, data_format = 'channels_last', padding='valid')(xin)
    y = Activation('relu')(x)
    y = LocallyConnected1D(16, 5, data_format = 'channels_last', padding='valid')(x)
    y = Activation('relu')(x)
    y = Conv1D(32, 3, data_format = 'channels_last', padding='causal')(x)
    yl = Flatten()(y)
    
    xc = Concatenate()([xl, yl])
    xc = Dropout(0.2)(xc)
    xo = Dense(1)(xc)

    # use date info here?
    xinf = Flatten()(xin)
    s  = Dense(5)(xinf)
    s = Activation('tanh')(s)
    s = Dense(2)(s)
    s = Activation('softmax')(s)

    # sort of residual connection
    xin_0 = Activation('relu')(xin)
    xin_1 = Lambda(lambda x : x[:, idx_sensor, :])(xin_0)
    xo_m = Dot(axes=1)([Concatenate()([xo, xin_1]), s])
    xo_m = Activation('relu')(xo_m)

    model = Model(inputs=[xin, yin], outputs=[xo_m])
    return model

Now we are ready to train. The below configuration should take 2 minutes in a 16 core CPU
(no GPU needed). We are using a huge batch-size to speed up things

In [14]:
def to_array(X_train, y_train, X_test, y_test, id_sensor='AP5', val=0.1):
    ''' Converts dataframe to numpy array for predicting any given sensor. val specifies the fraction
    of training samples to be used as validation. '''
    X_tr1_1_np = X_train.values
    y_tr1_1_np = y_train[id_sensor].values
    
    #val_idx = int((1 - val)*len(y_tr1_1_np))

    X_te1_1_np = X_test.values
    y_te1_1_np = y_test[id_sensor].values
    
    #return X_tr1_1_np[:val_idx], y_tr1_1_np[:val_idx], X_tr1_1_np[val_idx:], y_tr1_1_np[val_idx:], X_te1_1_np, y_te1_1_np
    return X_tr1_1_np, y_tr1_1_np, X_te1_1_np, y_te1_1_np

In [15]:
from sklearn.model_selection import TimeSeriesSplit

lr = 0.0001
lr = 0.0001
opt = keras.optimizers.Adam(lr=lr)

# We add a callback to log metrics and another one to schedule the learning rate

#see clr.py in this same folder
from utils.clr import CyclicLR

c1 = keras.callbacks.BaseLogger(stateful_metrics=None)
c2 = CyclicLR(step_size=250, base_lr=lr)
c3 = keras.callbacks.History()

batch_size = 2048   # as big as possible so we can explore many models
epochs = 32

In [16]:
def train_and_test_sensor(idx_sensor, id_sensor, n_sensors):
    X_tr1, y_tr1, X_te1, y_te1 = to_array(X_tr_lon, y_tr_lon, X_te_lon, y_te_lon, id_sensor=id_sensor)
    
    # Validation using TS split (just to obtain different MAE estimations, no hyperoptimization for the moment)
    cv_loss = []
    for tr_idx, va_idx in TimeSeriesSplit(n_splits=5).split(X_tr1):
        model = make_model_sensor(idx_sensor, n_sensors=n_sensors)
        model.compile(opt, loss='mean_absolute_error')
        model.fit(np.atleast_3d(X_tr1[tr_idx]), y_tr1[tr_idx], 
                  batch_size=batch_size, 
                  epochs=epochs, 
                  validation_data=(np.atleast_3d(X_tr1[va_idx]), y_tr1[va_idx]), 
                  callbacks=[c2, c3], 
                  verbose=0)
        cv_loss.append(c3.history['val_loss'][-1])
    
    # Testing
    model = make_model_sensor(idx_sensor, n_sensors=n_sensors)
    model.compile(opt, loss='mean_absolute_error')
    model.fit(np.atleast_3d(X_tr1), y_tr1, 
              batch_size=batch_size, 
              epochs=epochs, 
              validation_data=(np.atleast_3d(X_te1), y_te1), 
              callbacks=[c2, c3], 
              verbose=0)
    test_loss = c3.history['val_loss'][-1]
    
    print('MAE_val ', cv_loss)
    print('MAE_test ', test_loss)
    
    return test_loss, cv_loss

In [17]:
def train_and_test_sensor_2D(idx_sensor, id_sensor, n_sensors):
    X_tr1, y_tr1, X_te1, y_te1 = to_array(X_tr_lon, y_tr_lon, X_te_lon, y_te_lon, id_sensor=id_sensor)
    X_tr2, y_tr2, X_te2, y_te2 = to_array(X_tr_lat, y_tr_lat, X_te_lat, y_te_lat, id_sensor=id_sensor)
    
    # Validation using TS split (just to obtain different MAE estimations, no hyperoptimization for the moment)
    cv_loss = []
    for tr_idx, va_idx in TimeSeriesSplit(n_splits=5).split(X_tr1):
        model = make_model_sensor_2D(idx_sensor, n_sensors=n_sensors)
        model.compile(opt, loss='mean_absolute_error')
        model.fit([np.atleast_3d(X_tr1[tr_idx]), np.atleast_3d(X_tr2[tr_idx])],
                  y_tr1[tr_idx], 
                  batch_size=batch_size, 
                  epochs=epochs, 
                  validation_data=([np.atleast_3d(X_tr1[va_idx]), np.atleast_3d(X_tr2[va_idx])], 
                                   y_tr1[va_idx]), 
                  callbacks=[c2, c3], 
                  verbose=0)
        cv_loss.append(c3.history['val_loss'][-1])
    
    # Testing
    model = make_model_sensor_2D(idx_sensor, n_sensors=n_sensors)
    model.compile(opt, loss='mean_absolute_error')
    model.fit([np.atleast_3d(X_tr1), np.atleast_3d(X_tr2)], 
              y_tr1, 
              batch_size=batch_size, 
              epochs=epochs, 
              validation_data=([np.atleast_3d(X_te1), np.atleast_3d(X_te2)], 
                                y_te1), 
              callbacks=[c2, c3], 
              verbose=0)
    test_loss = c3.history['val_loss'][-1]
    
    print('MAE_val ', cv_loss)
    print('MAE_test ', test_loss)
    
    return test_loss, cv_loss

In [18]:
maes1 = {}
maes2 = {}
for idx_sensor, id_sensor in enumerate(lon.index.values):
    print(idx_sensor, id_sensor)
    maes1[id_sensor], _ = train_and_test_sensor(idx_sensor, id_sensor, n_sensors=16)
    maes2[id_sensor], _ = train_and_test_sensor_2D(idx_sensor, id_sensor, n_sensors=16)
    break

0 AP7
MAE_val  [0.1303799112687743, 0.08445334024805945, 0.03708531506906786, 0.07641364477457299, 0.1326083803546976]
MAE_test  0.09443650227616966
MAE_val  [0.131003138528665, 0.08435243353192248, 0.036877147991262245, 0.07672260898763986, 0.13359125043987616]
MAE_test  0.09414855431306496


In [19]:
maes1 = pd.Series(maes1, name='MAE').sort_values()
maes1

AP7    0.094437
Name: MAE, dtype: float64

In [20]:
maes2 = pd.Series(maes2, name='MAE').sort_values()
maes2

{'AP7': 0.09414855431306496}