# Build train and test matrices

In [72]:
import pandas as pd
import numpy as np
import feather

df = (feather.read_dataframe('/home/SHARED/SOLAR/data/oahu_min.feather')
             .set_index('Datetime'))

In [73]:
# https://stackoverflow.com/questions/15722324/sliding-window-in-numpy
def window_stack_forward(a, stepsize=1, width=3):
    return np.hstack( a[i:1+i-width or None:stepsize] for i in range(0, width) )

In [74]:
# I feel this function can also be done for pd.DataFrame
def window_stack(a, width=3):
    n = a.shape[0]
    return np.hstack(list(a[(width-1-i):(n-i)] for i in range(0, width)))

In [75]:
# In pandas 0.24, use df.to_numpy() instead of df.values. Also care with non-numeric columns
width = 61
a = window_stack(df.values, width=width)

In [76]:
a.shape

(532777, 1159)

In [77]:
df.shape

(532837, 19)

In [78]:
times   = [ ('t' if not idx else 't-{:d}'.format(idx)) for idx in range(width) ]
columns = pd.MultiIndex.from_product((times, df.columns), names=('time', 'location'))

In [79]:
# Convert back to DataFrame, just for convenience of having indexes
df_roll = pd.DataFrame(a, index=df.index[width-1:], columns=columns)

In [80]:
# Split target (time t) and variables (times t-1 to t-width+1)
y = df_roll['t']
X = df_roll.drop(columns='t', level='time')

In [81]:
# Split train-test, approximately 12 and 4 months respectively
X_train, X_test = X[:'2011-07-31'], X['2011-08-01':]
y_train, y_test = y[:'2011-07-31'], y['2011-08-01':]

In [82]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(449885, 1140)
(82892, 1140)
(449885, 19)
(82892, 19)


# Convolutional predictor

First we preprocess the dataset (for the moment, we'll just use as features the t-1 values at each sensor)

In [83]:
# We only use the previous timestep as features
X_tr1 = X_train['t-1']
y_tr1 = y_train

X_te1 = X_test['t-1']
y_te1 = y_test

Now, in order to use a 1D convolution, we are going to sort the sensors. For the initial test, we'll just sort them by longitude (from East to West). That way, nearer sensors are in close positions in the tensor, so the 1D convolution may extract useful correlations.

Note: many other possible ordenations of the sensors could be added as new channels in the input tensor

In [84]:
# We load the info of the sensors to extract the longitude information
info = pd.read_csv('/home/SHARED/SOLAR/data/info.csv')

info.Location = info.Location.apply(lambda x: (x[:2] + x[-2:]).replace('_', ''))
info.index = info.Location
# Sorted longitudes
longs = info['Longitude'].sort_values(ascending=False)

# We drop two sensors (they are different compared to the other 17, since they are "tilted")
X_tr1.drop('GT_AP6', inplace=True, axis=1)
y_tr1.drop('GT_AP6', inplace=True, axis=1)
X_tr1.drop('GT_DH1', inplace=True, axis=1)
y_tr1.drop('GT_DH1', inplace=True, axis=1)
X_te1.drop('GT_AP6', inplace=True, axis=1)
y_te1.drop('GT_AP6', inplace=True, axis=1)
X_te1.drop('GT_DH1', inplace=True, axis=1)
y_te1.drop('GT_DH1', inplace=True, axis=1)

# Just some auxiliar code to homogeneize name of sensors across different tables
homogen_name = lambda x: x[-4:].replace('_', '')
X_tr1.columns = [homogen_name(x) for x in X_tr1.columns.values.tolist()]
y_tr1.columns = [homogen_name(x) for x in y_tr1.columns.values.tolist()]
X_te1.columns = [homogen_name(x) for x in X_te1.columns.values.tolist()]
y_te1.columns = [homogen_name(x) for x in y_te1.columns.values.tolist()]


# Finally, we sort the data according to sensor's longitude
X_tr1_1 = X_tr1[longs.index]
y_tr1_1 = y_tr1[longs.index]
X_te1_1 = X_te1[longs.index]
y_te1_1 = y_te1[longs.index]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [94]:
y_tr1[y_tr1.columns.difference(["AP7", "AP6"])]  = 0.0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.loc._setitem_with_indexer((slice(None), indexer), value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_array(key, value)


In [95]:
y_tr1

Unnamed: 0_level_0,DH3,DH4,DH5,DH10,DH11,DH9,DH2,DH1,AP6,AP1,AP3,AP5,AP4,AP7,DH6,DH7,DH8
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2010-03-19 15:15:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,852.863,0.0,0.0,0.0,0.0,867.215,0.0,0.0,0.0
2010-03-19 15:16:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,845.067,0.0,0.0,0.0,0.0,860.998,0.0,0.0,0.0
2010-03-19 15:17:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,838.051,0.0,0.0,0.0,0.0,844.272,0.0,0.0,0.0
2010-03-19 15:18:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,827.137,0.0,0.0,0.0,0.0,830.662,0.0,0.0,0.0
2010-03-19 15:19:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,810.766,0.0,0.0,0.0,0.0,770.723,0.0,0.0,0.0
2010-03-19 15:20:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,801.021,0.0,0.0,0.0,0.0,796.804,0.0,0.0,0.0
2010-03-19 15:21:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,793.225,0.0,0.0,0.0,0.0,788.248,0.0,0.0,0.0
2010-03-19 15:22:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,783.870,0.0,0.0,0.0,0.0,780.078,0.0,0.0,0.0
2010-03-19 15:23:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,775.684,0.0,0.0,0.0,0.0,773.850,0.0,0.0,0.0
2010-03-19 15:24:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,767.109,0.0,0.0,0.0,0.0,766.064,0.0,0.0,0.0


Now we specify which sensor do we want to predict and test.

(In the future, we need to discuss how are we going to predict, if just by looping over each sensor, or just give a vectorial prediction)

In [12]:
import keras
from keras.datasets import cifar10
from keras.preprocessing.image import ImageDataGenerator
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Activation, Flatten, Input, Lambda, Reshape, Add, Multiply, Subtract, Dropout
from keras.layers import Conv2D, MaxPooling2D, LocallyConnected1D, Conv1D, UpSampling1D, MaxPooling1D, Dot, Concatenate

from keras import backend as K

Using TensorFlow backend.


Model architecture is defined below.

Some highlights:
* Locally connected works better than pure convolutional at the first layers (probably because the sensors at not located in a uniform grid)
* Trick to improve acc: add a final layer combining the convolutional prediction with the persistance prediction, so in case the input is "strange", the model could learn to output the persistance prediction (i.e., the previous time-step), which is somewhat reasonable

In [13]:
def make_model_sensor(index_sensor, n_sensors=17):
    ''' Returns a model using all the sensors to predict index_sensor '''
    xin = Input(shape=(n_sensors,1), name='main_input')
    x = LocallyConnected1D(8, 7, data_format = 'channels_last', padding='valid')(xin)
    x = Activation('relu')(x)
    x = LocallyConnected1D(16, 5, data_format = 'channels_last', padding='valid')(x)
    x = Activation('relu')(x)
    x = Conv1D(32, 3, data_format = 'channels_last', padding='causal')(x)
    xl = Flatten()(x)
    xl = Dropout(0.2)(xl)
    xo = Dense(1)(xl)

    # use date info here?
    xinf = Flatten()(xin)
    s  = Dense(5)(xinf)
    s = Activation('tanh')(s)
    s = Dense(2)(s)
    s = Activation('softmax')(s)

    # sort of residual connection
    xin_0 = Activation('relu')(xin)
    xin_1 = Lambda(lambda x : x[:,index_sensor,:])(xin_0)
    xo_m = Dot(axes=1)([Concatenate()([xo,xin_1]), s])
    xo_m = Activation('relu')(xo_m)

    model = Model(inputs=[xin], outputs=[xo_m])
    return model

In [14]:
lr = 0.0001
lr = 0.0001
opt = keras.optimizers.Adam(lr=lr)

# We add a callback to log metrics and another one to schedule the learning rate

#see clr.py in this same folder
from clr import CyclicLR

c1 = keras.callbacks.BaseLogger(stateful_metrics=None)
c2 = CyclicLR(step_size=250, base_lr=lr)
c3 = keras.callbacks.History()

Now we are ready to train. The below configuration should take 2 minutes in a 16 core CPU
(no GPU needed). We are using a huge batch-size to speed up things

In [23]:
n_sensors = 17


def to_array(sensor='AP5', val=0.1):
    ''' Converts dataframe to numpy array for predicting any given sensor. val specifies the fraction
    of training samples to be used as validation. '''
    X_tr1_1_np = X_tr1_1.values
    y_tr1_1_np = y_tr1_1[sensor].values
    
    #val_idx = int((1 - val)*len(y_tr1_1_np))

    X_te1_1_np = X_te1_1.values
    y_te1_1_np = y_te1_1[sensor].values
    
    #return X_tr1_1_np[:val_idx], y_tr1_1_np[:val_idx], X_tr1_1_np[val_idx:], y_tr1_1_np[val_idx:], X_te1_1_np, y_te1_1_np
    return X_tr1_1_np, y_tr1_1_np, X_te1_1_np, y_te1_1_np

In [16]:
batch_size = 1 << 11   # as big as possible so we can explore many models
epochs = 1 << 5

In [17]:
from sklearn.model_selection import TimeSeriesSplit

longs_np = longs.index.values

In [18]:
def train_and_test_sensor(id_sensor=4):
    X_tr, y_tr, X_te, y_te = to_array(sensor=longs_np[id_sensor])
    
    
    # Validation using TS split (just to obtain different MAE estimations, no hyperoptimization for the moment)
    for tr_idx, va_idx in TimeSeriesSplit(n_splits=5).split(X_tr):
        model = make_model_sensor(id_sensor, n_sensors=17)
        model.compile(opt, loss='mean_absolute_error')
        model.fit(np.atleast_3d(X_tr[tr_idx]), y_tr[tr_idx], batch_size=batch_size, epochs=epochs, validation_data=
              (np.atleast_3d(X_tr[va_idx]),y_tr[va_idx]), callbacks=[c2, c3], verbose=0)
        print('MAE_val ', c3.history['val_loss'][-1])
    
    # Testing
    model = make_model_sensor(id_sensor, n_sensors=17)
    model.compile(opt, loss='mean_absolute_error')
    model.fit(np.atleast_3d(X_tr), y_tr, batch_size=batch_size, epochs=epochs, validation_data=
              (np.atleast_3d(X_te),y_te), callbacks=[c2, c3], verbose=0)
    
    print('MAE_test ', c3.history['val_loss'][-1])
    return longs_np[id_sensor], c3.history['val_loss'][-1]

In [None]:
maes = {}
for i in range(len(longs_np)):
    print(i, longs_np[i])
    sensor, mae = train_and_test_sensor(i)
    maes[sensor] = mae 

In [20]:
maes = pd.Series(maes, name='MAE').sort_values()

In [21]:
maes

AP3     11.054148
AP5     40.033432
DH8     40.209159
DH11    40.929952
DH9     43.142448
DH10    43.338742
DH6     43.868629
DH7     45.834521
AP1     46.314959
DH3     47.618087
DH5     48.752640
DH4     50.553688
DH1     50.940663
AP4     51.292125
DH2     52.278010
AP6     56.697464
AP7     57.063573
Name: MAE, dtype: float64