# Build train and test matrices

In [1]:
import pandas as pd
import numpy as np
import feather

# Load the minute-level Oahu data set from its feather file, then index the
# rows by the 'Datetime' column.
# NOTE(review): the path is absolute and machine-specific — consider making it configurable.
df = feather.read_dataframe('/home/SHARED/SOLAR/data/oahu_min.feather')
df = df.set_index('Datetime')

In [2]:
# https://stackoverflow.com/questions/15722324/sliding-window-in-numpy
def window_stack_forward(a, stepsize=1, width=3):
    """Stack `width` forward-shifted slices of `a` side by side.

    Block ``i`` of the result (for ``i`` in ``0 .. width-1``) is
    ``a[i : 1 + i - width : stepsize]``. With ``stepsize=1`` the output has
    ``len(a) - width + 1`` rows, and row ``r`` is the horizontal
    concatenation of ``a[r], a[r+1], ..., a[r+width-1]``.

    Parameters
    ----------
    a : array-like supporting slicing (e.g. ``np.ndarray``)
        Input whose first axis is the one the window slides along.
    stepsize : int, default 1
        Stride applied to every slice.
    width : int, default 3
        Number of shifted slices stacked horizontally.

    Returns
    -------
    np.ndarray
        The horizontally stacked slices.
    """
    # For the last shift (i == width - 1) the stop index evaluates to 0;
    # `or None` turns it into "until the end" instead of an empty slice.
    # np.hstack must receive a real sequence: passing a generator has been
    # deprecated since NumPy 1.16, so the slices are collected in a list.
    return np.hstack([a[i:1 + i - width or None:stepsize] for i in range(width)])

In [3]:
# I feel this function can also be done for pd.DataFrame
def window_stack(a, width=3):
    """Stack `width` backward-lagged slices of `a` side by side.

    Block ``lag`` of the result (for ``lag`` in ``0 .. width-1``) is
    ``a[width-1-lag : n-lag]`` with ``n = a.shape[0]``, so block 0 holds the
    most recent rows and each following block is shifted one row further
    into the past. Row ``r`` of the output therefore contains
    ``a[r+width-1], a[r+width-2], ..., a[r]`` concatenated horizontally.

    Parameters
    ----------
    a : np.ndarray
        Input array; the window slides along its first axis.
    width : int, default 3
        Number of lagged slices to stack.

    Returns
    -------
    np.ndarray
        The horizontally stacked slices.
    """
    n_rows = a.shape[0]
    lagged_blocks = []
    for lag in range(width):
        first = width - 1 - lag
        last = n_rows - lag
        lagged_blocks.append(a[first:last])
    return np.hstack(lagged_blocks)

In [4]:
# Window length: each output row holds `width` consecutive time steps.
# `df.values` is used for compatibility with older pandas; from pandas 0.24
# onwards `df.to_numpy()` is the preferred spelling. Take care if df has
# non-numeric columns.
width = 61
a = window_stack(df.values, width)

In [5]:
# Lag labels: 't' for lag 0, then 't-1', 't-2', ... up to 't-{width-1}'.
times = ['t-{:d}'.format(lag) if lag else 't' for lag in range(width)]
# Two-level column index with 'time' as the outer level and 'location' as
# the inner one, i.e. all locations for 't', then all locations for 't-1', ...
columns = pd.MultiIndex.from_product([times, df.columns], names=['time', 'location'])

In [6]:
# Wrap the stacked array in a DataFrame so rows and columns carry labels.
# The index starts at position width-1 of the original index, matching the
# rows produced by the windowing step.
df_roll = pd.DataFrame(data=a, columns=columns, index=df.index[width - 1:])

In [7]:
# Target: the columns at time 't'.
# Predictors: everything else, i.e. times t-1 down to t-width+1.
y = df_roll.loc[:, 't']
X = df_roll.drop(labels='t', axis='columns', level='time')

In [8]:
# Chronological train/test split: rows up to and including 2011-07-31 are
# used for training, rows from 2011-08-01 onwards for testing
# (approximately 12 and 4 months respectively per the original note — TODO confirm).
# The slices are label-based; presumably the index is a DatetimeIndex — verify.
X_train = X.loc[:'2011-07-31']
X_test = X.loc['2011-08-01':]
y_train = y.loc[:'2011-07-31']
y_test = y.loc['2011-08-01':]

In [9]:
# Show the shape of each split, one per line
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape, sep='\n')

(449885, 1140)
(82892, 1140)
(449885, 19)
(82892, 19)


# Naive (aka persistence) predictor

We compute the error obtained when predicting each sensor's value with its immediately preceding observation. This is intended as a simple (though sometimes effective) baseline.

In [10]:
# Persistence forecast: the prediction for time t is the observation at t-1.
# Selecting 't-1' on the outer column level leaves one column per location.
# (The former np.zeros_like pre-allocation was a dead store: it was
# overwritten immediately and never read.)
y_pred = X_test['t-1']

In [11]:
# Display the prediction columns: one label per sensor location
y_pred.columns

Index(['GH_DH3', 'GH_DH4', 'GH_DH5', 'GH_DH10', 'GH_DH11', 'GH_DH9', 'GH_DH2',
       'GH_DH1', 'GT_DH1', 'GH_AP6', 'GT_AP6', 'GH_AP1', 'GH_AP3', 'GH_AP5',
       'GH_AP4', 'GH_AP7', 'GH_DH6', 'GH_DH7', 'GH_DH8'],
      dtype='object', name='location')

In [12]:
from sklearn.metrics import mean_absolute_error

In [13]:
# Mean absolute error per sensor: multioutput='raw_values' returns one error
# per output column instead of a single averaged score.
mae = pd.Series(
    mean_absolute_error(y_test, y_pred, multioutput='raw_values'),
    index=y_test.columns,
)
# NOTE(review): an MAE of exactly 0 for a sensor would mean its test readings
# never change from one step to the next — check that sensor's data.
mae.sort_values()

location
GH_AP3      0.000000
GT_DH1     43.546326
GT_AP6     45.910107
GH_DH2     55.875255
GH_DH1     56.050096
GH_DH4     56.824072
GH_AP7     57.005657
GH_DH11    57.096757
GH_DH10    57.214625
GH_DH8     57.261513
GH_DH7     57.296253
GH_AP4     57.439284
GH_DH5     57.471329
GH_AP1     57.739932
GH_DH6     58.379390
GH_AP5     58.714546
GH_DH9     59.378966
GH_DH3     60.004030
GH_AP6     60.582939
dtype: float64