# Build train and test matrices

In [35]:
import pandas as pd
import numpy as np
import feather

df = (feather.read_dataframe('/home/SHARED/SOLAR/data/oahu_min.feather')
             .set_index('Datetime'))

In [36]:
df.head()

Unnamed: 0_level_0,GH_DH3,GH_DH4,GH_DH5,GH_DH10,GH_DH11,GH_DH9,GH_DH2,GH_DH1,GT_DH1,GH_AP6,GT_AP6,GH_AP1,GH_AP3,GH_AP5,GH_AP4,GH_AP7,GH_DH6,GH_DH7,GH_DH8
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2010-03-19 14:15:00,593.28,772.944,494.34,965.019,547.24,1020.85,407.945,382.189,361.729,353.928,343.313,476.328,382.777,390.092,351.61,354.152,1107.31,497.199,659.911
2010-03-19 14:16:00,363.136,360.969,331.889,633.846,412.742,453.608,330.478,690.88,702.998,336.387,315.673,336.512,331.889,334.53,338.346,362.713,1078.8,530.448,1156.02
2010-03-19 14:17:00,320.692,325.264,343.064,319.268,318.324,314.508,513.484,1132.58,1176.44,407.717,373.86,335.044,431.793,334.528,369.882,555.355,331.587,363.059,329.172
2010-03-19 14:18:00,1213.91,1156.67,1161.93,595.967,511.785,1178.34,1140.33,1132.94,1174.39,1087.11,1064.84,1160.36,1177.52,1094.64,1194.57,1160.13,853.309,1151.47,365.224
2010-03-19 14:19:00,1174.3,1140.19,1145.85,1138.18,1143.04,1165.14,1134.99,1128.28,1170.71,1197.79,1204.11,1142.38,1149.08,1162.15,1150.49,333.524,1153.34,1144.97,1142.7


In [37]:
# https://stackoverflow.com/questions/15722324/sliding-window-in-numpy
def window_stack_forward(a, stepsize=1, width=3):
    return np.hstack( a[i:1+i-width or None:stepsize] for i in range(0, width) )

In [38]:
# I feel this function can also be done for pd.DataFrame
def window_stack(a, width=3):
    n = a.shape[0]
    return np.hstack(list(a[(width-1-i):(n-i)] for i in range(0, width)))

In [64]:
# In pandas 0.24, use df.to_numpy() instead of df.values. Also care with non-numeric columns
width = 2
a = window_stack(df.values, width=width)

In [65]:
times   = [ ('t' if not idx else 't-{:d}'.format(idx)) for idx in range(width) ]
columns = pd.MultiIndex.from_product((times, df.columns), names=('time', 'location'))

In [66]:
# Convert back to DataFrame, just for convenience of having indexes
df_roll = pd.DataFrame(a, index=df.index[width-1:], columns=columns)

In [67]:
# Split target (time t) and variables (times t-1 to t-width+1)
y = df_roll['t']
X = df_roll.drop(columns='t', level='time')

In [68]:
# Split train-test, approximately 12 and 4 months respectively
X_train, X_test = X[:'2011-07-31'], X['2011-08-01':]
y_train, y_test = y[:'2011-07-31'], y['2011-08-01':]

In [69]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(449944, 19)
(82892, 19)
(449944, 19)
(82892, 19)


# Train Robust Regression model

In [71]:
from sklearn.linear_model import (
    LinearRegression, TheilSenRegressor, RANSACRegressor, HuberRegressor)
from sklearn.pipeline import make_pipeline
##
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import TimeSeriesSplit

estimators = [('HuberRegressor', HuberRegressor())]

for name, estimator in estimators:
        model = estimator
        model.fit(X, y.GH_DH11)
        mae = mean_absolute_error(model.predict(X_test), y_test.GH_AP1)
        print(mae)

54.439773944320734


In [72]:
from sklearn.linear_model import MultiTaskElasticNetCV
from sklearn.multioutput import MultiOutputRegressor

huber_reg = MultiOutputRegressor(HuberRegressor())
huber_reg.fit(X_train, y_train)

MultiOutputRegressor(estimator=HuberRegressor(alpha=0.0001, epsilon=1.35, fit_intercept=True, max_iter=100,
        tol=1e-05, warm_start=False),
           n_jobs=1)

In [73]:
y_test_pred = pd.DataFrame(huber_reg.predict(X_test), index=y_test.index, columns=y_test.columns)

In [74]:
res = pd.concat((y_test, y_test_pred), axis=1, keys=['Actual', 'Pred'])

In [75]:
res.stack(level='location').groupby('location').apply(lambda s: mean_absolute_error(s['Actual'], s['Pred'])).sort_values()

location
GH_AP3      3.569474
GT_DH1     41.994406
GH_AP5     42.463516
GH_DH8     42.530231
GH_DH11    43.688298
GH_DH6     44.380105
GT_AP6     44.657527
GH_DH9     44.734481
GH_DH10    44.886628
GH_DH7     47.039244
GH_DH4     48.944709
GH_AP1     49.950864
GH_DH5     51.607809
GH_DH1     51.899124
GH_DH3     52.270199
GH_DH2     52.747690
GH_AP4     53.320186
GH_AP7     56.597601
GH_AP6     57.321147
dtype: float64