In [None]:
# References for the example below:
# https://machinelearningmastery.com/multivariate-time-series-forecasting-lstms-keras/
# https://machinelearningmastery.com/how-to-develop-lstm-models-for-time-series-forecasting/
# https://machinelearningmastery.com/how-to-develop-convolutional-neural-network-models-for-time-series-forecasting/
# https://machinelearningmastery.com/how-to-develop-lstm-models-for-multi-step-time-series-forecasting-of-household-power-consumption/

In [1]:
from math import sqrt
from numpy import concatenate
from matplotlib import pyplot
from pandas import read_csv
from pandas import DataFrame
from pandas import concat
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
import pandas as pd


In [2]:
# convert series to supervised learning
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
    """Frame a time series as a supervised-learning table of shifted columns.

    Parameters
    ----------
    data : list, array, Series or DataFrame
        Observations ordered in time, one row per time step. Anything that
        ``DataFrame(data)`` accepts works, including 1-D input and lists of rows.
    n_in : int
        Number of lag steps to include as inputs (t-n_in ... t-1).
    n_out : int
        Number of steps to include as outputs (t ... t+n_out-1).
    dropnan : bool
        When true, drop every row that contains a NaN. Shifting always
        introduces NaN at the edges; NaN already present in ``data`` also
        causes rows to be dropped.

    Returns
    -------
    DataFrame
        ``n_vars * (n_in + n_out)`` columns named ``var<j>(t-<i>)``,
        ``var<j>(t)`` and ``var<j>(t+<i>)``, keeping the original row index.
    """
    df = DataFrame(data)
    # Count variables on the constructed frame rather than on the raw input:
    # `data.shape[1]` fails for 1-D arrays/Series, and treating every list as a
    # single variable breaks the column naming for a list of multi-value rows.
    n_vars = df.shape[1]
    cols, names = [], []
    # input sequence (t-n, ... t-1)
    for i in range(n_in, 0, -1):
        cols.append(df.shift(i))
        names += ['var%d(t-%d)' % (j + 1, i) for j in range(n_vars)]
    # forecast sequence (t, t+1, ... t+n)
    for i in range(n_out):
        cols.append(df.shift(-i))
        if i == 0:
            names += ['var%d(t)' % (j + 1) for j in range(n_vars)]
        else:
            names += ['var%d(t+%d)' % (j + 1, i) for j in range(n_vars)]
    # put it all together
    agg = concat(cols, axis=1)
    agg.columns = names
    # drop rows with NaN values
    if dropnan:
        agg = agg.dropna()
    return agg


In [3]:

# load dataset
# header=0 takes the column names from the first line of the file;
# index_col=0 uses the first column (shown as 'No' in the output) as the index.
dataset = read_csv('pollution.txt', header=0, index_col=0)
dataset

Unnamed: 0_level_0,year,month,day,hour,pm2.5,DEWP,TEMP,PRES,cbwd,Iws,Is,Ir
No,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,2010,1,1,0,,-21,-11.0,1021.0,NW,1.79,0,0
2,2010,1,1,1,,-21,-12.0,1020.0,NW,4.92,0,0
3,2010,1,1,2,,-21,-11.0,1019.0,NW,6.71,0,0
4,2010,1,1,3,,-21,-14.0,1019.0,NW,9.84,0,0
5,2010,1,1,4,,-20,-12.0,1018.0,NW,12.97,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
43820,2014,12,31,19,8.0,-23,-2.0,1034.0,NW,231.97,0,0
43821,2014,12,31,20,10.0,-22,-3.0,1034.0,NW,237.78,0,0
43822,2014,12,31,21,10.0,-22,-3.0,1034.0,NW,242.70,0,0
43823,2014,12,31,22,8.0,-22,-4.0,1034.0,NW,246.72,0,0


In [4]:
# Combine the four calendar columns into one timestamp, drop them, and index
# the frame by that timestamp (the previous index is discarded).
dataset = (
    dataset
    .assign(date=lambda frame: pd.to_datetime(frame[['year', 'month', 'day', 'hour']]))
    .drop(columns=['year', 'month', 'day', 'hour'])
    .set_index(keys='date')
)
dataset

Unnamed: 0_level_0,pm2.5,DEWP,TEMP,PRES,cbwd,Iws,Is,Ir
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2010-01-01 00:00:00,,-21,-11.0,1021.0,NW,1.79,0,0
2010-01-01 01:00:00,,-21,-12.0,1020.0,NW,4.92,0,0
2010-01-01 02:00:00,,-21,-11.0,1019.0,NW,6.71,0,0
2010-01-01 03:00:00,,-21,-14.0,1019.0,NW,9.84,0,0
2010-01-01 04:00:00,,-20,-12.0,1018.0,NW,12.97,0,0
...,...,...,...,...,...,...,...,...
2014-12-31 19:00:00,8.0,-23,-2.0,1034.0,NW,231.97,0,0
2014-12-31 20:00:00,10.0,-22,-3.0,1034.0,NW,237.78,0,0
2014-12-31 21:00:00,10.0,-22,-3.0,1034.0,NW,242.70,0,0
2014-12-31 22:00:00,8.0,-22,-4.0,1034.0,NW,246.72,0,0


In [5]:
# Column summary: dtypes and non-null counts. In the output, pm2.5 is the only
# column with missing values and cbwd is the only non-numeric (object) column.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 43824 entries, 2010-01-01 00:00:00 to 2014-12-31 23:00:00
Data columns (total 8 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   pm2.5   41757 non-null  float64
 1   DEWP    43824 non-null  int64  
 2   TEMP    43824 non-null  float64
 3   PRES    43824 non-null  float64
 4   cbwd    43824 non-null  object 
 5   Iws     43824 non-null  float64
 6   Is      43824 non-null  int64  
 7   Ir      43824 non-null  int64  
dtypes: float64(4), int64(3), object(1)
memory usage: 3.0+ MB


In [6]:
# Pull the frame out as a NumPy array. Because cbwd holds strings, the array
# comes back with dtype=object rather than a numeric dtype.
values = dataset.values
values

array([[nan, -21, -11.0, ..., 1.79, 0, 0],
       [nan, -21, -12.0, ..., 4.92, 0, 0],
       [nan, -21, -11.0, ..., 6.71, 0, 0],
       ...,
       [10.0, -22, -3.0, ..., 242.7, 0, 0],
       [8.0, -22, -4.0, ..., 246.72, 0, 0],
       [12.0, -21, -3.0, ..., 249.85, 0, 0]], dtype=object)

In [7]:
# First row: pm2.5 (position 0) is NaN and position 4 holds the wind-direction string.
values[0]

array([nan, -21, -11.0, 1021.0, 'NW', 1.79, 0, 0], dtype=object)

In [8]:
# Column 4 is the string-valued cbwd column; show it together with its length.
values[:, 4], len(values[:, 4])

(array(['NW', 'NW', 'NW', ..., 'NW', 'NW', 'NW'], dtype=object), 43824)

In [9]:
# integer encode direction
# Column 4 is cbwd (the only string column); LabelEncoder replaces each distinct
# label with an integer code, written back into the same column of `values`.
encoder = LabelEncoder()
values[:,4] = encoder.fit_transform(values[:,4])


In [12]:
# Sanity check: after encoding, column 4 should contain only small integer codes.
values[:, 4].min(), values[:, 4].max()

(0, 3)

In [13]:

# Cast the object array to float32 now that every column is numeric.
values = values.astype('float32')
# Learn each column's min/max, then rescale every feature into [0, 1].
scaler = MinMaxScaler(feature_range=(0, 1))
scaled = scaler.fit(values).transform(values)


In [21]:
# Scaling keeps the shape: same number of rows as `dataset`, 8 feature columns.
scaled.shape

(43824, 8)

In [16]:
# Width of a single row, i.e. the number of feature columns.
len(scaled[0])

8

In [18]:
# First scaled row: the missing pm2.5 value is still NaN after scaling.
scaled[0]

array([       nan, 0.2794118 , 0.13114753, 0.545454  , 0.33333334,
       0.00229001, 0.        , 0.        ], dtype=float32)

In [None]:
# convert series to supervised learning
# NOTE: this cell re-defines the function already defined near the top of the
# notebook; on a full run this definition is the one later cells call. Keep the
# two in sync, or delete one of them.
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
    """Frame a time series as a supervised-learning table of shifted columns.

    Parameters
    ----------
    data : list, array, Series or DataFrame
        Observations ordered in time, one row per time step. Anything that
        ``DataFrame(data)`` accepts works, including 1-D input and lists of rows.
    n_in : int
        Number of lag steps to include as inputs (t-n_in ... t-1).
    n_out : int
        Number of steps to include as outputs (t ... t+n_out-1).
    dropnan : bool
        When true, drop every row that contains a NaN (from shifting or from
        missing values already present in ``data``).

    Returns
    -------
    DataFrame
        ``n_vars * (n_in + n_out)`` columns named ``var<j>(t-<i>)``,
        ``var<j>(t)`` and ``var<j>(t+<i>)``, keeping the original row index.
    """
    df = DataFrame(data)
    # Count variables on the constructed frame: `data.shape[1]` fails for 1-D
    # arrays/Series, and assuming one variable for any list mislabels a list of rows.
    n_vars = df.shape[1]
    cols, names = [], []
    # input sequence (t-n, ... t-1)
    for i in range(n_in, 0, -1):
        cols.append(df.shift(i))
        names += ['var%d(t-%d)' % (j + 1, i) for j in range(n_vars)]
    # forecast sequence (t, t+1, ... t+n)
    for i in range(n_out):
        cols.append(df.shift(-i))
        if i == 0:
            names += ['var%d(t)' % (j + 1) for j in range(n_vars)]
        else:
            names += ['var%d(t+%d)' % (j + 1, i) for j in range(n_vars)]
    # put it all together
    agg = concat(cols, axis=1)
    agg.columns = names
    # drop rows with NaN values
    if dropnan:
        agg = agg.dropna()
    return agg


In [22]:
# Scratch check of the n_vars expression from series_to_supervised, applied to `scaled`.
1 if type(scaled) is list else scaled.shape[1]

8

In [23]:
# Wrap the scaled array in a DataFrame to try out shift() by hand;
# the columns get default integer labels 0..7.
df = DataFrame(scaled)
df

Unnamed: 0,0,1,2,3,4,5,6,7
0,,0.279412,0.131148,0.545454,0.333333,0.002290,0.0,0.0
1,,0.279412,0.114754,0.527273,0.333333,0.007639,0.0,0.0
2,,0.279412,0.131148,0.509090,0.333333,0.010698,0.0,0.0
3,,0.279412,0.081967,0.509090,0.333333,0.016047,0.0,0.0
4,,0.294118,0.114754,0.490910,0.333333,0.021396,0.0,0.0
...,...,...,...,...,...,...,...,...
43819,0.008048,0.250000,0.278689,0.781818,0.333333,0.395659,0.0,0.0
43820,0.010060,0.264706,0.262295,0.781818,0.333333,0.405588,0.0,0.0
43821,0.010060,0.264706,0.262295,0.781818,0.333333,0.413996,0.0,0.0
43822,0.008048,0.264706,0.245902,0.781818,0.333333,0.420866,0.0,0.0


In [25]:
# shift(-1) moves every row up by one position, so row i shows the values of row i+1.
df.shift(-1)

Unnamed: 0,0,1,2,3,4,5,6,7
0,,0.279412,0.114754,0.527273,0.333333,0.007639,0.0,0.0
1,,0.279412,0.131148,0.509090,0.333333,0.010698,0.0,0.0
2,,0.279412,0.081967,0.509090,0.333333,0.016047,0.0,0.0
3,,0.294118,0.114754,0.490910,0.333333,0.021396,0.0,0.0
4,,0.308824,0.147541,0.472727,0.333333,0.026745,0.0,0.0
...,...,...,...,...,...,...,...,...
43819,0.010060,0.264706,0.262295,0.781818,0.333333,0.405588,0.0,0.0
43820,0.010060,0.264706,0.262295,0.781818,0.333333,0.413996,0.0,0.0
43821,0.008048,0.264706,0.245902,0.781818,0.333333,0.420866,0.0,0.0
43822,0.012072,0.279412,0.262295,0.781818,0.333333,0.426216,0.0,0.0


In [26]:
# specify the number of lag hours
n_hours = 3
# Take the feature count from the scaled array instead of hard-coding 8, so the
# later column slicing cannot drift out of step with the data.
n_features = scaled.shape[1]
# frame as supervised learning: n_hours lag blocks plus one block for time t
reframed = series_to_supervised(scaled, n_hours, 1)
# every block has n_features columns; fail loudly if the layout is not what
# the slicing in later cells expects
assert reframed.shape[1] == (n_hours + 1) * n_features
print(reframed.shape)

(41157, 32)


In [27]:
# Inspect the supervised frame: three lag blocks (t-3, t-2, t-1) followed by the
# var*(t) block. The index does not start at 0 because rows containing NaN were dropped.
reframed

Unnamed: 0,var1(t-3),var2(t-3),var3(t-3),var4(t-3),var5(t-3),var6(t-3),var7(t-3),var8(t-3),var1(t-2),var2(t-2),...,var7(t-1),var8(t-1),var1(t),var2(t),var3(t),var4(t),var5(t),var6(t),var7(t),var8(t)
27,0.129779,0.352941,0.245902,0.527273,0.666667,0.002290,0.000000,0.0,0.148893,0.367647,...,0.000000,0.0,0.182093,0.485294,0.229508,0.563637,0.666667,0.008391,0.037037,0.0
28,0.148893,0.367647,0.245902,0.527273,0.666667,0.003811,0.000000,0.0,0.159960,0.426471,...,0.037037,0.0,0.138833,0.485294,0.229508,0.563637,0.666667,0.009912,0.074074,0.0
29,0.159960,0.426471,0.229508,0.545454,0.666667,0.005332,0.000000,0.0,0.182093,0.485294,...,0.074074,0.0,0.109658,0.485294,0.213115,0.563637,0.666667,0.011433,0.111111,0.0
30,0.182093,0.485294,0.229508,0.563637,0.666667,0.008391,0.037037,0.0,0.138833,0.485294,...,0.111111,0.0,0.105634,0.485294,0.213115,0.581818,0.666667,0.014492,0.148148,0.0
31,0.138833,0.485294,0.229508,0.563637,0.666667,0.009912,0.074074,0.0,0.109658,0.485294,...,0.148148,0.0,0.124748,0.485294,0.229508,0.600000,0.666667,0.017551,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43819,0.008048,0.250000,0.311475,0.745455,0.333333,0.365103,0.000000,0.0,0.009054,0.264706,...,0.000000,0.0,0.008048,0.250000,0.278689,0.781818,0.333333,0.395659,0.000000,0.0
43820,0.009054,0.264706,0.295082,0.763638,0.333333,0.377322,0.000000,0.0,0.010060,0.264706,...,0.000000,0.0,0.010060,0.264706,0.262295,0.781818,0.333333,0.405588,0.000000,0.0
43821,0.010060,0.264706,0.278689,0.763638,0.333333,0.385730,0.000000,0.0,0.008048,0.250000,...,0.000000,0.0,0.010060,0.264706,0.262295,0.781818,0.333333,0.413996,0.000000,0.0
43822,0.008048,0.250000,0.278689,0.781818,0.333333,0.395659,0.000000,0.0,0.010060,0.264706,...,0.000000,0.0,0.008048,0.264706,0.245902,0.781818,0.333333,0.420866,0.000000,0.0


In [28]:

# Chronological split: the first 365 * 24 rows train the model, the rest test it.
# Rows with NaN were dropped when reframing, so this is a row count rather than
# an exact calendar year.
n_train_hours = 365 * 24
values = reframed.values
train, test = values[:n_train_hours, :], values[n_train_hours:, :]


In [33]:
# 365 * 24 = 8760 training rows, each with all 32 reframed columns.
train.shape

(8760, 32)

In [39]:
# Width of the lagged-input slice. Computed inline because `n_obs` is only
# assigned in a later cell, so referencing it here fails on a fresh top-to-bottom run.
train[:, :n_hours * n_features].shape, n_hours * n_features

((8760, 24), 24)

In [38]:
# Column -n_features is the first of the trailing var*(t) columns, i.e. var1(t);
# indexing a single column gives a 1-D array.
train[:, -n_features].shape, n_features

((8760,), 8)

In [42]:
# Any single-column index (here the second-to-last column) likewise yields a 1-D array.
train[:, -2].shape

(8760,)

In [29]:
# Separate model inputs from the target.
# Inputs: the first n_obs columns (all lagged features).
# Target: column -n_features, which is var1(t) in the reframed layout.
n_obs = n_hours * n_features
train_X, test_X = train[:, :n_obs], test[:, :n_obs]
train_y, test_y = train[:, -n_features], test[:, -n_features]
print(train_X.shape, len(train_X), train_y.shape)


(8760, 24) 8760 (8760,)


In [34]:
# Confirm the 2-D input matrix and 1-D target before reshaping for the LSTM.
train_X.shape, train_y.shape

((8760, 24), (8760,))

In [43]:
# The LSTM expects 3-D input [samples, timesteps, features]; the sample count is
# inferred (-1) from the fixed n_hours * n_features row width.
train_X = train_X.reshape(-1, n_hours, n_features)
test_X = test_X.reshape(-1, n_hours, n_features)
print(train_X.shape, train_y.shape, test_X.shape, test_y.shape)


(8760, 3, 8) (8760,) (32397, 3, 8) (32397,)


In [None]:
 
# Network: one 50-unit LSTM layer over (timesteps, features) windows feeding a
# single-output dense layer; trained on mean absolute error with Adam.
model = Sequential([
    LSTM(50, input_shape=train_X.shape[1:]),
    Dense(1),
])
model.compile(optimizer='adam', loss='mae')
# Fit without shuffling so batches keep their time order; the test split is
# passed as validation data only to track its loss after each epoch.
history = model.fit(
    train_X,
    train_y,
    epochs=50,
    batch_size=72,
    validation_data=(test_X, test_y),
    verbose=2,
    shuffle=False,
)
# Training vs. validation loss per epoch.
pyplot.plot(history.history['loss'], label='train')
pyplot.plot(history.history['val_loss'], label='test')
pyplot.legend()
pyplot.show()
 


In [None]:
# make a prediction
yhat = model.predict(test_X)
# Flatten into a separate array instead of rebinding test_X: overwriting the
# 3-D input with a 2-D one made this cell fail at model.predict() when re-run.
test_X_flat = test_X.reshape((test_X.shape[0], n_hours * n_features))
# The scaler was fitted on n_features columns, so the single predicted/actual
# column has to be padded back to that width before inverse_transform. The
# padding is the last n_features - 1 columns of the flattened input (previously
# a hard-coded 7); only column 0 of the result is kept.
other_features = test_X_flat[:, -(n_features - 1):]
# invert scaling for forecast
inv_yhat = concatenate((yhat, other_features), axis=1)
inv_yhat = scaler.inverse_transform(inv_yhat)[:, 0]
# invert scaling for actual (column vector built under a new name so test_y
# keeps its 1-D shape across re-runs)
test_y_col = test_y.reshape((len(test_y), 1))
inv_y = concatenate((test_y_col, other_features), axis=1)
inv_y = scaler.inverse_transform(inv_y)[:, 0]
# calculate RMSE in the original (unscaled) units of the target
rmse = sqrt(mean_squared_error(inv_y, inv_yhat))
print('Test RMSE: %.3f' % rmse)


In [None]:
# Raw model output; these values are still in scaled units (the inverse-transformed
# forecast is inv_yhat).
yhat

In [None]:
# Number of predictions; should match the number of rows in the test split.
len(yhat)

In [None]:
# NOTE(review): `s` is not defined anywhere in the visible notebook, so this cell
# looks like leftover scratch input and would raise NameError on a fresh run —
# confirm and delete.
s