In [1]:
import os, sys, inspect
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
# Add the parent of the notebook's directory to the module search path so the
# local `airpollutionpy` package can be imported below.
# The kernel's working directory is used instead of the current frame's file:
# a notebook cell has no real source file, and kernels that compile cells under
# a temporary directory would make a frame-based lookup point at that temp dir.
# NOTE(review): assumes the kernel is started in the notebook's own directory
# (the usual Jupyter behaviour) -- confirm for other launchers.
currdir = os.path.abspath(os.getcwd())
rootdir = os.path.dirname(currdir)
# Keep rootdir first on sys.path without stacking duplicates on cell re-runs.
if sys.path[:1] != [rootdir]:
    sys.path.insert(0, rootdir)

from airpollutionpy.utils import parse_datetime
from airpollutionpy.utils import series_to_supervised

Using TensorFlow backend.


ImportError: cannot import name 'series_to_supervised'

In [None]:
# load data
# Read the raw CSV from the package's "ext" data folder. The four date-part
# columns are combined into one column and parsed with the project helper
# `parse_datetime`; column 0 of the parsed frame becomes the index.
fname = "PRSA_data_2010.1.1-2014.12.31.csv"
ffname = os.path.join(rootdir, "airpollutionpy", "data", "ext", fname)
date_parts = ['year', 'month', 'day', 'hour']
data = pd.read_csv(
    ffname,
    parse_dates=[date_parts],
    index_col=0,
    date_parser=parse_datetime,
)

In [None]:
# clean data
# Remove the "No" column and give the remaining eight columns short names.
data = data.drop("No", axis=1)
data.columns = ['pollution', 'dew', 'temp', 'press', 'wnd_dir', 'wnd_spd', 'snow', 'rain']
data.index.name = 'date'
# mark all NA values with 0
# The filled column is assigned back explicitly: calling fillna(inplace=True)
# on a column selection is not guaranteed to write through to `data`.
data['pollution'] = data['pollution'].fillna(0)
# drop the first 24 hours
data = data[24:]
# summarize first 5 rows
print(data.head(5))
# save to file
# `ffname` now points at the cleaned file; the plotting cell reloads from it.
ffname = os.path.join(rootdir, "airpollutionpy", "data", "int", "pollution.csv")
data.to_csv(ffname)

In [None]:
# plot trends
# Reload the cleaned data from the file written by the cleaning step.
data = pd.read_csv(ffname, header=0, index_col=0)
values = data.values
n_data, n_variables = data.shape
# Column positions to plot, one stacked panel each; position 4 is skipped.
groups = [0, 1, 2, 3, 5, 6, 7]
n_panels = len(groups)
plt.figure()
for panel, column in enumerate(groups, start=1):
    plt.subplot(n_panels, 1, panel)
    plt.plot(values[:, column])
    # Put the column name inside the panel, right-aligned at mid height.
    plt.title(data.columns[column], y=0.5, loc='right')
plt.show()

In [None]:
# --- model parameters ---
# column index of the target variable to predict
index_target = 0
# number of past time steps fed to the model per sample
n_lag = 3
# number of rows used for training: 365 days of 24 hourly rows
n_train = 365 * 24
# number of LSTM units
n_units = 50

# --- training parameters ---
optimizer = "adam"
loss = "mae"
n_epochs = 30
sz_batch = 72
verbose = 1

In [None]:
# pre-process data
# Replace the wind-direction labels in column 4 with integer codes.
encoder = LabelEncoder()
wind_dir_codes = encoder.fit_transform(values[:,4])
values[:,4] = wind_dir_codes
# cast all data types to float
values = values.astype('float32')
# normalize every variable to [0, 1]
# NOTE(review): the scaler is fitted on all rows, including those later used
# for validation -- confirm whether it should be fitted on training rows only.
minmax_scaler = MinMaxScaler(feature_range=(0, 1))
values_scaled = minmax_scaler.fit(values).transform(values)
# reframe sequential data as a supervised learning problem
# (n_lag input steps, 1 output step; done by the project helper)
reframed_df = series_to_supervised(values_scaled, n_lag, 1)
print(reframed_df.head(5))

In [None]:
# create train/valid data
# The first n_train rows of the reframed data are the training set; every
# remaining row goes to the validation set. Row order is preserved.
values = reframed_df.values
train_values, valid_values = values[:n_train], values[n_train:]

In [None]:
# split into inputs and targets
n_train, n_valid = len(train_values), len(valid_values)
# The leading n_lag * n_variables columns hold the lagged inputs.
n_observations = n_lag * n_variables
# The target is read from the trailing n_variables columns, at the target
# variable's position within that group (negative index counts from the end).
target_col = index_target - n_variables

# Inputs are reshaped to (samples, n_lag, n_variables) for the LSTM.
x_train = train_values[:, :n_observations].reshape((n_train, n_lag, n_variables))
y_train = train_values[:, target_col]
x_valid = valid_values[:, :n_observations].reshape((n_valid, n_lag, n_variables))
y_valid = valid_values[:, target_col]

print(f"Train Inputs Shape: {x_train.shape}, Train Targets Shape: {y_train.shape}")
print(f"Valid Inputs Shape: {x_valid.shape}, Valid Targets Shape: {y_valid.shape}")

In [None]:
# build model
# A single LSTM layer over (n_lag, n_variables) inputs, followed by a
# one-unit dense layer that outputs the predicted target value.
model = Sequential([
    LSTM(n_units, input_shape=(n_lag, n_variables)),
    Dense(1),
])
model.compile(optimizer=optimizer, loss=loss)

In [None]:
# train model
# shuffle=False keeps the samples in their original order during training.
fit_options = dict(
    epochs=n_epochs,
    batch_size=sz_batch,
    validation_data=(x_valid, y_valid),
    verbose=verbose,
    shuffle=False,
)
history = model.fit(x_train, y_train, **fit_options)

In [None]:
# plot history
# Draw the per-epoch training and validation loss curves on one axis.
for history_key, curve_label in (('loss', 'train'), ('val_loss', 'test')):
    plt.plot(history.history[history_key], label=curve_label)
plt.legend()
plt.show()

In [None]:
# make a prediction
# Run the trained model on the validation inputs.
yhat_valid = model.predict(x=x_valid)


In [None]:
# invert normalization/scaling for prediction on valid set
x_valid = x_valid.reshape((n_valid, n_lag*n_variables))
xyhat_valid = x_valid[:, -n_variables:].copy()
xyhat_valid[:,[index_target]] = yhat_valid

In [None]:
# invert normalization/scaling for input valid set
y_valid = y_valid.reshape((len(y_valid), 1))
xy_valid = x_valid[:, -n_variables:].copy()
xy_valid[:,[index_target]] = y_valid

In [None]:
# calculate RMSE
# Compare the target column of the ground-truth and prediction arrays.
y_true = xy_valid[:, [index_target]]
y_pred = xyhat_valid[:, [index_target]]
rmse = np.sqrt(mean_squared_error(y_true, y_pred))
print('Test RMSE: %.3f' % rmse)

In [None]:
# Plot the ground truth and the prediction for the target variable on the
# validation set, one panel each, against the sample position.
figsize = (20, 8)
fig = plt.figure(figsize=figsize)
ax = fig.subplots(2, 1)
# Top panel: true values (xy_valid, not the prediction array).
ax[0].plot(np.arange(0, xy_valid.shape[0]), xy_valid[:, index_target], label="Ground Truth")
ax[0].legend()
# Bottom panel: model predictions.
ax[1].plot(np.arange(0, xyhat_valid.shape[0]), xyhat_valid[:, index_target], label="prediction")
ax[1].legend()