In [None]:
from pandas import read_csv
from pandas import DataFrame
from pandas import concat

import pandas as pd

from math import sqrt
from numpy import concatenate

from datetime import datetime
from matplotlib import pyplot

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error

from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM

In [None]:
# loading data
def parse(x):
    """Convert a 'YYYY MM DD HH' string (space-separated) into a datetime."""
    timestamp_format = '%Y %m %d %H'
    return datetime.strptime(x, timestamp_format)
# The timestamp is spread over four integer columns in the raw file.
date_parts = ['year', 'month', 'day', 'hour']
raw = read_csv('raw.csv')
# Assemble the timestamp index with pd.to_datetime instead of read_csv's
# nested parse_dates / date_parser options, which are deprecated in pandas 2.x.
raw.index = pd.to_datetime(raw[date_parts])
# Dropping the row counter 'No' and the now-redundant date component columns
dataset = raw.drop(columns=['No'] + date_parts)
# Specifying column names
dataset.columns = ['pollution', 'dew','temp','press','wnd_dir','wnd_spd','snow','rain']
dataset.index.name = 'date'
# Marking NaN values as 0.
# Assign the result back: a chained `dataset['pollution'].fillna(..., inplace=True)`
# works on a temporary under pandas Copy-on-Write and would leave the NaNs in place.
dataset['pollution'] = dataset['pollution'].fillna(0)
# drop the first 24 hours (first 24 rows, by position)
dataset = dataset.iloc[24:]
# checking the head
print(dataset.head(5))
# saving as CSV
dataset.to_csv('pollution.csv')

In [None]:
# Reload the cleaned dataset from disk
dataset = read_csv('pollution.csv', header=0, index_col=0)
values = dataset.values
# Column positions to plot; wind direction (wnd_dir, column 4) is skipped
# because it is categorical
groups = [0, 1, 2, 3, 5, 6, 7]
n_panels = len(groups)
# One stacked panel per selected column, titled with the column name
pyplot.figure()
for i, group in enumerate(groups, start=1):
    pyplot.subplot(n_panels, 1, i)
    pyplot.plot(values[:, group])
    pyplot.title(dataset.columns[group], y=0.5, loc='right')
pyplot.show()

In [None]:
# convert series to supervised learning
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
    """Frame a time series as a table of lagged inputs and forecast outputs.

    Parameters
    ----------
    data : list or array-like
        Observations in time order, one row per time step. May be a flat
        list (single variable), a list of rows, or a 1-D/2-D array.
    n_in : int, default 1
        Number of lag steps to include as inputs (t-n_in ... t-1).
    n_out : int, default 1
        Number of steps to include as outputs (t ... t+n_out-1).
    dropnan : bool, default True
        If True, drop every row that contains a NaN (the shifts introduce
        NaNs at the start and end of the table).

    Returns
    -------
    DataFrame
        Shifted copies of the data side by side, with columns named
        'var<j>(t-<i>)', 'var<j>(t)' and 'var<j>(t+<i>)'.
    """
    df = DataFrame(data)
    # Count variables from the frame itself so that a list of rows and a
    # 1-D array are handled as well as a flat list or a 2-D array.
    n_vars = df.shape[1]
    cols, names = [], []
    # input sequence (t-n, ... t-1)
    for i in range(n_in, 0, -1):
        cols.append(df.shift(i))
        names += ['var%d(t-%d)' % (j + 1, i) for j in range(n_vars)]
    # forecast sequence (t, t+1, ... t+n)
    for i in range(n_out):
        cols.append(df.shift(-i))
        if i == 0:
            names += ['var%d(t)' % (j + 1) for j in range(n_vars)]
        else:
            names += ['var%d(t+%d)' % (j + 1, i) for j in range(n_vars)]
    # put it all together
    agg = concat(cols, axis=1)
    agg.columns = names
    # drop rows with NaN values
    if dropnan:
        agg = agg.dropna()
    return agg

In [None]:
# Label-encode the categorical wind direction (column 4) as integers
encoder = LabelEncoder()
wind_codes = encoder.fit_transform(values[:, 4])
values[:, 4] = wind_codes

# Cast every column to float32
values = values.astype('float32')

# Rescale each feature into the [0, 1] range
scaler = MinMaxScaler(feature_range=(0, 1))
scaled = scaler.fit_transform(values)

# Frame as supervised learning: one lag step in, one step out
reframed = series_to_supervised(scaled, 1, 1)

# Drop columns 9..15 so only one current-step column (position 8) remains as
# the prediction target -- assumes 8 features per time step
unused_targets = reframed.columns[list(range(9, 16))]
reframed = reframed.drop(columns=unused_targets)
print(reframed.head())

In [None]:
# Split the reframed dataset: the first 365 * 24 rows train, the rest test
values = reframed.values
n_train_hours = 365 * 24
train, test = values[:n_train_hours, :], values[n_train_hours:, :]
print(train.shape)

In [None]:
# Inputs (X) are every column except the last; the target (y) is the last column.
# Training rows:
train_X = train[:, :-1]
train_y = train[:, -1]
# Test rows:
test_X = test[:, :-1]
test_y = test[:, -1]


In [None]:
# reshape input to be 3D [samples, timesteps, features]
# The feature axis is given as -1 so this cell can be re-run: on an input that
# is already 3D the reshape is a no-op instead of raising a size mismatch.
train_X = train_X.reshape((train_X.shape[0], 1, -1))
test_X = test_X.reshape((test_X.shape[0], 1, -1))
print(train_X.shape, train_y.shape, test_X.shape, test_y.shape)



In [None]:
# Network: one 50-unit LSTM layer followed by a single-unit Dense output
lstm_input_shape = train_X.shape[1:]  # (timesteps, features)
model = Sequential()
model.add(LSTM(50, input_shape=lstm_input_shape))
model.add(Dense(1))
model.compile(loss='mae', optimizer='adam')
# Fit without shuffling so the samples stay in time order; the test split is
# passed as validation data
history = model.fit(
    train_X,
    train_y,
    epochs=50,
    batch_size=72,
    validation_data=(test_X, test_y),
    verbose=2,
    shuffle=False,
)
# Plot the training and validation loss curves
for key, label in (('loss', 'train'), ('val_loss', 'test')):
    pyplot.plot(history.history[key], label=label)
pyplot.legend()
pyplot.show()

In [None]:
import numpy as np

# Shape of the test inputs
test_shape = np.shape(test_X)
print(test_shape)

# (timesteps, features) pair that was passed to the LSTM as input_shape
lstm_shape = train_X.shape[1:3]
print(lstm_shape)

In [None]:
# Making a prediction
# The model takes 3D input [samples, timesteps, features]. Reshaping with -1 is
# a no-op on the first run and restores the 3D shape if this cell is re-run
# after test_X has been flattened below.
yhat = model.predict(test_X.reshape((test_X.shape[0], 1, -1)))
# Flatten test_X to 2D [samples, features] so columns can be joined to it
test_X = test_X.reshape((test_X.shape[0], -1))
# Inverting scale for forecast: the scaler expects the full feature width, so
# the prediction goes in column 0 and the remaining input features pad the rest
inv_yhat = concatenate((yhat, test_X[:, 1:]), axis=1)
inv_yhat = scaler.inverse_transform(inv_yhat)
inv_yhat = inv_yhat[:, 0]
# Inverting scaling for actual: same treatment as the forecast, so both series
# are compared in the original units
test_y = test_y.reshape((-1, 1))
inv_y = concatenate((test_y, test_X[:, 1:]), axis=1)
inv_y = scaler.inverse_transform(inv_y)
inv_y = inv_y[:, 0]
# calculating RMSE
rmse = sqrt(mean_squared_error(inv_y, inv_yhat))
print('Test RMSE: %.3f' % rmse)