In [1]:
import os
import zipfile

import numpy as np
import pandas as pd
import tensorflow as tf

0. Prepare time series data

In [2]:
# prepare time series data
# Download the Jena climate archive (get_file caches it after the first run).
# The archive is intentionally NOT extracted by get_file: deriving the CSV path
# from its return value with os.path.splitext only works when get_file returns
# the archive path itself.
# NOTE(review): newer Keras releases appear to return the extraction directory
# when extract=True -- confirm against the installed version.
zip_path = tf.keras.utils.get_file(
    origin='https://storage.googleapis.com/tensorflow/tf-keras-datasets/jena_climate_2009_2016.csv.zip',
    fname='jena_climate_2009_2016.csv.zip')

# The CSV member carries the archive's name without the ".zip" suffix; read it
# straight out of the archive so no on-disk extraction layout is assumed.
with zipfile.ZipFile(zip_path) as archive:
    with archive.open('jena_climate_2009_2016.csv') as csv_file:
        df = pd.read_csv(csv_file)

# use only 'hour' data for convenience: keep every 6th row starting at row 5.
# .copy() detaches the result from the full frame so that the column edits in
# later cells operate on an independent DataFrame.
df = df[5::6].copy()

# Remove the timestamp column from the features and keep it parsed separately.
date_time = pd.to_datetime(df.pop('Date Time'), format='%d.%m.%Y %H:%M:%S')

In [3]:
# Per-column summary statistics of the loaded data.
# Note the -9999.0 minimum reported for the two wind-speed columns in the output.
df.describe()

Unnamed: 0,p (mbar),T (degC),Tpot (K),Tdew (degC),rh (%),VPmax (mbar),VPact (mbar),VPdef (mbar),sh (g/kg),H2OC (mmol/mol),rho (g/m**3),wv (m/s),max. wv (m/s),wd (deg)
count,70091.0,70091.0,70091.0,70091.0,70091.0,70091.0,70091.0,70091.0,70091.0,70091.0,70091.0,70091.0,70091.0,70091.0
mean,989.212842,9.450482,283.493086,4.956471,76.009788,13.576576,9.533968,4.042536,6.02256,9.640437,1216.061232,1.702567,2.963041,174.789095
std,8.358886,8.423384,8.504424,6.730081,16.47492,7.739883,4.183658,4.898549,2.655812,4.234862,39.974263,65.447512,75.597657,86.619431
min,913.6,-22.76,250.85,-24.8,13.88,0.97,0.81,0.0,0.51,0.81,1059.45,-9999.0,-9999.0,0.0
25%,984.2,3.35,277.44,0.24,65.21,7.77,6.22,0.87,3.92,6.29,1187.47,0.99,1.76,125.3
50%,989.57,9.41,283.46,5.21,79.3,11.82,8.86,2.19,5.59,8.96,1213.8,1.76,2.98,198.1
75%,994.72,15.48,289.53,10.08,89.4,17.61,12.36,5.3,7.8,12.49,1242.765,2.86,4.74,234.0
max,1015.29,37.28,311.21,23.06,100.0,63.77,28.25,46.01,18.07,28.74,1393.54,14.01,23.5,360.0


In [4]:
# simple data preprocessing
## (you can just skip this part, just run this cell)

# -9999.0 shows up as the minimum of both wind-speed columns in df.describe();
# it is treated as a bad reading and replaced with 0.0.
# The replacement is written with df.loc on the DataFrame itself. Assigning
# through a separately extracted Series (wv = df[col]; wv[mask] = 0.0) only
# reaches the DataFrame when pandas hands back a view; under copy-on-write it
# does not, and the bad values would survive into the pop() calls below.
BAD_WIND_VALUE = -9999.0
for wind_col in ('wv (m/s)', 'max. wv (m/s)'):
    df.loc[df[wind_col] == BAD_WIND_VALUE, wind_col] = 0.0
    # Confirm the edit really landed in the DataFrame.
    assert not (df[wind_col] == BAD_WIND_VALUE).any()

# Take the cleaned wind columns out of the feature frame; they are replaced by
# the vector components computed below.
wv = df.pop('wv (m/s)')
max_wv = df.pop('max. wv (m/s)')

# Convert to radians.
wd_rad = df.pop('wd (deg)')*np.pi / 180

# Calculate the wind x and y components.
df['Wx'] = wv*np.cos(wd_rad)
df['Wy'] = wv*np.sin(wd_rad)

# Calculate the max wind x and y components.
df['max Wx'] = max_wv*np.cos(wd_rad)
df['max Wy'] = max_wv*np.sin(wd_rad)

In [5]:
# data split to train : val : test (7 : 2 : 1)
# Rows are taken in their existing order: first 70 %, next 20 %, last 10 %.

# Column name -> positional index of that column.
column_indices = dict(zip(df.columns, range(len(df.columns))))

n = len(df)
train_end = int(n*0.7)
val_end = int(n*0.9)

train_df = df.iloc[:train_end]
val_df = df.iloc[train_end:val_end]
test_df = df.iloc[val_end:]

# Number of feature columns in the frame.
num_features = df.shape[1]

In [6]:
# data standardization
# Mean and standard deviation are computed on the training split only and then
# applied unchanged to all three splits.
train_mean = train_df.mean()
train_std = train_df.std()

train_df, val_df, test_df = [
    (split - train_mean) / train_std
    for split in (train_df, val_df, test_df)
]

Now, see how to use the **timeseries_datagenerator** module to build time series datasets for RNN-based models.

In [7]:
import timeseries_datagenerator as td

# Window over the standardized splits: 24 input steps, 1 label step, shift 24,
# with 'T (degC)' as the only label column.
w1 = td.WindowGenerator(
    train_df=train_df,
    val_df=val_df,
    test_df=test_df,
    input_width=24,
    label_width=1,
    shift=24,
    label_columns=['T (degC)'],
)
# Displaying the object prints its window size and input/label indices.
w1

Total window size: 48
Input indices: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23]
Label indices: [47]
Label column name(s): ['T (degC)']

In [8]:
# Short window: 6 input steps, 1 label step, shift 1, label column 'T (degC)'.
w2 = td.WindowGenerator(
    train_df=train_df,
    val_df=val_df,
    test_df=test_df,
    input_width=6,
    label_width=1,
    shift=1,
    label_columns=['T (degC)'],
)
# get a batch sample from train data
w2.example()

(<tf.Tensor: shape=(32, 6, 15), dtype=float32, numpy=
 array([[[-0.15141281, -0.11302279, -0.10143077, ...,  1.4983623 ,
           1.5714433 ,  1.5666423 ],
         [-0.15141281, -0.19505677, -0.1828085 , ...,  1.1052896 ,
           1.405495  ,  1.0980324 ],
         [-0.16828544, -0.26784748, -0.25387076, ...,  0.88682437,
           1.352313  ,  0.8107216 ],
         [-0.18515806, -0.2967327 , -0.28137872, ...,  1.261403  ,
           1.7618648 ,  1.1088971 ],
         [-0.19359437, -0.32792872, -0.31232518, ...,  0.9575766 ,
           1.6345305 ,  0.84076107],
         [-0.22492924, -0.3787667 , -0.35931793, ...,  1.2813354 ,
           1.9291767 ,  1.0737977 ]],
 
        [[ 0.8452771 ,  0.52822876,  0.459044  , ...,  0.27874115,
          -0.24188605,  0.2971479 ],
         [ 0.8344304 ,  0.43810692,  0.36964312, ...,  0.20427828,
          -0.45849502,  0.21806127],
         [ 0.7946592 ,  0.35722834,  0.29285005, ..., -0.09096947,
          -0.28808087, -0.08223975],
       

In [9]:
# you can change batch_size easily by WindowGenerator.train function's parameter
# Only the first batch is pulled, just to inspect the tensor shapes.
first_batch = w2.train(batch_size=10).take(1)
for example_inputs, example_labels in first_batch:
    print(f'Inputs shape (batch, time, features): {example_inputs.shape}')
    print(f'Labels shape (batch, time, features): {example_labels.shape}')


Inputs shape (batch, time, features): (10, 6, 15)
Labels shape (batch, time, features): (10, 1, 1)


You can use this module for both single-step and multi-step models.
It also works with any number of label columns.

1. Single Step Model

In [10]:
# One input step, one label step, shift 1; the label column is 'T (degC)'.
single_step_window = td.WindowGenerator(
    train_df=train_df,
    val_df=val_df,
    test_df=test_df,
    input_width=1,
    label_width=1,
    shift=1,
    label_columns=['T (degC)'],
)
# input t-1 -> predict t
single_step_window

Total window size: 2
Input indices: [0]
Label indices: [1]
Label column name(s): ['T (degC)']

In [11]:
# Layer stack of the single-step model, listed in the order data flows through.
single_step_layers = [
    # Shape [batch, time, features] => [batch, time, lstm_units]
    tf.keras.layers.LSTM(32, return_sequences=True),
    # Shape => [batch, time, features]
    # units=1 because there is exactly one label column
    tf.keras.layers.Dense(units=1),
]
lstm_model = tf.keras.models.Sequential(single_step_layers)


In [12]:
# use example function to make tf dataset
# Shape of one example input batch ...
example_input_shape = single_step_window.example()[0].shape
print('Input shape:', example_input_shape)

# ... and shape of what the model produces for an example input batch.
example_output_shape = lstm_model(single_step_window.example()[0]).shape
print('Output shape:', example_output_shape)

Input shape: (32, 1, 15)
Output shape: (32, 1, 1)


In [13]:
# Mean squared error as the loss, Adam as the optimizer, and mean absolute
# error reported as an additional metric.
lstm_model.compile(
    loss=tf.losses.MeanSquaredError(),
    optimizer=tf.optimizers.Adam(),
    metrics=[tf.metrics.MeanAbsoluteError()],
)

# Fit for 5 epochs on the window's train data, validating on its val data.
history = lstm_model.fit(
    single_step_window.train(),
    epochs=5,
    validation_data=single_step_window.val(),
)



Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [14]:
# use test() function to make tf dataset for test dataframe
# The displayed result is [loss, metric].
lstm_model.evaluate(
    single_step_window.test()
)



[0.01258764136582613, 0.08300337195396423]

2. Multi Step Model

In [15]:
# Number of future steps predicted at once; also used as the window shift.
OUT_STEPS = 24

# No label_columns are passed here, unlike the single-step windows above.
multi_window = td.WindowGenerator(
    train_df=train_df,
    val_df=val_df,
    test_df=test_df,
    input_width=24,
    label_width=OUT_STEPS,
    shift=OUT_STEPS,
)

# use t~t+23 to predict t+24~t+47
multi_window

Total window size: 48
Input indices: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23]
Label indices: [24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47]
Label column name(s): None

In [16]:
# Layer stack of the multi-step model, listed in the order data flows through.
multi_step_layers = [
    # Shape [batch, time, features] => [batch, lstm_units]
    # Adding more `lstm_units` just overfits more quickly.
    # An LSTM layer can reflect at least part of the input's long history.
    tf.keras.layers.LSTM(32, return_sequences=False),
    # Shape => [batch, out_steps*features]
    tf.keras.layers.Dense(
        OUT_STEPS*num_features,
        kernel_initializer=tf.initializers.zeros,
    ),
    # Shape => [batch, out_steps, features]
    tf.keras.layers.Reshape([OUT_STEPS, num_features]),
]
multi_lstm_model = tf.keras.Sequential(multi_step_layers)

In [17]:
# Same training setup as the single-step model: MSE loss, Adam, MAE metric.
multi_lstm_model.compile(
    loss=tf.losses.MeanSquaredError(),
    optimizer=tf.optimizers.Adam(),
    metrics=[tf.metrics.MeanAbsoluteError()],
)

# You can change sequence_stride or batch_size not only when you define WindowGenerator instance,
# but also when you call train, val, test functions.
history = multi_lstm_model.fit(
    multi_window.train(sequence_stride=2, batch_size=64),
    epochs=5,
    validation_data=multi_window.val(sequence_stride=2, batch_size=16),
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [18]:
# Score the multi-step model on the window's test data (stride 2, batches of 16).
# The displayed result is [loss, metric].
multi_lstm_model.evaluate(
    multi_window.test(sequence_stride=2, batch_size=16)
)





[0.2710717022418976, 0.3538897931575775]