philipperemy.github.io/keras-stateful-lstm/

In [1]:
import numpy as np
from numpy import array

import tensorflow as tf
import tensorflow.keras as keras

from tensorflow.keras import layers
from tensorflow.keras import optimizers

from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Dropout, Activation, Input, LSTM, TimeDistributed

  from ._conv import register_converters as _register_converters


In [2]:
def gen_data(samples, features_no, ts_length, seed=None):
    """Generate a random toy dataset for a recurrent binary classifier.

    Parameters
    ----------
    samples : int
        Number of independent sequences (first axis of the data).
    features_no : int
        Number of features per time step (last axis of the data).
    ts_length : int
        Number of time steps in every sequence (middle axis of the data).
    seed : int, optional
        When given, a dedicated ``np.random.RandomState(seed)`` is used so the
        result is reproducible. When ``None`` (the default) the global NumPy
        random state is used, exactly as before.

    Returns
    -------
    arr : np.ndarray
        Standard-normal values of shape ``(samples, ts_length, features_no)``.
    labels : np.ndarray
        Integer labels drawn from {0, 1}, shape ``(samples,)``.
    """
    # The np.random module and a RandomState instance expose the same
    # randn/randint API, so the sampling code is shared by both paths.
    rng = np.random if seed is None else np.random.RandomState(seed)
    arr = rng.randn(samples, ts_length, features_no)
    print("data shape {}".format(arr.shape))
    # The upper bound of randint is exclusive, so labels are 0 or 1.
    labels = rng.randint(0, 2, size=samples)
    return arr, labels

In [3]:
# Smallest dataset used in this notebook: one sequence, two time steps,
# one feature per time step.
samples, features_number, ts_length = 1, 1, 2

data, labels = gen_data(samples, features_number, ts_length)

data shape (1, 2, 1)


In [4]:
# Show the generated inputs and labels, separated by one empty line.
print(data, "", labels, sep="\n")

[[[ 0.26464407]
  [-1.24543778]]]

[1]


### Parameters number:

* number of input features: 1

* number of hidden units: 1

Each gate has 1 weight transforming the input and 1 weight transforming the hidden state (2 weights) + 1 bias = 3 parameters.

Each of the four gate blocks (i, f, o, g) has the above parameters, so 4 * 3 = 12 parameters for a single LSTM unit with a single input feature.

In [5]:
# input_shape is (timesteps, features); the number of examples (batch axis)
# is not part of it. The argument is optional - the cells below omit it and
# the input shape is determined from the data instead.
model = Sequential([LSTM(units=1, input_shape=(1, 1))])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 1)                 12        
Total params: 12
Trainable params: 12
Non-trainable params: 0
_________________________________________________________________


In [6]:
def get_compile_param():
    """Return the keyword arguments shared by every ``model.compile`` call.

    A new dict (with a new ``metrics`` list) is built on each call.
    """
    compile_kwargs = dict(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return compile_kwargs

In [7]:
# Baseline: one unit, one feature, one time step, one sample.
samples, features_number, ts_length = 1, 1, 1

model = Sequential([LSTM(units=1)])
model.compile(**get_compile_param())

data, labels = gen_data(samples, features_number, ts_length)

# A single silent epoch is enough to build the model so it can be summarised.
model.fit(x=data, y=labels, epochs=1, verbose=0)
model.summary()

data shape (1, 1, 1)
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 1)                 12        
Total params: 12
Trainable params: 12
Non-trainable params: 0
_________________________________________________________________


In [8]:
# Two time steps instead of one: the time series length does not change
# the LSTM cell.
samples, features_number, ts_length = 1, 1, 2

model = Sequential([LSTM(units=1)])
model.compile(**get_compile_param())

data, labels = gen_data(samples, features_number, ts_length)

# A single silent epoch is enough to build the model so it can be summarised.
model.fit(x=data, y=labels, epochs=1, verbose=0)
model.summary()

data shape (1, 2, 1)
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_2 (LSTM)                (None, 1)                 12        
Total params: 12
Trainable params: 12
Non-trainable params: 0
_________________________________________________________________


In [9]:
# Two input features instead of one: +1 parameter (one additional input
# dimension) for each of the 4 matrices transforming the input data.
samples, features_number, ts_length = 1, 2, 1

model = Sequential([LSTM(units=1)])
model.compile(**get_compile_param())

data, labels = gen_data(samples, features_number, ts_length)

# A single silent epoch is enough to build the model so it can be summarised.
model.fit(x=data, y=labels, epochs=1, verbose=0)
model.summary()

data shape (1, 1, 2)
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_3 (LSTM)                (None, 1)                 16        
Total params: 16
Trainable params: 16
Non-trainable params: 0
_________________________________________________________________


In [10]:
# Two samples instead of one: the number of samples does not change the
# LSTM cell.
samples, features_number, ts_length = 2, 1, 1

model = Sequential([LSTM(units=1)])
model.compile(**get_compile_param())

data, labels = gen_data(samples, features_number, ts_length)

# A single silent epoch is enough to build the model so it can be summarised.
model.fit(x=data, y=labels, epochs=1, verbose=0)
model.summary()

data shape (2, 1, 1)
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_4 (LSTM)                (None, 1)                 12        
Total params: 12
Trainable params: 12
Non-trainable params: 0
_________________________________________________________________


In [11]:
# Two LSTM units instead of one.
samples, features_number, ts_length = 1, 1, 1

model = Sequential([
    LSTM(units=2),
    # Dense(1) is there for compatibility of the output dimension
    # (its parameters: 2 weights and 1 bias).
    Dense(1),
])
model.compile(**get_compile_param())

data, labels = gen_data(samples, features_number, ts_length)

# A single silent epoch is enough to build the model so it can be summarised.
model.fit(x=data, y=labels, epochs=1, verbose=0)
model.summary()

data shape (1, 1, 1)
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_5 (LSTM)                (None, 2)                 32        
_________________________________________________________________
dense (Dense)                (None, 1)                 3         
Total params: 35
Trainable params: 35
Non-trainable params: 0
_________________________________________________________________


https://www.reddit.com/r/MachineLearning/comments/87djn7/d_what_is_meant_by_number_of_hidden_units_in_an/

based on adam_jc answers:
In more detail, there are 2 matrices (one transforming the input and one transforming the previous hidden state) for each of the 3 gates plus the cell-state candidate (one with dimensions n_input_features x n_units and one with n_units x n_units).

For 2 units (as for any other number of units):

there are 8 matrices altogether in a cell. Number of weights (not including bias terms) = 4 * (n_input_features * n_units + n_units^2)  (*); the biases add another 4 * n_units parameters.

single unit, single feature: 4(1 * 1 + 1^2) + 4 biases = 12

single unit, two features: 4(2 * 1 + 1^2) + 4 biases = 16

two units, single feature: 4(1 * 2 + 2^2) + 8 biases = 32

(Description of the equation (*)):

(...)
Through each gate we pass in our previous hidden state vector and our current timestep vector.

So we have our previous hidden state vector with size n_units and our timestep vector with size n_input_features.

This is why, as alluded to above, we have 2 matrices at each gate.

One matrix has the size (n_input_features x n_units) which transforms our current timestep vector into a vector with size n_units

The other matrix has size (n_units x n_units); we use it to transform our previous hidden state vector into a new vector, still of size n_units. Each hidden-state unit is connected to all the other hidden-state units (and to itself) - that is why this term is squared.

Then we add these two resulting n_units vectors together with element-wise addition.

## LSTM seq prediction

Based on 

https://machinelearningmastery.com/timedistributed-layer-for-long-short-term-memory-networks-in-python/

Also very useful:

https://datascience.stackexchange.com/questions/10836/the-difference-between-dense-and-timedistributeddense-of-keras

### One-to-one

In [None]:
# One-to-one: every sample is a single time step with a single feature and
# the network predicts a single value for it.

# prepare sequence: values i/length for i in 0..length-1
length = 5
seq = array([i/float(length) for i in range(length)])
X = seq.reshape(len(seq), 1, 1)  # (samples, timesteps, features) = (5, 1, 1)
y = seq.reshape(len(seq), 1)     # (samples, outputs) = (5, 1)
# define LSTM configuration
n_neurons = length
n_batch = length  # all samples fit in one batch
n_epoch = 1000
# create LSTM
model = Sequential()
model.add(LSTM(n_neurons, input_shape=(1, 1)))
model.add(Dense(1))
model.compile(loss='mean_squared_error', optimizer='adam')
# summary() prints the table itself and returns None, so wrapping it in
# print() would only add a stray "None" line to the output.
model.summary()
# train LSTM
model.fit(X, y, epochs=n_epoch, batch_size=n_batch, verbose=0)
# evaluate on the training inputs
result = model.predict(X, batch_size=n_batch, verbose=0)
print(result)
print(result.shape)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 5)                 140       
_________________________________________________________________
dense (Dense)                (None, 1)                 6         
Total params: 146
Trainable params: 146
Non-trainable params: 0
_________________________________________________________________
None


### Many-to-one

In [None]:
# Many-to-one: the whole sequence is fed as one sample and the Dense layer
# outputs all `length` target values in a single vector.

# prepare sequence: values i/length for i in 0..length-1
length = 5
seq = array([i/float(length) for i in range(length)])
X = seq.reshape(1, length, 1)  # (samples, timesteps, features) = (1, 5, 1)
y = seq.reshape(1, length)     # (samples, outputs) = (1, 5)
# define LSTM configuration
n_neurons = length
n_batch = 1
n_epoch = 500
# create LSTM
model = Sequential()
model.add(LSTM(n_neurons, input_shape=(length, 1)))
model.add(Dense(length))
model.compile(loss='mean_squared_error', optimizer='adam')
# summary() prints the table itself and returns None, so wrapping it in
# print() would only add a stray "None" line to the output.
model.summary()
# train LSTM
model.fit(X, y, epochs=n_epoch, batch_size=n_batch, verbose=0)
# evaluate on the training inputs
result = model.predict(X, batch_size=n_batch, verbose=0)
print(result)
print(result.shape)

### Many-to-many

In [None]:
# Many-to-many: the LSTM returns its output for every time step
# (return_sequences=True) and TimeDistributed wraps the Dense(1) layer that
# produces the per-time-step prediction.

# prepare sequence: values i/length for i in 0..length-1
length = 5
seq = array([i/float(length) for i in range(length)])
X = seq.reshape(1, length, 1)  # (samples, timesteps, features) = (1, 5, 1)
y = seq.reshape(1, length, 1)  # one target value per time step
# define LSTM configuration
n_neurons = 10
n_batch = 1
n_epoch = 1000
# create LSTM
model = Sequential()
model.add(LSTM(n_neurons, input_shape=(length, 1), return_sequences=True))
model.add(TimeDistributed(Dense(1)))
model.compile(loss='mean_squared_error', optimizer='adam')
# summary() prints the table itself and returns None, so wrapping it in
# print() would only add a stray "None" line to the output.
model.summary()
# train LSTM
model.fit(X, y, epochs=n_epoch, batch_size=n_batch, verbose=0)
# evaluate on the training inputs
result = model.predict(X, batch_size=n_batch, verbose=0)
print(result)
print(result.shape)

TimeDistributed is applied to all units of the given recurrent network for each (single) time step. Time steps are not mixed with each other.