The decision boundary of each output neuron is linear, so Perceptrons are incapable of learning complex patterns(just like Logistic classifiers).However, if the training instances are linearly separable,Rosenblatt demonstrated that this algorithm would converge to a solution.This is called the Perceptron Convergence Theorem.

Scikit-Learn provides a Perceptron class that implements a single-TLU network.It can be used pretty much as you would expect-for example, on the iris datatset

In [3]:
import numpy as np
from sklearn.datasets import load_iris
from sklearn.linear_model import Perceptron

iris = load_iris()
X = iris.data[:,(2,3)] # petal length and width
y = (iris.target == 0).astype(int) # Iris setosa?

per_clf = Perceptron()
per_clf.fit(X,y)

y_pred = per_clf.predict([[2, 0.5]])

In [4]:
y_pred

array([0])

# Forecasting a Time Series

In [5]:
def generate_time_series(batch_size, n_steps):
    freq1, freq2, offsets1, offsets2 = np.random.rand(4, batch_size, 1)
    time = np.linspace(0, 1, n_steps)
    series = 0.5 * np.sin((time - offsets1) * (freq1 * 10 + 10))
    series += 0.2 * np.sin((time - offsets2) * (freq2 * 20 + 20))
    series += 0.1 * (np.random.rand(batch_size, n_steps) - 0.5)
    return series[..., np.newaxis].astype(np.float32)

In [6]:
n_steps= 50
series = generate_time_series(10000, n_steps+1)
X_train, y_train = series[:7000, :n_steps], series[:7000, -1]
X_valid, y_valid = series[7000:9000,:n_steps], series[7000:9000, -1]
X_test, y_test = series[9000:, :n_steps], series[9000:, -1]

In [7]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.optimizers import SGD

## Baseline Matrics

In [8]:
import keras
from keras import layers
y_pred = X_valid[:, -1]
np.mean(keras.losses.mean_squared_error(y_valid, y_pred))

0.021079862

Another simple approach is to use a fully conncted network.Since it expects a flat list of features for each input, we need to add a Flatten layer.Lets just use a simple Linear Regression model so that each prediction will be a linear combination of the values in the times series:

In [9]:
model = keras.models.Sequential([
    keras.layers.Flatten(input_shape=[50, 1]),
    keras.layers.Dense(1)
])




In [49]:
X_train

array([[[-0.64664835],
        [-0.6314309 ],
        [-0.5818596 ],
        ...,
        [ 0.0177887 ],
        [-0.1094785 ],
        [-0.12598152]],

       [[ 0.18005867],
        [-0.07718117],
        [-0.23642164],
        ...,
        [-0.2560669 ],
        [-0.27271163],
        [-0.25928968]],

       [[-0.05538947],
        [-0.07547172],
        [-0.09227587],
        ...,
        [-0.27734149],
        [-0.14546561],
        [-0.10853981]],

       ...,

       [[ 0.25889802],
        [ 0.28087577],
        [ 0.32635367],
        ...,
        [ 0.61782104],
        [ 0.51009524],
        [ 0.3705604 ]],

       [[-0.4888316 ],
        [-0.38419583],
        [-0.22218531],
        ...,
        [-0.29471216],
        [-0.17163214],
        [ 0.00397158]],

       [[ 0.25146055],
        [ 0.37155056],
        [ 0.48482686],
        ...,
        [-0.37325314],
        [-0.4285371 ],
        [-0.45727104]]], dtype=float32)

In [10]:
model.compile(loss='mean_squared_error', optimizer='adam')




In [11]:
history = model.fit(X_train, y_train, epochs=20, validation_data=(X_valid, y_valid))

Epoch 1/20

Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [12]:

# Evaluate the model on the validation set and calculate MSE
mse = model.evaluate(X_valid, y_valid)
print(f'Mean Squared Error on the validation set: {mse}')

Mean Squared Error on the validation set: 0.0037384419701993465


## Implementing a simple RNN

lets see if we can beat that with a simple RNN

In [13]:
model = keras.models.Sequential([
    keras.layers.SimpleRNN(1, input_shape=[None, 1])
])

This is simplest RNN which contains a single layer with a single neuron, we do not need to specify the length of the input sequences since a recurrent neural network can process any number of time steps.By Default, the simpleRNN layers uses the hyperbolic tangent activation function.

In [14]:
model.compile(loss='mean_squared_error', optimizer='adam')

In [15]:
history = model.fit(X_train, y_train, epochs=20, validation_data=(X_valid, y_valid))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [16]:
# Evaluate the model on the validation set and calculate MSE
mse = model.evaluate(X_valid, y_valid)
print(f'Mean Squared Error on the validation set: {mse}')

Mean Squared Error on the validation set: 0.011650647968053818


## Deep RNN

In [17]:
model = keras.models.Sequential([
    keras.layers.SimpleRNN(20, return_sequences=True, input_shape=[None,1]),
    keras.layers.SimpleRNN(20, return_sequences=True),
    keras.layers.SimpleRNN(1)
])

In [18]:
model.compile(loss='mean_squared_error', optimizer='adam')

In [19]:
history = model.fit(X_train, y_train, epochs=20, validation_data=(X_valid, y_valid))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [20]:
# Evaluate the model on the validation set and calculate MSE
mse = model.evaluate(X_valid, y_valid)
print(f'Mean Squared Error on the validation set: {mse}')

Mean Squared Error on the validation set: 0.002548994030803442


In [21]:
model = keras.models.Sequential([
    keras.layers.SimpleRNN(20, return_sequences=True, input_shape=[None,1]),
    keras.layers.SimpleRNN(20, return_sequences=True),
    keras.layers.Dense(1)
])

In [22]:
model.compile(loss='mean_squared_error', optimizer='adam')

In [23]:
history = model.fit(X_train, y_train, epochs=20, validation_data=(X_valid, y_valid))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [24]:
# Evaluate the model on the validation set and calculate MSE
mse = model.evaluate(X_valid, y_valid)
print(f'Mean Squared Error on the validation set: {mse}')

Mean Squared Error on the validation set: 0.07814665883779526


In [30]:
series = generate_time_series(1,n_steps+10)
X_new, Y_new = series[:, :n_steps],series[:, n_steps:]
X = X_new
for step_ahead in range(10):
    y_pred_one = model.predict(X[:, step_ahead:])[:, np.newaxis, :]
    x = np.concatenate([X, y_pred_one], axis=1)
    
Y_pred = X[:, n_steps:]



ValueError: all the input array dimensions except for the concatenation axis must match exactly, but along dimension 2, the array at index 0 has size 1 and the array at index 1 has size 20

In [None]:
X_new

The Second option is to train an RNN to predict all 10 next values at once.We can still use a sequence-to-vector model, but it will output 10 values instead of 1.However, we first need to change the targets to be vectors containig the next 10 values
:

In [26]:
series = generate_time_series(10000, n_steps + 10)
X_train, Y_train = series[:7000, :n_steps], series[:7000, -10:, 0]
X_valid, Y_valid = series[7000:9000, :n_steps], series[7000:9000, -10:, 0]
X_test, Y_test = series[9000:, :n_steps], series[9000:, -10:, 0]

Now we just need the output layer to have 10 units instead of 1:

In [28]:
model = keras.models.Sequential([
    keras.layers.SimpleRNN(20, return_sequences=True, input_shape=[None, 1]),
    keras.layers.SimpleRNN(20),
    keras.layers.Dense(20)
])

In [31]:
Y_pred=model.predict(X_new)



In [32]:
Y_pred

array([[ 0.61012095,  0.44307926, -0.2140224 , -0.51406616,  0.04858202,
         0.3525835 , -0.22086477, -0.10937619, -0.19917727, -0.38751072,
         0.48200843, -0.11143138, -0.01095603,  0.96937877, -0.17860182,
         0.26824927, -0.20585363,  0.8874372 ,  0.16062392, -0.29658255]],
      dtype=float32)

Instead of traning the model to forecast 10 values only at the very last step , we can train it to forecast the next 10 values at each and every time step.

In [33]:
Y = np.empty((10000, n_steps, 10))
for step_ahead in range(1, 10 +  1):
    Y[:, :, step_ahead - 1] = series[:, step_ahead:step_ahead + n_steps,0]

Y_train = Y[:7000]
Y_valid = Y[7000:9000]
Y_test = Y[9000:]

To turn the model into a sequence-to-sequence model, we must set return_sequences=True in all recurrent layers(even the last one), and we must apply the output Dense layer at every time step.Keras offers a TimeDistributed layer for this very purpose , it wraps any layer (eg Dense layer) and applies it at every time step of its input sequence.

In [34]:
model = keras.models.Sequential([
    keras.layers.SimpleRNN(20, return_sequences=True, input_shape=[None, 1]),
    keras.layers.SimpleRNN(20, return_sequences=True),
    keras.layers.TimeDistributed(keras.layers.Dense(10)) # makes the dense layer to be applied independently at each time step and that the model will output a sequence, not just a single vector
])

All outputs are  needed during training , but only the output at the last time step is useful for predictions ad for evaluation. So although we will rely on the MSE over all the outputs for training , we will use a custom metric for evaluation, to only compute the MSE over the output at the last time step:

In [35]:
def last_time_step_mse(Y_true, Y_pred):
    return keras.metrics.mean_squared_error(Y_true[:, -1], Y_pred[:, -1])

optimizer = keras.optimizers.Adam(lr=0.01)
model.compile(loss="mse", optimizer=optimizer, metrics=[last_time_step_mse])



# Fighting the Unstable Gradients Problem

Let's use tf.keras to implement Layer Normalization within a simple memory cell.For this , we need to define a custom memory cell.It is juts like a regular layer ,except it call() method takes two arguments : the inputs amd the current time stepamd hidden states from the previous time step.Note that the states argument is a list containig equals to the outputs of the previous time step, but other cells may multiple state tensors (e.g., an LSTMCell has a long term state and a short term state as we see shortly).A cell must have a state_size attribute and an output_size attibute.In a simple RNN,both are simply equal to the number of units.

The following code implements a custom memory cell which will behave like SimpleRNNCell except it will also apply Layer Normalization at each time step:


In [36]:
class LNSimpleRNNCell(keras.layers.Layer):
    def __init__(self, units, activation="tanh", **kwargs):
        super().__init__(**kwargs)
        self.state_size = units
        self.output_size = units
        self.simple_rnn_cell = keras.layers.SimpleRNNCell(units,activation=None)
        self.layer_norm = keras.layers.LayerNormalization()
        self.activation = keras.activations.get(activation)
    def call(self, inputs, states):
        outputs, new_states = self.simple_rnn_cell(inputs, states)
        norm_outputs = self.activation(self.layer_norm(outputs))
        return norm_outputs, [norm_outputs]


Above code , LNSimpleRNNCell class inherits from keras.layers.Layer class , just like any custom layer.The constructor takes the number of units and the desired activation function and sets the state_size and output_size attributes , then creates a SimpleRNNCell with no activation function(because we want to perform Layer Normalization after the linear operation but before the activation function). Then the constructor creates the LayerNormaization layer, and finally it fetches the desired activation function.The call() method starts by applying the simple RNN cell, which computes a linear combination of the current inputs and the previous hidden states, and it returns the result twice(indeed in a SimpleRNNCell, the outputs are just equal to the hidden states: in other words,new_states[0] is equal to outputs, so we can safely ignore new_states in the rest of the call() method).Next, the call() method applies Layer Normalization,followed by the activation function.Finally it returns the output twice(once as the outputs and once as the new hidden states).To use this custom cell, all we need to do is create a keras.layers.RNN layer, passing it a cell instance:


In [37]:
model = keras.models.Sequential([
    keras.layers.RNN(LNSimpleRNNCell(20), return_sequences=True, input_shape=[None, 1]),
    keras.layers.RNN(LNSimpleRNNCell(20), return_sequences=True),
    keras.layers.TimeDistributed(keras.layers.Dense(10))
    
])







Similarly, you could createa a custom cell to apply dropout between each time step.But there's a simple way: all recurrent layers(except for keras.layers.RNN) and all cells provided by Keras has a droput parameter and a recurrent_dropout hyper-parameter: the former defines the dropout rate for the hidden states(also at each time step).No need to create a custom cell to apply dropout at each time step in an RNN.
With these techniques, you can alleviate the unstable gardients problem and train an RNN much more efficiently.Now lets look at how to deal with the short-term memory problem.

# Tackling the Short-Term Memory Problem

In Keras, you can simply use the LSTM layer instead of SimpleRNN layer to make a RNN to LSTM 

In [38]:
model = keras.models.Sequential([
    keras.layers.LSTM(20, return_sequences=True, input_shape=[None, 1]),
    keras.layers.LSTM(20, return_sequences=True),
    keras.layers.TimeDistributed(keras.layers.Dense(10))
    
])

However, the LSTM layer uses an optimised implementation when running on GPU , so it is peferable to use it

# Apply 1D convolutional layer to GRU

In [40]:
model = keras.models.Sequential([
    keras.layers.Conv1D(filters=20,kernel_size=4,strides=2,padding="valid" ,input_shape=[None, 1]),
    keras.layers.GRU(20, return_sequences=True),
    keras.layers.GRU(20, return_sequences=True),
    keras.layers.TimeDistributed(keras.layers.Dense(10))
    
])

In [45]:
model.compile(loss='mse', optimizer='adam',metrics=[last_time_step_mse])

In [46]:
history=model.fit(X_train,Y_train[:,3::2],epochs=20,validation_data=(X_valid, Y_valid[:, 3::2]))

Epoch 1/20






Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [47]:
X_valid, Y_valid[:, 3::2]

(array([[[ 0.4128169 ],
         [ 0.5113632 ],
         [ 0.59807044],
         ...,
         [ 0.19303648],
         [ 0.17872018],
         [ 0.08321963]],
 
        [[-0.46015626],
         [-0.46750906],
         [-0.44173956],
         ...,
         [ 0.49921614],
         [ 0.4350306 ],
         [ 0.37085634]],
 
        [[-0.50013334],
         [-0.61801845],
         [-0.62538296],
         ...,
         [ 0.28145257],
         [ 0.33534035],
         [ 0.35960633]],
 
        ...,
 
        [[-0.56406534],
         [-0.60747445],
         [-0.6084812 ],
         ...,
         [ 0.25958344],
         [ 0.30402866],
         [ 0.2100297 ]],
 
        [[-0.29693457],
         [-0.2686142 ],
         [-0.28101456],
         ...,
         [-0.58783597],
         [-0.66277367],
         [-0.6597046 ]],
 
        [[-0.2669471 ],
         [-0.38518706],
         [-0.4471798 ],
         ...,
         [ 0.10362528],
         [ 0.01292876],
         [-0.01603315]]], dtype=float32),
 arr

In [48]:
# Evaluate the model on the validation set and calculate MSE
mse = model.evaluate(X_valid, Y_valid[:, 3::2])
print(f'Mean Squared Error on the validation set: {mse}')

Mean Squared Error on the validation set: [0.00013080639473628253, 0.0003368490724824369]
