## RNN training from scratch

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

import numpy as np
import time

%load_ext tensorboard

# Physical GPUs visible to TensorFlow at start-up.
gpu_devices = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))


Num GPUs Available:  1


## 1) Time series generation

In this first step, you will create a synthetic dataset of discrete-time trajectories based on a very simple model.

The model reads as follows:
$$
\forall t \in \mathbb{N}, \ x^k(t) = 0.5 \sin \left( 100 f_1 (t - \varphi_1) \right) + 0.2 \sin \left( 400 f_2 (t - \varphi_2) \right) + 0.1 c \quad \text{with} \ f_1, f_2, \varphi_1, \varphi_2 \sim \mathcal{U}([0,1]) \ \text{and} \ c \sim \mathcal{U}([-0.5,0.5]).
$$


Generation of a set of independent realizations $(x^k(t))$ of the model above (the code below draws `n_sample = 5000` of them), each with $50+1$ time steps.

This will be referred to as dataset $\mathbf{D}$ in the following.

In [None]:
#def generate_time_series(batch_size, n_steps, n_dim):
#  f1,f2,ph1, ph2 = np.random.rand(4, batch_size, n_dim)
#  time=np.linspace(0,1, n_steps)
#  series = np.zeros((n_sample,n_steps+1,n_dim))
#  series  = 0.5*np.sin((time-ph1)*f1*100)
#  series += 0.2 * np.sin((time-ph2)*f2*400)
#  series += 0.1* (np.random.rand(batch_size,n_steps,n_dim)-0.5)
#  return series

def generate_time_series(batch_size, n_steps, seed=None):
  """Draw `batch_size` noisy two-sine trajectories sampled at `n_steps` points.

  Each row is 0.5*sin(100*f1*(t - ph1)) + 0.2*sin(400*f2*(t - ph2)) plus
  uniform noise in [-0.05, 0.05) drawn independently at every time step.
  f1, f2, ph1 and ph2 are drawn once per trajectory from U[0, 1), and t runs
  over `n_steps` evenly spaced points of [0, 1].

  Args:
    batch_size: number of trajectories (rows of the result).
    n_steps: number of time samples per trajectory (columns of the result).
    seed: optional seed for reproducible data. With None (the default) the
      global NumPy random state is used, exactly as before. Otherwise a
      private RandomState(seed) is used and the global state is left untouched.

  Returns:
    Array of shape (batch_size, n_steps).
  """
  # RandomState exposes the same `rand` API as the np.random module, so both
  # paths draw the numbers in the same order.
  rng = np.random if seed is None else np.random.RandomState(seed)
  f1, f2, ph1, ph2 = rng.rand(4, batch_size, 1)
  # Named `t` so the imported `time` module is not shadowed.
  t = np.linspace(0, 1, n_steps)
  # (batch_size, 1) parameters broadcast against the (n_steps,) time grid.
  series = 0.5 * np.sin((t - ph1) * f1 * 100)
  series += 0.2 * np.sin((t - ph2) * f2 * 400)
  series += 0.1 * (rng.rand(batch_size, n_steps) - 0.5)
  return series

In [None]:
# Dataset D: n_sample trajectories of n_steps input samples plus one extra step.
n_sample, n_steps = 5000, 50
series = generate_time_series(n_sample, n_steps + 1)
# Same data with a trailing feature axis: (n_sample, n_steps + 1, 1).
Series = series[:, :, np.newaxis]

print(series.shape)

(5000, 51)


# 2) Training step by step

## 2.1) Layers as functions


Applying a `SimpleRNN` layer (3 units) to the inputs $X_0$ (all trajectories, up to time $t=50$); the output should be a sequence. 

Applying a simpleRNN layer (1 unit) to $X_1$; the output being that of a Seq2seq model.

In [None]:
# Inputs: the first n_steps samples of every trajectory.
X0 = Series[:, :n_steps, :]
print(X0.shape)
print(X0)

# A layer is a callable: with return_sequences=True it outputs one vector per time step.
X1 = layers.SimpleRNN(units=3, return_sequences=True, input_shape=[None, 1])(X0)
print(X1.shape)
print(X1)

# Second recurrent layer (1 unit) applied to the sequence X1, again returning a sequence.
X2 = layers.SimpleRNN(units=1, return_sequences=True)(X1)
print(X2.shape)
print(X2)

(5000, 50, 1)
[[[ 0.66225275]
  [ 0.32064596]
  [ 0.09007671]
  ...
  [ 0.49712428]
  [ 0.3249121 ]
  [ 0.40999295]]

 [[-0.17772733]
  [ 0.13624392]
  [ 0.60794034]
  ...
  [ 0.40943367]
  [ 0.23435953]
  [ 0.14617582]]

 [[ 0.53293663]
  [ 0.22526757]
  [-0.11386336]
  ...
  [ 0.37696882]
  [ 0.10320845]
  [-0.20891503]]

 ...

 [[-0.47934666]
  [-0.50156481]
  [-0.14949891]
  ...
  [-0.40245048]
  [-0.71194123]
  [-0.45659152]]

 [[-0.34125646]
  [ 0.06521414]
  [ 0.43507957]
  ...
  [-0.30939038]
  [ 0.51261748]
  [-0.15422766]]

 [[-0.62282205]
  [-0.71653909]
  [-0.73124321]
  ...
  [ 0.39364534]
  [ 0.51208033]
  [ 0.6190817 ]]]
(5000, 50, 3)
tf.Tensor(
[[[ 0.33921528 -0.43598193  0.600492  ]
  [-0.49241593  0.17347012  0.33224392]
  [ 0.35152832  0.37959152 -0.16224378]
  ...
  [-0.06673425 -0.17252155  0.23508053]
  [ 0.0123141  -0.02790023  0.18648027]
  [ 0.1324764  -0.12013917  0.40418127]]

 [[-0.09450613  0.1247441  -0.1841015 ]
  [ 0.27119124 -0.2196153   0.14063688]
  [

## 2.2) Accessing the parameters


Building an RNN $\texttt{model}$ with two simpleRNN layers (3 and 1 units respectively) and only one output $y(50) \approx x(51)$



In [None]:
#log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
#tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

# Two stacked recurrent layers: the first (3 units) returns its whole output
# sequence, the second (1 unit) keeps the default return_sequences=False.
model = keras.Sequential()
model.add(layers.SimpleRNN(units=3, return_sequences=True, input_shape=[None, 1]))
model.add(layers.SimpleRNN(units=1))

#print('VARIABLES',model.variables)
#print('TRAINABLE VARIABLES',model.trainable_variables)

# Kernel, recurrent kernel and bias of each layer.
print('TRAINABLE WEIGHTS',model.trainable_weights)

TRAINABLE WEIGHTS [<tf.Variable 'simple_rnn_2/simple_rnn_cell_2/kernel:0' shape=(1, 3) dtype=float32, numpy=array([[-0.23641968,  0.14918315, -0.769784  ]], dtype=float32)>, <tf.Variable 'simple_rnn_2/simple_rnn_cell_2/recurrent_kernel:0' shape=(3, 3) dtype=float32, numpy=
array([[-0.21063721, -0.77099264, -0.6010011 ],
       [ 0.47261637, -0.61848545,  0.62778133],
       [-0.85572535, -0.15180884,  0.49465972]], dtype=float32)>, <tf.Variable 'simple_rnn_2/simple_rnn_cell_2/bias:0' shape=(3,) dtype=float32, numpy=array([0., 0., 0.], dtype=float32)>, <tf.Variable 'simple_rnn_3/simple_rnn_cell_3/kernel:0' shape=(3, 1) dtype=float32, numpy=
array([[-0.53541654],
       [ 0.07216978],
       [ 0.5560576 ]], dtype=float32)>, <tf.Variable 'simple_rnn_3/simple_rnn_cell_3/recurrent_kernel:0' shape=(1, 1) dtype=float32, numpy=array([[1.]], dtype=float32)>, <tf.Variable 'simple_rnn_3/simple_rnn_cell_3/bias:0' shape=(1,) dtype=float32, numpy=array([0.], dtype=float32)>]


## 2.3) Gradient and training



Using $\texttt{GradientTape}$, computing the derivatives of $\displaystyle{\frac{1}{N} \sum_{k=1}^{N} \left\| \hat{y}^{k}_2 (50) - x^k(51) \right\|^2}$ with respect to all the trainable parameters, where $N$ is the number of trajectories in the dataset.


In [None]:
loss_fn = tf.keras.losses.MeanSquaredError()

# Only the forward pass and the loss have to be recorded by the tape.
with tf.GradientTape() as tape:
  X2 = model(X0, training=True)
  # Keras losses are called as loss(y_true, y_pred): targets first, predictions second.
  # NOTE(review): MSE is symmetric so the value is unchanged, but the printed
  # dtype may become float32 (the model's dtype) instead of float64 - confirm.
  loss_value = loss_fn(Series[:, -1, :], X2)
print('loss_value',loss_value)

# Differentiate after leaving the context so the backward ops are not recorded.
grads = tape.gradient(loss_value, model.trainable_weights)
print('grad length', len(grads))
print('gradient',grads)

loss_value tf.Tensor(0.4792969226837158, shape=(), dtype=float64)
grad length 6
gradient [<tf.Tensor: shape=(1, 3), dtype=float32, numpy=array([[ 0.07866429, -0.18679382,  0.11946323]], dtype=float32)>, <tf.Tensor: shape=(3, 3), dtype=float32, numpy=
array([[-0.09564289, -0.01420624, -0.06465584],
       [-0.00377125,  0.0268035 ,  0.01120147],
       [-0.10509069,  0.02422908, -0.07476117]], dtype=float32)>, <tf.Tensor: shape=(3,), dtype=float32, numpy=array([ 0.07012494, -0.18416154, -0.02856793], dtype=float32)>, <tf.Tensor: shape=(3, 1), dtype=float32, numpy=
array([[-0.24019219],
       [ 0.09486873],
       [-0.2084916 ]], dtype=float32)>, <tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[0.0600991]], dtype=float32)>, <tf.Tensor: shape=(1,), dtype=float32, numpy=array([-0.05794057], dtype=float32)>]


## 2.4) Training loop


In [None]:
# Adam optimizer created with its default arguments.
optimizer = keras.optimizers.Adam()

In [None]:

n_epoch=2

start=time.time()
# Full-batch training: each epoch is a single gradient step on all of X0.
for epoch in range(n_epoch):
  # Record only the forward pass and the loss.
  with tf.GradientTape() as tape:
    X2 = model(X0, training=True)
    # Keras losses are called as loss(y_true, y_pred).
    loss_value = loss_fn(Series[:, -1, :], X2)
  print('loss_value',loss_value)
  # Backward pass and weight update run outside the tape so they are not recorded.
  grads = tape.gradient(loss_value, model.trainable_weights)
  optimizer.apply_gradients(zip(grads, model.trainable_weights))

print('duration of the loop',time.time()-start)

loss_value tf.Tensor(0.4792969226837158, shape=(), dtype=float64)
loss_value tf.Tensor(0.47718095779418945, shape=(), dtype=float64)
loss_value tf.Tensor(0.4743861258029938, shape=(), dtype=float64)
loss_value tf.Tensor(0.4711986780166626, shape=(), dtype=float64)
loss_value tf.Tensor(0.46776890754699707, shape=(), dtype=float64)
loss_value tf.Tensor(0.4641799330711365, shape=(), dtype=float64)
loss_value tf.Tensor(0.460477352142334, shape=(), dtype=float64)
loss_value tf.Tensor(0.45669320225715637, shape=(), dtype=float64)
loss_value tf.Tensor(0.4528559446334839, shape=(), dtype=float64)
loss_value tf.Tensor(0.44899141788482666, shape=(), dtype=float64)
loss_value tf.Tensor(0.44512221217155457, shape=(), dtype=float64)
loss_value tf.Tensor(0.44126787781715393, shape=(), dtype=float64)
loss_value tf.Tensor(0.4374452233314514, shape=(), dtype=float64)
loss_value tf.Tensor(0.4336685538291931, shape=(), dtype=float64)
loss_value tf.Tensor(0.429950088262558, shape=(), dtype=float64)
loss_v

## 2.5) Graph execution


In [None]:
# Count the physical devices TensorFlow can see, one line per device type.
for device_type in ('GPU', 'CPU', 'TPU'):
  n_devices = len(tf.config.list_physical_devices(device_type))
  print("Num " + device_type + "s Available: ", n_devices)

Num GPUs Available:  1
Num CPUs Available:  1
Num TPUs Available:  0


In [None]:
## graph execution 

@tf.function
def train_tffunct(x, y):
    """Run one optimisation step on (x, y) inside a `tf.function`.

    Uses the notebook-level `model`, `loss_fn` and `optimizer`.

    Args:
        x: inputs passed to `model`.
        y: targets compared with the model output.

    Returns:
        The loss evaluated on (x, y) before the weights are updated.
    """
    with tf.GradientTape() as tape:
        predictions = model(x, training=True)
        # Keras losses are called as loss(y_true, y_pred): targets first.
        loss_value = loss_fn(y, predictions)
    # Gradient and update happen once the tape has stopped recording.
    grads = tape.gradient(loss_value, model.trainable_weights)
    optimizer.apply_gradients(zip(grads, model.trainable_weights))
    return loss_value

In [None]:
start=time.time()

for epoch in range(n_epoch):
  # train_tffunct opens its own GradientTape, so no outer tape is needed here.
  loss_value = train_tffunct(X0,Series[:,-1,:])
  print('loss_value',loss_value)


print('duration of the loop',time.time()-start)

loss_value tf.Tensor(0.35742056369781494, shape=(), dtype=float64)
loss_value tf.Tensor(0.3552796542644501, shape=(), dtype=float64)
loss_value tf.Tensor(0.3531627058982849, shape=(), dtype=float64)
loss_value tf.Tensor(0.3510694205760956, shape=(), dtype=float64)
loss_value tf.Tensor(0.34899967908859253, shape=(), dtype=float64)
loss_value tf.Tensor(0.3469533622264862, shape=(), dtype=float64)
loss_value tf.Tensor(0.3449302613735199, shape=(), dtype=float64)
loss_value tf.Tensor(0.34292978048324585, shape=(), dtype=float64)
loss_value tf.Tensor(0.34095126390457153, shape=(), dtype=float64)
loss_value tf.Tensor(0.3389938473701477, shape=(), dtype=float64)
loss_value tf.Tensor(0.33705660700798035, shape=(), dtype=float64)
loss_value tf.Tensor(0.3351387083530426, shape=(), dtype=float64)
loss_value tf.Tensor(0.33323934674263, shape=(), dtype=float64)
loss_value tf.Tensor(0.3313578963279724, shape=(), dtype=float64)
loss_value tf.Tensor(0.3294937014579773, shape=(), dtype=float64)
loss_va

## 2.6) Graph Optimization



In [None]:
# Options in effect before the change.
opts = tf.config.optimizer.get_experimental_options()
print(opts)
# The documented Grappler keys end in `_optimization`. Spellings such as
# 'arithmetic_optimizer' or 'shape_optimizer' are not valid keys and leave
# the reported options unchanged.
tf.config.optimizer.set_experimental_options({
    'constant_folding': True,
    'arithmetic_optimization': True,
    'shape_optimization': True,
})
# Options in effect after the change.
opts = tf.config.optimizer.get_experimental_options()
print(opts)

{'constant_folding': True, 'disable_model_pruning': False, 'disable_meta_optimizer': False}
{'constant_folding': True, 'disable_model_pruning': False, 'disable_meta_optimizer': False}
