# Layers and Modules
Layers define a single operation in a neural network, while modules define a group of related layers that form a common pattern; a module can even be an entire model on its own. We use modules to abstract repeated patterns in neural network design, which allows us to build more complex models with compact code.

### Using prebuilt modules

In [1]:
# import the required packages
from typing import Optional

import tensorflow as tf

In [9]:
# define a sequential model and test the output on sample generated input

# we did not need to define a matching input layer
net = tf.keras.models.Sequential(
    [ tf.keras.layers.Dense(256, activation=tf.nn.relu),
      tf.keras.layers.Dense(10),
    ]
)

# generated sample input code

X = tf.random.uniform((2, 14))
# the input shape of the network is defined the first time it is called
print(net(X).shape)

X = tf.random.uniform((2, 14))
print(net(X).shape)



(2, 10)
(2, 10)


### Building a custom module

In [10]:
# defining our custom module

class MLP(tf.keras.Model):
    """Two-layer perceptron: a 256-unit ReLU hidden layer and a 10-unit output layer."""

    def __init__(self):
        # run the parent constructor first so the layers assigned
        # below are registered on a fully initialised model
        super().__init__()
        self.hidden = tf.keras.layers.Dense(256, activation=tf.nn.relu)
        self.out = tf.keras.layers.Dense(10)

    def call(self, X):
        """Forward propagation: compute the model output from the input X."""
        hidden_activations = self.hidden(X)
        return self.out(hidden_activations)

In [11]:
net = MLP()
net(X).shape

TensorShape([2, 10])

### The sequential module

In [15]:
from typing import List

class MySequential(tf.keras.Model):
    """Minimal sequential container that chains its layers in insertion order.

    Args:
        modules: optional initial list of layers/models. When omitted, the
            container starts empty and layers can be added with `add_layer`.
    """

    def __init__(self, modules: Optional[List[tf.keras.Model]] = None):
        super().__init__()
        # Build a fresh list per instance. A mutable default argument (`[]`)
        # is created once and shared by every instance, so `add_layer` on one
        # container would silently add layers to all the others.
        self.modules = [] if modules is None else list(modules)

    def add_layer(self, layer: tf.keras.Model):
        """Append `layer` so that it runs after all previously added layers."""
        self.modules.append(layer)

    def call(self, X):
        """Feed X through every layer, each output becoming the next input."""
        for layer in self.modules:
            X = layer(X)
        return X
    
# each layer is executed in the order in which they were added

In [16]:
net = MySequential([tf.keras.layers.Dense(units=256, activation=tf.nn.relu), 
                    tf.keras.layers.Dense(10)])
net(X).shape

TensorShape([2, 10])

### Executing code in the forward propagation method

In [28]:
class FixedHiddenMLP(tf.keras.Model):
    """MLP that mixes a fixed random projection, a Dense layer and Python control flow."""

    def __init__(self):
        super().__init__()
        self.flatten = tf.keras.layers.Flatten()
        # constant (14, 20) weights: created once at random and held as a
        # tf.constant rather than a trainable variable
        self.rand_weights = tf.constant(tf.random.uniform((14, 20)))
        self.dense = tf.keras.layers.Dense(20, activation=tf.nn.relu)

    def call(self, inputs):
        """Return a scalar: the sum of the rescaled Dense-layer output."""
        flat = self.flatten(inputs)
        # fixed projection with the constant weights, followed by relu
        projected = tf.matmul(flat, self.rand_weights) + 1
        X = self.dense(tf.nn.relu(projected))
        # control flow: halve until the sum of absolute values is at most 1
        while tf.reduce_sum(tf.math.abs(X)) > 1:
            X = X / 2
        return tf.reduce_sum(X)

In [18]:
sample = tf.random.uniform(shape=(3, 3))
sample

<tf.Tensor: shape=(3, 3), dtype=float32, numpy=
array([[0.71274793, 0.65159774, 0.6920341 ],
       [0.597064  , 0.17945683, 0.4411521 ],
       [0.6865252 , 0.24947512, 0.40614355]], dtype=float32)>

In [19]:
tf.math.abs(sample)

<tf.Tensor: shape=(3, 3), dtype=float32, numpy=
array([[0.71274793, 0.65159774, 0.6920341 ],
       [0.597064  , 0.17945683, 0.4411521 ],
       [0.6865252 , 0.24947512, 0.40614355]], dtype=float32)>

In [22]:
tf.reduce_sum(tf.math.abs(sample))

<tf.Tensor: shape=(), dtype=float32, numpy=4.616197>

In [29]:
net = FixedHiddenMLP()
net(X)

<tf.Tensor: shape=(), dtype=float32, numpy=0.54806477>

### Nested networks for people who like confusion

In [31]:
class NestMLP(tf.keras.Model):
    """Model that nests a two-layer Sequential block in front of a Dense(16) head."""

    def __init__(self):
        super().__init__()
        # inner block: Dense(64) -> Dense(32), both with relu
        self.net = tf.keras.Sequential([
            tf.keras.layers.Dense(64, activation=tf.nn.relu),
            tf.keras.layers.Dense(32, activation=tf.nn.relu),
        ])
        self.dense = tf.keras.layers.Dense(16, activation=tf.nn.relu)

    def call(self, inputs):
        """Run the nested block, then the final dense layer."""
        features = self.net(inputs)
        return self.dense(features)

chimera = tf.keras.Sequential()
chimera.add(NestMLP())
chimera.add(tf.keras.layers.Dense(14))
chimera.add(FixedHiddenMLP())
chimera(X)

<tf.Tensor: shape=(), dtype=float32, numpy=0.5758785>

In [37]:
class ParallelMLP(tf.keras.Model):
    """Apply two sub-networks to the same input and concatenate their outputs."""

    def __init__(self, net1, net2):
        super().__init__()
        self.net1 = net1
        self.net2 = net2

    def call(self, X):
        """Return the outputs of net1 and net2 joined along axis 1."""
        branch_outputs = [self.net1(X), self.net2(X)]
        return tf.concat(branch_outputs, axis=1)


In [39]:
net1 = tf.keras.models.Sequential([
    tf.keras.layers.Dense(20, activation=tf.nn.relu),
    tf.keras.layers.Dense(20, activation=tf.nn.relu),
    tf.keras.layers.Dense(2, activation=tf.nn.relu),  
])

net2 = tf.keras.models.Sequential([
    tf.keras.layers.Dense(20, activation=tf.nn.tanh),
    tf.keras.layers.Dense(2, activation=tf.nn.tanh),  
])

parallel_mlp = ParallelMLP(net1, net2)
parallel_mlp(X)

<tf.Tensor: shape=(2, 4), dtype=float32, numpy=
array([[ 0.        ,  0.        ,  0.5597044 , -0.10258479],
       [ 0.        ,  0.        ,  0.87296665, -0.59194946]],
      dtype=float32)>

# Parameter management
Parameters store the learned structural patterns of our data, and learning them is the purpose of training; once training finishes, the parameters are the values extracted from the training process. The trained parameters are what the model uses to make future predictions.

In [40]:
net = tf.keras.models.Sequential([
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(4, activation=tf.nn.relu),
    tf.keras.layers.Dense(1)
])

X = tf.random.uniform((2, 4))
net(X).shape

TensorShape([2, 1])

In [41]:
len(net.layers)

3

In [48]:
net.layers[2].weights

[<tf.Variable 'dense_53/kernel:0' shape=(4, 1) dtype=float32, numpy=
 array([[ 0.7134845 ],
        [ 0.13386679],
        [-0.79644763],
        [-0.874894  ]], dtype=float32)>,
 <tf.Variable 'dense_53/bias:0' shape=(1,) dtype=float32, numpy=array([0.], dtype=float32)>]

In [49]:
type(net.layers[2].weights[1]), tf.convert_to_tensor(net.layers[2].weights[1])

(tensorflow.python.ops.resource_variable_ops.ResourceVariable,
 <tf.Tensor: shape=(1,), dtype=float32, numpy=array([0.], dtype=float32)>)

In [50]:
# get all the weights on a network
net.get_weights()

[array([[ 0.8567994 , -0.13660324, -0.61530447, -0.6061369 ],
        [ 0.62813133,  0.18498129, -0.7404247 , -0.50554276],
        [-0.2476942 , -0.229617  ,  0.03248268, -0.6440674 ],
        [-0.75030273, -0.09672415,  0.5307556 , -0.5516254 ]],
       dtype=float32),
 array([0., 0., 0., 0.], dtype=float32),
 array([[ 0.7134845 ],
        [ 0.13386679],
        [-0.79644763],
        [-0.874894  ]], dtype=float32),
 array([0.], dtype=float32)]

### Tied parameters

In [58]:
# Pass the same Dense layer object at two positions of a Sequential model.
# tf.keras behaves a bit differently here: it removes the duplicate layer
# automatically, so the shared layer is listed only once in `net.layers`.
shared = tf.keras.layers.Dense(4, activation=tf.nn.relu)
net = tf.keras.models.Sequential([
    tf.keras.layers.Flatten(),
    shared,
    tf.keras.layers.Dense(16),
    shared,
    tf.keras.layers.Dense(1),
])


net(X)
# Five entries were passed above; check that only 4 distinct layers remain
# NOTE(review): presumably the second `shared` entry is simply dropped rather
# than applied twice — confirm before relying on this for weight tying.
print(len(net.layers) == 4)


True


In [60]:
net = NestMLP()
type(net)

__main__.NestMLP

In [75]:
# call the model once on X before compiling and fitting
net(X)
y = tf.random.uniform((2, 1))
# NOTE(review): NestMLP ends in Dense(16), so predictions are (2, 16) while y
# is (2, 1); the MAE loss presumably relies on broadcasting — confirm intended.
# NOTE(review): "accuracy" is a classification metric and is unlikely to be
# meaningful next to an MAE regression loss — confirm or drop.
net.compile(optimizer=tf.optimizers.legacy.Adam(learning_rate=1e-3), 
            loss=tf.losses.MAE,
            metrics=["accuracy"])
net.fit(X, y)



<keras.callbacks.History at 0x2c1aae550>

# Parameter Initialization
Deep learning frameworks provide default random initialization for our parameters, but there are other strategies we can use. We may want to initialize our model parameters according to different protocols, and we can also write a custom initializer when none of the built-in ones fit.

In [76]:
net = tf.keras.models.Sequential([
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(4, activation=tf.nn.relu),
    tf.keras.layers.Dense(1),
])

X = tf.random.uniform((2, 4))
net(X).shape

TensorShape([2, 1])

## built in initialization
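We can use a built-in initializer such as the random normal initializer shown below. (Note that the default kernel initializer for Keras `Dense` layers is Glorot uniform, not random normal.)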


In [78]:
net = tf.keras.models.Sequential([
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(4, 
                          activation=tf.nn.relu, 
                          kernel_initializer=tf.random_normal_initializer(mean=0, stddev=0.01), 
                          bias_initializer=tf.zeros_initializer()),
    tf.keras.layers.Dense(1)])
net(X)
net.weights[0], net.weights[1]

(<tf.Variable 'dense_75/kernel:0' shape=(4, 4) dtype=float32, numpy=
 array([[ 0.00093964, -0.00284592, -0.00988948,  0.01653008],
        [-0.00289992,  0.0105628 , -0.01163867, -0.00177453],
        [ 0.00653719, -0.00986328,  0.01661347,  0.00828161],
        [ 0.00509416, -0.00229942,  0.00503318,  0.00372605]],
       dtype=float32)>,
 <tf.Variable 'dense_75/bias:0' shape=(4,) dtype=float32, numpy=array([0., 0., 0., 0.], dtype=float32)>)

In [79]:
net = tf.keras.models.Sequential([
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(4, 
                          activation=tf.nn.relu, 
                          kernel_initializer=tf.initializers.Constant(1), 
                          bias_initializer=tf.zeros_initializer()),
    tf.keras.layers.Dense(1)])
net(X)
net.weights[0], net.weights[1]

(<tf.Variable 'dense_77/kernel:0' shape=(4, 4) dtype=float32, numpy=
 array([[1., 1., 1., 1.],
        [1., 1., 1., 1.],
        [1., 1., 1., 1.],
        [1., 1., 1., 1.]], dtype=float32)>,
 <tf.Variable 'dense_77/bias:0' shape=(4,) dtype=float32, numpy=array([0., 0., 0., 0.], dtype=float32)>)

### Xavier Initialization

In [80]:
net = tf.keras.models.Sequential([
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(4, 
                          activation=tf.nn.relu, 
                          kernel_initializer=tf.initializers.GlorotUniform(), 
                          bias_initializer=tf.initializers.Constant(42)),
    tf.keras.layers.Dense(1)])
net(X)
net.weights[0], net.weights[1]

(<tf.Variable 'dense_79/kernel:0' shape=(4, 4) dtype=float32, numpy=
 array([[ 0.7457518 , -0.57943475, -0.64610535, -0.0158475 ],
        [ 0.3413654 ,  0.5317233 ,  0.42933124, -0.67540705],
        [ 0.19575137,  0.1263653 ,  0.41023523,  0.69752795],
        [ 0.80360955, -0.12941885,  0.78638715,  0.7876795 ]],
       dtype=float32)>,
 <tf.Variable 'dense_79/bias:0' shape=(4,) dtype=float32, numpy=array([42., 42., 42., 42.], dtype=float32)>)

### Custom initializations

In [81]:
# we can create our custom initializer classes by subclassing the Initializer parent class
# then we modify the call function
class MyInit(tf.keras.initializers.Initializer):
    """Initializer drawing from U(-10, 10) and zeroing values with magnitude below 5."""

    def __call__(self, shape, dtype=None, **kwargs):
        """Return a tensor of the given shape and dtype.

        Values are sampled uniformly from [-10, 10); entries whose absolute
        value is less than 5 are replaced by zero.
        """
        data = tf.random.uniform(shape, -10, 10, dtype=dtype)
        # Cast the boolean mask to the dtype of `data` instead of a hard-coded
        # float32, otherwise the multiplication fails for any other dtype.
        keep = tf.cast(tf.abs(data) >= 5, data.dtype)
        return data * keep
    
    
net = tf.keras.models.Sequential([
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(4, activation=tf.nn.relu, kernel_initializer=MyInit()),
    tf.keras.layers.Dense(1)
])

net(X)
print(net.layers[1].weights[0])

<tf.Variable 'dense_81/kernel:0' shape=(4, 4) dtype=float32, numpy=
array([[ 0.       ,  0.       , -0.       ,  0.       ],
       [ 0.       ,  0.       , -7.7973294,  0.       ],
       [ 5.177188 ,  0.       ,  0.       , -8.511324 ],
       [-0.       ,  0.       , -0.       , -7.1130776]], dtype=float32)>


We also have the option to set our parameters directly if we wish to

# Lazy Initialization
This allows us to initialize our layers and modules without knowing the input shape of our network. This is because initialization is done on the fly, the first time data passes through the network.

In [82]:
net = tf.keras.models.Sequential([
    tf.keras.layers.Dense(256, activation=tf.nn.relu),
    tf.keras.layers.Dense(10)
])

In [83]:
# every layer reports an empty weight list because the network is not yet initialized
[layer.get_weights() for layer in net.layers]

[[], []]

In [84]:
X = tf.random.uniform((2, 20))
net(X)
[w.shape for w in net.get_weights()]

[(20, 256), (256,), (256, 10), (10,)]

# Implementation of custom layers
New algorithms often require operations that the built-in layers do not provide; in that case we need to implement custom layers ourselves.

### Layers without parameters


In [89]:
class CenteredLayer(tf.keras.Model):
    """Parameter-free layer that centres its input around zero."""

    def __init__(self):
        super().__init__()

    def call(self, X):
        """Subtract the mean taken over all elements of X from every element."""
        overall_mean = tf.reduce_mean(X)
        return X - overall_mean

In [92]:
# Testing out the centered layer
layer = CenteredLayer()
layer(tf.constant([[1, 2, 3, 4, 5]]))

<tf.Tensor: shape=(1, 5), dtype=int32, numpy=array([[-2, -1,  0,  1,  2]], dtype=int32)>

In [94]:
# using our custom centered layer to create more complex architectures
net = tf.keras.Sequential([tf.keras.layers.Dense(128), CenteredLayer()])
data = tf.random.uniform((2, 10))
net(data).shape


TensorShape([2, 128])

### Layers with parameters


In [120]:
# we would be implementing a custom dense layer
class MyDense(tf.keras.Model):
    """Custom fully connected layer with a relu activation.

    Args:
        units: number of output features.
    """

    def __init__(self, units):
        super().__init__()
        self.units = units

    def build(self, X_shape):
        """Create the kernel and bias once the last input dimension is known."""
        self.weight = self.add_weight(
            name='weight',
            shape=[X_shape[-1], self.units],
            initializer=tf.random_normal_initializer())
        self.bias = self.add_weight(
            name='bias',
            shape=[self.units],
            initializer=tf.zeros_initializer())

    def call(self, X):
        """Return relu(X @ weight + bias)."""
        # Use the named attribute rather than `self.weights[0]`, which depends
        # on the order in which the variables happened to be created.
        linear = tf.matmul(X, self.weight) + self.bias
        return tf.nn.relu(linear)

In [121]:
dense = MyDense(3)
dense(tf.random.uniform((2, 5)))
dense.get_weights()

[array([[-0.0662522 ,  0.01260725, -0.04122119],
        [ 0.01992019, -0.0181719 , -0.00875344],
        [-0.02290619, -0.02234221,  0.01695419],
        [ 0.0264551 , -0.05543638, -0.05993166],
        [-0.00914237,  0.0074254 ,  0.03734822]], dtype=float32),
 array([0., 0., 0.], dtype=float32)]

In [122]:
dense(tf.random.uniform((2, 5))) # using custom layers to carry out operations

<tf.Tensor: shape=(2, 3), dtype=float32, numpy=
array([[0., 0., 0.],
       [0., 0., 0.]], dtype=float32)>

In [125]:
# We can also use these custom layers with parameters to create large sequence networks
net = tf.keras.models.Sequential([MyDense(8), MyDense(1)])
net(tf.random.uniform((2, 64)))

<tf.Tensor: shape=(2, 1), dtype=float32, numpy=
array([[0.00721058],
       [0.03586151]], dtype=float32)>

In [126]:
class Exercise1(tf.keras.Model):
    """Parameter-free model that reduces its input to a single sum."""

    def __init__(self):
        super().__init__()

    def call(self, X):
        """Return the sum over all elements of X."""
        total = tf.reduce_sum(X)
        return total

class Exercise2(tf.keras.Model):
    """Placeholder for the second exercise; the forward pass is not implemented yet."""

    def __init__(self):
        # The parent constructor must run, otherwise the Keras model is left
        # uninitialised and cannot be used.
        super().__init__()

    def call(self, X):
        # TODO: implement the exercise; for now every input yields None
        return None

# File I/O
We often need to interface with the file I/O system to store weights and model information for use in future predictions. We can also store the weights of a model at intervals during training so that the computation already done is not lost if the system fails for any reason; these saved states are called checkpoints.

In [128]:
x = tf.range(4)
x

<tf.Tensor: shape=(4,), dtype=int32, numpy=array([0, 1, 2, 3], dtype=int32)>

In [130]:
# saving the tensor data to a file using numpy
import numpy as np
np.save("x-file.npy", x)

In [131]:
# reading the data from a file using numpy
# (a plain numeric array needs no pickle support, so keep the safe default)
x2 = np.load("x-file.npy")
x2

array([0, 1, 2, 3], dtype=int32)

In [134]:
y = tf.zeros(4)
# np.save stores [x, y] as one numeric array, so the int32 x comes back as floats
np.save("xy-file.npy", [x, y])
# a plain numeric array needs no pickle support, so keep the safe default
x2, y2 = np.load("xy-file.npy")
(x2, y2)

(array([0., 1., 2., 3.]), array([0., 0., 0., 0.]))

In [135]:
# we can store dictionaries with the numpy format
mydict = {'x':x, 'y':y}
np.save('mydict.npy', mydict)
mydict2 = np.load('mydict.npy', allow_pickle=True)
mydict2

array({'x': <tf.Tensor: shape=(4,), dtype=int32, numpy=array([0, 1, 2, 3], dtype=int32)>, 'y': <tf.Tensor: shape=(4,), dtype=float32, numpy=array([0., 0., 0., 0.], dtype=float32)>},
      dtype=object)

### Loading and saving model parameters
We need to save entire model parameters because when we start to deal with more complex models, it becomes difficult to save all the individual weights

In [137]:
class MLP(tf.keras.Model):
    """Flatten -> Dense(256, relu) -> Dense(10) perceptron used for the save/load demo."""

    def __init__(self):
        super().__init__()
        self.flatten = tf.keras.layers.Flatten()
        self.hidden = tf.keras.layers.Dense(256, activation=tf.nn.relu)
        self.out = tf.keras.layers.Dense(10)

    def call(self, X):
        """Forward propagation through the flatten, hidden and output layers."""
        return self.out(self.hidden(self.flatten(X)))

In [138]:
net = MLP()
X = tf.random.uniform((2, 20))
Y = net(X)

net.save_weights("mlp.params")

In [140]:
clone = MLP()
clone.load_weights("mlp.params")

<tensorflow.python.checkpoint.checkpoint.CheckpointLoadStatus at 0x2c25a0090>

In [141]:
Y_clone = clone(X)
Y_clone == Y

<tf.Tensor: shape=(2, 10), dtype=bool, numpy=
array([[ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True]])>

# Working with GPUs
GPUs enable us to perform more complex computations in a short amount of time, because they parallelize operations, which is useful for training large neural networks. You can work with a single GPU on a device or multiple GPUs on a device, or you could use a network of distributed computers, each with its own GPUs.

In [142]:
# check if your device has a gpu
!nvidia-smi

zsh:1: command not found: nvidia-smi


In [143]:
tf.config.list_logical_devices() # list all the logical computational devices available to tensorflow

[LogicalDevice(name='/device:CPU:0', device_type='CPU')]

In [144]:
tf.config.list_physical_devices() # lists all the physical devices available to tensorflow

[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU')]

In [145]:
def get_num_gpus():
    """Return how many physical GPU devices TensorFlow lists."""
    gpu_devices = tf.config.list_physical_devices('GPU')
    return len(gpu_devices)

get_num_gpus()

0

In [146]:
x = tf.constant([1, 2, 3])
x.device

'/job:localhost/replica:0/task:0/device:CPU:0'

In [147]:
# for an operation between two variables to be computed, the
# two variables must be on the same device. 

### Neural networks and GPUs
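We can place an entire neural network on GPU devices. In the example below the model is created inside a `tf.distribute.MirroredStrategy` scope, which replicates its variables across the available devices rather than pinning them to one specific GPU.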

In [None]:
strategy = tf.distribute.MirroredStrategy()
with strategy.scope():
    net = tf.keras.models.Sequential([
        tf.keras.layers.Dense(1)])