In [6]:
##https://developer.apple.com/metal/tensorflow-plugin/
## at the end: pip install numpy==1.21.2
#for scipy
#brew install openblas gfortran
#OPENBLAS="$(brew --prefix openblas)" pip install numpy scipy
#https://betterprogramming.pub/installing-tensorflow-on-apple-m1-with-new-metal-plugin-6d3cb9cb00ca (problem with Adam)

### 

# 4. Kernel dies when fitting the model

This is the most common error, which occurs when training the model.

Specifically, the execution crashes, and an NSInvalidArgumentException is thrown. Under the hood, TensorFlow uses tensorflow-metal which was built using MPSGraph inference enhancements capabilities for the GPU.

To fix this, use the previous version Tensorflow, as well as the previous version of tensorflow_metal. For instance, if the current version of Tensorflow is 2.6 and tensorflow-metal is 0.2, try:

$ conda create --name tensorflow_m1 python==3.9

$ conda activate tensorflow_m1$ conda install -c apple tensorflow-deps==2.5.0

$ pip install tensorflow-macos==2.5.0

$ pip install tensorflow-macos==2.5.0 --no-dependencies

$ pip install tensorflow-metal==0.1.2

In [1]:
import scipy

In [2]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers


In [3]:
#Sequential
model=keras.Sequential()
model.add(layers.Dense(64, activation="relu", input_shape=(784,)))
model.add(layers.Dense(64, activation="relu"))
model.add(layers.Dense(10))

Metal device set to: Apple M1


2021-10-29 12:50:34.355797: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2021-10-29 12:50:34.356032: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [4]:
#The input can be introduced in another way
model=keras.Sequential()
model.add(keras.Input(shape=(784,)))
model.add(layers.Dense(64, activation="relu"))
model.add(layers.Dense(64, activation="relu"))
model.add(layers.Dense(10))


In [5]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_3 (Dense)              (None, 64)                50240     
_________________________________________________________________
dense_4 (Dense)              (None, 64)                4160      
_________________________________________________________________
dense_5 (Dense)              (None, 10)                650       
Total params: 55,050
Trainable params: 55,050
Non-trainable params: 0
_________________________________________________________________


In [6]:
#functional 
inputs = keras.Input(shape=(784,))
x1 = layers.Dense(64, activation="relu")(inputs)
x2 = layers.Dense(64, activation="relu")(x1) #i also can concat x1 with another input--[x1, x3]
outputs = layers.Dense(10)(x2)
model = keras.Model(inputs=inputs, outputs=outputs, name="mnist_model")

In [7]:
keras.utils.plot_model(model, "my_first_model.png")

('You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) ', 'for plot_model/model_to_dot to work.')


### https://keras.io/getting_started/intro_to_keras_for_researchers/

Keras layers

While TensorFlow is an infrastructure layer for differentiable programming, dealing with tensors, variables, and gradients, Keras is a user interface for deep learning, dealing with layers, models, optimizers, loss functions, metrics, and more.

Keras serves as the high-level API for TensorFlow: Keras is what makes TensorFlow simple and productive.

The Layer class is the fundamental abstraction in Keras. A Layer encapsulates a state (weights) and some computation (defined in the call method).

A simple layer looks like this:

In [8]:
class Test(keras.layers.Layer):
    def __init__(self):
        super(Test, self).__init__()
    

In [9]:
test = Test()

In [10]:
test.weights

[]

In [11]:
class Linear(keras.layers.Layer):
    """y = w.x + b"""

    def __init__(self, units=32, input_dim=32):
        super(Linear, self).__init__()
        w_init = tf.random_normal_initializer() #first constant random weights generated. 
        self.w = tf.Variable(
            initial_value=w_init(shape=(input_dim, units), dtype="float32"),
            trainable=True,
        ) #then make variable from constant w
        b_init = tf.zeros_initializer()
        self.b = tf.Variable(
            initial_value=b_init(shape=(units,), dtype="float32"), trainable=True
        )

    def call(self, inputs): #forward propogation
        return tf.matmul(inputs, self.w) + self.b

In [12]:
# Instantiate our layer.
linear_layer = Linear(units=4, input_dim=2)

# The layer can be treated as a function.
# Here we call it on some data.
y = linear_layer(tf.ones((2, 2)))
assert y.shape == (2, 4)

In [13]:
y

<tf.Tensor: shape=(2, 4), dtype=float32, numpy=
array([[ 0.07693196,  0.13292654, -0.12853672,  0.02693346],
       [ 0.07693196,  0.13292654, -0.12853672,  0.02693346]],
      dtype=float32)>

In [14]:
class Linear(keras.layers.Layer):
    """y = w.x + b"""

    def __init__(self, units=32):
        super(Linear, self).__init__()
        self.units = units

    def build(self, input_shape):
        self.w = self.add_weight(
            shape=(input_shape[-1], self.units),
            initializer="random_normal",
            trainable=True,
        )
        self.b = self.add_weight(
            shape=(self.units,), initializer="random_normal", trainable=True
        )

    def call(self, inputs):
        return tf.matmul(inputs, self.w) + self.b


# Instantiate our lazy layer.
linear_layer = Linear(4)

# This will also call `build(input_shape)` and create the weights.
y = linear_layer(tf.ones((2, 2)))


### Layer gradients

You can automatically retrieve the gradients of the weights of a layer by calling it inside a GradientTape. Using these gradients, you can update the weights of the layer, either manually, or using an optimizer object. Of course, you can modify the gradients before using them, if you need to.

In [15]:
# Prepare a dataset.
(x_train, y_train), _ = tf.keras.datasets.mnist.load_data() #
dataset = tf.data.Dataset.from_tensor_slices(
    (x_train.reshape(60000, 784).astype("float32") / 255, y_train) #convert the x_train into tensor format
)
dataset = dataset.shuffle(buffer_size=1024).batch(64) #in for loop, each time there will be a tuple with the size of 64. 


In [16]:
# Instantiate our linear layer (defined above) with 10 units.
linear_layer = Linear(10)

# Instantiate a logistic loss function that expects integer targets.
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

# Instantiate an optimizer.
optimizer = tf.keras.optimizers.SGD(learning_rate=1e-3)


In [17]:
# Iterate over the batches of the dataset.
for step, (x, y) in enumerate(dataset):

    # Open a GradientTape.
    with tf.GradientTape() as tape:

        # Forward pass.
        logits = linear_layer(x)

        # Loss value for this batch.
        loss = loss_fn(y, logits)
        
    # Get gradients of the loss wrt the weights.
    gradients = tape.gradient(loss, linear_layer.trainable_weights)

    # Update the weights of our linear layer.
    optimizer.apply_gradients(zip(gradients, linear_layer.trainable_weights))

    # Logging.
    if step % 100 == 0:
        print("Step:", step, "Loss:", float(loss))

Step: 0 Loss: 2.496181011199951
Step: 100 Loss: 2.318586826324463
Step: 200 Loss: 2.200226068496704
Step: 300 Loss: 2.0211074352264404
Step: 400 Loss: 2.0411062240600586
Step: 500 Loss: 1.937130093574524
Step: 600 Loss: 1.9069676399230957
Step: 700 Loss: 1.897742509841919
Step: 800 Loss: 1.8282357454299927
Step: 900 Loss: 1.5662264823913574


### Layers that own layers

Layers can be recursively nested to create bigger computation blocks. Each layer will track the weights of its sublayers (both trainable and non-trainable).

In [18]:
# Let's reuse the Linear class
# with a `build` method that we defined above.(gan2-1)


class MLP(keras.layers.Layer):
    """Simple stack of Linear layers.""" 

    def __init__(self):
        super(MLP, self).__init__() #the above class is used here. 
        self.linear_1 = Linear(32) #here three further layers are created whose units are 32, 32 and 10 respectively. 
        self.linear_2 = Linear(32) #in call it will be determined how these 3 layers are connected to each other. 
        self.linear_3 = Linear(10)

    def call(self, inputs): #in this method, it is determined how the inputs feed into our nn. 
        x = self.linear_1(inputs)
        x = tf.nn.relu(x)
        x = self.linear_2(x)
        x = tf.nn.relu(x)
        return self.linear_3(x)


In [19]:
mlp = MLP()

# The first call to the `mlp` object will create the weights.
y = mlp(tf.ones(shape=(3, 64)))

# Weights are recursively tracked.
assert len(mlp.weights) == 6

In [20]:
len(mlp.weights)

6

### Examples of How to Use 1×1 Convolutions
We can make the use of a 1×1 filter concrete with some examples.

Consider that we have a convolutional neural network that expected color images input with the square shape of 256x256x3 pixels.

https://towardsdatascience.com/understanding-and-calculating-the-number-of-parameters-in-convolution-neural-networks-cnns-fc88790d530d

In [21]:
#from IPython.display import Image
#Image(filename='conv_cal.png') 

In [22]:

Image(filename='conv_param_cal.png') 

NameError: name 'Image' is not defined

### 
Basically, the number of parameters in a given layer is the count of “learnable” (assuming such a word exists) elements for a filter aka parameters for the filter for that layer. Parameters in general are weights that are learnt during training. They are weight matrices that contribute to model’s predictive power, changed during back-propagation process. Who governs the change? Well, the training algorithm you choose, particularly the optimization strategy makes them change their values.
Now that you know what “parameters” are, let’s dive into calculating the number of parameters in the sample image we saw above. But, I’d want to include that image again here to avoid your scrolling effort and time.

https://towardsdatascience.com/understanding-and-calculating-the-number-of-parameters-in-convolution-neural-networks-cnns-fc88790d530d

### 

Input layer: Input layer has nothing to learn, at it’s core, what it does is just provide the input image’s shape. So no learnable parameters here. Thus number of parameters = 0.

CONV layer: This is where CNN learns, so certainly we’ll have weight matrices. To calculate the learnable parameters here, all we have to do is just multiply the by the shape of width m, height n, previous layer’s filters d and account for all such filters k in the current layer. Don’t forget the bias term for each of the filter. Number of parameters in a CONV layer would be : ((m * n * d)+1)* k), added 1 because of the bias term for each filter. The same expression can be written as follows: ((shape of width of the filter * shape of height of the filter * number of filters in the previous layer+1)*number of filters). Where the term “filter” refer to the number of filters in the current layer. ((5*5*3)+1)*8 = 608 or ((5*5*8)+1)*16 = 3216

POOL layer: This has got no learnable parameters because all it does is calculate a specific number, no backprop learning involved! Thus number of parameters = 0.

Fully Connected Layer (FC): This certainly has learnable parameters, matter of fact, in comparison to the other layers, this category of layers has the highest number of parameters, why? because, every neuron is connected to every other neuron! So, how to calculate the number of parameters here? You probably know, it is the product of the number of neurons in the current layer c and the number of neurons on the previous layer p and as always, do not forget the bias term. Thus number of parameters here are: ((current layer neurons c * previous layer neurons p)+1*c).

In [23]:
# example of simple cnn model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D
# create model
model = Sequential()
model.add(Conv2D(512, (3,3), padding='same', activation='relu', input_shape=(256, 256, 3)))
# summarize model
model.summary()
#((3*3*3)+1)*512 = 14336
#num_filters = 512
#kernel_size = (3, 3)

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d (Conv2D)              (None, 256, 256, 512)     14336     
Total params: 14,336
Trainable params: 14,336
Non-trainable params: 0
_________________________________________________________________


In [24]:
model.add(Conv2D(512, (1,1), activation='relu')) 

In [25]:
model.summary()
#(1*1*512(#filters_last_layer)+1)*512(#filters_last_layer)= 262656

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d (Conv2D)              (None, 256, 256, 512)     14336     
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 256, 256, 512)     262656    
Total params: 276,992
Trainable params: 276,992
Non-trainable params: 0
_________________________________________________________________


In [26]:
import tensorflow as tf

mnist = tf.keras.datasets.mnist

(x_train, y_train), (x_test, y_test) = mnist.load_data()
x_train, x_test = x_train / 255.0, x_test / 255.0

model = tf.keras.models.Sequential([
  tf.keras.layers.Flatten(input_shape=(28, 28)),
  tf.keras.layers.Dense(128, activation='relu'),
  tf.keras.layers.Dropout(0.2),
  tf.keras.layers.Dense(10)
])

predictions = model(x_train[:1]).numpy()
tf.nn.softmax(predictions).numpy()

loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

loss_fn(y_train[:1], predictions).numpy()

model.compile(optimizer = 'sgd', loss = loss_fn)
model.fit(x_train, y_train, epochs=10)

Epoch 1/10
  18/1875 [..............................] - ETA: 5s - loss: 2.3829  

2021-10-29 12:51:12.479171: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)
2021-10-29 12:51:12.479429: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2021-10-29 12:51:12.550801: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x17708c910>