# How to train a very deep CNN model?
### Some major villains when training ‘very deep’ networks:
* Vanishing Gradient
* Exploding Gradient
* Internal Covariate Shift

### `ReLU` and `BatchNormalization` largely resolve these
* However, BatchNormalization slows down training


In [None]:
from tensorflow.keras.datasets import mnist
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import *
from tensorflow.keras.models import *
import matplotlib.pyplot as plt

In [None]:
# Load MNIST as two (images, labels) pairs: the train split and the test split.
train_split, test_split = mnist.load_data()
train_images, train_labels = train_split
test_images, test_labels = test_split

# Report the array shapes of the test split (images first, then labels).
print('Validation dataset:')
for split_array in (test_images, test_labels):
    print(split_array.shape)

In [None]:
# Render one test image in grayscale, then print the label stored for it.
sample_index = 2
plt.matshow(test_images[sample_index], cmap='gray')
plt.show()
print(test_labels[sample_index])

In [None]:
# Use the test split as the training data (because it is smaller).
# Add a trailing channel axis so each image is 28x28x1; -1 lets NumPy infer
# the number of samples instead of hard-coding it.
train_images = test_images.reshape((-1, 28, 28, 1))
# Convert to float32 and divide by 255 to bring pixel values into [0, 1]
# (assumes 8-bit pixel intensities).
train_images = train_images.astype('float32') / 255
print("Before:")
print(test_labels.shape)
print(test_labels[0])

# Convert the integer class labels to categorical (one-hot) vectors, as
# required by the 'categorical_crossentropy' loss used by the models below.
train_labels = to_categorical(test_labels)
print("After:")
print(train_labels.shape)
print(train_labels[0])

### I. A model with 4 CNN layers does reasonably well

In [None]:
# Baseline network: four 3x3 convolutions, one 2x2 max-pool, then a small
# dense classifier ending in a 10-way softmax.
model = Sequential([
    Conv2D(filters=16, kernel_size=3, activation='relu', input_shape=(28, 28, 1)),
    Conv2D(filters=16, kernel_size=3, activation='relu'),
    Conv2D(filters=16, kernel_size=3, activation='relu'),
    Conv2D(filters=16, kernel_size=3, activation='relu'),
    MaxPooling2D(pool_size=2, strides=2),
    Flatten(),
    Dense(units=16, activation='relu'),
    Dense(units=16, activation='relu'),
    Dense(units=10, activation='softmax'),
])
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# summary() prints the layer table itself and returns None, so wrapping it in
# print() would only add a stray "None" line to the output.
model.summary()

In [None]:
# Train for 4 epochs with mini-batches of 10 samples, holding out 20% of the
# data for validation; keep the returned history object for later inspection.
history = model.fit(
    train_images,
    train_labels,
    epochs=4,
    batch_size=10,
    validation_split=0.2,
)

## II. A model with ~800 CNN layers cannot learn



In [None]:
import sys

# Show the interpreter's current maximum call-stack depth.
# NOTE(review): presumably checked because the very deep model below may
# approach this limit — confirm.
recursion_limit = sys.getrecursionlimit()
print(recursion_limit)

In [None]:
# Extremely deep network: an input convolution, 800 narrow convolutions, one
# more convolution, then the same pooling + dense classifier as the baseline.
model = Sequential()
model.add(Conv2D(filters=16, kernel_size=3, activation='relu', input_shape=(28, 28, 1)))

# padding='same' keeps the spatial size unchanged, so the stack can be made
# this deep without shrinking the 28x28 feature maps away.
for i in range(800):
    model.add(Conv2D(filters=4, kernel_size=3, activation='relu', padding='same'))

model.add(Conv2D(filters=16, kernel_size=3, activation='relu'))
model.add(MaxPooling2D(pool_size=2, strides=2))
model.add(Flatten())
model.add(Dense(units=16, activation='relu'))
model.add(Dense(units=16, activation='relu'))
model.add(Dense(units=10, activation='softmax'))
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# summary() prints the layer table itself and returns None, so wrapping it in
# print() would only add a stray "None" line to the output.
model.summary()

In [None]:
# Train for 8 epochs with mini-batches of 10 samples, holding out 20% of the
# data for validation; keep the returned history object for later inspection.
history = model.fit(
    train_images,
    train_labels,
    epochs=8,
    batch_size=10,
    validation_split=0.2,
)

## III. A model with ~32 CNN layers also may not learn

In [None]:
# Moderately deep network: an input convolution, 16 pairs of narrow
# convolutions (32 layers), one more convolution, then the same pooling +
# dense classifier as the baseline. No normalization layers are used here.
model = Sequential()
model.add(Conv2D(filters=16, kernel_size=3, activation='relu', input_shape=(28, 28, 1)))

# padding='same' keeps the spatial size unchanged across the whole stack.
for i in range(16):
    model.add(Conv2D(filters=4, kernel_size=3, activation='relu', padding='same'))
    model.add(Conv2D(filters=4, kernel_size=3, activation='relu', padding='same'))

model.add(Conv2D(filters=16, kernel_size=3, activation='relu'))
model.add(MaxPooling2D(pool_size=2, strides=2))
model.add(Flatten())
model.add(Dense(units=16, activation='relu'))
model.add(Dense(units=16, activation='relu'))
model.add(Dense(units=10, activation='softmax'))
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# summary() prints the layer table itself and returns None, so wrapping it in
# print() would only add a stray "None" line to the output.
model.summary()

In [None]:
# Train for 4 epochs with mini-batches of 10 samples, holding out 20% of the
# data for validation; keep the returned history object for later inspection.
history = model.fit(
    train_images,
    train_labels,
    epochs=4,
    batch_size=10,
    validation_split=0.2,
)

## IV. BatchNormalization comes to the rescue of a model with a reasonable number of layers

In [None]:
# Same 32-layer stack as the previous experiment, but each pair of
# convolutions is preceded by BatchNormalization and has Dropout between its
# two convolutions.
model = Sequential()
model.add(Conv2D(filters=16, kernel_size=3, activation='relu', input_shape=(28, 28, 1)))

for i in range(16):
    # Normalize the activations entering each pair of convolutions.
    model.add(BatchNormalization())
    model.add(Conv2D(filters=4, kernel_size=3, activation='relu', padding='same'))
    # Randomly drop 40% of the activations during training.
    model.add(Dropout(rate=0.4))
    model.add(Conv2D(filters=4, kernel_size=3, activation='relu', padding='same'))

model.add(Conv2D(filters=16, kernel_size=3, activation='relu'))
model.add(MaxPooling2D(pool_size=2, strides=2))
model.add(Flatten())
model.add(Dense(units=16, activation='relu'))
model.add(Dense(units=16, activation='relu'))
model.add(Dense(units=10, activation='softmax'))
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# summary() prints the layer table itself and returns None, so wrapping it in
# print() would only add a stray "None" line to the output.
model.summary()

In [None]:
# Train for 8 epochs with mini-batches of 10 samples, holding out 20% of the
# data for validation; keep the returned history object for later inspection.
history = model.fit(
    train_images,
    train_labels,
    epochs=8,
    batch_size=10,
    validation_split=0.2,
)

**Summary:**  
A model with 4 CNN layers does reasonably well  
A model with ~800 CNN layers did not learn  
A model with ~32 CNN layers also did not learn  
BatchNormalization comes to the rescue of a model with a reasonable number of layers