### Vanishing Gradient Problem
During Backpropagation gradients are calculated by chain differentiation of loss function w.r.t different dependent variables.<br>
So if gradient of one varible w.r.t other variable < 1 then product of all partial differentials in chain differentiation tends to 0.<br><br>
We have, weight_final = weight_initial - LR x gradient<br><br>
If gradient is very low then change in weight is very small and hence Loss function will not get reduced.
<br><br>
It occurs in Deep Neural Networks having sigmoid/tanh activation function.In worst case, it stops NN from further training.

In [262]:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
import tensorflow
from tensorflow import keras
from tensorflow.keras import Sequential 
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
import numpy as np

In [263]:
X,y=make_classification(n_samples=100,n_features=2,n_classes=2,n_redundant=0,n_repeated=0,n_clusters_per_class=1,random_state=2121)


### Having too many deep layers causes Vanishing gradient problem

In [264]:
test_nn=Sequential(
[Dense(8,activation="sigmoid",input_shape=(2,)),
Dense(8,activation="sigmoid"),
Dense(8,activation="sigmoid"),
Dense(8,activation="sigmoid"),
Dense(8,activation="sigmoid"),
Dense(8,activation="sigmoid"),
Dense(8,activation="sigmoid"),
Dense(8,activation="sigmoid"),
Dense(1,activation="sigmoid")]
)

In [265]:
inital_wt=np.array(test_nn.weights[0])

  inital_wt=np.array(test_nn.weights[0])


In [266]:
X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=2121)

In [267]:
optimizer=Adam(learning_rate=0.1)
test_nn.compile(    optimizer=optimizer,
    loss='binary_crossentropy',
    metrics=['accuracy'])

In [268]:
test_nn.summary()

In [269]:
test_nn.fit(X_train,y_train,epochs=1)

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 17ms/step - accuracy: 0.4133 - loss: 0.7679


<keras.src.callbacks.history.History at 0x173023d08a0>

In [270]:
final_weigts=np.array(test_nn.weights[0])
lr=test_nn.optimizer.get_config()["learning_rate"]

  final_weigts=np.array(test_nn.weights[0])


In [271]:
gradient=(inital_wt-final_weigts)/lr

In [272]:
gradient

array([[ 2.1639452 , -1.4732965 ,  1.1970878 ,  1.6995776 ,  0.42705566,
         0.9939125 ,  1.3632133 , -1.4903407 ],
       [-2.0851734 ,  1.3766339 , -1.0930878 , -1.6908276 , -0.48531234,
        -0.85935473, -1.267721  ,  1.3742449 ]], dtype=float32)

<h3>Solution 1: Reduce No of layers</h3>

In [273]:
test_nn=Sequential(
[Dense(8,activation="sigmoid",input_shape=(2,)),
Dense(8,activation="sigmoid"),
Dense(1,activation="sigmoid")]
)
inital_wt=np.array(test_nn.weights[0])
optimizer=Adam(learning_rate=0.1)
test_nn.compile(    optimizer=optimizer,
    loss='binary_crossentropy',
    metrics=['accuracy'])
test_nn.fit(X_train,y_train,epochs=1)
final_weigts=np.array(test_nn.weights[0])
diff=final_weigts-inital_wt

  inital_wt=np.array(test_nn.weights[0])


[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 16ms/step - accuracy: 0.5600 - loss: 0.6663


  final_weigts=np.array(test_nn.weights[0])


In [274]:
lr=test_nn.optimizer.get_config()["learning_rate"]
gradient=(inital_wt-final_weigts)/lr
gradient

array([[-0.9269976 , -3.0030394 , -2.0295124 ,  2.9451587 , -2.605168  ,
        -0.24376482, -2.6741471 ,  2.5677633 ],
       [ 0.846422  ,  2.9996865 ,  1.9463426 , -2.9868808 ,  2.5971391 ,
         0.29589534,  2.7272024 , -2.5260043 ]], dtype=float32)

#### After reducing layer gradient is more than initially was

### Solution 2: Using Relu as activation function

In [275]:
test_nn=Sequential(
[Dense(8,activation="relu",input_shape=(2,)),
Dense(8,activation="relu"),
Dense(8,activation="relu"),
Dense(8,activation="relu"),
Dense(8,activation="relu"),
Dense(8,activation="relu"),
Dense(8,activation="relu"),
Dense(1,activation="sigmoid")]
)
inital_wt=np.array(test_nn.weights[0])
optimizer=Adam(learning_rate=0.1)
test_nn.compile(    optimizer=optimizer,
    loss='binary_crossentropy',
    metrics=['accuracy'])
test_nn.fit(X_train,y_train,epochs=1)
final_weigts=np.array(test_nn.weights[0])
lr=test_nn.optimizer.get_config()["learning_rate"]
gradient=(inital_wt-final_weigts)/lr
gradient

  inital_wt=np.array(test_nn.weights[0])


[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 19ms/step - accuracy: 0.5333 - loss: 0.6802


  final_weigts=np.array(test_nn.weights[0])


array([[-0.29969215,  0.29403627, -0.1008594 ,  2.587285  , -2.991471  ,
         0.08816451, -2.832948  ,  0.5314074 ],
       [ 0.2987673 , -0.30447274, -0.21257162, -0.7289976 ,  2.9930012 ,
         1.1152732 ,  2.840048  , -0.29563606]], dtype=float32)