# Deep Learning
### Part 1: Understanding Neural Network Forward Propagation and Backpropagation

#### Package installation and Import statments

In [1]:
# !pip install numpy

In [2]:
import numpy as np

##### 1. Implement forward propagation 
- Input and Xavier Initialization of Weights

In [3]:
np.random.seed(42) # so that we get same random numbers

# Inputs
i1, i2 = 2, 4

# Xavier initialization helper function
def xavier_init(n_in, n_out):
    sd = 2/(n_in+n_out)
    return np.random.normal(0, sd, size=(n_out, n_in)) # to keep the Intial weight within[-1,1]

# Initialize weights (2 inputs -> 2 hidden, 2 hidden -> 1 output)
W_hidden_base = xavier_init(2, 2)   # shape (2,2) : {w1, w2, w3, w4}
W_output_base = xavier_init(2, 1)   # shape (1,2) : {w5, w6}

print("Initial Weights (Xavier Init):")
print("W_hidden [2 inputs -> 2 hidden] : [[w1, w2], [w3, w4]] =", [W_hidden_base[0].tolist(), W_hidden_base[1].tolist()])
print("W_output [2 hidden -> 1 output] : [w5, w6] =", W_output_base[0].tolist())

Initial Weights (Xavier Init):
W_hidden [2 inputs -> 2 hidden] : [[w1, w2], [w3, w4]] = [[0.24835707650561634, -0.06913215058559233], [0.32384426905034625, 0.7615149282040127]]
W_output [2 hidden -> 1 output] : [w5, w6] = [-0.15610224981555731, -0.15609130463278703]


- Forward Propagation (Linear Activation Function)

In [4]:
# Creating a function for forward propagation using linear activation function in both Input -> HL -> Output Layer
def forward_linear(X, W_hidden, W_output):
    h = np.dot(W_hidden, X) # linear activation for 2D array for our case : [H1, H2] Hidden layer
    out = np.dot(W_output, h) # output layer taking [H1, H2] as input
    return h, out

In [5]:
X = np.array([i1, i2]) # intializing Input array
W_hidden_linear = W_hidden_base.copy() # copying the hidden weights for the Linear activation function
W_output_linear = W_output_base.copy() # copying the output weights

h_linear, out_linear = forward_linear(X, W_hidden_linear, W_output_linear)

print("\nForward Propagation (Linear Activation Function in Hidden layers and Output Layer):")
print("h_linear =", h_linear)
print("out_linear =", out_linear)



Forward Propagation (Linear Activation Function in Hidden layers and Output Layer):
h_linear = [0.22018555 3.69374825]
out_linear = [-0.61093344]


#### 2 & 3. Derivation for Backpropagation and Weight Update
- activation function for hidden layer : Linear
- activation function for output layer : Linear
- Loss Function : MSE
- Learning rate : 0.05
- True O/p : 1.0

In [6]:
y_true = 1.0 # actual output (Given)
lr = 0.05 # learning rate (Given)

##### Backpropagation Derivation steps
- Loss function MSE (L) = (1/2)*(sum((prediction - actual)^2)) # to hidden layer
- dL = (prediction - actual) { first derivative of L w.r.t W_i }
- W_i_update = W_i - lr * dL

In [7]:
############# Initial Numbers in the same print statement #############
print("\nW_hidden [2 inputs -> 2 hidden] : [[w1, w2], [w3, w4]] =", [W_hidden_linear[0].tolist(), W_hidden_linear[1].tolist()])
print("W_output [2 hidden -> 1 output] : [w5, w6] =", W_output_linear[0].tolist())
print("h_linear =", h_linear)
print("out_linear =", out_linear)
############ end #############

output_linear = [] # to save final output each EPOCH

# Backpropagation loop - Created to see the EPOC evolution but printing only 1 epoch for the question
for i in range(10):
    output_linear.append((i,out_linear[0]))
    dL = (out_linear - y_true)     # Calculating the error in the final output
    dL_hl = dL * h_linear # Multiplying the error by the intermediate variable h_linear to calculate the error in the hidden layer weights
    dL_in = dL_hl * W_hidden_linear # Multiplying the error in the hidden layer weights by the weights of the input layer to calculate the error in the input layer weights
    
    if i==0:
        print("\nerror in final output =", dL)
        print("error in Hidden Layers Wieghts =", dL_hl)
        print("error in Input Layers Weights =", [dL_in[0].tolist(), dL_in[1].tolist()])

    # Updating the weights in backpropagation using the given lr and calculated error at for different layers
    W_hidden_linear = W_hidden_linear - lr * dL_in # [[w1, w2], [w3, w4]] 
    W_output_linear = W_output_linear - lr * dL_hl # [w5, w6]
    
    if i==0:
        print("\nUpdated W_hidden [2 inputs -> 2 hidden] : [[w1, w2], [w3, w4]] =", [W_hidden_linear[0].tolist(), W_hidden_linear[1].tolist()])
        print("Updated W_output [2 hidden -> 1 output] : [w5, w6] =", W_output_linear[0].tolist())
        print("\nEpoch", i+1, "Completed")
    
    # Using the Updated Weight in Forward propagation calculation to check the new O/p
    h_linear, out_linear = forward_linear(X, W_hidden_linear, W_output_linear)

    if i==0:
        print("\nForward Propagation (Linear hidden layer):")
        print("Updated h_linear =", h_linear)
        print("New out_linear =", out_linear)



W_hidden [2 inputs -> 2 hidden] : [[w1, w2], [w3, w4]] = [[0.24835707650561634, -0.06913215058559233], [0.32384426905034625, 0.7615149282040127]]
W_output [2 hidden -> 1 output] : [w5, w6] = [-0.15610224981555731, -0.15609130463278703]
h_linear = [0.22018555 3.69374825]
out_linear = [-0.61093344]

error in final output = [-1.61093344]
error in Hidden Layers Wieghts = [-0.35470427 -5.95038259]
error in Input Layers Weights = [[-0.08809331485204337, 0.4113627451541031], [-0.11486894417456574, -4.531305169712425]]

Updated W_hidden [2 inputs -> 2 hidden] : [[w1, w2], [w3, w4]] = [[0.2527617422482185, -0.08970028784329748], [0.3295877162590745, 0.9880801866896339]]
Updated W_output [2 hidden -> 1 output] : [w5, w6] = [-0.1383670364502858, 0.14142782479513089]

Epoch 1 Completed

Forward Propagation (Linear hidden layer):
Updated h_linear = [0.14672233 4.61149618]
New out_linear = [0.63189234]


#### 4 & 5. Derivation for Backpropagation and Weight Update
- activation function for hidden layer : ReLU
- activation function for output layer : Linear
- Loss Function : MSE
- Learning rate : 0.05
- True O/p : 1.0

Backpropagation Derivation steps remains the same as the loss function is same (MSE).

In [8]:
# Creating a function for forward propagation using ReLu activation function in Input -> HL and Linear as HL -> Output Layer
def forward_relu(X, W_hidden, W_output):
    h_pre = np.dot(W_hidden, X)
    h = np.maximum(0, h_pre) # Appling the relu function only to the Hidden layer which takes the positive values as it is rest are set to 0
    out = np.dot(W_output, h) # Output wights are still kept Linear
    return h, out

In [9]:
# Initialize input and weights
X = np.array([i1, i2])  # Input 
W_hidden_relu = W_hidden_base.copy()  # Weights copy for ReLu setup input to hidden layer 
W_output_relu = W_output_base.copy()  # Weights Copy for ReLu setup hidden to output layer

# Forward pass
h_relu, out_relu = forward_relu(X, W_hidden_relu, W_output_relu)

print("W_hidden [2 inputs -> 2 hidden] : [[w1, w2], [w3, w4]] =", [W_hidden_relu[0].tolist(), W_hidden_relu[1].tolist()])
print("W_output [2 hidden -> 1 output] : [w5, w6] =", W_output_relu[0].tolist())
print("Hidden layer activations (ReLU):", h_relu)
print("Output:", out_relu)

output_relu = [] # to save final output each EPOCH
for i in range(10):
    output_relu.append((i,out_relu[0]))
    dL = (out_relu - y_true)      
    # Backpropagate error to hidden layer
    dL_hl = dL * h_relu  # Liner used for error of HL to output layer
    dL_in = np.maximum(0, dL_hl * W_hidden_relu)  # ReLu used for error of input to HL layer
    
    if i == 0:
        print("\nError at output layer =", dL)
        print("Gradient at hidden layer =", dL_hl)
        print("Gradient at input layer =", [dL_in[0].tolist(), dL_in[1].tolist()])

    # Update weights 
    W_hidden_relu = W_hidden_relu - lr * dL_in  # Update hidden layer weights
    W_output_relu = W_output_relu - lr * dL_hl  # Update output layer weights
    
    if i == 0:
        print("\nUpdated Weights:")
        print("W_hidden [2 inputs -> 2 hidden] : [[w1, w2], [w3, w4]] =", [W_hidden_relu[0].tolist(), W_hidden_relu[1].tolist()])
        print("W_output [2 hidden -> 1 output] : [w5, w6] =", W_output_relu[0].tolist())

    if i == 0: print("\nEpoch", i+1, "Completed")

    h_relu, out_relu = forward_relu(X, W_hidden_relu, W_output_relu) # forward pass with updated weights
    if i == 0:
        print("\nForward Pass with Updated Weights:")
        print("Updated hidden layer activations (ReLU):", h_relu)
        print("New network output:", out_relu)


W_hidden [2 inputs -> 2 hidden] : [[w1, w2], [w3, w4]] = [[0.24835707650561634, -0.06913215058559233], [0.32384426905034625, 0.7615149282040127]]
W_output [2 hidden -> 1 output] : [w5, w6] = [-0.15610224981555731, -0.15609130463278703]
Hidden layer activations (ReLU): [0.22018555 3.69374825]
Output: [-0.61093344]

Error at output layer = [-1.61093344]
Gradient at hidden layer = [-0.35470427 -5.95038259]
Gradient at input layer = [[0.0, 0.4113627451541031], [0.0, 0.0]]

Updated Weights:
W_hidden [2 inputs -> 2 hidden] : [[w1, w2], [w3, w4]] = [[0.24835707650561634, -0.08970028784329748], [0.32384426905034625, 0.7615149282040127]]
W_output [2 hidden -> 1 output] : [w5, w6] = [-0.1383670364502858, 0.14142782479513089]

Epoch 1 Completed

Forward Pass with Updated Weights:
Updated hidden layer activations (ReLU): [0.137913   3.69374825]
New network output: [0.50331617]


In [10]:
output_linear

[(0, -0.6109334433069566),
 (1, 0.6318923392708231),
 (2, 1.1039604059348322),
 (3, 0.9530622196975309),
 (4, 1.0186922741009243),
 (5, 0.9921224293960885),
 (6, 1.0032459690499789),
 (7, 0.9986497371950417),
 (8, 1.0005594922719192),
 (9, 0.9997677924608643)]

In [11]:
output_relu

[(0, -0.6109334433069566),
 (1, 0.5033161671433549),
 (2, 0.8470620510455015),
 (3, 0.9529630642661389),
 (4, 0.985540466404757),
 (5, 0.995555736690121),
 (6, 0.9986340862656211),
 (7, 0.9995802024068039),
 (8, 0.9998709807490244),
 (9, 0.9999603476946268)]

We can see from EPOC wise evolution of the output that the output is getting closer to the desired output as the epochs increase.
- Linear Function takes on either side of the true value and converges.
- ReLu Function takes gradual incremental steps to reach the true value and converges.
- Epoch 10 is enough are roughly same for bith methods.