<a href="https://colab.research.google.com/github/amansingh202/Machine-Learning/blob/main/Deep_Learning/lec05_in_class_exercise_(2).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import numpy as np
from sklearn.datasets import load_iris

n_classes = 3  # Number of classes

# Fetch the feature matrix and the integer class labels.
X, y = load_iris(return_X_y=True)
n = len(X)  # Number of samples (rows of X)

# Keep only the first two feature columns, then subtract each column's mean
# so that both remaining features are centered at zero.
X = X[:, :2]
X = X - X.mean(axis=0)


In [5]:
def init_params(input_dim, hidden_dim, output_dim, seed=0):
    """Draw initial weights and biases for a one-hidden-layer network.

    Every entry is sampled from a standard normal distribution.

    Args:
        input_dim: Number of input features.
        hidden_dim: Number of hidden units.
        output_dim: Number of output units.
        seed: Seed for the private random generator. The default, 0, yields
            the same values as ``np.random.seed(0)`` followed by the same
            sequence of ``np.random.randn`` calls.

    Returns:
        Tuple ``(W1, b1, W2, b2)`` with shapes ``(input_dim, hidden_dim)``,
        ``(hidden_dim,)``, ``(hidden_dim, output_dim)`` and ``(output_dim,)``.
    """
    # A private RandomState keeps the draw reproducible without reseeding
    # NumPy's global generator as a side effect of building the parameters.
    rng = np.random.RandomState(seed)
    W1 = rng.randn(input_dim, hidden_dim)
    b1 = rng.randn(hidden_dim)
    W2 = rng.randn(hidden_dim, output_dim)
    b2 = rng.randn(output_dim)
    return W1, b1, W2, b2


In [6]:
# Layer sizes of the network.
input_dim = 2       # Number of input dimension/features
hidden_dim = 10     # Number of hidden neurons
output_dim = 3      # Number of classes

W1, b1, W2, b2 = init_params(input_dim, hidden_dim, output_dim)

# Bundle the parameters under the key names that the forward pass looks up.
theta = dict(W1=W1, b1=b1, W2=W2, b2=b2)


In [7]:
def relu(z):
    """Rectified linear unit: element-wise maximum of 0 and ``z``."""
    return np.maximum(0, z)


def forward(X, theta):
    """Run a whole batch through the one-hidden-layer network at once.

    Args:
        X: Input array with one sample per row.
        theta: Mapping holding the parameters under the keys
            "W1", "b1", "W2" and "b2".

    Returns:
        Tuple ``(h, z)``: the hidden-layer ReLU activations and the
        output-layer affine scores (no activation is applied to ``z``).
    """
    W1, b1, W2, b2 = (theta[key] for key in ("W1", "b1", "W2", "b2"))
    hidden_pre_activation = np.dot(X, W1) + b1
    h = relu(hidden_pre_activation)
    z = np.dot(h, W2) + b2
    return h, z


In [8]:
def relu(z):
    """Element-wise ``max(0, z)`` (same definition as in the previous cell)."""
    return np.maximum(0, z)


def forward_nonvectorized(X, theta):
    """Forward pass that processes the batch one sample at a time.

    Args:
        X: Input array with one sample per row.
        theta: Mapping holding the parameters under the keys
            "W1", "b1", "W2" and "b2".

    Returns:
        Tuple ``(h, z)`` of arrays with one row per sample: the hidden-layer
        ReLU activations and the output-layer affine scores.
    """
    W1, b1 = theta["W1"], theta["b1"]
    W2, b2 = theta["W2"], theta["b2"]

    n_samples = X.shape[0]
    h = np.zeros((n_samples, W1.shape[1]))  # one row of hidden activations per sample
    z = np.zeros((n_samples, W2.shape[1]))  # one row of output scores per sample

    for i, sample in enumerate(X):
        h[i] = relu(np.dot(sample, W1) + b1)
        z[i] = np.dot(h[i], W2) + b2
    return h, z


# Timing

In [9]:
import time


# Stack 100 copies of the data so the run-time difference is measurable.
X_big = np.vstack([X] * 100)

# time.perf_counter() is a monotonic, high-resolution clock meant for
# measuring short intervals; time.time() follows the wall clock and can jump
# if the system time is adjusted.

# Vectorized forward function
start_time = time.perf_counter()
h_vec, z_vec = forward(X_big, theta)
end_time = time.perf_counter()
print(f"Vectorized time: {end_time - start_time:.5f} seconds")

# Non-vectorized forward function
start_time = time.perf_counter()
h_nonvec, z_nonvec = forward_nonvectorized(X_big, theta)
end_time = time.perf_counter()
print(f"Non-vectorized time: {end_time - start_time:.5f} seconds")

# The timing comparison is only meaningful if both versions agree.
np.testing.assert_allclose(h_vec, h_nonvec, rtol=1e-9, atol=1e-12)
np.testing.assert_allclose(z_vec, z_nonvec, rtol=1e-9, atol=1e-12)


Vectorized time: 0.01950 seconds
Non-vectorized time: 0.69442 seconds


# softmax computation

In [10]:
# Row-by-row softmax of the output scores (loop version).
p = np.zeros((n, n_classes))
h, z = forward(X, theta)
for i in range(n):
    # Shifting a row by its maximum leaves the softmax unchanged but keeps
    # every exponent <= 0, so np.exp cannot overflow for large scores.
    # The exponentials are computed once and reused for the normalizer.
    exp_row = np.exp(z[i, :] - np.max(z[i, :]))
    p[i, :] = exp_row / np.sum(exp_row)

p[:5, :]  # sanity check
# should be
# array([[0.0025722 , 0.97505337, 0.02237443],
#        [0.09804607, 0.77617999, 0.12577394],
#        [0.12661246, 0.65719734, 0.21619019],
#        [0.27429138, 0.4847382 , 0.24097042],
#        [0.00291315, 0.96559662, 0.03149023]])


array([[0.0025722 , 0.97505337, 0.02237443],
       [0.09804607, 0.77617999, 0.12577394],
       [0.12661246, 0.65719734, 0.21619019],
       [0.27429138, 0.4847382 , 0.24097042],
       [0.00291315, 0.96559662, 0.03149023]])

# softmax computation (vectorized)

In [11]:
h, z = forward(X, theta)

# Vectorized softmax: normalize every row of z in one shot.
# keepdims=True keeps the per-row max/sum as a column so it broadcasts
# across the classes; subtracting the row max avoids overflow in np.exp
# without changing the result.
exp_z = np.exp(z - np.max(z, axis=1, keepdims=True))
p = exp_z / np.sum(exp_z, axis=1, keepdims=True)

p[:5, :]  # sanity check

array([[0.0025722 , 0.97505337, 0.02237443],
       [0.09804607, 0.77617999, 0.12577394],
       [0.12661246, 0.65719734, 0.21619019],
       [0.27429138, 0.4847382 , 0.24097042],
       [0.00291315, 0.96559662, 0.03149023]])

# Loss derivative calculation

In [12]:
# Row k of the identity matrix is the one-hot vector for class k.
E = np.eye(n_classes)
h, z = forward(X, theta)

# For each sample: softmax probabilities minus the one-hot vector of its label.
loss_der = np.zeros((n, n_classes))
for i in range(n):
    one_hot_label = E[y[i]]
    loss_der[i] = p[i] - one_hot_label

loss_der[:5, :]  # sanity check
# should say
# array([[-0.9974278 ,  0.97505337,  0.02237443],
#        [-0.90195393,  0.77617999,  0.12577394],
#        [-0.87338754,  0.65719734,  0.21619019],
#        [-0.72570862,  0.4847382 ,  0.24097042],
#        [-0.99708685,  0.96559662,  0.03149023]])



array([[-0.9974278 ,  0.97505337,  0.02237443],
       [-0.90195393,  0.77617999,  0.12577394],
       [-0.87338754,  0.65719734,  0.21619019],
       [-0.72570862,  0.4847382 ,  0.24097042],
       [-0.99708685,  0.96559662,  0.03149023]])

# dJdb2 calculation (vectorized)

In [15]:

# y_one_hot should be a n-by-K tensor where y_one_hot[i,:] is the one-hot vector at y[i]
# should be a 1-liner that does not exceed this many chars ->|          (minus the comment)
# hint 1: "==" is a pairwise operator
# hint 2: use np.arange
# hint 3: broadcast!

# y[:, None] is an (n, 1) column and np.arange(n_classes) a (K,) row, so "=="
# broadcasts to an (n, K) boolean mask; multiplying by 1.0 makes it float.
y_one_hot = 1.0 * (y[:, None] == np.arange(n_classes))

loss_der = p - y_one_hot

# Averaging the per-sample loss derivatives over the samples gives dJ/db2.
dJdb2 = np.mean(loss_der, axis=0)


print(dJdb2,"\n")
# should print...
# array([-0.29643435,  0.59559785, -0.2991635 ])
# ... if your answer is correct

loss_der[:5, :]  # sanity check


[-0.29643435  0.59559785 -0.2991635 ] 



array([[-0.9974278 ,  0.97505337,  0.02237443],
       [-0.90195393,  0.77617999,  0.12577394],
       [-0.87338754,  0.65719734,  0.21619019],
       [-0.72570862,  0.4847382 ,  0.24097042],
       [-0.99708685,  0.96559662,  0.03149023]])

# dJdW2 calculation

In [16]:
dJdW2 = np.zeros_like(W2)
# h,z = forward(X,theta) # already computed previously
for i in range(n):
    # h already holds the ReLU output of the hidden layer (forward applies
    # relu before returning it), so it is used directly rather than being
    # passed through relu a second time.
    dJdW2 += np.outer(h[i, :], loss_der[i, :])

dJdW2 /= n
dJdW2[:3, :]  # sanity check
# your first three rows should be
# array([[ 4.78180322e-11,  3.17365767e-02, -3.17365767e-02],
#        [-2.89296255e-01,  5.62840257e-01, -2.73544002e-01],
#        [-1.42266016e-01,  6.53427386e-01, -5.11161370e-01]])

array([[ 4.78180322e-11,  3.17365767e-02, -3.17365767e-02],
       [-2.89296255e-01,  5.62840257e-01, -2.73544002e-01],
       [-1.42266016e-01,  6.53427386e-01, -5.11161370e-01]])

# dJdW2 calculation (vectorized)

In [18]:

# The sum over samples of outer(h[i], loss_der[i]) is exactly the matrix
# product h.T @ loss_der, so the loop collapses to a single matmul.
dJdW2 = np.dot(h.T, loss_der) / n
dJdW2[:3, :]  # sanity check

array([[ 4.78180322e-11,  3.17365767e-02, -3.17365767e-02],
       [-2.89296255e-01,  5.62840257e-01, -2.73544002e-01],
       [-1.42266016e-01,  6.53427386e-01, -5.11161370e-01]])

# dJdb1 calculation

In [19]:
def relu_derivative(z):
    """Return 1.0 where ``z`` is strictly positive and 0.0 everywhere else."""
    is_positive = z > 0
    return 1.0 * is_positive

dJdb1 = np.zeros_like(b1)

# h is reused from the earlier forward(X, theta) call.
for i in range(n):
    back_signal = W2 @ loss_der[i]           # loss derivative pulled back through W2
    unit_is_on = relu_derivative(h[i])       # 1.0 for hidden units with positive output
    dJdb1 += back_signal * unit_is_on
dJdb1 /= n
dJdb1
# should be
# array([ 5.90775856e-02,  3.33066422e-01,  5.00043538e-01,  9.65803254e-02,
#         1.74727058e+00, -1.54422652e-03,  3.68069930e-01, -1.10518462e-02,
#         2.43568198e-01, -3.58945020e-01])

array([ 5.90775856e-02,  3.33066422e-01,  5.00043538e-01,  9.65803254e-02,
        1.74727058e+00, -1.54422652e-03,  3.68069930e-01, -1.10518462e-02,
        2.43568198e-01, -3.58945020e-01])

# dJdb1 calculation (vectorized)

In [21]:
# Row i of loss_der @ W2.T equals W2 @ loss_der[i]; masking it with the ReLU
# derivative and averaging over the samples reproduces the loop above.
dJdb1 = np.mean(np.dot(loss_der, W2.T) * relu_derivative(h), axis=0)
dJdb1

array([ 5.90775856e-02,  3.33066422e-01,  5.00043538e-01,  9.65803254e-02,
        1.74727058e+00, -1.54422652e-03,  3.68069930e-01, -1.10518462e-02,
        2.43568198e-01, -3.58945020e-01])

# dJdW1 calculation

In [22]:
dJdW1 = np.zeros_like(W1)

# h is reused from the earlier forward(X, theta) call.
for i in range(n):
    # Loss derivative pulled back through W2 and masked by the ReLU derivative.
    hidden_grad = (W2 @ loss_der[i]) * relu_derivative(h[i])
    # outer(x, g) is the transpose of outer(g, x), so it already has W1's shape.
    dJdW1 += np.outer(X[i], hidden_grad)
dJdW1 /= n
dJdW1
# should be
# array([[ 1.07999453e-01, -5.09697354e-01,  4.20123306e-01,
#          9.36336787e-02, -6.47689038e-02,  2.30465388e-03,
#          7.38056259e-02,  2.24393341e-02, -5.33412589e-03,
#          1.45179361e-01],
#        [ 3.36460923e-03,  2.20529616e-01, -4.57539715e-02,
#         -8.32287862e-04,  4.33449051e-01, -6.86691098e-05,
#          1.32692205e-01, -5.30041434e-03,  5.17620420e-02,
#         -1.19088483e-01]])

array([[ 1.07999453e-01, -5.09697354e-01,  4.20123306e-01,
         9.36336787e-02, -6.47689038e-02,  2.30465388e-03,
         7.38056259e-02,  2.24393341e-02, -5.33412589e-03,
         1.45179361e-01],
       [ 3.36460923e-03,  2.20529616e-01, -4.57539715e-02,
        -8.32287862e-04,  4.33449051e-01, -6.86691098e-05,
         1.32692205e-01, -5.30041434e-03,  5.17620420e-02,
        -1.19088483e-01]])

# dJdW1 calculation (vectorized)

In [25]:
# hint: use the same trick for expressing sum of outer products as a matmul
# hint: you can use 2 lines

# Row i of grad is (W2 @ loss_der[i]) * relu_derivative(h[i]).
grad = np.dot(loss_der, W2.T) * relu_derivative(h)
# The sum over samples of outer(X[i], grad[i]) is X.T @ grad, which has the
# same (input_dim, hidden_dim) shape as W1. The result must be bound to
# dJdW1 (capital W), the name displayed below and used by the loop version.
dJdW1 = np.dot(X.T, grad) / n
dJdW1

array([[ 1.07999453e-01, -5.09697354e-01,  4.20123306e-01,
         9.36336787e-02, -6.47689038e-02,  2.30465388e-03,
         7.38056259e-02,  2.24393341e-02, -5.33412589e-03,
         1.45179361e-01],
       [ 3.36460923e-03,  2.20529616e-01, -4.57539715e-02,
        -8.32287862e-04,  4.33449051e-01, -6.86691098e-05,
         1.32692205e-01, -5.30041434e-03,  5.17620420e-02,
        -1.19088483e-01]])

# `compute_gradients` from last time

In [26]:

def compute_gradients(X, y, theta):
    """Accumulate the parameter gradients one sample at a time.

    All quantities are derived from the arguments: the parameters come from
    ``theta`` and the sample count from ``X``, so the result does not depend
    on notebook-level variables.

    Args:
        X: Input array with one sample per row.
        y: Integer class label for each row of ``X``.
        theta: Mapping holding the parameters under the keys
            "W1", "b1", "W2" and "b2".

    Returns:
        Dict with keys "W1", "b1", "W2", "b2"; each value is the gradient
        averaged over the samples and has the shape of the matching parameter.
    """
    W1, b1, W2, b2 = theta["W1"], theta["b1"], theta["W2"], theta["b2"]
    n_samples = X.shape[0]
    one_hot = np.eye(W2.shape[1])  # row k is the one-hot vector for class k

    dJdW1 = np.zeros_like(W1, dtype=float)
    dJdb1 = np.zeros_like(b1, dtype=float)
    dJdW2 = np.zeros_like(W2, dtype=float)
    dJdb2 = np.zeros_like(b2, dtype=float)

    h, z = forward(X, theta)
    for i in range(n_samples):
        # Softmax of the i-th score row; the max shift prevents overflow.
        exp_row = np.exp(z[i, :] - np.max(z[i, :]))
        p = exp_row / np.sum(exp_row)
        loss_der = p - one_hot[y[i], :]

        # Loss derivative pulled back through W2, masked by the ReLU derivative.
        hidden_grad = (W2 @ loss_der) * relu_derivative(h[i, :])

        dJdW1 += np.outer(X[i, :], hidden_grad)
        dJdb1 += hidden_grad
        # h is already the ReLU output, so it is used as-is.
        dJdW2 += np.outer(h[i, :], loss_der)
        dJdb2 += loss_der

    dJdW1 /= n_samples
    dJdb1 /= n_samples
    dJdW2 /= n_samples
    dJdb2 /= n_samples

    gradients = {
        "W1": dJdW1,
        "b1": dJdb1,
        "W2": dJdW2,
        "b2": dJdb2
    }

    return gradients

# Vectorize the `compute_gradients` function from last time
The function should not use any `for` loop.

In [None]:
def compute_gradients(X, y, theta):
    """Loop-free gradients of the mean softmax cross-entropy loss.

    The forward pass is computed inline so that the function depends only on
    its arguments.

    Args:
        X: Input array of shape (n_samples, input_dim).
        y: Integer class labels, one per row of ``X``.
        theta: Mapping holding the parameters under the keys
            "W1", "b1", "W2" and "b2".

    Returns:
        Dict with keys "W1", "b1", "W2", "b2"; each value is the gradient
        averaged over the samples and has the shape of the matching parameter.
    """
    W1, b1, W2, b2 = theta["W1"], theta["b1"], theta["W2"], theta["b2"]
    y = np.asarray(y)
    n_samples = X.shape[0]

    # Forward pass: ReLU hidden layer followed by affine output scores.
    h = np.maximum(0, np.dot(X, W1) + b1)
    z = np.dot(h, W2) + b2

    # Row-wise softmax; subtracting the row max avoids overflow in np.exp.
    exp_z = np.exp(z - np.max(z, axis=1, keepdims=True))
    p = exp_z / np.sum(exp_z, axis=1, keepdims=True)

    # (n, 1) labels compared against (K,) class ids broadcast to an (n, K) mask.
    y_one_hot = 1.0 * (y[:, None] == np.arange(W2.shape[1]))
    loss_der = p - y_one_hot

    # Pull the loss derivative back through W2 and zero it where the hidden
    # unit is inactive (ReLU derivative is 1 for positive output, else 0).
    hidden_grad = np.dot(loss_der, W2.T) * (h > 0)

    # A sum of per-sample outer products is a single matrix product.
    gradients = {
        "W1": np.dot(X.T, hidden_grad) / n_samples,
        "b1": np.mean(hidden_grad, axis=0),
        "W2": np.dot(h.T, loss_der) / n_samples,
        "b2": np.mean(loss_der, axis=0),
    }
    return gradients