# 梯度检查

- 梯度检查提供了除算例外的另外一种验算方法：用数值梯度（双侧差分）验证反向传播得到的解析梯度
- 运算量较大的原因：需要对每一个参数分别做两次正向传播（θ+ε 与 θ−ε）来计算数值梯度，然后比较整体的相对误差；因此只在调试时使用，不在每次训练迭代中运行


<img src="my_file/gradient_check.jpg" style="width:1000px;height:800px;transform:rotate(90deg);">


In [121]:
import numpy as np


## 1 一维情况

In [122]:
def forward_propagation(x, theta):
    """
    Forward pass of the one-dimensional toy model: J = x * theta.

    Parameters
    ----------
    x : fixed input value
    theta : parameter whose gradient is to be checked

    Returns
    -------
    J : the product ``x * theta``
    """
    return x * theta

In [123]:
# Evaluate the toy cost J = x * theta at x = 2, theta = 4.
x, theta = 2, 4
J = forward_propagation(x, theta)
print("J=", J)

J= 8


In [124]:
def backward_propagation(x, theta):
    """
    Backward pass of the one-dimensional toy model.

    Parameters
    ----------
    x : fixed input value
    theta : parameter (unused; the returned derivative does not depend on it)

    Returns
    -------
    dtheta : derivative with respect to theta, which is simply ``x``
    """
    return x
    

In [125]:
# Analytic gradient of the toy cost at x = 2, theta = 4.
x, theta = 2, 4
dtheta = backward_propagation(x, theta)
print("dtheta=", dtheta)

dtheta= 2


In [126]:
def gradient_check(x, theta, epsilon=1e-7):
    """
    Gradient check for the one-dimensional model.

    Compares the analytic gradient from ``backward_propagation`` with a
    two-sided finite-difference estimate built from ``forward_propagation``.

    Parameters
    ----------
    x : fixed input value
    theta : parameter at which the gradient is checked
    epsilon : step used for the finite difference

    Returns
    -------
    difference : relative error
        ``||grad - gradapprox|| / (||grad|| + ||gradapprox||)``;
        0 when both gradients are exactly zero.
    """
    # Two-sided (central) difference approximation of dJ/dtheta.
    J_plus = forward_propagation(x, theta + epsilon)
    J_minus = forward_propagation(x, theta - epsilon)
    gradapprox = (J_plus - J_minus) / (2 * epsilon)

    # Analytic gradient to be verified.
    grad = backward_propagation(x, theta)

    numerator = np.linalg.norm(grad - gradapprox)
    denominator = np.linalg.norm(grad) + np.linalg.norm(gradapprox)
    # When both gradients are exactly zero (e.g. x == 0) the ratio is 0/0 = nan,
    # which would be reported as "wrong". In that case numerator is also 0, so
    # return it directly as the (zero) difference.
    difference = numerator if denominator == 0 else numerator / denominator

    if difference < 1e-7:
        print("The gradient is correct")
    else:
        print("The gradient is wrong")
    return difference



In [127]:
# Run the 1-D gradient check. J is linear in theta, so the central difference
# is exact here even with the large step epsilon=1.
x, theta = 5.0, 10.0
difference = gradient_check(x, theta, epsilon=1)
print("difference = " + str(difference))

The gradient is correct
difference = 0.0


## 2 N维情况

In [128]:
def sigmoid(x):
    """
    Logistic sigmoid, applied element-wise.

    Arguments:
    x -- A scalar or numpy array of any size.

    Return:
    1 / (1 + exp(-x)), with the same shape as x
    """
    denominator = 1 + np.exp(-x)
    return 1 / denominator

def relu(x):
    """
    Rectified linear unit, applied element-wise.

    Arguments:
    x -- A scalar or numpy array of any size.

    Return:
    max(0, x), with the same shape as x
    """
    return np.maximum(0, x)

#### 遇到错误
- 【bug1】
```
There is a mistake in the backward propagation! difference = 0.974463960867091
```
    - 原因：损失函数写错了，忘记对 A3 和 1-A3 取 log。错误的写法如下：
    `cost=-(np.dot(Y.ravel(),A3.ravel())+np.dot((1-Y).ravel(),(1-A3).ravel()))/m`


In [129]:

def forward_propagation_n(X, Y, parameters):
    """
    Forward pass of the three-layer network
    (LINEAR -> RELU -> LINEAR -> RELU -> LINEAR -> SIGMOID) with a
    cross-entropy cost.

    Parameters
    ----------
    X : input data; the number of examples m is read from ``X.shape[1]``
    Y : labels, flattened before use (must hold one value per output of A3)
    parameters : dict with keys "W1", "b1", "W2", "b2", "W3", "b3"

    Returns
    -------
    cost : cross-entropy cost averaged over the m examples
    cache : tuple (Z1, A1, W1, b1, Z2, A2, W2, b2, Z3, A3, W3, b3)
    """
    m = X.shape[1]
    W1, b1 = parameters['W1'], parameters['b1']
    W2, b2 = parameters['W2'], parameters['b2']
    W3, b3 = parameters['W3'], parameters['b3']

    # Layer 1: linear + ReLU
    Z1 = np.matmul(W1, X) + b1
    A1 = relu(Z1)
    # Layer 2: linear + ReLU
    Z2 = np.matmul(W2, A1) + b2
    A2 = relu(Z2)
    # Layer 3: linear + sigmoid
    Z3 = np.matmul(W3, A2) + b3
    A3 = sigmoid(Z3)

    # Cross-entropy: sum of y*log(a) + (1-y)*log(1-a), negated and averaged.
    labels = Y.ravel()
    predictions = A3.ravel()
    positive_term = np.dot(labels, np.log(predictions))
    negative_term = np.dot((1 - Y).ravel(), np.log((1 - A3).ravel()))
    cost = -(positive_term + negative_term) / m

    cache = (Z1, A1, W1, b1, Z2, A2, W2, b2, Z3, A3, W3, b3)
    return cost, cache
    
    
    

<img src="my_file/back_propagation_gradient.png" style="width:400px;height:250px;">

#### 遇到bug

- element-wise 操作用 np.multiply而不是 np.matmul

```
<ipython-input-26-daacff1163cd> in backward_propagation(X, Y, cache)
     12 
     13     dA1=np.matmul(W2.T,dZ2)
---> 14     dZ1=np.matmul(dA1,A1>0)
     15     dW1=np.matmul(dZ1,X.T)/m
     16     db1=np.sum(dZ1,axis=1,keepdims=True)/m

ValueError: shapes (5,3) and (5,3) not aligned: 3 (dim 1) != 5 (dim 0)

```

In [130]:
def backward_propagation_n(X, Y, cache):
    """
    Backward pass of the three-layer network.

    Parameters
    ----------
    X : input data; the number of examples m is read from ``X.shape[1]``
    Y : labels, broadcast against A3
    cache : tuple (Z1, A1, W1, b1, Z2, A2, W2, b2, Z3, A3, W3, b3)

    Returns
    -------
    gradients : dict with keys "dZ3", "dW3", "db3", "dA2", "dZ2", "dW2",
        "db2", "dA1", "dZ1", "dW1", "db1"
    """
    m = X.shape[1]
    (_Z1, A1, _W1, _b1,
     _Z2, A2, W2, _b2,
     _Z3, A3, W3, _b3) = cache

    # Output layer.
    dZ3 = A3 - Y
    dW3 = (dZ3 @ A2.T) / m
    db3 = dZ3.sum(axis=1, keepdims=True) / m

    # Second hidden layer: the gradient only flows where the ReLU output is positive.
    dA2 = W3.T @ dZ3
    dZ2 = np.multiply(dA2, A2 > 0)
    dW2 = (dZ2 @ A1.T) / m
    db2 = dZ2.sum(axis=1, keepdims=True) / m

    # First hidden layer.
    dA1 = W2.T @ dZ2
    dZ1 = np.multiply(dA1, A1 > 0)
    dW1 = (dZ1 @ X.T) / m
    db1 = dZ1.sum(axis=1, keepdims=True) / m

    gradients = {
        "dZ3": dZ3, "dW3": dW3, "db3": db3,
        "dA2": dA2, "dZ2": dZ2, "dW2": dW2, "db2": db2,
        "dA1": dA1, "dZ1": dZ1, "dW1": dW1, "db1": db1,
    }
    return gradients
    
    

#### dict 与 vector之间进行相互转换：原因就是需要遍历每一个vector元素，然后对比梯度差异



In [131]:
def dictionary_to_vector(parameters):
    """
    Flatten the parameter dictionary into one column vector.

    The entries "W1", "b1", "W2", "b2", "W3", "b3" are each reshaped to a
    column and stacked in that order.

    Returns a tuple ``(theta, keys)``: ``theta`` is the stacked column vector
    and ``keys`` is a list holding, for every row of ``theta``, the name of
    the parameter that row came from.
    """
    columns = []
    keys = []
    for name in ("W1", "b1", "W2", "b2", "W3", "b3"):
        column = np.reshape(parameters[name], (-1, 1))
        columns.append(column)
        keys.extend([name] * column.shape[0])

    theta = np.concatenate(columns, axis=0)
    return theta, keys

def vector_to_dictionary(theta, shapes=None):
    """
    Unroll a flat parameter vector back into a parameters dictionary.

    Consecutive slices of ``theta`` are reshaped, in order, to the requested
    shapes. Elements of ``theta`` beyond the last slice are ignored.

    Parameters
    ----------
    theta : numpy array whose leading axis indexes the flattened parameters
        (e.g. the column vector produced by ``dictionary_to_vector``)
    shapes : optional dict mapping parameter name -> shape, in the order the
        parameters are stored in ``theta``. Defaults to the layout
        W1 (5,4), b1 (5,1), W2 (3,5), b2 (3,1), W3 (1,3), b3 (1,1).

    Returns
    -------
    parameters : dict mapping each name to its reshaped slice of ``theta``
    """
    if shapes is None:
        # Default layout: 20 + 5 + 15 + 3 + 3 + 1 = 47 entries.
        shapes = {
            "W1": (5, 4),
            "b1": (5, 1),
            "W2": (3, 5),
            "b2": (3, 1),
            "W3": (1, 3),
            "b3": (1, 1),
        }

    parameters = {}
    start = 0
    for name, shape in shapes.items():
        stop = start + int(np.prod(shape))
        parameters[name] = theta[start:stop].reshape(shape)
        start = stop

    return parameters

def gradients_to_vector(gradients):
    """
    Flatten the gradient dictionary into one column vector.

    The entries "dW1", "db1", "dW2", "db2", "dW3", "db3" are each reshaped to
    a column and stacked in that order; any other keys are ignored.
    """
    ordered_names = ("dW1", "db1", "dW2", "db2", "dW3", "db3")
    columns = [np.reshape(gradients[name], (-1, 1)) for name in ordered_names]
    return np.concatenate(columns, axis=0)

In [132]:
def gradient_check_n(parameters, gradients, X, Y, epsilon=1e-7):
    """
    Gradient check for the three-layer network.

    Each parameter is perturbed by +/- epsilon in turn, the cost is
    recomputed with ``forward_propagation_n``, and the resulting two-sided
    finite-difference gradient is compared with the analytic gradient.

    Parameters
    ----------
    parameters : dict with keys "W1", "b1", "W2", "b2", "W3", "b3"
    gradients : dict holding at least "dW1", "db1", "dW2", "db2", "dW3", "db3"
    X : input data passed to ``forward_propagation_n``
    Y : labels passed to ``forward_propagation_n``
    epsilon : step used for the finite difference

    Returns
    -------
    difference : relative error
        ``||grad - gradapprox|| / (||grad|| + ||gradapprox||)``;
        0 when both gradients are exactly zero.
    """
    parameters_values, _ = dictionary_to_vector(parameters)
    grad = gradients_to_vector(gradients)
    num_parameters = parameters_values.size
    gradapprox = np.zeros((num_parameters, 1))

    # Two forward passes per parameter: this loop is why the check is expensive.
    for i in range(num_parameters):
        theta_plus = parameters_values.copy()
        theta_plus[i, 0] += epsilon
        J_plus, _ = forward_propagation_n(X, Y, vector_to_dictionary(theta_plus))

        theta_minus = parameters_values.copy()
        theta_minus[i, 0] -= epsilon
        J_minus, _ = forward_propagation_n(X, Y, vector_to_dictionary(theta_minus))

        gradapprox[i, 0] = (J_plus - J_minus) / (2 * epsilon)

    numerator = np.linalg.norm(grad - gradapprox)
    denominator = np.linalg.norm(grad) + np.linalg.norm(gradapprox)
    # When both gradients are exactly zero the ratio is 0/0 = nan; numerator is
    # then also 0, so return it directly as the (zero) difference.
    difference = numerator if denominator == 0 else numerator / denominator

    if difference > 2e-7:
        print ("\033[93m" + "There is a mistake in the backward propagation! difference = " + str(difference) + "\033[0m")
    else:
        print ("\033[92m" + "Your backward propagation works perfectly fine! difference = " + str(difference) + "\033[0m")
    return difference

In [133]:
def gradient_check_n_test_case():
    """
    Build a fixed, reproducible test case for the N-dimensional gradient check.

    Seeds NumPy's global random generator with 1, then draws the input
    ``x`` (4, 3) followed by W1 (5, 4), b1 (5, 1), W2 (3, 5), b2 (3, 1),
    W3 (1, 3) and b3 (1, 1) from a standard normal distribution.

    Returns ``(x, y, parameters)`` where ``y`` is the label array [1, 1, 0].
    """
    np.random.seed(1)
    x = np.random.randn(4, 3)
    y = np.array([1, 1, 0])

    # The draw order must stay W1, b1, W2, b2, W3, b3 to reproduce the same values.
    layout = (
        ("W1", (5, 4)),
        ("b1", (5, 1)),
        ("W2", (3, 5)),
        ("b2", (3, 1)),
        ("W3", (1, 3)),
        ("b3", (1, 1)),
    )
    parameters = {name: np.random.randn(*shape) for name, shape in layout}

    return x, y, parameters

In [134]:
# End-to-end check: build the test case, run forward and backward passes,
# then compare the analytic gradients with the numerical ones.
X, Y, parameters = gradient_check_n_test_case()
cost, cache = forward_propagation_n(X, Y, parameters)
gradients = backward_propagation_n(X, Y, cache)
difference = gradient_check_n(parameters, gradients, X, Y)


Your backward propagation works perfectly fine! difference = 1.1909939369700076e-07
