https://github.com/stephencwelch/Neural-Networks-Demystified/blob/master/Part%204%20Backpropagation.ipynb

In [1]:
import numpy as np

In [2]:
# Training inputs: three examples with two features each.
X = np.array([[3, 5], [5, 1], [10, 2]], dtype=float)
# Training targets: one value per example, stored as a column.
y = np.array([[0.75], [0.82], [0.93]], dtype=float)

In [3]:
# Scale each feature column by its own maximum so the largest entry in
# every column becomes 1.
X = X / X.max(axis=0)

In [4]:
#New complete class, with changes:
class Neural_Network(object):
    """Feed-forward network with one sigmoid hidden layer and a sigmoid output.

    Layer sizes are fixed at 2 inputs, 3 hidden units and 1 output. The cost
    is the squared error averaged over the examples plus an L2 penalty on
    both weight matrices, scaled by ``Lambda``.
    """

    def __init__(self, Lambda=0):
        """Create a network with randomly initialised weights.

        Lambda -- L2 regularization strength; the default 0 disables the
                  penalty term.
        """
        #Define Hyperparameters
        self.inputLayerSize = 2
        self.outputLayerSize = 1
        self.hiddenLayerSize = 3

        #Weights (parameters), drawn from a standard normal distribution
        self.W1 = np.random.randn(self.inputLayerSize, self.hiddenLayerSize)
        self.W2 = np.random.randn(self.hiddenLayerSize, self.outputLayerSize)

        #Regularization Parameter:
        self.Lambda = Lambda

    def forward(self, X):
        """Propagate inputs X (one example per row) through the network.

        Stores the intermediate values z2, a2 and z3 on the instance because
        costFunctionPrime reads them, and returns the network output yHat.
        """
        self.z2 = np.dot(X, self.W1)
        self.a2 = self.sigmoid(self.z2)
        self.z3 = np.dot(self.a2, self.W2)
        yHat = self.sigmoid(self.z3)
        return yHat

    def sigmoid(self, z):
        """Apply the sigmoid activation to a scalar, vector, or matrix."""
        #Evaluated as exp(-log(1 + exp(-z))): logaddexp keeps the inner
        #exponential from overflowing when z is a large negative number.
        return np.exp(-np.logaddexp(0, -z))

    def sigmoidPrime(self, z):
        """Gradient of the sigmoid, evaluated element-wise at z."""
        #sigmoid'(z) = sigmoid(z)*sigmoid(-z). The direct form
        #exp(-z)/(1+exp(-z))**2 evaluates to inf/inf = nan once exp(-z)
        #overflows; this product stays finite in both tails.
        return self.sigmoid(z)*self.sigmoid(-z)

    def costFunction(self, X, y):
        """Compute the regularized cost for X, y with the stored weights.

        Also stores the prediction in self.yHat. The squared errors are
        summed over the examples (axis 0), so the result has one entry per
        output unit -- the same shape the built-in sum() produced.
        """
        self.yHat = self.forward(X)
        #np.sum(axis=0) adds the rows in one vectorized call instead of a
        #Python-level loop over examples.
        dataCost = 0.5*np.sum((y - self.yHat)**2, axis=0)/X.shape[0]
        #L2 penalty that discourages large weights (guards against overfitting)
        penalty = (self.Lambda/2)*(np.sum(self.W1**2) + np.sum(self.W2**2))
        J = dataCost + penalty
        return J

    def costFunctionPrime(self, X, y):
        """Compute the derivative of the cost with respect to W1 and W2.

        Returns (dJdW1, dJdW2); each has the same shape as the weight matrix
        it belongs to.
        """
        self.yHat = self.forward(X)

        #Error at the output layer, backpropagated through the output sigmoid
        delta3 = (self.yHat - y)*self.sigmoidPrime(self.z3)
        #Add gradient of regularization term:
        dJdW2 = np.dot(self.a2.T, delta3)/X.shape[0] + self.Lambda*self.W2

        #Error at the hidden layer
        delta2 = np.dot(delta3, self.W2.T)*self.sigmoidPrime(self.z2)
        #Add gradient of regularization term:
        dJdW1 = np.dot(X.T, delta2)/X.shape[0] + self.Lambda*self.W1

        return dJdW1, dJdW2

    #Helper functions for interacting with other methods/classes
    def getParams(self):
        """Return W1 and W2 rolled into a single 1-D vector (W1 first)."""
        params = np.concatenate((self.W1.ravel(), self.W2.ravel()))
        return params

    def setParams(self, params):
        """Set W1 and W2 from a single parameter vector laid out as in getParams."""
        W1_start = 0
        W1_end = self.hiddenLayerSize*self.inputLayerSize
        self.W1 = np.reshape(params[W1_start:W1_end],
                             (self.inputLayerSize, self.hiddenLayerSize))
        W2_end = W1_end + self.hiddenLayerSize*self.outputLayerSize
        self.W2 = np.reshape(params[W1_end:W2_end],
                             (self.hiddenLayerSize, self.outputLayerSize))

    def computeGradients(self, X, y):
        """Return dJdW1 and dJdW2 rolled into one vector, ordered like getParams."""
        dJdW1, dJdW2 = self.costFunctionPrime(X, y)
        return np.concatenate((dJdW1.ravel(), dJdW2.ravel()))


In [5]:
from scipy import optimize

In [6]:
class trainer(object):
    """Fit a network's parameters with SciPy's BFGS optimizer.

    The network N must provide getParams(), setParams(params),
    costFunction(X, y) and computeGradients(X, y).

    After train() returns:
      J                   -- list of costs, one appended per optimizer callback
      optimizationResults -- the result object returned by optimize.minimize
    """

    def __init__(self, N):
        #Make Local reference to network:
        self.N = N

    def callbackF(self, params):
        """Record the cost at the optimizer's current parameter vector."""
        self.N.setParams(params)
        self.J.append(self.N.costFunction(self.X, self.y))

    def costFunctionWrapper(self, params, X, y):
        """Return (cost, gradient) at params, as minimize expects with jac=True."""
        self.N.setParams(params)
        cost = self.N.costFunction(X, y)
        grad = self.N.computeGradients(X, y)

        return cost, grad

    def train(self, X, y, maxiter=100):
        """Minimize the network's cost on (X, y), starting from its current parameters.

        maxiter -- upper bound on BFGS iterations (default 100).

        The network is left holding the optimizer's final parameters.
        """
        #Make an internal variable for the callback function:
        self.X = X
        self.y = y

        #Make empty list to store costs:
        self.J = []

        params0 = self.N.getParams()

        options = {'maxiter': maxiter, 'disp': True}
        _res = optimize.minimize(self.costFunctionWrapper, params0, jac=True,
                                 method='BFGS', args=(X, y), options=options,
                                 callback=self.callbackF)

        self.N.setParams(_res.x)
        #Keep the full result so callers can inspect success, nit, message, ...
        self.optimizationResults = _res
        


In [7]:
# Build a network with a small L2 regularization strength.
NN=Neural_Network(Lambda=0.0001)


In [8]:
# Forward pass with the initial, randomly drawn weights (no training yet).
yHat=NN.forward(X)


In [9]:
# Before training, the random W1 and W2 do not give good predictions,
# so these outputs are still far from the targets in y.
yHat

array([[ 0.62476081],
       [ 0.58653168],
       [ 0.57779355]])

In [10]:
# Regularized cost of the untrained network on the training data.
cost1=NN.costFunction(X,y)

In [11]:
# Display the initial cost.
cost1

array([ 0.03275782])

In [12]:
# Gradients of the cost with respect to W1 and W2 at the current weights.
dJdW1,dJdW2=NN.costFunctionPrime(X,y)

In [13]:
# Display the gradient with respect to W1 (same shape as W1).
dJdW1

array([[-0.00421802, -0.00022228, -0.00251458],
       [-0.00234603, -0.00023902, -0.00140282]])

In [14]:
# Display the gradient with respect to W2 (same shape as W2).
dJdW2

array([[-0.02597291],
       [-0.04646499],
       [-0.02279032]])

In [15]:
# Step size that multiplies the gradient in the weight updates below.
scalar=3

Moving uphill (adding the gradient to the weights) increases the cost (cost1 to cost2); moving downhill (subtracting the gradient) decreases it (cost2 to cost3).

In [16]:
# Step *along* the gradient (uphill), so the cost is expected to rise.
NN.W1=NN.W1+scalar*dJdW1
NN.W2=NN.W2+scalar*dJdW2
# Cost after the uphill step.
cost2=NN.costFunction(X,y)

In [17]:
# Display the cost after the uphill step.
cost2

array([ 0.04400676])

In [18]:
# Compare the cost before and after the uphill step.
print (cost1,cost2)

[ 0.03275782] [ 0.04400676]


In [19]:
# W1 and W2 changed in the previous step, so the old gradients are stale;
# costFunctionPrime has to be called again before taking another step.
dJdW1,dJdW2=NN.costFunctionPrime(X,y)
# Step *against* the gradient (downhill), so the cost is expected to fall.
NN.W1=NN.W1-scalar*dJdW1
NN.W2=NN.W2-scalar*dJdW2
# Cost after the downhill step.
cost3=NN.costFunction(X,y)

In [20]:
# Compare the cost before and after the downhill step.
print (cost2,cost3)

[ 0.04400676] [ 0.03062503]


In [21]:
# Fresh network with the default Lambda=0 (no regularization penalty).
NN1=Neural_Network()
# Wrap it in a trainer, which keeps a reference to NN1 and updates it in place.
T = trainer(NN1)

In [22]:
# Fit NN1's weights to (X, y) with BFGS; the optimizer prints a summary when done.
T.train(X,y)

Optimization terminated successfully.
         Current function value: 0.000000
         Iterations: 66
         Function evaluations: 72
         Gradient evaluations: 72


In [23]:
# Predictions of the trained network on the training inputs.
yHat2=NN1.forward(X)

In [24]:
# Display the trained predictions, for comparison with the targets in y.
yHat2

array([[ 0.7499906 ],
       [ 0.81999609],
       [ 0.92998501]])