In [3]:
import numpy as np
# Seed NumPy's global random number generator so the random weight
# initialisation is reproducible. seed() returns None, so its result must not
# be assigned back to `np` (that would replace the module with None).
np.random.seed(812)

In [4]:
import random

In [10]:
# Training inputs: three observations (rows) with two features (columns) each.
X = np.array([[2, 9], [1, 5], [3, 6]], dtype=float)
# Training targets: one value per observation, stored as a 3x1 column vector.
y = np.array([[90], [72], [80]], dtype=float)

In [15]:
# Scale every feature column of X by that column's largest value.
X = np.divide(X, np.max(X, axis=0))
# Scale the targets by a fixed factor of 100.
y = np.divide(y, 100)

# Normalizing data: X is divided by its per-column maximum; y is divided by 100

## NN Architecture

In [27]:
class NeuralNetwork:
    """Minimal two-layer feed-forward network trained with backpropagation.

    The network has one hidden layer, sigmoid activations on both layers and
    no bias terms. Weights are drawn from ``np.random.randn``, so seed NumPy's
    global RNG beforehand for reproducible results.

    Parameters
    ----------
    inputs : int, default 2
        Number of input features (columns of ``X``).
    hiddenNodes : int, default 3
        Number of neurons in the hidden layer.
    outputNodes : int, default 1
        Number of output neurons (columns of ``y``).
    learning_rate : float, default 1.0
        Factor applied to every weight update. The default of 1.0 reproduces
        the unscaled updates.
    """

    def __init__(self, inputs=2, hiddenNodes=3, outputNodes=1, learning_rate=1.0):
        self.inputs = inputs
        self.hiddenNodes = hiddenNodes
        self.outputNodes = outputNodes
        self.learning_rate = learning_rate
        # no bias in this example - keep in mind
        # input -> hidden weights, shape (inputs, hiddenNodes)
        self.weights1 = np.random.randn(self.inputs, self.hiddenNodes)

        # hidden -> output weights, shape (hiddenNodes, outputNodes)
        self.weights2 = np.random.randn(self.hiddenNodes, self.outputNodes)

    def sigmoid(self, s):
        """Return the logistic sigmoid of ``s``, applied element-wise."""
        return 1 / (1 + np.exp(-s))

    def sigmoidPrime(self, s):
        """Return the derivative of the sigmoid evaluated at ``s``.

        ``s`` is the pre-activation value (the weighted sum), not the
        already-activated output.
        """
        sx = self.sigmoid(s)
        return sx * (1 - sx)

    def feed_forward(self, X):
        """Run a forward pass and return the network output.

        Intermediate values (``hidden_sum``, ``activated_hidden``,
        ``output_sum``, ``final_output``) are stored on the instance because
        ``backward`` reads them.
        """
        self.hidden_sum = np.dot(X, self.weights1)
        self.activated_hidden = self.sigmoid(self.hidden_sum)
        self.output_sum = np.dot(self.activated_hidden, self.weights2)
        self.final_output = self.sigmoid(self.output_sum)

        return self.final_output

    def backward(self, X, y, o):
        """Backpropagate the error through the network and update both weight matrices.

        Must be called after ``feed_forward`` on the same ``X``, since it
        relies on the stored sums and activations. ``o`` is the output that
        forward pass produced.
        """
        # Output layer: raw error, then error scaled by the activation slope.
        self.o_error = y - o
        self.o_delta = self.o_error * self.sigmoidPrime(self.output_sum)

        # Hidden layer: push the output delta back through weights2 (using the
        # weights as they were during the forward pass), then scale by the slope.
        self.z2_error = self.o_delta.dot(self.weights2.T)
        self.z2_delta = self.z2_error * self.sigmoidPrime(self.hidden_sum)

        # The error is defined as y - o, so the updates are added.
        self.weights1 += self.learning_rate * X.T.dot(self.z2_delta)
        self.weights2 += self.learning_rate * self.activated_hidden.T.dot(self.o_delta)

    def train(self, X, y):
        """Perform one full-batch training step: forward pass, then weight update."""
        o = self.feed_forward(X)
        self.backward(X, y, o)
        
        
        
        

In [28]:
# Build a freshly initialised network and run a single forward/backward pass
# on the full data set, so the intermediate values can be inspected.
nn = NeuralNetwork()
nn.train(X,y)

In [29]:
nn.o_error
# raw output error (y - o); the derivative has not been applied yet

array([[0.78112341],
       [0.59922654],
       [0.70202301]])

### First gradient - how much sigmoid activation would've pushed us to the right layer?

In [39]:
nn.o_delta
# output error times sigmoidPrime(output_sum): the error scaled by the slope of the output activation

array([[0.08181873],
       [0.06363021],
       [0.06204304]])

In [31]:
# sigmoid of the output layer's weighted sum - the same computation that produces final_output
nn.sigmoid(nn.output_sum)

array([[0.11887659],
       [0.12077346],
       [0.09797699]])

In [32]:
# slope of the sigmoid at the output layer's weighted sum
nn.sigmoidPrime(nn.output_sum)

array([[0.10474495],
       [0.10618723],
       [0.0883775 ]])

We have the derivatives; now we need to relate them to the cost function.

In [34]:
# are these helping or hurting?
# the sign follows (y - o): positive entries mean the prediction was below the target
nn.o_delta

array([[0.08181873],
       [0.06363021],
       [0.06204304]])

In [36]:
# the dot product of o_delta and the transposed second-layer weights gives us the propagated error
nn.z2_error
# for each input row, there is one error value per hidden neuron

array([[-0.06517483, -0.22381276, -0.02592879],
       [-0.05068629, -0.17405859, -0.02016475],
       [-0.04942199, -0.16971693, -0.01966177]])

In [38]:
# the product of the sigmoid derivative at the hidden sums - showing the slope of the
# relationship between each hidden node's weighted input and its activation - and the
# error that's been propagated back through the second weight layer.
nn.z2_delta


array([[-0.00698305, -0.05480675, -0.00629866],
       [-0.00951447, -0.04314046, -0.00501545],
       [-0.00824883, -0.04202595, -0.00402255]])

Has one row per input observation and one column per hidden neuron (3x3) - the same shape as the hidden-layer activations.

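We multiply the gradient by the inputs - why? So the adjustment is more effective: larger errors on larger inputs get larger adjustments.
Why do we need to transpose the input? We want the error with respect to each weight, summed over the observations. X is 3x2 (observations x inputs) and z2_delta is 3x3 (observations x hidden neurons), so X.T.dot(z2_delta) is 2x3 - the same shape as weights1 - and each entry adds up input * delta across the three observations.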

In [40]:
# transposed inputs: one row per input feature, one column per observation
X.T

array([[0.66666667, 0.33333333, 1.        ],
       [1.        , 0.55555556, 0.66666667]])

In [41]:
X.T.dot(nn.z2_delta)
# 2x3 - two input neurons into 3 hidden neurons; this is the quantity added to weights1

array([[-0.01607569, -0.09294394, -0.00989347],
       [-0.01776808, -0.10679097, -0.01176672]])

In [42]:
nn.activated_hidden.T.dot(nn.o_delta)
# this is the quantity added to weights2
# 3x1 - 3 hidden neurons into 1 output

array([[0.16843549],
       [0.09796428],
       [0.08136732]])

In [20]:
# Fresh, untrained network: forward pass only, to inspect the initial predictions.
# NOTE(review): this rebinds `nn`, discarding the instance used in the cells above.
nn = NeuralNetwork()
output = nn.feed_forward(X)
print(X)
print(output)
# residual between the targets and the untrained predictions
error = y - output
error

[[0.66666667 1.        ]
 [0.33333333 0.55555556]
 [1.         0.66666667]]
[[0.37771334]
 [0.41477855]
 [0.40348484]]


array([[0.52228666],
       [0.30522145],
       [0.39651516]])

In [None]:
# error high because prediction is low
# prediction is low because random weights 
# specifically - either because the second layer weights are low or the activations from the first layer are low
# how is activation determined?
# input * weights
# no control over inputs
# control over weights - need to increase weights. 

In [43]:
# Train for 10,000 full-batch epochs. Progress is printed for the first five
# epochs and then every 1,000th epoch; the report reflects the network state
# *before* that epoch's weight update.
for i in range(10000):
    epoch = i + 1
    if epoch <= 5 or epoch % 1000 == 0:
        # One forward pass serves both the prediction print-out and the loss.
        predicted = nn.feed_forward(X)
        print('+' + '---' * 3 + f'EPOCH {epoch}' + '---'*3 + '+')
        print('Input: \n', X)
        print('Actual Output: \n', y)
        print('Predicted Output: \n', str(predicted))
        print("Loss: \n", str(np.mean(np.square(y - predicted))))
    nn.train(X,y)

+---------EPOCH 1---------+
Input: 
 [[0.66666667 1.        ]
 [0.33333333 0.55555556]
 [1.         0.66666667]]
Actual Output: 
 [[0.9 ]
 [0.72]
 [0.8 ]]
Predicted Output: 
 [[0.15855806]
 [0.15244832]
 [0.13028834]]
Loss: 
 0.44012158591700096
+---------EPOCH 2---------+
Input: 
 [[0.66666667 1.        ]
 [0.33333333 0.55555556]
 [1.         0.66666667]]
Actual Output: 
 [[0.9 ]
 [0.72]
 [0.8 ]]
Predicted Output: 
 [[0.21470235]
 [0.19633166]
 [0.17688981]]
Loss: 
 0.37737590210695654
+---------EPOCH 3---------+
Input: 
 [[0.66666667 1.        ]
 [0.33333333 0.55555556]
 [1.         0.66666667]]
Actual Output: 
 [[0.9 ]
 [0.72]
 [0.8 ]]
Predicted Output: 
 [[0.28938067]
 [0.25475515]
 [0.24058087]]
Loss: 
 0.3007528345109189
+---------EPOCH 4---------+
Input: 
 [[0.66666667 1.        ]
 [0.33333333 0.55555556]
 [1.         0.66666667]]
Actual Output: 
 [[0.9 ]
 [0.72]
 [0.8 ]]
Predicted Output: 
 [[0.37770888]
 [0.32572686]
 [0.3184181 ]]
Loss: 
 0.2200534833092657
+---------EPOCH 5-