An algorithm is either unsupervised or supervised and it is either parametric or non-parametric. Parametricism is about the way the learning is **stored** and often by extension, the **method for learning**.

A paramteric model is characterized by having a fixed number of parameters whereas a non-parametric model's number of parameters is infinite (determined by data).

In [None]:
weight = 0.1

def neural_network(input, weight):
    prediction = input * weight
    return prediction

number_of_toes = [8.5, 9.5, 10, 9]
input = number_of_toes[0]
pred = neural_network(input, weight)
print(pred)

In [12]:
#T his dataset is the current status at the beginning of each game for the first 4 games in a season.

# toes = current number of toes
# wlrec = current games won (percent)
# nfans = fan count (in millions) 

toes = [8.5, 9.5, 9.9, 9.0]
wlrec = [0.65, 0.8, 0.8, 0.9]
nfans = [1.2, 1.3, 0.5, 1.0] 

# input corresponds to every entry
# for the first game of the season
weights = [0.1, 0.2, 0]

def neural_network(inputs, weights):
    pred = w_sum(input, weights)
    return pred

def w_sum(a, b):
    assert(len(a) == len(b))
    output = 0
    
    for i in range(len(a)):
        output += (a[i] * b[i])
    
    return output
    

input = [toes[0], wlrec[0], nfans[0]]
pred = neural_network(input, weights)

print(pred)

0.9800000000000001


The intuition behind how and why a dot product (weighted sum) works is easily one of the most important parts of truly understanding how neural networks make predictions. Loosely stated, a dot product gives us __notion of similarity__ between two vectors.

In [1]:
# Same above code in Numpy

import numpy as np

weights = np.array([0.1, 0.2, 0])

def neural_network(input, weights):
    pred = input.dot(weights)

    return pred

toes = np.array([8.5, 9.5, 9.9, 9.0])
wlrec = np.array([0.65, 0.8, 0.8, 0.9])
nfans = np.array([1.2, 1.3, 0.5, 1.0])

#input corresponds to every entry
#for the first game of season
                 
input = np.array([toes[0], wlrec[0], nfans[0]])
pred = neural_network(input, weights)
print(pred)
    

0.98


In [10]:
# single input to multiple outputs
''' instead of predicting just whether the 
team won or lost now we're also predicting 
whether they are happy/sad AND the percentage
of the team that is hurt. We are making this 
prediction using only the current
win/loss record'''

weights = [0.3, 0.2, 0.9]

def neural_network(input, weights):
    
    pred = ele_mul(input, weights)
    
    return pred

def ele_mul(number, vector):
    output = [0, 0, 0]
    assert(len(output) == len(vector))
    
    for i in range(len(vector)):
        output[i] = number * vector[i]
    
    return output

wlrec = [0.65, 0.65, 1.2]

input = wlrec[0]

pred = neural_network(input, weights)

print(pred)

[0.195, 0.13, 0.5850000000000001]


In [18]:
# Multiple inputs to multiple outputs
#ntoes #%win #fans
weights = [[0.1, 0.1, -0.3],# hurt?
           [0.1, 0.2, 0.0], # win?
           [0.0, 1.3, 0.1]] # sad?


def neural_network(input, weights):
    pred = vect_mat_mul(input, weights)
    return pred

def vect_mat_mul(vect, matrix):
    a = vect
    b = matrix
    
    assert(len(a) == len(b)
           
    # fixed 
    output = []
    
    for i in range(len(a))
        # fixed 
        output.append(np.dot(a, b[i]))
        
    return output

''' This dataset is the current status at
the beginning of each game for the first 4
games in a season

toes = current number of toes
wlrec = current games won (percent)
nfans = fan count (in millions) '''

toes = [8.5, 9.5, 9.9, 9.0]
wlrec = [0.65, 0.8, 0.8, 0.9]
nfans = [1.2, 1.3, 0.5, 1.0]

# input corresponds to every entry
# for the first game of the season
input = [toes[0], wlrec[0], nfans[0]]
pred = neural_network(input, weights)
print(pred)


[0.55500000000000005, 0.98000000000000009, 0.96500000000000008]


In [23]:
# Another hidden layer added
in_wgt = np.array([[0.1, 0.2, -0.1],
         [-0.1, 0.1, 0.9],
         [0.1, 0.4, 0.1]])

hp_wgt = np.array([[0.3, 1.1, -0.3],
          [0.1, 0.2, 0.0],
          [0.0, 1.3, 0.1]])

weights = [in_wgt, hp_wgt]

def neural_network(input, weights):
    hid = input.dot(weights[0])
    pred = hid.dot(weights[1])
    return pred

toes = np.array([8.5, 9.5, 9.9, 9.0])
wlrec = np.array([0.65, 0.8, 0.8, 0.9])
nfans = np.array([1.2, 1.3, 0.5, 1.0])

input = np.array([toes[0], wlrec[0], nfans[0]])

pred = neural_network(input, weights)

print(pred)

[ 0.496  1.256 -0.286]


In [3]:
# Hot and Cold Learning
weight = 0.5
input = 0.5
goal_prediction = 0.8

step_amount = 0.001

for iterration in range(1101):
    
    prediction = input * weight
    error = (prediction - goal_prediction) ** 2
    
    print("Error: " + str(error) + " Prediction: " + str(prediction))
    
    up_prediction = input * (weight + step_amount)
    up_error = (goal_prediction - up_prediction) ** 2
    
    down_prediction = input * (weight - step_amount)
    down_error = (goal_prediction - down_prediction) ** 2
    
    if (down_error < up_error):
        weight = weight - step_amount
        
    if (down_error > up_error):
        weight = weight + step_amount


Error: 0.30250000000000005 Prediction: 0.25
Error: 0.3019502500000001 Prediction: 0.2505
Error: 0.30140100000000003 Prediction: 0.251
Error: 0.30085225 Prediction: 0.2515
Error: 0.30030400000000007 Prediction: 0.252
Error: 0.2997562500000001 Prediction: 0.2525
Error: 0.29920900000000006 Prediction: 0.253
Error: 0.29866224999999996 Prediction: 0.2535
Error: 0.29811600000000005 Prediction: 0.254
Error: 0.2975702500000001 Prediction: 0.2545
Error: 0.29702500000000004 Prediction: 0.255
Error: 0.29648025 Prediction: 0.2555
Error: 0.29593600000000003 Prediction: 0.256
Error: 0.2953922500000001 Prediction: 0.2565
Error: 0.294849 Prediction: 0.257
Error: 0.29430625 Prediction: 0.2575
Error: 0.293764 Prediction: 0.258
Error: 0.2932222500000001 Prediction: 0.2585
Error: 0.292681 Prediction: 0.259
Error: 0.29214025 Prediction: 0.2595
Error: 0.2916 Prediction: 0.26
Error: 0.2910602500000001 Prediction: 0.2605
Error: 0.29052100000000003 Prediction: 0.261
Error: 0.28998225 Prediction: 0.2615
Error: 

In [10]:
# Calculating both direction and amount from error
# This is gradient descent!

weight = 0.5
goal_pred = 0.8
input = 0.5

for iteration in range(20):
    pred = input * weight
    error = (pred - goal_pred) ** 2
    # pure error * scaling, negative reversal and stopping
    direction_and_amount = (pred - goal_pred) * input
    weight = weight - direction_and_amount
    
    print("Error: " + str(error) + " Direction and amount: " + str(direction_and_amount) + " Prediction: " + str(pred))
    

    

Error: 0.30250000000000005 Direction and amount: -0.275 Prediction: 0.25
Error: 0.17015625000000004 Direction and amount: -0.20625000000000002 Prediction: 0.3875
Error: 0.095712890625 Direction and amount: -0.1546875 Prediction: 0.49062500000000003
Error: 0.05383850097656251 Direction and amount: -0.11601562500000001 Prediction: 0.56796875
Error: 0.03028415679931642 Direction and amount: -0.08701171875000002 Prediction: 0.6259765625
Error: 0.0170348381996155 Direction and amount: -0.06525878906250004 Prediction: 0.669482421875
Error: 0.00958209648728372 Direction and amount: -0.04894409179687503 Prediction: 0.70211181640625
Error: 0.005389929274097089 Direction and amount: -0.03670806884765626 Prediction: 0.7265838623046875
Error: 0.0030318352166796153 Direction and amount: -0.02753105163574221 Prediction: 0.7449378967285156
Error: 0.0017054073093822882 Direction and amount: -0.020648288726806685 Prediction: 0.7587034225463867
Error: 0.0009592916115275371 Direction and amount: -0.01548

# One iteration of gradient
**This performs a weight update on a single training example (input->True) pair**

In [11]:
# 1.An empty network
weight = 0.1
alpha = 0.01

def neural_network(input, weight):
    prediction = input * weight
    
    return prediction

In [13]:
# 2. PREDICT: Making a prediction and evaluating error
number_of_toes = [8.5]
win_or_lose_binary = [1] # won!

input = number_of_toes[0]
goal_pred = win_or_lose_binary[0]

pred = neural_network(input, weight)

error = (pred - goal_pred) ** 2 #MSE

In [14]:
# 3. COMPARE: Calculating "Node Delta" and Putting it on the Output Node
delta = pred - goal_pred

Delta is a measurement of "how much this node missed". Thus, since the true prediction was 1.0 and our networks prediction was 0.85, the network was too **low** by 0.15. Thus delta is **negative** 0.15

   The primary difference between the gradient descent in previous code block and the implementation here just happened. The delta is a new variable. It is the "raw amount that the node was too high or too low". Instead of cumpouting direction_and_amount directly, we first calculate how much we wanted our output node to be different. Only then do we compute our direction_and_amount to change the weight ("weight_delta")

In [15]:
# 4. LEARN: Calculating "Weight Delta" and Putting it on the Weight
weight_delta = input * delta

Weight delta is a measure of "how much this weight caused the network to miss". We calculate it by multiplying the weights output "Node Delta" by the weights input. Thus we create each "Weight Delta" by scaling its output "Node Delta" by the weights input.
This accounts for the 3 aforementioned properties of our "direction_and_amount" - scaling, negative reversal and stopping.

In [17]:
# 5. LEARN: Update the Weight
weight -= weight_delta * alpha

We multiply our weight_delta by a small number "alpha" before using it to update our weight. This allows us to control how fast the network learns. If it learns too fast, it can update weights too agressively and overshoot. More on this later. Note that the weight update made the same change (small increase) as Hot and Cold learning.

## Learning Is Just Reducing Error
### Modifying weight to reduce our error

In [22]:
weight = 0.5
goal_pred = 0.8
input = 0.5

for iteration in range(20):
    # The following 2 lines have a secret:
    pred = input * weight # Line 1
    error = (pred - goal_pred) ** 2 # Line 2
    delta = pred - goal_pred
    weight_delta  = delta * input
    weight = weight - weight_delta
    
    print("--------------------")
    print("Error:" + str(error))
    print("Delta:" + str(delta))
    print("Weight Delta:" + str(weight_delta))
    print("Prediction:" + str(pred))
    print("Updated weight:" + str(weight))

--------------------
Error:0.30250000000000005
Delta:-0.55
Weight Delta:-0.275
Prediction:0.25
Updated weight:0.775
--------------------
Error:0.17015625000000004
Delta:-0.41250000000000003
Weight Delta:-0.20625000000000002
Prediction:0.3875
Updated weight:0.9812500000000001
--------------------
Error:0.095712890625
Delta:-0.309375
Weight Delta:-0.1546875
Prediction:0.49062500000000003
Updated weight:1.1359375
--------------------
Error:0.05383850097656251
Delta:-0.23203125000000002
Weight Delta:-0.11601562500000001
Prediction:0.56796875
Updated weight:1.251953125
--------------------
Error:0.03028415679931642
Delta:-0.17402343750000004
Weight Delta:-0.08701171875000002
Prediction:0.6259765625
Updated weight:1.33896484375
--------------------
Error:0.0170348381996155
Delta:-0.1305175781250001
Weight Delta:-0.06525878906250004
Prediction:0.669482421875
Updated weight:1.4042236328125
--------------------
Error:0.00958209648728372
Delta:-0.09788818359375007
Weight Delta:-0.048944091796875

# The Golden Method for Learning
Adjusting each **weight** in the correct **direction** and by correct **amount** so that our **error** reduces to 0.

All we are trying to do is figure out the right direction and amount to midfy weight so that our error goes down. The secret to this lies in our pred and error calculations. Notice that we actually use our pred inside the error calculation. 

## The Secret
For any **input** and **goal_pred** there is an exact relationship defined between our error and knob_weight, found by combining our prediction and error formulas. In this case:
```error = ((0.5 * weight) - 0.8 ** 2```

# Several stop of Learning

In [25]:
weight = 0.0
goal_pred = 0.8
input = 1.1

for iteration in range(4):
    pred = input * weight
    error = (pred - goal_pred) ** 2
    delta = (pred - goal_pred)
    weight_delta = delta * input
    weight = weight - weight_delta
    
    print("--------------------")
    print("Prediction:" + str(pred))
    print("Error:" + str(error))
    print("Delta:" + str(delta))
    print("Weight Delta:" + str(weight_delta))
    print("Updated weight:" + str(weight))

--------------------
Prediction:0.0
Error:0.6400000000000001
Delta:-0.8
Weight Delta:-0.8800000000000001
Updated weight:0.8800000000000001
--------------------
Prediction:0.9680000000000002
Error:0.02822400000000005
Delta:0.16800000000000015
Weight Delta:0.1848000000000002
Updated weight:0.6951999999999999
--------------------
Prediction:0.76472
Error:0.0012446784000000064
Delta:-0.03528000000000009
Weight Delta:-0.0388080000000001
Updated weight:0.734008
--------------------
Prediction:0.8074088
Error:5.4890317439999896e-05
Delta:0.007408799999999993
Weight Delta:0.008149679999999992
Updated weight:0.72585832


# Tunnel Vision on  One Concept
## Concept: "Learning is adjusting our weight to reduce the error to zero"

In [26]:
error = ((input * weight) - goal_pred) ** 2

How can we use this formula to know how to change our *weight* so that our *error*
moves in a **particular** direction? Now **THAT** is the right question!

This formula is the exact relationship between these two variables, and now we're going to figure out how to change one variable so that we move the other variable in a *particular* direction.

Given a function, the **derivative** represents the *direction* and the *amount* that one variable changes if you change the other variable! If derivative is explained like *"the slope at a point on a line or curve"* then the slope's sign gives us **direction** and the slope's steepness gives us **amount**. We can use both of these to thelp find the goal weight!

## With derivatives we can pick any two variables in any formula and know how they interact!
** For any function we can learn how to change one variable so that we can move another variable in a direction! It is important you know this in your bones! **

In [48]:
# This thing introduces divergence - we overcorrect way too much
weight = 1.0
goal_pred = 0.8
input = 2

for iteration in range(20):
    pred = input * weight
    error = (pred - goal_pred) ** 2
    delta = (pred - goal_pred)
    weight_delta = delta * input
    weight = weight - weight_delta
    
    print("--------------------")
    print("Prediction:" + str(pred))
    print("Error:" + str(error))
    print("Delta:" + str(delta))
    print("Weight Delta:" + str(weight_delta))
    print("Updated weight:" + str(weight))

--------------------
Prediction:2.0
Error:1.44
Delta:1.2
Weight Delta:2.4
Updated weight:-1.4
--------------------
Prediction:-2.8
Error:12.959999999999997
Delta:-3.5999999999999996
Weight Delta:-7.199999999999999
Updated weight:5.799999999999999
--------------------
Prediction:11.599999999999998
Error:116.63999999999994
Delta:10.799999999999997
Weight Delta:21.599999999999994
Updated weight:-15.799999999999995
--------------------
Prediction:-31.59999999999999
Error:1049.7599999999995
Delta:-32.39999999999999
Weight Delta:-64.79999999999998
Updated weight:48.999999999999986
--------------------
Prediction:97.99999999999997
Error:9447.839999999995
Delta:97.19999999999997
Weight Delta:194.39999999999995
Updated weight:-145.39999999999998
--------------------
Prediction:-290.79999999999995
Error:85030.55999999998
Delta:-291.59999999999997
Weight Delta:-583.1999999999999
Updated weight:437.79999999999995
--------------------
Prediction:875.5999999999999
Error:765275.0399999999
Delta:874.8

### Enter the Alpha
`weight = weight - (alpha * derivative)`

instead of 

`weight = weight - derivate`


In [55]:
# This thing introduces divergence - we overcorrect way too much
weight = 1.0
goal_pred = 0.8
input = 2
alpha = 0.1

for iteration in range(20):
    pred = input * weight
    error = (pred - goal_pred) ** 2
    delta = (pred - goal_pred)
    weight_delta = delta * input
    weight = weight - (weight_delta * alpha)
    
    print("--------------------")
    print("Prediction:" + str(pred))
    print("Error:" + str(error))
    print("Delta:" + str(delta))
    print("Weight Delta:" + str(weight_delta))
    print("Updated weight:" + str(weight))

--------------------
Prediction:2.0
Error:1.44
Delta:1.2
Weight Delta:2.4
Updated weight:0.76
--------------------
Prediction:1.52
Error:0.5184
Delta:0.72
Weight Delta:1.44
Updated weight:0.616
--------------------
Prediction:1.232
Error:0.18662399999999996
Delta:0.43199999999999994
Weight Delta:0.8639999999999999
Updated weight:0.5296
--------------------
Prediction:1.0592
Error:0.06718463999999993
Delta:0.2591999999999999
Weight Delta:0.5183999999999997
Updated weight:0.47775999999999996
--------------------
Prediction:0.9555199999999999
Error:0.024186470399999962
Delta:0.15551999999999988
Weight Delta:0.31103999999999976
Updated weight:0.446656
--------------------
Prediction:0.893312
Error:0.008707129343999991
Delta:0.09331199999999995
Weight Delta:0.1866239999999999
Updated weight:0.42799360000000003
--------------------
Prediction:0.8559872000000001
Error:0.0031345665638400017
Delta:0.055987200000000015
Weight Delta:0.11197440000000003
Updated weight:0.41679616
------------------

# DELTA
A measure of how much we want a node's value to be higher or lower to predict "perfectly" given the current training example.

**weight_delta** on the other hand, is an estimate for the direction and amount we should move our *weights* to reduce our *node delta*, inferred by the *derivative*. How do we transform our delta into a weight_delta? We multiply delta by a weight's input.


### Several steps of Learning:

How about not updating one weight?

In [17]:
def neural_network(input, weights):
    out = 0
    for i in range(len(input)):
        out += (input[i] * weights[i])
    return out

def ele_mul(scalar, vector):
    out = [0, 0, 0]
    for i in range(len(out)):
        out[i] = vector[i] * scalar
    return out

toes = [8.5, 9.4, 9.9, 9.0]
wlrec = [0.65, 0.8, 0.8, 0.9]
nfans = [1.2, 1.3, 0.5, 1.0]

win_or_lose_binary = [1, 1, 0 , 1]
true = win_or_lose_binary[0]

alpha = 0.4
weights = [0.1, 0.2, -0.1]
inputs = [toes[0], wlrec[0], nfans[0]]

for iter in range(3):
    pred = neural_network(inputs, weights)
    
    error = (pred - true) ** 2
    delta = pred - true
    
    weight_deltas = ele_mul(delta, inputs)
    weight_deltas[0] = 0
    
    print("Iteration {}".format(iter+1))
    print("Pred: {}".format(pred))
    print("Error: {}".format(error))
    print("Delta: {}".format(delta))
    print("Weights: {}".format(weights))
    print("Weight deltas: {}".format(weight_deltas))
    print("-------------------")
    
    for i in range(len(weights)):
        weights[i] -= alpha * weight_deltas[i]
        

Iteration 1
Pred: 0.8600000000000001
Error: 0.01959999999999997
Delta: -0.1399999999999999
Weights: [0.1, 0.2, -0.1]
Weight deltas: [0, -0.09099999999999994, -0.16799999999999987]
-------------------
Iteration 2
Pred: 0.9642999999999999
Error: 0.0012744900000000046
Delta: -0.035700000000000065
Weights: [0.1, 0.2364, -0.03280000000000005]
Weight deltas: [0, -0.02320500000000004, -0.04284000000000008]
-------------------
Iteration 3
Pred: 0.9908965000000001
Error: 8.287371224999874e-05
Delta: -0.009103499999999931
Weights: [0.1, 0.245682, -0.015664000000000018]
Weight deltas: [0, -0.005917274999999955, -0.010924199999999917]
-------------------


Turns out, as the error is shared, when one weight finds the "bottom of the bowl", all of the weights have found it. Now consider a network with multiple outputs:

In [34]:
"""
Instead of predicting just whether the team won or lost,
now we're also predicting whether they are happy/sad AND
the percentage of the team that is hurt. We are making 
this prediction using only the current win/loss record
"""

weights = [0.3, 0.2, 0.9]

def neural_network(inputs, weights):
    pred = ele_mul(input, weights)
    return pred

wlrec = [0.9, 1.0, 1.0, 0.9]

hurt = [0.1, 0.0, 0.0, 0.1]
win = [1, 1, 0, 1]
sad = [0.1, 0.0, 0.1, 0.2]

input = wlrec[0]
true = [hurt[0], win[0], sad[0]]

for iter in range(10):

    pred = neural_network(input, weights)

    error = [0, 0, 0]
    deltas = [0, 0, 0]

    for i in range(len(true)):
        error[i] = (pred[i] - true[i]) ** 2
        deltas[i] = pred[i] - true[i]

    weight_deltas = ele_mul(input, deltas)

    alpha = 0.5

    print("Pred: {}".format(pred))
    print("Error: {}".format(error))
    print("Delta: {}".format(delta))
    print("Weights: {}".format(weights))
    print("Weight deltas: {}".format(weight_deltas))
    print("-------------------")

    for i in range(len(weights)):
        weights[i] -= (weight_deltas[i] * alpha)



Pred: [0.27, 0.18000000000000002, 0.81]
Error: [0.028900000000000006, 0.6723999999999999, 0.5041000000000001]
Delta: [0.17, -0.82, 0.7100000000000001]
Weights: [0.3, 0.2, 0.9]
Weight deltas: [0.15300000000000002, -0.738, 0.6390000000000001]
-------------------
Pred: [0.20115, 0.5121, 0.5224500000000001]
Error: [0.010231322499999997, 0.23804641, 0.17846400250000008]
Delta: [0.17, -0.82, 0.7100000000000001]
Weights: [0.22349999999999998, 0.569, 0.5805]
Weight deltas: [0.09103499999999999, -0.43911, 0.3802050000000001]
-------------------
Pred: [0.16018425, 0.7096994999999999, 0.35135774999999997]
Error: [0.0036221439480624996, 0.08427438030025004, 0.06318071848506247]
Delta: [0.17, -0.82, 0.7100000000000001]
Weights: [0.1779825, 0.7885549999999999, 0.39039749999999995]
Weight deltas: [0.054165824999999994, -0.2612704500000001, 0.22622197499999994]
-------------------
Pred: [0.13580962875, 0.8272712025, 0.24955786124999998]
Error: [0.0012823295112128258, 0.02983523748579602, 0.02236755386

# Gradient Descent with Multiple Inputs & Outputs


In [70]:
# 1. An empty Network With Multiple Inputs & Outputs
            #toes %win #fans
weights = [ [0.1, 0.1, -0.3], #hurt?
            [0.1, 0.2, 0.0], # win?
            [0.0, 1.3, 0.1]] #sad?

def neural_network(input, weights):
    pred = vect_mat_mul(inputs, weights)
    return pred

def vect_mat_mul(vector, matrix):
    assert(len(vector) == len(matrix[0]))
    
    result = []
    for row in matrix:
        result.append([x*y for x,y in zip(vector,row)])
    return result

# 2. PREDICT: Make a Prediction and Calculate Error and Delta

toes = [8.5, 9.5, 9.9, 9.0]
wlrec = [0.65, 0.8, 0.8, 0.9]
nfans = [1.2, 1.3, 0.5, 1.0]

hurt = [0.1, 0.0, 0.0, 0.1]
win = [1, 1, 0, 1]
sad = [0.1, 0.0, 0.1, 0.2]

alpha = 0.020

input = [toes[0], wlrec[0], nfans[0]]
true = [hurt[0], win[0], sad[0]]

for i in range(30):

    pred = neural_network(input, weights)
    pred = pred[0]
    
    print(pred)

    error = [0, 0, 0]
    deltas = [0,0,0]

    def zeros_matrix(m,n):
        return [[0 for i in range(n)] for i in range(m)]

    for i in range(len(true)):
        error[i] = (pred[i] - true[i]) ** 2
        deltas[i] = pred[i] - true[i]

# 3. COMPARE: Calculating Each weight_delta and putting it on each weight

    def outer_prod(vec_a, vec_b):
        out = zeros_matrix(len(vec_a), len(vec_b))
        for i in range(len(vec_a)):
            for j in range(len(vec_b)):
                out[i][j] = vec_a[i] * vec_b[j]

        return out



    weight_deltas = outer_prod(input, deltas)

# 4. LEARN: Updating the weights

    print(weight_deltas)

    for i in range(len(weights)):
        for j in range(len(weights[0])):
            weights[i][j] -= alpha * weight_deltas[i][j]

    print(weights)
    print("--------------------")

[0.8500000000000001, 0.065, -0.36]
[[6.375000000000001, -7.947500000000001, -3.9099999999999997], [0.4875000000000001, -0.60775, -0.299], [0.9000000000000001, -1.122, -0.5519999999999999]]
[[-0.027500000000000024, 0.25895, -0.2218], [0.09025, 0.212155, 0.00598], [-0.018000000000000002, 1.32244, 0.11104]]
--------------------
[-0.2337500000000002, 0.1683175, -0.26616]
[[-2.836875000000002, -7.06930125, -3.1123600000000002], [-0.21693750000000014, -0.540593625, -0.23800400000000002], [-0.40050000000000024, -0.998019, -0.43939200000000006]]
[[0.029237500000000013, 0.400336025, -0.1595528], [0.09458875, 0.22296687250000002, 0.01074008], [-0.009989999999999997, 1.34240038, 0.11982784]]
--------------------
[0.24851875000000012, 0.26021841625, -0.19146336]
[[1.262409375000001, -6.288143461875, -2.47743856], [0.09653718750000008, -0.48085802943750006, -0.18945118400000002], [0.17822250000000014, -0.8877379005, -0.349756032]]
[[0.003989312499999995, 0.5260988942375, -0.11000402879999999], [0.0

# Introduction to Backpropagation

### A street light problem


In [75]:
import numpy as np

weights = np.array([0.5, 0.48, -0.7])

alpha = 0.1

streetlights = np.array([[1, 0, 1],
                        [0, 1, 1], 
                        [0, 0, 1],
                        [1, 1, 1],
                        [0, 1, 1],
                        [1, 0, 1]])

walk_vs_stop = np.array([[0], [1], [0], [1], [1], [0]])

input = streetlights[0] # [1, 0, 1]
goal_prediction = walk_vs_stop[0] # equals 0 i.e. stop

# training on one example 
for iteration in range(20):
    prediction = input.dot(weights)
    error = (prediction - goal_prediction) ** 2
    delta = prediction - goal_prediction
    weights = weights - alpha * (input * delta)
    
    print("Error: {} Prediction: {}".format(error, prediction))

Error: [ 0.04] Prediction: -0.19999999999999996
Error: [ 0.0256] Prediction: -0.15999999999999992
Error: [ 0.016384] Prediction: -0.1279999999999999
Error: [ 0.01048576] Prediction: -0.10239999999999982
Error: [ 0.00671089] Prediction: -0.08191999999999977
Error: [ 0.00429497] Prediction: -0.06553599999999982
Error: [ 0.00274878] Prediction: -0.05242879999999994
Error: [ 0.00175922] Prediction: -0.04194304000000004
Error: [ 0.0011259] Prediction: -0.03355443200000008
Error: [ 0.00072058] Prediction: -0.02684354560000002
Error: [ 0.00046117] Prediction: -0.021474836479999926
Error: [ 0.00029515] Prediction: -0.01717986918399994
Error: [ 0.00018889] Prediction: -0.013743895347199997
Error: [ 0.00012089] Prediction: -0.010995116277759953
Error: [  7.73712525e-05] Prediction: -0.008796093022207963
Error: [  4.95176016e-05] Prediction: -0.007036874417766459
Error: [  3.16912650e-05] Prediction: -0.0056294995342132115
Error: [  2.02824096e-05] Prediction: -0.004503599627370569
Error: [  1.29

In [76]:
# Expanding for all street lights
for iteration in range(40):
    error_for_all_lights = 0
    # This thing learns one at a time - Stochastic Gradient Descent
    for row_index in range(len(walk_vs_stop)):
        input = streetlights[row_index]
        goal_prediction = walk_vs_stop[row_index]
        
        prediction = input.dot(weights)
        
        error = (prediction - goal_prediction) ** 2
        error_for_all_lights += error
        
        delta = prediction - goal_prediction
        weights = weights - (alpha * (input * delta))
        print("Prediction: {}".format(prediction))
    print("Weights: {}".format(weights))
    print("Error: {} \n".format(error_for_all_lights))
        
        

Prediction: -0.0023058430092137705
Prediction: -0.1209223372036855
Prediction: -0.4888301034833169
Prediction: 0.7512228033816979
Prediction: 0.20190057990904375
Prediction: 0.2886959509940853
Weights: [ 0.59508579  0.6967799  -0.36412903]
Error: [ 2.2776252] 

Prediction: 0.23095676079526822
Prediction: 0.3095551927482997
Prediction: -0.31818022191782463
Prediction: 1.0514522876696315
Prediction: 0.46917171885649595
Prediction: 0.3284202821335913
Weights: [ 0.53400285  0.81376198 -0.27126663]
Error: [ 1.02357959] 

Prediction: 0.26273622570687305
Prediction: 0.5162217243011503
Prediction: -0.24916242359281637
Prediction: 1.1456228535284017
Prediction: 0.6087690510945215
Prediction: 0.2934815746795325
Weights: [ 0.46381879  0.88670061 -0.22903353]
Error: [ 0.62555276] 

Prediction: 0.23478525974362602
Prediction: 0.6341885574333014
Prediction: -0.21593091088147304
Prediction: 1.1692841999678323
Prediction: 0.6950870970412221
Prediction: 0.24263689344202927
Weights: [ 0.39914815  0.9368

The network **identified correlation** between the middle input and output! Inversely, *randomness* with respect to the output was found at the far left and far right weights (values near 0).

Above is *Stochastic Gradient Descent*. *Full/Average Gradient Descent* configuration would calculate the average weight_delta over the entire dataset, only actually changing the weights each time it computes a full average, instead of computing weight_delta for each of the rows above.

There is a 3rd configuration *Batch Gradient Descent*, more about it later.

The greatest challenge you will face with deep learning is convincing your neural network to *generalize* instead of just *memorize*. Beware of overfitting when the neural network stops learning!

Neural networks search for corellation between their input and output *layers*. We set the values of our input layer to be individual rows of our input data and we try to train the network so that our output layer equals our output dataset. The neural network doesnt actually *know* about data. It just searches for corellation between the input and output layers.

# Backpropagation: Long Distance Error Atribution
### If your data doesn't have correlation... let's create intermediate data that does!

Stack the network.

Here is a problem - For any two consecutive weighted sums of the input, there exists a single weighted sum with exactly identical behavior. Aka... anything that our 3 layer network can do... our 2 layer network can also do.

## Enter Non-linearity

By turning any middle node off whenever it would be negative, we allow the network to sometimes subscribe to correlation from various inputs. This is impossible for 2-layer neural networks... thus adding power to 3-layer nets.

"If the node would be negative then set it to 0" logic is called a **nonlinearity**. This is because without this tweak, our neural network is **linear**. Without this technique, our output layer only gets to pick from the same correlation that it had in the 2-layer network. It's still just subscribing to pieces of the input layer, which means that it can't solve our new streetlights dataset.

There are many kinds of nonlinearities. However, the one we discussed above is, in
many cases, the best one to use. It's also the simplest. (It's called "relu")

## * Sidenote *

*We can compute the relationship between our error and any one of our weights so that we know how changing the weight changes the error. We can then use this to reduce our error down to 0.*

*Adjusting our weights to reduce our error over a series of training examples ultimately just searches for correlation between our input and our output layers. If no correlation exists, then error will never reach 0.*

## Our first "Deep" Neural Network


In [87]:
import numpy as np

np.random.seed(1)

def relu(x):
    return (x > 0) * x

alpha = 0.2
hidden_size = 4

streetlights = np.array([[1, 0, 1],
                         [0, 1, 1],
                         [0, 0, 1],
                         [1, 1, 1]])

walk_vs_stop = np.array([[1, 1, 0, 0]]).T

# 2 sets of weights now to connect our 3 layers (randomly initialized)
# 3 by hidden_size array of random numbers from [-1, 1)
weights_0_1 = 2 * np.random.random((3, hidden_size)) - 1
# hidden_size by 1 array of random numbers from [-1, 1)
weights_1_2 = 2 * np.random.random((hidden_size, 1)) - 1

layer_0 = streetlights[0]
# The output of layer_1 is sent through "relu" where 
# negative values become 0
# This is then the input for the next layer, layer_2
layer_1 = relu(np.dot(layer_0, weights_0_1))
# This works because of numpy array. It applys relu to each element
# So layer_1 provides an output (input to layer_2) only if the error is positive
layer_2 = np.dot(layer_1, weights_1_2)
print(np.dot(layer_0, weights_0_1))
print(layer_1)
print(layer_2)



[-0.37242104  0.51828245 -1.16138222 -0.02489585]
[-0.          0.51828245 -0.         -0.        ]
[ 0.39194327]


# Backpropagation in Code

In [100]:
import numpy as np

np.random.seed(1)

# returns x if x > 0
# returns 0 otherwise
def relu(x):
    return (x > 0) * x

# returns 1 for input > 0
# return 0 otherwise
def relu2deriv(output):
    return output > 0

alpha = 0.2
hidden_size = 4

streetlights = np.array([[1, 0, 1],
                         [0, 1, 1],
                         [0, 0, 1],
                         [1, 1, 1]])

walk_vs_stop = np.array([[1, 1, 0, 0]]).T

weights_0_1 = 2 * np.random.random((3, hidden_size)) - 1
weights_1_2 = 2 * np.random.random((hidden_size, 1)) - 1

for iteration in range(60):
    layer_2_error = 0
    
    for i in range(len(streetlights)):
        layer_0 = streetlights[i:i+1]
        layer_1 = relu(np.dot(layer_0, weights_0_1))
        layer_2 = np.dot(layer_1, weights_1_2)
        
        # using i:i+1 so that it returns [[0]] not just [0], which would 
        # cause shapes to mismatch (0 dimensions)
        layer_2_error += np.sum((layer_2 - walk_vs_stop[i:i+1]) ** 2)
        
        # How much higher or lower we want the output prediction to be??
        layer_2_delta = (walk_vs_stop[i:i+1] - layer_2)
        
        # How much we want each middle node to move up or down??
        
        # This line computes the delta at layer_1 given the delta at layer_2 by
        # taking the layer_2_delta and multiplying it by its connecting weights_1_2
        layer_1_delta = layer_2_delta.dot(weights_1_2.T)*relu2deriv(layer_1)
        
        # Note the goal_pred - pred instead of pred - goal_pred for layer_2_delta, hence the += not -=
        weights_1_2 += alpha * layer_1.T.dot(layer_2_delta)
        weights_0_1 += alpha * layer_0.T.dot(layer_1_delta)
        
    if(iteration % 10 == 9):
        print("Error: {}".format(layer_2_error))
        
        

Error: 0.6342311598444467
Error: 0.35838407676317513
Error: 0.0830183113303298
Error: 0.006467054957103705
Error: 0.0003292669000750734
Error: 1.5055622665134859e-05


The *relu2deriv* function returns 1 when output is > 0 and it returns 0 otherwise. This is actually the *slope* of our relu function. It's the *derivative* of our relu function. It serves a very important purpose.

The goal here is **error attribution**. It is all about figuring out how much each weight contributed to the final error. layer_2_delta tells us how much we want the final prediction to move up or down.  layer_1_delta will tell us how much we want each middle node to move up or down. Multiplying the output delta by each weight attached to it will give us a weighting of how much each weight contributed to that error!!

Theres one more thing though - If the relu set the output to a  layer_1 node to be 0, then it didn't contribute to the error at all. So, when this was true, we should also set the delta of that node to be zero. Multiplying each layer_1 node by the relu2deriv function accomplishes this! relu2deriv is either a 1 or a  depending on whether the layer_1 value was > 0 or not.

## One Iteration of Backpropagation

In [104]:
# 1. Initialize the Network's Weights and Data
import numpy as np

np.random.seed(1)

def relu(x):
    return (x > 0) * x

def relu2deriv(output):
    return output > 0

lights = np.array( [[1, 0, 1],
                    [0, 1, 1],
                    [0, 0, 1],
                    [1, 1, 1]])

walk_stop = np.array([[1, 1, 0, 0]]).T

alpha = 0.2
hidden_size = 4

weights_0_1 = 2 * np.random.random((3, hidden_size)) - 1
weights_1_2 = 2 * np.random.random((hidden_size, 1)) - 1

# 2. PREDICT & COMPARE: Make a Prediction, Calculate Output & Delta
layer_0 = lights[0:1]
layer_1 = np.dot(layer_0, weights_0_1)
layer_1 = relu(layer_1)
layer_2 = np.dot(layer_1, weights_1_2)

error = (layer_2 - walk_stop[0:1])**2
layer_2_delta = (layer_2 - walk_stop[0:1])

# 3. Backpropagate from Layer2 to Layer1
layer_1_delta = layer_2_delta.dot(weights_1_2.T) 
layer_1_delta *= relu2deriv(layer_1)

# 4. Generate Weight Deltas and Update Weights
weights_delta_1_2 = layer_1.T.dot(layer_2_delta)
weights_delta_0_1 = layer_0.T.dot(layer_1_delta)

weights_1_2 -= alpha * weights_delta_1_2
weights_0_1 -= alpha * weights_delta_0_1



# Putting it all together

In [108]:
# Here is a self sufficient program that should run
# MEMORIZE AND KNOW THIS BY HEART!

import numpy as np

np.random.seed(1)

def relu(x):
    return (x > 0) * x

def relu2deriv(x):
    return x > 0

streetlights = np.array([[1, 0, 1],
                         [0, 1, 1],
                         [0, 0, 1],
                         [1, 1, 1]])

walk_stop = np.array([[1, 1, 0, 0]]).T

alpha = 0.2
hidden_size = 4
input_size = 3

weights_0_1 = 2 * np.random.random((input_size, hidden_size)) - 1
weights_1_2 = 2 * np.random.random((hidden_size, 1)) - 1

for iteration in range(60):
    layer_2_error = 0
    for i in range(len(streetlights)):
        layer_0 = streetlights[i:i+1]
        layer_1 = np.dot(layer_0, weights_0_1)
        layer_1 = relu(layer_1)
        layer_2 = np.dot(layer_1, weights_1_2)
        
        layer_2_error += (layer_2 - walk_stop[i:i+1])**2
        layer_2_delta = layer_2 - walk_stop[i:i+1]
        
        layer_1_delta = layer_2_delta.dot(weights_1_2.T)
        layer_1_delta *= relu2deriv(layer_1)
        
        weights_1_2_delta = layer_1.T.dot(layer_2_delta)
        weights_0_1_delta = layer_0.T.dot(layer_1_delta)
        
        weights_1_2 -= weights_1_2_delta * alpha
        weights_0_1 -= weights_0_1_delta * alpha
        
    if(iteration % 10 == 9):
        print("Error: {}".format(layer_2_error))
        

Error: [[ 0.63423116]]
Error: [[ 0.35838408]]
Error: [[ 0.08301831]]
Error: [[ 0.00646705]]
Error: [[ 0.00032927]]
Error: [[  1.50556227e-05]]


# Why do deep networks matter?
