In [2]:
import numpy as np
import tensorflow as tf

## From Scratch

In [2]:
def sigmoid(x):
    return 1.0/(1+np.exp(-x))

In [3]:
# Gradient of loss function: L'(W1, b1, W2, b2).
def L_prime(X, Y, W1, b1, W2, b2):
    """ L'(W,b) function. 
    X:  Feature matrix.    Shape: [n,2].
    Y:  Label vector.      Shape: [n,1].
    W1: Weight matrix W1.  Shape: [2,3].
    b1: Bias vector b1.    Shape: [3,1].
    W2: Weight matrix W2.  Shape: [3,1].
    b2: Bias vector b2.    Shape: [1,1].
    Return the gradients: dL/dW1 (Shape: [2,3]), dL/db1 (Shape: [3,1]),
                          dL/dW2 (Shape: [3,1]), dL/db2 (Shape: [1,1]).
    """
    # Get dimensions.
    n = X.shape[0]
    
    # Calculate feed-forward values.

    H = sigmoid(W1.T.dot(X.T) + b1).T                          # Shape: [n, 3].
    P = Y*(W2.T.dot(H.T)+b2).T                           # Shape: [n, 1].
#     print(P.shape)
    # Calculate the gradients: dL/dW1, dL/db1, dL/dW2, dL/db2.
    dL_by_dW2 = H.T.dot((P-1)*Y)                            # Shape: [3,1].
    
#     dL_by_db2 =  (P-1).T.dot(Y)                           # Shape: [1,1].
    dL_by_db2 = np.ones((n,1)).T.dot((P-1)*Y)
    
#     print(W2.shape)
    dL_by_dH  = ((P-1)*Y).dot(W2.T)                           # Shape: [n,3].
    dL_by_dW1  = X.T.dot(dL_by_dH*H*(1-H))                   # Shape: [2,3].
#     print(dL_by_dW1.shape)
    dL_by_db1  = (dL_by_dH*H*(1-H)).T.dot(np.ones((n,1)))                        # Shape: [3,1].
#     print(dL_by_db1.shape)
    return dL_by_dW1, dL_by_db1, dL_by_dW2, dL_by_db2

In [4]:
# Loss function
def L(X, Y, W1, b1, W2, b2):
    """ L(W,b) function. 
    X:  Feature matrix.    Shape: [n,2].
    Y:  Label vector.      Shape: [n,1].
    W1: Weight matrix W1.  Shape: [2,3].
    b1: Bias vector b1.    Shape: [3,1].
    W2: Weight matrix W2.  Shape: [3,1].
    b2: Bias vector b2.    Shape: [1,1].
    Return the loss.       Shape: Scalar.
    """
    # Get dimensions.
    n = X.shape[0]
    
    # Calculate feed-forward values.
#     print(X.shape)
    
    H = sigmoid(W1.T.dot(X.T) + b1).T                             # Shape: [n, 3].
#     print(H.shape)
#     print(W2.T.dot(H.T).shape)
    P = sigmoid(Y*(W2.T.dot(H.T)+b2).T)                             # Shape: [n, 1].
    
#     print((W2.T.dot(H.T)+b2).shape)
#     print(P.shape)
    # Get the loss.
    L =    -np.sum(np.log(P))                        # Shape: Scalar.
    
    return L

In [5]:
# lets generate some data using a function mapping from R^2 -> R^1 (2d coordinates to scalar values)
def generate_data():
    
    # generates 1000 ordered data points from 0 to 1 with a bit of noise using random.uniform
    def generate_linear_noisy():
        return np.linspace(0, 1, num=1000) + np.random.uniform(-0.05, 0.05, (1000,))
    
    X_train = np.array([generate_linear_noisy(), generate_linear_noisy()]).T
    
    # the function modeled here is F(x, y) -> x / 2 + y / 2
    Y_train = (X_train[:,0] * 0.5 + X_train[:,1] * 0.5).reshape(1000, 1)
    return X_train, Y_train
X_train, Y_train = generate_data()

In [6]:
print(X_train)

[[ 0.04739148  0.03284548]
 [-0.01272119 -0.0220772 ]
 [ 0.0338837  -0.03660984]
 ...
 [ 1.04537137  1.00214967]
 [ 1.01571349  0.97313234]
 [ 0.97557795  0.9906448 ]]


In [7]:
# gradient descent
# supposed to find where loss is minimized

In [160]:
learning_rate = 0.0001
n_iter = 200000                        # Number of iterations
np.random.seed(0)
W1 = np.random.randn(2,3)/((2*3)**2)   # Weight matrix 1.
b1 = np.random.randn(3,1)/((3*1)**2)   # Bias vector 1.
W2 = np.random.randn(3,1)/((3*1)**2)   # Weight matrix 2.
b2 = np.random.randn(1,1)/((1*1)**2)   # Bias vector 2.

# We will keep track of training loss over iterations.
iterations = [0]
L_list = [L(X_train, Y_train, W1, b1, W2, b2)]

for i in range(n_iter):
    
    # gradient descent 
    
    gradient_W1, gradient_b1, gradient_W2, gradient_b2 = \
        L_prime(X_train, Y_train, W1, b1, W2, b2)
    
    W1_new = W1 - learning_rate * gradient_W1
    b1_new = b1 - learning_rate * gradient_b1
    W2_new = W2 - learning_rate * gradient_W2
    b2_new = b2 - learning_rate * gradient_b2
    
    iterations.append(i+1)
    L_list.append(L(X_train, Y_train, W1_new, b1_new, W2_new, b2_new))
    
    # L1-norm of weight/bias changing.
    norm = np.abs(W1_new-W1).sum() + np.abs(b1_new-b1).sum() + \
           np.abs(W2_new-W2).sum() + np.abs(b2_new-b2).sum() 
    
    if i%500 == 0:
        print('i: {:6d} L: {:.3f} norm:{:.6f}'.format(i, L_list[-1], norm))
        
    W1 = W1_new
    b1 = b1_new
    W2 = W2_new
    b2 = b2_new
    
print ('W1 matrix: \n' + str(W1))
print ('b1 vector: \n' + str(b1))
print ('W2 matrix: \n' + str(W2))
print ('b2 vector: \n' + str(b2))


i:      0 L: 498.674 norm:0.055257
i:    500 L: 406.410 norm:0.002239
i:   1000 L: 403.164 norm:0.003733
i:   1500 L: 396.925 norm:0.005403
i:   2000 L: 386.843 norm:0.006417
i:   2500 L: 374.793 norm:0.005971
i:   3000 L: 364.575 norm:0.004444
i:   3500 L: 357.869 norm:0.002940
i:   4000 L: 353.861 norm:0.002231
i:   4500 L: 351.339 norm:0.001968
i:   5000 L: 349.543 norm:0.001782
i:   5500 L: 348.109 norm:0.001648
i:   6000 L: 346.888 norm:0.001552
i:   6500 L: 345.816 norm:0.001480
i:   7000 L: 344.861 norm:0.001408
i:   7500 L: 344.005 norm:0.001337
i:   8000 L: 343.233 norm:0.001268
i:   8500 L: 342.533 norm:0.001202
i:   9000 L: 341.897 norm:0.001137
i:   9500 L: 341.317 norm:0.001076
i:  10000 L: 340.786 norm:0.001018
i:  10500 L: 340.299 norm:0.000964
i:  11000 L: 339.850 norm:0.000914
i:  11500 L: 339.436 norm:0.000873
i:  12000 L: 339.053 norm:0.000836
i:  12500 L: 338.697 norm:0.000802
i:  13000 L: 338.366 norm:0.000770
i:  13500 L: 338.057 norm:0.000740
i:  14000 L: 337.767

i: 117500 L: 327.207 norm:0.000121
i: 118000 L: 327.194 norm:0.000121
i: 118500 L: 327.182 norm:0.000120
i: 119000 L: 327.169 norm:0.000120
i: 119500 L: 327.156 norm:0.000119
i: 120000 L: 327.144 norm:0.000119
i: 120500 L: 327.132 norm:0.000118
i: 121000 L: 327.119 norm:0.000118
i: 121500 L: 327.107 norm:0.000118
i: 122000 L: 327.095 norm:0.000117
i: 122500 L: 327.083 norm:0.000117
i: 123000 L: 327.071 norm:0.000116
i: 123500 L: 327.059 norm:0.000116
i: 124000 L: 327.047 norm:0.000115
i: 124500 L: 327.035 norm:0.000115
i: 125000 L: 327.024 norm:0.000115
i: 125500 L: 327.012 norm:0.000114
i: 126000 L: 327.000 norm:0.000114
i: 126500 L: 326.989 norm:0.000113
i: 127000 L: 326.978 norm:0.000113
i: 127500 L: 326.966 norm:0.000113
i: 128000 L: 326.955 norm:0.000112
i: 128500 L: 326.944 norm:0.000112
i: 129000 L: 326.933 norm:0.000111
i: 129500 L: 326.922 norm:0.000111
i: 130000 L: 326.911 norm:0.000111
i: 130500 L: 326.900 norm:0.000110
i: 131000 L: 326.889 norm:0.000110
i: 131500 L: 326.879

## Backpropagation In Tensorflow

In [3]:
# See how simple this is using keras :)
# Try adding more layers, changing activation to e.g. 'tanh' or 'relu' or 'sigmoid' and compare results!
# You may notice that "linear" works best and thats obvious because our data is fairly linear
# You can also try changing the data generated
model = tf.keras.Sequential([
    tf.keras.layers.Dense(2, activation='linear'),
    # typically, more neurons, the more capable the network, but be wary of overfitting
    tf.keras.layers.Dense(32, activation='linear'), 
    tf.keras.layers.Dense(1)
])

# we specify we want to use something known as the Adam optimizer to optimize the loss and minimize it
# Adam, like SGD, tries to minimize the loss function. In a future workshop we will explain why Adam runs much 
# faster and has higher accuracy
# the loss we use here is known as Mean Squared Error
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.0005),
              loss='mse',
              metrics=['mse'])


In [7]:
# fit the model onto our dataset and run for 100 epochs
model.fit(X_train, Y_train, epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<tensorflow.python.keras.callbacks.History at 0x14eef29d0>

In [8]:
# lets look at 20 data points and see how we do
for x, y in zip(X_train[0:80:4], Y_train[0:80:4]):
    print("X = {}, Y = {}, Predicted - {}".format(x, y, model.predict([[x[0], x[1]]])))

X = [0.03923083 0.03530981], Y = [0.03727032], Predicted - [[0.03727032]]
X = [0.00582148 0.03699618], Y = [0.02140883], Predicted - [[0.02140883]]
X = [-0.03350683 -0.03459055], Y = [-0.03404869], Predicted - [[-0.0340487]]
X = [-0.00560108  0.03889558], Y = [0.01664725], Predicted - [[0.01664725]]
X = [ 0.06239572 -0.02463851], Y = [0.0188786], Predicted - [[0.01887859]]
X = [ 0.00440151 -0.00883391], Y = [-0.0022162], Predicted - [[-0.00221621]]
X = [0.05087184 0.0126296 ], Y = [0.03175072], Predicted - [[0.03175072]]
X = [0.05500249 0.0538404 ], Y = [0.05442145], Predicted - [[0.05442144]]
X = [ 0.0174499  -0.00063104], Y = [0.00840943], Predicted - [[0.00840943]]
X = [0.04804347 0.0659605 ], Y = [0.05700198], Predicted - [[0.05700199]]
X = [0.01794367 0.0351309 ], Y = [0.02653729], Predicted - [[0.02653729]]
X = [0.00580836 0.09100084], Y = [0.0484046], Predicted - [[0.0484046]]
X = [0.07179332 0.0032296 ], Y = [0.03751146], Predicted - [[0.03751146]]
X = [0.04355678 0.05336648], 