In [20]:
import numpy as np


## From Scratch

In [21]:
def sigmoid(x):
    return 1.0/(1+np.exp(-x))

In [22]:
# Gradient of loss function: L'(W1, b1, W2, b2).
def L_prime(X, Y, W1, b1, W2, b2):
    """ L'(W,b) function. 
    X:  Feature matrix.    Shape: [n,2].
    Y:  Label vector.      Shape: [n,1].
    W1: Weight matrix W1.  Shape: [2,3].
    b1: Bias vector b1.    Shape: [3,1].
    W2: Weight matrix W2.  Shape: [3,1].
    b2: Bias vector b2.    Shape: [1,1].
    Return the gradients: dL/dW1 (Shape: [2,3]), dL/db1 (Shape: [3,1]),
                          dL/dW2 (Shape: [3,1]), dL/db2 (Shape: [1,1]).
    """
    # Get dimensions.
    n = X.shape[0]
    
    # Calculate feed-forward values.

    H = sigmoid(W1.T.dot(X.T) + b1).T                          # Shape: [n, 3].
    P = Y*(W2.T.dot(H.T)+b2).T                           # Shape: [n, 1].
#     print(P.shape)
    # Calculate the gradients: dL/dW1, dL/db1, dL/dW2, dL/db2.
    dL_by_dW2 = H.T.dot((P-1)*Y)                            # Shape: [3,1].
    
#     dL_by_db2 =  (P-1).T.dot(Y)                           # Shape: [1,1].
    dL_by_db2 = np.ones((n,1)).T.dot((P-1)*Y)
    
#     print(W2.shape)
    dL_by_dH  = ((P-1)*Y).dot(W2.T)                           # Shape: [n,3].
    dL_by_dW1  = X.T.dot(dL_by_dH*H*(1-H))                   # Shape: [2,3].
#     print(dL_by_dW1.shape)
    dL_by_db1  = (dL_by_dH*H*(1-H)).T.dot(np.ones((n,1)))                        # Shape: [3,1].
#     print(dL_by_db1.shape)
    return dL_by_dW1, dL_by_db1, dL_by_dW2, dL_by_db2

In [23]:
# Loss function
def L(X, Y, W1, b1, W2, b2):
    """ L(W,b) function. 
    X:  Feature matrix.    Shape: [n,2].
    Y:  Label vector.      Shape: [n,1].
    W1: Weight matrix W1.  Shape: [2,3].
    b1: Bias vector b1.    Shape: [3,1].
    W2: Weight matrix W2.  Shape: [3,1].
    b2: Bias vector b2.    Shape: [1,1].
    Return the loss.       Shape: Scalar.
    """
    # Get dimensions.
    n = X.shape[0]
    
    # Calculate feed-forward values.
#     print(X.shape)
    
    H = sigmoid(W1.T.dot(X.T) + b1).T                             # Shape: [n, 3].
#     print(H.shape)
#     print(W2.T.dot(H.T).shape)
    P = sigmoid(Y*(W2.T.dot(H.T)+b2).T)                             # Shape: [n, 1].
    
#     print((W2.T.dot(H.T)+b2).shape)
#     print(P.shape)
    # Get the loss.
    L =    -np.sum(np.log(P))                        # Shape: Scalar.
    
    return L

In [50]:
# lets generate some data using a function mapping from R^2 -> R^1 (2d coordinates to scalar values)
def generate_data():
    
    # generates 1000 ordered data points from 0 to 1 with a bit of noise using random.uniform
    def generate_linear_noisy():
        return np.linspace(0, 1, num=1000) + np.random.uniform(-0.05, 0.05, (1000,))
    
    X_train = np.array([generate_linear_noisy(), generate_linear_noisy()]).T
    
    # the function modeled here is F(x, y) -> x / 2 + y / 2
    Y_train = (X_train[:,0] * 0.5 + X_train[:,1] * 0.5).reshape(1000, 1)
    return X_train, Y_train
X_train, Y_train = generate_data()

(1000, 2) (1000, 1)


In [51]:
print(X_train)

[[-4.28963942e-02 -4.79049931e-02]
 [-4.02860690e-02 -7.83744764e-03]
 [-4.59761583e-02  9.47865545e-04]
 ...
 [ 9.58745255e-01  9.74808924e-01]
 [ 9.77753449e-01  9.62158684e-01]
 [ 9.95670363e-01  9.53921054e-01]]


In [52]:
# gradient descent
# supposed to find where loss is minimized

In [None]:
learning_rate = 0.0001
n_iter = 200000                        # Number of iterations
np.random.seed(0)
W1 = np.random.randn(2,3)/((2*3)**2)   # Weight matrix 1.
b1 = np.random.randn(3,1)/((3*1)**2)   # Bias vector 1.
W2 = np.random.randn(3,1)/((3*1)**2)   # Weight matrix 2.
b2 = np.random.randn(1,1)/((1*1)**2)   # Bias vector 2.

# We will keep track of training loss over iterations.
iterations = [0]
L_list = [L(X_train, Y_train, W1, b1, W2, b2)]

for i in range(n_iter):
    
    # gradient descent 
    
    gradient_W1, gradient_b1, gradient_W2, gradient_b2 = \
        L_prime(X_train, Y_train, W1, b1, W2, b2)
    
    W1_new = W1 - learning_rate * gradient_W1
    b1_new = b1 - learning_rate * gradient_b1
    W2_new = W2 - learning_rate * gradient_W2
    b2_new = b2 - learning_rate * gradient_b2
    
    iterations.append(i+1)
    L_list.append(L(X_train, Y_train, W1_new, b1_new, W2_new, b2_new))
    
    # L1-norm of weight/bias changing.
    norm = np.abs(W1_new-W1).sum() + np.abs(b1_new-b1).sum() + \
           np.abs(W2_new-W2).sum() + np.abs(b2_new-b2).sum() 
    
    if i%500 == 0:
        print('i: {:6d} L: {:.3f} norm:{:.6f}'.format(i, L_list[-1], norm))
        
    W1 = W1_new
    b1 = b1_new
    W2 = W2_new
    b2 = b2_new
    
print ('W1 matrix: \n' + str(W1))
print ('b1 vector: \n' + str(b1))
print ('W2 matrix: \n' + str(W2))
print ('b2 vector: \n' + str(b2))


i:      0 L: 499.138 norm:0.055375
i:    500 L: 406.237 norm:0.002231
i:   1000 L: 403.019 norm:0.003711
i:   1500 L: 396.847 norm:0.005377
i:   2000 L: 386.841 norm:0.006411
i:   2500 L: 374.796 norm:0.006002
i:   3000 L: 364.492 norm:0.004493
i:   3500 L: 357.689 norm:0.002976
i:   4000 L: 353.621 norm:0.002239
i:   4500 L: 351.071 norm:0.001972
i:   5000 L: 349.263 norm:0.001783
i:   5500 L: 347.826 norm:0.001650
i:   6000 L: 346.602 norm:0.001552
i:   6500 L: 345.529 norm:0.001480
i:   7000 L: 344.573 norm:0.001409
i:   7500 L: 343.716 norm:0.001338
i:   8000 L: 342.943 norm:0.001269
i:   8500 L: 342.243 norm:0.001203
i:   9000 L: 341.606 norm:0.001138
i:   9500 L: 341.026 norm:0.001077
i:  10000 L: 340.496 norm:0.001018
i:  10500 L: 340.009 norm:0.000963
i:  11000 L: 339.562 norm:0.000912
i:  11500 L: 339.150 norm:0.000868
i:  12000 L: 338.769 norm:0.000831
i:  12500 L: 338.415 norm:0.000797
i:  13000 L: 338.087 norm:0.000764
i:  13500 L: 337.780 norm:0.000734
i:  14000 L: 337.494

i: 118500 L: 327.310 norm:0.000123
i: 119000 L: 327.297 norm:0.000123
i: 119500 L: 327.285 norm:0.000123
i: 120000 L: 327.272 norm:0.000122
i: 120500 L: 327.259 norm:0.000122
i: 121000 L: 327.247 norm:0.000121
i: 121500 L: 327.234 norm:0.000121
i: 122000 L: 327.222 norm:0.000121
i: 122500 L: 327.210 norm:0.000120
i: 123000 L: 327.198 norm:0.000120
i: 123500 L: 327.186 norm:0.000119
i: 124000 L: 327.174 norm:0.000119
i: 124500 L: 327.162 norm:0.000119
i: 125000 L: 327.150 norm:0.000118
i: 125500 L: 327.138 norm:0.000118
i: 126000 L: 327.126 norm:0.000117
i: 126500 L: 327.115 norm:0.000117
i: 127000 L: 327.103 norm:0.000117
i: 127500 L: 327.091 norm:0.000116
i: 128000 L: 327.080 norm:0.000116
i: 128500 L: 327.069 norm:0.000116
i: 129000 L: 327.057 norm:0.000115
i: 129500 L: 327.046 norm:0.000115
i: 130000 L: 327.035 norm:0.000114
i: 130500 L: 327.024 norm:0.000114
i: 131000 L: 327.013 norm:0.000114
i: 131500 L: 327.002 norm:0.000113
i: 132000 L: 326.991 norm:0.000113
i: 132500 L: 326.980

In [19]:
# backpropogation 

## In Tensorflow

In [None]:
# to do 

In [None]:
model = tf.keras.Sequential([
    tf.keras.layers.Flatten(input_shape=(28, 28)),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(10)
])

In [None]:
model.compile(optimizer='SGD',
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])

In [None]:
# optimizer=tf.train.GradientDescentOptimizer(0.01).minimize(f_error)


model.fit(train_images, train_labels, epochs=10)


## In Pytorch