# ToDo
- Stochastic gradient descent (SGD)
- Mini-batch gradient descent


# Example: A two layer network using pure numpy.
"we can easily use numpy to fit a two-layer network to random data by *manually* implementing the forward and backward passes through the network using numpy operations"
[Ref](https://pytorch.org/tutorials/beginner/pytorch_with_examples.html#tensorflow-static-graphs)<br>
Training algorithm: batch gradient descent with a fixed learning rate.<br>

In [None]:
import numpy as np

# Problem dimensions: N samples in the batch, D_in input features,
# H hidden units, D_out output features.
N, D_in, H, D_out = 64, 1000, 100, 10

# Synthetic regression data: standard-normal inputs and targets.
x = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)

# Both weight matrices start as standard-normal noise (the model has no biases).
w1 = np.random.randn(D_in, H)
w2 = np.random.randn(H, D_out)

learning_rate = 1e-6
for t in range(501):
    # Forward pass: linear -> ReLU -> linear.
    h = np.dot(x, w1)  # (N, D_in) . (D_in, H) -> (N, H)
    h_relu = np.maximum(h, 0)
    y_pred = np.dot(h_relu, w2)  # (N, H) . (H, D_out) -> (N, D_out)

    # Sum-of-squared-errors loss over the whole batch, reported every 100 steps.
    loss = np.square(y_pred - y).sum()
    if t % 100 == 0:
        loss_ = loss
        print("Epoch", t, "loss =", loss_)

    # Backward pass: apply the chain rule from the loss back to each weight matrix.
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = np.dot(h_relu.T, grad_y_pred)
    grad_h_relu = np.dot(grad_y_pred, w2.T)
    # ReLU passes the gradient through only where its input was not negative.
    grad_h = np.where(h < 0, 0.0, grad_h_relu)
    grad_w1 = np.dot(x.T, grad_h)

    # Plain gradient-descent step, applied in place.
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

## Using TensorFlow's gradient descent optimizer

In [None]:
import numpy as np
import tensorflow as tf

# Same two-layer network as the numpy example, but built as a TensorFlow 1.x
# static graph and trained with tf.train.GradientDescentOptimizer.

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

n_epochs = 500
learning_rate = 1e-6

# Create random input and output data
inputs = np.random.randn(N, D_in)
targets = np.random.randn(N, D_out)

# Construction phase: start from an empty default graph so that re-running
# this cell does not pile duplicate ops onto the previous graph.
tf.reset_default_graph()

# The whole batch is baked into the graph as constants (batch gradient descent).
X = tf.constant(inputs, name="X")
y = tf.constant(targets, name="y")

# Randomly initialize weights
w1 = tf.Variable(np.random.randn(D_in, H), name="weight1")
w2 = tf.Variable(np.random.randn(H, D_out), name="weight2")

# Forward pass: compute predicted y (linear -> ReLU -> linear)
h = tf.matmul(X, w1, name="layer1")
h_relu = tf.maximum(h, 0)
y_pred = tf.matmul(h_relu, w2, name="layer2-predictions")

# Sum-of-squared-errors loss over the whole batch
error = y_pred - y
loss = tf.reduce_sum(tf.square(error), name="squared_error")

# minimize() is called without var_list, so it falls back to the graph's
# trainable variables (w1 and w2 here).
optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
training_op = optimizer.minimize(loss)

init = tf.global_variables_initializer()

# Execution phase
# device_count={'GPU': 0} asks TensorFlow not to use any GPU device.
session_conf = tf.ConfigProto(device_count={'GPU': 0})

# The context manager closes the session even if initialisation or a training
# step raises, so its resources are never leaked.
with tf.Session(config=session_conf) as sess:
    sess.run(init)
    for epoch in range(n_epochs + 1):
        # Report the loss before the update so that epoch 0 shows the
        # untrained network.
        if epoch % 100 == 0:
            loss_ = sess.run(loss)
            print("Epoch", epoch, "loss =", loss_)
        sess.run(training_op)