# Question 1

#### Derive the update rule and show how to train a 2-layer (1 hidden layer and 1 output layer) neural network with backpropagation for regression using the Mean Square Error loss. Assume that you are using the Sigmoid activation function for the hidden layer. Explain briefly how this is different from the update rule for the network trained for binary classification using log loss.

Let $x$ and $\hat{y}$ be the input features and the network output, respectively. Also, let $w_{1}$ and $b_{1}$ be the parameters of the hidden layer, and $w_{2}$ and $b_{2}$ the parameters of the output layer.

The hidden layer takes $x$ as input and has output 
$z_{1} = w_{1} x + b_{1}$, and $a_{1}= \sigma(z_{1}) = \frac{1}{1 + \exp(-z_{1})}$.

The output layer has $a_{1}$ as input and $\hat{y}=w_{2}a_{1} + b_{2}$ as output.

The loss function is given by $\mathcal{L} = \frac{1}{2N}\sum\limits_{k=1}^{N}(y_{k}-\hat{y}_{k})^{2}$, where $N$ is the number of samples.

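We use gradient descent to update the parameters. With learning rate $\alpha$, each parameter $\theta \in \{w_{1}, b_{1}, w_{2}, b_{2}\}$ is updated as $\theta \leftarrow \theta + d\theta$, where: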

$dw_{2} = -\alpha\frac{d\mathcal{L}}{dw_{2}} = -\alpha\frac{d\mathcal{L}}{d\hat{y}}\frac{d\hat{y}}{dw_{2}} = \frac{\alpha}{N}\sum\limits_{k}(y_{k}-\hat{y}_{k})a_{1} $

$db_{2} = -\alpha\frac{d\mathcal{L}}{db_{2}} = -\alpha\frac{d\mathcal{L}}{d\hat{y}}\frac{d\hat{y}}{db_{2}} = \frac{\alpha}{N}\sum\limits_{k}(y_{k}-\hat{y}_{k}) $

$dw_{1} = -\alpha\frac{d\mathcal{L}}{dw_{1}} = -\alpha\frac{d\mathcal{L}}{d\hat{y}}\frac{d\hat{y}}{da_{1}}\frac{da_{1}}{dz_{1}}\frac{dz_{1}}{dw_{1}} = \frac{\alpha}{N}\sum\limits_{k}(y_{k}-\hat{y}_{k})w_{2}a_{1}(1-a_{1})x $

$db_{1} = -\alpha\frac{d\mathcal{L}}{db_{1}} = -\alpha\frac{d\mathcal{L}}{d\hat{y}}\frac{d\hat{y}}{da_{1}}\frac{da_{1}}{dz_{1}}\frac{dz_{1}}{db_{1}} = \frac{\alpha}{N}\sum\limits_{k}(y_{k}-\hat{y}_{k})w_{2}a_{1}(1-a_{1}) $

# Question 2

### 1. What is the activation function that you will choose for the output layer?

Since this is a regression problem, i.e., the targets are continuous real values, I'll use a linear activation function.

### 2. How many neurons should there be in the output layer? Why?

One, since the output is only one real value. 

In [7]:
import numpy as np

In [8]:
#activation function
def sigmoid(z):
    """Logistic sigmoid, evaluated element-wise.

    Args:
        z: A scalar or NumPy array of pre-activation values.

    Returns:
        ``1 / (1 + exp(-z))``, with the same shape as ``z``.
    """
    denominator = 1 + np.exp(-z)
    return 1 / denominator

#derivative of activation function
def sigmoid_prime(z):
    """Derivative of the logistic sigmoid evaluated at ``z``.

    Args:
        z: A scalar or NumPy array of pre-activation values.

    Returns:
        ``s * (1 - s)`` where ``s = sigmoid(z)``.
    """
    activation = sigmoid(z)
    return activation * (1 - activation)

In [27]:
#Define network structure 
def structure(X, Y):
    """Collect the sizes used to build the network.

    Args:
        X: Input array with at least two dimensions.
        Y: Target array with at least one dimension.

    Returns:
        The tuple ``(X.shape[0], X.shape[1], 10, Y.shape[0])``; the third
        entry is the fixed number of hidden neurons.

    NOTE(review): which axis of ``X`` holds features and which holds samples
    is not established here -- confirm against the code that consumes this
    tuple.
    """
    hidden_neurons = 10
    first_dim, second_dim = X.shape[0], X.shape[1]
    return (first_dim, second_dim, hidden_neurons, Y.shape[0])

#Initialize parameters
def par_init(input_size, neurons, output_size):
    """Create the initial weights and biases of the two-layer network.

    Args:
        input_size: Number of rows of the input matrix fed to the hidden layer.
        neurons: Number of hidden-layer neurons.
        output_size: Number of output-layer neurons.

    Returns:
        A dict with keys ``"W1"`` (neurons x input_size), ``"b1"``
        (neurons x 1), ``"W2"`` (output_size x neurons) and ``"b2"``
        (output_size x 1). Weights and ``b1`` are drawn uniformly from
        [0, 1); ``b2`` starts at zero.
    """
    W1 = np.random.rand(neurons, input_size)
    b1 = np.random.rand(neurons, 1)
    # W2 multiplies the hidden activations (neurons x m), so it needs one
    # column per hidden neuron, not one per input feature.
    W2 = np.random.rand(output_size, neurons)
    b2 = np.zeros((output_size, 1))

    return {
        "W1": W1,
        "b1": b1,
        "W2": W2,
        "b2": b2,
    }

In [26]:
# Load the training inputs and targets. np.loadtxt is called without a
# delimiter argument, so the files are read with its default delimiter.
X_train = np.loadtxt("content/drive/MyDrive/X_train.csv")
Y_train = np.loadtxt("content/drive/MyDrive/Y_train.csv")

# Show the dimensions of the loaded inputs.
print(np.shape(X_train))

# stru = (X_train.shape[0], X_train.shape[1], 10, Y_train.shape[0]),
# as returned by structure().
stru = structure(X_train,Y_train)

# Define initial parameters
# NOTE(review): these arrays are shaped (stru[0], stru[2]), (stru[2], stru[1]),
# (stru[2], stru[3]) and (stru[3], 1), which is not the layout par_init()
# builds -- confirm which convention the training code expects.
W1 = np.random.rand(stru[0],stru[2])
b1 = np.random.rand(stru[2],stru[1])
W2 = np.random.rand(stru[2],stru[3])
b2 = np.random.rand(stru[3],1)

(100, 2)


In [8]:
# Backprop
def backprop(W1, W2, b1, b2):
    """Run 25 gradient-descent steps on a sigmoid/sigmoid two-layer network.

    The training input ``x`` and target ``y`` are read from module scope.
    Each step uses the error ``a2 - y`` at the output, back-propagates it
    through the sigmoid hidden layer, and subtracts the raw gradients from
    the parameters (i.e. an implicit learning rate of 1).

    The forward pass is recomputed after every update so that each iteration
    uses gradients of the *current* parameters rather than re-applying the
    gradient of the initial ones.

    Args:
        W1: Hidden-layer weights.
        W2: Output-layer weights.
        b1: Hidden-layer bias.
        b2: Output-layer bias.

    Returns:
        The tuple ``(W1, W2, b1, b2)`` after the final update.

    NOTE(review): the bias gradients are the raw ``dz`` terms (no reduction
    over samples), which presumably assumes ``x`` and ``y`` hold a single
    column -- confirm.
    """
    # Forward pass for the starting parameters.
    a1 = sigmoid(np.dot(W1, x) + b1)
    a2 = sigmoid(np.dot(W2, a1) + b2)

    for _ in range(25):
        # Backward pass; sigmoid'(z1) is expressed through its output a1.
        dz2 = a2 - y
        dW2 = np.dot(dz2, a1.T)
        db2 = dz2
        dz1 = np.dot(W2.T, dz2) * a1 * (1 - a1)
        dW1 = np.dot(dz1, x.T)
        db1 = dz1

        W1 = W1 - dW1
        W2 = W2 - dW2
        b1 = b1 - db1
        b2 = b2 - db2

        # Forward pass with the updated parameters.
        a1_new = sigmoid(np.dot(W1, x) + b1)
        a2_new = sigmoid(np.dot(W2, a1_new) + b2)
        print(a2_new, a2)

        # Carry the fresh activations into the next iteration.
        a1, a2 = a1_new, a2_new

    return W1, W2, b1, b2

In [None]:
def forward_propagation(X, parameters):
    """Compute the network output for ``X``.

    The hidden layer applies ``tanh`` and the output layer applies
    ``sigmoid``.

    Args:
        X: Input matrix; it is left-multiplied by ``W1``.
        parameters: Dict holding ``"W1"``, ``"b1"``, ``"W2"`` and ``"b2"``.

    Returns:
        A pair ``(A2, cache)`` where ``A2`` is the output activation and
        ``cache`` maps ``"Z1"``, ``"A1"``, ``"Z2"``, ``"A2"`` to the
        intermediate values of the pass.

    NOTE(review): the accompanying write-up describes a sigmoid hidden layer
    and a linear output for regression -- confirm these activations are the
    intended ones.
    """
    W1, b1, W2, b2 = (parameters[key] for key in ("W1", "b1", "W2", "b2"))

    hidden_in = np.dot(W1, X) + b1
    hidden_out = np.tanh(hidden_in)
    output_in = np.dot(W2, hidden_out) + b2
    output_out = sigmoid(output_in)

    return output_out, dict(Z1=hidden_in, A1=hidden_out, Z2=output_in, A2=output_out)

In [None]:
def backward_propagation(parameters, cache, X, Y):
    """Compute parameter gradients from a cached forward pass.

    The output error is ``A2 - Y``; the hidden-layer error is scaled by
    ``1 - A1**2``. All gradients are multiplied by ``1 / X.shape[1]``.

    Args:
        parameters: Dict holding at least ``"W1"`` and ``"W2"``.
        cache: Dict holding the activations ``"A1"`` and ``"A2"``.
        X: Input matrix used in the forward pass.
        Y: Targets, compatible in shape with ``A2``.

    Returns:
        A dict with the gradients ``"dW1"``, ``"db1"``, ``"dW2"``, ``"db2"``.
    """
    inv_m = 1 / X.shape[1]

    # W1 is looked up alongside W2, but the gradients only need W2.
    W1, W2 = parameters["W1"], parameters["W2"]
    hidden_act, output_act = cache["A1"], cache["A2"]

    # Output layer.
    output_err = output_act - Y
    grad_W2 = np.dot(output_err, hidden_act.T) * inv_m
    grad_b2 = output_err.sum(axis=1, keepdims=True) * inv_m

    # Hidden layer.
    hidden_err = np.dot(W2.T, output_err) * (1 - np.power(hidden_act, 2))
    grad_W1 = np.dot(hidden_err, X.T) * inv_m
    grad_b1 = hidden_err.sum(axis=1, keepdims=True) * inv_m

    return {"dW1": grad_W1, "db1": grad_b1, "dW2": grad_W2, "db2": grad_b2}

In [None]:
def gradient_descent(parameters, grads, learning_rate = 0.01):
    """Apply one gradient-descent step to every parameter.

    Args:
        parameters: Dict holding ``"W1"``, ``"b1"``, ``"W2"`` and ``"b2"``.
        grads: Dict holding the matching gradients ``"dW1"``, ``"db1"``,
            ``"dW2"`` and ``"db2"``.
        learning_rate: Step size multiplying each gradient.

    Returns:
        A new dict with the updated ``"W1"``, ``"b1"``, ``"W2"`` and ``"b2"``;
        the input dict is not modified.
    """
    # Each parameter "P" is paired with its gradient "dP".
    return {
        name: parameters[name] - learning_rate * grads["d" + name]
        for name in ("W1", "b1", "W2", "b2")
    }

In [None]:
def NN(X, Y, neurons, num_iterations=1000):
    """Train the two-layer network with batch gradient descent.

    Seeds NumPy's global RNG with 3, initializes the parameters with
    ``par_init`` and then repeats forward pass, cost evaluation, backward
    pass and parameter update ``num_iterations`` times. The cost is printed
    every fifth iteration.

    Args:
        X: Training inputs, passed unchanged to ``forward_propagation``.
        Y: Training targets.
        neurons: Number of hidden-layer neurons.
        num_iterations: Number of gradient-descent iterations.

    Returns:
        The parameter dict after the last update.
    """
    np.random.seed(3)

    # structure() returns (X.shape[0], X.shape[1], hidden neurons, Y.shape[0]);
    # the output-layer size is the last entry, not index 2.
    sizes = structure(X, Y)
    input_unit = sizes[0]
    output_unit = sizes[3]

    parameters = par_init(input_unit, neurons, output_unit)

    for i in range(num_iterations):
        A2, cache = forward_propagation(X, parameters)
        # NOTE(review): cross_entropy_cost is not defined in the visible part
        # of this file -- confirm it is provided elsewhere.
        cost = cross_entropy_cost(A2, Y, parameters)
        grads = backward_propagation(parameters, cache, X, Y)
        parameters = gradient_descent(parameters, grads)
        if i % 5 == 0:
            print ("Cost after iteration %i: %f" %(i, cost))
    return parameters

# Train with 4 hidden neurons for 1000 iterations.
# NOTE(review): X_train printed as (100, 2) above, while NN sizes W1 from
# X.shape[0] and forward_propagation computes np.dot(W1, X) -- confirm whether
# X_train (and Y_train) need to be transposed/reshaped before this call.
parameters = NN(X_train, Y_train, 4, num_iterations=1000)