# Building a Neural Network from Scratch

## Requirements 

- A working fully-connected deep neural network from scratch using only numpy.
- Includes dense layers, activations, optimizers, loss functions and sigmoid or softmax in case of classification. 
- Runtime and results on a public dataset.
- Documented code that includes brief summary, technical details, and results. 

## Extensions 

A comparison of the modelâ€™s runtime and performance with/without each component:
- More than 1 optimizer - SGD, Momentum, RMSProp, Adam etc.
- Regularization - L2/weight decay, dropout, possibly augmentations if image data etc.
- Results on more than 1 dataset.


### Load the dataset

In [83]:
import numpy as np

from sklearn.datasets import fetch_openml
#load MNIST dataset
mnist = fetch_openml('mnist_784')
X = mnist.data
y = mnist.target

In [84]:
X = X /255
#y = np.where(y=='0', 0, 1)

In [239]:
m = 60000
m_test = X.shape[0] - m

#switch rows and columns and reshape
print(X.shape, y.shape)
X_train, X_test = X[:m], X[m:]
y = y.T



# y_train, y_test = y[:m].reshape(1,m), y[m:].reshape(1,m_test)

# y_train, y_test = np.array(y[:m]), np.array(y[m:])
# y_train, y_test = np.array(y[:m]).reshape(1,m), np.array(y[m:]).reshape(1,m_test)
# print(X_train.shape, y_train.shape, X_test.shape,  y_test.shape) 

(70000, 784) (1, 70000)
(60000, 784) (1, 60000) (10000, 784) (1, 10000)


In [240]:
#one-hot encode mnist data

digits =10 
examples = 70000

y_new = np.eye(digits)[y.astype('int32')]
y_new = Y_new.reshape(digits, examples).T

Y_train, Y_test = y_new[:m], y_new[m:]

print(X_train.shape, Y_train.shape, X_test.shape,  Y_test.shape)

(60000, 784) (60000, 10) (10000, 784) (10000, 10)


In [256]:
def sigmoid(Z):
    """
    Sigmoid activation function.
    """
    return 1/(1+np.exp(-Z))

def der_sigmoid(Z):
    """
    Derivative of sigmoid activation function.
    """
    return sigmoid(Z) * (1 - sigmoid(Z))

def cross_entropy(Y, Y_hat):
    """
    Binary cross entropy loss function.
    """
#     print(Y, Y_hat)
#     print(Y.shape)
#     n = Y.shape[0] 
#     L = -(1/n) * (np.sum( np.multiply(np.log(Y_hat),Y) ) + np.sum( np.multiply(np.log(1-Y_hat),(1-Y)) ) )
    L= -np.log(np.max(Y_hat * Y, axis=1))
    return L

def softmax(x):
    """Compute softmax values for each sets of scores in x."""
    
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum(axis=0) # only difference

In [257]:
cross_entropy(np.array([[0,1,0],[1,0,0]]), np.array([[0.5,0.5,0],[0.2,0.8,0]]))

array([0.69314718, 1.60943791])

In [221]:

input_size = 784

#Initialization: weights and biases
w1 = np.random.rand(input_size,64)
w2 = np.random.rand(64,10)
b1 = np.zeros(784)
b2 = np.zeros(64)

def forward_propagation(X,y):
    
    #single forward pass
    z1 = np.dot(X+b1,w1)
    s1 = sigmoid(z1)
    z2 = np.dot(s1+b2,w2) 
    
    #pass it through loss function 
    y_hat = softmax(z2) #model output
#   y_hat = np.argmax(s2, axis=0)
    
    print(y_hat.shape, y.shape)
    L = cross_entropy(y,y_hat)
    
    return L

### Backward 
# get that d_w and d_b for each layer 

In [258]:
forward_propagation(X_train,Y_train)

(60000, 10) (60000, 10)


  L= -np.log(np.max(Y_hat * Y, axis=1))


array([        inf, 11.00209935,         inf, ...,         inf,
               inf, 11.00209935])