In [6]:
import numpy as np

In [21]:
def relu(x): 
    return np.maximum(x, 0)

In [22]:
def identity(x): 
    return x

In [23]:
def sigmoid(x): 
    return (1/(1+np.exp(-x)))

In [52]:
def softmax(h): 
    """take as input an n x q matrix of unnormalized probabilities, 
    and return n x q matrix of softmax probabilities"""
    hx = np.exp(h)
    return hx / (hx.sum(axis=1, keepdims=True)+1e-10)

In [25]:
def squared_error(y, yhat): 
    return (1/len(y))*np.sum((y-yhat)**2)

In [26]:
def binary_cross_entropy(y, yhat): 
    return -(1/len(y))*np.sum(y*np.log(yhat)+(1-y)*np.log(1-yhat))

In [27]:
def categorical_cross_entropy(y, yhat): 
    return -(1/len(y))*np.sum(np.log(yhat[range(len(yhat)),y]))

In [28]:
def loss_gradient(loss, y, yhat): 
    # compute gradient of loss with respect to output of terminal layer
    # these all have the same form -- consolidate?. exponential family
    if loss == squared_error: 
        return -2/len(y)*(y-yhat)
    if loss == binary_cross_entropy: 
        return -(1/len(y))*(y-yhat)
    if loss == categorical_cross_entropy: 
        yind = np.zeros_like(yhat)
        yind[range(len(yhat)),y] = 1
        return -(1/len(y))*(yind-yhat)

In [62]:
class Layer: 
    def __init__(self, num_inputs, num_outputs, activation, dropout=None):
        self.num_inputs = num_inputs
        self.num_outputs = num_outputs
        self.activation = activation
        self.parent = None
        self.is_terminal = True
        self.dropout = dropout
        
    def init_params(self):
        # initialize parameters
        self.W = np.random.normal(scale=0.03, size=(self.num_inputs, self.num_outputs))
        self.b = np.zeros(self.num_outputs)
    
    def forward(self, X, predict=False): 
        # forward propagation
        x = self.get_input(X)  
        self.h = np.dot(x, self.W) + self.b
        self.a = self.activation(self.h)
        if (self.dropout is not None) &  (predict!=True): 
            mask = np.random.binomial(1, self.dropout, size=(self.num_outputs,))
            self.a = self.a * mask.T * self.dropout
        self.output = self.a
    
    def activation_derivative(self): 
        # compute derivative of activation function
        if self.activation == relu: 
            return (self.a>0).astype('int')
        elif self.activation == identity: 
            return np.ones_like(self.a)
        elif self.activation == sigmoid: 
            return self.a*(1-self.a)
        elif self.activation == softmax:
            return np.ones_like(self.a)
            
    def backward(self, loss, X, y): 
        # backward propagation
            # if terminal node, then gradient with respect to output is just gradient of loss
        if self.is_terminal:
            child_grad_h = loss_gradient(loss, y, self.a)
        else:
            child_grad_h = self.child.grad_h
        self.grad_w = np.dot(self.get_input(X).T, child_grad_h *self.activation_derivative())
        self.grad_b = np.sum(child_grad_h *self.activation_derivative(), axis=0)
        self.grad_h = np.dot(child_grad_h*self.activation_derivative(), self.W.T)
        
    def set_parent(self, parent): 
        self.parent = parent
        self.is_terminal = True
        
    def set_child(self, child): 
        self.child = child
        self.is_terminal = False
        
    def get_input(self, X): 
        if self.parent is None: 
            return X
        else: 
            return self.parent.output

In [41]:
class Net: 
    def __init__(self): 
        self.layers = []
    
    def add_layer(self, num_inputs, num_outputs, activation, dropout=None): 
        layer = Layer(num_inputs, num_outputs, activation, dropout)
        if len(self.layers)>0: 
            layer.set_parent(self.terminal)
            layer.parent.set_child(layer)
            self.layers[-1].set_child(layer)
        self.layers.append(layer)
        self.terminal = layer
        
    def forward(self, X, predict=False):
        for layer in self.layers: 
            layer.forward(X, predict)
        if predict==True:
            return self.terminal.output
        else: 
            self.output = self.terminal.output
            
    def backward(self, loss, X, y): 
        for layer in self.layers[::-1]:
            layer.backward(loss, X,y)
            
    def init_params(self): 
        for layer in self.layers: 
            layer.init_params()
            
    def get_params(self): 
        self.params = []
        for layer in layers: 
            self.params += self.get_params()
            
    def get_grads(self): 
        self.grads = []
        for layer in layers: 
            self.grads += self.get_grads()
            
    def set_params(self, params): 
        for i in range(len(layers)): 
            layers[i].set_params(params[i])

    def train(self, train_data, train_labels, num_epochs, batch_size, lr, loss, eval_data):
        idx = np.arange(len(train_data))
        np.random.shuffle(idx)
        J = len(train_data) // batch_size  #steps per epoch
        self.train_loss = []
        self.test_loss = []
        self.test_acc = []
        for i in range(num_epochs):
            for j in range(J): 
                idxj = idx[(j*batch_size):((j+1)*batch_size)]
                X = train_data[idxj]
                y = train_labels[idxj]
                self.forward(X)
                self.backward(loss,X,y)
                self.descend(lr)
            self.forward(train_data)
            yhat = self.output
            l, acc = self.evaluate(*eval_data,loss=loss).values()
            self.test_loss.append(l)
            self.test_acc.append(acc)
            self.train_loss.append(loss(train_labels, yhat))
                  
    def descend(self, lr): 
        for layer in self.layers: 
            layer.W -= lr*layer.grad_w
            layer.b -= lr*layer.grad_b
            
    def evaluate(self, X, y, loss): 
        yprob = self.forward(X, predict=True)
        loss_ = loss(y, yprob)
        ymax = np.argmax(yprob, axis=1)
        accuracy = np.mean((ymax==y).astype('int'))
        return {'loss':loss_, 'accuracy':accuracy}

In [33]:
net = Net()
net.add_layer(num_inputs=20, num_outputs=32, activation=relu)
net.add_layer(num_inputs=32, num_outputs=32, activation=relu)
net.add_layer(num_inputs=32, num_outputs=1, activation=sigmoid)
net.init_params()
X = np.random.randn(100,20)
y = np.random.binomial(1,0.5, size=(100,1))

In [136]:
net.forward(X)

In [None]:
train(self, train_data, train_labels, num_epochs, batch_size, lr, loss):

In [82]:
net.train(X,y, num_epochs=3, batch_size=25, lr=0.5, loss=binary_cross_entropy)

In [61]:
# test on MNIST data
from keras.datasets import mnist

(train_images, train_labels), (test_images, test_labels) = mnist.load_data()

In [62]:
test_labels.shape

(10000,)

In [22]:
train_images.shape

(60000, 28, 28)

In [23]:
from keras import models
from keras import layers

network = models.Sequential()
network.add(layers.Dense(512, activation='relu', input_shape=(28 * 28,)))
network.add(layers.Dense(10, activation='softmax'))

In [24]:
network.compile(optimizer='rmsprop',
                loss='categorical_crossentropy',
                metrics=['accuracy'])

In [34]:
train_images = train_images.reshape((60000, 28 * 28))
train_images = train_images.astype('float32') / 255
test_images = test_images.reshape((10000, 28 * 28))
test_images = test_images.astype('float32') / 255

In [26]:
test_images.shape

(10000, 784)

In [96]:
from keras.utils import to_categorical
train_labels = to_categorical(train_labels)
test_labels = to_categorical(test_labels)

In [48]:
hist = network.fit(train_images, train_labels, epochs=5, batch_size=128)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [97]:
network.evaluate(test_images, test_labels)



[0.07554391357868226, 0.9805999994277954]

In [52]:
hist.history.keys()

dict_keys(['loss', 'accuracy'])

In [53]:
hist.history['loss']

[0.029248603525509436,
 0.022612789740165076,
 0.017091255979736646,
 0.01382947520862023,
 0.010397070344164967]

In [54]:
hist.history['accuracy']

[0.99146664, 0.99333334, 0.99505, 0.99585, 0.9969]

In [63]:
# test on MNIST data
from keras.datasets import mnist

(train_images, train_labels), (test_images, test_labels) = mnist.load_data()

In [64]:
train_images = train_images.reshape((60000, 28 * 28))
train_images = train_images.astype('float32') / 255
test_images = test_images.reshape((10000, 28 * 28))
test_images = test_images.astype('float32') / 255

In [65]:
net = Net()
net.add_layer(num_inputs = 28*28, num_outputs = 512, activation=relu, dropout=0.9)
net.add_layer(num_inputs = 512, num_outputs = 10, activation=softmax)

In [72]:
net.init_params()

In [75]:
net.train(train_images,train_labels, num_epochs=10, batch_size=128, lr=0.9, loss=categorical_cross_entropy, 
         eval_data=[test_images, test_labels])

In [68]:
net.test_loss

[0.11375146970666508,
 0.09017385104341524,
 0.07731697978918539,
 0.07454350236066237,
 0.07090721367067954,
 0.06898676989354276,
 0.06654926582216308,
 0.0664493844955413,
 0.06984915171328947,
 0.06603321729445721]

In [143]:
net.test_loss

[0.06935149911935265,
 0.06752984286302323,
 0.06646451590490071,
 0.06582464976182537,
 0.0658733916880659]

In [77]:
net.test_acc

[0.9829, 0.9828, 0.9838, 0.9831, 0.9839, 0.984, 0.9834, 0.984, 0.9841, 0.9838]

In [148]:
net.test_acc

[0.9826, 0.9822, 0.9835, 0.9832, 0.9829, 0.983, 0.9833, 0.9833, 0.9834, 0.9836]

In [93]:
net.train_loss

[0.15436858797210787, 0.09109607427928047, 0.06460457099397682]