In [65]:
import numpy as np


def unit_norm_layer_init(input_shape, output_shape):
    return np.random.normal(size=(input_shape, output_shape))


def ones_layer_init(input_shape, output_shape):
    return np.ones((input_shape, output_shape))


def zeros_init_layer(length):
    return np.zeros(length)


def softmax(x):
    # https://eli.thegreenplace.net/2016/the-softmax-function-and-its-derivative/
    exp = np.exp(x - np.max(x))
    return exp / np.sum(exp, axis=1)[:, None]


def linear(x):
    return x


def linear_derivative(X):
    return np.ones((X.shape[0]))

'''
def softmax_grad(x):
    s = sigmoid(x)
    _, K = s.shape
    return s @ (np.eye(K, K) - s).T
'''

def sigmoid(x):
    return 1 / (1 + np.exp(-x))


def sigmoid_derivative(x):
    s = sigmoid(x)
    return s * (1 - s)


def ReLU(x):
    return np.maximum(x, 0, x)


def ReLU_derivative(x):
    return (x > 0).astype(int)


def cross_entropy(y_true, y_pred):
    N = len(y_true)
    return -np.sum(y_true * np.log(y_pred)) / N


def cross_entropy_derivative(y_true, y_pred):
    N = len(y_true)
    return -y_true/y_pred / N


def one_hot_encode(y, n_classes):
    y_onehot = np.zeros((len(y), n_classes))
    for i, y_i in enumerate(y):
        y_onehot[i, y_i] = 1
    return y_onehot


class NeuralNetwork():
    
    def __init__(self, 
                 hidden=(8, 6),
                 init_weights='unit_norm',
                 init_bias='zeros',
                 activation='sigmoid',
                 loss='cross_entropy',
                 mode='classification',
                 random_state=1):
        self.hidden = hidden
        self.init_weights = init_weights
        self.init_bias = init_bias
        self.activation = activation
        self.loss = loss
        self.random_state = random_state
        self.mode = mode
        np.random.seed(self.random_state)
        self._set_act_func()
        self._set_loss()
        
        
    def _init_neural_network(self):
        implemented_weight_inits = {'unit_norm': unit_norm_layer_init,
                                    'ones': ones_layer_init
                                   }
        implemented_bias_inits = {'zeros': zeros_init_layer,
                                   }
        try:
            init_layer_weight = implemented_weight_inits[self.init_weights]
            init_layer_bias = implemented_bias_inits[self.init_bias]
        except KeyError:
            raise Exception('{} or {} not accepted'.format(self.init_weights,
                                                           self.init_bias))

        self.weights = []
        self.biases = []
        for layer in range(len(self.hidden) + 1):
            if layer == 0:
                input_shape = self.n_features
                output_shape = self.hidden[layer]
            elif layer == len(self.hidden):
                input_shape = self.hidden[layer - 1]
                output_shape = self.n_classes
            else:
                input_shape = self.hidden[layer - 1]
                output_shape = self.hidden[layer]                
            w_l = init_layer_weight(input_shape, output_shape)
            b_l = init_layer_bias(output_shape)
            self.weights.append(w_l)
            self.biases.append(b_l)
        
            
    def _set_act_func(self):
        implemented_activations = {'sigmoid': sigmoid,
                                   'ReLU': ReLU,
                                   'linear': linear_derivative}
        # set activation function
        try:
            self.act = implemented_activations[self.activation]
        except KeyError:
            raise Exception('{} not accepted'.format(self.activation))
            
        implemented_derivatives = {'sigmoid': sigmoid_derivative,
                                   'ReLU': ReLU_derivative,
                                   'linear': linear_derivative}
        
        # set activation derivative (da/dz)
        try:
            self.act_derivative = implemented_derivatives[self.activation]
        except KeyError:
            raise Exception('derivative not implemented for {}'.format(self.activation))

        # set activation for last layer (softmax for classification and linear for regression)
        if self.mode == 'classification':
            self.last_act = softmax
        elif self.mode == 'regression':
            self.last_act = linear

    def _set_loss(self):
        implemented_losses = {'cross_entropy': cross_entropy,}
        try:
            self.loss_func = implemented_losses[self.loss]
        except KeyError:
            raise Exception('{} not accepted'.format(self.loss))
    
    def train(self, X, y, n_epochs=10, lr=0.001, n_classes=None):
        self.lr = lr
        self.n_samples, self.n_features = X.shape
        self.classes = n_classes
        if n_classes is None:
            self.classes = set(y)
            self.n_classes = len(self.classes)
        
        y_one_hot = one_hot_encode(y, self.n_classes)
        self._init_neural_network()
        
        print(self.biases[1])
        for e in range(n_epochs):
            # implement shuffle
            # implement batch
            self._feed_forward(X)
            self.loss_e = self.loss_func(y_one_hot, self.activations[-1])
            print('loss', e, self.loss_e)
            self._back_prop(X, y_one_hot)
        
        print(self.biases[1])
                

    
    def _feed_forward(self, X):
        self.activations = []
        self.Z_list = []
        act = self.act
        for layer, (w_l, b_l) in enumerate(zip(self.weights, self.biases)):
            if layer == 0:
                prev = X
            else:
                prev = self.activations[-1]

            if layer == len(self.hidden):
                act = self.last_act
            Z_l = np.dot(prev, w_l) + b_l
            act_l = act(Z_l)    
            self.activations.append(act_l)
            self.Z_list.append(Z_l)
            
    def predict(self, X):
        self._feed_forward(X)
        return self.activations[-1]
            
    def _dE_dZ(self, y, p):
        # dE/dz where E(y) - cross entropy and a(z) is the softmax activation function
        return p - y
            
    def _back_prop(self, X, y):
        y_pred = self.activations[-1]
        
        new_weights, new_biases = [], []
        L = len(self.activations)
        for layer in range(L-1, -1, -1):
            w_l, b_l = self.weights[layer], self.biases[layer]
            Z_l = self.Z_list[layer]
            
            if layer == 0:
                act_prev = X
            else:
                act_prev = self.activations[layer-1]
            
            if layer == L-1:
                self.dE_dz = self._dE_dZ(y, y_pred)
            else:
                dE_da = self.dE_dz @ self.weights[layer+1].T # dE_da wrt activation of current layer
                da_dz = self.act_derivative(Z_l)
                self.dE_dz = np.multiply(da_dz, dE_da)
            
            dE_dW = act_prev.T @ self.dE_dz
            dE_db = np.sum(self.dE_dz, axis=0)
            #print(layer, act_prev.T.shape, self.dE_dz.shape, dE_dW.shape, w_l.shape)
            w_l -= self.lr * dE_dW
            b_l -= self.lr * dE_db
            
            new_weights.append(w_l)
            new_biases.append(b_l)
            
        self.weights = new_weights[::-1]
        self.biases = new_biases[::-1]

In [9]:
# Load Iris Dataset
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
iris = datasets.load_iris()
X = iris.data  
y = iris.target


feature_idxs = [1, 3] # SET FEATURES BY INDEX <------------------

feature_names = ['Sepal Length', 'Sepal Width', 'Petal Length', 'Petal Width']
xlbl, ylbl = feature_names[feature_idxs[0]], feature_names[feature_idxs[1]] 
# We will also split the dataset into training and testing so we can evaluate the kNN classifier
X_trn_, X_test_, y_trn, y_test = train_test_split(X, 
                                                 y, 
                                                 test_size=0.333, 
                                                 random_state=0,
                                                 stratify=y)
X_trn, X_test = X_trn_[:, feature_idxs], X_test_[:, feature_idxs]

print("X_trn.shape = {}, X_test.shape = {}".format(X_trn.shape, X_test.shape))

X_trn.shape = (100, 2), X_test.shape = (50, 2)


In [66]:
X = iris.data[:, :2]
y = iris.target
nn = NeuralNetwork(hidden=(6,), 
                   init_weights='unit_norm', 
                   activation='ReLU',
                   )
nn.train(X_trn, y_trn, n_epochs=100, lr=0.0001)
y_pred_trn = nn.predict(X_trn).argmax(axis=1)
y_pred_test = nn.predict(X_test).argmax(axis=1)

print('trn acc', accuracy_score(y_pred_trn, y_trn))

print('test acc', accuracy_score(y_pred_test, y_test))

[0. 0. 0.]
loss 0 5.94952518477602
loss 1 5.4802021973966335
loss 2 5.032410031042765
loss 3 4.6106821170967285
loss 4 4.2234712466361195
loss 5 3.8832408444212647
loss 6 3.6024557359770224
loss 7 3.384519418752235
loss 8 3.2181016183615028
loss 9 3.084200333980726
loss 10 2.9672382474854384
loss 11 2.858478977981857
loss 12 2.75394746343708
loss 13 2.652015202987791
loss 14 2.5520516291240427
loss 15 2.4538304479739264
loss 16 2.3572941575469173
loss 17 2.2624665692246264
loss 18 2.169423460665378
loss 19 2.07828586360507
loss 20 1.9892222937502595
loss 21 1.9024544421194447
loss 22 1.8182636725847419
loss 23 1.7369963399550317
loss 24 1.6590657051731008
loss 25 1.5849476345801838
loss 26 1.5151667442623178
loss 27 1.4502697087553589
loss 28 1.3907837658615205
loss 29 1.337161621844385
loss 30 1.2897189972294614
loss 31 1.248576665409246
loss 32 1.2136222447011271
loss 33 1.1845050642026873
loss 34 1.160669057662905
loss 35 1.1414167562387185
loss 36 1.1259878895226294
loss 37 1.11363

In [55]:
nn.Wi[1] - nn.Wf[1]

array([[0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.]])

In [57]:
nn.Bi[0] - nn.Bf[0]

array([0., 0., 0., 0., 0., 0.])

In [34]:
nn.biases[1]

array([ 0.02601067,  0.1209011 , -0.14691177])

In [14]:
y_pred_trn.shape, y_trn.shape

((100,), (100,))