In [20]:

def init_layer_weight_unit_norm(input_shape, output_shape):
    return np.random.normal(size=(input_shape, output_shape))


def init_layer_weights_ones(input_shape, output_shape):
    return np.ones((input_shape, output_shape))


def init_layer_bias(length):
    return np.zeros(length)


def softmax(x):
    # https://eli.thegreenplace.net/2016/the-softmax-function-and-its-derivative/
    exp = np.exp(x - np.max(x))
    return exp / np.sum(exp, axis=1)[:, None]


def softmax_gradient(z, sm=None):
    # https://stackoverflow.com/questions/57741998/vectorizing-softmax-cross-entropy-gradient
    if sm is None:
        sm = softmax(z)
    res = np.einsum('ij,ik->ijk', sm, -sm)
    np.einsum('ijj->ij',res)[...] += sm
    return res


def linear(x):
    return x


def linear_derivative(X):
    #np.repeat(np.eye(K, K)[np.newaxis, :, :], N, axis=0).shape
    return np.ones((X.shape[0]))


def sigmoid(x):
    return 1 / (1 + np.exp(-x))


def sigmoid_derivative(x):
    s = sigmoid(x)
    return s * (1 - s)


def ReLU(x):
    return np.maximum(x, 0, x)


def ReLU_derivative(x):
    return (x > 0).astype(int)


def cross_entropy(y_true, y_pred):
    N = len(y_true)
    return -np.sum(y_true * np.log(y_pred)) / N


def cross_entropy_derivative(y_true, y_pred):
    N = len(y_true)
    return -(y_true / y_pred) # / N


def one_hot_encode(y, n_classes):
    y_onehot = np.zeros((len(y), n_classes))
    for i, y_i in enumerate(y):
        y_onehot[i, y_i] = 1
    return y_onehot


def softmax_gradient(z,sm=None):
    if sm is None:
        sm = softmax(z)
    res = np.einsum('ij,ik->ijk',sm,-sm)
    np.einsum('ijj->ij',res)[...] += sm
    return res

def dE_dz__(y, z, sm=None):
    if sm is None:
        sm = softmax(z)
    dE_da = cross_entropy_derivative(y, sm)
    da_dz = softmax_gradient(z, sm)
    return np.einsum('ij,ijk->ik', dE_da, da_dz)


def norm_data(X):
    """
    normalize data to have zero mean and unit variance
    :param X: input data (array) - X.shape = (n_samples, m_features)
    :return:
    """
    mean, std = X.mean(axis=0), X.std(axis=0)
    return (X - mean) / std, (mean, std)


def shuffle_data(X, y):
    idx = np.arange(X.shape[0])
    np.random.shuffle(idx)
    return X[idx], y[idx]

def batch_iterator(X, y, batch_size):
    N, _ = X.shape
    batch_idxs = np.arange(0, N, batch_size)

    for start in batch_idxs:
        stop = start + batch_size
        X_batch, y_batch = X[start:stop], y[start:stop]
        yield X_batch, y_batch


class NeuralNetwork():
    
    def __init__(self, 
                 hidden=(8, 6),
                 init_weights='unit_norm',
                 init_bias='zeros',
                 activation='ReLU',
                 loss='cross_entropy',
                 mode='classification',
                 shuffle=True,
                 verbose=False,
                 batch_size=10,
                 random_state=1):
        self.hidden = hidden
        self.init_weights = init_weights
        self.init_bias = init_bias
        self.activation = activation
        self.loss = loss
        self.random_state = random_state
        self.mode = mode
        self.verbose = verbose
        self.shuffle = shuffle
        self.batch_size = batch_size
        np.random.seed(self.random_state)
        self._set_act_func()
        self._set_loss()
        
        
    def _init_neural_network(self):
        implemented_weight_inits = {'unit_norm': init_layer_weight_unit_norm,
                                    'ones': init_layer_weights_ones
                                   }
        implemented_bias_inits = {'zeros': init_layer_bias,
                                   }
        try:
            init_layer_weight = implemented_weight_inits[self.init_weights]
            init_layer_bias = implemented_bias_inits[self.init_bias]
        except KeyError:
            raise Exception('{} or {} not accepted'.format(self.init_weights,
                                                           self.init_bias))

        self.weights = []
        self.biases = []
        for layer in range(len(self.hidden) + 1):
            if layer == 0:
                input_shape = self.n_features
                output_shape = self.hidden[layer]
            elif layer == len(self.hidden):
                input_shape = self.hidden[layer - 1]
                output_shape = self.n_classes
            else:
                input_shape = self.hidden[layer - 1]
                output_shape = self.hidden[layer]                
            w_l = init_layer_weight(input_shape, output_shape)
            b_l = init_layer_bias(output_shape)
            self.weights.append(w_l)
            self.biases.append(b_l)
        
            
    def _set_act_func(self):
        implemented_activations = {'sigmoid': sigmoid,
                                   'ReLU': ReLU,
                                   'linear': linear_derivative, 
                                    'softmax': softmax}
        # set activation function
        try:
            self.act = implemented_activations[self.activation]
        except KeyError:
            raise Exception('{} not accepted'.format(self.activation))
            
        implemented_derivatives = {'sigmoid': sigmoid_derivative,
                                   'ReLU': ReLU_derivative,
                                   'linear': linear_derivative,
                                    'softmax': softmax_derivative}
        
        # set activation derivative (da/dz)
        try:
            self.act_derivative = implemented_derivatives[self.activation]
        except KeyError:
            raise Exception('derivative not implemented for {}'.format(self.activation))

        # set activation for last layer (softmax for classification and linear for regression)
        if self.mode == 'classification':
            self.last_act = softmax
            self.last_act_grad = softmax_gradient
        elif self.mode == 'regression':
            self.last_act = linear
            self.last_act_grad = linear_gradient


    def _set_loss(self):
        implemented_losses = {'cross_entropy': cross_entropy,}
        loss_gradients = {'cross_entropy': cross_entropy_derivative,}
        try:
            self.loss_func = implemented_losses[self.loss]
            self.loss_grad_func = loss_gradients[self.loss]
        except KeyError:
            raise Exception('{} not accepted'.format(self.loss))
    
    def train(self, X, y, n_epochs=10, lr=0.001, n_classes=None):
        self.lr = lr
        self.n_samples, self.n_features = X.shape
        self.classes = n_classes
        if n_classes is None:
            self.classes = set(y)
            self.n_classes = len(self.classes)
        
        y_one_hot = one_hot_encode(y, self.n_classes)
        self._init_neural_network()
        
        
        for e in range(n_epochs):
            self.loss_e = 0
            # shuffle data
            if self.shuffle:
                X, y_one_hot = shuffle_data(X, y_one_hot)
            # iterate through batches
            for X_batch, y_batch in batch_iterator(X, y_one_hot, self.batch_size):
                self._feed_forward(X_batch)
                self._back_prop(X_batch, y_batch)
                self.loss_batch = self.loss_func(y_batch, self.activations[-1])
                self.loss_e += self.loss_batch
                
        if self.verbose:
            print(e, 'trn loss = {}'.format(self.loss_e))
        print('epoch {}: final trn loss = {}'.format(e, self.loss_e))
            
    
    def _feed_forward(self, X):
        self.activations = []
        self.Z_list = []
        act = self.act
        for layer, (w_l, b_l) in enumerate(zip(self.weights, self.biases)):
            if layer == 0:
                prev = X
            else:
                prev = self.activations[-1]

            if layer == len(self.hidden):
                act = self.last_act
            Z_l = np.dot(prev, w_l) + b_l
            act_l = act(Z_l)    
            self.activations.append(act_l)
            self.Z_list.append(Z_l)
            
    def predict(self, X):
        self._feed_forward(X)
        return self.activations[-1]
    
    
    def _get_gradient(self, y, a, z):
        # https://stackoverflow.com/questions/57741998/vectorizing-softmax-cross-entropy-gradient
        dE_da = self.loss_grad_func(y, a)
        da_dz = self.last_act_grad(z)
        return np.einsum('ij,ijk->ik', dE_da, da_dz)
    '''
    def _dE_dZ(self, y, p):
        # dE/dz where E(y) - cross entropy and a(z) is the softmax activation function
        return p - y
    
    def _get_grad(dE_da, da_dz):
        return np.tensordot(dE_da, da_dz, axes=([-1],[0]))
    '''
    def _back_prop(self, X, y):
        y_pred = self.activations[-1]
        z_last = self.Z_list[-1]
        self.dE_dz = self._get_gradient(y, y_pred, z_last)
        #self.dE_da = self.loss_grad_func(y, y_pred)
        #self.da_dz = self.last_act_grad(z_last) # gradient of last activation layer
        #self.dE_dz = get_dE_dz(self.dE_da, self.da_dz) # gradient at the last layer
        #self.dE_dz_1 = self.dE_dz[...]
        #self.dE_dz_2 = self._dE_dZ(y, y_pred)[...]

        new_weights, new_biases = [], []
        L = len(self.activations)
        for layer in range(L-1, -1, -1):
            w_l, b_l = self.weights[layer], self.biases[layer]
            Z_l = self.Z_list[layer]
            
            if layer == 0:
                act_prev = X
            else:
                act_prev = self.activations[layer-1]
            
            if layer < L-1:
                dE_da = self.dE_dz @ self.weights[layer+1].T # dE_da wrt activation of current layer
                da_dz = self.act_derivative(Z_l)
                self.dE_dz = np.multiply(da_dz, dE_da)
            
            dE_dW = act_prev.T @ self.dE_dz
            dE_db = np.sum(self.dE_dz, axis=0)
            w_l -= self.lr * dE_dW
            b_l -= self.lr * dE_db
            
            new_weights.append(w_l)
            new_biases.append(b_l)
            
        self.weights = new_weights[::-1]
        self.biases = new_biases[::-1]

In [13]:
# Load Iris Dataset
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
iris = datasets.load_iris()
X = iris.data  
y = iris.target


feature_idxs = [1, 3] # SET FEATURES BY INDEX <------------------

feature_names = ['Sepal Length', 'Sepal Width', 'Petal Length', 'Petal Width']
xlbl, ylbl = feature_names[feature_idxs[0]], feature_names[feature_idxs[1]] 
# We will also split the dataset into training and testing so we can evaluate the kNN classifier
X_trn_, X_test_, y_trn, y_test = train_test_split(X, 
                                                 y, 
                                                 test_size=0.333, 
                                                 random_state=0,
                                                 stratify=y)
X_trn, X_test = X_trn_[:, feature_idxs], X_test_[:, feature_idxs]

print("X_trn.shape = {}, X_test.shape = {}".format(X_trn.shape, X_test.shape))

X_trn.shape = (100, 2), X_test.shape = (50, 2)


In [26]:
# Load Iris Dataset
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
iris = datasets.load_iris()
X = iris.data  
y = iris.target


feature_idxs = [1, 3] # SET FEATURES BY INDEX <------------------

feature_names = ['Sepal Length', 'Sepal Width', 'Petal Length', 'Petal Width']
xlbl, ylbl = feature_names[feature_idxs[0]], feature_names[feature_idxs[1]] 
# We will also split the dataset into training and testing so we can evaluate the kNN classifier
X_trn_, X_test_, y_trn, y_test = train_test_split(X, 
                                                 y, 
                                                 test_size=0.333, 
                                                 random_state=0,
                                                 stratify=y)
X_trn, X_test = X_trn_[:, feature_idxs], X_test_[:, feature_idxs]

print("X_trn.shape = {}, X_test.shape = {}".format(X_trn.shape, X_test.shape))
X_trn_norm, (trn_mean, trn_std) = norm_data(X_trn)
X_test_norm = (X_test - trn_mean) / trn_std




X_trn.shape = (100, 2), X_test.shape = (50, 2)


In [40]:
nn = NeuralNetwork(hidden=(6,), 
                   init_weights='unit_norm', 
                   activation='ReLU',
                   shuffle=True,
                   batch_size=50,
                   random_state=1
                   )
nn.train(X_trn_norm, y_trn, n_epochs=100, lr=0.01)
y_pred_trn = nn.predict(X_trn_norm).argmax(axis=1)
y_pred_test = nn.predict(X_test_norm).argmax(axis=1)

print('trn acc', accuracy_score(y_pred_trn, y_trn))
print('test acc', accuracy_score(y_pred_test, y_test))

epoch 99: final trn loss = 0.1913208082881778
trn acc 0.97
test acc 0.94


In [52]:
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.optimizers import SGD


model = Sequential()
model.add(Dense(6, activation='relu', input_dim=X_trn_norm.shape[1]))
model.add(Dense(3, activation='softmax',))

sgd = SGD(lr=0.1, decay=0.0, momentum=0.00, nesterov=False)
model.compile(loss='categorical_crossentropy',
              optimizer=sgd,
              metrics=['accuracy'])

model.fit(X_trn_norm, one_hot_encode(y_trn, 3),
          epochs=100,
          batch_size=50)
y_pred_trn = model.predict(X_trn_norm).argmax(axis=1)
y_pred_test = model.predict(X_test_norm).argmax(axis=1)


print('trn acc', accuracy_score(y_pred_trn, y_trn))
print('test acc', accuracy_score(y_pred_test, y_test))
del model

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100
trn acc 0.95
test acc 0.92
