In [None]:
import numpy as np

import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn import datasets
from sklearn.model_selection import train_test_split

In [None]:
def unit_norm_layer_init(input_shape, output_shape):
    return np.random.normal(size=(input_shape, output_shape))

def ones_layer_init(input_shape, output_shape):
    return np.ones((input_shape, output_shape))

def zeros_init_layer(length):
    return np.zeros(length)

def softmax(x):
    # https://eli.thegreenplace.net/2016/the-softmax-function-and-its-derivative/
    exp = np.exp(x - np.max(x))
    return exp / np.sum(exp, axis=1)[:, None]

def linear(x):
    return x

def linear_derivative(X):
    return np.ones((X.shape[0]))

def softmax_grad(x):
    s = sigmoid(x)
    _, K = s.shape
    return s @ (np.eye(K, K) - s).T

def sigmoid(x):
    return 1 / (1 + np.exp(-x))


def sigmoid_derivative(x):
    s = sigmoid(x)
    return s * (1 - s)

def ReLU(x):
    return np.maximum(x, 0, x)

def ReLU_derivative(x):
    return (x > 0).astype(int)

def cross_entropy(y_true, y_pred):
    N = len(y_true)
    return -np.sum(y_true*np.log(y_pred)) / N

def cross_entropy_derivative(y_true, y_pred):
    N = len(y_true)
    return -y_true/y_pred / N

def onehot_encode(y):
    y = np.array(y)
    y_onehot = np.zeros((len(y), max(y)+1))
    for i, y_i in enumerate(y):
        y_onehot[i, y_i] = 1
    return y_onehot

def one_hot_encode(y, n_classes):
    y_onehot = np.zeros((len(y), n_classes))
    for i, y_i in enumerate(y):
        y_onehot[i, y_i] = 1
    return y_onehot
'''
def softmax_grad(x):
    # da / dZ
    # Reshape the 1-d softmax to 2-d so that np.dot will do the matrix multiplication
    s = softmax(x).reshape(-1,1)
    return np.diagflat(s) - np.dot(s, s.T)
'''
class NeuralNetwork():
    
    def __init__(self, 
                 hidden=(8, 6),
                 init_weights='unit_norm',
                 init_bias='zeros',
                 activation='sigmoid',
                 loss='cross_entropy',
                 mode='classification',
                 random_state=1):
        self.hidden = hidden
        self.init_weights = init_weights
        self.init_bias = init_bias
        self.activation = activation
        self.loss = loss
        self.random_state = random_state
        self.mode = mode
        np.random.seed(self.random_state)
        self._set_act_func()
        self._set_loss()
        
        
    def _init_neural_network(self):
        implemented_weight_inits = {'unit_norm': unit_norm_layer_init,
                                    'ones': ones_layer_init
                                   }
        implemented_bias_inits = {'zeros': zeros_init_layer,
                                   }
        try:
            init_layer_weight = implemented_weight_inits[self.init_weights]
            init_layer_bias = implemented_bias_inits[self.init_bias]
        except KeyError:
            raise Exception('{} or {} not accepted'.format(self.init_weights,
                                                           self.init_bias))

        self.weights = []
        self.biases = []
        for layer in range(len(self.hidden) + 1):
            if layer == 0:
                input_shape = self.n_features
                output_shape = self.hidden[layer]
            elif layer == len(self.hidden):
                input_shape = self.hidden[layer - 1]
                output_shape = self.n_classes
            else:
                input_shape = self.hidden[layer - 1]
                output_shape = self.hidden[layer]                
            w_l = init_layer_weight(input_shape, output_shape)
            b_l = init_layer_bias(output_shape)
            self.weights.append(w_l)
            self.biases.append(b_l)
        
            
    def _set_act_func(self):
        implemented_activations = {'sigmoid': sigmoid,
                                   'ReLU': ReLU,
                                   'linear': linear_derivative}
        # set activation function
        try:
            self.act = implemented_activations[self.activation]
        except KeyError:
            raise Exception('{} not accepted'.format(self.activation))
            
        implemented_derivatives = {'sigmoid': sigmoid_derivative,
                                   'ReLU': ReLU_derivative,
                                   'linear': linear_derivative}
        
        # set activation derivative (da/dz)
        try:
            self.act_derivative = implemented_derivatives[self.activation]
        except KeyError:
            raise Exception('derivative not implemented for {}'.format(self.activation))

        # set activation for last layer (softmax for classification and linear for regression)
        if self.mode == 'classification':
            self.last_act = softmax
        elif self.mode == 'regression':
            self.last_act = linear

    def _set_loss(self):
        implemented_losses = {'cross_entropy': cross_entropy,}
        try:
            self.loss_func = implemented_losses[self.loss]
        except KeyError:
            raise Exception('{} not accepted'.format(self.loss))
    
    def train(self, X, y, n_epochs=10, lr=0.001, n_classes=None):
        self.lr = lr
        self.n_samples, self.n_features = X.shape
        self.classes = n_classes
        if n_classes is None:
            self.classes = set(y)
            self.n_classes = len(self.classes)
        
        y_one_hot = one_hot_encode(y, self.n_classes)
        self._init_neural_network()
        
        for e in range(n_epochs):
            # implement shuffle
            # implement batch
            self._feed_forward(X)
            self.loss_e = self.loss_func(y_one_hot, self.activations[-1])
            print('loss', e, self.loss_e)
            self._back_prop(X, y_one_hot)

    
    def _feed_forward(self, X):
        self.activations = []
        self.Z_list = []
        #self.Z_list.append(X)
        act = self.act
        for layer, (w_l, b_l) in enumerate(zip(self.weights, self.biases)):
            if layer == 0:
                prev = X
            else:
                prev = self.activations[-1]

            if layer == len(self.hidden):
                act = self.last_act
            Z_l = np.dot(prev, w_l) + b_l
            act_l = act(Z_l)    
            self.activations.append(act_l)
            self.Z_list.append(Z_l)
            
    def predict(self, X):
        self._feed_forward(X)
        return self.activations[-1]
            
    def _dE_dZ(self, y, p):
        # dE/dz where E(y) - cross entropy and a(z) is the softmax activation function
        return p - y
            
    def _back_prop(self, X, y):
        y_pred = self.activations[-1]
        
        # self.dE_dp = cross_entropy_derivative(y, y_pred)
        new_weights, new_biases = [], []
        L = len(self.activations)
        for layer in range(L-1, -1, -1):
            w_l, b_l = self.weights[layer], self.biases[layer]
            Z_l = self.Z_list[layer]
            
            if layer == 0:
                act_prev = X
            else:
                act_prev = self.activations[layer-1]
            
            if layer == L-1:
                self.dE_dz = self._dE_dZ(y, y_pred)
            else:
                dE_da = self.dE_dz @ self.weights[layer+1].T # dE_da wrt activation of current layer
                da_dz = self.act_derivative(Z_l)
                self.dE_dz = np.multiply(da_dz, dE_da)
            
            dE_dW = act_prev.T @ self.dE_dz
            dE_db = np.sum(self.dE_dz, axis=0)
            print(layer, act_prev.T.shape, self.dE_dz.shape, dE_dW.shape, w_l.shape)
            w_l -= self.lr * dE_dW
            b_l -= self.lr * dE_db
            
            new_weights.append(w_l)
            new_biases.append(b_l)
            
        self.weights = new_weights[::-1]
        self.biases = new_biases[::-1]
            #Z_l = self.Z_list[layer]
            #self.act_l = self.activations[layer]
            #prev_act = self.activations[layer-1]
            
            
            #self.da_dz = softmax_grad(Z_l)
            #if layer == 0: # last layer
            #    dE_dZ_l = dE_dyp # N vec X N vec

In [None]:
# Load Iris Dataset
from sklearn import datasets
from sklearn.metrics import accuracy_score
iris = datasets.load_iris()
X = iris.data  
y = iris.target

# For illustration purposes we will only be using the two features in the dataset
feature_idxs = [1, 3] # SET FEATURES BY INDEX <------------------

feature_names = ['Sepal Length', 'Sepal Width', 'Petal Length', 'Petal Width']
xlbl, ylbl = feature_names[feature_idxs[0]], feature_names[feature_idxs[1]] 
# We will also split the dataset into training and testing so we can evaluate the kNN classifier
X_trn_, X_test_, y_trn, y_test = train_test_split(X, 
                                                 y, 
                                                 test_size=0.333, 
                                                 random_state=0,
                                                 stratify=y)
X_trn, X_test = X_trn_[:, feature_idxs], X_test_[:, feature_idxs]

print("X_trn.shape = {}, X_test.shape = {}".format(X_trn.shape, X_test.shape))

In [None]:

X = iris.data[:, :2]
y = iris.target
nn = NeuralNetwork(hidden=(8,6), 
                   init_weights='unit_norm', 
                   activation='ReLU',
                   )
nn.train(X_trn, y_trn, n_epochs=100, lr=0.0001)
y_pred_trn = nn.predict(X_trn).argmax(axis=1)
y_pred_test = nn.predict(X_test).argmax(axis=1)
print(accuracy_score(y_pred_trn, y_trn))
print(accuracy_score(y_pred_test, y_test))


In [282]:
X = np.random.rand(100, 2)
y = np.random.randint(0, 2, size=100)

nn = NeuralNetwork(hidden=(8,6), 
                   init_weights='unit_norm', 
                   activation='ReLU',
                   )
nn.train(X, y, n_epochs=100, lr=0.001)



loss 0 1.3732304002112679
2 (6, 100) (100, 2) (6, 2) (6, 2)
1 (8, 100) (100, 6) (8, 6) (8, 6)
0 (2, 100) (100, 8) (2, 8) (2, 8)
loss 1 0.7325920254029691
2 (6, 100) (100, 2) (6, 2) (6, 2)
1 (8, 100) (100, 6) (8, 6) (8, 6)
0 (2, 100) (100, 8) (2, 8) (2, 8)
loss 2 0.7117990177688739
2 (6, 100) (100, 2) (6, 2) (6, 2)
1 (8, 100) (100, 6) (8, 6) (8, 6)
0 (2, 100) (100, 8) (2, 8) (2, 8)
loss 3 0.7050309385087026
2 (6, 100) (100, 2) (6, 2) (6, 2)
1 (8, 100) (100, 6) (8, 6) (8, 6)
0 (2, 100) (100, 8) (2, 8) (2, 8)
loss 4 0.7026680684655983
2 (6, 100) (100, 2) (6, 2) (6, 2)
1 (8, 100) (100, 6) (8, 6) (8, 6)
0 (2, 100) (100, 8) (2, 8) (2, 8)
loss 5 0.7010232938654418
2 (6, 100) (100, 2) (6, 2) (6, 2)
1 (8, 100) (100, 6) (8, 6) (8, 6)
0 (2, 100) (100, 8) (2, 8) (2, 8)
loss 6 0.6998592260929809
2 (6, 100) (100, 2) (6, 2) (6, 2)
1 (8, 100) (100, 6) (8, 6) (8, 6)
0 (2, 100) (100, 8) (2, 8) (2, 8)
loss 7 0.6988646999495286
2 (6, 100) (100, 2) (6, 2) (6, 2)
1 (8, 100) (100, 6) (8, 6) (8, 6)
0 (2, 100)

In [256]:
['a', 'b'][::-1]

['b', 'a']

In [234]:
np.ones((2,2))

array([[1., 1.],
       [1., 1.]])

In [32]:
L = 3
for i in range(L-1, -1, -1):
    print(i)

2
1
0


In [6]:
X = np.random.rand(100, 2)
y = np.random.randint(0, 2, size=100)

nn = NeuralNetwork(hidden=(8,6), init_weights='unit_norm', activation='ReLU')
nn.train(X, y)

(3, 1)

In [265]:
X = [[1, 1], [0, 0], [0, 1]]
y = [1, 0, 1]

X, y = np.array(X), np.array(y)
nn = NeuralNetwork(hidden=(8,6), init_weights='unit_norm', activation='ReLU')
nn.train(X, y)
print(nn.loss_e)
for i in range(len(nn.activations)):
    
    print(i, nn.weights[i], '-', nn.biases[i],'-', nn.activations[i])
    
for i in range(len(nn.activations)):
    print(nn.weights[i].shape, '-', nn.biases[i].shape,'-', nn.activations[i].shape, '-', nn.Z_list[i].shape)

print(nn.dE_dz.shape)


loss 0.6941128459836579
2 (6, 3) (3, 2) (6, 2) (6, 2)
1 (8, 3) (3, 6) (8, 6) (8, 6)
0 (2, 3) (3, 8) (2, 8) (2, 8)
loss 0.6942956806651174
2 (6, 3) (3, 2) (6, 2) (6, 2)
1 (8, 3) (3, 6) (8, 6) (8, 6)
0 (2, 3) (3, 8) (2, 8) (2, 8)
loss 0.694474433866695
2 (6, 3) (3, 2) (6, 2) (6, 2)
1 (8, 3) (3, 6) (8, 6) (8, 6)
0 (2, 3) (3, 8) (2, 8) (2, 8)
loss 0.694653732909699
2 (6, 3) (3, 2) (6, 2) (6, 2)
1 (8, 3) (3, 6) (8, 6) (8, 6)
0 (2, 3) (3, 8) (2, 8) (2, 8)
loss 0.694833579551987
2 (6, 3) (3, 2) (6, 2) (6, 2)
1 (8, 3) (3, 6) (8, 6) (8, 6)
0 (2, 3) (3, 8) (2, 8) (2, 8)
loss 0.6950139755569036
2 (6, 3) (3, 2) (6, 2) (6, 2)
1 (8, 3) (3, 6) (8, 6) (8, 6)
0 (2, 3) (3, 8) (2, 8) (2, 8)
loss 0.6951949226932953
2 (6, 3) (3, 2) (6, 2) (6, 2)
1 (8, 3) (3, 6) (8, 6) (8, 6)
0 (2, 3) (3, 8) (2, 8) (2, 8)
loss 0.6953764227355276
2 (6, 3) (3, 2) (6, 2) (6, 2)
1 (8, 3) (3, 6) (8, 6) (8, 6)
0 (2, 3) (3, 8) (2, 8) (2, 8)
loss 0.6955584774635007
2 (6, 3) (3, 2) (6, 2) (6, 2)
1 (8, 3) (3, 6) (8, 6) (8, 6)
0 (2, 3

In [236]:
s1 = np.sum(nn.dE_dz, axis=0)
np.ones(nn.dE_dz)

(3, 2)

In [241]:
?np.random.rand

In [61]:
softmax_grad(np.array([[0.2, 0.1, 0.7], [0.2, 0.5, 0.3], [0.2, 0.6, 0.2], [0.7, 0.1, 0.2]]))


ValueError: operands could not be broadcast together with shapes (3,3) (4,3) 

In [75]:
s = softmax(np.array([[0.2, 0.1, 0.7], [0.2, 0.5, 0.3], [0.2, 0.6, 0.2], [0.7, 0.1, 0.2]]))
s

array([[0.28140804, 0.25462853, 0.46396343],
       [0.28943311, 0.39069383, 0.31987306],
       [0.28638322, 0.42723356, 0.28638322],
       [0.46396343, 0.25462853, 0.28140804]])

In [78]:
N, K = s.shape
I = np.repeat(np.eye(K, K)[np.newaxis, :, :], N, axis=0)
(I - s @ np.eye(K, K))

ValueError: operands could not be broadcast together with shapes (4,3,3) (4,3) 

In [103]:
tmp = np.multiply.outer(s, np.eye(1, K))

In [135]:
s2 = s[0, : , np.newaxis]


(3, 1)

In [160]:
# http://saitcelebi.com/tut/output/part2.html

s0 = s[0, : , np.newaxis]
tmp = np.identity(K) - np.matmul(np.ones((K, 1)), s0.T)
tmp2 = np.matmul(s0, np.ones((1, K)))
tmp3 = tmp2 * tmp
tmp3

array([[ 0.20221756, -0.07165452, -0.13056304],
       [-0.07165452,  0.18979284, -0.11813832],
       [-0.13056304, -0.11813832,  0.24870137]])

In [161]:
np.matmul(np.ones((K, 1)), s0.T).shape

(3, 3)

In [164]:
np.ones((K, 1)).shape, s0.T.shape, np.matmul(np.ones((K, 1)), s0.T).shape

((3, 1), (1, 3), (3, 3))

In [182]:
I2 = np.ones((N, K))[:, : , np.newaxis]
s2 = s[:, : , np.newaxis]

print(I2.shape, s2.T.shape), 
print(np.tensordot(I2, s2.T, axes=([2],[0])).shape) #np.matmul(I2, s2.T).shape np.tensordot(I2, s2.T, axes=([2,1],[0,1])).shape

(4, 3, 1) (1, 3, 4)
(4, 3, 3, 4)


In [157]:
tmp = I - np.matmul(I, s2.T)
tmp2 = np.matmul(s2, np.ones((1, K)))
tmp3 = tmp2 * tmp
tmp3

ValueError: operands could not be broadcast together with shapes (4,3,3) (4,3,4) 

In [159]:
np.matmul(I, s2.T).shape

(4, 3, 4)

In [73]:
a = np.eye(2, 2)
print(a.shape)
# (2,  2)

# indexing with np.newaxis inserts a new 3rd dimension, which we then repeat the
# array along, (you can achieve the same effect by indexing with None, see below)


(2, 2)


array([[1., 0.],
       [0., 1.]])

In [201]:
y = [0, 1, 2, 1, 0]
y_ = np.zeros( (len(y), len(set(y))))
y_[:, y[:]] = 1
y_

array([[1., 1., 1.],
       [1., 1., 1.],
       [1., 1., 1.],
       [1., 1., 1.],
       [1., 1., 1.]])