In [1]:
#back to the main problem
import numpy as np

def sigmoid(x):
    """Logistic function 1 / (1 + e^(-x)), evaluated with NumPy.

    Args:
        x: a number (or anything ``np.exp`` accepts after negation).

    Returns:
        The value of the logistic function at ``x``.
    """
    denominator = 1 + np.exp(-x)
    return 1/denominator
# element-wise wrapper around sigmoid, built with np.vectorize
vsig = np.vectorize(sigmoid)

def dsig(x):
    """Derivative of the logistic sigmoid, e^(-x) / (1 + e^(-x))^2.

    The derivative is an even function of ``x``, so it is evaluated at
    ``-|x|``: the exponential then lies in (0, 1] and can never overflow.
    (The direct formula produces inf/inf = NaN once ``np.exp(-x)`` overflows
    for very negative ``x``.)

    Args:
        x: a number or NumPy array.

    Returns:
        The sigmoid derivative at ``x`` (element-wise for arrays).
    """
    decay = np.exp(-np.abs(x))
    return decay/((1+decay)**2)
# element-wise wrapper around dsig, built with np.vectorize
vdsig = np.vectorize(dsig)

def dtanh(x):
    """Derivative of tanh, 4 e^(-2x) / (1 + e^(-2x))^2.

    The derivative is an even function of ``x``, so it is evaluated at
    ``-2|x|``: the exponential then lies in (0, 1] and cannot overflow.

    Args:
        x: a number or NumPy array.

    Returns:
        The tanh derivative at ``x`` (element-wise for arrays).
    """
    decay = np.exp(-2*np.abs(x))
    # the value must be returned; without it the vectorized wrapper only yields None
    return 4*decay/((1+decay)**2)
# element-wise wrapper around dtanh, built with np.vectorize
vdtanh = np.vectorize(dtanh)

def softmax(v):
    """Softmax along the first axis of ``v``.

    The maximum along axis 0 is subtracted before exponentiating. This leaves
    the result mathematically unchanged but keeps ``np.exp`` from overflowing
    (which would turn the output into NaN) when entries are large.

    Args:
        v: array-like; a 1-D vector or a column vector (for a 2-D input each
            column is normalized separately, since the sum runs over axis 0).

    Returns:
        An array of the same shape as ``v`` whose entries along axis 0 are
        positive and sum to 1.
    """
    v = np.asarray(v)
    if v.size == 0:
        # nothing to normalize; keep returning an empty array for empty input
        return np.exp(v)
    exps = np.exp(v - np.max(v, axis=0))
    return exps/np.sum(exps, axis=0)

In [20]:
#the basic dense layer class
class denseLayer:
    """A dense (fully connected) layer: ``activation(linear @ x + bias)``.

    Attributes are plain and public because the training code reads and
    reassigns them directly.
    """

    def __init__(self, linear, bias=0, activation=None):
        """Store the layer parameters.

        Args:
            linear: weight matrix applied with ``@`` to the input vector.
            bias: value added after the matrix product (default 0).
            activation: ``None`` for a purely affine layer, or one of the
                strings 'relu', 'sigmoid', 'tanh', 'softmax' (case-insensitive).
        """
        self.linear = linear
        self.activation = activation #string
        self.bias = bias

    def _activationName(self):
        """Return the lower-cased activation name, or None when there is no activation."""
        if self.activation is None:
            return None
        return self.activation.lower()

    #returns a 1-dim'l np array representing the diagonal of the derivative of the activation
    #evaluated at the input vector
    def activationDerDiag(self,input_vect): #input_vect a column vector to be fed into the derivative
        """Diagonal of the activation's derivative at ``input_vect``, as a 1-D array.

        The result is always 1-dimensional so that multiplying it with a row
        vector of matching length is an entry-wise product and does not
        broadcast into a square matrix. ``input_vect`` is never modified.

        Raises:
            ValueError: if the activation name is not one of the supported ones.
        """
        name = self._activationName()
        if name is None:
            return np.ones(len(input_vect))
        elif name == 'relu':
            # 1 where the input is > 0, else 0; built as a new array so the
            # caller's vector is left untouched
            return (np.ravel(input_vect) > 0).astype(float)
        elif name == 'sigmoid':
            return np.ravel(vdsig(input_vect))
        elif name == 'tanh':
            # d/dx tanh(x) = 1 - tanh(x)^2
            return 1.0 - np.tanh(np.ravel(input_vect))**2
        elif name == 'softmax':
            #this is not the actual derivative of the diagonal for softmax
            #the assumption is that softmax only appears at the end of a network, where it's
            #used with cross entropy loss and the gradient is already incorporated in the gradient of the loss
            return np.ones(len(input_vect))
        else:
            raise ValueError('The activation you are trying to use has not yet been implemented. We have relu, sigmoid, tanh, and softmax.')

    #forward pass of a layer without the activation function
    def linearPass(self, input_vector, add_bias = True):
        """Return ``linear @ input_vector``, plus ``bias`` unless ``add_bias`` is false."""
        product = self.linear@input_vector
        if not add_bias:
            return product
        return (product + self.bias)

    #if activation == None, this is the same as linearPass
    def forwardPass(self,input_vector):
        """Apply the affine map and then the layer's activation.

        Raises:
            ValueError: if the activation name is not one of the supported ones.
        """
        name = self._activationName()
        if name is None:
            return self.linearPass(input_vector)
        elif name == 'relu':
            return np.maximum(0,self.linearPass(input_vector))
        elif name == 'sigmoid':
            return vsig(self.linearPass(input_vector))
        elif name == 'tanh':
            return np.tanh(self.linearPass(input_vector))
        elif name == 'softmax':
            return softmax(self.linearPass(input_vector))
        else:
            raise ValueError('The activation you are trying to use has not yet been implemented. We have relu, sigmoid, tanh, and softmax.')

#the basic neural network class, which just allows for a sequence of dense layers
class nNet:
    """A feed-forward network: a sequence of dense layers applied one after another.

    Every layer object must provide ``linear``, ``bias`` and ``activation``
    attributes together with ``forwardPass`` and ``activationDerDiag`` methods.
    """

    def __init__(self,layersList):
        """Store the layers; each layer's output must match the next layer's input."""
        self.layersList = layersList #a list of layers whose outputs match the following inputs

    def forwardPass(self,input_vector, return_all_steps = True):
        """Run ``input_vector`` through every layer in order.

        Args:
            input_vector: the network input.
            return_all_steps: when true (default) return the list
                ``[input, output of layer 1, ..., output of last layer]``,
                which backprop reuses; when false return only the final output.
        """
        #this case is for when training is done
        if not return_all_steps:
            for layer in self.layersList:
                input_vector = layer.forwardPass(input_vector)
            return input_vector

        #keeping track of the output after each layer to save time during backprop
        allStepsOut = [input_vector]
        for layer in self.layersList:
            input_vector = layer.forwardPass(input_vector)
            allStepsOut.append(input_vector)
        return allStepsOut

    def _scaledLossGradient(self, output, target, loss, learning_rate):
        """Return ``learning_rate`` times the loss gradient at the network output, as a row vector.

        Raises:
            ValueError: for an unknown loss name, or for 'cel' when the last
                layer's activation is not softmax.
        """
        lossName = loss.lower()
        if lossName == 'mse': #squared error: gradient is 2*(output - target)
            return learning_rate*2*np.atleast_2d(output-target).T #T to make row vector from column
        if lossName == 'cel': #cross entropy loss - last layer MUST be softmax
            # compare the last layer's *activation name*; comparing the layer
            # object itself to a string is never equal
            lastActivation = self.layersList[-1].activation
            if lastActivation is None or lastActivation.lower() != 'softmax':
                raise ValueError('This implementation of cross-entropy loss requires that the last layer of the network is a softmax')
            # softmax followed by cross entropy: combined gradient is output - target
            return learning_rate*np.atleast_2d(output - target).T
        raise ValueError('The loss you are trying to use has not yet been implemented. We have cel, mse, and ???.')

    #data will be list of pairs of column vectors (x,y)
    def backprop(self, data, loss, learning_rate, batch_size = 1): #this automatically does "stochastic gradient descent"
        """Do one pass over ``data``, updating every layer after each (x, y) pair.

        Args:
            data: iterable of pairs ``(x, y)`` of column vectors.
            loss: 'mse' or 'cel' (case-insensitive).
            learning_rate: step size multiplied into the gradient.
            batch_size: accepted for interface compatibility but not used;
                parameters are updated after every single pair.

        Raises:
            ValueError: see ``_scaledLossGradient``.
        """
        for pair in data:
            xForward = self.forwardPass(pair[0])
            rtTimesGradEtc = self._scaledLossGradient(xForward[-1], pair[1], loss, learning_rate)

            # walk the layers from last to first
            for i in range(len(self.layersList)):
                layer = self.layersList[-i-1]
                layerInput = xForward[-i-2]

                #multiply rtTimesGradEtc by derivatives of activation
                # (flattened so the product stays entry-wise on the row vector)
                preActivation = layer.linear @ layerInput + layer.bias
                rtTimesGradEtc = rtTimesGradEtc*np.ravel(layer.activationDerDiag(preActivation))

                #using matrix equations to store the new layer parameters
                # weight gradient is the outer product (column delta) @ (row input)
                delta = np.atleast_2d(rtTimesGradEtc).T
                newMtx_kminusi = layer.linear - (delta @ np.reshape(layerInput, (1, -1)))
                newBias_kminusi = layer.bias - delta

                #finishing our gradient update (uses the weights from before this step)
                rtTimesGradEtc = rtTimesGradEtc @ layer.linear

                #updating the linear and bias term in the actual model
                layer.linear = newMtx_kminusi
                layer.bias = newBias_kminusi

    def nEpochs(self, n, data, loss, learning_rate, batch_size = 1):
        """Call ``backprop`` on ``data`` ``n`` times with the given settings."""
        for _ in range(0,n):
            self.backprop(data, loss, learning_rate, batch_size)

In [27]:
#
# First test: a least-squares fit through the points (0,1) and (1,2) in R^2.
# Each sample is a pair (x, y) of 1x1 arrays.
#
data = [
    (np.array([[x_value]]), np.array([[y_value]]))
    for x_value, y_value in ((0, 1), (1, 2))
]

In [28]:
# Single-layer model: a random slope (linear term) and a random intercept
# (bias), each drawn uniformly from [-10, 10].
import random

layer = denseLayer(
    linear=np.array([[random.uniform(-10,10)]]),
    bias=np.array([[random.uniform(-10,10)]]),
)

model = nNet(layersList=[layer])

# show the starting slope and intercept
print(layer.linear,layer.bias)

[[-2.46972727]] [[-2.73896382]]


In [29]:
# Train for 50 epochs at each of three learning rates, largest first.
for learning_rate in (.1, .05, .01):
    model.nEpochs(50, data, 'mse', learning_rate, 1)

# Then print the fitted slope and intercept.
print(model.layersList[0].linear,model.layersList[0].bias)

[[0.99895523]] [[1.00065937]]
