<div class="alert alert-block alert-success">
<b>CREATING LAYERS: FORWARD AND BACKWARD PASS</b>
</div>

In [3]:
import numpy as np

In [4]:
# DENSE LAYER
class Layer_Dense:
    # LAYER INITIALIZATION
    def __init__(self,n_inputs,n_neurons):
        # INITIALIZE WEIGHTS AND BIASES
        self.weights=0.01*np.random.randn(n_inputs,n_neurons)
        self.biases=np.zeros((1,n_neurons))
    # FORWARD PASS
    def forward(self,inputs):
        # REMEMBER INPUT VALUES
        self.inputs=inputs
        # CALULATE OUTPUT VALUES FROM INPUTS ONES,WEIGHTS AND BIASES
        self.output=np.dot(inputs,self.weights)+self.biases
    # BACKWARD PASS
    def backward(self, dvalues):
        # GRADIENT ON PARAMETERS
        self.dweights=np.dot(self.inputs.T,dvalues)
        self.dbiases=np.sum(dvalues,axis=0,keepdims=True)
        # GRADIENT ON INPUTS
        self.dinputs=np.dot(dvalues,self.weights.T)
        

<div class="alert alert-block alert-success">
<b>RELU ACTIVATION : FORWARD AND BACKWARD PASS</b>
</div>

In [5]:
# ReLU activation
class Activation_ReLU:
    # Forward pass
    def forward(self, inputs):
        # Remember input values
        self.inputs = inputs
        # Calculate output values from inputs
        self.output = np.maximum(0, inputs)

    # Backward pass
    def backward(self, dvalues):
        # Since we need to modify the original variable,
        # let’s make a copy of values first
        self.dinputs = dvalues.copy()
        # Zero gradient where input values were negative
        self.dinputs[self.inputs <= 0] = 0


<div class="alert alert-block alert-success">
<b>SOFTMAX ACTIVATION : FORWARD PASS</b>
</div>

In [6]:
# SOFTMAX ACTIVATION
class Activation_Softmax:
    # FORWARD PASS 
    def forward(self, inputs):
        # GET UNNORAMLIZED PROBABILITIES
        exp_values=np.exp(inputs-np.max(inputs,axis=1,keepdims=True))
        # NORMALIZE  THEM FOR EACH SAMPLE
        probabilities=exp_values/np.sum(exp_values,axis=1,keepdims=True)

        self.output=probabilities

<div class="alert alert-block alert-success">
<b>LOSS</b>
</div>

In [7]:
class Loss:
    # CALCULATE THE DATA AND REGULARIZATION LOSSES
    # GIVEN MODEL OUTPUT AND GROUND TRUTH VALUES
    def calculate(self,output,y):
        # CALCULATE THE SAMPLE LOSS
        sample_losses=self.forward(output,y)
        # CALCULATE THE MEAN LOSS OVER ALL SAMPLES
        data_loss=np.mean(sample_losses)
        # RETRUN DATA LOSS
        return data_loss

<div class="alert alert-block alert-success">
<b>CATEGORICAL CROSS ENTROPY LOSS: FORWARD AND BACKWARD PASS</b>
</div>

In [8]:
class Loss_CategoricalCrossentropy(Loss):
    # FORWARD PASS
    def forward(self, y_pred, y_true):
        # NUMBER OF SAMPLES IN A BATCH
        samples = len(y_pred)
        # CLIP DATA TO PREVENT DIVISION BY ZERO
        # CLIP BOTH SIDES TO NOT DRAG TOWARDS ANY VALUE
        y_pred_clipped = np.clip(y_pred, 1e-7, 1e-7)
        # Probailities for target labels
        # ONLY IF CATEGORICAL LABELS
        if len(y_true.shape) ==1:
            correct_confidences=y_pred_clipped[range(samples),y_true]
        # MASK VALUE - ONLY FOR ONE HOT ENCODED LABELS
        elif len(y_true.shape)==2:
            correct_confidences=np.sum(y_pred_clipped*y_true, axis=1)
        
        neg_log_likelihoods=-np.log(correct_confidences)
        return neg_log_likelihoods
    def backward(self,davalues,y_true):
        # NUMBER OF SAMPLES
        samples=len(davalues)
        # NUMBER OF LABELS IN EVERY SAMPLE
        # WE WILL USE THE FIRST SAMPLE TO COUNT THEM
        labels=len(davalues[0])
        # IF LABELS ARE SPARSE ,TURN THEM INTO ONE HOT VECTOR
        if len(y_true.shape)==1:
            y_true=np.eye(labels)[y_true]
        
        # CALULATE GRADIENT  
        self.dinputs=-y_true/davalues
        # NORMALIZE GRADIENT 
        self.dinputs=self.dinputs/samples

    

<div class="alert alert-block alert-success">
<b> COMBINED SOFTMAX ACTIVATION AND CROSS ENTROPY LOSS FOR FASTER BACKWARD STEP</b>
</div>

In [9]:
# SOFTMAX CLASSIFIER - COMBINED SOFTMAX ACTIVATION
# CROSS ENTROPY LOSS FOR FASTER BACKWARD STEP
class Activation_Softmax_Loss_CategoricalCrossentropy():
    #CREATES ACTIVATION AND LOSS FUNCTION OBJECTS
    def __init__(self):
        self.activation=Activation_Softmax()
        self.loss=Loss_CategoricalCrossentropy()
        
    #FORWARD PASS
    def forward(self,inputs,y_true):
        #OUTPUT LAYERS ACTIVATION FUNCTIONS
        self.activation.forward(inputs)
        # SET THE OUTPUT
        self.output=self.activation.output
        # CALCULATE AND RETURN LOSS VALUE
        return self.loss.calculate(self.output,y_true)
    
    #BACKWARD PASS
    def backward(self,dvalues,y_true):
        # NUMBER OF SAMPLES
        samples = len(y_true)
        # IF LABELS ARE ONE HOT ENCODED
        # TURN THEM INTO DISCRETE VALUE
        if len(y_true.shape)==2:
            y_true=np.argmax(y_true,axis=1)
        # COPY SO WE CAN SAFELY MODIFY
        self.dinputs=dvalues.copy()
        # CALCULATE GRADIENT
        self.dinputs[range(samples),y_true]-=1
        # NORMALIZE GRADIENT
        self.dinputs=self.dinputs/samples


<div class="alert alert-block alert-success">
<b> FULL CODE UPTO THIS POINT : FORWARD AND BACKWARD PASS</b>
</div>

In [10]:
#CREATE DATASET
import nnfs
from nnfs.datasets import spiral_data
nnfs.init()
# Create dataset
X, y = spiral_data(samples=100, classes=3)
# Create Dense layer with 2 input features and 3 output values
dense1 = Layer_Dense(2, 3)
# Create ReLU activation (to be used with Dense layer):
activation1 = Activation_ReLU()
# Create second Dense layer with 3 input features (as we take output
# of previous layer here) and 3 output values (output values)
dense2 = Layer_Dense(3, 3)
# Create Softmax classifier’s combined loss and activation
loss_activation = Activation_Softmax_Loss_CategoricalCrossentropy()
# Perform a forward pass of our training data through this layer
dense1.forward(X)
# Perform a forward pass through activation function
# takes the output of first dense layer here
activation1.forward(dense1.output)
# Perform a forward pass through second Dense layer
# takes outputs of activation function of first layer as inputs
dense2.forward(activation1.output)
# Perform a forward pass through the activation/loss function
# takes the output of second dense layer here and returns loss
loss = loss_activation.forward(dense2.output, y)

# Let’s see output of the first few samples:
print(loss_activation.output[:5])
# Print loss value
print('loss:', loss)
# Calculate accuracy from output of activation2 and targets
# calculate values along first axis
predictions = np.argmax(loss_activation.output, axis=1)

print(predictions)
if len(y.shape) == 2:
 y = np.argmax(y, axis=1)
accuracy = np.mean(predictions == y)
# Print accuracy
print('acc:', accuracy)
# Backward pass
loss_activation.backward(loss_activation.output, y)
dense2.backward(loss_activation.dinputs)
activation1.backward(dense2.dinputs)
dense1.backward(activation1.dinputs)
# Print gradients
print(dense1.dweights)
print(dense1.dbiases)
print(dense2.dweights)
print(dense2.dbiases)


[[0.33333334 0.33333334 0.33333334]
 [0.3333332  0.3333332  0.33333364]
 [0.3333329  0.33333293 0.3333342 ]
 [0.3333326  0.33333263 0.33333477]
 [0.33333233 0.3333324  0.33333528]]
loss: 16.118095
[0 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 2 2 2 0 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 0 2 0 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 0 2 2 2 2 2 2 2 2 2 0 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2]
acc: 0.34
[[ 1.5766357e-04  7.8368583e-05  4.7324400e-05]
 [ 1.8161038e-04  1.1045573e-05 -3.3096312e-05]]
[[-3.60553473e-04  9.66117223e-05 -1.03671395e-04]]
[[ 5.44109462e-05  1.07411419e-04 

<div class="alert alert-block alert-success">
<b>OPTIMIZERS GRADIENT DESCENT</b>
</div>

In [11]:
# SDG OPTIMIZER
class Optimizer_SGD:
    # INITIALIZE OPTIMIZER - SET SETTINGS
    # LEARNING RATE OF 1, IS DEFAULT FOR THIS OPTIMIZER
    def __init__(self,learning_rate=1):
        self.learning_rate=learning_rate
    # UPDATE PARAMETERS
    def update_params(self,layer):
        layer.weights+=-self.learning_rate*layer.dweights
        layer.biases+=-self.learning_rate*layer.dbiases

In [42]:
# CREATE DATASET
X,y=spiral_data(samples=100,classes=3)
#CREATE DENSE LAYER WITH @ INPUT FEATURES AND 64 OUTPUT VALUES
dense1=Layer_Dense(2,64)
#CREATE RELU ACTIVATION FUNCTION
activation1=Activation_ReLU()
#CREATE DENSE LAYER WITH 64 INPUT FEATURES AND 3 OUTPUT VALUES
dense2=Layer_Dense(64,3)
#CREATE SOFTMAX ACTIVATION FUNCTION
loss_activation=Activation_Softmax_Loss_CategoricalCrossentropy()
#CREATE OPTIMIZER
optimizer=Optimizer_SGD()
# TRAINING IN LOOP 
for epochs in range(10000):
    # FORWARD PASS
    dense1.forward(X)
    activation1.forward(dense1.output)
    dense2.forward(activation1.output)
    loss=loss_activation.forward(dense2.output,y)

    predictions=np.argmax(loss_activation.output,axis=1)
    if len(y.shape)==2:
        y=np.argmax(y,axis=1)
    accuracy=np.mean(predictions==y)

    if not epochs %100:
        print(f'Epoch: {epochs}, Accuracy: {accuracy:.2f}, Loss: {loss:.2f}')
    # BACKWARD PASS
    loss_activation.backward(loss_activation.output, y)
    dense2.backward(loss_activation.dinputs)
    activation1.backward(dense2.dinputs)
    dense1.backward(activation1.dinputs)

    # UPDATE PARAMS
    optimizer.update_params(dense1)
    optimizer.update_params(dense2)
                   

Epoch: 0, Accuracy: 0.35, Loss: 16.12
Epoch: 100, Accuracy: 0.41, Loss: 16.12
Epoch: 200, Accuracy: 0.41, Loss: 16.12
Epoch: 300, Accuracy: 0.43, Loss: 16.12
Epoch: 400, Accuracy: 0.42, Loss: 16.12
Epoch: 500, Accuracy: 0.42, Loss: 16.12
Epoch: 600, Accuracy: 0.42, Loss: 16.12
Epoch: 700, Accuracy: 0.43, Loss: 16.12
Epoch: 800, Accuracy: 0.45, Loss: 16.12
Epoch: 900, Accuracy: 0.50, Loss: 16.12
Epoch: 1000, Accuracy: 0.45, Loss: 16.12
Epoch: 1100, Accuracy: 0.46, Loss: 16.12
Epoch: 1200, Accuracy: 0.44, Loss: 16.12
Epoch: 1300, Accuracy: 0.45, Loss: 16.12
Epoch: 1400, Accuracy: 0.42, Loss: 16.12
Epoch: 1500, Accuracy: 0.45, Loss: 16.12
Epoch: 1600, Accuracy: 0.44, Loss: 16.12
Epoch: 1700, Accuracy: 0.49, Loss: 16.12
Epoch: 1800, Accuracy: 0.47, Loss: 16.12
Epoch: 1900, Accuracy: 0.47, Loss: 16.12
Epoch: 2000, Accuracy: 0.50, Loss: 16.12
Epoch: 2100, Accuracy: 0.49, Loss: 16.12
Epoch: 2200, Accuracy: 0.52, Loss: 16.12
Epoch: 2300, Accuracy: 0.52, Loss: 16.12
Epoch: 2400, Accuracy: 0.53,

<div class="alert alert-block alert-success">
<b>OPTIMIZERS: LEARNING DECAY</b>
</div>

In [14]:
class Optimizer_SGD:
    # INITIAL OTPIMIZER  SET SETTING
    # LEARNING RATE OF 1 
    def __init__(self,learning_rate=1.0,decay=0.0):
        self.learning_rate=learning_rate
        self.current_learning_rate=learning_rate
        self.decay=decay
        self.iterations=0
    
    # CALL ONCE BEFORE ANY PARAMETER UPDATES
    def pre_update_params(self):
        if self.decay:
            self.current_learning_rate=self.learning_rate*(1./(1.+self.decay*self.iterations))
    # UPDATE PARAMETERS
    def update_params(self,layer):
        layer.weights+=-self.current_learning_rate*layer.dweights
        layer.biases+=-self.current_learning_rate*layer.dbiases
    
    # CALL ONCE AFTER ANY PARAMETER UPDATES
    def post_update_params(self):
        self.iterations+=1

In [17]:
import numpy as np

# Assuming the necessary classes (Layer_Dense, Activation_ReLU, 
# Activation_Softmax_Loss_CategoricalCrossentropy, and spiral_data) are defined elsewhere

X, y = spiral_data(samples=100, classes=3)

# Create Dense layer with 2 input features and 64 output values
dense1 = Layer_Dense(2, 64)

# Create ReLU activation (to be used with Dense layer)
activation1 = Activation_ReLU()

# Create second Dense layer with 64 input features (as we take output
# of previous layer here) and 3 output values (output values)
dense2 = Layer_Dense(64, 3)

# Create Softmax classifier's combined loss and activation
loss_activation = Activation_Softmax_Loss_CategoricalCrossentropy()

# Create optimizer
optimizer = Optimizer_SGD(decay=1e-3)

# Train in loop
for epoch in range(10001):
    # Perform a forward pass of our training data through this layer
    dense1.forward(X)
    
    # Perform a forward pass through activation function
    # takes the output of first dense layer here
    activation1.forward(dense1.output)
    
    # Perform a forward pass through second Dense layer
    # takes outputs of activation function of first layer as inputs
    dense2.forward(activation1.output)
    
    # Perform a forward pass through the activation/loss function
    # takes the output of second dense layer here and returns loss
    loss = loss_activation.forward(dense2.output, y)
    
    # Calculate accuracy from output of activation2 and targets
    # calculate values along first axis
    predictions = np.argmax(loss_activation.output, axis=1)
    if len(y.shape) == 2:
        y = np.argmax(y, axis=1)
    accuracy = np.mean(predictions == y)
    
    if not epoch % 100:
        print(f'epoch: {epoch}, ' +
              f'acc: {accuracy:.3f}, ' +
              f'loss: {loss:.3f}, ' +
              f'lr: {optimizer.current_learning_rate}')
    
    # Backward pass
    loss_activation.backward(loss_activation.output, y)
    dense2.backward(loss_activation.dinputs)
    activation1.backward(dense2.dinputs)
    dense1.backward(activation1.dinputs)
    
    # Update weights and biases
    optimizer.pre_update_params()
    optimizer.update_params(dense1)
    optimizer.update_params(dense2)
    optimizer.post_update_params()


epoch: 0, acc: 0.340, loss: 16.118, lr: 1.0
epoch: 100, acc: 0.420, loss: 16.118, lr: 0.9099181073703367
epoch: 200, acc: 0.417, loss: 16.118, lr: 0.8340283569641367
epoch: 300, acc: 0.430, loss: 16.118, lr: 0.7698229407236336
epoch: 400, acc: 0.423, loss: 16.118, lr: 0.7147962830593281
epoch: 500, acc: 0.427, loss: 16.118, lr: 0.66711140760507
epoch: 600, acc: 0.430, loss: 16.118, lr: 0.6253908692933083
epoch: 700, acc: 0.440, loss: 16.118, lr: 0.5885815185403178
epoch: 800, acc: 0.430, loss: 16.118, lr: 0.5558643690939411
epoch: 900, acc: 0.430, loss: 16.118, lr: 0.526592943654555
epoch: 1000, acc: 0.450, loss: 16.118, lr: 0.5002501250625312
epoch: 1100, acc: 0.490, loss: 16.118, lr: 0.4764173415912339
epoch: 1200, acc: 0.497, loss: 16.118, lr: 0.45475216007276037
epoch: 1300, acc: 0.497, loss: 16.118, lr: 0.43497172683775553
epoch: 1400, acc: 0.507, loss: 16.118, lr: 0.4168403501458941
epoch: 1500, acc: 0.510, loss: 16.118, lr: 0.4001600640256102
epoch: 1600, acc: 0.510, loss: 16.11

<div class="alert alert-block alert-success">
<b>OPTIMIZERS: MOMENTUM</b>
</div>

In [22]:
import numpy as np

class Optimizer_SGD:
    # INITIALIZE OPTIMIZER SET SETTINGS
    # LEARNING RATE OF 1.
    def __init__(self,learning_rate=1.0,decay=0.0,momentum=0):
        self.learning_rate=learning_rate
        self.current_learning_rate=learning_rate
        self.decay=decay
        self.iterations=0
        self.momentum=momentum
    # CALL ONCE BEFORE ANY PARAMETER UPDATES
    def pre_update_params(self):
        if self.decay:
            self.current_learning_rate=self.learning_rate*(1./(1.+self.decay*self.iterations))
    # UPDATE PARAMETER
    def update_params(self,layer):
        # If WE USE MOMENTUM
        if self.momentum:
            # IF LAYER DOES NOT CONTAIN MOMENTUM ARRAY,CREATE THEM
            # FILLED WITH ZERO
            if not hasattr(layer,'weight_momentum'):
                layer.weight_momentum=np.zeros_like(layer.weights)
                layer.bias_momentum=np.zeros_like(layer.biases)

            # BUILD WEIGHT UPDATES WITH MOMENTUM TAKE PREVIOUS
            # UPDATES MULTIPLIED BY RETAIN FACTOR AND UPDATE WITH CURRENT

            weight_updates=self.momentum*layer.weight_momentum-self.current_learning_rate*layer.dweights
            layer.weight_momentum=weight_updates
            bias_updates=self.momentum*layer.bias_momentum-self.current_learning_rate*layer.dbiases
            layer.bias_momentum=bias_updates
        # VANILLA SGD UPDATES
        else:
            weight_updates=-self.current_learning_rate*layer.dweights
            bias_updates=-self.current_learning_rate*layer.dbiases

        # UPDATE WEIGHTS AND BAISES USING EITHER VANILLA OR MOMENTUM UPDATES
        layer.weights+=weight_updates
        layer.biases+=bias_updates
   # CALL ONCE AFTER PARAMETER UPDATES
    def post_update_params(self):
        self.iterations+=1

In [23]:
import numpy as np

# Assuming the necessary classes (Layer_Dense, Activation_ReLU, 
# Activation_Softmax_Loss_CategoricalCrossentropy, Optimizer_SGD, and spiral_data) are defined elsewhere

# Create dataset
X, y = spiral_data(samples=100, classes=3)

# Create Dense layer with 2 input features and 64 output values
dense1 = Layer_Dense(2, 64)

# Create ReLU activation (to be used with Dense layer)
activation1 = Activation_ReLU()

# Create second Dense layer with 64 input features (as we take output
# of previous layer here) and 3 output values (output values)
dense2 = Layer_Dense(64, 3)

# Create Softmax classifier's combined loss and activation
loss_activation = Activation_Softmax_Loss_CategoricalCrossentropy()

# Create optimizer
optimizer = Optimizer_SGD(decay=1e-3, momentum=0.9)

# Train in loop
for epoch in range(10001):
    # Perform a forward pass of our training data through this layer
    dense1.forward(X)
    
    # Perform a forward pass through activation function
    # takes the output of first dense layer here
    activation1.forward(dense1.output)
    
    # Perform a forward pass through second Dense layer
    # takes outputs of activation function of first layer as inputs
    dense2.forward(activation1.output)
    
    # Perform a forward pass through the activation/loss function
    # takes the output of second dense layer here and returns loss
    loss = loss_activation.forward(dense2.output, y)
    
    # Calculate accuracy from output of activation2 and targets
    # calculate values along first axis
    predictions = np.argmax(loss_activation.output, axis=1)
    if len(y.shape) == 2:
        y = np.argmax(y, axis=1)
    accuracy = np.mean(predictions == y)
    
    if not epoch % 100:
        print(f'epoch: {epoch}, ' +
              f'acc: {accuracy:.3f}, ' +
              f'loss: {loss:.3f}, ' +
              f'lr: {optimizer.current_learning_rate}')
    
    # Backward pass
    loss_activation.backward(loss_activation.output, y)
    dense2.backward(loss_activation.dinputs)
    activation1.backward(dense2.dinputs)
    dense1.backward(activation1.dinputs)
    
    # Update weights and biases
    optimizer.pre_update_params()
    optimizer.update_params(dense1)
    optimizer.update_params(dense2)
    optimizer.post_update_params()


epoch: 0, acc: 0.357, loss: 16.118, lr: 1.0
epoch: 100, acc: 0.477, loss: 16.118, lr: 0.9099181073703367
epoch: 200, acc: 0.623, loss: 16.118, lr: 0.8340283569641367
epoch: 300, acc: 0.753, loss: 16.118, lr: 0.7698229407236336
epoch: 400, acc: 0.853, loss: 16.118, lr: 0.7147962830593281
epoch: 500, acc: 0.890, loss: 16.118, lr: 0.66711140760507
epoch: 600, acc: 0.920, loss: 16.118, lr: 0.6253908692933083
epoch: 700, acc: 0.927, loss: 16.118, lr: 0.5885815185403178
epoch: 800, acc: 0.933, loss: 16.118, lr: 0.5558643690939411
epoch: 900, acc: 0.920, loss: 16.118, lr: 0.526592943654555
epoch: 1000, acc: 0.917, loss: 16.118, lr: 0.5002501250625312
epoch: 1100, acc: 0.943, loss: 16.118, lr: 0.4764173415912339
epoch: 1200, acc: 0.943, loss: 16.118, lr: 0.45475216007276037
epoch: 1300, acc: 0.950, loss: 16.118, lr: 0.43497172683775553
epoch: 1400, acc: 0.950, loss: 16.118, lr: 0.4168403501458941
epoch: 1500, acc: 0.950, loss: 16.118, lr: 0.4001600640256102
epoch: 1600, acc: 0.950, loss: 16.11