In [2]:
import numpy as np
import random as n

Create the Artificial Neural Network

In [None]:
# Random initialization of ANN weights and biases

def create_ANN():
    '''
    initialize a ANN with the architecture:
    2 inputs -> 3 hidden neurons -> 2 outputs
    '''
    np.random.seed(42)
    n.seed(42)

    # Hidden layer: 2 inputs -> 3 neurons
    # W1 shape: (3,2) ; b1 shape: (3,1)
    W1 = np.random.randn(3,2) * 0.01 # small random weights
    b1 = np.zeros((3,1)) # zero biases

    # Output layer: 3 hidden -> 2 outputs
    # W2 shape: (2,3) ; b1 shape: (2,1)
    W2 = np.random.randn(2,3) * 0.01
    b2 = np.zeros((2,1))

    # store the parameters
    parameters = {
        'W1' : W1,
        'b1' : b1,
        'W2' : W2,
        'b2' : b2
    }

    return parameters

Split the Data in Batches

In [4]:
# Mini-batch: is a small amount of training data used for one gradient update
# so this function only selects data in batches
# it 1- shuffles the dataset
#    2- Yields mini-batches of (X_batch , Y_batch)

def mini_batch(X , Y , batch_size):
    # ensure inputs are numpy arrays
    X = np.array(X)
    Y = np.array(Y)

    N = X.shape[0] # number of samples
    assert N == Y.shape[0], ' X and Y must have the number of samples '

    # 1- Create shuffeled indicies of X and Y in the same order
    indicies = list(range(N))
    n.shuffle(indicies)

    indicies = np.array(indicies)

    # 2- Yield batches one at a time
    for i in range(0 , N , batch_size):
        batch_indicies = indicies[i : i + batch_size]
        X_batch = X[batch_indicies]
        Y_batch = Y[batch_indicies]
        yield (X_batch , Y_batch)

Activation Functions

In [12]:
#------(Activation Function for continuous labels [Regression])------
def sigmoid(z):
    sigmoid_output = 1 / (1 + np.exp(-z))

    return sigmoid_output

def derivative_sigmoid(sigmoid_output):
    sigmoid_derivative = sigmoid_output * (1 - sigmoid_output)

    return sigmoid_derivative
#-----------------------------------------------------

#------(Activation Function for hot encoded labels [Classification])------
def softmax(z):
    # Subtract the maximum value for numerical stability
    # keepdims=True ensures the max is broadcasted correctly across dimensions for 2D arrays
    x = z - np.max(z , axis = -1 , keepdims = True)
    numerator = np.exp(x)
    denominator = np.sum(numerator , axis = -1 , keepdims = True)
    softmax_output = numerator / denominator

    return softmax_output

def derivative_softmax(softmax_output):
    s = softmax_output.reshape(-1 , 1)
    # The Jacobian is calculated as diag(s) - dot(s, s.T)
    jacobian_matrix = np.diagflat(s) - np.dot(s , s.T)

    return jacobian_matrix
#--------------------------------------------------------

Performing Feedforward

In [6]:
def feed_forward(X_batchh , param , keep_prob = 0.5 , training = True , smoothing = False):
    W1 , b1 = param['W1'] , param['b1']
    W2 , b2 = param['W2'] , param['b2']

    m = X_batchh.shape[0]
    X_T = X_batchh.T

    #layer 1: input -> hidden
    z1 = np.dot(W1 , X_T) + b1
    a1 = sigmoid(z1)

    #-----------(Inverted Dropout)-------------#
        # inverted dropout is applied only during training
    if training:
        # create dropout mask: 1 -> keep , 0 -> drop
        dropout_mask = (np.random.rand(*a1.shape) > keep_prob).astype(float)
        # scale surviving neurons by 1/(1 - keep_prob) -> 'inverted dropout'
        a1 = a1 * dropout_mask / (1.0 - keep_prob)
    else:
        dropout_mask = None # not used in inference (prediction phase)
    #-------------------------------------------#

    #layer 2: hidden -> output
    z2 = np.dot(W2 , a1) + b2
    a2 = softmax(z2)

    #-----------(Label Smoothing)-------------#
    if smoothing != False:
        epsilon = 0.1
        num_classes = a2.shape[0]
        a2 = a2 * (1 - epsilon) + (epsilon / num_classes)
    #-----------------------------------------#


    cache = {
        'X_T' : X_T,
        'z1': z1,
        'a1_before_dropout': sigmoid(z1), #needed for backprop without scaling
        'a1' : a1,
        'dropout_mask' : dropout_mask,
        'z2' : z2,
        'a2' : a2,
        'keep_prob' : keep_prob,
        'training' : training
    }

    return a2 , cache

    

Loss Functions (Not Used)

In [14]:
#----------------------------------------
def cross_entropy_loss(a2 , Y_batch):
    #Cross-entropy Loss if the outputs are hot encoded
    pred = np.clip(a2 , 1e-15 , 1 - 1e-15) # clip predictions to avoid log(0) or log(1-0) erros

    # calculate the loss for each sample
    loss = - np.sum(Y_batch.T * np.log(pred))

    return np.mean(loss)
#----------------------------------------

#----------------------------------------
def compute_loss(a2 , Y_batch):
    # Compute MSE loss (Mean Sequared Error)
    loss = np.mean(np.square(a2 - Y_batch.T)) # Y_batch.T to match dimensions

    return loss
#----------------------------------------

Performing Backpropagation

In [18]:
def backpropagation(Y_batch , cache , param):
    W2 = param['W2']
    Y_T = Y_batch.T # (2 , m)
    m = Y_batch.shape[1] # number of samples in the batch

    # Output layer delta (error signal)
    dz2 = cache['a2'] - Y_T
    dW2 = (1 / m) * np.dot(dz2 , cache['a1'].T)
    db2 = (1 / m) * np.sum(dz2 , axis = 1 , keepdims = True)

    # Hidden layer delta (error signal)
    da1 = np.dot(W2.T , dz2) # (3 , m)
    if cache['training']:
        # apply same mask mask during backpropagation
        da1 = da1 * cache['dropout_mask'] / (1.0 - cache['keep_prob'])

    a1_clean = cache['a1_before_dropout'] # unscaled activation

    dz1 = da1 * derivative_sigmoid(a1_clean) # (3 , m)
    dW1 = (1 / m) * np.dot(dz1 , cache['X_T'].T)
    db1 = (1 / m) * np.sum(dz1 , axis =  1 , keepdims = True)
    
    gradients = {
        'dW1' : dW1,
        'db1' : db1,
        'dW2' : dW2,
        'db2' : db2
    }

    return gradients

Updating Parameters

In [20]:
def update_parameters(params , gradients , learning_rate = 0.1):
    # for k, v in gradients.items():
    #     print(k, type(v))

    params['W1'] -= learning_rate * gradients['dW1']
    params['b1'] -= learning_rate * gradients['db1']
    params['W2'] -= learning_rate * gradients['dW2']
    params['b2'] -= learning_rate * gradients['db2']

Inputs

In [None]:
# Random training data
# inputs
X_ = np.random.randn(100 , 2) # 100 samples , 2 features
# outputs
Y_ = np.random.randint(0 , 2 , size=(100 , 2)) # 100 samples , 2 classes (one-hot encoded)

Training Phase

In [25]:
model = create_ANN()

epoches = 100
lr = 0.1 # learning_rate
keep_prob = 0.5

for epoch in range(epoches):
    epoch_loss = 0
    number_of_batches = 0

    for X_batch , Y_batch in mini_batch(X_ , Y_ , batch_size = 32):

        #-----( feed forward with inverted dropout and label smoothing )-----
        a_output , cache = feed_forward(X_batch , model , keep_prob , training = True , smoothing = True)

        #-----( compute loss )-----
        loss = cross_entropy_loss(a_output , Y_batch)
        epoch_loss += loss
        number_of_batches += 1

        #-----( backpropagation with inverted dropout )-----
        gradients = backpropagation(Y_batch , cache , model)

        #-----( update parameters )-----
        update_parameters(model , gradients , lr)

    avg_epoch_loss = epoch_loss / number_of_batches
    if epoch % 10 == 0:
        print(f"Epoch {epoch}, Avg Loss: {avg_epoch_loss:.4f}")



Epoch 0, Avg Loss: 68.4261
Epoch 10, Avg Loss: 77.3668
Epoch 20, Avg Loss: 76.9996
Epoch 30, Avg Loss: 75.1854
Epoch 40, Avg Loss: 75.1825
Epoch 50, Avg Loss: 73.3576
Epoch 60, Avg Loss: 73.0293
Epoch 70, Avg Loss: 73.8742
Epoch 80, Avg Loss: 73.9762
Epoch 90, Avg Loss: 75.1918
