# Artificial Neural Networks and Their Applications:
Artificial Neural Networks (ANNs) are computer systems designed to mimic how the human brain processes information. Just like the brain uses neurons to process data and make decisions, ANNs use artificial neurons to analyze data, identify patterns and make predictions. These networks consist of layers of interconnected neurons that work together to solve complex problems. The key idea is that ANNs can "learn" from the data they process, just as our brain learns from experience. They are used in various applications from recognizing images to making personalized recommendations. In this article, we will see more about ANNs, how they function and other core concepts.



In [39]:
# First,We will import all the important libraries:-
# ==========================================================================================================================================================================================
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
np.random.seed(42)
# ==========================================================================================================================================================================================

In [40]:
# Build the two-moons toy dataset; labels are turned into an (n, 1) column.
X, y = make_moons(n_samples=1000, noise=0.15, random_state=42)
y = y[:, np.newaxis]

# Stratified 80/20 train-test split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Standardize features; the scaler is fitted on the training split only and
# then applied unchanged to the test split.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

print("Shapes:", X_train.shape, y_train.shape, X_test.shape, y_test.shape)
# ==========================================================================================================================================================================================

Shapes: (800, 2) (800, 1) (200, 2) (200, 1)


In [41]:
# ==========================================================================================================================================================================================
def sigmoid(z):
    """Return the logistic sigmoid 1 / (1 + exp(-z)), element-wise.

    The value is computed from exp(-|z|), which always lies in (0, 1], so
    large-magnitude inputs no longer overflow inside np.exp (the naive form
    overflows for very negative z).

    z: scalar or array-like.
    Returns an array shaped like z (a NumPy scalar for scalar input).
    """
    z = np.asarray(z)
    decay = np.exp(-np.abs(z))
    # z >= 0: 1 / (1 + e^-z);  z < 0: e^z / (1 + e^z) -- algebraically equal.
    out = np.where(z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    # Indexing with () turns a 0-d result into a scalar and leaves arrays as-is.
    return out[()]
# ==========================================================================================================================================================================================
def sigmoid_derivative(a):
    """Return the sigmoid slope expressed through its output a = sigmoid(z)."""
    complement = 1 - a
    return a * complement
# ==========================================================================================================================================================================================
def relu(z):
    """Return max(0, z) element-wise (rectified linear unit)."""
    floor = 0
    return np.maximum(floor, z)
# ==========================================================================================================================================================================================
def relu_derivative(z):
    """Return 1.0 where z is strictly positive and 0.0 elsewhere."""
    positive = z > 0
    return positive.astype(float)
# ==========================================================================================================================================================================================
def binary_accuracy(y_true, y_pred_probs, threshold=0.5):
    """Return the fraction of thresholded predictions that equal y_true.

    Probabilities greater than or equal to `threshold` count as class 1.
    """
    above_cutoff = y_pred_probs >= threshold
    hard_labels = above_cutoff.astype(int)
    matches = hard_labels == y_true
    return matches.mean()
# ==========================================================================================================================================================================================

In [42]:
# ==========================================================================================================================================================================================
def init_params(n_input, n_hidden, n_output=1, seed=42):
    """Create the weights and biases of a one-hidden-layer network.

    Both weight matrices are drawn from a private RandomState(seed) and
    scaled by sqrt(2 / fan_in); both bias rows start at zero.

    Returns (W1, b1, W2, b2) with shapes (n_input, n_hidden), (1, n_hidden),
    (n_hidden, n_output) and (1, n_output).
    """
    rng = np.random.RandomState(seed)
    hidden_scale = np.sqrt(2. / n_input)
    output_scale = np.sqrt(2. / n_hidden)

    # W1 must be drawn before W2 so a given seed yields the same values.
    W1 = hidden_scale * rng.randn(n_input, n_hidden)
    W2 = output_scale * rng.randn(n_hidden, n_output)

    b1 = np.zeros((1, n_hidden))
    b2 = np.zeros((1, n_output))
    return W1, b1, W2, b2
# ==========================================================================================================================================================================================
# Example sizes: the input width comes from the data, the hidden width is a
# free choice.
n_input = np.shape(X_train)[1]
n_hidden = 16
W1, b1, W2, b2 = init_params(n_input=n_input, n_hidden=n_hidden)
print("W1 shape:", W1.shape, "b1:", b1.shape, "W2:", W2.shape, "b2:", b2.shape)
# ==========================================================================================================================================================================================

W1 shape: (2, 16) b1: (1, 16) W2: (16, 1) b2: (1, 1)


In [43]:
# ==========================================================================================================================================================================================
def forward(X, W1, b1, W2, b2):
    """Run the forward pass: linear -> ReLU -> linear -> sigmoid.

    X holds one sample per row. Returns the output activations A2 together
    with cache = (Z1, A1, Z2, A2) for use by the backward pass.
    """
    hidden_pre = np.dot(X, W1) + b1
    hidden_act = relu(hidden_pre)
    output_pre = np.dot(hidden_act, W2) + b2
    output_act = sigmoid(output_pre)
    return output_act, (hidden_pre, hidden_act, output_pre, output_act)
# ==========================================================================================================================================================================================

In [44]:
# ==========================================================================================================================================================================================
def compute_loss(Y, Y_hat):
    """Return the mean binary cross-entropy between labels and predictions.

    Y: true labels (0/1), scalar or array-like.
    Y_hat: predicted probabilities, broadcastable against Y.

    A small epsilon is added inside both logarithms so predictions of exactly
    0 or 1 do not produce log(0).

    Note: a later cell defines another `compute_loss` that takes
    (predictions, labels) in the opposite order; whichever definition ran
    last is the one bound to the name.
    """
    # Coerce so plain lists and scalars work; the unused sample-count lookup
    # that used to require Y.shape[0] has been dropped.
    Y = np.asarray(Y)
    Y_hat = np.asarray(Y_hat)
    eps = 1e-12
    loss = -(Y * np.log(Y_hat + eps) + (1 - Y) * np.log(1 - Y_hat + eps))
    return loss.mean()
# ==========================================================================================================================================================================================

In [45]:
# ==========================================================================================================================================================================================
def backward(X, Y, cache, W2):
    """Compute gradients for the row-major network produced by `forward`.

    cache is the (Z1, A1, Z2, A2) tuple returned by `forward`.
    Returns (dW1, db1, dW2, db2), each averaged over the rows of X.
    """
    Z1, A1, _, A2 = cache
    n_samples = X.shape[0]

    # Output layer: for sigmoid + binary cross-entropy the pre-activation
    # gradient is simply prediction minus target.
    delta_out = A2 - Y
    grad_W2 = np.dot(A1.T, delta_out) / n_samples
    grad_b2 = delta_out.sum(axis=0, keepdims=True) / n_samples

    # Hidden layer: push the error back through W2 and gate it with the
    # ReLU slope.
    delta_hidden = np.dot(delta_out, W2.T) * relu_derivative(Z1)
    grad_W1 = np.dot(X.T, delta_hidden) / n_samples
    grad_b1 = delta_hidden.sum(axis=0, keepdims=True) / n_samples

    return (grad_W1, grad_b1, grad_W2, grad_b2)
# ==========================================================================================================================================================================================

In [46]:
# ==========================================================================================================================================================================================
def update_params(W1, b1, W2, b2, grads, lr):
    """Apply one gradient-descent step and return the four parameters.

    The subtraction is done in place, so array arguments passed by the
    caller are modified as well as returned.
    """
    dW1, db1, dW2, db2 = grads
    stepped = []
    for param, grad in ((W1, dW1), (b1, db1), (W2, dW2), (b2, db2)):
        param -= lr * grad
        stepped.append(param)
    return tuple(stepped)
# ==========================================================================================================================================================================================

In [47]:
# ==========================================================================================================================================================================================
# Hyperparameters
n_hidden = 32
lr = 0.5
n_epochs = 2000
print_every = 200

# Fresh parameters for the wider hidden layer.
W1, b1, W2, b2 = init_params(n_input, n_hidden, n_output=1, seed=42)

# Training metrics are stored every epoch; test metrics every 10th epoch.
train_losses, train_accs = [], []
test_losses, test_accs = [], []

for epoch in range(1, n_epochs + 1):
    # Forward pass and training metrics (measured before this epoch's update).
    Y_hat_train, cache = forward(X_train, W1, b1, W2, b2)
    loss = compute_loss(y_train, Y_hat_train)
    acc = binary_accuracy(y_train, Y_hat_train)
    train_losses.append(loss)
    train_accs.append(acc)

    # Full-batch gradient step.
    grads = backward(X_train, y_train, cache, W2)
    W1, b1, W2, b2 = update_params(W1, b1, W2, b2, grads, lr)

    # Evaluate on the test set once if this epoch is tracked and/or printed.
    record_test = epoch % 10 == 0
    report = epoch % print_every == 0 or epoch == 1
    if record_test or report:
        Y_hat_test, _ = forward(X_test, W1, b1, W2, b2)
        test_loss = compute_loss(y_test, Y_hat_test)
        test_acc = binary_accuracy(y_test, Y_hat_test)

    if record_test:
        test_losses.append(test_loss)
        test_accs.append(test_acc)

    if report:
        print(f"Epoch {epoch:4d} — Train loss: {loss:.4f}, Train acc: {acc:.4f} — Test loss: {test_loss:.4f}, Test acc: {test_acc:.4f}")
# ==========================================================================================================================================================================================

Epoch    1 — Train loss: 0.6799, Train acc: 0.4400 — Test loss: 0.3098, Test acc: 0.8650
Epoch  200 — Train loss: 0.0956, Train acc: 0.9762 — Test loss: 0.0716, Test acc: 0.9850
Epoch  400 — Train loss: 0.0455, Train acc: 0.9900 — Test loss: 0.0292, Test acc: 1.0000
Epoch  600 — Train loss: 0.0335, Train acc: 0.9938 — Test loss: 0.0187, Test acc: 1.0000
Epoch  800 — Train loss: 0.0282, Train acc: 0.9925 — Test loss: 0.0141, Test acc: 1.0000
Epoch 1000 — Train loss: 0.0249, Train acc: 0.9950 — Test loss: 0.0116, Test acc: 1.0000
Epoch 1200 — Train loss: 0.0226, Train acc: 0.9950 — Test loss: 0.0099, Test acc: 1.0000
Epoch 1400 — Train loss: 0.0210, Train acc: 0.9950 — Test loss: 0.0088, Test acc: 1.0000
Epoch 1600 — Train loss: 0.0198, Train acc: 0.9950 — Test loss: 0.0079, Test acc: 1.0000
Epoch 1800 — Train loss: 0.0189, Train acc: 0.9950 — Test loss: 0.0073, Test acc: 1.0000
Epoch 2000 — Train loss: 0.0182, Train acc: 0.9950 — Test loss: 0.0067, Test acc: 1.0000


In [48]:
# ==========================================================================================================================================================================================
# Utility functions

def initialize_parameters(n_x, n_h, n_y):
    """Create small random weights and zero biases for a two-layer network.

    Weights are drawn from the global NumPy RNG and scaled by 0.01.
    Returns (W1, b1, W2, b2) with shapes (n_h, n_x), (n_h, 1), (n_y, n_h)
    and (n_y, 1).
    """
    scale = 0.01
    # W1 is drawn before W2 so a seeded global RNG reproduces the same values.
    W1 = scale * np.random.randn(n_h, n_x)
    W2 = scale * np.random.randn(n_y, n_h)
    b1 = np.zeros((n_h, 1))
    b2 = np.zeros((n_y, 1))
    return W1, b1, W2, b2
# ==========================================================================================================================================================================================

In [49]:
# ==========================================================================================================================================================================================
# Forward Propagation

def forward_pass(X, W1, b1, W2, b2):
    """Forward pass for the column-major network (one sample per column).

    Computes linear -> ReLU -> linear -> sigmoid and returns the output A2
    along with cache = (Z1, A1, Z2, A2).
    """
    hidden_pre = W1.dot(X) + b1
    hidden_act = relu(hidden_pre)
    output_pre = W2.dot(hidden_act) + b2
    output_act = sigmoid(output_pre)
    return output_act, (hidden_pre, hidden_act, output_pre, output_act)
# ==========================================================================================================================================================================================

In [50]:
# ==========================================================================================================================================================================================
# Compute Loss
def compute_loss(A2, Y):
    """Return the mean binary cross-entropy of predictions A2 against labels Y.

    A2: predicted probabilities.
    Y: true labels (0/1), broadcastable against A2.

    Note: this rebinds the name used by the earlier `compute_loss(Y, Y_hat)`
    cell and takes its arguments in the opposite order (predictions first).
    Code written for the earlier definition will silently compute a different
    quantity once this cell has run.
    """
    # The former `m = Y.shape[1]` was never used and made 1-D labels fail.
    tiny = 1e-9
    loss = -np.mean(Y * np.log(A2 + tiny) + (1 - Y) * np.log(1 - A2 + tiny))
    return loss
# ==========================================================================================================================================================================================

In [51]:
# ==========================================================================================================================================================================================
# Adam Optimizer Initialization

def init_adam(W1, b1, W2, b2):
    """Return zero-filled Adam state dicts (v, s) shaped like the parameters.

    Both dicts use the gradient names "dW1", "db1", "dW2", "db2" as keys and
    hold independent arrays.
    """
    named_params = (("dW1", W1), ("db1", b1), ("dW2", W2), ("db2", b2))
    v = {name: np.zeros_like(param) for name, param in named_params}
    s = {name: np.zeros_like(param) for name, param in named_params}
    return v, s
# ==========================================================================================================================================================================================


In [52]:
# ==========================================================================================================================================================================================
# Update parameters using Adam

def adam_update(params, grads, v, s, t, lr=0.001, beta1=0.9, beta2=0.999, eps=1e-8):
    """Perform one Adam step on the four parameters of the two-layer network.

    params: (W1, b1, W2, b2); updated in place when they are arrays.
    grads: (dW1, db1, dW2, db2).
    v, s: running first/second moment dicts keyed "dW1".."db2"; their entries
          are replaced with the new moving averages.
    t: value used as the exponent in the bias-correction terms.

    Returns ((W1, b1, W2, b2), v, s).
    """
    W1, b1, W2, b2 = params
    dW1, db1, dW2, db2 = grads
    slots = (("dW1", W1, dW1), ("db1", b1, db1), ("dW2", W2, dW2), ("db2", b2, db2))

    # Refresh the exponential moving averages of the gradient and its square.
    for name, _, grad in slots:
        v[name] = beta1 * v[name] + (1 - beta1) * grad
        s[name] = beta2 * s[name] + (1 - beta2) * (grad ** 2)

    # Bias-corrected moment estimates.
    first_hat = {name: moment / (1 - beta1**t) for name, moment in v.items()}
    second_hat = {name: moment / (1 - beta2**t) for name, moment in s.items()}

    # Step each parameter in place.
    stepped = []
    for name, param, _ in slots:
        param -= lr * first_hat[name] / (np.sqrt(second_hat[name]) + eps)
        stepped.append(param)

    return tuple(stepped), v, s
# ==========================================================================================================================================================================================

In [53]:
# ==========================================================================================================================================================================================
# Training Loop with Mini-Batch Gradient Descent + Adam

def _backward_pass(X, Y, cache, W2):
    """Gradients of the mean BCE loss for the column-major two-layer network.

    X is (n_x, m) and Y is (n_y, m); cache is the (Z1, A1, Z2, A2) tuple
    returned by `forward_pass`. Returns (dW1, db1, dW2, db2).
    """
    Z1, A1, _, A2 = cache
    m = X.shape[1]

    # Sigmoid output + binary cross-entropy: pre-activation gradient is A2 - Y.
    dZ2 = A2 - Y
    dW2 = dZ2.dot(A1.T) / m
    db2 = np.sum(dZ2, axis=1, keepdims=True) / m

    # Back through W2, gated by the ReLU slope (1 where Z1 > 0, else 0).
    dZ1 = W2.T.dot(dZ2) * (Z1 > 0)
    dW1 = dZ1.dot(X.T) / m
    db1 = np.sum(dZ1, axis=1, keepdims=True) / m
    return dW1, db1, dW2, db2


def train(X, Y, n_h=8, epochs=1000, batch_size=64, lr=0.001):
    """Train the two-layer network with mini-batch Adam.

    X: inputs of shape (n_x, m), one sample per column.
    Y: labels of shape (1, m).
    n_h: hidden-layer width.
    epochs: number of passes over the data.
    batch_size: number of columns per mini-batch.
    lr: Adam learning rate.

    Every 100 epochs the loss of the last mini-batch of that epoch (measured
    before its update) is printed. Returns the trained (W1, b1, W2, b2).

    Two defects of the earlier version are fixed here:
    * it called `backward_pass`, which is not defined in the visible
      notebook cells; gradients now come from the private `_backward_pass`
      helper above;
    * it passed the epoch index as Adam's time step although several updates
      happen per epoch; the bias correction now uses the running count of
      parameter updates.
    """
    n_x = X.shape[0]
    n_y = 1

    # Initialize parameters and optimizer state
    W1, b1, W2, b2 = initialize_parameters(n_x, n_h, n_y)
    v, s = init_adam(W1, b1, W2, b2)

    m = X.shape[1]
    step = 0  # number of Adam updates applied so far

    for epoch in range(1, epochs + 1):
        permutation = np.random.permutation(m)
        X_shuffled = X[:, permutation]
        Y_shuffled = Y[:, permutation]

        for i in range(0, m, batch_size):
            X_batch = X_shuffled[:, i:i+batch_size]
            Y_batch = Y_shuffled[:, i:i+batch_size]

            # Forward + Backward + Adam update
            A2, cache = forward_pass(X_batch, W1, b1, W2, b2)
            grads = _backward_pass(X_batch, Y_batch, cache, W2)
            step += 1
            (W1, b1, W2, b2), v, s = adam_update(
                (W1, b1, W2, b2), grads, v, s, step, lr=lr
            )

        # Print loss
        if epoch % 100 == 0:
            loss = compute_loss(A2, Y_batch)
            print(f"Epoch {epoch}, Loss = {loss:.4f}")

    return W1, b1, W2, b2
# ==========================================================================================================================================================================================

# Now, a from-scratch implementation of a Deep Neural Network (DNN)

# Introduction to Deep Learning:-
Deep Learning is transforming the way machines understand, learn and interact with complex data. Deep learning mimics the neural networks of the human brain; it enables computers to autonomously uncover patterns and make informed decisions from vast amounts of unstructured data.
# How Deep Learning Works?
Neural network consists of layers of interconnected nodes or neurons that collaborate to process input data. In a fully connected deep neural network data flows through multiple layers where each neuron performs nonlinear transformations, allowing the model to learn intricate representations of the data.

In a deep neural network the input layer receives data which passes through hidden layers that transform the data using nonlinear functions. The final output layer generates the model’s prediction.

In [54]:

# ==========================================================================================================================================================================================
import numpy as np
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
# ==========================================================================================================================================================================================

In [55]:
# Utilities: activations + grads
# ==========================================================================================================================================================================================
def sigmoid(Z):
    """Return the logistic sigmoid 1 / (1 + exp(-Z)), element-wise.

    The value is computed from exp(-|Z|), which always lies in (0, 1], so
    large-magnitude inputs no longer overflow inside np.exp (the naive form
    overflows for very negative Z).

    Z: scalar or array-like.
    Returns an array shaped like Z (a NumPy scalar for scalar input).
    """
    Z = np.asarray(Z)
    decay = np.exp(-np.abs(Z))
    # Z >= 0: 1 / (1 + e^-Z);  Z < 0: e^Z / (1 + e^Z) -- algebraically equal.
    out = np.where(Z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    # Indexing with () turns a 0-d result into a scalar and leaves arrays as-is.
    return out[()]
# ==========================================================================================================================================================================================
def sigmoid_backward(dA, Z):
    """Convert the gradient w.r.t. a sigmoid activation into one w.r.t. Z."""
    activation = sigmoid(Z)
    scaled = dA * activation
    return scaled * (1 - activation)
# ==========================================================================================================================================================================================
def relu(Z):
    """Return max(0, Z) element-wise (rectified linear unit)."""
    floor = 0
    return np.maximum(floor, Z)
# ==========================================================================================================================================================================================
def relu_backward(dA, Z):
    """Return a copy of dA with entries zeroed wherever Z <= 0.

    The incoming gradient array itself is left untouched.
    """
    inactive = Z <= 0
    gated = np.array(dA, copy=True)
    gated[inactive] = 0
    return gated
# ==========================================================================================================================================================================================
def tanh(Z):
    """Return the hyperbolic tangent of Z, element-wise."""
    activated = np.tanh(Z)
    return activated
# ==========================================================================================================================================================================================
def tanh_backward(dA, Z):
    """Convert the gradient w.r.t. a tanh activation into one w.r.t. Z."""
    activation = np.tanh(Z)
    slope = 1 - activation ** 2
    return dA * slope
# ==========================================================================================================================================================================================
def softmax(Z):
    """Return the softmax of Z taken along axis 0.

    The per-column maximum is subtracted before exponentiating, so the
    largest exponent is exp(0).
    """
    peaks = np.max(Z, axis=0, keepdims=True)
    exps = np.exp(Z - peaks)
    totals = np.sum(exps, axis=0, keepdims=True)
    return exps / totals
# ==========================================================================================================================================================================================

In [56]:
# Initialization:-
# ==========================================================================================================================================================================================
def initialize_parameters_deep(layer_dims, seed=42):
    """
    Build the parameter dict for a fully connected network.

    layer_dims: list of layer sizes, e.g. [n_x, 128, 64, n_y]
    seed: value used to reseed the global NumPy RNG before drawing weights

    Returns a dict with W1, b1, ..., WL, bL where Wl has shape
    (layer_dims[l], layer_dims[l-1]) and is scaled by sqrt(2 / fan_in)
    (He-style initialization); every bl is a zero column of shape
    (layer_dims[l], 1).
    """
    np.random.seed(seed)
    params = {}
    layer_pairs = zip(layer_dims[:-1], layer_dims[1:])
    for idx, (fan_in, fan_out) in enumerate(layer_pairs, start=1):
        he_scale = np.sqrt(2. / fan_in)
        params[f'W{idx}'] = np.random.randn(fan_out, fan_in) * he_scale
        params[f'b{idx}'] = np.zeros((fan_out, 1))
    return params
# ==========================================================================================================================================================================================

In [57]:
# Forward (linear -> activation) with dropout
# ==========================================================================================================================================================================================
def linear_forward(A_prev, W, b):
    """Compute Z = W . A_prev + b and return it with cache (A_prev, W, b)."""
    pre_activation = np.dot(W, A_prev) + b
    return pre_activation, (A_prev, W, b)
# ==========================================================================================================================================================================================
def activation_forward(A_prev, W, b, activation, keep_prob=1.0):
    """Run one layer: linear step, activation, then optional inverted dropout.

    activation: one of 'relu', 'sigmoid', 'tanh', 'softmax'; anything else
                raises ValueError.
    keep_prob: when below 1.0, a random mask keeps each unit with this
               probability and the survivors are divided by keep_prob.

    Returns (A, cache) where cache = (linear_cache, Z, D) and D is the
    dropout mask, or None when no dropout was applied.
    """
    Z, linear_cache = linear_forward(A_prev, W, b)

    activation_fns = {
        'relu': relu,
        'sigmoid': sigmoid,
        'tanh': tanh,
        'softmax': softmax,
    }
    activate = activation_fns.get(activation)
    if activate is None:
        raise ValueError("Unsupported activation")
    A = activate(Z)

    D = None
    if keep_prob < 1.0:
        # Inverted dropout: rescale the kept units by 1 / keep_prob.
        D = (np.random.rand(*A.shape) < keep_prob).astype(float)
        A = (A * D) / keep_prob

    return A, (linear_cache, Z, D)
# ==========================================================================================================================================================================================
def forward_model(X, params, activations, keep_probs=None):
    """
    Run the full forward pass through every layer.

    X: input data of shape (n_x, m)
    params: dict holding W1, b1, ..., WL, bL
    activations: list with one activation name per layer; the last one should
                 be 'softmax' for multiclass or 'sigmoid' for binary
    keep_probs: list of dropout keep probabilities, one per layer, or None
                (treated as 1.0 for every layer)

    Returns:
      AL: output of the final layer
      caches: list with one cache per layer, in forward order
    """
    n_layers = len(activations)
    if keep_probs is None:
        keep_probs = [1.0] * n_layers

    caches = []
    A = X
    for idx in range(n_layers):
        layer = idx + 1
        A, layer_cache = activation_forward(
            A,
            params[f'W{layer}'],
            params[f'b{layer}'],
            activations[idx],
            keep_probs[idx],
        )
        caches.append(layer_cache)
    return A, caches
# ==========================================================================================================================================================================================

In [58]:
# Cost function
# ==========================================================================================================================================================================================
def compute_cost(AL, Y, params=None, lambda_l2=0.0):
    """
    Cross-entropy cost with an optional L2 weight penalty.

    AL: predictions (n_y, m)
    Y: true labels (n_y, m) one-hot for multiclass, or shape (1, m) for binary
    params: parameter dict; only needed when lambda_l2 is non-zero
    lambda_l2: L2 regularization coefficient

    A single output row selects binary cross-entropy, otherwise categorical
    cross-entropy is used. The penalty is lambda_l2 / (2m) times the sum of
    squared entries of W1..WL.
    """
    m = Y.shape[1]
    eps = 1e-12

    if AL.shape[0] == 1:
        # binary
        log_likelihood = Y * np.log(AL + eps) + (1 - Y) * np.log(1 - AL + eps)
    else:
        # multiclass
        log_likelihood = Y * np.log(AL + eps)
    cost = -np.sum(log_likelihood) / m

    if not lambda_l2 or params is None:
        return cost

    n_weight_layers = sum(1 for key in params if key.startswith('W'))
    l2_sum = sum(
        np.sum(np.square(params['W' + str(layer)]))
        for layer in range(1, n_weight_layers + 1)
    )
    return cost + (lambda_l2 / (2 * m)) * l2_sum
# ==========================================================================================================================================================================================

In [59]:
# Backward functions
# ==========================================================================================================================================================================================
def linear_backward(dZ, linear_cache):
    """Backpropagate through the linear step Z = W . A_prev + b.

    Returns (dA_prev, dW, db); dW and db are averaged over the columns of
    A_prev.
    """
    A_prev, W, _ = linear_cache
    inv_m = 1 / A_prev.shape[1]
    dW = inv_m * np.dot(dZ, A_prev.T)
    db = inv_m * np.sum(dZ, axis=1, keepdims=True)
    dA_prev = np.dot(W.T, dZ)
    return dA_prev, dW, db
# ==========================================================================================================================================================================================
def activation_backward(dA, cache, activation, keep_prob=1.0):
    """Backpropagate through one layer (dropout, activation, linear step).

    cache: the (linear_cache, Z, D) tuple stored by the forward pass; when a
           dropout mask D is present it is re-applied to dA and rescaled by
           keep_prob.
    activation: 'relu', 'sigmoid' or 'tanh' apply the matching derivative;
                'softmax' passes dA through unchanged because the caller is
                expected to supply dZ = AL - Y directly. Anything else
                raises ValueError.

    Returns (dA_prev, dW, db).
    """
    linear_cache, Z, D = cache

    if D is not None:
        dA = (dA * D) / keep_prob

    derivative_fns = {
        'relu': relu_backward,
        'sigmoid': sigmoid_backward,
        'tanh': tanh_backward,
    }
    if activation == 'softmax':
        # For softmax with cross-entropy, dZ = AL - Y is computed outside.
        dZ = dA
    elif activation in derivative_fns:
        dZ = derivative_fns[activation](dA, Z)
    else:
        raise ValueError("Unsupported activation")

    return linear_backward(dZ, linear_cache)
# ==========================================================================================================================================================================================
def backward_model(AL, Y, params, caches, activations, keep_probs=None, lambda_l2=0.0):
    """
    Backpropagate through the whole network.

    AL: predictions (n_y, m)
    Y: true labels (n_y, m)
    params: parameter dict (read only for the L2 term)
    caches: list of caches from forward
    activations: list of activations, one per layer
    keep_probs: dropout keep probabilities per layer, or None for no dropout
    lambda_l2: L2 regularization coefficient

    Returns grads dict with dWl, dbl, ...

    Output-layer handling: for softmax, and for a single sigmoid unit trained
    with binary cross-entropy, AL - Y is already the gradient with respect to
    the pre-activation Z. It must therefore NOT be multiplied by the
    activation derivative again. The earlier version passed AL - Y through
    the sigmoid derivative, which scaled the binary-case gradient by
    A * (1 - A). For any other output activation AL - Y is treated as dA, as
    before.
    """
    grads = {}
    L = len(caches)
    m = AL.shape[1]
    if keep_probs is None:
        keep_probs = [1.0] * L

    output_act = activations[-1]
    output_grad_is_dZ = output_act == 'softmax' or (
        output_act == 'sigmoid' and AL.shape[0] == 1
    )

    # Gradient entering the last layer (dZ for the fused cases, dA otherwise).
    current_dA = AL - Y

    # Backprop through layers
    for l in reversed(range(1, L + 1)):
        cache = caches[l - 1]
        act = activations[l - 1]
        keep_prob = keep_probs[l - 1]
        if l == L and output_grad_is_dZ:
            # 'softmax' is activation_backward's pass-through branch
            # (dZ = dA), so it skips the activation derivative.
            act = 'softmax'
        dA_prev, dW, db = activation_backward(current_dA, cache, act, keep_prob)
        # Add L2 regularization to dW
        if lambda_l2:
            dW += (lambda_l2 / m) * params['W' + str(l)]
        grads['dW' + str(l)] = dW
        grads['db' + str(l)] = db
        current_dA = dA_prev
    return grads
# ==========================================================================================================================================================================================

In [60]:
# Adam optimizer helpers
# ==========================================================================================================================================================================================
def initialize_adam_params(params):
    """Return zero-filled Adam moment dicts (v, s) matching every W/b in params.

    Keys follow the gradient naming: 'dW1', 'db1', ..., 'dWL', 'dbL'.
    """
    n_layers = sum(1 for key in params if key.startswith('W'))
    v, s = {}, {}
    for layer in range(1, n_layers + 1):
        for prefix in ('W', 'b'):
            name = prefix + str(layer)
            v['d' + name] = np.zeros_like(params[name])
            s['d' + name] = np.zeros_like(params[name])
    return v, s
# ==========================================================================================================================================================================================
def update_parameters_adam(params, grads, v, s, t, learning_rate=0.001,
                           beta1=0.9, beta2=0.999, epsilon=1e-8):
    """Apply one Adam step to every W/b pair in params.

    params: dict W1, b1, ...; its arrays are updated in place.
    grads: dict dW1, db1, ...
    v, s: first/second moment dicts; their entries are replaced with the new
          moving averages.
    t: value used as the exponent in the bias-correction terms.

    Returns (params, v, s) -- the same objects that were passed in.
    """
    n_layers = sum(1 for key in params if key.startswith('W'))
    first_correction = 1 - beta1 ** t
    second_correction = 1 - beta2 ** t

    for layer in range(1, n_layers + 1):
        for prefix in ('W', 'b'):
            p_key = prefix + str(layer)
            g_key = 'd' + p_key
            grad = grads[g_key]

            # moving averages
            v[g_key] = beta1 * v[g_key] + (1 - beta1) * grad
            s[g_key] = beta2 * s[g_key] + (1 - beta2) * (grad ** 2)

            # bias correction
            v_hat = v[g_key] / first_correction
            s_hat = s[g_key] / second_correction

            # update params
            params[p_key] -= learning_rate * (v_hat / (np.sqrt(s_hat) + epsilon))
    return params, v, s

# ==========================================================================================================================================================================================
# Mini-batch utils
# ==========================================================================================================================================================================================
def random_mini_batches(X, Y, mini_batch_size=64, seed=None):
    """Shuffle the columns of X and Y together and cut them into mini-batches.

    The global NumPy RNG is reseeded with `seed` before the permutation is
    drawn. Every batch has `mini_batch_size` columns except possibly the
    last, which holds the remainder.

    Returns a list of (mini_batch_X, mini_batch_Y) tuples.
    """
    np.random.seed(seed)
    m = X.shape[1]
    order = np.random.permutation(m)
    shuffled_X = X[:, order]
    shuffled_Y = Y[:, order]

    # Column bounds for each complete batch, plus one open-ended slice for
    # the leftover columns when m is not a multiple of the batch size.
    n_full = m // mini_batch_size
    bounds = [(k * mini_batch_size, (k + 1) * mini_batch_size) for k in range(n_full)]
    if m % mini_batch_size != 0:
        bounds.append((n_full * mini_batch_size, None))

    return [(shuffled_X[:, lo:hi], shuffled_Y[:, lo:hi]) for lo, hi in bounds]
# ==========================================================================================================================================================================================

In [61]:
# Model training:-
# ==========================================================================================================================================================================================
def model_train(X, Y, layer_dims, activations,
                keep_probs=None,
                learning_rate=0.001, num_epochs=1000,
                mini_batch_size=64, print_cost=True,
                lambda_l2=0.0, seed=42):
    """
    Train the network with mini-batch Adam updates.

    X: (n_x, m)
    Y: (n_y, m) one-hot for multiclass or (1,m) for binary
    layer_dims: list dims e.g. [n_x, 128, 64, n_y]
    activations: list len L of activation names for each layer (last must match output)
    keep_probs: list len L of dropout keep probabilities (1.0 => no dropout);
        None is replaced by 1.0 for every entry of `activations`
    learning_rate: forwarded to update_parameters_adam
    num_epochs: number of passes over the mini-batches
    mini_batch_size: forwarded to random_mini_batches
    print_cost: when True, print the cost on epoch 1 and on every epoch that
        is a multiple of max(1, num_epochs // 10)
    lambda_l2: forwarded to compute_cost and backward_model
    seed: seeds NumPy and is forwarded to initialize_parameters_deep

    Returns (params, costs), where costs holds one weighted-average cost per epoch.
    """
    if keep_probs is None:
        keep_probs = [1.0] * len(activations)

    np.random.seed(seed)
    params = initialize_parameters_deep(layer_dims, seed)
    v, s = initialize_adam_params(params)

    adam_step = 0
    cost_history = []
    n_examples = X.shape[1]
    report_every = max(1, num_epochs // 10)

    for epoch in range(1, num_epochs + 1):
        epoch_cost = 0
        # The epoch number is used as the shuffle seed for this pass.
        for batch_X, batch_Y in random_mini_batches(X, Y, mini_batch_size, seed=epoch):
            # Forward pass and cost for this batch.
            AL, caches = forward_model(batch_X, params, activations, keep_probs)
            batch_cost = compute_cost(AL, batch_Y, params, lambda_l2)
            # Weight each batch by its share of the examples.
            epoch_cost += batch_cost * batch_X.shape[1] / n_examples

            # Backward pass, then one Adam step (step counter is 1-based).
            grads = backward_model(AL, batch_Y, params, caches, activations, keep_probs, lambda_l2)
            adam_step += 1
            params, v, s = update_parameters_adam(params, grads, v, s, adam_step, learning_rate)
        cost_history.append(epoch_cost)

        if not print_cost:
            continue
        if epoch == 1 or epoch % report_every == 0:
            print(f"Epoch {epoch}/{num_epochs} — cost: {epoch_cost:.6f}")
    return params, cost_history
# ==========================================================================================================================================================================================

In [62]:
# Predictions / Accuracy on our trained model
# ==========================================================================================================================================================================================
def predict(X, params, activations):
    """
    Run a forward pass with every keep probability set to 1.0 and turn the
    output into labels.

    A single-row output is thresholded at 0.5 and returned as an int array of
    the same shape; otherwise the row index of the largest output in each
    column is returned.
    """
    no_dropout = [1.0 for _ in activations]
    AL, _ = forward_model(X, params, activations, keep_probs=no_dropout)
    single_output_row = AL.shape[0] == 1
    return (AL > 0.5).astype(int) if single_output_row else np.argmax(AL, axis=0)

def accuracy_score(Y_true_labels, Y_pred_labels):
    """
    Fraction of positions where the predicted label equals the true label.

    Both inputs are flattened before comparison, so a (m, 1) column of true
    labels and a (1, m) row of predictions are matched element by element
    rather than being broadcast into an (m, m) grid.

    Raises ValueError if the inputs hold a different number of labels.
    """
    y_true = np.asarray(Y_true_labels).ravel()
    y_pred = np.asarray(Y_pred_labels).ravel()
    if y_true.size != y_pred.size:
        raise ValueError(
            f"label count mismatch: {y_true.size} true vs {y_pred.size} predicted"
        )
    return np.mean(y_true == y_pred)
# ==========================================================================================================================================================================================

In [63]:
# Here, we use the digits dataset
# ==========================================================================================================================================================================================
# End-to-end run: load the digits data, preprocess it, train the network and
# report train/test accuracy.
if __name__ == "__main__":
    digits = load_digits()
    X = digits.data  # (n_samples, n_features) = (1797, 64)
    y = digits.target.reshape(-1, 1)  # labels 0..9, reshaped to a single column

    # Train-test split: 20% held out, stratified on the labels
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

    # Standardize features: the scaler is fitted on the training split only and
    # then reused for the test split. Both are transposed so that each column
    # is one example, the (n_x, m) layout model_train expects.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train).T  # shape (n_x, m)
    X_test = scaler.transform(X_test).T

    # One-hot encode labels (encoder fitted on the training labels, reused for
    # the test labels) and transpose to one column per example
    encoder = OneHotEncoder(sparse_output=False, categories='auto')
    Y_train = encoder.fit_transform(y_train).T  # shape (n_y, m)
    Y_test = encoder.transform(y_test).T

    # Network configuration: input and output widths are read from the data,
    # with two hidden layers of 128 and 64 units in between
    n_x = X_train.shape[0]
    n_y = Y_train.shape[0]
    layer_dims = [n_x, 128, 64, n_y]
    activations = ['relu', 'relu', 'softmax']  # last softmax for multiclass
    keep_probs = [1.0, 1.0, 1.0]  # no dropout by default

    # Train
    params, costs = model_train(X_train, Y_train,
                                layer_dims, activations,
                                keep_probs=keep_probs,
                                learning_rate=0.001,
                                num_epochs=100,
                                mini_batch_size=64,
                                print_cost=True,
                                lambda_l2=0.001)
    # ==========================================================================================================================================================================================
    # Predict & evaluate: the one-hot targets are collapsed back to class
    # indices with argmax so they can be compared with predict()'s output
    y_pred_train = predict(X_train, params, activations)
    y_true_train = np.argmax(Y_train, axis=0)
    train_acc = accuracy_score(y_true_train, y_pred_train)

    y_pred_test = predict(X_test, params, activations)
    y_true_test = np.argmax(Y_test, axis=0)
    test_acc = accuracy_score(y_true_test, y_pred_test)

    print(f"Train accuracy: {train_acc:.4f}, Test accuracy: {test_acc:.4f}")
# ==========================================================================================================================================================================================

Epoch 1/100 — cost: 2.060581
Epoch 10/100 — cost: 0.055774
Epoch 20/100 — cost: 0.015778
Epoch 30/100 — cost: 0.008712
Epoch 40/100 — cost: 0.006447
Epoch 50/100 — cost: 0.005396
Epoch 60/100 — cost: 0.004858
Epoch 70/100 — cost: 0.004534
Epoch 80/100 — cost: 0.004318
Epoch 90/100 — cost: 0.004166
Epoch 100/100 — cost: 0.004050
Train accuracy: 1.0000, Test accuracy: 0.9778
