In [53]:
import kagglehub
import pandas as pd
import numpy as np
import pandas as pd
import scipy.special
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

In [54]:
# Download the latest version of the Fashion-MNIST dataset from Kaggle.
path = kagglehub.dataset_download("zalando-research/fashionmnist")

# Read each CSV into the variable that matches its file name.
train_data = pd.read_csv(path + "/fashion-mnist_train.csv")
test_data = pd.read_csv(path + "/fashion-mnist_test.csv")

# Merge both files into a single frame. ignore_index=True gives the merged
# frame a unique 0..N-1 index instead of repeating each file's row labels.
data = pd.concat([train_data, test_data], ignore_index=True)



The model

In [61]:
# ======================================================
# Preprocessing Module
# ======================================================
def preprocessing(data):
    """Turn a labelled image table into train / validation / test sets.

    Column 0 of ``data`` is used as the class label and all remaining columns
    as pixel values. Pixels are cast to float32 and divided by 255, the rows
    are split 80% / 10% / 10% (stratified on the label, fixed random_state),
    and each label vector is one-hot encoded over 10 classes.

    Returns:
        Tuple ``(train_images, val_images, test_images,
        train_labels, val_labels, test_labels)``.
    """
    # First column is the label; every other column is one pixel.
    labels = data.iloc[:, 0]
    pixels = data.iloc[:, 1:].astype('float32') / 255.0

    # Same seed for both stratified splits.
    seed = 42

    # Stage 1: 80% for training, 20% held out.
    train_images, held_images, train_y, held_y = train_test_split(
        pixels, labels, test_size=0.2, random_state=seed, stratify=labels
    )

    # Stage 2: cut the held-out part in half -> validation and test.
    val_images, test_images, val_y, test_y = train_test_split(
        held_images, held_y, test_size=0.5, random_state=seed, stratify=held_y
    )

    # One-hot encode the three label vectors.
    train_labels, val_labels, test_labels = (
        to_categorical(y, num_classes=10) for y in (train_y, val_y, test_y)
    )

    return train_images, val_images, test_images, train_labels, val_labels, test_labels

# ======================================================
# Neural Network Model
# ======================================================

# Network architecture & hyperparameters
hidden_layers = 5
input_dim = 784  # length of one flattened 28x28 image
hidden_neurons = [256, 256, 256, 256, 256]
output_dim = 10

# One activation name per layer. Hidden layers may use any of
# "relu", "leaky_relu", "tanh", "gelu" or "elu"; the last entry is "softmax".
activations = ['relu', 'relu', 'relu', 'relu', 'relu', 'softmax']

# Layer widths from input to output: [input_dim, hidden1, ..., output_dim]
layers_dims = [input_dim] + hidden_neurons + [output_dim]

# Parameter initialisation: one (fan_out, fan_in) weight matrix and one
# (fan_out, 1) zero bias column per layer.
weights = []
biases = []
for i in range(len(layers_dims) - 1):
    fan_in, fan_out = layers_dims[i], layers_dims[i + 1]

    # Hidden layers with a ReLU-type activation get He scaling, sqrt(2 / fan_in);
    # every other layer (including the softmax output) gets sqrt(1 / fan_in).
    relu_family = i < len(activations) - 1 and activations[i].lower() in ['relu', 'leaky_relu']
    variance_gain = 2.0 if relu_family else 1.0

    W = np.random.randn(fan_out, fan_in) * np.sqrt(variance_gain / fan_in)
    b = np.zeros((fan_out, 1))
    weights.append(W)
    biases.append(b)

# ======================================================
# Activation Functions & Their Derivatives
# ======================================================
def activation(x, func="relu", alpha=0.01):
    """Apply the named activation function element-wise to ``x``.

    Args:
        x: Array of pre-activation values. For "softmax" the normalisation
            runs along axis 0.
        func: Case-insensitive name: "relu", "leaky_relu", "tanh", "gelu",
            "elu" or "softmax".
        alpha: Negative-side coefficient used by "leaky_relu" and "elu".

    Returns:
        The activated values.

    Raises:
        ValueError: If ``func`` is not one of the supported names.
    """
    func = func.lower()

    if func == "relu":
        return np.maximum(0, x)

    if func == "leaky_relu":
        return np.where(x > 0, x, alpha * x)

    if func == "tanh":
        return np.tanh(x)

    if func == "gelu":
        # Exact GELU: x * Phi(x), with Phi expressed through erf.
        erf_term = 1 + scipy.special.erf(x / np.sqrt(2))
        return 0.5 * x * erf_term

    if func == "elu":
        negative_branch = alpha * (np.exp(x) - 1)
        return np.where(x > 0, x, negative_branch)

    if func == "softmax":
        # Subtract the per-column maximum before exponentiating so that
        # large inputs do not overflow.
        shifted = x - np.max(x, axis=0, keepdims=True)
        exps = np.exp(shifted)
        return exps / np.sum(exps, axis=0, keepdims=True)

    raise ValueError("Unsupported activation function: " + func)

def activation_derivative(z, func="relu", alpha=0.01):
    """Element-wise derivative of the named activation, evaluated at ``z``.

    Args:
        z: Array of pre-activation values.
        func: Case-insensitive name: "relu", "leaky_relu", "tanh", "gelu"
            or "elu". "softmax" is not supported here.
        alpha: Negative-side coefficient used by "leaky_relu" and "elu".

    Returns:
        Array of derivative values with the same shape as ``z``.

    Raises:
        ValueError: If ``func`` is not one of the supported names.
    """
    func = func.lower()

    if func == "relu":
        return (z > 0).astype(float)

    if func == "leaky_relu":
        return np.where(z > 0, 1.0, alpha)

    if func == "tanh":
        tanh_z = np.tanh(z)
        return 1 - tanh_z**2

    if func == "gelu":
        # d/dz [z * Phi(z)] = Phi(z) + z * phi(z)
        cdf = 0.5 * (1 + scipy.special.erf(z / np.sqrt(2)))
        z_times_pdf = z * np.exp(-0.5 * z**2) / np.sqrt(2 * np.pi)
        return cdf + z_times_pdf

    if func == "elu":
        return np.where(z > 0, 1.0, alpha * np.exp(z))

    raise ValueError("Unsupported activation function for derivative: " + func)

# ======================================================
# Forward Pass
# ======================================================
def forward(X, weights, biases, activation_config, dropout_rate, training=True):
    """Run one forward pass through the network.

    Every layer except the last computes ``W @ A + b``, applies the activation
    named in ``activation_config`` and, in training mode with a positive
    ``dropout_rate``, inverted dropout. The last layer always uses softmax and
    never uses dropout.

    Args:
        X: Input batch; it is multiplied as ``weights[0] @ X``, so examples
            are laid out one per column.
        weights, biases: Per-layer parameters.
        activation_config: Activation names; entry ``i`` is used for hidden
            layer ``i``.
        dropout_rate: Probability of dropping a hidden unit.
        training: Dropout is only applied when this is true.

    Returns:
        Tuple ``(output, activations_list, pre_activations_list, dropout_masks)``:
        the softmax output, the input followed by each layer's output, each
        layer's pre-activation, and one mask per layer (all ones where no
        dropout was applied).
    """
    layer_outputs = [X]      # input first, then every layer's activation
    layer_inputs = []        # pre-activation Z of every layer
    masks = []               # dropout masks, needed again in backprop

    current = X
    for idx, W in enumerate(weights[:-1]):
        Z = np.dot(W, current) + biases[idx]
        layer_inputs.append(Z)

        current = activation(Z, activation_config[idx])

        if not (training and dropout_rate > 0):
            # No dropout: record an identity mask.
            mask = np.ones_like(current)
        else:
            # Inverted dropout: zero out units, then rescale the survivors.
            mask = (np.random.rand(*current.shape) >= dropout_rate).astype(float)
            keep_prob = 1 - dropout_rate
            current = current * mask / keep_prob

        masks.append(mask)
        layer_outputs.append(current)

    # Output layer: softmax, no dropout.
    Z_out = np.dot(weights[-1], current) + biases[-1]
    layer_inputs.append(Z_out)
    output = activation(Z_out, "softmax")
    layer_outputs.append(output)

    # Identity mask so that there is one mask per layer.
    masks.append(np.ones_like(output))

    return output, layer_outputs, layer_inputs, masks


# ======================================================
# Loss Function (Cross-Entropy)
# ======================================================
def CE_loss(y, y_pred):
    """Mean categorical cross-entropy between targets and predictions.

    Both arrays are summed over axis 0 (the class axis) and the resulting
    per-example losses are averaged.

    Args:
        y: Target distribution per example (e.g. one-hot columns).
        y_pred: Predicted probabilities with the same shape as ``y``.

    Returns:
        The average cross-entropy as a scalar.
    """
    tiny = 1e-12  # keeps the argument of log strictly positive
    safe_pred = np.clip(y_pred, tiny, 1.0 - tiny)
    per_example = -np.sum(y * np.log(safe_pred), axis=0)
    return np.mean(per_example)

def loss(X, y, weights, biases, activations, dropout_rate):
    """Cross-entropy loss of the network on ``(X, y)``.

    The forward pass runs in inference mode (``training=False``), so dropout
    is disabled: the reported loss is deterministic for given parameters and
    does not consume random numbers, whatever ``dropout_rate`` is.

    Args:
        X: Input batch, one example per column.
        y: Target distributions matching the network output layout.
        weights, biases: Per-layer parameters.
        activations: Activation names passed through to ``forward``.
        dropout_rate: Accepted for interface compatibility and forwarded, but
            it has no effect because dropout is off in inference mode.

    Returns:
        The mean cross-entropy as computed by ``CE_loss``.
    """
    y_pred, _, _, _ = forward(X, weights, biases, activations, dropout_rate, training=False)
    return CE_loss(y, y_pred)

# ======================================================
# Backpropagation with Support for Different Activation Functions
# ======================================================
def backprop(weights, biases, X, y, learning_rate, activations, dropout_rate):
    """Perform one gradient-descent step on a mini-batch.

    Runs a training-mode forward pass, back-propagates the softmax /
    cross-entropy error through every layer and updates the parameters.

    Assumes:
      - The output layer uses softmax with cross-entropy loss, so the error
        at the output pre-activation is ``y_pred - y``.
      - ``activations[i]`` names the activation of hidden layer ``i``.
      - ``forward()`` returns
            y_pred, activations_list, pre_activations_list, dropout_masks

    Args:
        weights, biases: Per-layer parameters. They are updated IN PLACE and
            also returned.
        X: Mini-batch inputs, one example per column.
        y: Mini-batch targets, shaped like the network output.
        learning_rate: Step size of the update.
        activations: Activation names per layer.
        dropout_rate: Dropout probability used in the forward pass.

    Returns:
        The (updated) ``weights`` and ``biases`` lists.
    """
    m = X.shape[1]  # Number of examples in the mini-batch.

    # Forward pass (training mode, so dropout is active when dropout_rate > 0).
    y_pred, activations_list, pre_activations_list, dropout_masks = forward(
        X, weights, biases, activations, dropout_rate
    )

    # forward() uses inverted dropout: surviving units are multiplied by
    # 1 / (1 - dropout_rate). The gradient must pass through that same factor,
    # not only through the 0/1 mask. With dropout disabled the factor is 1.
    dropout_scale = 1.0 / (1.0 - dropout_rate) if dropout_rate > 0 else 1.0

    num_layers = len(weights)
    dW_list = [None] * num_layers
    dB_list = [None] * num_layers

    # Error at the output pre-activation for softmax + cross-entropy.
    dZ = y_pred - y  # shape: (output_dim, m)

    # Walk from the output layer back to the first layer. All gradients are
    # computed with the current (not yet updated) weights.
    for i in reversed(range(num_layers)):
        A_prev = activations_list[i]
        dW_list[i] = np.dot(dZ, A_prev.T) / m
        dB_list[i] = np.sum(dZ, axis=1, keepdims=True) / m

        if i != 0:
            # Gradient w.r.t. the (post-dropout) output of layer i-1 ...
            dA = np.dot(weights[i].T, dZ)
            # ... through the dropout applied to that output in forward() ...
            dA = dA * dropout_masks[i - 1] * dropout_scale
            # ... and through that layer's activation function.
            dZ = dA * activation_derivative(pre_activations_list[i - 1], func=activations[i - 1])

    # Gradient-descent update.
    for i in range(num_layers):
        weights[i] -= learning_rate * dW_list[i]
        biases[i] -= learning_rate * dB_list[i]

    return weights, biases

# prediction accuracy
def predict(X, weights, biases, activations, dropout_rate=0.0):
    """Return the predicted class index for every row of ``X``.

    ``X`` holds one example per row; it is transposed before being passed to
    ``forward``. The forward pass always runs in inference mode with dropout
    disabled, so the ``dropout_rate`` argument is not used.
    """
    class_scores = forward(X.T, weights, biases, activations, dropout_rate=0.0, training=False)[0]
    # Highest-scoring class per column (i.e. per example).
    return np.argmax(class_scores, axis=0)

# ======================================================
# Training Loop (Mini-Batch Gradient Descent)
# ======================================================
def train(X, y, X_val, y_val, weights, biases, activations, epochs=100, learning_rate=0.01, batch_size=128, dropout_rate=0.0):
    """Train the network with mini-batch gradient descent.

    Args:
        X: Training inputs, shape (features, number_of_examples).
        y: Training targets, shape (classes, number_of_examples).
        X_val: Validation inputs as passed to ``predict`` (one example per row).
        y_val: Validation targets, shape (classes, number_of_examples).
        weights, biases: Per-layer parameters to optimise.
        activations: Activation names per layer.
        epochs: Number of passes over the training data.
        learning_rate: Step size handed to ``backprop``.
        batch_size: Number of examples per mini-batch.
        dropout_rate: Dropout probability handed to ``backprop`` and ``loss``.

    Returns:
        The trained ``weights`` and ``biases``.
    """
    num_examples = X.shape[1]

    for epoch in range(epochs):
        # New random example order for each epoch.
        order = np.random.permutation(num_examples)
        X_epoch = X[:, order]
        y_epoch = y[:, order]

        # One parameter update per mini-batch.
        for start in range(0, num_examples, batch_size):
            batch = slice(start, start + batch_size)
            weights, biases = backprop(
                weights, biases,
                X_epoch[:, batch], y_epoch[:, batch],
                learning_rate, activations, dropout_rate,
            )

        # Progress is reported on every 10th epoch only.
        if epoch % 10 != 0:
            continue

        current_loss = loss(X, y, weights, biases, activations, dropout_rate)

        val_predictions = predict(X_val, weights, biases, activations, dropout_rate)
        val_truth = np.argmax(y_val, axis=0)
        accuracy = np.mean(val_predictions == val_truth) * 100

        print(f"Epoch {epoch}, Loss: {current_loss:.4f}, Validation Accuracy: {accuracy:.2f}%")

    return weights, biases


In [None]:
# Build the train / validation / test splits from the merged data.
train_images, val_images, test_images, train_labels, val_labels, test_labels = preprocessing(data)

# train() expects one example per column, so transpose inputs and targets.
X_train = train_images.to_numpy().T   # shape: (784, number_of_train_examples)
y_train = train_labels.T              # shape: (10, number_of_train_examples)

# Validation targets are compared column-wise too; the validation images stay
# one-example-per-row because predict() transposes them itself.
val_labels = val_labels.T

# Fit the network; the returned parameters replace the initial ones.
weights, biases = train(
    X_train, y_train, val_images, val_labels,
    weights, biases, activations,
    epochs=80, learning_rate=0.01, batch_size=128, dropout_rate=0.0,
)

Epoch 0, Loss: 0.6318, Validation Accuracy: 78.64%
Epoch 10, Loss: 0.3738, Validation Accuracy: 86.21%
Epoch 20, Loss: 0.3080, Validation Accuracy: 87.91%
Epoch 30, Loss: 0.2485, Validation Accuracy: 88.57%
Epoch 40, Loss: 0.2402, Validation Accuracy: 88.63%


In [58]:
# Store the transposed one-hot labels under their own name rather than
# overwriting `test_labels`: re-running this cell then always starts from the
# same (examples, classes) array and gives the same result.
y_test = test_labels.T  # shape: (classes, number_of_test_examples)

# Predicted class per test example (predict() takes one example per row).
predictions = predict(test_images, weights, biases, activations, dropout_rate=0)

# True class index per example, recovered from the one-hot columns.
true_labels = np.argmax(y_test, axis=0)
accuracy = np.mean(predictions == true_labels)
print("Test Accuracy: {:.2f}%".format(accuracy * 100))

Test Accuracy: 89.27%
