In [1]:
import numpy as np
from keras.datasets import mnist # mnist.load_data()
from PIL import Image

2025-05-08 06:57:42.557272: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
np.random.seed(42)

# Sigmoid activation function and its derivative
def sigmoid(x):
    return 1 / (1 + np.exp(-np.clip(x, -15, 15)))  # Clip to avoid overflow

def sigmoid_derivative(x):
    return x * (1 - x)

# Softmax function
def softmax(x):
    exp_x = np.exp(x - np.max(x, axis=1, keepdims=True))
    return exp_x / np.sum(exp_x, axis=1, keepdims=True)

# Cross-entropy loss
def cross_entropy_loss(y_pred, y_true):
    m = y_true.shape[0]
    # Add a small epsilon to prevent log(0)
    epsilon = 1e-15
    y_pred = np.clip(y_pred, epsilon, 1.0 - epsilon)
    log_likelihood = -np.sum(y_true * np.log(y_pred)) / m
    return log_likelihood

# One-hot encoding
def one_hot_encode(y, num_classes=10):
    return np.eye(num_classes)[y]


In [3]:
class ConvLayer:
    def __init__(self, num_filters, kernel_size, stride=1):
        self.num_filters = num_filters
        self.kernel_size = kernel_size
        self.stride = stride
        self.filters = None  # Will be initialized after input is known
        self.biases = None

    def forward(self, input_data):
     
        #input_data shape: (batch_size, channels, height, width)
     
        self.input = input_data
        batch_size, channels, height, width = input_data.shape

        if self.filters is None:
            # Filters: (num_filters, channels, kernel_size, kernel_size)
            self.filters = np.random.randn(self.num_filters, channels, self.kernel_size, self.kernel_size) * 0.1
            self.biases = np.zeros(self.num_filters)

        output_height = (height - self.kernel_size) // self.stride + 1
        output_width = (width - self.kernel_size) // self.stride + 1
        output = np.zeros((batch_size, self.num_filters, output_height, output_width))

        for b in range(batch_size):
            for f in range(self.num_filters):
                for h in range(output_height):
                    for w in range(output_width):
                        h_start = h * self.stride
                        h_end = h_start + self.kernel_size
                        w_start = w * self.stride
                        w_end = w_start + self.kernel_size

                        region = input_data[b, :, h_start:h_end, w_start:w_end]
                        output[b, f, h, w] = np.sum(region * self.filters[f]) + self.biases[f]


        self.output_pre_activation = output
        self.output = sigmoid(output)
        return self.output

    def backward(self, dvalues, learning_rate=0.01):
        batch_size, input_channels, input_height, input_width = self.input.shape
        _, _, output_height, output_width = dvalues.shape

        dinputs = np.zeros_like(self.input)
        dfilters = np.zeros_like(self.filters)
        dbiases = np.zeros_like(self.biases)

        dactivation = dvalues * sigmoid_derivative(self.output)

        for b in range(batch_size):
            for f in range(self.num_filters):
                for h in range(output_height):
                    for w in range(output_width):
                        h_start = h * self.stride
                        h_end = h_start + self.kernel_size
                        w_start = w * self.stride
                        w_end = w_start + self.kernel_size

                        region = self.input[b, :, h_start:h_end, w_start:w_end]
                        dfilters[f] += dactivation[b, f, h, w] * region
                        dinputs[b, :, h_start:h_end, w_start:w_end] += dactivation[b, f, h, w] * self.filters[f]
                        dbiases[f] += dactivation[b, f, h, w]

        self.filters -= learning_rate * dfilters / batch_size
        self.biases -= learning_rate * dbiases / batch_size

        return dinputs


In [4]:
class AvgPoolLayer:
    def __init__(self, pool_size=2, stride=2):
        self.pool_size = pool_size
        self.stride = stride
        
    def forward(self, input_data):
        #Forward pass for average pooling layer
        #input_data shape: (batch_size, channels, height, width)
        self.input = input_data
        batch_size, channels, height, width = input_data.shape
        
        output_height = (height - self.pool_size) // self.stride + 1
        output_width = (width - self.pool_size) // self.stride + 1
        
        output = np.zeros((batch_size, channels, output_height, output_width))
        
        for b in range(batch_size):
            for c in range(channels):
                for h in range(output_height):
                    for w in range(output_width):
                        h_start = h * self.stride
                        h_end = h_start + self.pool_size
                        w_start = w * self.stride
                        w_end = w_start + self.pool_size
                        
                        output[b, c, h, w] = np.mean(input_data[b, c, h_start:h_end, w_start:w_end])
        
        self.output = output
        return self.output

    def backward(self, dvalues):
   
        #Backward pass for average pooling layer
        #dvalues shape: (batch_size, channels, output_height, output_width)
    
        batch_size, channels, output_height, output_width = dvalues.shape
        dinputs = np.zeros_like(self.input)
        
        for b in range(batch_size):
            for c in range(channels):
                for h in range(output_height):
                    for w in range(output_width):
                        h_start = h * self.stride
                        h_end = h_start + self.pool_size
                        w_start = w * self.stride
                        w_end = w_start + self.pool_size
                        
                        # Distribute gradient equally to all positions in the pooling window
                        avg_gradient = dvalues[b, c, h, w] / (self.pool_size * self.pool_size)
                        dinputs[b, c, h_start:h_end, w_start:w_end] += np.ones((self.pool_size, self.pool_size)) * avg_gradient
        
        return dinputs
        

In [5]:
class FlattenLayer:
    def __init__(self):
        self.input_shape = None
        
    def forward(self, input_data):
      
        #Forward pass for flatten layer
        #input_data shape: (batch_size, channels, height, width)
       
        self.input_shape = input_data.shape
        batch_size = input_data.shape[0]
        return input_data.reshape(batch_size, -1)
    
    def backward(self, dvalues):
      
        #Backward pass for flatten layer
        #dvalues shape: (batch_size, flattened_size)
        
        return dvalues.reshape(self.input_shape)

class Conv1x1Layer:
    def __init__(self, input_channels, output_channels):
       
        #1x1 Convolution layer (equivalent to fully connected layer with specific structure)
       
        self.input_channels = input_channels
        self.output_channels = output_channels
        # Initialize weights with small random values
        self.weights = np.random.randn(input_channels, output_channels) * 0.01
        self.biases = np.zeros(output_channels)
        
    def forward(self, input_data):
        
        #Forward pass for 1x1 convolution layer
        #input_data shape: (batch_size, flattened_size)
        
        self.input = input_data
        # Compute output: y = x * W + b
        output = np.dot(input_data, self.weights) + self.biases
        return output
    
    def backward(self, dvalues, learning_rate=0.01):
      
        #Backward pass for 1x1 convolution layer
        #dvalues shape: (batch_size, output_channels)
       
        batch_size = dvalues.shape[0]
        
        dweights = np.dot(self.input.T, dvalues) / batch_size
        dbiases = np.sum(dvalues, axis=0) / batch_size
        
        dinputs = np.dot(dvalues, self.weights.T)
        
        self.weights -= learning_rate * dweights
        self.biases -= learning_rate * dbiases
        
        return dinputs

In [6]:
(X_train, y_train), (X_test, y_test) = mnist.load_data()

X_train = X_train.astype(np.float32) / 255.0
X_test = X_test.astype(np.float32) / 255.0

# Channel dimension: (N, 28, 28) -> (N, 1, 28, 28)
X_train = X_train[:, np.newaxis, :, :]
X_test = X_test[:, np.newaxis, :, :]

y_train_onehot = one_hot_encode(y_train)
y_test_onehot = one_hot_encode(y_test)

conv_layer = ConvLayer(num_filters=2, kernel_size=3, stride=1)
pool_layer = AvgPoolLayer(pool_size=2, stride=2)
flatten_layer = FlattenLayer()
conv1x1_layer = Conv1x1Layer(input_channels=2*13*13, output_channels=10)  # Size after pooling: 2x13x13

epochs = 35
batch_size = 32
learning_rate = 0.01

for epoch in range(epochs):
    epoch_loss = 0
    correct_predictions = 0
    

    indices = np.random.permutation(len(X_train))
    X_train_shuffled = X_train[indices]
    y_train_onehot_shuffled = y_train_onehot[indices]
    
    for i in range(0, len(X_train), batch_size):
        # Get batch
        X_batch = X_train_shuffled[i:i+batch_size]
        y_batch = y_train_onehot_shuffled[i:i+batch_size]
        
        current_batch_size = X_batch.shape[0]

  # Forward pass
        conv_out = conv_layer.forward(X_batch)
        pool_out = pool_layer.forward(conv_out)
        flattened = flatten_layer.forward(pool_out)
        logits = conv1x1_layer.forward(flattened)
        predictions = softmax(logits)
        
        loss = cross_entropy_loss(predictions, y_batch)
        epoch_loss += loss
        
        correct_predictions += np.sum(np.argmax(predictions, axis=1) == np.argmax(y_batch, axis=1))
        
        # Backpropagation
        # Gradient of softmax with cross-entropy
        dvalues = predictions.copy()
        dvalues[range(current_batch_size), np.argmax(y_batch, axis=1)] -= 1
        dvalues /= current_batch_size
        
        # Backward pass through each layer
        dvalues = conv1x1_layer.backward(dvalues, learning_rate)
        dvalues = flatten_layer.backward(dvalues)
        dvalues = pool_layer.backward(dvalues)
        dvalues = conv_layer.backward(dvalues, learning_rate)
    
    accuracy = correct_predictions / len(X_train)
    print(f"Epoch {epoch+1}/{epochs}, Loss: {epoch_loss/len(X_train):.4f}, Accuracy: {accuracy * 100:.2f}%")

correct_test = 0
test_loss = 0


for i in range(0, len(X_test), batch_size):
    X_batch = X_test[i:i+batch_size]
    y_batch = y_test_onehot[i:i+batch_size]
    
    # Forward pass
    conv_out = conv_layer.forward(X_batch)
    pool_out = pool_layer.forward(conv_out)
    flattened = flatten_layer.forward(pool_out)
    logits = conv1x1_layer.forward(flattened)
    predictions = softmax(logits)

      
    loss = cross_entropy_loss(predictions, y_batch)
    test_loss += loss
    

    correct_test += np.sum(np.argmax(predictions, axis=1) == np.argmax(y_batch, axis=1))

test_accuracy = correct_test / len(X_test)
avg_test_loss = test_loss / (len(X_test) / batch_size)
print(f"\nTest Results:")
print(f"Loss: {avg_test_loss:.4f}, Accuracy: {test_accuracy * 100:.2f}%")
    

Epoch 1/35, Loss: 0.0719, Accuracy: 11.17%
Epoch 2/35, Loss: 0.0718, Accuracy: 11.24%
Epoch 3/35, Loss: 0.0717, Accuracy: 11.25%
Epoch 4/35, Loss: 0.0715, Accuracy: 11.35%
Epoch 5/35, Loss: 0.0713, Accuracy: 11.81%
Epoch 6/35, Loss: 0.0711, Accuracy: 13.44%
Epoch 7/35, Loss: 0.0707, Accuracy: 18.96%
Epoch 8/35, Loss: 0.0702, Accuracy: 25.30%
Epoch 9/35, Loss: 0.0696, Accuracy: 35.42%
Epoch 10/35, Loss: 0.0688, Accuracy: 41.27%
Epoch 11/35, Loss: 0.0678, Accuracy: 49.15%
Epoch 12/35, Loss: 0.0667, Accuracy: 54.40%
Epoch 13/35, Loss: 0.0653, Accuracy: 59.30%
Epoch 14/35, Loss: 0.0636, Accuracy: 62.22%
Epoch 15/35, Loss: 0.0618, Accuracy: 65.17%
Epoch 16/35, Loss: 0.0598, Accuracy: 67.70%
Epoch 17/35, Loss: 0.0577, Accuracy: 69.43%
Epoch 18/35, Loss: 0.0555, Accuracy: 71.10%
Epoch 19/35, Loss: 0.0532, Accuracy: 72.82%
Epoch 20/35, Loss: 0.0508, Accuracy: 74.01%
Epoch 21/35, Loss: 0.0485, Accuracy: 74.97%
Epoch 22/35, Loss: 0.0462, Accuracy: 76.15%
Epoch 23/35, Loss: 0.0441, Accuracy: 76.9

# **Architecture**: 
The network is a simple Convolutional Neural Network (CNN) designed for digit classification on the MNIST dataset (handwritten digits 0-9). It consists of the following layers:

## **Convolutional Layer (ConvLayer):**
Parameters: 2 filters, 3x3 kernel, stride 1.
Input: (batch_size, 1, 28, 28) (MNIST images with 1 channel).
Output: (batch_size, 2, 26, 26) (after convolution: (28-3)//1 + 1 = 26).
Activation: Sigmoid
Extracts spatial features like edges from the input images

## **Average Pooling Layer (AvgPoolLayer):**
Parameters: 2x2 pool size, stride 2.
Input: (batch_size, 2, 26, 26).
Output: (batch_size, 2, 13, 13) (after pooling: (26-2)//2 + 1 = 13).
Reduces spatial dimensions, making the model less sensitive to small translations.

## **Flatten Layer (FlattenLayer):**
Input: (batch_size, 2, 13, 13).
Output: (batch_size, 2*13*13) = (batch_size, 338).
Converts the 4D tensor into a 2D matrix for the fully connected layer.

## **1x1 Convolution Layer (Conv1x1Layer):**
Parameters: Input channels = 338, output channels = 10 (one per digit class).
Input: (batch_size, 338).
Output: (batch_size, 10) (logits for each class).
Acts as a fully connected layer, mapping features to class scores.

Softmax:Applied to the logits to produce a probability distribution over the 10 classes.

# **Implementation:** 
The CNN is implemented from scratch using NumPy, without high-level frameworks like TensorFlow or PyTorch. Key aspects of the implementation include:

## **Data Preparation:**
MNIST dataset is loaded using keras.datasets.mnist.
Images are normalized to [0, 1] by dividing pixel values by 255.
Data is reshaped to (N, 1, 28, 28) to include the channel dimension.
Labels are one-hot encoded (e.g., digit 5 â†’ [0, 0, 0, 0, 0, 1, 0, 0, 0, 0]).

**Forward Pass:** Input images pass through the ConvLayer (convolution + sigmoid), AvgPoolLayer (downsampling), FlattenLayer, and Conv1x1Layer.

Softmax is applied to the final logits to get class probabilities.

**Loss Function:** Cross-entropy loss is used to measure the difference between predicted probabilities and true labels, with an epsilon (1e-15) to avoid log(0).

**Backward Pass:** Gradients are computed manually for each layer using backpropagation.
The gradient of the cross-entropy loss with softmax is calculated as predictions - y_true.
Gradients are propagated backward through the Conv1x1Layer, FlattenLayer, AvgPoolLayer, and ConvLayer, updating weights and biases with a learning rate of 0.01.

## **Training:**
The model is trained for 35 epochs with a batch size of 32.
Data is shuffled at the start of each epoch to improve generalization.
Loss and accuracy are computed and printed for each epoch.

## **Testing:**
The model is evaluated on the test set in batches, computing the loss and accuracy.

The training and test results are as follows:

## **Training:**
Over 35 epochs, the training accuracy improves from 11.17% to 82.91%.
The training loss decreases from 0.0719 to 0.0271.
This indicates the model learns to classify digits effectively on the training data, though the low number of filters (2) limits its capacity.

## **Testing:**
Test Accuracy: 84.21%.
Test Loss: 0.8347.
The test accuracy is slightly higher than the final training accuracy, but the test loss is significantly higher than the training loss (0.8347 vs. 0.0271), suggesting some overfitting. The model generalizes reasonably well but could benefit from regularization techniques like dropout or more data augmentation.

The CNN architecture is minimal, with one convolutional layer (2 filters), one pooling layer, and a fully connected layer, followed by softmax for classification. The network is implemented from scratch, it achieves a test accuracy of 84.21% and a test loss of 0.8347 on MNIST. While the results are decent for such a simple model, the high test loss indicates overfitting, which could be addressed by adding more layers, using ReLU instead of sigmoid, or applying regularization.