## Simple Neural Network for MNIST 

## 1. Import Libraries

In [21]:
import numpy as np
from sklearn.datasets import fetch_openml

## 2. Load Data and Preprocess

In [22]:
# Load the MNIST digits ('mnist_784', version 1) through scikit-learn's OpenML fetcher.
print("Loading data...")
mnist = fetch_openml('mnist_784', version=1, as_frame=False, parser='auto')

# Number of leading samples to keep. Defined once so the feature slice and the
# label slice below can never get out of sync.
N_SAMPLES = 2000

# Use a subset for speed and normalize pixel values (0-255 -> 0-1)
X_raw = mnist.data[:N_SAMPLES] / 255.0
# Labels are cast to int because they are used as row indices just below.
y_raw = mnist.target[:N_SAMPLES].astype(int)

# One-hot encode the labels: row k of the identity matrix is the encoding of
# class k (label 3 -> [0,0,0,1,0,...]).
n_values = 10
Y_ohe = np.eye(n_values)[y_raw]

# Transpose to the (features, examples) layout used by the network code.
X = X_raw.T       # (784, N_SAMPLES)
Y = Y_ohe.T       # (10, N_SAMPLES)

# Cheap sanity checks so a bad download or slice fails here, not deep in training.
assert X.shape[1] == Y.shape[1], "X and Y must contain the same number of examples"
assert Y.shape[0] == n_values, "Y must have one row per class"
assert X.min() >= 0.0 and X.max() <= 1.0, "pixel values should be scaled to [0, 1]"

Loading data...


## 3. Activation Functions Used

In [23]:
def sigmoid(Z):
    """Element-wise logistic sigmoid, 1 / (1 + exp(-Z)), evaluated without overflow.

    The direct formula calls exp(-Z), which overflows (and emits a
    RuntimeWarning) for large negative Z. This version only ever exponentiates
    a non-positive number.

    Args:
        Z: scalar or array-like of pre-activations.

    Returns:
        Sigmoid of Z with the same shape as Z (a scalar for scalar input).
    """
    Z = np.asarray(Z)
    # exp(-|Z|) lies in (0, 1], so it cannot overflow.
    decay = np.exp(-np.abs(Z))
    # Z >= 0: 1 / (1 + exp(-Z)).  Z < 0: exp(Z) / (1 + exp(Z)), the same value rearranged.
    result = np.where(Z >= 0, 1 / (1 + decay), decay / (1 + decay))
    # Indexing with () turns a 0-d result back into a scalar and leaves arrays unchanged.
    return result[()]

def sigmoid_deriv(A):
    """Derivative of the sigmoid, expressed through its output.

    Args:
        A: sigmoid activations (i.e. the value sigmoid already returned),
           not the pre-activation input.

    Returns:
        A * (1 - A), element-wise.
    """
    complement = 1 - A
    return A * complement

# Softmax for multi-class output
def softmax(Z):
    """Softmax taken down axis 0, so every column of the result sums to 1.

    Args:
        Z: array of scores; normalization is applied along axis 0.

    Returns:
        Array of the same shape as Z holding the normalized exponentials.
    """
    # Shifting each column by its own maximum leaves the result unchanged
    # mathematically but keeps exp() from overflowing.
    column_peak = np.max(Z, axis=0, keepdims=True)
    exps = np.exp(Z - column_peak)
    return exps / exps.sum(axis=0, keepdims=True)

## 4. Neural Network
- **Input layer** $a^{[0]}$: 784 units `(28*28*1 pixels)`
- **Hidden layer** $a^{[1]}$: 64 units, sigmoid activation  
- **Output layer** $a^{[2]}$: 10 units, softmax activation (multiclass output)

---

## 4.1 Forward Propagation

$$
Z^{[1]} = W^{[1]} X + b^{[1]}
$$

$$
A^{[1]} = \sigma_{\text{sigmoid}}(Z^{[1]})
$$

$$
Z^{[2]} = W^{[2]} A^{[1]} + b^{[2]}
$$

$$
A^{[2]} = \sigma_{\text{softmax}}(Z^{[2]})
$$


In [24]:
# Parameters Initialization
def init_params(n_x, n_h, n_y, seed=1):
    """Create the initial weights and biases of the two-layer network.

    Weights are standard-normal draws scaled by 0.1; biases start at zero.

    Args:
        n_x: number of input features.
        n_h: number of hidden units.
        n_y: number of output units.
        seed: seed for the private random generator (default 1).

    Returns:
        Tuple (W1, b1, W2, b2) with shapes
        (n_h, n_x), (n_h, 1), (n_y, n_h), (n_y, 1).
    """
    # A private RandomState produces the same draws as np.random.seed(seed)
    # followed by np.random.randn, but leaves NumPy's global random state
    # untouched for the rest of the notebook.
    rng = np.random.RandomState(seed)
    W1 = rng.randn(n_h, n_x) * 0.1
    b1 = np.zeros((n_h, 1))
    W2 = rng.randn(n_y, n_h) * 0.1
    b2 = np.zeros((n_y, 1))
    return W1, b1, W2, b2

# Forward propagation
def forward_prop(W1, b1, W2, b2, X):
    """Run one forward pass through the two-layer network.

    Args:
        W1, b1: hidden-layer weights and bias.
        W2, b2: output-layer weights and bias.
        X: input matrix with one example per column.

    Returns:
        Tuple (Z1, A1, Z2, A2): hidden pre-activation, hidden sigmoid
        activation, output pre-activation, and output softmax activation.
    """
    # Hidden layer: affine map followed by sigmoid.
    hidden_pre = np.dot(W1, X) + b1
    hidden_act = sigmoid(hidden_pre)
    # Output layer: affine map followed by softmax over the classes.
    output_pre = np.dot(W2, hidden_act) + b2
    output_act = softmax(output_pre)
    return hidden_pre, hidden_act, output_pre, output_act


## 4.2 Backward Propagation

$$
dZ^{[2]} = A^{[2]} - Y
$$

$$
dW^{[2]} = \frac{1}{m} \, dZ^{[2]} A^{[1]T}
$$

$$
db^{[2]} = \frac{1}{m} \sum dZ^{[2]}
$$

$$
dZ^{[1]} = W^{[2]T} dZ^{[2]} \circ \sigma'_{\text{sigmoid}}(Z^{[1]}), \qquad \sigma'_{\text{sigmoid}}(Z^{[1]}) = A^{[1]} \circ (1 - A^{[1]})
$$

$$
dW^{[1]} = \frac{1}{m} \, dZ^{[1]} A^{[0]T}
$$

$$
db^{[1]} = \frac{1}{m} \sum dZ^{[1]}
$$


In [25]:
# Backward propagation
def backward_prop(Z1, A1, Z2, A2, W1, W2, X, Y):
    """Compute the gradients for both layers from one forward pass.

    Args:
        Z1, Z2, W1: accepted to keep the call signature, but not read here.
        A1: hidden-layer activations.
        A2: output-layer (softmax) activations.
        W2: output-layer weights.
        X: input matrix with one example per column.
        Y: one-hot targets with one example per column.

    Returns:
        Tuple (dW1, db1, dW2, db2), each averaged over the m = X.shape[1] examples.
    """
    inv_m = 1 / X.shape[1]

    # Output layer: the error signal is simply prediction minus target.
    out_err = A2 - Y
    dW2 = inv_m * np.dot(out_err, A1.T)
    db2 = inv_m * out_err.sum(axis=1, keepdims=True)

    # Hidden layer: push the error back through W2, then through the sigmoid.
    back_signal = np.dot(W2.T, out_err)
    hid_err = back_signal * sigmoid_deriv(A1)
    dW1 = inv_m * np.dot(hid_err, X.T)
    db1 = inv_m * hid_err.sum(axis=1, keepdims=True)

    return dW1, db1, dW2, db2

## 4.3 Parameter Updates

$$
W^{[2]} := W^{[2]} - \alpha \, dW^{[2]}
$$

$$
b^{[2]} := b^{[2]} - \alpha \, db^{[2]}
$$

$$
W^{[1]} := W^{[1]} - \alpha \, dW^{[1]}
$$

$$
b^{[1]} := b^{[1]} - \alpha \, db^{[1]}
$$

---

In [26]:
# Update weights
def update_params(W1, b1, W2, b2, dW1, db1, dW2, db2, alpha):
    """Apply one gradient-descent step and return the updated parameters.

    The inputs are left unmodified: new arrays are returned instead of
    updating in place, so any other reference the caller holds to the old
    parameters (e.g. a saved initial state) is not silently changed.

    Args:
        W1, b1, W2, b2: current parameters.
        dW1, db1, dW2, db2: gradients matching each parameter.
        alpha: learning rate.

    Returns:
        Tuple (W1, b1, W2, b2) holding parameter - alpha * gradient for each.
    """
    W1_next = W1 - alpha * dW1
    b1_next = b1 - alpha * db1
    W2_next = W2 - alpha * dW2
    b2_next = b2 - alpha * db2
    return W1_next, b1_next, W2_next, b2_next

# Prediction
def predict(X, W1, b1, W2, b2):
    """Return the predicted class index for every column of X.

    Args:
        X: input matrix with one example per column.
        W1, b1, W2, b2: network parameters.

    Returns:
        1-D array of class indices: the argmax over axis 0 of the output
        activations.
    """
    # Only the final activation (last item of the forward-pass tuple) is needed.
    output_act = forward_prop(W1, b1, W2, b2, X)[-1]
    return np.argmax(output_act, axis=0)

## 5. Model Training

In [27]:
# Training loop
def gradient_descent(X, Y, iterations=1000, alpha=0.5, n_h=64, print_every=100):
    """Train the two-layer network with full-batch gradient descent.

    Args:
        X: input matrix, shape (features, examples).
        Y: one-hot targets, shape (classes, examples).
        iterations: index of the last iteration. The loop runs over
            0..iterations inclusive, i.e. iterations + 1 update steps, so the
            final iteration is also reported.
        alpha: learning rate.
        n_h: number of hidden units (default 64).
        print_every: report training accuracy every this many iterations;
            0 or None disables reporting.

    Returns:
        Tuple (W1, b1, W2, b2) of trained parameters.

    Raises:
        ValueError: if X and Y do not contain the same number of examples.
    """
    if X.shape[1] != Y.shape[1]:
        raise ValueError(
            f"X has {X.shape[1]} examples but Y has {Y.shape[1]}"
        )

    n_x = X.shape[0]  # input features (784 for flattened 28x28 MNIST images)
    n_y = Y.shape[0]  # output classes, taken from Y so the layer always matches the targets

    W1, b1, W2, b2 = init_params(n_x, n_h, n_y)

    # Convert one-hot Y back to integer labels once, for accuracy checks.
    Y_labels = np.argmax(Y, axis=0)

    for i in range(iterations + 1):
        Z1, A1, Z2, A2 = forward_prop(W1, b1, W2, b2, X)
        dW1, db1, dW2, db2 = backward_prop(Z1, A1, Z2, A2, W1, W2, X, Y)
        W1, b1, W2, b2 = update_params(W1, b1, W2, b2, dW1, db1, dW2, db2, alpha)

        if print_every and i % print_every == 0:
            # Accuracy is measured on the training data with the freshly updated parameters.
            predictions = predict(X, W1, b1, W2, b2)
            acc = np.mean(predictions == Y_labels)
            print(f"Iteration {i}   Accuracy: {acc*100:.2f}%")

    return W1, b1, W2, b2

# Train on the prepared data, spelling out the hyperparameters
# (these are the same values gradient_descent uses by default).
W1, b1, W2, b2 = gradient_descent(X, Y, iterations=1000, alpha=0.5)

Iteration 0   Accuracy: 15.35%
Iteration 100   Accuracy: 87.10%
Iteration 200   Accuracy: 91.30%
Iteration 300   Accuracy: 93.50%
Iteration 400   Accuracy: 94.95%
Iteration 500   Accuracy: 95.55%
Iteration 600   Accuracy: 96.05%
Iteration 700   Accuracy: 96.85%
Iteration 800   Accuracy: 97.60%
Iteration 900   Accuracy: 98.10%
Iteration 1000   Accuracy: 98.50%


## 6. Prediction

In [28]:
# Final prediction check.
# Note: X and Y are the same arrays the model was trained on, so this compares
# predictions against training labels rather than measuring held-out accuracy.
preds = predict(X, W1, b1, W2, b2)
print("\nFirst 20 Predictions:", preds[:20])
print("First 20 True Labels:", Y.argmax(axis=0)[:20])


First 20 Predictions: [5 0 4 1 9 2 1 3 1 4 3 5 3 6 1 7 2 8 6 9]
First 20 True Labels: [5 0 4 1 9 2 1 3 1 4 3 5 3 6 1 7 2 8 6 9]
