In [31]:
from sklearn.datasets import load_digits
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score


In [32]:
def one_hot_encode(y, num_classes):
    encoded = np.zeros((y.size, num_classes))
    encoded[np.arange(y.size), y] = 1
    return encoded

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def d_sigmoid(x):
    s = sigmoid(x)
    return s * (1 - s)

def tanh(x):
    return np.tanh(x)

def d_tanh(x):
    return 1-np.tanh(x)**2

def relu(x):
    return np.maximum(0, x)

def d_relu(x):
    return (x > 0).astype(float)

def mse(y_true, y_pred):
    return np.mean((y_true - y_pred) ** 2)

def d_mse(y_true, y_pred):
    return 2 * (y_pred - y_true) / y_true.size


In [33]:
class NeuralNetwork:
    def __init__(self, input_size, hidden_size, output_size): 
        self.W1 = np.random.randn(input_size, hidden_size) * 0.01 #weight matrix from input to hidden layer
        self.b1 = np.zeros((1, hidden_size)) #bias vector for hidden layer
        self.W2 = np.random.randn(hidden_size, output_size) * 0.01 #weight matrix from hidden to output layer 
        self.b2 = np.zeros((1, output_size)) #bias vector for output layer

      
    def __init__(self, input_size, hidden_size, output_size, activation, d_activation):
        self.W1 = np.random.randn(input_size, hidden_size) * 0.01
        self.b1 = np.zeros((1, hidden_size))
        self.W2 = np.random.randn(hidden_size, output_size) * 0.01
        self.b2 = np.zeros((1, output_size))

        self.activation = activation              # hidden layer activation
        self.d_activation = d_activation         


    def feedforward(self, X):
        self.Z1 = X @ self.W1 + self.b1
        self.A1 = self.activation(self.Z1)       
        self.Z2 = self.A1 @ self.W2 + self.b2
        self.A2 = self.activation(self.Z2)              
        return self.A2

        

    def backpropagate(self, X, y):
        m = X.shape[0]

        dZ2 = 2 * (self.A2 - y) * self.d_activation(self.Z2)
        dW2 = self.A1.T @ dZ2
        db2 = np.sum(dZ2, axis=0, keepdims=True)

        dZ1 = (dZ2 @ self.W2.T) * self.d_activation(self.Z1)
        dW1 = X.T @ dZ1
        db1 = np.sum(dZ1, axis=0, keepdims=True)

        return dW1 / m, db1 / m, dW2 / m, db2 / m

        
    
    def train(self, X, y, epochs, lr):
        for epoch in range(epochs):
            output = self.feedforward(X) 
            loss = mse(y, output)
            dW1, db1, dW2, db2 = self.backpropagate(X, y)

            self.W1 -= lr * dW1
            self.b1 -= lr * db1
            self.W2 -= lr * dW2
            self.b2 -= lr * db2

            if epoch % 100 == 0:
                print(f"Epoch {epoch}, Loss: {loss:.4f}")
                

    def predict(self, X):
        probs = self.feedforward(X) #array-row corresponds to predicted possibilities for each class
        return np.argmax(probs, axis=1)

    def evaluate(self, X, y_true):
        y_pred = self.predict(X)
        return accuracy_score(np.argmax(y_true, axis=1), y_pred)


In [43]:
digits = load_digits()
X = digits.data #features
y = digits.target #actual digits corresponding to each image

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

y_encoded = one_hot_encode(y, 10)

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_encoded, test_size=0.2, random_state=42)

# training NN 
#nn = NeuralNetwork(64, 30, 10)
#nn.train(X_train, y_train, epochs=1000, lr=0.5)

#accuracy = nn.evaluate(X_test, y_test)
#print(f"Test Accuracy (Sigmoid): {accuracy * 100:.2f}%")

activations = {
    "sigmoid": (sigmoid, d_sigmoid),
    "tanh": (tanh, d_tanh),
    "relu": (relu, d_relu)
}

for name, (act_fn, d_act_fn) in activations.items():
    print(f"\nTraining with {name} activation:")
    nn = NeuralNetwork(64, 30, 10, act_fn, d_act_fn)
    nn.train(X_train, y_train, epochs=1000, lr=0.5)
    accuracy = nn.evaluate(X_test, y_test)
    print(f"{name.capitalize()} Accuracy: {accuracy * 100:.2f}%")




Training with sigmoid activation:
Epoch 0, Loss: 0.2472
Epoch 100, Loss: 0.0896
Epoch 200, Loss: 0.0830
Epoch 300, Loss: 0.0536
Epoch 400, Loss: 0.0376
Epoch 500, Loss: 0.0266
Epoch 600, Loss: 0.0198
Epoch 700, Loss: 0.0145
Epoch 800, Loss: 0.0116
Epoch 900, Loss: 0.0097
Sigmoid Accuracy: 95.83%

Training with tanh activation:
Epoch 0, Loss: 0.1002
Epoch 100, Loss: 0.0318
Epoch 200, Loss: 0.0289
Epoch 300, Loss: 0.0264
Epoch 400, Loss: 0.0229
Epoch 500, Loss: 0.0232
Epoch 600, Loss: 0.0215
Epoch 700, Loss: 0.0223
Epoch 800, Loss: 0.0210
Epoch 900, Loss: 0.0199
Tanh Accuracy: 96.67%

Training with relu activation:
Epoch 0, Loss: 0.0998
Epoch 100, Loss: 0.0051
Epoch 200, Loss: 0.0035
Epoch 300, Loss: 0.0028
Epoch 400, Loss: 0.0022
Epoch 500, Loss: 0.0017
Epoch 600, Loss: 0.0014
Epoch 700, Loss: 0.0012
Epoch 800, Loss: 0.0012
Epoch 900, Loss: 0.0011
Relu Accuracy: 97.50%



The optimal learning rate was found to be 0.5, especially for the sigmoid activation function, which showed a significant improvement in performance in this case. When using a smaller learning rate, such as 0.1, the model achieved very low accuracy (~41.67%) with sigmoid, while both tanh and ReLU maintained high accuracies (~95.8% and 97.2%, respectively).

This likely happened because of how the sigmoid function tends to saturate, especially with smaller weight updates. At low learning rates, the gradients can become small (vanishing gradient problem), so the model updates very slowly, which affects learning. On the other hand, tanh (which is zero-centered) and ReLU (which avoids saturation in the positive range) are better at keeping the gradient strong enough to learn even with smaller learning rates.

Increasing the learning rate to 0.5 allowed the sigmoid function to escape these slow learning regions, improving its accuracy significantly. However,the learning rate was not set too high even though the accuracy for sigmoid activation function was slightly better with higher learning rates, as this could lead to unstable training or overshooting the minimum

Similarly, increasing the number of epochs to 1000 was ideal for the sigmoid activation, giving the network more time to converge and make meaningful updates. When fewer epochs were used, the sigmoid-based model struggled to achieve good performance due to its slower learning dynamics. The number of epochs was capped at 1000, even though training for longer slightly improved the accuracy for sigmoid to balance performance with training time. Since sigmoid typically learns more slowly than ReLU or tanh, giving it more epochs helps it catch up but 1000 epochs were found to be a time reasonable upper limit for consistent training across all three activations.

