In [160]:
import numpy as np
from torchvision.datasets import MNIST

def download_mnist(is_train: bool):
    dataset = MNIST(root='./data',
                    transform=lambda x: np.array(x).flatten(),
                    download=True,
                    train=is_train)
    mnist_data = []
    mnist_labels = []
    for image, label in dataset:
        mnist_data.append(image)
        mnist_labels.append(label)
    return np.array(mnist_data), np.array(mnist_labels)


train_X, train_Y = download_mnist(True)
test_X, test_Y = download_mnist(False)

In [161]:
train_X = np.array(train_X)/255.0
test_X = np.array(test_X)/255.0

def convert_labels(labels):

    labels = np.array(labels).astype(int)
    classes = 10
    matrix = np.zeros((labels.shape[0], classes))
    matrix[np.arange(labels.shape[0]), labels] = 1

    return matrix

train_Y = convert_labels(train_Y)
print(train_Y.shape[0])

test_Y = convert_labels(test_Y)

60000


In [162]:
np.random.seed(50)
input_size = train_X.shape[1]  # 784
hidden_layer = 100
classes = 10
dropout_rate = 0.1
reg_lambda = 0.0001


W1 = np.random.uniform( low=-np.sqrt(1. / (input_size + hidden_layer)), high=np.sqrt(1. / (input_size + hidden_layer)), size=(input_size, hidden_layer))
b1 = np.zeros(hidden_layer)

W2 = np.random.uniform( low=-np.sqrt(1. / (hidden_layer + classes)), high=np.sqrt(1. / (hidden_layer + classes)), size=(hidden_layer, classes))
b2 = np.zeros(classes)

In [163]:
def sigmoid(z, clip_min=-4, clip_max=4):
    z = np.clip(z, clip_min, clip_max)
    return 1 / (1 + np.exp(-z))

def sigmoid_derivative(z):
    return sigmoid(z) * (1 - sigmoid(z))

def softmax(z):
    exp_z = np.exp(z - np.max(z, axis=1, keepdims=True))  #vrem pe fiecare rand
    return exp_z / exp_z.sum(axis=1, keepdims=True)

def cross_entropy_loss(y, y_pred):
    eps = 1e-8
    return -np.sum(y * np.log(y_pred+eps))

def apply_dropout(A, rate, is_training=True):
    if is_training:
       mask = np.random.binomial(1, 1 - rate, size=A.shape)
       return A * mask / (1 - rate)
    return A


In [164]:
def gradient_descent(X, y, W1, b1, W2, b2, learning_rate=0.01, is_training=True):
    #forward propagation

    z1 = sigmoid(X@W1 + b1)
    z1 = apply_dropout(z1, dropout_rate, is_training=is_training)

    y_pred = softmax(z1@W2 + b2)

    loss = cross_entropy_loss(y, y_pred)
    loss += reg_lambda * (np.sum(W1**2) + np.sum(W2**2))

    # backward propagation
    err_output = y_pred - y
    dW2 = (z1.T@err_output) / X.shape[0] + reg_lambda * W2
    db2 = np.sum(err_output, axis=0) / X.shape[0]

    error_hidden = err_output@W2.T * sigmoid_derivative(z1)
    dW1 = (X.T@error_hidden) / X.shape[0] + reg_lambda * W1
    db1 = np.sum(error_hidden, axis = 0) / X.shape[0]

    # Update weights and biases
    W2 -= learning_rate * dW2
    b2 -= learning_rate * db2
    W1 -= learning_rate * dW1
    b1 -= learning_rate * db1

    return W1, b1, W2, b2, loss

In [165]:
def update_learning_rate(current_loss, best_loss, lr, threshold, decay_factor, min_lr, epochs_since_improvement):
    if current_loss < best_loss:
        best_loss = current_loss
        epochs_since_improvement = 0
    else:
        epochs_since_improvement += 1


    if epochs_since_improvement >= threshold:
        lr = max(lr * decay_factor, min_lr)
        epochs_since_improvement = 0

    return lr, best_loss, epochs_since_improvement


In [166]:
def train_with_scheduler(train_X, train_Y, test_X, test_Y, W1, b1, W2, b2, epochs=100, batch_size=100, initial_lr=0.01, threshold=2, decay_factor=0.2, min_lr=0.0009):

    num_batches = np.ceil(train_X.shape[0] / batch_size).astype(int)


    lr = initial_lr
    best_loss = float('inf')
    epochs_since_improvement = 0

    for epoch in range(epochs):

        shuffle_indices = np.random.permutation(train_X.shape[0])
        train_X_shuffled = train_X[shuffle_indices]
        train_Y_shuffled = train_Y[shuffle_indices]

        epoch_loss = 0
        for i in range(num_batches):
            start = i * batch_size
            end = min(start + batch_size, train_X.shape[0])

            X_batch = train_X_shuffled[start:end]
            y_batch = train_Y_shuffled[start:end]


            W1, b1, W2, b2, batch_loss = gradient_descent(X_batch, y_batch, W1, b1, W2, b2, learning_rate=lr)

            epoch_loss += batch_loss

        epoch_loss /= num_batches

        lr, best_loss, epochs_since_improvement = update_learning_rate(
            epoch_loss, best_loss, lr, threshold, decay_factor, min_lr, epochs_since_improvement)

        train_acc = accuracy(train_X, train_Y, W1, b1, W2, b2)
        val_acc = accuracy(test_X, test_Y, W1, b1, W2, b2)
        print(f"Epoch {epoch + 1}/{epochs}, Loss: {epoch_loss:.4f}, Learning Rate: {lr:.6f}")
        print(f"Training Accuracy: {train_acc * 100:.2f}%, Validation Accuracy: {val_acc * 100:.2f}%\n")

    return W1, b1, W2, b2


W1, b1, W2, b2 = train_with_scheduler(train_X, train_Y, test_X, test_Y, W1, b1, W2, b2, epochs=130, batch_size=100, initial_lr=0.01, threshold=3, decay_factor=0.2, min_lr=0.001)


# Measure accuracy
def accuracy(X, y, W1, b1, W2, b2):

    z1 = sigmoid(X@W1 + b1)

    z2 = z1@W2+ b2
    y_pred = softmax(z2)

    predicted_classes = np.argmax(y_pred, axis=1)
    true_classes = np.argmax(y, axis=1)
    return np.mean(predicted_classes == true_classes)



test_accuracy = accuracy(test_X, test_Y, W1, b1, W2, b2)
print(f"Testing Data Accuracy: {test_accuracy * 100:.2f}%")

Epoch 1/130, Loss: 224.2051, Learning Rate: 0.010000
Training Accuracy: 45.21%, Validation Accuracy: 46.11%

Epoch 2/130, Loss: 203.2630, Learning Rate: 0.010000
Training Accuracy: 62.52%, Validation Accuracy: 63.60%

Epoch 3/130, Loss: 167.8555, Learning Rate: 0.010000
Training Accuracy: 71.27%, Validation Accuracy: 72.04%

Epoch 4/130, Loss: 132.1030, Learning Rate: 0.010000
Training Accuracy: 76.62%, Validation Accuracy: 77.45%

Epoch 5/130, Loss: 106.9828, Learning Rate: 0.010000
Training Accuracy: 79.78%, Validation Accuracy: 80.55%

Epoch 6/130, Loss: 90.8232, Learning Rate: 0.010000
Training Accuracy: 81.79%, Validation Accuracy: 82.18%

Epoch 7/130, Loss: 79.8663, Learning Rate: 0.010000
Training Accuracy: 83.39%, Validation Accuracy: 83.92%

Epoch 8/130, Loss: 72.2334, Learning Rate: 0.010000
Training Accuracy: 84.38%, Validation Accuracy: 84.95%

Epoch 9/130, Loss: 66.5252, Learning Rate: 0.010000
Training Accuracy: 85.13%, Validation Accuracy: 85.79%

Epoch 10/130, Loss: 62.