In [1]:
# Cell 1: Imports and Setup
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

# Set random seed for reproducibility
np.random.seed(0)


In [2]:
# Cell 2: Helper Functions (Activation and Loss)

def sigmoid(A):
    """Sigmoid activation function."""
    # Ensure numerical stability
    A = np.clip(A, -500, 500)
    return 1 / (1 + np.exp(-A))

def tanh(A):
    """Hyperbolic tangent activation function."""
    return np.tanh(A)

def softmax(A):
    """Softmax activation function for the output layer."""
    # Subtract max for numerical stability
    exp_A = np.exp(A - np.max(A, axis=1, keepdims=True))
    sum_exp_A = np.sum(exp_A, axis=1, keepdims=True)
    return exp_A / sum_exp_A

def tanh_grad(A):
    """Gradient of the tanh function."""
    return 1 - np.tanh(A)**2

def sigmoid_grad(A):
    """Gradient of the sigmoid function."""
    Z = sigmoid(A)
    return Z * (1 - Z)

def cross_entropy_error(y, Z3, batch_size):
    """Calculates the cross-entropy loss (Problem 3)."""
    epsilon = 1e-7
    # y is the one-hot true label
    # L = -1/nb * sum(y * log(Z3))
    return -np.sum(y * np.log(Z3 + epsilon)) / batch_size

In [3]:
# Cell 3: GetMiniBatch Class

class GetMiniBatch:
    """
    Iterator to get a mini-batch (provided in the prompt)
    """

    def __init__(self, X, y, batch_size=20, seed=0):
        self.batch_size = batch_size
        np.random.seed(seed)
        shuffle_index = np.random.permutation(np.arange(X.shape[0]))
        self._X = X[shuffle_index]
        self._y = y[shuffle_index]
        # Use integer division for stop calculation
        self._stop = (X.shape[0] + self.batch_size - 1) // self.batch_size

    def __len__(self):
        return self._stop

    def __getitem__(self, item):
        p0 = item * self.batch_size
        p1 = item * self.batch_size + self.batch_size
        return self._X[p0:p1], self._y[p0:p1]

    def __iter__(self):
        self._counter = 0
        return self

    def __next__(self):
        if self._counter >= self._stop:
            raise StopIteration()
        p0 = self._counter * self.batch_size
        p1 = self._counter * self.batch_size + self.batch_size
        self._counter += 1
        return self._X[p0:p1], self._y[p0:p1]

In [4]:
# Cell 4: ScratchSimpleNeuralNetrowkClassifier (Problems 1, 2, 4, 5, 6, 7)

class ScratchSimpleNeuralNetrowkClassifier:
    """
    Simple three-layer neural network classifier using tanh/sigmoid and SGD.
    """

    def __init__(self, n_features=784, n_nodes1=400, n_nodes2=200, n_output=10,
                 sigma=0.01, lr=0.01, n_epoch=10, batch_size=20,
                 activation_func=tanh, verbose=True):

        self.verbose = verbose
        self.n_features = n_features
        self.n_nodes1 = n_nodes1
        self.n_nodes2 = n_nodes2
        self.n_output = n_output
        self.sigma = sigma
        self.lr = lr
        self.n_epoch = n_epoch
        self.batch_size = batch_size
        self.activation_func = activation_func

        # Select the correct gradient for backprop
        self.activation_grad = tanh_grad if activation_func == tanh else sigmoid_grad

        # Loss history for plotting (Problem 7)
        self.loss = []
        self.val_loss = []

        # Initialize weights and biases (Problem 1)
        self.W1 = self.sigma * np.random.randn(self.n_features, self.n_nodes1)
        self.B1 = self.sigma * np.random.randn(self.n_nodes1)
        self.W2 = self.sigma * np.random.randn(self.n_nodes1, self.n_nodes2)
        self.B2 = self.sigma * np.random.randn(self.n_nodes2)
        self.W3 = self.sigma * np.random.randn(self.n_nodes2, self.n_output)
        self.B3 = self.sigma * np.random.randn(self.n_output)


    def _forward(self, X):
        """Performs forward propagation (Problem 2)."""

        # Layer 1
        A1 = X @ self.W1 + self.B1
        Z1 = self.activation_func(A1)

        # Layer 2
        A2 = Z1 @ self.W2 + self.B2
        Z2 = self.activation_func(A2)

        # Layer 3 (Output Layer)
        A3 = Z2 @ self.W3 + self.B3
        Z3 = softmax(A3)

        return Z3, A3, Z2, A2, Z1, A1


    def _backward_and_update(self, X, Y, Z3, Z2, A2, Z1, A1):
        """Performs back-propagation and updates parameters (Problem 4)."""

        batch_size = X.shape[0]

        # --- 3rd Layer ---
        grad_A3 = (Z3 - Y) / batch_size
        grad_W3 = Z2.T @ grad_A3
        grad_B3 = np.sum(grad_A3, axis=0)
        grad_Z2 = grad_A3 @ self.W3.T

        # --- 2nd Layer ---
        grad_A2 = grad_Z2 * self.activation_grad(A2)
        grad_W2 = Z1.T @ grad_A2
        grad_B2 = np.sum(grad_A2, axis=0)
        grad_Z1 = grad_A2 @ self.W2.T

        # --- 1st Layer ---
        grad_A1 = grad_Z1 * self.activation_grad(A1)
        grad_W1 = X.T @ grad_A1
        grad_B1 = np.sum(grad_A1, axis=0)

        # Update weights and biases (SGD)
        self.W1 -= self.lr * grad_W1
        self.B1 -= self.lr * grad_B1
        self.W2 -= self.lr * grad_W2
        self.B2 -= self.lr * grad_B2
        self.W3 -= self.lr * grad_W3
        self.B3 -= self.lr * grad_B3


    def fit(self, X, y, X_val=None, y_val=None):
        """Learns the neural network classifier (Problem 6)."""

        if self.verbose:
            print("Starting training...")
            print("Epoch | Train Loss | Val Loss | Train Acc | Val Acc")
            print("-" * 51)

        for epoch in range(1, self.n_epoch + 1):
            epoch_loss_sum = 0

            # Use mini-batch iterator
            get_mini_batch = GetMiniBatch(X, y, batch_size=self.batch_size)

            for mini_X_train, mini_y_train in get_mini_batch:

                # Forward Propagation
                Z3, A3, Z2, A2, Z1, A1 = self._forward(mini_X_train)

                # Calculate loss (sum over the batch, scaled for the number of samples in the batch)
                current_batch_size = mini_X_train.shape[0]
                batch_loss = cross_entropy_error(mini_y_train, Z3, current_batch_size)
                epoch_loss_sum += batch_loss * current_batch_size # Accumulate total loss sum

                # Back-propagation and update
                self._backward_and_update(mini_X_train, mini_y_train, Z3, Z2, A2, Z1, A1)

            # Average training loss for the epoch
            avg_train_loss = epoch_loss_sum / len(X)
            self.loss.append(avg_train_loss)

            # Validation and Accuracy
            if X_val is not None and y_val is not None:
                # Validation Loss
                Z3_val, *_ = self._forward(X_val)
                val_loss = cross_entropy_error(y_val, Z3_val, len(X_val))
                self.val_loss.append(val_loss)

                # Accuracy (requires converting one-hot to single-label)
                y_pred_train = self.predict(X)
                y_pred_val = self.predict(X_val)

                y_single_train = np.argmax(y, axis=1)
                y_single_val = np.argmax(y_val, axis=1)

                train_acc = accuracy_score(y_single_train, y_pred_train)
                val_acc = accuracy_score(y_single_val, y_pred_val)

                if self.verbose:
                    print(f"{epoch:5} | {avg_train_loss:.6f} | {val_loss:.6f} | {train_acc:.4f} | {val_acc:.4f}")
            elif self.verbose:
                print(f"{epoch:5} | {avg_train_loss:.6f}")


    def predict(self, X):
        """Estimates class labels (Problem 5)."""
        Z3, *_ = self._forward(X)
        # Find the index of the highest probability
        y_pred = np.argmax(Z3, axis=1)
        return y_pred

In [5]:
from keras.datasets import mnist

(X_train, y_train), (X_test, y_test) = mnist.load_data()


Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz
[1m11490434/11490434[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


In [6]:
"""X_train = X_train.reshape(-1, 784)
X_test = X_test.reshape(-1, 784)"""

'X_train = X_train.reshape(-1, 784)\nX_test = X_test.reshape(-1, 784)'

In [7]:
"""X_train = X_train.astype(np.float64)
X_test = X_test.astype(np.float64)
X_train /= 255
X_test /= 255
print(X_train.max())  # 1.0
print(X_train.min())  # 0."""

'X_train = X_train.astype(np.float64)\nX_test = X_test.astype(np.float64)\nX_train /= 255\nX_test /= 255\nprint(X_train.max())  # 1.0\nprint(X_train.min())  # 0.'

In [8]:


import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

# --- Data Preprocessing Steps ---

# 1. Combine training and testing data for a balanced 60/20/20 split
X_all = np.concatenate([X_train, X_test], axis=0)
y_int_all = np.concatenate([y_train, y_test], axis=0)

# 2. Flatten Features (28x28 -> 784) and Normalize (0-255 -> 0-1)
# Convert to float and divide by 255
X_all = X_all.reshape(-1, 784).astype(float) / 255.0

# 3. One-Hot Encoding (OHE) for labels
# The loss function (cross-entropy) and output layer require OHE labels.
enc = OneHotEncoder(sparse_output=False, categories='auto')
y_one_hot_all = enc.fit_transform(y_int_all.reshape(-1, 1))

# 4. Data Splitting: Train (60%), Validation (20%), Test (20%)
# First split: Train (60%) vs. Temp (40%)
X_train, X_temp, y_train_ohe, y_temp_ohe = train_test_split(
    X_all, y_one_hot_all, test_size=0.4, random_state=42
)

# Second split: Validation (50% of Temp) vs. Test (50% of Temp) -> 20% each
X_val, X_test, y_val_ohe, y_test_ohe = train_test_split(
    X_temp, y_temp_ohe, test_size=0.5, random_state=42
)

# Rename OHE labels for compatibility with the ScratchSimpleNeuralNetrowkClassifier.fit()
y_train = y_train_ohe
y_val = y_val_ohe

# --- Set Global Parameters ---
N_FEATURES = X_train.shape[1]
N_OUTPUT = y_train.shape[1]

print("--- MNIST Data Preprocessing Complete ---")
print(f"N_FEATURES: {N_FEATURES}, N_OUTPUT: {N_OUTPUT}")
print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
print(f"X_val shape: {X_val.shape}, y_val shape: {y_val.shape}")
print(f"X_test shape: {X_test.shape}, y_test shape: {y_test.shape}")

--- MNIST Data Preprocessing Complete ---
N_FEATURES: 784, N_OUTPUT: 10
X_train shape: (42000, 784), y_train shape: (42000, 10)
X_val shape: (14000, 784), y_val shape: (14000, 10)
X_test shape: (14000, 784), y_test shape: (10000,)


In [None]:
# Cell 6: Training and Evaluation (Problem 6)

# Hyperparameters
EPOCHS = 30
LEARNING_RATE = 0.01
BATCH_SIZE = 100

# Initialize the classifier
nn_classifier = ScratchSimpleNeuralNetrowkClassifier(
    n_features=N_FEATURES, n_nodes1=400, n_nodes2=200, n_output=N_OUTPUT,
    lr=LEARNING_RATE, n_epoch=EPOCHS, batch_size=BATCH_SIZE,
    activation_func=tanh, verbose=True
)

# Train the model
nn_classifier.fit(X_train, y_train, X_val, y_val)

# Final Estimation on Validation Data
y_pred_val = nn_classifier.predict(X_val)

# Get true single labels for accuracy check
y_val_single = np.argmax(y_val, axis=1)

# Calculate Accuracy
final_accuracy = accuracy_score(y_val_single, y_pred_val)
print(f"\nFinal Validation Accuracy: {final_accuracy:.4f}")

Starting training...
Epoch | Train Loss | Val Loss | Train Acc | Val Acc
---------------------------------------------------
    1 | 2.295777 | 2.287416 | 0.3357 | 0.3343
    2 | 2.263188 | 2.215753 | 0.4052 | 0.4034
    3 | 1.978252 | 1.625892 | 0.4414 | 0.4427
    4 | 1.314704 | 1.060700 | 0.6727 | 0.6683


In [None]:
X_train, X_val, y_train_one_hot, y_val_one_hot = train_test_split(X_train, y_train_one_hot, test_size=0.2, random_state=42)
print(X_train.shape)
print(X_val.shape)

In [None]:
# Cell 7: Plotting the Learning Curve (Problem 7)

plt.figure(figsize=(10, 6))
plt.plot(range(1, nn_classifier.n_epoch + 1), nn_classifier.loss, label='Training Loss')
if nn_classifier.val_loss:
    plt.plot(range(1, nn_classifier.n_epoch + 1), nn_classifier.val_loss, label='Validation Loss')

plt.title('Learning Curve (Cross-Entropy Error)')
plt.xlabel('Epoch')
plt.ylabel('Loss (Cross-Entropy Error)')
plt.legend()
plt.grid(True)
plt.show()