# data

In [72]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [89]:
test = pd.read_csv('test.csv')
X_test = test.iloc[:, :]
X_test = X_test / 255.0
X_test = X_test.to_numpy()

In [90]:
mnist = pd.read_csv('train.csv')
# Get the data and labels
X = mnist.iloc[:, 1:]  # Select all columns except the last one (features)
y = mnist.iloc[:, 0]   # Select the last column (label)

In [91]:
len(y)

42000

In [92]:
# Normalize pixel values to [0, 1]
X = X / 255.0
X = X.to_numpy()

In [93]:
X_train_full, X_test, y_train_full, y_test = train_test_split(
    X, y, test_size=1000, random_state=42)

X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_full, y_train_full, test_size=1000, random_state=42)

In [94]:
# One-hot encode the labels
def one_hot_encode(y, num_classes=10):
    one_hot = np.zeros((y.size, num_classes))
    one_hot[np.arange(y.size), y] = 1
    return one_hot

y_train_encoded = one_hot_encode(y_train)
y_valid_encoded = one_hot_encode(y_valid)
y_test_encoded = one_hot_encode(y_test)

In [95]:
input_size = 784
output_size = 10

# model

In [122]:
import numpy as np

class MLP:
    def __init__(self, layer_sizes, activation='relu', task='classification', dropout_rate=0.0):
        self.layer_sizes = layer_sizes
        self.task = task
        self.activation = activation
        self.dropout_rate = dropout_rate
        self.weights = []
        self.biases = []
        self.dropout_masks = []


        for i in range(len(layer_sizes) - 1):
            self.weights.append(np.random.randn(layer_sizes[i], layer_sizes[i + 1]) * np.sqrt(2 / layer_sizes[i]))
            self.biases.append(np.zeros((1, layer_sizes[i + 1])))

    def activation_function(self, Z, derivative=False):
        if self.activation == 'relu':
            if derivative:
                return Z > 0
            return np.maximum(0, Z)
        elif self.activation == 'tanh':
            if derivative:
                return 1 - np.tanh(Z) ** 2
            return np.tanh(Z)
        elif self.activation == 'gelu':
            if derivative:
                return 0.5 * (1 + np.tanh(np.sqrt(2 / np.pi) * (Z + 0.044715 * Z ** 3)))
            return 0.5 * Z * (1 + np.tanh(np.sqrt(2 / np.pi) * (Z + 0.044715 * Z ** 3)))

    def softmax(self, Z):
        expZ = np.exp(Z - np.max(Z, axis=1, keepdims=True))
        return expZ / np.sum(expZ, axis=1, keepdims=True)

    def forward(self, X, is_training=True):
        A = X
        cache = [(None, X)]  # (Z, A) pairs
        self.dropout_masks = []

        for i, (W, b) in enumerate(zip(self.weights[:-1], self.biases[:-1])):
            Z = np.dot(A, W) + b
            A = self.activation_function(Z)

            # Apply dropout during training
            if is_training and self.dropout_rate > 0:
                dropout_mask = np.random.rand(*A.shape) > self.dropout_rate
                A *= dropout_mask  # Drop neurons
                A /= (1 - self.dropout_rate)  # Scale output during training
                self.dropout_masks.append(dropout_mask)
            else:
                self.dropout_masks.append(None)

            cache.append((Z, A))

        # Output
        Z = np.dot(A, self.weights[-1]) + self.biases[-1]
        if self.task == 'classification':
            A = self.softmax(Z)
        elif self.task == 'regression':
            A = Z
        cache.append((Z, A))

        return A, cache

    def compute_loss(self, y_pred, y_true):
        if self.task == 'classification':
            return -np.sum(y_true * np.log(y_pred + 1e-8)) / len(y_true)
        elif self.task == 'regression':
            return np.mean((y_pred - y_true) ** 2)

    def backward(self, X, y_true, cache):
        grads = []
        m = y_true.shape[0]

        Z_last, A_last = cache[-1]
        if self.task == 'classification':
            dZ = A_last - y_true
        elif self.task == 'regression':
            dZ = (A_last - y_true) / m

        for i in reversed(range(len(self.weights))):
            A_prev = cache[i][1]
            dW = np.dot(A_prev.T, dZ) / m
            db = np.sum(dZ, axis=0, keepdims=True) / m

            if i > 0:
                Z_prev = cache[i][0]
                dA_prev = np.dot(dZ, self.weights[i].T)

                # Apply dropout mask
                if self.dropout_masks[i - 1] is not None:
                    dA_prev *= self.dropout_masks[i - 1]
                    dA_prev /= (1 - self.dropout_rate)

                dZ = dA_prev * self.activation_function(Z_prev, derivative=True)

            grads.append((dW, db))

        grads.reverse()
        return grads

    def update_parameters(self, grads, learning_rate):
        for i in range(len(self.weights)):
            self.weights[i] -= learning_rate * grads[i][0]
            self.biases[i] -= learning_rate * grads[i][1]

    def accuracy(self, y_pred, y_true):
        preds = np.argmax(y_pred, axis=1)
        true_vals = np.argmax(y_true, axis=1)
        return np.mean(preds == true_vals)

    def train(self, X_train, y_train, X_valid, y_valid, X_test, epochs, learning_rate, batch_size):
        n_samples = X_train.shape[0]

        for epoch in range(epochs):
            indices = np.random.permutation(n_samples)
            X_train = X_train[indices]
            y_train = y_train[indices]

            for batch_start in range(0, n_samples, batch_size):
                batch_X = X_train[batch_start:batch_start + batch_size]
                batch_y = y_train[batch_start:batch_start + batch_size]


                y_pred, cache = self.forward(batch_X)

                loss = self.compute_loss(y_pred, batch_y)

                grads = self.backward(batch_X, batch_y, cache)

                self.update_parameters(grads, learning_rate)


            y_pred_valid, _ = self.forward(X_valid, is_training=False)
            val_acc = self.accuracy(y_pred_valid, y_valid)

            print(f"Epoch {epoch + 1}/{epochs}, Loss: {loss}, Validation Accuracy: {val_acc}")

        y_pred_test, _ = self.forward(X_test, is_training=False)
        return y_pred_test


In [123]:
mlp = MLP(layer_sizes=[784, 32, 10], activation='tanh', task='classification', dropout_rate=0.2)
y_pred = mlp.train(X_train, y_train_encoded, X_valid, y_valid_encoded, X_test, epochs=22, learning_rate=0.1, batch_size=64)

Epoch 1/22, Loss: 0.33723219879319155, Validation Accuracy: 0.905
Epoch 2/22, Loss: 0.3961536948096187, Validation Accuracy: 0.921
Epoch 3/22, Loss: 0.30210481907975184, Validation Accuracy: 0.932
Epoch 4/22, Loss: 0.1688687412064978, Validation Accuracy: 0.934
Epoch 5/22, Loss: 0.3648909776964, Validation Accuracy: 0.938
Epoch 6/22, Loss: 0.16736143185200103, Validation Accuracy: 0.942
Epoch 7/22, Loss: 0.25919729624258886, Validation Accuracy: 0.939
Epoch 8/22, Loss: 0.22591423251084375, Validation Accuracy: 0.946
Epoch 9/22, Loss: 0.18180360438538432, Validation Accuracy: 0.946
Epoch 10/22, Loss: 0.07646227432718081, Validation Accuracy: 0.948
Epoch 11/22, Loss: 0.18615574007519495, Validation Accuracy: 0.945
Epoch 12/22, Loss: 0.1483283373993225, Validation Accuracy: 0.953
Epoch 13/22, Loss: 0.21906376372298153, Validation Accuracy: 0.951
Epoch 14/22, Loss: 0.15054406561205266, Validation Accuracy: 0.949
Epoch 15/22, Loss: 0.29852721445283004, Validation Accuracy: 0.953
Epoch 16/22

In [124]:
y_pred

array([[1.56060047e-05, 8.05235179e-03, 2.02909123e-02, ...,
        3.01541103e-05, 9.69686922e-01, 3.39075597e-05],
       [2.27263483e-08, 9.99369073e-01, 6.11265379e-05, ...,
        3.29471808e-04, 1.42739355e-04, 7.09888160e-06],
       [1.94068952e-06, 2.28616606e-06, 1.31605548e-07, ...,
        3.93647453e-03, 1.14755411e-04, 9.90771418e-01],
       ...,
       [4.04274100e-07, 9.90849325e-01, 7.98583429e-05, ...,
        8.11795533e-03, 3.53544262e-04, 1.32079751e-04],
       [6.08746499e-05, 3.20128536e-08, 9.99756262e-01, ...,
        3.09836051e-06, 4.10161298e-05, 6.87746522e-07],
       [7.07087212e-06, 6.31396636e-07, 1.82971285e-04, ...,
        3.67625211e-07, 1.27752594e-06, 2.37155433e-07]])

In [129]:
pred = np.argmax(y_pred, axis=1)
pred = list(pred)

In [133]:
import pandas as pd

idx = [i+1 for i in range(len(pred))]

df =pd.DataFrame({'ImageId': idx,
     'Label': pred})
df.to_csv('submission.csv', index=False)