In [84]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# Load the data
train_data = pd.read_csv('/kaggle/input/digit-recognizer/train.csv')
test_data = pd.read_csv('/kaggle/input/digit-recognizer/test.csv')

# train.csv stores the target in its `label` column; every remaining column is
# a pixel feature. Selecting by name (rather than by position) keeps this
# correct even if the column order ever changes.
X_train = train_data.drop(columns=['label'])  # Removing Labels
Y_train = train_data['label']  # Extracting Y labels

# The competition's test.csv has no `label` column, so slicing off the first
# column (as one would for the training frame) silently discards a real pixel
# feature and leaves 783 inputs for a network whose first layer expects 784.
# Drop `label` only if it happens to be present.
X_test = test_data.drop(columns=['label'], errors='ignore')

# Split the data into training and validation sets
# (fixed random_state so the 80/20 partition is the same on every run).
X_train, X_val, Y_train, Y_val = train_test_split(
    X_train,
    Y_train,
    test_size=0.2,
    random_state=42,
    shuffle=True,
)

if __name__ == "__main__":
    print("Train Size:", X_train.shape)
    print("Validation Size:", X_val.shape)


Train Size: (33600, 784)
Validation Size: (8400, 784)


In [99]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

def ReLU(Z):
    """Rectified linear unit: element-wise maximum of zero and ``Z``."""
    floor = 0
    return np.maximum(floor, Z)

def SoftMax(Z):
    """Softmax taken along axis 0, so every column of the result sums to 1.

    The per-column maximum is subtracted before exponentiating. This leaves the
    result mathematically unchanged but keeps ``np.exp`` from overflowing on
    large scores.
    """
    column_peak = np.max(Z, axis=0, keepdims=True)
    exponentials = np.exp(Z - column_peak)
    column_total = np.sum(exponentials, axis=0, keepdims=True)
    return exponentials / column_total

def one_hot(Y, num_classes=None):
    """One-hot encode integer class labels, one column per sample.

    Parameters
    ----------
    Y : array-like of int
        Class labels. Anything ``np.asarray`` accepts (ndarray, list, pandas
        Series); it is flattened, so labels are used positionally.
    num_classes : int, optional
        Number of rows in the result. When omitted it is inferred as
        ``max(Y) + 1``, which is the original behaviour. Pass it explicitly
        when the batch may not contain the highest class, otherwise the
        encoding comes out narrower than the model's output layer.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(num_classes, n_samples)`` with a single 1 in
        each column, at the row given by that sample's label.

    Raises
    ------
    ValueError
        If ``Y`` is empty and ``num_classes`` is not given, or if any label
        lies outside ``[0, num_classes)``.
    """
    labels = np.asarray(Y).ravel()

    if num_classes is None:
        if labels.size == 0:
            raise ValueError("num_classes is required when Y is empty")
        num_classes = int(labels.max()) + 1

    encoded = np.zeros((labels.size, num_classes))
    if labels.size:
        # Negative labels would otherwise wrap around silently through NumPy's
        # negative indexing and mark the wrong class.
        if labels.min() < 0 or labels.max() >= num_classes:
            raise ValueError(
                f"labels must lie in [0, {num_classes}); "
                f"got range [{labels.min()}, {labels.max()}]"
            )
        encoded[np.arange(labels.size), labels] = 1

    # Built as (samples, classes); callers expect (classes, samples).
    return encoded.T

def ReLU_deriv(Z):
    """Derivative of ReLU as a mask: true where ``Z`` is strictly positive."""
    is_positive = Z > 0
    return is_positive
    
class MNIST_model():
    """Two-layer fully connected classifier trained with full-batch Adam.

    Architecture: ``X -> W1, b1 -> ReLU -> W2, b2 -> SoftMax``.

    Data layout: samples are columns. ``X`` is ``(n_inputs, n_samples)`` and
    the network output ``A2`` is ``(n_outputs, n_samples)``.

    The forward pass caches ``Z1``, ``A1``, ``Z2`` and ``A2`` on the instance
    and ``backprop`` reads them, so ``backprop`` must follow a ``forward`` call
    on the same ``X`` with no other forward pass in between.
    """

    def __init__(self, n_inputs=784, n_hidden=224, n_outputs=10, seed=None):
        """Create the parameters and zeroed Adam state.

        Parameters
        ----------
        n_inputs, n_hidden, n_outputs : int
            Layer widths. The defaults reproduce the original hard-coded
            784-224-10 network.
        seed : int, optional
            When given, weights are drawn from a private
            ``np.random.RandomState(seed)``. When omitted, NumPy's global
            generator is used (the original behaviour), so a preceding
            ``np.random.seed(...)`` still controls initialisation.
        """
        # `np.random` (the module) and a RandomState both expose `.rand`.
        rng = np.random if seed is None else np.random.RandomState(seed)

        # Initializing weights: uniform in [-0.5, 0.5)
        self.W1 = rng.rand(n_hidden, n_inputs) - 0.5
        self.b1 = rng.rand(n_hidden, 1) - 0.5
        self.W2 = rng.rand(n_outputs, n_hidden) - 0.5
        self.b2 = rng.rand(n_outputs, 1) - 0.5

        # Adam optimizer parameters: first (m*) and second (v*) moment estimates
        self.mW1, self.vW1 = np.zeros_like(self.W1), np.zeros_like(self.W1)
        self.mb1, self.vb1 = np.zeros_like(self.b1), np.zeros_like(self.b1)
        self.mW2, self.vW2 = np.zeros_like(self.W2), np.zeros_like(self.W2)
        self.mb2, self.vb2 = np.zeros_like(self.b2), np.zeros_like(self.b2)
        self.epsilon = 1e-8

        # 1-based index of the next Adam step. It lives with the moment
        # estimates so that bias correction stays right across several
        # train() calls on the same model.
        self.iteration = 1

    def forward(self, X):
        """Run the network on ``X`` (features x samples) and return ``A2``.

        Side effect: overwrites the cached ``Z1``, ``A1``, ``Z2``, ``A2``.
        """
        self.Z1 = self.W1.dot(X) + self.b1
        self.A1 = ReLU(self.Z1)
        self.Z2 = self.W2.dot(self.A1) + self.b2
        self.A2 = SoftMax(self.Z2)
        return self.A2

    def backprop(self, Y, X):
        """Compute gradients for the most recent ``forward(X)`` call.

        ``A2 - one_hot(Y)`` is used as the output-layer error, the gradient of
        softmax followed by cross-entropy. Results are stored on the instance
        as ``dW1``, ``db1``, ``dW2``, ``db2``, averaged over the samples.
        """
        m = Y.size
        self.one_hot_Y = one_hot(Y)

        # one_hot() sizes its result from the largest label present. If this
        # batch lacks the highest class, pad with all-zero rows so the
        # subtraction below lines up with the output layer.
        missing_rows = self.A2.shape[0] - self.one_hot_Y.shape[0]
        if missing_rows > 0:
            padding = np.zeros((missing_rows, self.one_hot_Y.shape[1]))
            self.one_hot_Y = np.vstack([self.one_hot_Y, padding])

        self.dZ2 = self.A2 - self.one_hot_Y
        self.dW2 = 1 / m * self.dZ2.dot(self.A1.T)
        self.db2 = 1 / m * np.sum(self.dZ2, axis=1, keepdims=True)
        self.dZ1 = self.W2.T.dot(self.dZ2) * ReLU_deriv(self.Z1)
        self.dW1 = 1 / m * self.dZ1.dot(X.T)
        self.db1 = 1 / m * np.sum(self.dZ1, axis=1, keepdims=True)

    def get_predictions(self, A2):
        """Return the index of the largest entry in each column of ``A2``."""
        return np.argmax(A2, axis=0)

    def learning_rate_decay(self, lr, t, decay_rate):
        """Store the inverse-time-decayed rate ``lr / (1 + t * decay_rate)`` on ``self.lr``.

        Note: ``train`` does not read ``self.lr``; it uses the ``lr`` argument
        it is given.
        """
        self.lr = lr * (1 / (1 + t * decay_rate))

    def get_accuracy(self, predictions, Y):
        """Fraction of positions where ``predictions`` equals ``Y``.

        Both inputs are converted to plain arrays first, so the comparison is
        strictly positional even when either one is a pandas Series with a
        shuffled index.
        """
        predicted = np.asarray(predictions)
        actual = np.asarray(Y)
        return np.sum(predicted == actual) / actual.size

    def adam_optimizer(self, param, grad, m, v, beta1, beta2, lr, t):
        """Apply one bias-corrected Adam step.

        ``param`` is updated in place and also returned, together with the new
        first and second moment estimates. ``t`` is the 1-based step number;
        ``t == 0`` would divide by zero in the bias correction.
        """
        m = beta1 * m + (1 - beta1) * grad
        v = beta2 * v + (1 - beta2) * (grad ** 2)
        m_hat = m / (1 - beta1 ** t)
        v_hat = v / (1 - beta2 ** t)
        param -= lr * m_hat / (np.sqrt(v_hat) + self.epsilon)
        return param, m, v

    def update_param(self, lr, beta1=0.9, beta2=0.999):
        """Adam-update all four parameter arrays from the gradients stored by ``backprop``."""
        self.W1, self.mW1, self.vW1 = self.adam_optimizer(self.W1, self.dW1, self.mW1, self.vW1, beta1, beta2, lr, self.iteration)
        self.b1, self.mb1, self.vb1 = self.adam_optimizer(self.b1, self.db1, self.mb1, self.vb1, beta1, beta2, lr, self.iteration)
        self.W2, self.mW2, self.vW2 = self.adam_optimizer(self.W2, self.dW2, self.mW2, self.vW2, beta1, beta2, lr, self.iteration)
        self.b2, self.mb2, self.vb2 = self.adam_optimizer(self.b2, self.db2, self.mb2, self.vb2, beta1, beta2, lr, self.iteration)

    def train(self, iterations, X, Y, lr, X_val=None, Y_val=None):
        """Run ``iterations`` full-batch Adam steps, reporting every 10th step.

        Parameters
        ----------
        iterations : int
            Number of update steps to perform.
        X : array-like, shape (n_inputs, n_samples)
            Training inputs, samples as columns.
        Y : array-like of int, length n_samples
            Training labels.
        lr : float
            Adam step size.
        X_val, Y_val : optional
            Held-out inputs (same layout as ``X``) and labels used for the
            periodic accuracy report. They must be given together.

            When both are omitted the method falls back to module-level
            ``X_val`` (samples as rows; it is transposed here) and ``Y_val``
            if they exist. That is what the original implementation read
            implicitly. If they do not exist, the held-out report is skipped
            rather than raising ``NameError``.

        The Adam step counter is not reset here, so calling ``train`` again
        continues from the existing moment estimates with the correct bias
        correction.
        """
        if (X_val is None) != (Y_val is None):
            raise ValueError("X_val and Y_val must be supplied together")

        if X_val is None:
            # Backward-compatible fallback to the notebook-level globals.
            legacy_X = globals().get("X_val")
            legacy_Y = globals().get("Y_val")
            if legacy_X is not None and legacy_Y is not None:
                X_val, Y_val = legacy_X.T, legacy_Y

        has_validation = X_val is not None and Y_val is not None

        for i in range(iterations):
            A2 = self.forward(X)
            self.backprop(Y, X)
            self.update_param(lr)

            if i % 10 == 0:
                # A2 predates this step's update, so the reported training
                # accuracy is for the parameters before the update.
                self.train_predictions = self.get_predictions(A2)
                print(f'Iteration: {i} \nTrain Accuracy: {int(self.get_accuracy(self.train_predictions, Y) * 100)}%')
                if has_validation:
                    self.check_accuracy(X_val, Y_val)
            self.iteration += 1

    def check_accuracy(self, X, Y):
        """Print the accuracy on ``X``/``Y`` as a whole-number percentage.

        Runs a forward pass, so the cached activations are overwritten.
        """
        A2 = self.forward(X)
        test_predictions = self.get_predictions(A2)
        print(f'Test Accuracy: {int(self.get_accuracy(test_predictions, Y) * 100)}%\n')

    def make_predictions(self, X):
        """Return the predicted class index for every column of ``X``."""
        A2 = self.forward(X)
        predictions = self.get_predictions(A2)
        return predictions

    def test_prediction(self, index, X_train, Y_train):
        """Predict sample ``index``, print prediction and label, and show the image.

        ``X_train`` holds samples as columns and ``index`` is positional in
        both ``X_train`` and ``Y_train``. Inputs are converted to arrays so a
        pandas Series or DataFrame with a shuffled index is still read by
        position, not by index label. The image is reshaped to 28x28, so this
        helper only applies to 784-feature inputs.
        """
        features = np.asarray(X_train)
        labels = np.asarray(Y_train)

        current_image = features[:, index, None]
        prediction = self.make_predictions(current_image)
        label = labels[index]
        print("Prediction: ", prediction)
        print("Label: ", label)

        current_image = current_image.reshape((28, 28)) * 255
        plt.gray()
        plt.imshow(current_image, interpolation='nearest')
        plt.show()


In [101]:
# Seed NumPy's global generator before building the model. MNIST_model() draws
# its initial weights from np.random, so without a seed every run starts from
# different parameters and the accuracy trace cannot be reproduced.
np.random.seed(42)

# 100 full-batch steps at a step size of 0.01; inputs are passed with samples
# as columns, hence the transpose.
model = MNIST_model()
model.train(100,X_train.T,Y_train,0.01)

Iteration: 0 
Train Accuracy: 11%
Test Accuracy: 13%

Iteration: 10 
Train Accuracy: 80%
Test Accuracy: 82%

Iteration: 20 
Train Accuracy: 87%
Test Accuracy: 86%

Iteration: 30 
Train Accuracy: 89%
Test Accuracy: 88%

Iteration: 40 
Train Accuracy: 90%
Test Accuracy: 89%

Iteration: 50 
Train Accuracy: 91%
Test Accuracy: 90%

Iteration: 60 
Train Accuracy: 92%
Test Accuracy: 90%

Iteration: 70 
Train Accuracy: 92%
Test Accuracy: 91%

Iteration: 80 
Train Accuracy: 93%
Test Accuracy: 91%

Iteration: 90 
Train Accuracy: 94%
Test Accuracy: 91%

