In [1]:
def convert(imgf, labelf, outf, n):#Function to Convert MNIST image and label files into a CSV format.
    """Parameters:

        imgf: Path to the image file (e.g., train-images.idx3-ubyte).
        
        labelf: Path to the label file (e.g., train-labels.idx1-ubyte).
        
        outf: Output CSV file path (e.g., mnist_train.csv).
        
        n: Number of images to convert."""
    f = open(imgf, "rb")
    o = open(outf, "w")
    l = open(labelf, "rb")
    #Open the image file, label file and output CSV file in binary read mode ("rb"), and write mode ("w"), respectively.

    #Skip the header information in the image and label files. MNIST image files have a 16-byte header, and label files have an 8-byte header.
    f.read(16)
    l.read(8)
    images = []# Initialize an empty list to store the converted images.

    """Loop through each image:

        Read the label for the current image and convert it to an integer using ord().
        
        Read each pixel of the image (28x28 = 784 pixels) and append it to the image list.
        
        Add the complete image to the images list."""
    for i in range(n):
        image = [ord(l.read(1))]#reads 1 byte, converts it into int and initializes image list with label as 1st ele
        for j in range(28*28):
            image.append(ord(f.read(1)))
        images.append(image)

    for image in images:# Write each image (including its label) to the output CSV file, with pixels converted to strings separated by commas.
        o.write(",".join(str(pix) for pix in image)+"\n")
    #Close all the opened files.
    f.close()
    o.close()
    l.close()

#Convert the training and testing datasets to CSV format.
convert("train-images.idx3-ubyte", "train-labels.idx1-ubyte",
        "mnist_train.csv", 60000)
convert("t10k-images.idx3-ubyte", "t10k-labels.idx1-ubyte",
        "mnist_test.csv", 10000)

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

 Loading Test Data

In [4]:
data_test=pd.read_csv("mnist_test.csv")#Loads the converted test dataset from the CSV file into a pandas DataFrame named data_test.

In [5]:
data_test.head()#1st col:labels(2,1,0,4,1,.);remaining:pixel values

Unnamed: 0,7,0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,...,0.658,0.659,0.660,0.661,0.662,0.663,0.664,0.665,0.666,0.667
0,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Loading train data

In [7]:
data_train=pd.read_csv("mnist_train.csv")#Loads the converted training dataset from the CSV file into a pandas DataFrame named data_train.
data_train.head()

Unnamed: 0,5,0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,...,0.608,0.609,0.610,0.611,0.612,0.613,0.614,0.615,0.616,0.617
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,9,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
m,n=data_train.shape
print(m,n)#checking for no. of training images and each image's data

59999 785


Transpose the arrays so that each column represents an image, which is useful for matrix operations in neural networks.

In [10]:
data_train_array=np.array(data_train)#Converts the data_train DataFrame into a numpy array named data_train_array.
data_train_array=data_train_array.T#Transposes the data_train_array. This operation swaps the rows with columns.
data_train_array.shape


(785, 59999)

In [11]:
data_test_array=np.array(data_test)
data_test_array=data_test_array.T
data_test_array.shape

(785, 9999)

In [12]:
y_train=data_train_array[0]# Extracts the first row of data_train_array and assigns it to y_train.
x_train=data_train_array[1:n]#Extracts rows from the second row to the n-1 row of data_train_array and assigns them to x_train
x_train=x_train/255 #Normalizes the pixel values in x_train by dividing them by 255.
y_test=data_test_array[0]#Extracts the first row of data_test_array and assigns it to y_test
x_test=data_test_array[1:n]#Extracts rows from the second row to the n-1 row of data_test_array and assigns them to x_test.


In [13]:
x_train[:,0].shape#all rows and 1st col

(784,)

In [14]:
x_train.shape

(784, 59999)

In [15]:
y_train.size

59999

In [16]:
y_train.max(0)#max label value in the array of no.s 0-9 is 9


9

### Neural Network

In [18]:
def init_parameters():# Defines a function to initialize the parameters of the neural network.
    w1=np.random.rand(10, 784)-0.5
    #Initializes the weights for the first layer (w1) as a matrix of random values between -0.5 and 0.5. The shape is (10, 784), meaning there are 10 output neurons and 784 input neurons (28x28 images).
    b1=np.random.rand(10, 1)-0.5
    #Initializes the biases for the first layer (b1) as a column vector of random values between -0.5 and 0.5.
    w2=np.random.rand(10, 10)-0.5
    #Initializes the weights for the second layer (w2) as a matrix of random values between -0.5 and 0.5.
    b2=np.random.rand(10, 1)-0.5
    #Initializes the biases for the second layer (b2) as a column vector of random values between -0.5 and 0.5.
    return w1,w2,b1,b2

def ReLU(x):#Defines a function named ReLU that applies the ReLU (Rectified Linear Unit) activation to the input x.
    return np.maximum(x,0)#This means that any input value less than 0 is set to 0, and any input value greater than or equal to 0 remains unchanged.

def softmax(x):#Defines a function named softmax that applies the softmax activation to the input x.
   # Used in the output layer for multi-class classification problems to ensure outputs sum up to 1, representing probabilities.
    exp_x = np.exp(x - np.max(x)) #Computes the exponential of each element in x.
    return exp_x / np.sum(exp_x)#Returns the softmax output, which is the exponential of x divided by the sum of exponentials of all elements in x.

def forward_prop(w1,w2,b1,b2,x):#Defines a function named forward_prop that performs the forward propagation through the neural network.
    #Computes the output of the network for a given input by passing it through each layer.
    z1 = np.dot(w1, x) + b1#Computes the weighted sum (z1) for the first layer by multiplying the weights (w1) with the input (x) and adding the bias (b1).
    A1 = ReLU(z1)#Applies the ReLU activation function to z1 to get the output of the first layer (A1).
    
    z2 = np.dot(w2, A1) + b2#Computes the weighted sum (z2) for the second layer by multiplying the weights (w2) with the output of the first layer (A1) and adding the bias (b2).
    A2 = softmax(z2)# Applies the softmax activation function to z2 to get the final output (A2) of the network.
   
    return z1, z2, A1, A2


def one_hot_encode(y):#Converts categorical labels into a numerical representation that can be processed by the network.
    one_hot_y=np.zeros((y.size,(y.max()+1)))#Initializes a matrix filled with zeros for one-hot encoding.
   
    i=np.arange(y.size)# Sets the appropriate column of each row to 1 based on the label value.
    one_hot_y[i,y]=1#Sets the corresponding column to 1 for each label.
    one_hot_y=one_hot_y.T#Transposes the one-hot encoded matrix.
    return one_hot_y

def der_relu(z):#Defines the derivative of the ReLU activation function used in backpropagation.
   return z>0#Returns a boolean array indicating where z is greater than 0.
    
def back_prop(z1,z2,A1,A2,w2,x,y):#Computes the gradients of the loss with respect to each parameter in the network.
    m = y.size#Gets the number of examples.
    one_hot_y = one_hot_encode(y)#One-hot encodes the labels.
    dz2 = A2 - one_hot_y#Computes the error gradient for the output layer.
    
    dw2 = np.dot(dz2, A1.T) / m #Computes the gradient of the loss with respect to the weights of the second layer.
    db2 = np.sum(dz2) / m #Computes the gradient of the loss with respect to the bias of the second layer.
    dz1 = np.dot(w2.T, dz2) * der_relu(z1) #Computes the error gradient for the first layer.
    dw1 = np.dot(dz1, x.T) / m #Computes the gradient of the loss with respect to the weights of the first layer.
    db1 = np.sum(dz1) / m #Computes the gradient of the loss with respect to the bias of the first layer.
    return dw1,dw2,db1,db2

def update_param(w1,w2,b1,b2,dw1,dw2,db2,db1,a):#Defines a function to update the parameters using gradients and a learning rate.
    w1=w1-a*dw1#Updates the weights of the first layer.
    w2=w2-a*dw2#Updates the weights of the second layer.
    b1=b1-a*db1#Updates the bias of the first layer.
    b2=b2-a*db2#Updates the bias of the second layer.
    return w1,w2,b1,b2

In [19]:
def get_predict(A2):#Defines a function named get_predict that returns the predicted labels based on the output of the network (A2).
    return np.argmax(A2,0)#Returns the indices of the maximum values along the first axis (axis=0) of A2, which correspond to the predicted labels.

def accuracy(pred, y):#Defines a function named accuracy that calculates the accuracy of the model by comparing predictions (pred) with actual labels (y).
    print(pred,y)
    return np.sum(pred==y)/y.size#Returns the accuracy as the ratio of correctly predicted labels to the total number of labels.

def gradient_decend(x,y,epochs,lr):#Defines a function named gradient_decend that trains the neural network using gradient descent.
    w1,w2,b1,b2=init_parameters()#Initializes the weights and biases of the network using the init_parameters function.
    for i in range (epochs):# Loops through each training epoch.
        # An epoch is a term used in machine learning to describe the process of training a model on the entire training dataset once.
        z1,z2,A1,A2=forward_prop(w1,w2,b1,b2,x)#Performs forward propagation to compute the outputs of the network.
        dw1,dw2,db1,db2=back_prop(z1,z2,A1,A2,w2,x,y)#Calls the forward_prop function to get the intermediate results (z1, z2, A1, A2).
        w1,w2,b1,b2=update_param(w1,w2,b1,b2,dw1,dw2,db2,db1,lr)#Performs backward propagation to compute the gradients.
        if(i%50==0):#hecks if the current epoch is a multiple of 50.
            print("epochs=",i)
            print("accuracy=",accuracy(get_predict(A2),y))#Prints the accuracy of the model at the current epoch.
    return w1,w2,b1,b2
    

In [20]:
w1,w2,b1,b2=gradient_decend(x_train,y_train,2000,0.1)


epochs= 0
[8 8 2 ... 8 8 2] [0 4 1 ... 5 6 8]
accuracy= 0.10200170002833381
epochs= 50
[0 0 0 ... 0 0 0] [0 4 1 ... 5 6 8]
accuracy= 0.09870164502741713
epochs= 100
[0 0 0 ... 0 0 0] [0 4 1 ... 5 6 8]
accuracy= 0.09870164502741713
epochs= 150
[0 0 0 ... 0 0 0] [0 4 1 ... 5 6 8]
accuracy= 0.09870164502741713
epochs= 200
[0 0 0 ... 0 0 0] [0 4 1 ... 5 6 8]
accuracy= 0.09870164502741713
epochs= 250
[0 0 0 ... 0 0 0] [0 4 1 ... 5 6 8]
accuracy= 0.09870164502741713
epochs= 300
[0 0 0 ... 0 0 0] [0 4 1 ... 5 6 8]
accuracy= 0.09870164502741713
epochs= 350
[0 0 0 ... 0 0 0] [0 4 1 ... 5 6 8]
accuracy= 0.09870164502741713
epochs= 400
[0 0 0 ... 0 0 0] [0 4 1 ... 5 6 8]
accuracy= 0.09870164502741713
epochs= 450
[0 0 0 ... 0 0 0] [0 4 1 ... 5 6 8]
accuracy= 0.09870164502741713
epochs= 500
[0 0 0 ... 0 0 0] [0 4 1 ... 5 6 8]
accuracy= 0.09870164502741713
epochs= 550
[0 0 0 ... 0 0 0] [0 4 1 ... 5 6 8]
accuracy= 0.09870164502741713
epochs= 600
[0 0 0 ... 0 0 0] [0 4 1 ... 5 6 8]
accuracy= 0.0987016

  exp_x = np.exp(x - np.max(x)) #Computes the exponential of each element in x.


epochs= 1850
[0 0 0 ... 0 0 0] [0 4 1 ... 5 6 8]
accuracy= 0.0987183119718662
epochs= 1900
[0 0 0 ... 0 0 0] [0 4 1 ... 5 6 8]
accuracy= 0.0987183119718662
epochs= 1950
[0 0 0 ... 0 0 0] [0 4 1 ... 5 6 8]
accuracy= 0.0987183119718662


In [21]:
def pred(x,w1,w2,b1,b2):
    z1,z2,A1,A2=forward_prop(w1,w2,b1,b2,x)
    return np.argmax(A2,0)




In [22]:
predic=pred(x_test,w1,w2,b1,b2)
accuracy(predic,y_test)

[0 0 0 ... 0 0 0] [2 1 0 ... 4 5 6]


0.09800980098009801