In [1]:
import numpy as np
import pandas as pd
from tensorflow.keras.datasets import mnist
import matplotlib.pyplot as plt
from tensorflow.keras.utils import to_categorical
from sklearn.decomposition import PCA

  from ._conv import register_converters as _register_converters


We'll be using a library called Cupy. It is similar to Numpy with the added advantage of doing its computations on the GPU. 

In [2]:
def Load_data():
    # Load data and use Min-Max scaling
    (train,y_train),(test,y_test) = mnist.load_data()
    train = train.reshape(60000,28*28)/255
    test = test.reshape(10000,28*28)/255
    return (train,test),(y_train,y_test)

In [3]:
def Softmax(data,params):
    ans = np.exp(np.matmul(data,params[0])+params[1])
    ans = (ans/(np.sum(ans,axis=1)).reshape(data.shape[0],1))
    return ans

In [4]:
def Neg_log_loss(labels,softmax,theta,regularisation,lamb):
    ans = -np.mean(labels*np.log(softmax))
    if regularisation:
        ans = ans - lamb/(2*softmax.shape[0])*(np.linalg.norm(theta))
    return np.array(ans)

In [5]:
def fdash_thet0(sftmax,labels):
    '''derivative of theta_0 is calculated as summation of labels-Softmax'''
    ans = np.sum(sftmax-labels,axis=0)
    return np.array(ans)/sftmax.shape[0]

In [6]:
def fdash_thet(data,labels,sftmx,theta,regularisation,lamb):
    '''derivative of theta is calculated as (Updated_labels.T x Data).T
       Updated labels = -(labels-Softmax)
       This negative sign is due to Negative Log Loss'''
    ans = np.matmul(np.transpose(data),sftmx-labels)
    if regularisation:
        ans = np.matmul(np.transpose(data),sftmx-labels) - lamb/(2*data.shape[0])*np.sum(theta)
    return np.array(ans)/data.shape[0]

In [7]:
def gradient_descent(training_data,labels,thetas,epsilon,learning_rate,Type=None,epochs=0,batch_size=0,regularisation=None,lamb=0):
    '''
    This function will use Batch Gradient Descent and Mini-Batch Gradient Descent 
    for optimisation depending upon the choice of user.
    
    Parameters
    ----------
    training_data: array-like
        input of shape (m,n) where m represents the number of training samples.
    
    labels: array-like
        input of shape (m,1)
        should not be one hot encoded
    
    thetas: list of arrays [thetas,theta_0]
        they should be pre-initialised
        thetas should be of the shape (n,k) where k is the number of unique labels
        theta_0 should be of the shape (1,k)
    
    epsilon: float
        error tolerance
    
    learning_rate: float
    
    Type: str input, optional
        use 'mini' for Mini-Batch Gradient Descent
        use None for Batch Gradient Descent
    
    epochs: int, optional
        should be used if user has mentioned the type of gradient descent to be used
        
    batch_size: int, optional
        size of the mini batch
        
    regularisation: bool, optional
        Default None
        set to True for l2 regularisation
        
    lamb: int, optional
        a positive value which will be used as regularisation parameter
        
        
    returns a tuple containing thet_f, thet0_f and neg_loss_history
    '''
    assert thetas[0].shape==(training_data.shape[1],len(np.unique(labels)))
    assert thetas[1].shape==(1,len(np.unique(labels)))
    assert epochs>=0
    assert learning_rate>=0
    assert epsilon>=0
    assert lamb>=0
    assert batch_size>=0 and batch_size<training_data.shape[0]
    
    
    thet_i = thetas[0]
    thet0_i = thetas[1]
    
    iterations=[]
    neg_log_history=[]
    deriv=[]
    iden=np.identity(len(np.unique(labels)))
    classes_train = labels
        
    if Type==None:
        training_data_1 = training_data
        classes_tr = np.array(iden[classes_train])
        i=0
        while True:
            sftmx_ini = Softmax(training_data,[thet_i,thet0_i])

            thet0_f = thet0_i - learning_rate*(fdash_thet0(sftmx_ini,classes_tr))
            thet_f = thet_i - learning_rate*(fdash_thet(training_data_1,classes_tr,sftmx_ini,theta=thet_i,regularisation=regularisation,lamb=lamb))

            sftmx_fin = Softmax(training_data_1,[thet_f,thet0_f])


            neg_loss = abs(Neg_log_loss(classes_tr,sftmx_fin,thet_f,regularisation,lamb) - Neg_log_loss(classes_tr,sftmx_ini,thet_i,regularisation,lamb))
            neg_log_history.append(abs(Neg_log_loss(classes_tr,sftmx_fin,thet_f,regularisation,lamb)))
            iterations.append(i)
            deriv.append(thet_f[:,0])
            if i%500==0:
                print('neg log loss at iteration {} is {}'.format(i,neg_log_history[-1]))

            i+=1

            if (neg_loss)<epsilon:
                print('neg log loss final at iteration {} is {}'.format(i,neg_log_history[-1]))
                return thet_f,thet0_f,neg_log_history,deriv
            else:
                thet0_i = thet0_f
                thet_i = thet_f

                
    elif Type=='mini':
        
        batches = training_data.shape[0]//batch_size
        
        for e in range(epochs):
            for batch in range(batches):
                training_data_1 = np.array(training_data[batch*batch_size:(batch+1)*batch_size])
                classes_tr = np.array(iden[classes_train[batch*batch_size:(batch+1)*batch_size]])
                
                sftmx_ini = Softmax(training_data_1,[thet_i,thet0_i])

                thet0_f = thet0_i - learning_rate*(fdash_thet0(sftmx_ini,classes_tr))
                thet_f = thet_i - learning_rate*(fdash_thet(training_data_1,classes_tr,sftmx_ini,thet_i,regularisation,lamb))

                sftmx_fin = Softmax(training_data_1,[thet_f,thet0_f])
                

                neg_loss = abs(Neg_log_loss(classes_tr,sftmx_fin,thet_f,regularisation,lamb) - Neg_log_loss(classes_tr,sftmx_ini,thet_i,regularisation,lamb))
                neg_log_history.append(abs(Neg_log_loss(classes_tr,sftmx_fin,thet_f,regularisation,lamb)))
                iterations.append((e,batch))

                thet0_i = thet0_f
                thet_i = thet_f
            if e%10==0:
                print('neg log loss after epoch {} is {}'.format(e,neg_log_history[-1]))
            deriv.append(thet_f[:,0])

        print('neg log loss after epoch {} is {}'.format(e,neg_log_history[-1]))
    return thet_f,thet0_f,neg_log_history,deriv

In [8]:
# load data
(train,test),(classes_train,classes_test) = Load_data()

In [9]:
#PCA for dimensionality reduction. This will reduce the amount of calculations to be done and will
#improve the execution speed.
cov_matrix = pd.DataFrame(train).cov()
q,lam,qt = np.linalg.svd(cov_matrix)
trace = np.sum(lam)
f_vector=[]
s=0
for i in range(len(lam)):
    s+=lam[i]
    if s/trace>0.97:
        break
    else:
        f_vector.append(q[:,i])
f_vector = np.array(f_vector)
f_vector.shape

(213, 784)

In [10]:
training_data = np.matmul(np.array(train),np.transpose(f_vector))
training_data.shape

(60000, 213)

In [11]:
#Run the training process
thet0_i = np.zeros(shape=(1,len(np.unique(classes_train))))
thet_i = np.zeros(shape=(training_data.shape[1],len(np.unique(classes_train))))

tf,t0f,nlh,derivs = gradient_descent(training_data=training_data,labels=classes_train,thetas=[thet_i,thet0_i],
                                     epsilon=1e-6,learning_rate=0.001,Type='mini',epochs=100,batch_size=128,
                                     regularisation=True,lamb=0.5)

neg log loss after epoch 0 is 0.1777537280064086
neg log loss after epoch 10 is 0.043883561689200025
neg log loss after epoch 20 is 0.02455075174261439
neg log loss after epoch 30 is 0.016727286195589004
neg log loss after epoch 40 is 0.012331537599606228
neg log loss after epoch 50 is 0.009437511998576582
neg log loss after epoch 60 is 0.007345367307110983
neg log loss after epoch 70 is 0.00573663757703362
neg log loss after epoch 80 is 0.004444410101457651
neg log loss after epoch 90 is 0.003372239855612042
neg log loss after epoch 99 is 0.002545480956634273


In [12]:
#training accuracy
ans=[]
sft=Softmax(training_data,[tf,t0f])
for i in range(training_data.shape[0]):
    ans.append(np.argmax(sft[i]))
np.count_nonzero(ans==classes_train)/training_data.shape[0]

0.8931166666666667

In [13]:
#testing
iden = np.identity(len(np.unique(classes_test)))
classes_te = np.array(iden[classes_test])
test = np.matmul(test,np.transpose(f_vector))

In [14]:
#testing accuracy
test_sft=Softmax(test,[tf,t0f])
ans_test = []
for i in range(test.shape[0]):
    ans_test.append(np.argmax(test_sft[i]))
np.count_nonzero(ans_test==classes_test)/test.shape[0]

0.8999

The accuracy can be improved by tuning our hyperparameters