In [2]:
import numpy as np
import joblib
import scipy.special
import sys
import matplotlib.pyplot as  plt
# from google.colab import drive
# drive.mount('/content/gdrive')

In [3]:
path = '/content/gdrive/My Drive/Colab Notebooks/'

In [4]:
class MLPClassifier:

    acti_fns = ['relu', 'sigmoid', 'tanh']
    weight_inits = ['random', 'he', 'xavier']
    optimizers = ['gradient_descent','momentum', 'nag', 'adam', 'adagrad', 'rmsprop']
    regularizations = ['l1', 'l2', 'layer_norm', 'batch_norm']
    
    def __init__(self, layers, num_epochs, dropouts, learning_rate = 1e-5, activation_function='relu', optimizer="gradient_descent",
weight_init="random", regularization="l2", batch_size=64, **kwargs):
        
        if (activation_function not in self.acti_fns):
            raise Exception("Incorrect Weight Activation Function")
        if (optimizer not in self.optimizers):
            raise Exception("Incorrect Optimizer")
        if (regularization not in self.regularizations):
            raise Exception("Incorrect Regularizers")
        if (weight_init not in self.weight_inits):
            raise Exception("Incorrect Weight Initialization")
        
        np.random.seed(20)
        
        self.weights = []
        self.weights_grad = []
        self.inputs = []
        for i in range(len(layers)-1):
            if weight_init == "random":
                w = np.random.random((layers[i]+1,layers[i+1])).astype(np.float128)*0.01
                #else : he xavier
            self.weights.append(w)
            self.weights_grad.append(np.zeros(w.shape,dtype=np.float128))
            

    
        if activation_function == 'relu':
            self.acti_fns = self.relu
            self.acti_fns_grad = self.relu_grad
        elif activation_function == 'sigmoid':
            self.acti_fns = self.sigmoid
            self.acti_fns_grad = self.sigmoid_grad
        elif activation_function == 'tanh':
            self.acti_fns = self.tanh
            self.acti_fns_grad = self.tanh_grad
    
    
    
        if optimizer == 'gradient_descent': 
            self.optimizer = self.gradient_descent
            self.gradient_descent_init(learning_rate)
        elif optimizer == 'momentum':
            self.optimizer = self.momentum
            self.momentum_init(learning_rate,beta= 0.9)
        elif optimizer == 'nag':
            self.optimizer = self.nag
            self.nag_init()
        elif optimizer == 'adam':
            self.optimizer = self.adam
            self.adam_init()
        elif optimizer == 'adagrad':
            self.optimizer = self.adagrad
            self.adagrad_init()
        elif optimizer =='rmsprop':
            self.optimizer = self.rmsprop
            self.rmsprop_init()
    
    
    
#         self.lr = learning_rate

        self.batch_size = batch_size
        self.epochs = num_epochs
        self.dropouts = dropouts
        self.regul = regularization
    
    def fit(self,X,Y):
        
        # make it loop over batches <---
        training_loss = []
        iterations = []
        accuracy = []
        for e in range(self.epochs):
            rows = np.random.choice(X.shape[0],size = self.batch_size,replace = False)
            x_batch = X[rows,:]
            y_batch = Y[rows]
            one_hot = np.eye(9 + 1)[y_batch] # D(s)
            
            probs = self.forward_pass(x_batch) # Y(s)           
            predicted_labels = np.argmax(probs,axis = 1)
            loss = self.loss(probs,one_hot)
            acc = self.accuracy(predicted_labels,y_batch)           

            print("---> Epoch: {}/{},Batch Size: {},Loss:{} ,Accuracy: {}".format(e+1,self.epochs,self.batch_size,loss.sum(),acc))

            training_loss.append(loss.sum())
            iterations.append(e+1)
            accuracy.append(acc)
            
            self.backward_pass(probs,one_hot)
            self.optimizer()
            
            
        self.iteration = iterations
        self.training_loss = training_loss
        self.training_accuracy = accuracy
            
    def forward_pass(self,X):
        self.inputs =[] # Yi(s)
        
        for i in range(len(self.weights)):

            X = np.append(X,np.ones((X.shape[0],1)),axis=1)
            self.inputs.append(X)
            X = np.matmul(X,self.weights[i])
            if (i < len(self.weights) - 1):
                X = self.acti_fns(X)
            else :
                X = self.softmax(X)
        self.inputs.append(X)
        return X
        
    def backward_pass(self,Y,D):
        self.zero_grad()
        dl_dy = self.loss_grad(Y,D)
        for l in range(len(self.inputs)-1,0,-1):

            if (l == len(self.inputs) -1 ):
                # grad = self.softmax_grad(self.inputs[l])
                dl_dz = Y - D
            else: 
                grad = self.acti_fns_grad(self.inputs[l])
                dl_dz = np.multiply(grad,dl_dy) 

            if (l == len(self.inputs) - 1):
              self.weights_grad[l-1] += np.matmul(self.inputs[l-1].T,dl_dz)
              dl_dy = np.matmul(dl_dz,self.weights[l-1].T)
            else :
              self.weights_grad[l-1] += np.matmul(self.inputs[l-1].T,dl_dz[:,:-1])
              dl_dy = np.matmul(dl_dz[:,:-1],self.weights[l-1].T)
            
    def gradient_descent(self):
        for i in range(len(self.weights)):
            self.weights[i] -= self.lr*self.weights_grad[i]

    def momentum(self):
        for i in range(len(self.weights)):
            self.m[i] = self.beta*self.m[i] - self.lr*self.weights_grad[i]
            self.weights[i] = self.weights[i]  + self.m[i]
    
    def nag(self):
        pass
    
    def adam(self):
        pass
    
    def adagrad(self):
        pass
    
    def rmsprop(self):
        pass
    
    def gradient_descent_init(self,learning_rate):
        """ initialize learing rate
        """
        self.lr = learning_rate
        pass
    
    def momentum_init(self,learning_rate,beta):
        self.beta = beta
        self.m = []
        for i in range(len(self.weights)):
            self.m.append(np.zeros(self.weights[i].shape))
        self.lr = learning_rate 
    
    def nag_init(self):
        pass
    
    def adam_init(self):
        pass
    
    def adagrad_init(self):
        pass
    
    def rmsprop_init(self):
        pass
        
        
    
    def predict(self,X):  
        probs = self.predict_proba(X)
        return np.argmax(probs,axis=1)
    
    def predict_proba(self,X):
        return self.forward_pass(X)
        
    def get_params(self):
        return self.weights
        
    def score(self,X,Y):
        pass 
        
    def zero_grad(self):
        for i in range(len(self.weights_grad)):
            self.weights_grad[i] = np.zeros(self.weights_grad[i].shape)

    def loss(self,Y,D):
        ''' Cross_Entropy Loss
            Y: probs (n_instances,M)
            D: Actual Labels (n_instances,M); M:number of neurons in the last layer
        '''
        noise = 1e-9
        return -1*np.mean(D*np.log(Y+noise))
    
    def loss_grad(self,Y,D):
        '''Y : probabilites (n_instances,M)
           D : Actual Labels(n_instances,M); categorical cross entropy (not used)
        '''
        N = Y + 1e-9
        N = -1/N
        return N
    
    
    def relu(self,X):
        r = X.copy()
        r[X<0] = 0
        return r

    def relu_grad(self,X):
        r = X.copy()
        r[X>=0] = 1
        r[X<0] = 0
        return r
    
    def tanh(self,X):
        return np.tanh(X)
    
    def tanh_grad(self,X):
        return 1 - self.tanh(X)**2
    
    def sigmoid(self,X):
        return 1/(1 + np.exp(-X))
    
    def sigmoid_grad(self,X):
        a = self.sigmoid(X)
        return a*(1 - a)
    
    def softmax(self,X):
        exp = (X - X.mean(axis= 1).reshape((X.shape[0],1)))
        # X = np.exp(X)/np.sum(np.exp(X),axis= 1)[:,None] 
        exp = scipy.special.softmax(exp ,axis=1)
        return exp
                
    def softmax_grad(self,X):
        ''' there is an other of derivative, however for gradient computation this term suffices'''
        S = self.softmax(X)
        out = S*(1- S)
        return out
    
    def accuracy(self,predicted_labels,true_labels):
        s = predicted_labels == true_labels
        return s.sum()/s.shape[0]
        
                  

In [9]:
n = MLPClassifier(layers = [784,128,10], dropouts = 0.2,num_epochs=500, learning_rate = 1e-1, activation_function='relu', optimizer="gradient_descent",
weight_init="random",batch_size=64)
n.fit(x_train,y_train)
plt.plot(n.iteration,n.training_loss)

---> Epoch: 1/500,Batch Size: 64,Loss:0.2301017779517305 ,Accuracy: 0.0625
---> Epoch: 2/500,Batch Size: 64,Loss:1.2272503069494434 ,Accuracy: 0.140625
---> Epoch: 3/500,Batch Size: 64,Loss:0.24498258349864976 ,Accuracy: 0.140625
---> Epoch: 4/500,Batch Size: 64,Loss:0.31673391490966546 ,Accuracy: 0.109375
---> Epoch: 5/500,Batch Size: 64,Loss:0.2776804690059273 ,Accuracy: 0.109375
---> Epoch: 6/500,Batch Size: 64,Loss:0.24649649506746182 ,Accuracy: 0.0625
---> Epoch: 7/500,Batch Size: 64,Loss:0.253835423108267 ,Accuracy: 0.03125
---> Epoch: 8/500,Batch Size: 64,Loss:0.23663220634418772 ,Accuracy: 0.09375
---> Epoch: 9/500,Batch Size: 64,Loss:0.23377823852092433 ,Accuracy: 0.140625
---> Epoch: 10/500,Batch Size: 64,Loss:0.23072455786801518 ,Accuracy: 0.15625
---> Epoch: 11/500,Batch Size: 64,Loss:0.2324358177545772 ,Accuracy: 0.046875
---> Epoch: 12/500,Batch Size: 64,Loss:0.23614141522552476 ,Accuracy: 0.109375
---> Epoch: 13/500,Batch Size: 64,Loss:0.22993263425022853 ,Accuracy: 0.14

KeyboardInterrupt: 

In [7]:
def upload_datasets():
    val_set = joblib.load('val_set.pkl')
    train_set = joblib.load('train_set.pkl')
    return train_set,val_set
def preprocess_data(dataset):
    x_temp = dataset['Image'].values
    x = []
    y = dataset['Labels'].values
#     y = np.array(y).reshape((y.shape[0],1))
    for i in range(0,len(x_temp)):
        x.append(np.array(x_temp[i]).reshape(784,)/255)
    x = np.array(x)
    return x,y
train_set, val_set = upload_datasets()
x_train, y_train = preprocess_data(train_set)
x_val , y_val = preprocess_data(val_set)

In [None]:
np.eye(y_val.max()+1)[y_val]

array([[0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       ...,
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.]])