In [None]:
import sklearn.datasets as dataset
import numpy as np
import pandas as pd
from keras.utils import np_utils
from sklearn.model_selection import train_test_split
import random
from sklearn.utils import shuffle
# Load the Iris data set and shape it for per-sample training:
# every sample becomes a (1, n_features) row vector and every label a
# (1, n_classes) one-hot row vector.
data = dataset.load_iris()
X = data.data
Y = data.target

# Derive the sizes from the data instead of hard-coding them.
n_samples, n_features = X.shape
n_classes = int(Y.max()) + 1

Y = Y.reshape(n_samples, 1, 1)
X = X.reshape(n_samples, 1, n_features)

# Scale the whole array by one scalar (the norm of all its entries);
# this is a global rescale, not a per-feature normalisation.
X = X / np.linalg.norm(X)

# NOTE(review): train_size=0.33 trains on roughly one third of the data and
# tests on the rest -- confirm this was not meant to be test_size.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, train_size=0.33, random_state=42
)

# Fix the one-hot width explicitly so the train and test encodings always
# have the same number of columns, even if a class is missing from a split.
y_train = np_utils.to_categorical(y_train, num_classes=n_classes)
y_test = np_utils.to_categorical(y_test, num_classes=n_classes)
print(y_test[0])

# Gradient descent used below

In [None]:
class Layer:
    """Fully connected layer updated with plain per-sample steps.

    ``weights`` has shape (input_size, output_size) and ``biases`` has shape
    (1, output_size); both start as uniform random values in [-0.5, 0.5).
    """

    def __init__(self, input_size, output_size):
        # Filled in by forward(); backward() reads self.input.
        self.input = None
        self.output = None
        weight_shape = (input_size, output_size)
        bias_shape = (1, output_size)
        # Weights are drawn before biases.
        self.weights = np.random.rand(*weight_shape) - 0.5
        self.biases = np.random.rand(*bias_shape) - 0.5

    def forward(self, input):
        """Remember ``input`` and return ``input . weights + biases``."""
        self.input = input
        projected = np.dot(input, self.weights)
        self.output = projected + self.biases
        return self.output

    def backward(self, o_error, alpha):
        """Apply one update step and return the error for the previous layer.

        ``o_error`` is the error signal arriving at this layer's output and
        ``alpha`` is the step size. The step is *added* to the parameters.
        """
        weight_signal = np.dot(self.input.T, o_error)
        # Computed with the weights as they were before this step.
        input_signal = np.dot(o_error, self.weights.T)
        weight_step = weight_signal * alpha
        bias_step = alpha * o_error
        self.weights = self.weights + weight_step
        self.biases = self.biases + bias_step
        return input_signal

In [None]:
class Activation_class:
    """Parameter-free layer that applies an element-wise activation.

    ``a_func`` is the activation and ``act_func_derivative`` its derivative;
    both are called on the value received by ``forward``.
    """

    def __init__(self, a_func, act_func_derivative):
        self.activation = a_func
        self.activation_derivative = act_func_derivative

    def forward(self, input):
        """Store ``input``, apply the activation to it and return the result."""
        self.input = input
        activated = self.activation(input)
        self.output = activated
        return activated

    def backward(self, y_error, alpha):
        """Scale the incoming error by the derivative at the stored input.

        ``alpha`` is accepted only so the signature matches the trainable
        layers; it is not used here.
        """
        local_slope = self.activation_derivative(self.input)
        return local_slope * y_error

In [None]:
class NN:
    """Minimal sequential network trained one sample at a time.

    A layer is any object exposing ``forward(input)`` and
    ``backward(error, alpha)``. Layers run in insertion order on the forward
    pass and in reverse order on the backward pass.
    """

    def __init__(self):
        self.layers=[]
        # Mean squared error between a prediction and its target.
        self.error= lambda y_pred,y_true: np.mean(np.power(y_true-y_pred,2))
        # Signal that starts the backward pass. It points along
        # (y_true - y_output), so layers add their update steps. The divisor
        # 75 is a fixed scale factor; together with alpha it sets the
        # effective step size.
        self.error_derivative= lambda y_true,y_output: 2*(y_true-y_output)/75

    def adding_layer(self,layers):
        """Append one layer to the end of the network."""
        self.layers.append(layers)

    def _forward(self,sample):
        """Run ``sample`` through every layer in order and return the output."""
        output = sample
        for layer in self.layers:
            output = layer.forward(output)
        return output

    def fit_model(self,x,y,itera,alpha):
        """Train the network with per-sample updates.

        Args:
            x: indexable collection of training inputs; ``x[j]`` is fed to
                the first layer.
            y: targets aligned with ``x``.
            itera: number of full passes over ``x``.
            alpha: step size handed to every layer's ``backward``.

        Returns:
            Mean per-sample error of the last pass (0.0 if ``itera`` is 0).

        Raises:
            ValueError: if ``x`` contains no samples.
        """
        # Use every sample that was passed in rather than a fixed count.
        n_samples = len(x)
        if n_samples == 0:
            raise ValueError("fit_model needs at least one training sample")
        net_error = 0.0
        for i in range(itera):
            net_error=0
            for j in range(n_samples):
                output = self._forward(x[j])
                net_error += self.error(output,y[j])
                error_derivative = self.error_derivative(y[j],output)
                for layer in reversed(self.layers):
                    error_derivative = layer.backward(error_derivative,alpha)
            net_error/=n_samples
            if(i%200==0):
                print("iteration: ", i,"error: ",net_error)
        return net_error

    def predict(self,input):
        """Return a list with the network output for each element of ``input``."""
        return [self._forward(sample) for sample in input]

    def accuracy(self,input,y_true=None):
        """Percentage of samples whose arg-max output matches the arg-max label.

        Args:
            input: samples to evaluate, one per element.
            y_true: labels aligned with ``input`` (for example one-hot rows).
                When omitted, the legacy behaviour is kept: the module-level
                ``x_test`` / ``y_test`` arrays are evaluated and ``input``
                only supplies the number of samples.

        Returns:
            Accuracy in percent; 0.0 when there are no samples.
        """
        if y_true is None:
            # Backward-compatible path for existing single-argument calls.
            features, labels = x_test, y_test
        else:
            features, labels = input, y_true
        samples = len(input)
        if samples == 0:
            return 0.0
        ac = 0
        for i in range(samples):
            y_pred = np.argmax(self._forward(features[i]))
            if(y_pred == np.argmax(labels[i])):
                ac+=1
        return (ac/samples)*100

In [None]:
def Activation_Relu(x):
    """Rectified linear unit: the element-wise maximum of 0 and ``x``."""
    return np.maximum(0, x)
def Activation_Relu_derivative(x):
    """ReLU slope: 1.0 where ``x`` is strictly positive, otherwise 0.0."""
    positive_mask = np.greater(x, 0.)
    return positive_mask.astype(np.float64)
      
def Activation_softmax_derivative(x):
    """Element-wise softmax slope ``s * (1 - s)`` evaluated at logits ``x``.

    ``Activation_class.backward`` calls the derivative on the layer's
    pre-activation input, so the softmax ``s`` is computed from ``x`` here
    before forming ``s * (1 - s)``. Only the diagonal of the softmax Jacobian
    is returned; cross-class terms are ignored.

    Args:
        x: logits; the softmax is taken over the last axis.

    Returns:
        Array with the same shape as ``x``.
    """
    logits = np.asarray(x, dtype=float)
    if logits.ndim == 0:
        # A single logit always has softmax 1, hence slope 0.
        return np.float64(0.0)
    # Shift by the row maximum so large logits do not overflow in exp().
    exp_values = np.exp(logits - np.max(logits, axis=-1, keepdims=True))
    s = exp_values / np.sum(exp_values, axis=-1, keepdims=True)
    return s * (1 - s)
def Activation_softmax(input):
    """Numerically stable softmax over the last axis.

    For a 1-D vector or a single-row (1, n) array this matches normalising
    over all entries. For an array with several rows, each row is normalised
    on its own, so every row sums to 1.

    Args:
        input: logits, array-like.

    Returns:
        Array of probabilities with the same shape as ``input``.
    """
    logits = np.asarray(input, dtype=float)
    if logits.ndim == 0:
        # A single logit normalised against itself is always 1.
        return np.float64(1.0)
    # Subtracting the maximum leaves the result unchanged mathematically but
    # keeps exp() from overflowing on large logits.
    shifted = logits - np.max(logits, axis=-1, keepdims=True)
    exp_values = np.exp(shifted)
    return exp_values / np.sum(exp_values, axis=-1, keepdims=True)
def sigmoid(input):
    """Logistic function ``1 / (1 + exp(-input))``, applied element-wise."""
    denominator = 1 + np.exp(-input)
    return 1 / denominator
def sigmoid_derivative(input):
    """Slope of the logistic function at ``input``: ``s * (1 - s)`` with ``s = sigmoid(input)``."""
    activation = sigmoid(input)
    return activation * (1 - activation)
 

In [None]:
# Plain gradient-descent network:
# 4 inputs -> 10 hidden units (ReLU) -> 3 outputs (sigmoid).
network= NN()
network.adding_layer(Layer(4,10))
network.adding_layer(Activation_class(Activation_Relu,Activation_Relu_derivative))
network.adding_layer(Layer(10,3))
network.adding_layer(Activation_class(sigmoid,sigmoid_derivative))
# Train for 30000 passes over the training split with step size 0.5.
network.fit_model(x_train,y_train,30000,0.5)
# Raw network outputs for every test sample.
p=network.predict(x_test)
# Percentage of test samples whose arg-max output matches the arg-max label.
q= network.accuracy(x_test)

In [None]:
# Test-set accuracy (in percent) of the plain gradient-descent network.
print(q)

**The accuracy is around 97 percent after 30000 iterations**

# **Other optimizers used**:

# **Momentum Gradient Descent**
Uses an additional momentum parameter to reach the minimum faster.

In [None]:
class Layer_2:
    """Fully connected layer updated with momentum.

    A velocity is kept for the weights and for the biases. On every step the
    velocity becomes ``momemtum * velocity + alpha * signal`` and is then
    added to the parameters.
    """

    def __init__(self, input_size, output_size):
        # Filled in by forward(); backward() reads self.input.
        self.input = None
        self.output = None
        # Uniform random start in [-0.5, 0.5); weights are drawn before biases.
        self.weights = np.random.rand(input_size, output_size) - 0.5
        self.biases = np.random.rand(1, output_size) - 0.5
        self.momemtum = 0.9
        # Velocities start at zero and take the parameter shapes after the first step.
        self.m_vector_w = 0
        self.m_vector_b = 0

    def forward(self, input):
        """Remember ``input`` and return ``input . weights + biases``."""
        self.input = input
        projected = np.dot(input, self.weights)
        self.output = projected + self.biases
        return self.output

    def backward(self, o_error, alpha):
        """Apply one momentum step and return the error for the previous layer."""
        weight_signal = np.dot(self.input.T, o_error)
        # Computed with the weights as they were before this step.
        input_signal = np.dot(o_error, self.weights.T)
        decay = self.momemtum
        self.m_vector_w = decay * self.m_vector_w + alpha * weight_signal
        self.m_vector_b = decay * self.m_vector_b + alpha * o_error
        self.weights = self.weights + self.m_vector_w
        self.biases = self.biases + self.m_vector_b
        return input_signal

In [None]:
# Momentum network: 4 inputs -> 5 hidden units (ReLU) -> 3 outputs (sigmoid).
network= NN()
network.adding_layer(Layer_2(4,5))
network.adding_layer(Activation_class(Activation_Relu,Activation_Relu_derivative))
network.adding_layer(Layer_2(5,3))
network.adding_layer(Activation_class(sigmoid,sigmoid_derivative))
# Train for 15000 passes over the training split with step size 0.5.
network.fit_model(x_train,y_train,15000,0.5)
# Raw network outputs for every test sample.
p=network.predict(x_test)
# Pass the test features (not the labels), matching accuracy()'s input argument.
q= network.accuracy(x_test)

In [None]:
# Test-set accuracy (in percent) of the momentum network.
print(q)

**With Momentum Gradient Descent, 97 percent accuracy is reached on the test data after just 15000 iterations, which is half of what was required with simple gradient descent**

# **Another implementation of the momentum optimizer**

In [None]:
class Layer_2_1:
    """Fully connected layer using the averaged form of momentum.

    The velocity is an exponential moving average of the update signal,
    ``momemtum * velocity + (1 - momemtum) * signal``, and the parameters
    move by ``alpha * velocity``.
    """

    def __init__(self, input_size, output_size):
        # Filled in by forward(); backward() reads self.input.
        self.input = None
        self.output = None
        # Uniform random start in [-0.5, 0.5); weights are drawn before biases.
        self.weights = np.random.rand(input_size, output_size) - 0.5
        self.biases = np.random.rand(1, output_size) - 0.5
        self.momemtum = 0.9
        # Moving averages start at zero.
        self.m_vector_w = 0
        self.m_vector_b = 0

    def forward(self, input):
        """Remember ``input`` and return ``input . weights + biases``."""
        self.input = input
        projected = np.dot(input, self.weights)
        self.output = projected + self.biases
        return self.output

    def backward(self, o_error, alpha):
        """Apply one averaged-momentum step and return the error for the previous layer."""
        weight_signal = np.dot(self.input.T, o_error)
        # Computed with the weights as they were before this step.
        input_signal = np.dot(o_error, self.weights.T)
        self.m_vector_w = self.momemtum * self.m_vector_w + (1 - self.momemtum) * weight_signal
        self.m_vector_b = self.momemtum * self.m_vector_b + (1 - self.momemtum) * o_error
        weight_step = alpha * self.m_vector_w
        bias_step = alpha * self.m_vector_b
        self.weights = self.weights + weight_step
        self.biases = self.biases + bias_step
        return input_signal

In [None]:
# Averaged-momentum network: 4 inputs -> 5 hidden units (ReLU) -> 3 outputs (sigmoid).
network= NN()
network.adding_layer(Layer_2_1(4,5))
network.adding_layer(Activation_class(Activation_Relu,Activation_Relu_derivative))
network.adding_layer(Layer_2_1(5,3))
network.adding_layer(Activation_class(sigmoid,sigmoid_derivative))
# Train for 15000 passes over the training split with step size 0.5.
network.fit_model(x_train,y_train,15000,0.5)
# Raw network outputs for every test sample.
p=network.predict(x_test)
# Pass the test features (not the labels), matching accuracy()'s input argument.
q= network.accuracy(x_test)

In [None]:
# q holds the test-set accuracy (in percent) computed in the previous cell.
print("Accuracy after 15000 iterations: ", q)

**The accuracy on test data is 97 percent after 15000 iterations**

# **RMS Propagation:** uses a different learning rate for different parameters.
 Divides the learning rate by the square root of an exponentially decaying average of squared gradients.

In [None]:
class Layer_3:
    """Fully connected layer updated with RMSProp.

    A running average of the squared update signal is kept per parameter and
    each step is divided by ``sqrt(average) + epsilon``, so every parameter
    gets its own effective step size.
    """

    def __init__(self,input_size,output_size):
        # Filled in by forward(); backward() reads self.input.
        self.input=None
        self.output=None
        # Uniform random start in [-0.5, 0.5); weights are drawn before biases.
        self.weights= np.random.rand(input_size,output_size)-0.5
        self.biases= np.random.rand(1,output_size)-0.5
        # Decay rate of the running average of squared signals (the attribute
        # name is kept so existing code reading it keeps working).
        self.momemtum = 0.9
        # Small positive constant that keeps the denominator away from zero.
        self.epsilon = 1e-8
        self.rms_prop_w, self.rms_prop_b = 0 , 0

    def forward(self,input):
        """Remember ``input`` and return ``input . weights + biases``."""
        self.input=input
        self.output= np.dot(self.input,self.weights)+self.biases
        return self.output

    def backward(self,o_error,alpha):
        """Apply one RMSProp step and return the error for the previous layer.

        Args:
            o_error: error signal arriving at this layer's output.
            alpha: base step size.

        Returns:
            Error signal for the previous layer, computed with the weights as
            they were before this step.
        """
        w_error= np.dot(self.input.T,o_error)
        x_error= np.dot(o_error,self.weights.T)
        decay = self.momemtum
        self.rms_prop_w = decay * self.rms_prop_w + (1-decay) * (w_error**2)
        self.rms_prop_b = decay * self.rms_prop_b + (1-decay) * (o_error**2)
        # epsilon is ADDED: subtracting it could make the denominator zero or
        # negative for tiny signals, which would flip the direction of the step.
        self.weights=self.weights + alpha*(w_error/(np.sqrt(self.rms_prop_w)+self.epsilon))
        self.biases=self.biases + alpha*(o_error/(np.sqrt(self.rms_prop_b)+self.epsilon))
        return x_error

In [None]:
# RMSProp network: 4 inputs -> 5 hidden units (ReLU) -> 3 outputs (sigmoid).
network= NN()
network.adding_layer(Layer_3(4,5))
network.adding_layer(Activation_class(Activation_Relu,Activation_Relu_derivative))
network.adding_layer(Layer_3(5,3))
network.adding_layer(Activation_class(sigmoid,sigmoid_derivative))
# Train for 5500 passes over the training split with step size 0.01.
network.fit_model(x_train,y_train,5500,0.01)
# Raw network outputs for every test sample.
p=network.predict(x_test)
# Pass the test features (not the labels), matching accuracy()'s input argument.
q= network.accuracy(x_test)

In [None]:
# Test-set accuracy (in percent) of the RMSProp network.
print(q)

**Test data accuracy: 97 percent after just 5500 iterations**

 