In [42]:
import numpy as np
import pandas as pd

data = pd.read_csv('data/IRIS.csv')
data = data.sample(frac=1)
data.replace({'species':{list(data['species'].unique())[i]:i for i in range(0,3)}},inplace=True)
data_train = data.iloc[:75,:]
data_test = data.iloc[75:,:]

In [43]:
def one_hot_encoder(y):
    number_of_class = len(np.unique(y))
    return np.eye(number_of_class)[y]

In [44]:
X = (data_train.drop('species',axis=1).to_numpy()).T
Y = (one_hot_encoder(data_train['species'].to_numpy())).T
X_test = (data_test.drop('species',axis=1).to_numpy()).T
Y_test = (one_hot_encoder(data_test['species'].to_numpy())).T

***create_network(input_size,output_size,hidden_layers,layer_size)***\
\
*input_size* : number of features in dataset\
*output_size* : 1 in case of regression and K in case of k classes\
*hidden_layers* : number of hidden layers\
*layer_size* : size(number of neuron) in each layer\
\
aim is to generate random initial weight and bias for layers\
returns a dictionary *parameters*\
where $(W_{i})_{mxn}$ $m$ is size of the $i^{th}$ layer and $n$ is size of $(i-1)^{th}$ layer\
and $(b_{i})_{mx1}$ $m$ is size of the $i^{th}$ layer 

In [45]:
# creating random initial parameter ie. weights and bias for different layer
def create_network(input_size,output_size,hidden_layers,layer_size):
    parameters = {}
    np.random.seed(42)
    sizes = [input_size]+layer_size+[output_size]
    for i in range(1,hidden_layers+2):
        parameters['w'+str(i)] = np.random.randn(sizes[i],sizes[i-1])
        parameters['b'+str(i)] = np.random.randn(sizes[i],1)
        #print('creation b size',parameters['b'+str(i)].shape)
    return parameters

In [46]:
# just some activation function , its derevative and output fuction respectively
def sigmoid(x):
    return 1/(1+np.exp(-x))
def grad_sigmoid(x):
    return sigmoid(x)*(1-sigmoid(x))
def softmax(x):
    n = np.exp(x)
    d = np.sum(n,axis=0)
    return (n/d)

***feed_forward(X,hidden_layers,parameters)***\
\
*X* : data matrix\
*hidden_layers* : number of hidden layers\
*parameters* : dictioanry of weight and bias\
\
return dictionary A and H\
where $a_{i}$ in A is the pre-activation result of $i^{th}$ layer\
and $h_{i}$ in H is the activation result of $i^{th}$ layer\
\
also return $\hat y$ which is result matrix of neural network (just a vector in case of regression)\
in $\hat y$ each column correspond to one data point and each row is *probability* of that class

In [47]:
# to genrate end-result and the intermidiate a and h
def feed_forward(x,hidden_layers,parameters):
    A = {}
    H = {'h0':x}
    
    for i in range(1,hidden_layers+2):
        #print(i,parameters['w'+str(i)].shape, H['h'+str(i-1)].shape)
        A['a'+str(i)] = np.dot(parameters['w'+str(i)], H['h'+str(i-1)]) + parameters['b'+str(i)]
        #print(A['a1'].shape)
        if i != hidden_layers+1:
            H['h'+str(i)] = sigmoid(A['a'+str(i)])
        else:
            H['h'+str(i)] = softmax(A['a'+str(i)])
    y_hat = H['h'+str(hidden_layers+1)]
    
    return A,H,y_hat

***back_propogate(A,H,y,y_hat,parameters,hidden_layers,input_size)***\
\
*A , H , y , y_hat , parameters , hidden_layers , input_size* : All have usual meaning as above\
\
return a dictionary *gradients*\
where $dW_{i}$ is gradient of loss function with respect to weight of layer $i$\
similar for $db_{i},da_{i},dh_{i}$ are gradient of loss function w.r.t bias,pre-activation and activation of $i^{th}$ layer respectively

In [48]:
def back_propogate(A,H,y,y_hat,parameters,hidden_layers,input_size):
    gradients = {}
    A['a0'] = np.zeros((input_size,1))
    gradients['da'+str(hidden_layers+1)] = y_hat-y
    #print(gradients['da'+str(hidden_layers+1)].shape)
    for i in range(hidden_layers+1,0,-1):
        gradients["dw" + str(i)] = np.dot(gradients["da" + str(i)], (H["h" + str(i-1)]).T)
        gradients["db" + str(i)] = gradients["da" + str(i)]
        gradients["dh" + str(i-1)] = np.dot((parameters["w" + str(i)]).T, 
                                              gradients["da" + str(i)])
        derv = grad_sigmoid(A["a" + str(i-1)])
        gradients['da'+str(i-1)] = gradients['dh'+str(i-1)]*derv
        #print( gradients["db" + str(i)].shape)
    return gradients

below cell is the most basic implementation of gradient decent
* first we calculated the A , H and y_hat with feed_forward function
* then find gradient of loss function w.r.t weights an bias (as our network output only depend on weights and bias {we calculate da and dh as it required to find dW and db})
* then update the weights and bias in parameter ie. decent

In [49]:
# finding result with best/post-training parameters
def accuracy(decented_param):
    A,H,y_hat = feed_forward(X_test,3,decented_param)

    y_pred = np.argmax(y_hat,axis=0)
    y_true = data_test.replace(
        {'species':{list(data_test['species'].unique())[i]:i for i in range(0,3)}}
    ).to_numpy()[:,4]
    c = 0
    for i in range(len(y_true)):
        if y_pred[i] == y_true[i]:
            c+=1
    print(f'{c} correct out of {len(y_true)}')

In [None]:
p = create_network(4,3,3,[5,5,5])
for i in range(1,1000000,100000):
    for K in range(i):
        A,H,y_hat = feed_forward(X,3,p)
        #print('a1 shape',A['a1'].shape)
        gradients = back_propogate(A,H,Y,y_hat,p,3,4)
        for i in range(1,5,1):
            p['w'+str(i)] = p['w'+str(i)] - gradients['dw'+str(i)]
            p['b'+str(i)] = p['b'+str(i)] - gradients['db'+str(i)]
        #print(p['b'+str(i)].shape)
    accuracy(p)

23 correct out of 75
34 correct out of 75
34 correct out of 75
