In this notebook, we will try to replicate what Tensorflow does. That is, we will try to create a simple Neural Network from scratch. We are going to create a model with 2 hidden layers.

In [1]:
import numpy as np
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

In [2]:
#load the data
data = pd.read_csv('D:/Datasets/Winsconsin/data.csv')

In [3]:
data.head()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,


In [4]:
data.drop(labels=['id',data.columns[-1]],axis=1,inplace=True)

In [5]:
data['diagnosis'].replace(to_replace=['M','B'],value=[1,0],inplace=True)

In [6]:
data.head()

Unnamed: 0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,1,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,1,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,1,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,1,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [7]:
#test,train splits
data_copy = data.copy()
data_copy = data_copy.sample(frac=1)

train_len = int(0.8*data.shape[0])

train,test = np.array(data_copy.iloc[0:train_len,1:]),np.array(data_copy.iloc[train_len:,1:])
c_train,c_test = np.array(data_copy.iloc[0:train_len,0]),np.array(data_copy.iloc[train_len:,0])

In [8]:
sc = StandardScaler()
train = sc.fit_transform(train)
test = sc.fit_transform(test)

Just like in logistic regression, we will calculate score using a linear decision boundary to be fed into our Sigmoid Neuron<br>

$d = \sum_{i=1}^{N} \theta^T X + b$
<br>
$\theta_{n,m}$ is the Weight matrix 
<br>
$X$ is the input matrix
<br>
$b$ is the bias

In [9]:
def decision_boundary(data,theta):
    score = data@theta
    return score.reshape(1,theta.shape[1])

We will use Sigmoid function as the Activation Function<br>

$\large \frac{1}{1+e^{-z}}$


In [10]:
def Sigmoid(x):
    '''Calculates the Sigmoid of the given input'''
    return (1/(1+np.exp(-x)))

We will use cross-entropy as our loss function for Stochastic Gradient Descent
<br>
$L(\theta) = -[y\, ln(\hat y) + (1-y) \,ln(1- \hat y)]$

In [11]:
def loss(label,pred):
    ans = -(label*np.log(pred) + (1-label)*np.log(1-pred))
    return ans

During forward pass we will calculate activations for each layer. Activation is nothing but the output of the layer<br>

$X->\,[Z=\theta^T X]->\,\sigma(Z)$
<br>

The trickiest part is updating the parameters or $\theta$ of our network. We do this with the help of Backpropagation. That is, we calculate the derivatives of our Loss function with respect to the parameters.
<br>
$\large \frac{\delta L(\theta)}{\delta \theta} = \frac{\delta A}{\delta \theta}.
\frac{\delta L(\theta)}{\delta A}$
<br><br>
The derivative $\frac{\delta A}{\delta \theta}$ can be calculated during the forward pass
<br>
The derivative $\frac{\delta L(\theta)}{\delta A}$ can only be calculated during the backward pass

We will be using the Stochastic Gradient Descent algorithm for optimization of our Loss function.

In [12]:
#initialise the values of theta for every layer
theta_1 = np.random.uniform(size=(train.shape[1],30))
theta_2 = np.random.uniform(size=(30,30))
theta_3 = np.random.uniform(size=(30,1))

#we are considering input to be the 0th layer

losses = [] #to track the final losses
errors = [] #to track the errors

alpha = 0.5 #learning rate for SGD
epochs = 200


for e in tqdm( range(epochs)):
    f_train = []
    f_test = []
    for i in range(train.shape[0]):
        #forward pass

        #layer 1
        x = train[i].reshape(1,train.shape[1])
        Z1 = decision_boundary(x,theta_1)
        A1 = Sigmoid(Z1)
        del_theta_1 = np.matmul(x.T,A1*(1-A1)) #del(A1)/del(theta_1)

        #layer 2
        Z2 = decision_boundary(A1,theta_2)
        A2 = Sigmoid(Z2)
        del_theta_2 = np.matmul(A1.T,A2*(1-A2)) #del(A2)/del(theta_2)

        #layer 3
        Z3 = decision_boundary(A2,theta_3)
        A3 = Sigmoid(Z3)
        del_theta_3 = np.matmul(A2.T,A3*(1-A3)) #del(A3)/del(theta_3)
        

        #we are done with the Forward prop
        #Now we will calculate the loss

        L = loss(c_train[i],A3) #Loss
        losses.append(L)

        E = A3 - c_train[i] #Error
        errors.append(E)
        
        f_train.append(1 if A3>0.5 else 0)
        
        

        #backward pass

        del_a3 = E/(A3*(1-A3)) #del(L)/del(A3)

        del_a2 = del_a3.T*np.matmul(theta_3,(A3*(1-A3)).T) #del(L)/del(A2)

        del_a1 = del_a2.T*np.matmul(theta_2,(A2*(1-A2)).T) #del(L)/del(A1)

        #update thetas

        theta_1 = theta_1 - alpha*del_a1.T*del_theta_1

        theta_2 = theta_2 - alpha*del_a2.T*del_theta_2

        theta_3 = theta_3 - alpha*del_a3*del_theta_3
        

for i in range(test.shape[0]):
    x = test[i].reshape(1,30)
    Z1 = decision_boundary(x,theta_1)
    A1 = Sigmoid(Z1)

    Z2 = decision_boundary(A1,theta_2)
    A2 = Sigmoid(Z2)

    Z3 = decision_boundary(A2,theta_3)
    A3 = Sigmoid(Z3)

    f_test.append(1 if A3>0.5 else 0)
print('training accuracy after epoch',e,'is',np.where(np.array(f_train)==c_train)[0].shape[0]/train.shape[0])
print('testing accuracy after epoch',e,'is',np.where(np.array(f_test)==c_test)[0].shape[0]/test.shape[0])        

100%|████████████████████████████████████████████████████████████████████████████████| 200/200 [00:07<00:00, 26.33it/s]

training accuracy after epoch 199 is 0.9626373626373627
testing accuracy after epoch 199 is 0.8421052631578947



