In [5]:
# A layer of 3 neurons: every neuron receives the same 4 inputs and has its own bias

ip = [1, 2, 3, 2.5]

# One row of 4 weights per neuron (a list of lists)
weights = [
    [0.2, 0.8, -0.5, 1],
    [0.5, -0.91, 0.26, -0.5],
    [-0.26, -0.27, 0.17, 0.87],
]

# Why biases are sometimes initialised to a non-zero value: if a neuron does not fire and
# its bias is 0, its output (which is the input of the next layer) is 0, so the next neuron
# does not fire either; if that neuron's bias is 0 as well, the network is dead.
bias = [2, 3, 0.5]

# Output of each neuron = sum(input * weight) + bias, accumulated one input at a time
layer_op = []
for neuron_idx, neuron_bias in enumerate(bias):
  weighted_sum = 0
  for feature_idx, feature in enumerate(ip):
    weighted_sum += feature * weights[neuron_idx][feature_idx]
  layer_op.append(weighted_sum + neuron_bias)

layer_op

[4.8, 1.21, 2.385]

In [6]:
# NumPy's dot product replaces the hand-written loops above
import numpy as np

# weights is a (3, 4) matrix and ip is a length-4 vector, so np.dot(weights, ip) gives one
# value per neuron (length 3). Swapping the arguments, np.dot(ip, weights), raises an error
# because the length-4 vector does not line up with the matrix's first dimension (3).
weighted_sums = np.dot(weights, ip)
layer_op = weighted_sums + bias

layer_op

array([4.8  , 1.21 , 2.385])

In [7]:
# Why batches?
# - Batches allow more parallel computation and help the model generalise better.
# - Increasing the batch size up to an optimum size (generally a power of 2) leads to better fitting/training of the neurons.
# - If all the samples are shown at the same time -> 'OVERFITTING'.

In [8]:
# A batch of 3 input samples, each with 4 features
ip = [
    [1, 2, 3, 2.5],
    [2, 5, 1, 2],
    [-1.5, 2.7, 3.3, -0.8],
]

# 3 neurons, one row of 4 weights per neuron; the same weights are applied to every sample
weights = [
    [0.2, 0.8, -0.5, 1],
    [0.5, -0.91, 0.26, -0.5],
    [-0.26, -0.27, 0.17, 0.87],
]

# One bias per neuron
bias = [2, 3, 0.5]
layer_op = []

In [9]:
# ip is (3 samples x 4 features) and weights is (3 neurons x 4 weights), so the weights are
# transposed to (4 x 3) before the product; the result has one row per sample
neuron_major = np.array(weights)
op = np.dot(ip, neuron_major.T) + bias
op

array([[ 4.8  ,  1.21 ,  2.385],
       [ 7.9  , -1.29 ,  0.54 ],
       [ 1.41 ,  1.051,  0.026]])

In [10]:
# Adding a second layer to the network
weights2 = [
    [0.1, -0.14, 0.5],
    [-0.5, 0.12, -0.33],
    [-0.44, 0.73, -0.13],
]
bias2 = [-1, 0.2, -0.5]

# weights2 is used without a transpose, so its rows line up with the 3 outputs of the first layer
layer2_sums = np.dot(op, weights2)
op2 = layer2_sums + bias2
op2



array([[-2.1744 ,  1.41425,  1.19065],
       [ 0.1974 , -0.6666 ,  3.8055 ],
       [-1.39594,  0.1477 , -0.14521]])

In [11]:
# Initialising a layer
# Either we load stored weights and biases from a previously saved model, or we initialise them ourselves.
# We need weights that are typically between -1 and 1: each neuron computes a weighted sum of its inputs,
# so keeping the weights small avoids the problem of exploding gradients later.

In [12]:
# Fix NumPy's global random state so the randomly initialised weights are reproducible
np.random.seed(0)

# A batch of 3 input samples with 4 features each
X = [
    [1, 2, 3, 2.5],
    [2.0, 5.0, -1.0, 2.0],
    [-1.5, 2.7, 3.3, -0.8],
]

In [13]:
class Layer_Dense:
  """Fully connected layer computing ``inputs . weights + bias`` for a batch of inputs."""

  def __init__(self,n_inputs,n_neurons):
    """Create small random weights and zero biases.

    n_inputs  -- number of values fed to each neuron (features per sample)
    n_neurons -- number of neurons in the layer (outputs per sample)
    """
    # Weights are stored as (n_inputs, n_neurons) so the forward pass needs no transpose.
    gaussian_draws = np.random.randn(n_inputs, n_neurons)
    self.weights = gaussian_draws * 0.1
    # One bias per neuron, kept as a single row so it is added to every sample's row.
    self.bias = np.zeros((1, n_neurons))

  def forward(self,inputs):
    """Compute the layer output for ``inputs`` and store it in ``self.op`` (returns nothing)."""
    weighted_sums = np.dot(inputs, self.weights)
    self.op = weighted_sums + self.bias


In [14]:
# n_inputs of the first layer must equal the number of feature columns in the input (4)
layer_1 = Layer_Dense(n_inputs=4, n_neurons=5)
# n_inputs of the second layer must equal the number of outputs of the previous layer (5)
layer_2 = Layer_Dense(n_inputs=5, n_neurons=2)

In [15]:
# Forward pass of the batch X through the first layer; the result is kept on layer_1.op
layer_1.forward(X)
layer_1.op

array([[ 0.10758131,  1.03983522,  0.24462411,  0.31821498,  0.18851053],
       [-0.08349796,  0.70846411,  0.00293357,  0.44701525,  0.36360538],
       [-0.50763245,  0.55688422,  0.07987797, -0.34889573,  0.04553042]])

In [16]:
# Feed the first layer's output into the second layer; the result is kept on layer_2.op
layer_2.forward(layer_1.op)
layer_2.op

array([[ 0.148296  , -0.08397602],
       [ 0.14100315, -0.01340469],
       [ 0.20124979, -0.07290616]])

In [17]:
# Why do we need an activation function in the first place? 'NON-LINEARITY'

# If we don't have a non-linear activation:
# 'PROBLEM' -> our model is basically a linear function/mapping of the inputs, so it can only fit a linear function,
# or at best approximate a non-linear function inefficiently.

In [18]:
# Sigmoid instead of a step function -> a step function misses the granularity of how close our model was to outputting a 1 or a 0.
# ReLU instead of sigmoid -> it is faster, and sigmoid has the problem of vanishing gradients.

In [19]:
!pip install nnfs
import nnfs
from nnfs.datasets import spiral_data
nnfs.init()

Collecting nnfs
  Downloading nnfs-0.5.1-py3-none-any.whl (9.1 kB)
Installing collected packages: nnfs
Successfully installed nnfs-0.5.1


In [20]:
# NOTE: this rebinds X (previously the hand-written 3x4 batch) to the spiral dataset features;
# y holds the labels. Presumably the arguments are (samples per class, number of classes) - TODO confirm.
X, y = spiral_data(100, 3)

In [32]:
# y is 1-D with one entry per sample (300 in total)
y.shape

(300,)

In [21]:
class Activation_relu:
  """ReLU activation: element-wise ``max(0, x)``."""

  def forward(self,input):
    """Replace every negative entry of ``input`` with 0 and store the result in ``self.op``."""
    rectified = np.maximum(0, input)
    self.op = rectified


In [22]:
# Layer_Dense is already defined in an earlier cell with exactly the same body, so it is not
# redefined here (a second definition would silently shadow the first one).

# ReLU activation applied to the output of the first dense layer
activation1 = Activation_relu()

In [23]:
# The spiral dataset has 2 feature columns, hence 2 inputs per neuron
layer_1 = Layer_Dense(2, 5)
layer_1.forward(X)
# ReLU reads the dense output; the result is stored on activation1.op (forward returns nothing)
activation1.forward(layer_1.op)

In [24]:
# Raw output of the dense layer, before ReLU (still contains negative values)
layer_1.op

array([[ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00],
       [-8.35815910e-04, -7.90404272e-04, -1.33452227e-03,
         4.65504505e-04,  4.56846210e-05],
       [-2.39994470e-03,  5.93469958e-05, -2.24808278e-03,
         2.03573116e-04,  6.10024377e-04],
       ...,
       [ 1.13291524e-01, -1.89262271e-01, -2.06855070e-02,
         8.11079666e-02, -6.71350807e-02],
       [ 1.34588361e-01, -1.43197834e-01,  3.09493970e-02,
         5.66337556e-02, -6.29687458e-02],
       [ 1.07817926e-01, -2.00809643e-01, -3.37579325e-02,
         8.72561932e-02, -6.81458861e-02]], dtype=float32)

In [25]:
activation1.op # output after ReLU: every negative value has been replaced by 0

array([[0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 4.65504505e-04,
        4.56846210e-05],
       [0.00000000e+00, 5.93469958e-05, 0.00000000e+00, 2.03573116e-04,
        6.10024377e-04],
       ...,
       [1.13291524e-01, 0.00000000e+00, 0.00000000e+00, 8.11079666e-02,
        0.00000000e+00],
       [1.34588361e-01, 0.00000000e+00, 3.09493970e-02, 5.66337556e-02,
        0.00000000e+00],
       [1.07817926e-01, 0.00000000e+00, 0.00000000e+00, 8.72561932e-02,
        0.00000000e+00]], dtype=float32)

In [26]:
# Why use softmax instead of ReLU?
# How wrong is our model when the accuracy is the same for two predictions?

# ReLU gives an unbounded output in [0, infinity), and learning from clipped values -> ineffective learning.
# We cannot use the absolute value function since it loses the meaning of negative values entirely, hence the exponential,
# followed by normalisation to get the required probability distribution ->> "SOFTMAX"

In [27]:
class activation_softmax:
  """Softmax activation: turns each row of scores into a probability distribution."""

  def forward(self, inputs):
    """Store the row-wise softmax of ``inputs`` (samples along axis 0) in ``self.op``."""
    # Subtracting each row's maximum keeps the exponentials from overflowing; the
    # normalised result is unchanged by this shift.
    row_max = np.max(inputs, axis=1, keepdims=True)
    exponentials = np.exp(inputs - row_max)
    row_totals = np.sum(exponentials, axis=1, keepdims=True)
    self.op = exponentials / row_totals

In [28]:
# The first layer emits 5 values per sample and there are 3 classes, so the output layer maps 5 -> 3
layer_2 = Layer_Dense(n_inputs=5, n_neurons=3)
activation_2 = activation_softmax()

# dense layer 1 -> ReLU -> dense layer 2 -> softmax
layer_2.forward(inputs=activation1.op)
activation_2.forward(inputs=layer_2.op)
activation_2.op[:5]

array([[0.33333334, 0.33333334, 0.33333334],
       [0.33334148, 0.3333302 , 0.33332834],
       [0.33335316, 0.33332598, 0.33332086],
       [0.333332  , 0.33330762, 0.3333604 ],
       [0.33333603, 0.33330083, 0.33336315]], dtype=float32)

In [29]:
# Loss functions?

# Our model gives its output in the form of a probability distribution instead of classes, so it is more convenient for the optimizer to know
# the confidence with which our model classified each sample, correctly or not.
# In classification we use categorical cross-entropy.

In [46]:
class loss:
  """Base class for losses; subclasses provide ``forward(op, y)`` returning one loss per sample."""

  def calculate(self,op,y):
    """Return the mean of the per-sample losses for predictions ``op`` and targets ``y``."""
    sample_loss = self.forward(op,y)
    data_loss = np.mean(sample_loss)
    return data_loss

class loss_categoricalcrossentropy(loss):
  """Categorical cross-entropy: negative log of the probability assigned to the true class."""

  def forward(self,y_pred,y_true):
    """Return the negative log-likelihood of the true class for every sample.

    y_pred -- predicted probabilities, one row per sample and one column per class
    y_true -- either class indices of shape (batch_size,) or one-hot encoded
              vectors of shape (batch_size, num_classes); plain lists are accepted

    Raises ValueError if y_true is neither 1-D nor 2-D.
    """
    y_true = np.asarray(y_true)  # lets callers pass plain Python lists as well as arrays
    sample = len(y_pred)
    # Clip away exact 0 and 1 so that -log(0) can never produce an infinite loss
    y_pred_clipped = np.clip(y_pred, 1e-7, 1-1e-7)

    if y_true.ndim == 1:      # class indices -> (batch_size,)
      correct_confidences = y_pred_clipped[range(sample), y_true]
    elif y_true.ndim == 2:    # one-hot encoded vectors -> (batch_size, num_classes)
      correct_confidences = np.sum(y_pred_clipped * y_true, axis=1)
    else:
      # Previously this fell through both checks and failed on an unbound variable
      raise ValueError(
          "y_true must be 1-D class indices or 2-D one-hot vectors, got %d dimension(s)" % y_true.ndim)

    negative_log_likelihoods = -np.log(correct_confidences)
    return negative_log_likelihoods


In [48]:
# Mean categorical cross-entropy of the softmax output against the labels y.
# NOTE(review): assigning the result to `loss` rebinds the name of the `loss` base class, so
# the class is no longer reachable under that name after this cell runs; consider a
# different variable name (e.g. `loss_value`) once all later uses of `loss` are checked.
loss_function = loss_categoricalcrossentropy()
loss = loss_function.calculate(activation_2.op, y)

loss

1.0988972