---
## Importing TensorFlow library  and other libraries. Check the version of TensorFlow library I am using
---

In [2]:
import numpy as np
import sympy as sp
import sys
import matplotlib.pyplot as plt
import matplotlib.cm as cm
plt.style.use('dark_background')
from keras.datasets import mnist
%matplotlib inline
import tensorflow as tf
tf.__version__

'2.15.0'

## Setting the Printing precision

In [3]:
np.set_printoptions(precision = 2)

## Generating Artificial Data set for This Tutorial

In [31]:
# Consider there are 5 samples, and each sample has 4 features.
# These samples can belong to 3 output classes (0,1,2)
# The data matrix, samples are arranged column by column
# So, the shape of the data matrix will be (4*5) instead of (5*4)
# The weights matrix is created for the output classes considering all features
# So, shape of the weights matrix will be (3*4)
# We know that Z = WX; so shape of Z is (shape of W)*(Shape of X)
# In other words, the Raw Scores matrix is of shape (3*5)

num_samples = 5
num_features = 4
num_labels = 3

# Creating data matrix of shape (4*5)
X = np.random.choice(np.arange(0, 5), size = (num_features, num_samples), replace = True)

# Creating the Output class labels vectors of shape (5*1)
y = np.random.choice([0, 1, 2], size = num_samples, replace = True)

# Creating the Weights matrix for output classes and features of shape (3*4)
W = np.random.choice(np.arange(-4, 4), size = (num_labels, num_features), replace = True)

print('X = ', X) # shape is (4*5)
print('-----------------------------------------------------')
print('y = ', y) # Shape is (1*5)
print('-----------------------------------------------------')
print('w = ', W) # Shape is (3*4)

X =  [[2 4 0 2 3]
 [0 4 4 3 1]
 [2 1 3 4 3]
 [1 4 4 2 4]]
-----------------------------------------------------
y =  [2 0 1 1 0]
-----------------------------------------------------
w =  [[-1  3 -2 -2]
 [ 2  2 -3  2]
 [ 2  3  0 -3]]


## Adding bias feature in the Data matrix (X) and in the Weights Matrix (W) using numpy.
#### Now, one thing to note that this cell must not be ran more than once, otherwise one more bias feature will be added.

#### So, we add randomly generated bis values in the last column of weights matrix W and assign a row of ones in the Data matrix X

In [33]:
print('X = ')
print(X)
print('-----------------Adding Bias Feature in the Data Matrix---------------------')
print('X with bias feature = ')
X = np.vstack([X, np.ones((1, num_samples))])
print(X)

X = 
[[2 4 0 2 3]
 [0 4 4 3 1]
 [2 1 3 4 3]
 [1 4 4 2 4]]
-----------------Adding Bias Feature in the Data Matrix---------------------
X with bias feature = 
[[2. 4. 0. 2. 3.]
 [0. 4. 4. 3. 1.]
 [2. 1. 3. 4. 3.]
 [1. 4. 4. 2. 4.]
 [1. 1. 1. 1. 1.]]


In [34]:
# Adjust the weight matrix and add bias values
# for bias as the last column
W = np.hstack([W, np.ones((num_labels, 1))]) # For simplicity All values are considered as 1
print('Bias Added Weights Matrix is')
print(W)

Bias Added Weights Matrix is
[[-1.  3. -2. -2.  1.]
 [ 2.  2. -3.  2.  1.]
 [ 2.  3.  0. -3.  1.]]


## Calculating the Raw Scores Vector Z as Z = WX
#### We don't consider the term 'b' for bias as it is already accounted in W and X

In [36]:
Z = np.dot(W,X)
print('Z = ')
print(Z)

print('-------------------------------------------------------------')

print('The Correct lables y =')
print(y)

# Quantify Unhappiness
# In actual, the 0th sample belongs to 2nd output class
# So, to be happy the the 1st sample's Raw Score will be highest for output class 2
# In this case, This is true as 2 is bigger than -7 and 1; that means I am happy
# Again I are happy for the 2nd sample.
# For other samples, I am not happy.

Z = 
[[ -7.  -1.  -1.  -4. -13.]
 [  1.  22.   8.   3.   8.]
 [  2.   9.   1.   8.  -2.]]
-------------------------------------------------------------
The Correct lables y =
[2 0 1 1 0]


## SoftMax Function and Its Inner working

In [39]:
# Inner Working of SoftMax Funtion

# Exponentiate (to ommit negative values) and then Normalize (to make the values small)
# Thus we get predicted probabilities for each sample considering the three output classes
print('Z = ')
print(Z)
print('---------')
print('Z_exp = ')
Z_exp = np.exp(Z)
print(Z_exp)
print('---------')
print('Predicted Probabilities Matrix = ')
print(Z_exp/np.sum(Z_exp, axis = 0))
print('---------')
print('Correct Output Class labels')
print(y)

# Interpretation of the predicted Probabilities matrix
# The predicted probability that the 0th sample will belong to output class 0 is 9.02*10^(-5)
# The predicted probability that the 0th sample will belong to output class 1 is 2.69*10^(-1)
# The predicted probability that the 0th sample will belong to output class 2 is 7.31*10^(-1)

# A value close to 1 means we are happy for the sample's output class label.
# An ideal predicted probability vector for the 0th sample will be [0 0 1]^T
# So, the loss is -log(1); which is 0; so 0 unhappiness means I are very happy
# If the predicted probability vector will be [0 0 1]^T and 0 is the actual class label
# Then loss will be -log(0) which is infinity; very unhappy

Z = 
[[ -7.  -1.  -1.  -4. -13.]
 [  1.  22.   8.   3.   8.]
 [  2.   9.   1.   8.  -2.]]
---------
Z_exp = 
[[9.12e-04 3.68e-01 3.68e-01 1.83e-02 2.26e-06]
 [2.72e+00 3.58e+09 2.98e+03 2.01e+01 2.98e+03]
 [7.39e+00 8.10e+03 2.72e+00 2.98e+03 1.35e-01]]
---------
Predicted Probabilities Matrix = 
[[9.02e-05 1.03e-10 1.23e-04 6.10e-06 7.58e-10]
 [2.69e-01 1.00e+00 9.99e-01 6.69e-03 1.00e+00]
 [7.31e-01 2.26e-06 9.11e-04 9.93e-01 4.54e-05]]
---------
Correct Output Class labels
[2 0 1 1 0]


In [40]:
# Defining the SoftMax function using def keyword
# Define softmax function
def softmax(Z): # 0th axis means top-bottom
  Z_exp =np.exp(Z - np.max(Z, axis=0)) # To eliminate overflow subtraction is done and then exponentiate it
  return(Z_exp /np.sum(Z_exp,axis= 0)) # Normalization is done in this step

## Calculating Predicted Probabilities using the softmax() function just defined in above code cell

In [41]:
# Calculate the probability matrix
P = softmax(Z)
print('Z = ')
print(Z)
print('-----------')
print('P = ')
print(P)
print('-----------')
# Sum in each column of matrix P
print('Sum of Each Column')
print(np.sum(P,axis= 0))
print('------------')
# Print the correct label for each sample
print('y = ')
print(y)

Z = 
[[ -7.  -1.  -1.  -4. -13.]
 [  1.  22.   8.   3.   8.]
 [  2.   9.   1.   8.  -2.]]
-----------
P = 
[[9.02e-05 1.03e-10 1.23e-04 6.10e-06 7.58e-10]
 [2.69e-01 1.00e+00 9.99e-01 6.69e-03 1.00e+00]
 [7.31e-01 2.26e-06 9.11e-04 9.93e-01 4.54e-05]]
-----------
Sum of Each Column
[1. 1. 1. 1. 1.]
------------
y = 
[2 0 1 1 0]


## Calculating The tRaining Loss for all samples and its inner workings

In [47]:
# Inner working of the Loss function
print('P = ')
print(P)
print('-----------')
print('y = ')
print(y)
print('-----------')
print('A 1*5 matrix = ')
print(np.arange(num_samples))
print('-----------')
print('Predicted Probabilities that Each sample will belong to its correct class')
print(P[y,np.arange(num_samples)])
print('-----------')
print('The Loss is given by')
print(-np.log(P[y,np.arange(num_samples)]))
print('-----------')
print('The Average Loss is given as')
print(np.mean(-np.log(P[y,np.arange(num_samples)])))

P = 
[[9.02e-05 1.03e-10 1.23e-04 6.10e-06 7.58e-10]
 [2.69e-01 1.00e+00 9.99e-01 6.69e-03 1.00e+00]
 [7.31e-01 2.26e-06 9.11e-04 9.93e-01 4.54e-05]]
-----------
y = 
[2 0 1 1 0]
-----------
A 1*5 matrix = 
[0 1 2 3 4]
-----------
Predicted Probabilities that Each sample will belong to its correct class
[7.31e-01 1.03e-10 9.99e-01 6.69e-03 7.58e-10]
-----------
The Loss is given by
[3.13e-01 2.30e+01 1.03e-03 5.01e+00 2.10e+01]
-----------
The Average Loss is given as
9.864231154223361


In [48]:
# Defining the Training loss for each sample
loss = -np.log(P[y,np.arange(num_samples)])
print('Loss = ')
print(loss)
print('-----------')
# Calculate average training loss
data_loss = np.mean(loss)
print('Total loss = %f'%(data_loss))

Loss = 
[3.13e-01 2.30e+01 1.03e-03 5.01e+00 2.10e+01]
-----------
Total loss = 9.864231


## Calculating Regularization Loss and Its Inner workings
* We don't want the training data to have the model overfit to specific features.
* Here L1 and L2 regularizations are mostly used.
* Regularization helps by shrunking the weights values to zero (0) and at the same time, they minimize the average training loss .
* In case of L1 regularization, most weights values are actually become zero. It is used when the user has pervious knowledge on which feature set matters the most by experience. They identify which features matter the most.
* In case of L2 regularization, all weights values are shrunk to 0 uniformly, resulting is small weights very close to 0. Used when the user don't know which feature set matters the most.
* Regularization is not applied on bias values as they don't contribute in overfitting. Bias values are specific to classes and they are not associated with samples. Bias are not applied on original features.

In [50]:
# Inner Working of Regularization Loss
print('The Weights Matrix W is')
print(W)
print('---------------')
print('The Weights Matrix excluding last Column')
print(W[:,:-1]) # As the last column represents the bias value
print('---------------')
print('Element-wise Product for L2 regularization')
print(W[:,:-1]*W[:,:-1]) # Calculation for the L2 Regularization; elementwise product
# Observe all the entries of W are just squared.

The Weights Matrix W is
[[-1.  3. -2. -2.  1.]
 [ 2.  2. -3.  2.  1.]
 [ 2.  3.  0. -3.  1.]]
---------------
The Weights Matrix excluding last Column
[[-1.  3. -2. -2.]
 [ 2.  2. -3.  2.]
 [ 2.  3.  0. -3.]]
---------------
Element-wise Product for L2 regularization
[[1. 9. 4. 4.]
 [4. 4. 9. 4.]
 [4. 9. 0. 9.]]


In [51]:
# Calculating Regularizatioin Loss (L2)
reg = 0.1 # strength of regularization = 10%
reg_loss = (np.sum(W[:,:-1]*W[:,:-1])) * reg
print('Total loss = %f'%(data_loss + reg_loss))

Total loss = 15.964231


## Calculating Gradient of Total Loss w.r.t to the Weights Matrix

In [52]:
# Adjust the probability matrix such that 1 is subtracted
# from each samples correct category probability.
P[y, range(num_samples)] = P[y, range(num_samples)] - 1

# Calculate the gradient of total loss w.r.t. the weights W
dW = (1/num_samples)* np.dot(P, X.T)+ reg * 2* np.hstack([W[:,:-1], np.zeros((num_labels,1))])
# np.zeros is added as no update for the bias values are considered
# So, the bias values will be 0
print(dW)

# Interpretation of the gradient object
# Consider three output categories Mild, High and VeryHigh
# The 5 features are HR, BP, Temp, Weight and Glucose
# The values in 0th row and 1st column is -0.4 (Python way of indexing) means
# If I were to tweak the weights for BP as far as the Mild category is considered,
# The loss is going to change
# If I increase the weight value for the BP feature of Mild category,
# The loss will decrease

[[-1.6  -0.4  -1.2  -2.   -0.4 ]
 [ 1.51  0.8  -0.49  1.66  0.25]
 [ 0.69  1.2   0.69 -0.26  0.15]]


## Applying the Gradient Descent Method To the Data set Created

In [53]:
alpha = 1e-02 # learning rate
tol = 1e-05 # stopping tolerance
iter = 0
maxiter = 1000

while np.linalg.norm(dW) >tol and iter< maxiter:
  W = W + alpha*(-dW)
  iter = iter+1
  print('Iteration = %d, ||gradL(W)|| = %f'%(iter, np.linalg.norm(dW)))
# Norm of a matrix is the square root of the sum of squared elements of the matrix

Iteration = 1, ||gradL(W)|| = 4.093389
Iteration = 2, ||gradL(W)|| = 4.093389
Iteration = 3, ||gradL(W)|| = 4.093389
Iteration = 4, ||gradL(W)|| = 4.093389
Iteration = 5, ||gradL(W)|| = 4.093389
Iteration = 6, ||gradL(W)|| = 4.093389
Iteration = 7, ||gradL(W)|| = 4.093389
Iteration = 8, ||gradL(W)|| = 4.093389
Iteration = 9, ||gradL(W)|| = 4.093389
Iteration = 10, ||gradL(W)|| = 4.093389
Iteration = 11, ||gradL(W)|| = 4.093389
Iteration = 12, ||gradL(W)|| = 4.093389
Iteration = 13, ||gradL(W)|| = 4.093389
Iteration = 14, ||gradL(W)|| = 4.093389
Iteration = 15, ||gradL(W)|| = 4.093389
Iteration = 16, ||gradL(W)|| = 4.093389
Iteration = 17, ||gradL(W)|| = 4.093389
Iteration = 18, ||gradL(W)|| = 4.093389
Iteration = 19, ||gradL(W)|| = 4.093389
Iteration = 20, ||gradL(W)|| = 4.093389
Iteration = 21, ||gradL(W)|| = 4.093389
Iteration = 22, ||gradL(W)|| = 4.093389
Iteration = 23, ||gradL(W)|| = 4.093389
Iteration = 24, ||gradL(W)|| = 4.093389
Iteration = 25, ||gradL(W)|| = 4.093389
Iteration