---

Load libraries

---

In [1]:
## Load libraries
import numpy as np
import sympy as sp
import sys
import matplotlib.pyplot as plt
import matplotlib.cm as cm
plt.style.use('dark_background')
from keras.datasets import mnist
%matplotlib inline




---

Set printing precision

---

In [2]:
np.set_printoptions(precision = 2)

---

Import tensorflow and check version

---

In [3]:
import tensorflow as tf

In [4]:
tf.__version__

'2.15.0'

In [5]:
## Mount Google drive folder if running in Colab
if('google.colab' in sys.modules):
    from google.colab import drive
    drive.mount('/content/drive', force_remount = True)
    DIR = '/content/drive/MyDrive/Colab Notebooks/MAHE/MSIS Coursework/OddSem2023MAHE'
    DATA_DIR = DIR + '/Data/'
else:
    DATA_DIR = 'Data/'

---

Calculating softmax loss and gradient for a toy dataset

----

In [10]:
# Generate artificial data with 5 samples, 4 features per sample
# and 3 output classes. Here we arrange samples column by column so X will be 
# 4*5 instead of 5*4
num_samples = 5 # number of samples
num_features = 4 # number of features (a.k.a. dimensionality)
num_labels = 3 # number of output labels
# Data matrix (each column = single sample)
X = np.random.choice(np.arange(0, 5), size = (num_features, num_samples), replace = True)
# Class labels
y = np.random.choice([0, 1, 2], size = num_samples, replace = True)
# Randomly assign entries of weights matrix
W = np.random.choice(np.arange(-4, 4), size = (num_labels, num_features), replace = True)

print('X = ')
print(X)
print('y = ')
print(y)
print('W = ')
print(W)

X = 
[[3 3 1 3 0]
 [4 4 3 1 0]
 [3 3 4 1 1]
 [0 0 2 1 1]]
y = 
[2 1 2 2 0]
W = 
[[-1  3  0  0]
 [ 1  3  0  3]
 [ 3  2  1  0]]


---

Add the bias feature to the data matrix (run this cell only once!)

---

In [16]:
# Add the bias feature to the data matrix (run this cell only once!)
print('X = ')
print(X)
print('X with bias feature = ')
X = np.vstack([X, np.ones((1, num_samples))])
print(X)

X = 
[[3 3 1 3 0]
 [4 4 3 1 0]
 [3 3 4 1 1]
 [0 0 2 1 1]]
X with bias feature = 
[[3. 3. 1. 3. 0.]
 [4. 4. 3. 1. 0.]
 [3. 3. 4. 1. 1.]
 [0. 0. 2. 1. 1.]
 [1. 1. 1. 1. 1.]]


---

Adjust the weight matrix with (possibly random) values added
for bias as the last column (run this cell only once!)

---

In [18]:
# Adjust the weight matrix with (possibly random) values added
# for bias as the last column (run this cell only once!)
W = np.hstack([W, np.ones((num_labels, 1))])
print(W)

[[-1.  3.  0.  0.  1.]
 [ 1.  3.  0.  3.  1.]
 [ 3.  2.  1.  0.  1.]]


---

Calculate the raw zcores matrix

---

In [20]:
Z = np.dot(W,X)
print('Z = ')
print(Z)

Z = 
[[10. 10.  9.  1.  1.]
 [16. 16. 17. 10.  4.]
 [21. 21. 14. 13.  2.]]


In [22]:
print (y)

[2 1 2 2 0]


---

Define softmax function

---


In [23]:
# Define softmax function
def softmax(Z):
  # Convert scores to non-normalized probabilites matrix. Note that for each sample,
  # that is in each column, the values don't add up to 1. Also note that the
  # output values are typically large or small
  Z_exp =np.exp(Z - np.max(Z, axis=0))
 
    
  # Normalize probabilities matrix such that the sum across each column is equal to 1.
  # Now we have actually probability values for each sample.
  return(Z_exp /np.sum(Z_exp,axis= 0))

---

Calculate the probability matrix

---

In [25]:
#  Calculate the probability matrix
P = softmax(Z)
print(Z)
print(P)
# Sum in each column of matrix P
print(np.sum(P,axis= 0))
# Print the correct label for each sample
print(y)

[[10. 10.  9.  1.  1.]
 [16. 16. 17. 10.  4.]
 [21. 21. 14. 13.  2.]]
[[1.66e-05 1.66e-05 3.19e-04 5.85e-06 4.20e-02]
 [6.69e-03 6.69e-03 9.52e-01 4.74e-02 8.44e-01]
 [9.93e-01 9.93e-01 4.74e-02 9.53e-01 1.14e-01]]
[1. 1. 1. 1. 1.]
[2 1 2 2 0]


---

Calculate training loss for all samples.

---

In [28]:
loss = -np.log(P[y, np.arange(num_samples)])
print('Loss = ')
print(loss)
# Calculate average training loss
loss_data = np.mean(loss)
print('Total loss = %f'%(loss_data))

Loss = 
[0.01 5.01 3.05 0.05 3.17]
Total loss = 2.256162


---

Calculate regularization loss

---


In [31]:
print(W)
print(W[:,:-1])
print(W[:,:-1]*W[:,:-1])

[[-1.  3.  0.  0.  1.]
 [ 1.  3.  0.  3.  1.]
 [ 3.  2.  1.  0.  1.]]
[[-1.  3.  0.  0.]
 [ 1.  3.  0.  3.]
 [ 3.  2.  1.  0.]]
[[1. 9. 0. 0.]
 [1. 9. 0. 9.]
 [9. 4. 1. 0.]]


In [36]:
# Regularization loss
reg = 0.1 # strength of regularization = 10%
loss_reg = (np.sum(W[:,:-1]*W[:,:-1])) * reg
print('Total loss = %f'%(loss_data+loss_reg))

Total loss = 6.556162


---

Calculate the gradient of total loss w.r.t. the weights W

---


In [41]:
# Adjust the probability matrix such that 1 is subtracted
# from each samples correct category probability.
P[y, range(num_samples)] = P[y, range(num_samples)] - 1 

# Calculate the gradient of total loss w.r.t. the weights W
dW = (1/num_samples)* np.dot(P, X.T)+ reg * 2* np.hstack([W[:,:-1], np.zeros((num_labels,1))])
print(dW)

[[-0.2   0.6  -0.79 -0.79 -0.79]
 [-1.97 -2.01 -1.45  1.16 -0.43]
 [-3.23 -4.19 -4.76 -2.17 -1.78]]


---

Apply gradient descent to the toy dataset

---


In [45]:
alpha = 1e-02 # learning rate
tol = 1e-05 # stopping tolerance
iter = 0
maxiter = 1000

while np.linalg.norm(dW) >tol and iter< maxiter:
  W = W + alpha*(-dW)
  iter = iter+1
  print('Iteration = %d, ||gradL(W)|| = %f'%(iter, np.linalg.norm(dW) ))

Iteration = 1, ||gradL(W)|| = 8.504106
Iteration = 2, ||gradL(W)|| = 8.504106
Iteration = 3, ||gradL(W)|| = 8.504106
Iteration = 4, ||gradL(W)|| = 8.504106
Iteration = 5, ||gradL(W)|| = 8.504106
Iteration = 6, ||gradL(W)|| = 8.504106
Iteration = 7, ||gradL(W)|| = 8.504106
Iteration = 8, ||gradL(W)|| = 8.504106
Iteration = 9, ||gradL(W)|| = 8.504106
Iteration = 10, ||gradL(W)|| = 8.504106
Iteration = 11, ||gradL(W)|| = 8.504106
Iteration = 12, ||gradL(W)|| = 8.504106
Iteration = 13, ||gradL(W)|| = 8.504106
Iteration = 14, ||gradL(W)|| = 8.504106
Iteration = 15, ||gradL(W)|| = 8.504106
Iteration = 16, ||gradL(W)|| = 8.504106
Iteration = 17, ||gradL(W)|| = 8.504106
Iteration = 18, ||gradL(W)|| = 8.504106
Iteration = 19, ||gradL(W)|| = 8.504106
Iteration = 20, ||gradL(W)|| = 8.504106
Iteration = 21, ||gradL(W)|| = 8.504106
Iteration = 22, ||gradL(W)|| = 8.504106
Iteration = 23, ||gradL(W)|| = 8.504106
Iteration = 24, ||gradL(W)|| = 8.504106
Iteration = 25, ||gradL(W)|| = 8.504106
Iteration

---

TensorFlow includes a low-level API known as TensorFlow core and many high-level APIs, including Keras (tf.keras).

Now we will focus on the TensorFlow low-level API starting with *TensorFlow constants* (https://www.tensorflow.org/guide/tensor) which have the following proprties:

1. Values are stored at the time of defining the tensor
2. Immutable

---

In [46]:
T1 = tf.constant(5.0, dtype = tf.float16, name = 't1')
T2 = tf.constant(8.0, dtype = tf.float16, name = 't2')
T3 = tf.constant(10.0, dtype = tf.float16, name = 't3')
T4 = tf.constant([[2, 1, 4, 3], [1, 2, 3, 4], [4, 3, 2, 1]], dtype = tf.float32, name = 't4')

print(T1)
print(T2)
print(T3)
print(T4)

tf.Tensor(5.0, shape=(), dtype=float16)
tf.Tensor(8.0, shape=(), dtype=float16)
tf.Tensor(10.0, shape=(), dtype=float16)
tf.Tensor(
[[2. 1. 4. 3.]
 [1. 2. 3. 4.]
 [4. 3. 2. 1.]], shape=(3, 4), dtype=float32)


---

Elementwise operations on constant tensors

---

In [47]:
print(T1+T2)
print(T1-T2)

tf.Tensor(13.0, shape=(), dtype=float16)
tf.Tensor(-3.0, shape=(), dtype=float16)


---

Built-in operations

---

In [48]:
op1 = tf.add(T1, T2)
op2 = tf.exp(T4)
print(op1)
print(op2)

tf.Tensor(13.0, shape=(), dtype=float16)
tf.Tensor(
[[ 7.39  2.72 54.6  20.09]
 [ 2.72  7.39 20.09 54.6 ]
 [54.6  20.09  7.39  2.72]], shape=(3, 4), dtype=float32)


---

TensorFlow session is applicable only for **TensorFlow version 1** which allows for defining a computation (data flow) graph such that the nodes are the operations and edges are the tensors followed by an execution of the graph.

**TensorFlow version 2** has eager execution (execute immediately withut creating a session).

If version 1 is to be used then, we import TensorFlow as follows:

$$\begin{align*}&\texttt{import tensorflow.compat.v1 as tf}\\&\texttt{
tf.disable_v2_behavior()}\end{align*}$$

or eager execution in version 2 can be disabled using $$\begin{align*}&\texttt{import tensorflow as tf}\\&\texttt{tf.compat.v1.disable_eager_execution()}\end{align*}$$

----

In [49]:
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()
tf.__version__


Instructions for updating:
non-resource variables are not supported in the long term


'2.15.0'

In [50]:
T1 = tf.constant(5.0, dtype = tf.float16, name = 't1')
T2 = tf.constant(8.0, dtype = tf.float16, name = 't2')
T4 = tf.constant([[2, 1, 4, 3], [1, 2, 3, 4], [4, 3, 2, 1]], dtype = tf.float32, name = 't4')
op1 = tf.add(T1, T2)
op2 = tf.exp(T4)
print(op1)
print(op2)
with tf.Session() as sess:
  print(sess.run(op1))
  print(sess.run(op2))

Tensor("Add:0", shape=(), dtype=float16)
Tensor("Exp:0", shape=(3, 4), dtype=float32)
13.0
[[ 7.39  2.72 54.6  20.09]
 [ 2.72  7.39 20.09 54.6 ]
 [54.6  20.09  7.39  2.72]]


---

Placeholders (applicable only for **TensorFlow version 1**):  a TensorFlow computation graph can be parameterized to accept external inputs (such as input data for a machine learning algorithm) during runtime using placeholders. That is, placeholders let are empty tensors whose values can be provided at runtime.

---



In [51]:
T1 = tf.placeholder(tf.float32)
T2 = tf.placeholder(tf.float32)
# Define some operations
op1 = T1 + T2
op2 = T1 * T2
with tf.Session() as sess:
  print('mutiply: ', sess.run(op1, feed_dict = {T1: 2, T2: 3}))
  print('add: ', sess.run(op2, feed_dict = {T1: 2, T2: 3}))

mutiply:  5.0
add:  6.0


---

Re-import TensorFlow version 2

---

In [52]:
import tensorflow as tf
tf.__version__

'2.15.0'

---

Tensors from other Python objects such as lists, NumPy arrays, and pandas DataFrames using tf.convert_to_tensor()

---

In [53]:
T = tf.convert_to_tensor(np.array([1, 2, 3, 4]), dtype = tf.float64)
print(T)

Tensor("Const:0", shape=(4,), dtype=float64)


---

Variables (https://www.tensorflow.org/guide/variable)

---

---

Automatic differentiation using TF (https://www.tensorflow.org/guide/autodiff)

Example: calculate the sensitivity of $L(w) = 4w+w^3$ w.r.t. the input $w$ at $w=1.$

Sensitivity $\nabla_wL = 4+3w^2,$ which at $w=1$ is equal to $4+3\times1^2=7.$

---