## Choosing activation functions for multilayer networks

### Logistic function recap

In [1]:
import numpy as np
import tensorflow as tf

# Input vector: element 0 is the constant 1 that pairs with the bias weight w[0].
X = np.array([1.0, 1.4, 2.5])
# Weight vector, aligned element-by-element with X.
w = np.array([0.4, 0.3, 0.5])

def net_input(X, w):
    """Return the net input: the dot product of the inputs ``X`` and the weights ``w``."""
    weighted_sum = np.dot(X, w)
    return weighted_sum

def logistic(z):
    """Logistic sigmoid ``1 / (1 + exp(-z))``, evaluated without overflow.

    The naive formula computes ``exp(-z)``, which overflows for large
    negative ``z``. Here only ``exp(-|z|)`` is evaluated and the algebraically
    equivalent form ``exp(z) / (1 + exp(z))`` is used for negative inputs.

    Parameters
    ----------
    z : scalar or array-like
        Net input(s).

    Returns
    -------
    NumPy scalar for scalar input, otherwise an ndarray with the shape of
    ``z``; every value lies in [0, 1].
    """
    z = np.asarray(z)
    if not np.issubdtype(z.dtype, np.floating):
        # Integer/bool input: promote so the arithmetic below is floating point.
        z = z.astype(np.float64)
    decay = np.exp(-np.abs(z))  # argument is <= 0, so exp() cannot overflow
    out = np.where(z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    # Indexing with () turns a 0-d result back into a scalar and leaves
    # n-d arrays untouched, so scalar callers keep getting a scalar.
    return out[()]

def logistic_activation(X, w):
    """Apply the logistic function to the net input of inputs ``X`` and weights ``w``."""
    return logistic(net_input(X, w))

# Report the logistic activation as the estimated probability of class 1.
print('P(y=1|x) = %.3f' % (logistic_activation(X, w),))

P(y=1|x) = 0.888


In [3]:
#Above result shows that probability of y=1 is 88.8%.
#Normally if sigmoid probability > 0.5, we mark the final output as 1 or TRUE.

In [2]:
#The code below shows that an output layer consisting of multiple logistic activation
#units does not produce meaningful, interpretable probability values:

In [4]:
# W: output-layer weights with shape (n_output_units, n_hidden_units + 1);
#    one row per output unit, and column 0 holds the bias weights.
W = np.array([
    [1.1, 1.2, 0.8, 0.4],
    [0.2, 0.4, 1.0, 0.2],
    [0.6, 1.5, 1.2, 0.7],
])

# A: hidden-layer activations with shape (n_samples, n_hidden_units + 1),
#    i.e. (1, 4) here; the first entry of each row must be 1 (bias unit).
A = np.array([[1, 0.1, 0.4, 0.6]])

# Net input and logistic activation of every output unit for the single sample A[0].
Z = W.dot(A[0])
y_probas = logistic(Z)

print('Net Input: \n', Z)
print('Output Units:\n', y_probas)

Net Input: 
 [1.78 0.76 1.65]
Output Units:
 [0.85569687 0.68135373 0.83889105]


In [5]:
#As we can see in the output, the resulting values cannot be interpreted as
#probabilities for a three-class problem. The reason for this is that they do not sum up
#to 1. However, this is in fact not a big concern if we only use our model to predict
#the class labels, not the class membership probabilities. One way to predict the class
#label from the output units obtained earlier is to use the maximum value:

In [6]:
# The logistic function is monotonic, so the unit with the largest net input
# is also the unit with the largest activation.
y_class = Z.argmax(axis=0)
print('Predicted class label: %d' % y_class)

Predicted class label: 0


### Estimating class probabilities in multi-class classification via the softmax function

In [7]:
#Softmax function can be used to calculate multi-class probabilities

In [8]:
def softmax(z, axis=None):
    """Softmax of ``z``: exponentiate, then normalise so the result sums to 1.

    The maximum is subtracted before exponentiating. This leaves the result
    mathematically unchanged but keeps every exponent <= 0, so large inputs
    no longer overflow to ``inf`` and produce ``nan``.

    Parameters
    ----------
    z : array-like
        Net inputs.
    axis : int or None, optional
        Axis along which to normalise. The default ``None`` normalises over
        all elements, which for a 1-D vector of class scores gives one
        probability per class.

    Returns
    -------
    ndarray with the same shape as ``z``.
    """
    z = np.asarray(z)
    if z.size == 0:
        # np.max() rejects empty input; there is nothing to normalise anyway.
        return np.exp(z)
    shifted = z - np.max(z, axis=axis, keepdims=True)
    exp_z = np.exp(shifted)  # computed once and reused for the normaliser
    return exp_z / np.sum(exp_z, axis=axis, keepdims=True)

# Convert the net inputs of the output layer into class-membership probabilities.
y_probas = softmax(z=Z)
print('Probabilities:\n', y_probas)

Probabilities:
 [0.44668973 0.16107406 0.39223621]


In [9]:
#CellStrat - the probabilities now add up to 1.
#Intuitively, it may help to think of the softmax function as a normalized output that is useful for obtaining meaningful
#class-membership predictions in multiclass settings.

In [10]:
# Sanity check: the softmax outputs should add up to 1.
y_probas.sum()

1.0

### Broadening the output spectrum by using a hyperbolic tangent

In [11]:
#CellStrat - the hyperbolic tangent (tanh) has a broader output spectrum, ranging from -1 to +1. This can improve the convergence
# during backpropagation.
#Let's plot both the sigmoid and tanh functions.

In [12]:
import matplotlib.pyplot as plt

def tanh(z):
    """Hyperbolic tangent of ``z``, element-wise; values lie in [-1, 1].

    Delegates to ``np.tanh``. The explicit formula
    ``(e^z - e^-z) / (e^z + e^-z)`` overflows in ``exp`` for large ``|z|``
    and evaluates to ``inf / inf = nan``, whereas ``np.tanh`` saturates
    cleanly at -1 and +1.

    Parameters
    ----------
    z : scalar or array-like
        Net input(s).
    """
    return np.tanh(z)

# Evaluate both activation functions on a dense grid of net inputs.
z = np.arange(-5, 5, 0.005)
log_act = logistic(z)
tanh_act = tanh(z)

fig, ax = plt.subplots()
ax.set_ylim([-1.5, 1.5])
ax.set_xlabel('net input $z$')
# Raw string: '\p' is not a valid escape sequence in a normal string literal
# and triggers a warning on recent Python versions.
ax.set_ylabel(r'activation $\phi(z)$')

# Dotted guide lines at the saturation levels and midpoints of both curves.
for level in (1, 0.5, 0, -0.5, -1):
    ax.axhline(level, color='black', linestyle=':')

ax.plot(z, tanh_act, linewidth=3, linestyle='--', label='tanh')
ax.plot(z, log_act, linewidth=3, label='logistic')
ax.legend(loc='lower right')
fig.tight_layout()
plt.show()

<Figure size 640x480 with 1 Axes>