An example code of a neural network having a single hidden layer for a binary classification using all weights as zeros!


In [0]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score

In [3]:
# Create blobs as synthetic dataset-
X, y = make_blobs(n_samples=100000, centers=2, n_features=3)

# Get shape of features (X) and label (y)-
X.shape, y.shape

((100000, 3), (100000,))

In [4]:
# Get distribution of label 'y'-
unique, counts = np.unique(y, return_counts=True)

# Create a dictionary such that-
# element: count
element_count = dict(zip(unique, counts))

element_count

{0: 50000, 1: 50000}

Since this is a synthetic dataset, the distribution of label 'y' is uniform

In [5]:
# Split features (X) and label (y) into training and
# testing sets-
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, stratify = y)

X_train.shape, y_train.shape, X_test.shape, y_test.shape

((70000, 3), (70000,), (30000, 3), (30000,))

In [0]:
def sigmoid(z):
	'''
	Sigmoid activation function
	'''
	s = 1 / (1 + np.exp(-z))
	return s

In [0]:
def relu(x):
	'''
	Function to calculate ReLU for
	given 'x'
	'''
	return np.maximum(x, 0)

In [0]:
def relu_derivative(x):
	'''
	Function to calculate derivative
	of ReLU
	'''
	return np.where(x <= 0, 0, 1)

In [0]:
def tanh_derivative(x):
	'''
	Function to calculate derivative of hyperbolic tangent
	for given parameter 'x'
	Used as activation function for hidden layer
	'''
	return (1 - np.power(np.tanh(x), 2))

In [0]:
def initialize_parameters(input_layer, hidden_layer, output_layer):
	'''
	Function to initialize parameters for a neural network with-
	'input_layer' number of neurons in input layer
	'hidden_layer' number of neurons in hidden layer
	'output_layer' neuron in output layer [output layer has one neuron]

	Initialize weights as small numbers!
	'''
	# Initialize weights for hidden layer and input layer-
	W1 = np.random.randn(hidden_layer, input_layer) * 0.01

	# Initialize bias values for hidden layer-
	# b1 = np.zeros((hidden_layer, 1))
	# OR-
	# b1 = np.random.rand(hidden_layer, 1) * 0.01
	b1 = np.random.randn(1, hidden_layer) * 0.01

	# Initialize weights for output layer and hidden layer-
	W2 = np.random.randn(output_layer, hidden_layer) * 0.01

	# Initialize bias values for output layer-
	# b2 = np.zeros((output_layer, 1))
	# OR-
	# b2 = np.random.rand(output_layer, 1) * 0.01
	b2 = np.random.randn(1, output_layer) * 0.01

	# Return all weights and biases as a dictionary object-
	parameters = {'W1': W1, 'b1': b1, 'W2': W2, 'b2': b2}

	return parameters

In [0]:
def initialize_parameters_zeros(input_layer, hidden_layer, output_layer):
	'''
	Function to initialize parameters for a neural network with-
	'input_layer' number of neurons in input layer
	'hidden_layer' number of neurons in hidden layer
	'output_layer' neuron in output layer [output layer has one neuron]

	Initialize ALL weights & biases to zero!
	'''

	# Initialize weights for hidden layer and input layer-
	# W1 = np.random.randn(hidden_layer, input_layer) * 0.01
	W1 = np.zeros(shape = (hidden_layer, input_layer))

	# Initialize bias values for hidden layer-
	# b1 = np.zeros((hidden_layer, 1))
	# OR-
	# b1 = np.random.rand(hidden_layer, 1) * 0.01
	b1 = np.random.randn(1, hidden_layer) * 0.01
	# b1 = np.zeros(shape = (1, hidden_layer))

	# Initialize weights for output layer and hidden layer-
	# W2 = np.random.randn(output_layer, hidden_layer) * 0.01
	W2 = np.zeros(shape = (output_layer, hidden_layer))

	# Initialize bias values for output layer-
	# b2 = np.zeros((output_layer, 1))
	# OR-
	# b2 = np.random.rand(output_layer, 1) * 0.01
	b2 = np.random.randn(1, output_layer) * 0.01
	# b2 = np.zeros(shape = (1, output_layer))

	# Return all weights and biases as a dictionary object-
	parameters = {'W1': W1, 'b1': b1, 'W2': W2, 'b2': b2}

	return parameters

In [0]:
def forward_and_backward_propagation(X, parameters, Y):
	'''
	Function to compute forward propagation based on 'X', 'Y'
	and 'parameters' dictionary

	Input:
	1.) 'X' a numpy array 
	2.) 'Y' a numpy array
	3.) 'parameters' a Python dictionary containing weights & biases


	Returns:
	1.) gradients for weights and biases
	2.) cost

	Z1 is network input for hidden layer
	A1 is output of hidden layer (tanh output)
	Z2 is network input for output layer
	A2 is output of output layer (sigmoid output)
	'''

	# Retreive initialized weights & biases-
	W1 = parameters['W1']
	b1 = parameters['b1']
	W2 = parameters['W2']
	b2 = parameters['b2']


	# Implement forward propagation to compute A2 probabilities-

	# Network input for hidden layer-
	Z1 = np.dot(W1, X.T) + b1.T		# (hidden_layer, m)

	# Using tanh as activation function for hidden layer-
	A1 = np.tanh(Z1)	# (output_layer, m)

	# Using ReLU as activation function for hidden layer-
	# A1 = np.maximum(Z1, 0)

	# Network input for output layer-
	Z2 = np.dot(W2, A1) + b2	# (n_o, m) OR (1, m)

	# Using sigmoid activation function for output layer
	A2 = sigmoid(Z2)	# (1, m) OR (n_o, m)

	# Number of training examples-
	m = X.shape[0]

	# Implement backward propagation for adjusting weights and biases-

	# Partial derivative of cost wrt W2-
	dJ_dW2 = (1 / m) * np.dot((A2 - Y), A1.T)

	"""
	# Sanity check-
	if dJ_dW2.shape == W2.shape:
		print("\nW2.shape equals dJ_dW2.shape\n")
	else:
		print("\nW2.shape is NOT equal to dJ_dW2.shape! Recheck!\n")
	"""

	# Partial derivative of cost wrt W1 using tanh activation function-
	# dJ_dW1 = (1 / m) * np.dot(np.multiply(np.dot(W2.T, (A2 - Y)), (1 - np.square(A1))), X)

	# Partial derivative of cost wrt W1 using ReLU activation function-
	dJ_dW1 = (1 / m) * np.dot(np.multiply(np.dot(W2.T, (A2 - Y)), relu_derivative(A1)), X)

	# Sanity check-
	# dJ_dW1.shape == W1.shape
	# True

	# Partial detivative of cost wrt hidden layer bias (b1)- using tanh activation function-
	# dJ_db1 = (1 / m) * np.multiply(np.dot((A2 - Y), (1 - np.square(A1)).T), W2)

	# Partial derivative of cost wrt hidden layer bias (b1) using ReLU activation function-
	dJ_db1 = (1 / m) * np.multiply(np.dot((A2 - Y), relu_derivative(A1).T), W2)

	# Sanity check-
	# dJ_db1.shape == b1.shape
	# True

	# Partial derivative of cost wrt output layer bias (b2)-
	dJ_db2 = (1 / m) * np.sum(A2 - Y)
	# Returns a scalar with shape ()

	cost = compute_cost(A2, Y)

	gradient = {
		'dJ_dW1': dJ_dW1, 'dJ_dW2': dJ_dW2,
		'dJ_db1': dJ_db1, 'dJ_db2': dJ_db2
	}

	return gradient, cost

In [0]:
def compute_cost(A2, Y):
	'''
	Function to compute cost using binary cross-entropy
	loss

	Arguments-
	A2- Sigmoid output of output layer; shape- (1, m)
	Y-  groud truth labels vector; shape- (1, m)
	# parameters- Python dict containing W1, b1, W2, b2
	m - number of training examples

	Return-
	Binary cross-entropy cost
	'''

	# Number of training examples-
	# m = Y.shape[1]
	m = Y.shape[0]

	# Compute binary cross-entropy cost-
	logprobs = np.multiply(np.log(A2), Y) + np.multiply(np.log(1 - A2), (1 - Y))
	
	cost = (-1 / m) * np.sum(logprobs)

	# np.multiply() multiplies arguments element-wise
	# np.log() natural logarithm, element-wise

	# makes sure cost is the dimension we expect, E.g., turns [[51]] into 51-
	cost = np.squeeze(cost)

	return cost

In [0]:
def optimization(wts_bias_parameters, X, Y, num_iterations, learning_rate, print_cost = False):
	'''
	Function to perform optimization to learn weight 'W' and bias 'b'
	by using gradient descent algorithm

	Returns the learnt 'W' and 'b' parameters AFTER training on training data
	'''

	# List variable to hold cost/loss-
	costs = []

	for i in range(num_iterations):
		# Compute gradients and cost using defined function-
		gradients, cost = forward_and_backward_propagation(X, wts_bias_parameters, Y)

		# Get partial derivates from 'gradients'
		dJ_db1 = gradients['dJ_db1']
		dJ_db2 = gradients['dJ_db2']
		dJ_dW2 = gradients['dJ_dW2']
		dJ_dW1 = gradients['dJ_dW1']

		W1 = wts_bias_parameters['W1']
		W2 = wts_bias_parameters['W2']
		b1 = wts_bias_parameters['b1']
		b2 = wts_bias_parameters['b2']

		# Update weights-
		W1 = W1 - (learning_rate * dJ_dW1)
		W2 = W2 - (learning_rate * dJ_dW2)

		# Update biases-
		b1 = b1 - (learning_rate * dJ_db1)
		b2 = b2 - (learning_rate * dJ_db2)

		# Update 'wts_bias_parameters' dict for next call-
		wts_bias_parameters = {'W1': W1, 'W2': W2, 'b1': b1, 'b2': b2}

		# Store loss/cost AFTER every 100 iterations-
		if i % 100 == 0:
			costs.append(cost)

		# Print cost AFTER every 100 iterations-
		if print_cost and i % 100 == 0:
			print("\nLoss/Cost after {0} iterations = {1:.4f}\n".format(i, cost))


	# Update computed weights and biases-
	# The actual weights and biases AFTER training is done
	updated_params = {'W1': W1, 'W2': W2, 
				'b1': b1, 'b2': b2}

	# Update partial derivatives as a dictionary
	# Partial derivatives AFTER training is done-
	updated_gradients = {'dJ_dW2': dJ_dW2, 'dJ_dW1': dJ_dW1,
			'dJ_db1': dJ_db1, 'dJ_db2': dJ_db2}


	# Return everything-
	return updated_params, updated_gradients, costs

In [0]:
def predict(wts_bias_parameters, X):
	'''
	Function to make predictions using trained weights and biases

	
	'''

	# Number of examples-
	m = X.shape[0]

	y_pred = np.zeros((1, m))	# shape- (1, m)

	W1 = wts_bias_parameters['W1']
	W2 = wts_bias_parameters['W2']
	b1 = wts_bias_parameters['b1']
	b2 = wts_bias_parameters['b2']

	# Perform forward propagation to predict for given 'X'-
	Z1 = np.dot(W1, X.T) + b1.T 	# (output_layer, m)
	A1 = np.tanh(Z1)	# (output_layer, m)
	Z2 = np.dot(W2, A1) + b2	# (n_o, m) OR (1, m)
	A2 = sigmoid(Z2)

	for i in range(A2.shape[1]):
		# Convert probabilities A2[0, i] to binary labels-
		if A2[0, i] > 0.5:
			y_pred[0, i] = 1		# 1 is for CAT
		else:
			y_pred[0, i] = 0		# 0 is for DOG

	return y_pred

In [16]:
print("\nDimensions of training set are:")
print("X_train.shape = {0} & y_train.shape = {1}\n".format(X_train.shape, y_train.shape))


Dimensions of training set are:
X_train.shape = (70000, 3) & y_train.shape = (70000,)



In [17]:
print("\nDimensions of testing set are:")
print("X_test.shape = {0} & y_test.shape = {1}\n".format(X_test.shape, y_test.shape))


Dimensions of testing set are:
X_test.shape = (30000, 3) & y_test.shape = (30000,)



In [0]:
# Initialize parameters by specifying number of neurons in each layers-
n_hidden_neurons = 10

initialized_params = initialize_parameters(X_train.shape[1], n_hidden_neurons, 1)
# 'initialized_params' is a dict

In [19]:
for layer in initialized_params.keys():
	print("{0} layer has shape = {1}".format(layer, initialized_params[layer].shape))

W1 layer has shape = (10, 3)
b1 layer has shape = (1, 10)
W2 layer has shape = (1, 10)
b2 layer has shape = (1, 1)


In [20]:
updated_params, updated_gradients, costs = optimization(
	wts_bias_parameters = initialized_params,
	X = X_train, Y = y_train, num_iterations = 1000,
	learning_rate = 0.01, print_cost = True)


Loss/Cost after 0 iterations = 0.6944


Loss/Cost after 100 iterations = 0.6013


Loss/Cost after 200 iterations = 0.3542


Loss/Cost after 300 iterations = 0.2157


Loss/Cost after 400 iterations = 0.1474


Loss/Cost after 500 iterations = 0.1105


Loss/Cost after 600 iterations = 0.0880


Loss/Cost after 700 iterations = 0.0731


Loss/Cost after 800 iterations = 0.0626


Loss/Cost after 900 iterations = 0.0548



In [0]:
# Make predictions on test set-
y_predictions = predict(updated_params, X_test)
y_predictions = np.squeeze(y_predictions)

In [22]:
accuracy = accuracy_score(y_test, y_predictions)
precision = precision_score(y_test, y_predictions)
recall = recall_score(y_test, y_predictions)

print("\nTrained Neural Network model metrics are: ")
print("accuracy = {0:.4f}, precision = {1:.4f} & recall = {2:.4f}\n".format(accuracy, precision, recall))


Trained Neural Network model metrics are: 
accuracy = 0.9882, precision = 1.0000 & recall = 0.9763



**Now, new parameters are which in which all weights & biases are zeros**

**Observation:**
If *both* weights and biases are set to zero, then no learning happens!

However, if weights are zeros, but biases are non-zeros, then learning happens, however, the Neural Network will take more epochs.

In [0]:
initialized_params_all_zeros = initialize_parameters_zeros(
	X_train.shape[1], n_hidden_neurons, 1)

In [33]:
initialized_params_all_zeros['b1']

array([[ 0.01548721,  0.00743888,  0.0115279 , -0.02497696,  0.00829825,
        -0.00037653,  0.00081903,  0.00749686,  0.00088415,  0.01137796]])

In [34]:
initialized_params_all_zeros['b2']

array([[-0.00050823]])

In [35]:
updated_params_zeros, updated_gradients_zeros, costs_zeros = optimization(
	wts_bias_parameters = initialized_params_all_zeros,
	X = X_train, Y = y_train, num_iterations = 1500,
	learning_rate = 0.01, print_cost = True)


Loss/Cost after 0 iterations = 0.6931


Loss/Cost after 100 iterations = 0.6931


Loss/Cost after 200 iterations = 0.6931


Loss/Cost after 300 iterations = 0.6921


Loss/Cost after 400 iterations = 0.5950


Loss/Cost after 500 iterations = 0.3223


Loss/Cost after 600 iterations = 0.1698


Loss/Cost after 700 iterations = 0.1027


Loss/Cost after 800 iterations = 0.0705


Loss/Cost after 900 iterations = 0.0530


Loss/Cost after 1000 iterations = 0.0425


Loss/Cost after 1100 iterations = 0.0354


Loss/Cost after 1200 iterations = 0.0303


Loss/Cost after 1300 iterations = 0.0265


Loss/Cost after 1400 iterations = 0.0236

