In [1]:
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score
from sklearn.model_selection import train_test_split
np.random.seed(3)

# Activation Function

In [2]:
def sigmoid(Z):
    return 1. / (1. + np.exp(-Z))

def tanh(Z):
    return np.tanh(Z)

def relu(Z):
    return np.maximum(0,Z)

def tanh_derivative(Z):
    a = tanh(Z)
    return 1 - np.power(a, 2)

def sigmoid_derivative(Z):
    a = sigmoid(Z)
    return a * (1 - a)

def relu_derivative(Z):
    return Z > 0 * 1

# Init Parameters

In [3]:
def init_random_weights(m, n):
    return np.random.randn(m, n) / np.sqrt(n) #* .01

def init_zeros(m):
    return np.zeros(shape=(m, 1))

def init_params(layer_units):
    weights = []
    biases = []

    for i, l in enumerate(range(1, len(layer_units))):
        m = layer_units[l]
        n = layer_units[l-1]

        weights.append(init_random_weights(m, n))
        biases.append(init_zeros(m))
        
        assert(weights[i].shape == (m, n))
        assert(biases[i].shape == (m, 1))
    
    return weights, biases

# Forward Propagation

In [4]:
def linear_forward(A, W, b):
    Z = np.dot(W, A) + b
    assert(Z.shape == (W.shape[0], A.shape[1]))
    
    return Z

def activation_function(Z, activation):
    if activation == "sigmoid": return sigmoid(Z)
    elif activation == "relu": return relu(Z)
    elif activation == "tanh": return tanh(Z)
    else: return Z # Default to identity activation
    
def forward_propagation(A_prev, W, b, activation):
    Z = linear_forward(A=A_prev, W=W, b=b)
    A = activation_function(Z=Z, activation=activation)
    return A, Z

def model_forward_propagation(X, weights, biases, activation_functions, verbose = False):
    A = X              # X.T becomes A[0]
    L = len(weights)
    caches = {'A_prev': []
        , 'W': []
        , 'b': []
        , 'Z': []
    }
    print('| Forward Propagation |') if verbose else None
    for i in range(L):
        
        A_prev = A.copy()
        W = weights[i]
        b = biases[i]
        activation = activation_functions['hidden'] if i != L-1 else activation_functions['output']
        print(f'Iteration-{i} Layer-{i+1}',  activation) if verbose else None
        
        A, Z = forward_propagation(A_prev, W, b, activation)
        
        caches['A_prev'].append(A_prev)
        caches['W'].append(W)
        caches['b'].append(b)
        caches['Z'].append(Z)

    return A, caches

# Cost Function

In [5]:
# Cost Function

def compute_cost(y, y_pred):
    m = len(y)
    y_pred = y_pred[0]
    epsilon = 1e-15
    y_pred = np.clip(y_pred, epsilon, 1 - epsilon)
    
    cost = (-1./m) * np.sum(np.multiply(y, np.log(y_pred)) + np.multiply((1 - y), np.log(1 - y_pred)))
    #cost = np.squeeze(cost)
    return cost

def compute_cost_regularization(y, y_pred, weights, lambda_reg):
    m = len(y)
    epsilon = 1e-15
    y_pred = np.clip(y_pred, epsilon, 1 - epsilon)
    cost = -np.mean(y * np.log(y_pred) + (1 - y) * np.log(1 - y_pred))
    regularization_term = 0
    for W in weights:
        regularization_term += np.sum(np.square(W))

    # Add regularization term to the cost
    cost = cost + (lambda_reg / (2 * m)) * regularization_term
    cost = np.squeeze(cost)
    
    return cost

# Backward Propagation

In [6]:
def linear_backward(dA, m, g_derivative, A_prev, W, b, lambda_reg=None):
    dZ = dA * g_derivative
    if lambda_reg is None:
        dW = np.dot(dZ, A_prev.T) * 1./m
    else:
        dW = (np.dot(dZ, A_prev.T) + lambda_reg * W) * 1./m
    db = np.sum(dZ, axis = 1, keepdims = True) * 1./m
    dA_prev = np.dot(W.T,dZ)

    assert (dA_prev.shape == A_prev.shape)
    assert (dW.shape == W.shape)
    assert (db.shape == b.shape)

    return dA_prev, dW, db

def derivative_activation(Z, activation):
    if activation == "sigmoid": return sigmoid_derivative(Z)
    elif activation == "relu": return relu_derivative(Z)
    elif activation == "tanh": return tanh_derivative(Z)
    else: return 1 # Default to identity activation 

def model_backward_propagation(A, y, weights, biases, caches, activation_functions, verbose=False, lambda_reg=None):
    gradients_A_prev = []
    gradients_W = []
    gradients_b = []

    L = len(weights)
    y = y.reshape(A.shape)
    m = len(y)
    epsilon = 1e-15
    A = np.clip(A, epsilon, 1 - epsilon)

    dA = - (np.divide(y, A) - np.divide((1 - y), (1 - A)))
    
    print('| Backward Propagation |') if verbose else None
    for i, j in enumerate(reversed(range(L))):
        l = j + 1
        activation = activation_functions['output'] if i == 0 else activation_functions['hidden']
        Z = caches['Z'][j]
        A_prev = caches['A_prev'][j]
        W = caches['W'][j]
        b = caches['b'][j]
        dA_prev = dA
        
        g_derivative =  derivative_activation(Z, activation)
        print(f'Iteration-{i} Index-{j} Layer-{l}', activation, Z.shape, g_derivative.shape) if verbose else None
        
        dA_prev, dW, db = linear_backward(dA_prev, m, g_derivative, A_prev, W, b, lambda_reg=lambda_reg)
        gradients_A_prev.insert(0, dA_prev)
        gradients_W.insert(0, dW)
        gradients_b.insert(0, db)

    return gradients_A_prev, gradients_W, gradients_b

# Update Parameters

In [7]:
def update_parameters(weights, biases, gradients_W, gradients_b, learning_rate):
    L = len(weights)
    for i in range(L):
        weights[i] -=  (learning_rate * gradients_W[i])
        biases[i] -= (learning_rate * gradients_b[i])

    return weights, biases

In [8]:
def epoch_train(X, y, activation_functions, learning_rate, weights, biases, verbose=True, lambda_reg=None):
    A, caches = model_forward_propagation(X=X, 
                                        weights=weights, 
                                        biases=biases, 
                                        activation_functions=activation_functions,
                                        verbose=verbose)
    # Computing Cost
    if lambda_reg is None:
        cost = compute_cost(y=y, y_pred=A)
    else:
        cost = compute_cost_regularization(y, A, weights, lambda_reg = 0.01)

    gradients_A_prev, gradients_W, gradients_b = model_backward_propagation(A, 
                                                                            y, 
                                                                            weights, 
                                                                            biases, 
                                                                            caches, 
                                                                            activation_functions, 
                                                                            verbose=verbose,
                                                                            lambda_reg=lambda_reg)
    weights, biases = update_parameters(weights, 
                                        biases, 
                                        gradients_W, 
                                        gradients_b, 
                                        learning_rate)
    
    return weights, biases, cost, caches

# Train the model

In [9]:
def train_nn(X, y, layer_units, activation_functions, learning_rate, num_iteration, verbose=False, lambda_reg=None):
    print('='*75)
    print(f"Train Neural Network Model")
    print('='*75)
    print(f"\tlayer_units: {layer_units} ")
    print(f"\tactivation_functions: {activation_functions}")
    print(f"\tlearning_rate: {learning_rate}")
    print(f"\tnum_iteration: {num_iteration}")
    print('='*75)

    weights, biases = init_params(layer_units=layer_units)
    costs = []
    best_cost = 1.
    best_weights = None
    best_biases = None
    best_iteration = 0

    for i in range(num_iteration):
        weights, biases, cost, caches = epoch_train(X, y, activation_functions, learning_rate, weights, biases, verbose=verbose, lambda_reg=lambda_reg)
        if best_cost > cost:
            best_cost = cost
            best_iteration, best_weights, best_biases = i+1, weights, biases
        if (i+1) % 100 == 0 or i == 0:
            print(f'Iteration {i+1}\tCost: {cost}')
        costs.append(cost)
    
    best_parameters = {
        'weights': best_weights,
        'biases': best_biases
    }
    
    print('='*75)
    print('\tBest Iteration:', best_iteration, 'Cost Function:', best_cost)
    print('='*75)

    return best_parameters, costs

In [10]:
def predict(X, y, parameters, activation_functions):
    m = X.shape[1]
    p = np.zeros((1,m))
    best_weights = parameters['weights']
    best_biases = parameters['biases']
    
    probs, caches = model_forward_propagation(X=X, 
                                              weights=best_weights, 
                                              biases=best_biases, 
                                              activation_functions=activation_functions,
                                              verbose=False)
    for i in range(0, probs.shape[1]):
        if probs[0,i] > 0.5:
            p[0,i] = 1
        else:
            p[0,i] = 0

    return p

# Execute Model Training

In [22]:
## Prepare the Dataset
X, y = load_breast_cancer(return_X_y=True)
scaler = StandardScaler()
X = scaler.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, shuffle=True, stratify=y)
X_train, X_test = X_train.T, X_test.T

# Set Hyperparameters
layer_units = [X_train.shape[0], 16, 64, 4, 2, 1] # Number of neuron units for each layers 
activation_functions = {'hidden': 'tanh', 'output': 'sigmoid'}
learning_rate = .0001
num_iteration = 1000
lambda_reg = 0.01

# Run Training
best_parameters, costs = train_nn(X_train, y_train, 
                                  layer_units, 
                                  activation_functions, 
                                  learning_rate, 
                                  num_iteration=num_iteration, 
                                  lambda_reg=lambda_reg
                             )

Train Neural Network Model
	layer_units: [30, 16, 64, 4, 2, 1] 
	activation_functions: {'hidden': 'tanh', 'output': 'sigmoid'}
	learning_rate: 0.0001
	num_iteration: 1000
Iteration 1	Cost: 0.7385391896445938
Iteration 100	Cost: 0.16149060034620003
Iteration 200	Cost: 0.10075712696483807
Iteration 300	Cost: 0.08852392005030764
Iteration 400	Cost: 0.06904333713378183
Iteration 500	Cost: 0.05999052547649123
Iteration 600	Cost: 0.05401788472122086
Iteration 700	Cost: 0.049707189571740654
Iteration 800	Cost: 0.04644802932610347
Iteration 900	Cost: 0.04393146284303655
Iteration 1000	Cost: 0.0419626281235537
	Best Iteration: 1000 Cost Function: 0.0419626281235537


In [25]:
y_pred = predict(X_test, y_test, parameters=best_parameters, activation_functions=activation_functions)[0]
print('confusion_matrix:\n', confusion_matrix(y_test, y_pred))
print('accuracy_score', accuracy_score(y_test, y_pred))
print('f1_score', f1_score(y_test, y_pred))

confusion_matrix:
 [[ 62   2]
 [  5 102]]
accuracy_score 0.9590643274853801
f1_score 0.966824644549763


# Done