# Xor

Now let's try a slightly harder example - the XOR function. Unlike AND or OR, it cannot be solved via a single perceptron, because it is not "linearly separable".

In [1]:
import numpy as np
from typing import List

XOR is true (1 in this case) only when exactly one of the inputs is true. If both inputs are true, or neither is, the result is false (0).

In [2]:
# Every possible pair of binary inputs, one array per training example.
training_inputs = [
    np.array(pair)
    for pair in ((1, 1), (1, 0), (0, 1), (0, 0))
]

# Expected XOR output for each example, in the same order as `training_inputs`.
training_labels = [0, 1, 1, 0]

In [3]:
def initialize_params(layer_dimensions: List[int]):
    """
    Create the initial weights and biases for every layer of the network.

    Arguments:
    layer_dimensions -- Number of units in each layer, input layer first.

    Returns:
    A dict with, for each layer l >= 1, a 'W<l>' matrix of shape
    (layer_dimensions[l], layer_dimensions[l-1]) and a 'b<l>' column
    vector of shape (layer_dimensions[l], 1).
    """
    parameters = {}
    # Pair each layer's size with the size of the layer that follows it.
    adjacent_sizes = zip(layer_dimensions, layer_dimensions[1:])

    for layer, (units_in, units_out) in enumerate(adjacent_sizes, start=1):
        suffix = str(layer)
        # Weights: standard-normal samples scaled by 0.01 to keep them small.
        parameters['W' + suffix] = np.random.randn(units_out, units_in) * 0.01
        # Biases all start at zero.
        parameters['b' + suffix] = np.zeros((units_out, 1))

    return parameters

In [4]:
# Example network: 2 inputs, one hidden layer of 3 units, 1 output unit.
parameters = initialize_params([2, 3, 1])

# Show the first layer's freshly initialized weights and biases.
print("W1 = ", parameters["W1"], sep="")
print("b1 = ", parameters["b1"], sep="")

W1 = [[-0.0237695  -0.01379197]
 [-0.00611328 -0.01696749]
 [-0.00635592  0.00909459]]
b1 = [[0.]
 [0.]
 [0.]]


In [5]:
def linear_forward(A, W, b):
    """
    Compute the linear (pre-activation) part of one layer's forward pass.

    Arguments:
    A -- Activations from the previous layer.
    W -- Weights matrix.
    b -- Bias vector.

    Returns:
    Z -- Result of dot(W, A) + b, to be fed to the activation function.
    cache -- The tuple (A, W, b), saved for backward propagation.
    """
    weighted_inputs = np.dot(W, A)
    Z = weighted_inputs + b

    # Hand back the exact inputs alongside Z; backward propagation needs them.
    return Z, (A, W, b)

In [6]:
def sigmoid(Z):
    """
    Apply the logistic sigmoid, 1 / (1 + e^-Z), element-wise.

    Returns the activations together with Z itself, which serves as the
    activation cache.
    """
    denominator = 1 + np.exp(-Z)
    return 1 / denominator, Z

def relu(Z):
    """
    Apply the rectified linear unit, max(0, Z), element-wise.

    Returns the activations together with Z itself, which serves as the
    activation cache.
    """
    activations = np.maximum(0, Z)
    activation_cache = Z
    return activations, activation_cache

In [7]:
def linear_activation_forward(A_prev, W, b, activation):
    """
    Forward propagation for a single layer: the linear step, then the activation.

    Arguments:
    A_prev -- Activations from the previous layer (the inputs, for the first layer).
    W -- Weights matrix for this layer.
    b -- Bias vector for this layer.
    activation -- Callable that takes Z and returns (A, activation_cache),
                  such as `relu` or `sigmoid`.

    Returns:
    A -- This layer's activations.
    cache -- The tuple (linear_cache, activation_cache) for this layer.
    """
    Z, linear_cache = linear_forward(A_prev, W, b)
    A, activation_cache = activation(Z)

    # Callers unpack two values: the activations and the combined caches
    # that backward propagation will need for this layer.
    cache = (linear_cache, activation_cache)
    return A, cache

In [8]:
def model_forward(X, parameters):
    """
    Run forward propagation through every layer of the network.

    Each layer is Linear->Relu, except for the output layer, which is
    Linear->Sigmoid.

    Arguments:
    X -- Network inputs, used as the activations feeding the first layer.
    parameters -- Dict holding a 'W<l>' and a 'b<l>' entry for each layer l.

    Returns:
    Yhat -- Output of the final (sigmoid) layer.
    caches -- List with one cache per layer, in layer order.
    """
    # `parameters` holds a weights entry AND a biases entry for each layer,
    # so divide its size by 2 to get the number of layers.
    layer_count = len(parameters) // 2
    output_key = str(layer_count)

    caches = []
    # The initial "previous" activations are the inputs themselves.
    activations = X

    # Relu layers: every layer except the last.
    for index in range(1, layer_count):
        key = str(index)
        activations, layer_cache = linear_activation_forward(
            activations, parameters['W' + key], parameters['b' + key], relu
        )
        caches.append(layer_cache)

    # Sigmoid output layer.
    Yhat, layer_cache = linear_activation_forward(
        activations, parameters['W' + output_key], parameters['b' + output_key], sigmoid
    )
    caches.append(layer_cache)

    return Yhat, caches

In [9]:
def compute_cost(Yhat, Y):
    """
    Compute the cross-entropy cost, averaged over the examples.

    Arguments:
    Yhat -- Probability vector corresponding to label predictions.
    Y -- Actual label vector; its second dimension is the number of examples.

    Returns:
    The cost with all length-one dimensions squeezed away.
    """
    example_count = Y.shape[1]

    # Per-example log-likelihood terms for the positive and negative class.
    positive_term = np.multiply(Y, np.log(Yhat))
    negative_term = np.multiply(1 - Y, np.log(1 - Yhat))

    scale = -(1 / example_count)
    cost = scale * np.sum(positive_term + negative_term)
    return np.squeeze(cost)