# 1. Training a Perceptron to perform inclusive disjunction using plain Swift

**[Christopher Boone](https://github.com/cboone)**

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](http://colab.research.google.com/github/cboone/swift-neural-intuition/blob/master/1-perceptron-inclusive-disjunction.ipynb)

_Following [Wikipedia's summary of the Perceptron learning algorithm](https://en.m.wikipedia.org/wiki/Perceptron#Learning_algorithm). Using Arrays to represent vectors and Doubles for all numbers, for simplicity._

This network takes a vector of features $\mathbf{x}_j$ as input, a vector of weights $\mathbf{w}$ (one per feature), and an activation function $\phi$, where $x_{j,i}, \hat{y}_j \in \{0, 1\}$, $w_{i} \in \mathbb{R}$, and $x_{j,0} = 1$:

$$\hat{y}_j = \phi(\mathbf{w} \cdot \mathbf{x}_j)$$

Training the Perceptron requires: 
- a learning rate $r \in [0, 1]$
- a training set of sampled data $D = \{(\mathbf{x}_1, d_1), \dots, (\mathbf{x}_m, d_m)\}$, where $\mathbf{x}_j$ is the $n$-dimensional input vector, $d_j$ is the sampled (expected) output, and $m$ is the number of samples
- an initial set of weights
- an error threshold $\gamma \in [0, 1]$

***

The dot product of two vectors is defined by $\mathbf{a} \cdot \mathbf{b} = \sum _{i=1}^{n} a_{i} b_{i}$:

In [1]:
/// Computes the dot product of two vectors represented as arrays.
///
/// Elements are paired positionally with `zip`, so if the arrays differ in
/// length the trailing elements of the longer one are ignored.
///
/// - Parameters:
///   - a: The first vector.
///   - b: The second vector.
/// - Returns: The sum of the pairwise products (`0` when no pairs exist).
func dotProduct(_ a: [Double], _ b: [Double]) -> Double {
    var total = 0.0
    for (left, right) in zip(a, b) {
        total += left * right
    }
    return total
}

The activation function is the Heaviside or unit step function, which for $x \neq 0$ can be defined by $H(x) = \frac{x + \left|x\right|}{2x}$; the implementation below takes $H(0) = 0$:

In [2]:
/// The unit step (Heaviside) activation function.
///
/// - Parameter x: The value to threshold.
/// - Returns: `1` when `x` is strictly greater than zero; otherwise `0`
///   (this includes `x == 0`).
func unitStep(_ x: Double) -> Double {
    guard x > 0 else {
        return 0
    }
    return 1
}

The predicted output of the Perceptron can be calculated by:

In [3]:
/// Computes the Perceptron's output for a single input vector.
///
/// - Parameters:
///   - inputs: The input vector for one sample.
///   - weights: The weight vector, paired positionally with `inputs`.
///   - activation: The activation function applied to the weighted sum.
/// - Returns: `activation` applied to the dot product of `weights` and `inputs`.
func predictedOutput(_ inputs: [Double], weights: [Double], activation: (Double) -> (Double)) -> Double {
    let weightedSum = dotProduct(weights, inputs)
    return activation(weightedSum)
}

Or, given an array of input values, by:

In [4]:
/// Computes the Perceptron's output for each input vector in a batch.
///
/// - Parameters:
///   - inputs: One input vector per sample.
///   - weights: The weight vector shared by every prediction.
///   - activation: The activation function applied to each weighted sum.
/// - Returns: One predicted output per input vector, in the same order.
func predictedOutputs(_ inputs: [[Double]], weights: [Double], activation: (Double) -> (Double)) -> [Double] {
    var outputs: [Double] = []
    for inputVector in inputs {
        outputs.append(predictedOutput(inputVector, weights: weights, activation: activation))
    }
    return outputs
}

The Perceptron's error function (or cost function, objective function, loss function) is defined by $E(\mathbf{x}_j) = d_j - \hat{y}_j$:

In [5]:
/// Computes the signed error of a single prediction.
///
/// - Parameters:
///   - prediction: The output predicted by the Perceptron.
///   - sample: The sampled (expected) output.
/// - Returns: `sample - prediction`; positive when the prediction is too low,
///   negative when it is too high.
func error(prediction: Double, sample: Double) -> Double {
    let difference = sample - prediction
    return difference
}

The mean error is given by $\frac{1}{m} \sum_{j=1}^{m} \lvert d_j - \hat{y}_j \rvert$, where $m$ is the number of samples:

In [6]:
/// Computes the mean absolute error of a batch of predictions.
///
/// Predictions and samples are paired positionally with `zip`; the sum of the
/// absolute errors is then divided by `samples.count`.
///
/// - Parameters:
///   - predictions: The predicted outputs.
///   - samples: The sampled (expected) outputs, in the same order.
/// - Returns: The mean of `|sample - prediction|` over the pairs. When
///   `samples` is empty the result is NaN (an infinite reciprocal times a
///   zero sum).
func meanError(predictions: [Double], samples: [Double]) -> Double {
    // Pass each value under its matching label. Feeding `zip(samples, predictions)`
    // straight into `error` would swap the two arguments, which only `abs` hides.
    let summedError = zip(predictions, samples)
        .map { prediction, sample in abs(error(prediction: prediction, sample: sample)) }
        .reduce(0, +)
    return (1 / Double(samples.count)) * summedError
}

The accuracy (fraction of predictions that are correct) can be calculated by:

In [7]:
/// Computes the fraction of predictions that exactly equal their sample.
///
/// Predictions and samples are paired positionally with `zip`; the number of
/// matching pairs is divided by `predictions.count`.
///
/// - Parameters:
///   - predictions: The predicted outputs.
///   - samples: The sampled (expected) outputs, in the same order.
/// - Returns: The number of matching pairs divided by the number of
///   predictions (NaN when `predictions` is empty).
func predictionAccuracy(predictions: [Double], samples: [Double]) -> Double {
    var correctCount = 0.0
    for (prediction, sample) in zip(predictions, samples) where prediction == sample {
        correctCount += 1
    }
    return correctCount / Double(predictions.count)
}

The number of true positive predictions can be calculated by:

In [8]:
/// Counts the true positives: pairs where both the prediction and the sample
/// are exactly `1`.
///
/// - Parameters:
///   - predictions: The predicted outputs.
///   - samples: The sampled (expected) outputs, in the same order.
/// - Returns: The number of `(1, 1)` pairs, as a `Double`.
func truePositivePredictions(predictions: [Double], samples: [Double]) -> Double {
    var count = 0.0
    for (prediction, sample) in zip(predictions, samples) where prediction == 1 && sample == 1 {
        count += 1
    }
    return count
}


The precision (proportion of positive identifications that were actually correct) can be calculated by:

In [9]:
/// Computes precision: true positives divided by all positive predictions.
///
/// The sum of `predictions` is used as the count of positive predictions
/// (assumes every prediction is `0` or `1`).
///
/// - Parameters:
///   - predictions: The predicted outputs.
///   - samples: The sampled (expected) outputs, in the same order.
/// - Returns: The precision, or `0` when the sum of predictions is not
///   greater than zero.
func predictionPrecision(predictions: [Double], samples: [Double]) -> Double {
    var predictedPositives = 0.0
    for prediction in predictions {
        predictedPositives += prediction
    }
    // No positive predictions: report 0 rather than dividing by zero.
    guard predictedPositives > 0 else {
        return 0
    }
    let correctPositives = truePositivePredictions(predictions: predictions, samples: samples)
    return correctPositives / predictedPositives
}

The recall (proportion of actual positives that were correctly identified) can be calculated by:

In [10]:
/// Computes recall: true positives divided by all actual positives.
///
/// A pair counts as an actual positive when the sample is `1` and the
/// prediction is either `1` (true positive) or `0` (false negative).
///
/// - Parameters:
///   - predictions: The predicted outputs.
///   - samples: The sampled (expected) outputs, in the same order.
/// - Returns: The recall, or `0` when there are no actual positives. This
///   mirrors how `predictionPrecision` handles a zero denominator and
///   avoids returning NaN from `0 / 0`.
func predictionRecall(predictions: [Double], samples: [Double]) -> Double {
    let truePositives = truePositivePredictions(predictions: predictions, samples: samples)

    var actualPositives = 0.0
    for outputs in zip(predictions, samples) {
        switch outputs {
        case (1, 1), (0, 1):
            actualPositives += 1
        default:
            break
        }
    }

    // Guard the division: with no actual positives the ratio is undefined.
    return (actualPositives > 0) ? (truePositives / actualPositives) : 0
}

The F₁ measure (harmonic mean of precision and recall) can be calculated by:

In [11]:
/// Computes the F₁ measure: the harmonic mean of precision and recall.
///
/// The result is `2 / (1 / recall + 1 / precision)`. When either term is `0`
/// its reciprocal is infinite, so the result is `0`.
///
/// - Parameters:
///   - predictions: The predicted outputs.
///   - samples: The sampled (expected) outputs, in the same order.
/// - Returns: The F₁ measure of the predictions.
func predictionF1(predictions: [Double], samples: [Double]) -> Double {
    let inversePrecision = 1 / predictionPrecision(predictions: predictions, samples: samples)
    let inverseRecall = 1 / predictionRecall(predictions: predictions, samples: samples)
    return 2 / (inverseRecall + inversePrecision)
}

The training data is the truth table for $A \lor B$, structured as an array of tuples $[(\mathbf{x}_j, d_j)] = [((\mathbf{x}_{j, 0}, \mathbf{x}_{j, 1}, \mathbf{x}_{j, 2}), d_j)]$ where:

- $\mathbf{x}_j$ is the input
- $\mathbf{x}_{j, 0} = 1$ (to act as a bias value, in concert with $w_0$)
- $d_j$ is the correct, sampled output value

In [12]:
// Truth table for inclusive disjunction (A ∨ B). Each entry pairs an input
// vector [bias, A, B] with the expected output; the leading 1 is the constant
// bias input.
let orTrainingSet: [([Double], Double)] = [
    ([1, 0, 0], 0),
    ([1, 0, 1], 1),
    ([1, 1, 0], 1),
    ([1, 1, 1], 1),
]

// The same data split into parallel arrays: input vectors and expected outputs.
let orTrainingSetInputs: [[Double]] = orTrainingSet.map { sample in sample.0 }
let orTrainingSetOutputs: [Double] = orTrainingSet.map { sample in sample.1 }

Given an untrained Perceptron with an initial set of weights $\mathbf{w} = (0, 0, 0)$, calculate the predicted outputs for the training inputs:

In [13]:
// Evaluate every training input with all-zero weights (an untrained Perceptron).
predictedOutputs(orTrainingSetInputs, weights: [0, 0, 0], activation: unitStep)

▿ 4 elements
  - 0 : 0.0
  - 1 : 0.0
  - 2 : 0.0
  - 3 : 0.0


Calculate the mean error for the untrained Perceptron on the training inputs:

In [14]:
// Mean error of the all-zero-weight (untrained) predictions against the
// expected outputs of the training set.
meanError(
    predictions: predictedOutputs(orTrainingSetInputs, weights: [0, 0, 0], activation: unitStep), 
    samples: orTrainingSetOutputs
)

0.75


To update the weights during Perceptron training, modify each weight $w_i$ by adding $r E(x_j) x_{j,i}$ to it, where $r \in [0, 1]$ is the learning rate:

In [15]:
/// Applies one Perceptron learning step to a weight vector.
///
/// Each weight `w[i]` becomes `w[i] + learningRate * error * inputs[i]`.
/// `inputs` is indexed by weight position, so it must have at least as many
/// elements as `oldWeights`.
///
/// - Parameters:
///   - oldWeights: The weights before the update.
///   - error: The signed error of the prediction for this sample.
///   - inputs: The input vector of the sample that produced `error`.
///   - learningRate: The factor that scales the size of the update.
/// - Returns: A new weight vector with the same length as `oldWeights`.
func updatedWeights(_ oldWeights: [Double], error: Double, inputs: [Double], learningRate: Double) -> [Double] {
    let scaledError = learningRate * error
    var adjusted: [Double] = []
    for index in oldWeights.indices {
        adjusted.append(oldWeights[index] + (scaledError * inputs[index]))
    }
    return adjusted
}

To train the Perceptron, calculate the predicted output $\hat{y}_j$ based on the current weights $\mathbf{w}$ and the activation function $\phi$, then update the weights based on the error $E(\mathbf{x}_j)$. Repeat, one sample at a time, until the mean error over the whole training set falls below $\gamma$, the error threshold:

In [26]:
/// Trains Perceptron weights on `samples` until the mean error over the whole
/// set falls below `errorThreshold`.
///
/// The samples are shuffled once, then visited cyclically. Each iteration
/// predicts one sample's output, updates the weights from that sample's
/// error, and re-evaluates the mean error across all samples with the updated
/// weights. Metrics are printed before and after training.
///
/// There is no iteration cap: if the mean error never drops below
/// `errorThreshold`, this function does not return.
///
/// - Parameters:
///   - startingWeights: The initial weight vector.
///   - samples: The training set as `(inputs, expectedOutput)` pairs.
///   - learningRate: The factor that scales each weight update.
///   - errorThreshold: Training stops once the mean error is below this value.
///   - activation: The activation function used for every prediction.
/// - Returns: The weights in effect when the mean error first fell below
///   `errorThreshold`.
func trainWeights(
    startingFrom startingWeights: [Double],
    samples: [([Double], Double)],
    learningRate: Double,
    errorThreshold: Double,
    activation: (Double) -> (Double)
) -> [Double] {
    let sampledInputs = samples.map { $0.0 }
    let sampledOutputs = samples.map { $0.1 }
    let shuffledSamples = samples.shuffled()
    let sampleCount = samples.count

    // Prints the evaluation metrics shared by the "before" and "after" reports.
    func printMetrics(for predictions: [Double]) {
        print("Predicted outputs:", predictions)
        print("Mean error:", meanError(predictions: predictions, samples: sampledOutputs))
        print("Accuracy:", predictionAccuracy(predictions: predictions, samples: sampledOutputs))
        print("Precision:", predictionPrecision(predictions: predictions, samples: sampledOutputs))
        print("Recall:", predictionRecall(predictions: predictions, samples: sampledOutputs))
        print("F1:", predictionF1(predictions: predictions, samples: sampledOutputs))
    }

    var currentWeights = startingWeights
    var predictions = predictedOutputs(sampledInputs, weights: currentWeights, activation: activation)
    var averageError = meanError(predictions: predictions, samples: sampledOutputs)
    var iterations = 0

    print("Starting weights:", startingWeights)
    printMetrics(for: predictions)
    print("")

    while averageError >= errorThreshold {
        iterations += 1

        // Learn from a single sample, cycling through the shuffled set.
        let (currentInputs, currentOutput) = shuffledSamples[iterations % sampleCount]
        let prediction = predictedOutput(currentInputs, weights: currentWeights, activation: activation)
        let currentError = error(prediction: prediction, sample: currentOutput)
        currentWeights = updatedWeights(currentWeights, error: currentError, inputs: currentInputs, learningRate: learningRate)

        // Re-evaluate AFTER the update so the loop condition always describes
        // the weights that will be returned. Measuring before the update makes
        // the loop run one extra iteration, and that last update can push the
        // weights back above the threshold.
        predictions = predictedOutputs(sampledInputs, weights: currentWeights, activation: activation)
        averageError = meanError(predictions: predictions, samples: sampledOutputs)
    }

    print("Epochs:", iterations / sampleCount)
    print("Iterations:", iterations)
    print("Final weights:", currentWeights)
    printMetrics(for: predictions)

    return currentWeights
}

Train the Perceptron to perform inclusive disjunction:

In [29]:
// Train from all-zero weights on the OR truth table. With four samples and a
// step activation, a mean error below 0.25 means every sample is classified
// correctly.
trainWeights(
    startingFrom: [0, 0, 0], 
    samples: orTrainingSet, 
    learningRate: 0.1, 
    errorThreshold: 0.25, 
    activation: unitStep
)

Starting weights: [0.0, 0.0, 0.0]
Predicted outputs: [0.0, 0.0, 0.0, 0.0]
Mean error: 0.75
Accuracy: 0.25
Precision: 0.0
Recall: 0.0
F1: 0.0

Epochs: 1
Iterations: 7
Final weights: [0.0, 0.1, 0.1]
Predicted outputs: [0.0, 1.0, 1.0, 1.0]
Mean error: 0.0
Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1: 1.0


▿ 3 elements
  - 0 : 0.0
  - 1 : 0.1
  - 2 : 0.1
