In [10]:
import numpy as np
import random

In [2]:
def sigmoid(x):
  return 1 / (1 + np.exp(-x))

#given that x is activated
def sigmoid_prime(x):
  return x * (1 - x)

In [3]:
def xor_net(inputs: list[2], weights: list[list[3]]):
  net = {}

  # calculate activations
  hidden_node1 = sigmoid(weights[0][0] * inputs[0] + weights[0][1] * inputs[1] + weights[0][2])
  hidden_node2 = sigmoid(weights[1][0] * inputs[0] + weights[1][1] * inputs[1] + weights[1][2])

  output_node = sigmoid(weights[2][0] * hidden_node1 + weights[2][1] * hidden_node2 + weights[2][2])

  # save activated values
  net['h1'] = hidden_node1
  net['h2'] = hidden_node2
  net['output'] = output_node

  return net

In [4]:
def mse(weights):
  inputs = [[0, 0], [0, 1], [1, 0], [1, 1]]
  targets = [0, 1, 1, 0]
  total_error = 0

  for i in range(len(inputs)):
    predicted = xor_net(inputs[i], weights)["output"]
    total_error += (targets[i] - predicted) ** 2

  return total_error / len(inputs)

In [5]:
def grdmse(weights):
    inputs = [[0, 0], [0, 1], [1, 0], [1, 1]]
    targets = [0, 1, 1, 0]

    gradient = [np.zeros_like(w) for w in weights]  # initialize gradient with zeros

    for i in range(len(inputs)):
        net_info = xor_net(inputs[i], weights)
        output_node = net_info['output']
        hidden_node1 = net_info['h1']
        hidden_node2 = net_info['h2']

        # partial derivatives of MSE with respect to each weight (negative)
        d_error_d_output = 2 * (targets[i] - output_node)
        d_output_d_net_output = sigmoid_prime(output_node)

        # output layer weights
        gradient[2][0] += d_error_d_output * d_output_d_net_output * hidden_node1
        gradient[2][1] += d_error_d_output * d_output_d_net_output * hidden_node2
        gradient[2][2] += d_error_d_output * d_output_d_net_output

        # hidden layer weights
        d_net_output_d_hidden1 = weights[2][0]
        d_net_output_d_hidden2 = weights[2][1]

        gradient[0][0] += (
            d_error_d_output
            * d_output_d_net_output
            * d_net_output_d_hidden1
            * sigmoid_prime(hidden_node1)
            * inputs[i][0]
        )
        gradient[0][1] += (
            d_error_d_output
            * d_output_d_net_output
            * d_net_output_d_hidden1
            * sigmoid_prime(hidden_node1)
            * inputs[i][1]
        )
        gradient[0][2] += (
            d_error_d_output
            * d_output_d_net_output
            * d_net_output_d_hidden1
            * sigmoid_prime(hidden_node1)
        )

        gradient[1][0] += (
            d_error_d_output
            * d_output_d_net_output
            * d_net_output_d_hidden2
            * sigmoid_prime(hidden_node2)
            * inputs[i][0]
        )
        gradient[1][1] += (
            d_error_d_output
            * d_output_d_net_output
            * d_net_output_d_hidden2
            * sigmoid_prime(hidden_node2)
            * inputs[i][1]
        )
        gradient[1][2] += (
            d_error_d_output
            * d_output_d_net_output
            * d_net_output_d_hidden2
            * sigmoid_prime(hidden_node2)
        )

    # the average gradient over all training samples
    for i in range(len(gradient)):
        gradient[i] /= len(inputs)

    return gradient


In [6]:
def train_neural_network(weights, inputs, targets, learning_rate, epochs):
  for epoch in range(epochs):
    misclassified_count = 0

    for i in range(len(inputs)):
      net_info = xor_net(inputs[i], weights)
      output_node = net_info['output']
      hidden_node1 = net_info['h1']
      hidden_node2 = net_info['h2']

      gradient = grdmse(weights)

      for j in range(len(weights)):
        weights[j] += learning_rate * gradient[j]

      predicted = 1 if output_node > 0.5 else 0
      if predicted != targets[i]:
          misclassified_count += 1

    if misclassified_count == 0:
      break
  print(f'Trained for {epoch} epochs, final mse {mse(weights)}')


In [7]:
def test_neural_network(weights, test_inputs, test_targets):
    misclassified = 0
    for i in range(len(test_inputs)):
        net_info = xor_net(test_inputs[i], weights)
        output_node = net_info['output']

        #interpret output
        predicted = 1 if output_node > 0.5 else 0

        if predicted != test_targets[i]:
            misclassified += 1

    accuracy = 1 - (misclassified / len(test_inputs))
    return accuracy

In [17]:
inputs = [[0, 0], [0, 1], [1, 0], [1, 1]]
targets = [0, 1, 1, 0]

random.seed(43)
weights = [[random.random() for _ in range(3)] for _ in range(3)]

print(f"The initial weights are: {weights}")

learning_rate = 0.5
print(f"The learning rate is: {learning_rate}")


epochs = 10000 #maximum number of iterations

train_neural_network(weights, inputs, targets, learning_rate, epochs)
print(f"The final weights are: {weights}")

accuracy = test_neural_network(weights, inputs, targets)
print("Accuracy:", accuracy)


The initial weights are: [[0.038551839337380045, 0.6962243226370528, 0.14393322139536102], [0.46253225482908755, 0.671646764117767, 0.7929512716552943], [0.45318922846621235, 0.4982722297980512, 0.01915710802434778]]
The learning rate is: 0.5
Trained for 400 epochs, final mse 0.12989346643136518
The final weights are: [array([ 1.07013135,  1.41475086, -1.33359191]), array([ 4.13721731,  4.35236667, -0.90811219]), array([-2.90725016,  4.16408344, -1.96128866])]
Accuracy: 1.0


3075 iterations.
LR = 0.1. Seed 43

160 iterations.
LR = 1. Seed 0

40 iterations.
LR = 10. Seed 0

400 iterations.
LR = 0.5. Seed 0