In [88]:
import numpy as np
import time

In [89]:
class Layer:
    def __init__(self, input_size, output_size, activation='relu'):
        self.input_size = input_size
        self.output_size = output_size
        self.activation = activation

        self.weights = np.random.randn(output_size, input_size) * np.sqrt(2. / input_size)  # He initialization
        self.biases = np.zeros((output_size, 1))

        self.activation_function = self.get_activation_function(activation)

    def get_activation_function(self, activation):
        if activation == 'relu':
            return lambda x: np.maximum(0, x)
        elif activation == 'sigmoid':
            return lambda x: 1 / (1 + np.exp(-x))
        elif activation == 'tanh':
            return lambda x: np.tanh(x)
        else:
            return lambda x: x

    def forward(self, input_data):
        self.input_data = input_data
        self.z = np.dot(self.weights, input_data) + self.biases
        self.a = self.activation_function(self.z)
        return self.a

In [90]:
class NeuralNetwork:
    def __init__(self, input_size):
        self.layers = []
        self.input_size = input_size

    def add_layer(self, layer):
        self.layers.append(layer)

    def forward(self, X):
        input_data = X
        for layer in self.layers:
            input_data = layer.forward(input_data)
        return input_data

In [91]:
nn = NeuralNetwork(input_size=2)
nn.add_layer(Layer(input_size=2, output_size=10, activation='relu'))  # First hidden layer
nn.add_layer(Layer(input_size=10, output_size=10, activation='relu'))  # Second hidden layer
nn.add_layer(Layer(input_size=10, output_size=10, activation='relu'))  # Third hidden
nn.add_layer(Layer(input_size=10, output_size=1, activation='linear'))  # Output layer

x = np.random.rand()
y = np.random.rand()

input_vector = np.array([x, y]).reshape(2, 1)
output_vector = nn.forward(input_vector)
print(output_vector)

[[-0.29576971]]


In [92]:
# reverse mode AD

class ReverseLayer:
    def __init__(self, input_size, output_size, activation='relu'):
        self.input_size = input_size
        self.output_size = output_size
        self.activation = activation

        # Initialize weights and biases
        self.weights = np.random.uniform(-0.1, 0.1, (output_size, input_size))
        self.biases = np.zeros((output_size, 1))

        # Store activation function
        self.activation_function = self.get_activation_function(activation)
        self.activation_derivative = self.get_activation_derivative(activation)

    def get_activation_function(self, activation):
        if activation == 'relu':
            return lambda x: np.maximum(0, x)
        elif activation == 'sigmoid':
            return lambda x: 1 / (1 + np.exp(-x))
        elif activation == 'tanh':
            return lambda x: np.tanh(x)
        else:
            return lambda x: x  # Linear activation

    def get_activation_derivative(self, activation):
        if activation == 'relu':
            return lambda x: (x > 0).astype(float)
        elif activation == 'sigmoid':
            return lambda x: self.activation_function(x) * (1 - self.activation_function(x))
        elif activation == 'tanh':
            return lambda x: 1 - np.tanh(x)**2
        else:
            return lambda x: np.ones_like(x)  # Derivative for linear is 1

    def forward(self, input_data):
        self.input_data = input_data
        self.z = np.dot(self.weights, input_data) + self.biases
        self.a = self.activation_function(self.z)
        return self.a

    def backward(self, dA):
        m = self.input_data.shape[1]  # Batch size
        self.dZ = dA * self.activation_derivative(self.z)  # Elementwise multiplication of derivative
        self.dW = np.dot(self.dZ, self.input_data.T) / m
        self.db = np.sum(self.dZ, axis=1, keepdims=True) / m
        dA_prev = np.dot(self.weights.T, self.dZ)

        return dA_prev

class ReverseNeuralNetwork:
    def __init__(self):
        self.layers = []

    def add_layer(self, layer):
        self.layers.append(layer)

    def forward(self, X):
        input_data = X
        for layer in self.layers:
            input_data = layer.forward(input_data)
        return input_data

    def compute_gradients(self, Y, learning_rate):
        m = Y.shape[1]
        dA = self.layers[-1].a - Y  # Loss derivative with respect to output
        for layer in reversed(self.layers):
            dA = layer.backward(dA)

In [93]:
nn = ReverseNeuralNetwork()

layer1 = ReverseLayer(input_size=2, output_size=10, activation='relu')
layer2 = ReverseLayer(input_size=10, output_size=10, activation='relu')
layer3 = ReverseLayer(input_size=10, output_size=10, activation='relu')
output_layer = ReverseLayer(input_size=10, output_size=1, activation='linear')

nn.add_layer(layer1)  # First hidden layer
nn.add_layer(layer2)  # Second hidden layer
nn.add_layer(layer3)  # Third hidden layer
nn.add_layer(output_layer)  # Output layer

x = np.random.rand()
y = np.random.rand()
f_xy = np.sin(2 * np.pi * x * y) + 2 * x * y**2

input_vector = np.array([x, y]).reshape(2, 1)
true_output = np.array([[f_xy]])
output_vector = nn.forward(input_vector)
nn.compute_gradients(true_output, learning_rate=0.01)

print(layer1.dW.shape, layer1.db.shape)
print(layer2.dW.shape, layer2.db.shape)
print(layer3.dW.shape, layer3.db.shape)
print(output_layer.dW.shape, output_layer.db.shape)

(10, 2) (10, 1)
(10, 10) (10, 1)
(10, 10) (10, 1)
(1, 10) (1, 1)


In [94]:
# average time to compute gradients
start_time = time.time()
for i in range(1000):

  nn = ReverseNeuralNetwork()

  layer1 = ReverseLayer(input_size=2, output_size=10, activation='relu')
  layer2 = ReverseLayer(input_size=10, output_size=10, activation='relu')
  layer3 = ReverseLayer(input_size=10, output_size=10, activation='relu')
  output_layer = ReverseLayer(input_size=10, output_size=1, activation='linear')

  nn.add_layer(layer1)  # First hidden layer
  nn.add_layer(layer2)  # Second hidden layer
  nn.add_layer(layer3)  # Third hidden layer
  nn.add_layer(output_layer)  # Output layer

  x = np.random.rand()
  y = np.random.rand()

  input_vector = np.array([x, y]).reshape(2, 1)
  output_vector = nn.forward(input_vector)
  nn.compute_gradients(true_output, learning_rate=0.01)

end_time = time.time()
print("Average Time:" + str((end_time - start_time) / 1000))

Average Time:0.00014938497543334962


In [95]:
# forward mode ad

class ForwardLayer:
    def __init__(self, input_size, output_size, activation='relu'):
        self.input_size = input_size
        self.output_size = output_size
        self.activation = activation

        self.weights = np.random.uniform(-0.1, 0.1, (output_size, input_size))
        self.biases = np.zeros((output_size, 1))

        self.n_params = output_size * input_size + output_size

        self.activation_function = self.get_activation_function(activation)
        self.activation_derivative = self.get_activation_derivative(activation)

    def get_activation_function(self, activation):
        if activation == 'relu':
            return lambda x: np.maximum(0, x)
        elif activation == 'sigmoid':
            return lambda x: 1 / (1 + np.exp(-x))
        elif activation == 'tanh':
            return lambda x: np.tanh(x)
        else:
            return lambda x: x

    def get_activation_derivative(self, activation):
        if activation == 'relu':
            return lambda x: (x > 0).astype(float)
        elif activation == 'sigmoid':
            return lambda x: self.activation_function(x) * (1 - self.activation_function(x))
        elif activation == 'tanh':
            return lambda x: 1 - np.tanh(x)**2
        else:
            return lambda x: np.ones_like(x)

    def forward(self, input_data, param_index=None):
        self.input_data = input_data
        batch_size = input_data.shape[1]

        # Initialize derivatives
        w_size = self.output_size * self.input_size
        if param_index is not None:
            # Create derivative matrices
            dw = np.zeros_like(self.weights)
            db = np.zeros_like(self.biases)

            if param_index < w_size:
                # This is a weight parameter
                i, j = param_index // self.input_size, param_index % self.input_size
                dw[i, j] = 1.0
            else:
                # This is a bias parameter
                bias_idx = param_index - w_size
                db[bias_idx] = 1.0

        # Forward pass
        z = np.dot(self.weights, input_data) + self.biases
        a = self.activation_function(z)

        if param_index is not None:
            # Compute derivative of z
            dz = np.dot(dw, input_data) + db
            # Compute derivative of activation
            da = self.activation_derivative(z) * dz
            return a, da
        return a, None

    def update_parameters(self, learning_rate, gradients):
        w_size = self.output_size * self.input_size

        # Reshape gradients for weights and biases
        dW = gradients[:w_size].reshape(self.output_size, self.input_size)
        db = gradients[w_size:].reshape(self.output_size, 1)

        # Update parameters
        self.weights -= learning_rate * dW
        self.biases -= learning_rate * db

class ForwardNeuralNetwork:
    def __init__(self):
        self.layers = []

    def add_layer(self, layer):
        self.layers.append(layer)

    def forward(self, X):
        input_data = X
        for layer in self.layers:
            input_data, _ = layer.forward(input_data)
        return input_data

    def get_total_params(self):
        return sum(layer.n_params for layer in self.layers)

    def compute_gradients(self, X, Y):
        n_total_params = self.get_total_params()
        gradients = []

        # For each parameter in the network
        param_count = 0
        for layer_idx, layer in enumerate(self.layers):
            for param_idx in range(layer.n_params):
                # Forward pass with derivative with respect to this parameter
                input_data = X
                param_derivative = None

                for i, current_layer in enumerate(self.layers):
                    if i == layer_idx:
                        input_data, param_derivative = current_layer.forward(input_data, param_idx)
                    else:
                        input_data, _ = current_layer.forward(input_data)

                # Compute loss derivative
                m = Y.shape[1]
                output_derivative = (input_data - Y) / m

                # Compute gradient for this parameter
                if param_derivative is not None:
                    grad = np.sum(output_derivative * param_derivative)
                    gradients.append(grad)

                param_count += 1

        return np.array(gradients)

    def get_layer_gradients(self, layer_index, gradients):

      # Find the starting index for this layer's parameters
      start_idx = 0
      for i in range(layer_index):
          start_idx += self.layers[i].n_params

      # Get the layer
      layer = self.layers[layer_index]

      # Extract weight gradients
      w_size = layer.output_size * layer.input_size
      dW = gradients[start_idx:start_idx + w_size].reshape(layer.output_size, layer.input_size)

      # Extract bias gradients
      db = gradients[start_idx + w_size:start_idx + layer.n_params].reshape(layer.output_size, 1)

      return dW, db

In [96]:
nn = ForwardNeuralNetwork()

layer1 = ForwardLayer(input_size=2, output_size=10, activation='relu')
layer2 = ForwardLayer(input_size=10, output_size=10, activation='relu')
layer3 = ForwardLayer(input_size=10, output_size=10, activation='relu')
output_layer = ForwardLayer(input_size=10, output_size=1, activation='linear')

nn.add_layer(layer1)  # First hidden layer
nn.add_layer(layer2)  # Second hidden layer
nn.add_layer(layer3)  # Third hidden layer
nn.add_layer(output_layer)  # Output layer

x = np.random.rand()
y = np.random.rand()
f_xy = np.sin(2 * np.pi * x * y) + 2 * x * y**2

input_vector = np.array([x, y]).reshape(2, 1)
true_output = np.array([[f_xy]])

output_vector = nn.forward(input_vector)
gradients = nn.compute_gradients(input_vector, true_output)

dW0, db0 = nn.get_layer_gradients(layer_index=0, gradients=gradients)
print(dW0.shape, db0.shape)
dW1, db1 = nn.get_layer_gradients(layer_index=1, gradients=gradients)
print(dW1.shape, db1.shape)
dW2, db2 = nn.get_layer_gradients(layer_index=2, gradients=gradients)
print(dW2.shape, db2.shape)
dW3, db3 = nn.get_layer_gradients(layer_index=3, gradients=gradients)
print(dW3.shape, db3.shape)

(10, 2) (10, 1)
(10, 10) (10, 1)
(10, 10) (10, 1)
(1, 10) (1, 1)


In [97]:
# average time to compute gradients
start_time = time.time()
for i in range(1000):
  nn = ForwardNeuralNetwork()

  layer1 = ForwardLayer(input_size=2, output_size=10, activation='relu')
  layer2 = ForwardLayer(input_size=10, output_size=10, activation='relu')
  layer3 = ForwardLayer(input_size=10, output_size=10, activation='relu')
  output_layer = ForwardLayer(input_size=10, output_size=1, activation='linear')

  nn.add_layer(layer1)  # First hidden layer
  nn.add_layer(layer2)  # Second hidden layer
  nn.add_layer(layer3)  # Third hidden layer
  nn.add_layer(output_layer)  # Output layer

  x = np.random.rand()
  y = np.random.rand()

  input_vector = np.array([x, y]).reshape(2, 1)
  output_vector = nn.forward(input_vector)
  gradients = nn.compute_gradients(input_vector, output_vector)
end_time = time.time()
print("Average Time:" + str((end_time - start_time) / 1000))

Average Time:0.013175784349441529


In [103]:
# reverse mode training

class Layer:
    def __init__(self, input_size, output_size, activation='relu'):
        self.input_size = input_size
        self.output_size = output_size
        self.activation = activation

        # Initialize weights and biases
        self.weights = np.random.randn(output_size, input_size) * np.sqrt(2. / input_size)  # He initialization
        self.biases = np.zeros((output_size, 1))

        # Store activation function
        self.activation_function = self.get_activation_function(activation)
        self.activation_derivative = self.get_activation_derivative(activation)

    def get_activation_function(self, activation):
        if activation == 'relu':
            return lambda x: np.maximum(0, x)
        elif activation == 'sigmoid':
            return lambda x: 1 / (1 + np.exp(-x))
        elif activation == 'tanh':
            return lambda x: np.tanh(x)
        else:
            return lambda x: x  # Linear activation

    def get_activation_derivative(self, activation):
        if activation == 'relu':
            return lambda x: (x > 0).astype(float)
        elif activation == 'sigmoid':
            return lambda x: self.activation_function(x) * (1 - self.activation_function(x))
        elif activation == 'tanh':
            return lambda x: 1 - np.tanh(x)**2
        else:
            return lambda x: np.ones_like(x)  # Derivative for linear is 1

    def forward(self, input_data):
        self.input_data = input_data
        self.z = np.dot(self.weights, input_data) + self.biases
        self.a = self.activation_function(self.z)
        return self.a

    def backward(self, dA, learning_rate):
        m = self.input_data.shape[1]  # Batch size
        dZ = dA * self.activation_derivative(self.z)  # Elementwise multiplication of derivative
        dW = np.dot(dZ, self.input_data.T) / m
        db = np.sum(dZ, axis=1, keepdims=True) / m
        dA_prev = np.dot(self.weights.T, dZ)

        # Update weights and biases using SGD
        self.weights -= learning_rate * dW
        self.biases -= learning_rate * db

        return dA_prev

class NeuralNetwork:
    def __init__(self, input_size):
        self.layers = []
        self.input_size = input_size

    def add_layer(self, layer):
        self.layers.append(layer)

    def forward(self, X):
        input_data = X
        for layer in self.layers:
            input_data = layer.forward(input_data)
        return input_data

    def backward(self, Y, learning_rate):
        m = Y.shape[1]
        dA = self.layers[-1].a - Y  # Loss derivative with respect to output
        for layer in reversed(self.layers):
            dA = layer.backward(dA, learning_rate)

    def compute_loss(self, predictions, Y):
        m = Y.shape[1]
        return np.sum((predictions - Y) ** 2) / (2 * m)

    def train(self, X, Y, epochs, learning_rate):
        for epoch in range(epochs):
            # Forward pass
            output = self.forward(X)

            # Compute loss
            loss = self.compute_loss(output, Y)

            # Backward pass
            self.backward(Y, learning_rate)

            if epoch % 100 == 0:
                print(f"Epoch {epoch}/{epochs} complete, Loss: {loss}")

In [104]:
def generate_data(num_samples=1000):
    x = np.random.uniform(0, 1, num_samples)
    y = np.random.uniform(0, 1, num_samples)
    f_xy = np.sin(2 * np.pi * x * y) + 2 * x * y**2
    return np.vstack((x, y)), f_xy.reshape(1, -1)

# Define the neural network
nn = NeuralNetwork(input_size=2)

# Add layers: 3 hidden layers with 10 neurons each, ReLU activation
nn.add_layer(Layer(2, 10, activation='relu'))
nn.add_layer(Layer(10, 10, activation='relu'))
nn.add_layer(Layer(10, 10, activation='relu'))

# Output layer with 1 neuron, no activation (linear output)
nn.add_layer(Layer(10, 1, activation='linear'))

# Training the network
X, Y = generate_data(1000)  # Generate 1000 samples
epochs = 5000
learning_rate = 0.01

start_time = time.time()
nn.train(X, Y, epochs, learning_rate)
end_time = time.time()
print("Time:" + str((end_time - start_time)))

# Test the network
test_X, test_Y = generate_data(100)  # Generate test data
predictions = nn.forward(test_X)

# Compute test loss
test_loss = nn.compute_loss(predictions, test_Y)
print(f"Test Loss: {test_loss}")

Epoch 0/5000 complete, Loss: 1.685471974215143
Epoch 100/5000 complete, Loss: 0.07531531499099711
Epoch 200/5000 complete, Loss: 0.07439336372513458
Epoch 300/5000 complete, Loss: 0.07371612809878492
Epoch 400/5000 complete, Loss: 0.07314506998362563
Epoch 500/5000 complete, Loss: 0.07262280225583413
Epoch 600/5000 complete, Loss: 0.07212022411747808
Epoch 700/5000 complete, Loss: 0.0716345058138881
Epoch 800/5000 complete, Loss: 0.0711500901956785
Epoch 900/5000 complete, Loss: 0.07065672160380214
Epoch 1000/5000 complete, Loss: 0.07014824145622603
Epoch 1100/5000 complete, Loss: 0.06962557946022217
Epoch 1200/5000 complete, Loss: 0.0690888906705081
Epoch 1300/5000 complete, Loss: 0.06851064239113618
Epoch 1400/5000 complete, Loss: 0.06788335109688907
Epoch 1500/5000 complete, Loss: 0.06720790050104128
Epoch 1600/5000 complete, Loss: 0.06647056489964402
Epoch 1700/5000 complete, Loss: 0.06565448269228022
Epoch 1800/5000 complete, Loss: 0.06476178238990161
Epoch 1900/5000 complete, Los

In [108]:
# forward mode training

class Layer:
    def __init__(self, input_size, output_size, activation='relu'):
        self.input_size = input_size
        self.output_size = output_size
        self.activation = activation

        # Initialize weights and biases
        self.weights = np.random.randn(output_size, input_size) * np.sqrt(2. / input_size)
        self.biases = np.zeros((output_size, 1))

        # Total number of parameters
        self.n_params = output_size * input_size + output_size

        # Store activation function
        self.activation_function = self.get_activation_function(activation)
        self.activation_derivative = self.get_activation_derivative(activation)

    def get_activation_function(self, activation):
        if activation == 'relu':
            return lambda x: np.maximum(0, x)
        elif activation == 'sigmoid':
            return lambda x: 1 / (1 + np.exp(-x))
        elif activation == 'tanh':
            return lambda x: np.tanh(x)
        else:
            return lambda x: x

    def get_activation_derivative(self, activation):
        if activation == 'relu':
            return lambda x: (x > 0).astype(float)
        elif activation == 'sigmoid':
            return lambda x: self.activation_function(x) * (1 - self.activation_function(x))
        elif activation == 'tanh':
            return lambda x: 1 - np.tanh(x)**2
        else:
            return lambda x: np.ones_like(x)

    def forward(self, input_data, param_index=None):
        self.input_data = input_data
        batch_size = input_data.shape[1]

        # Initialize derivatives
        w_size = self.output_size * self.input_size
        if param_index is not None:
            # Create derivative matrices
            dw = np.zeros_like(self.weights)
            db = np.zeros_like(self.biases)

            if param_index < w_size:
                # This is a weight parameter
                i, j = param_index // self.input_size, param_index % self.input_size
                dw[i, j] = 1.0
            else:
                # This is a bias parameter
                bias_idx = param_index - w_size
                db[bias_idx] = 1.0

        # Forward pass
        z = np.dot(self.weights, input_data) + self.biases
        a = self.activation_function(z)

        if param_index is not None:
            # Compute derivative of z
            dz = np.dot(dw, input_data) + db
            # Compute derivative of activation
            da = self.activation_derivative(z) * dz
            return a, da
        return a, None

    def update_parameters(self, learning_rate, gradients):
        w_size = self.output_size * self.input_size

        # Reshape gradients for weights and biases
        dW = gradients[:w_size].reshape(self.output_size, self.input_size)
        db = gradients[w_size:].reshape(self.output_size, 1)

        # Update parameters
        self.weights -= learning_rate * dW
        self.biases -= learning_rate * db

class NeuralNetwork:
    def __init__(self, input_size):
        self.layers = []
        self.input_size = input_size

    def add_layer(self, layer):
        self.layers.append(layer)

    def forward(self, X):
        input_data = X
        for layer in self.layers:
            input_data, _ = layer.forward(input_data)
        return input_data

    def get_total_params(self):
        return sum(layer.n_params for layer in self.layers)

    def compute_gradients(self, X, Y):
        n_total_params = self.get_total_params()
        gradients = []

        # For each parameter in the network
        param_count = 0
        for layer_idx, layer in enumerate(self.layers):
            for param_idx in range(layer.n_params):
                # Forward pass with derivative with respect to this parameter
                input_data = X
                param_derivative = None

                for i, current_layer in enumerate(self.layers):
                    if i == layer_idx:
                        input_data, param_derivative = current_layer.forward(input_data, param_idx)
                    else:
                        input_data, _ = current_layer.forward(input_data)

                # Compute loss derivative
                m = Y.shape[1]
                output_derivative = (input_data - Y) / m

                # Compute gradient for this parameter
                if param_derivative is not None:
                    grad = np.sum(output_derivative * param_derivative)
                    gradients.append(grad)

                param_count += 1

        return np.array(gradients)

    def train_step(self, X, Y, learning_rate):
        # Compute gradients for all parameters
        gradients = self.compute_gradients(X, Y)

        # Update parameters in each layer
        param_start = 0
        for layer in self.layers:
            param_end = param_start + layer.n_params
            layer.update_parameters(learning_rate, gradients[param_start:param_end])
            param_start = param_end

    def compute_loss(self, predictions, Y):
        m = Y.shape[1]
        return np.sum((predictions - Y) ** 2) / (2 * m)

    def train(self, X, Y, epochs, learning_rate):
        for epoch in range(epochs):
            # Perform training step
            self.train_step(X, Y, learning_rate)

            # Compute loss
            output = self.forward(X)
            loss = self.compute_loss(output, Y)

            if epoch % 100 == 0:
                print(f"Epoch {epoch}/{epochs} complete, Loss: {loss}")

In [109]:
def generate_data(num_samples=1000):
    x = np.random.uniform(0, 1, num_samples)
    y = np.random.uniform(0, 1, num_samples)
    f_xy = np.sin(2 * np.pi * x * y) + 2 * x * y**2
    return np.vstack((x, y)), f_xy.reshape(1, -1)

# Define the neural network
nn = NeuralNetwork(input_size=2)

# Add layers: 3 hidden layers with 10 neurons each, ReLU activation
nn.add_layer(Layer(2, 10, activation='relu'))
nn.add_layer(Layer(10, 10, activation='relu'))
nn.add_layer(Layer(10, 10, activation='relu'))

# Output layer with 1 neuron, no activation (linear output)
nn.add_layer(Layer(10, 1, activation='linear'))

# Training the network
X, Y = generate_data(1000)  # Generate 1000 samples
epochs = 5000
learning_rate = 0.01

start_time = time.time()
nn.train(X, Y, epochs, learning_rate)
end_time = time.time()
print("Time:" + str((end_time - start_time)))

# Test the network
test_X, test_Y = generate_data(100)  # Generate test data
predictions = nn.forward(test_X)

# Compute test loss
test_loss = nn.compute_loss(predictions, test_Y)
print(f"Test Loss: {test_loss}")

Epoch 0/5000 complete, Loss: 0.4518593406041179
Epoch 100/5000 complete, Loss: 0.09879157851573364
Epoch 200/5000 complete, Loss: 0.09762491947873883
Epoch 300/5000 complete, Loss: 0.09404424000222868
Epoch 400/5000 complete, Loss: 0.09052185802619502
Epoch 500/5000 complete, Loss: 0.08773187798010927
Epoch 600/5000 complete, Loss: 0.08571110119965288
Epoch 700/5000 complete, Loss: 0.08409964832784013
Epoch 800/5000 complete, Loss: 0.08275943694737176
Epoch 900/5000 complete, Loss: 0.08169743990258306
Epoch 1000/5000 complete, Loss: 0.0808333286927836
Epoch 1100/5000 complete, Loss: 0.0801113235893606
Epoch 1200/5000 complete, Loss: 0.07949472751755221
Epoch 1300/5000 complete, Loss: 0.0789742265329628
Epoch 1400/5000 complete, Loss: 0.07854155803984031
Epoch 1500/5000 complete, Loss: 0.07819327103486656
Epoch 1600/5000 complete, Loss: 0.07794236493746283
Epoch 1700/5000 complete, Loss: 0.07770301012833228
Epoch 1800/5000 complete, Loss: 0.07748005448258957
Epoch 1900/5000 complete, Lo