In [None]:
import numpy as np

In [None]:
class Layer:
    """Fully connected layer: an affine map followed by an elementwise activation."""

    def __init__(self, input_size, output_size, activation='relu'):
        """Create the layer with He-initialised weights and zero biases.

        Args:
            input_size: number of input features (columns of ``weights``).
            output_size: number of units (rows of ``weights``).
            activation: 'relu', 'sigmoid' or 'tanh'; any other value selects
                the identity (linear) activation.
        """
        self.input_size = input_size
        self.output_size = output_size
        self.activation = activation

        # He initialization: standard-normal draws scaled by sqrt(2 / fan_in).
        he_scale = np.sqrt(2. / input_size)
        self.weights = np.random.randn(output_size, input_size) * he_scale
        self.biases = np.zeros((output_size, 1))

        self.activation_function = self.get_activation_function(activation)

    def get_activation_function(self, activation):
        """Return the elementwise callable selected by ``activation``."""
        if activation == 'relu':
            def relu(x):
                return np.maximum(0, x)
            return relu

        if activation == 'sigmoid':
            def sigmoid(x):
                return 1 / (1 + np.exp(-x))
            return sigmoid

        if activation == 'tanh':
            def tanh(x):
                return np.tanh(x)
            return tanh

        # Anything else (e.g. 'linear') passes values through untouched.
        def identity(x):
            return x
        return identity

    def forward(self, input_data):
        """Compute ``activation(weights @ input_data + biases)``.

        The input, the pre-activation and the activation are kept on the
        instance as ``input_data``, ``z`` and ``a``.
        """
        self.input_data = input_data
        pre_activation = self.weights.dot(input_data) + self.biases
        self.z = pre_activation
        self.a = self.activation_function(pre_activation)
        return self.a

In [None]:
class NeuralNetwork:
    """Sequential container that feeds data through its layers in insertion order."""

    def __init__(self, input_size):
        # input_size is only recorded here; none of the methods below read it.
        self.input_size = input_size
        self.layers = []

    def add_layer(self, layer):
        """Append ``layer`` to the end of the stack."""
        self.layers += [layer]

    def forward(self, X):
        """Pass ``X`` through every layer in turn and return the final output.

        With no layers added, ``X`` itself is returned.
        """
        activation = X
        for stage in self.layers:
            activation = stage.forward(activation)
        return activation

In [None]:
# Build a 2 -> 10 -> 10 -> 1 network: two ReLU hidden layers and a linear output.
nn = NeuralNetwork(input_size=2)
for fan_in, fan_out, act in ((2, 10, 'relu'), (10, 10, 'relu'), (10, 1, 'linear')):
    nn.add_layer(Layer(input_size=fan_in, output_size=fan_out, activation=act))

# Two independent uniform draws from [0, 1) form the single input sample.
x = np.random.rand()
y = np.random.rand()

input_vector = np.array([[x], [y]])  # column vector, shape (2, 1)
output_vector = nn.forward(input_vector)
print(output_vector)

[[-0.09409977]]


In [None]:
# Forward-mode AD: one forward pass per parameter, seeding that parameter's derivative

class Layer:
    """Dense layer that can also emit a forward-mode tangent for one of its own parameters.

    Parameters are addressed by a flat index: ``0 .. output_size*input_size - 1``
    are the weights in row-major order, and the following ``output_size``
    indices are the biases.
    """

    def __init__(self, input_size, output_size, activation='relu'):
        """Create the layer with He-initialised weights and zero biases.

        Args:
            input_size: number of input features (columns of ``weights``).
            output_size: number of units (rows of ``weights``).
            activation: 'relu', 'sigmoid' or 'tanh'; anything else is linear.
        """
        self.input_size = input_size
        self.output_size = output_size
        self.activation = activation

        self.weights = np.random.randn(output_size, input_size) * np.sqrt(2. / input_size)
        self.biases = np.zeros((output_size, 1))

        # Weights plus biases, i.e. the number of valid flat parameter indices.
        self.n_params = output_size * input_size + output_size

        self.activation_function = self.get_activation_function(activation)
        self.activation_derivative = self.get_activation_derivative(activation)

    def get_activation_function(self, activation):
        """Return the elementwise activation callable selected by ``activation``."""
        if activation == 'relu':
            return lambda x: np.maximum(0, x)
        elif activation == 'sigmoid':
            return lambda x: 1 / (1 + np.exp(-x))
        elif activation == 'tanh':
            return lambda x: np.tanh(x)
        else:
            return lambda x: x

    def get_activation_derivative(self, activation):
        """Return the elementwise derivative of the selected activation."""
        if activation == 'relu':
            return lambda x: (x > 0).astype(float)
        elif activation == 'sigmoid':
            # Looks up self.activation_function at call time.
            return lambda x: self.activation_function(x) * (1 - self.activation_function(x))
        elif activation == 'tanh':
            return lambda x: 1 - np.tanh(x)**2
        else:
            return lambda x: np.ones_like(x)

    def forward(self, input_data, param_index=None):
        """Evaluate the layer and, optionally, its tangent w.r.t. one parameter.

        Args:
            input_data: 2-D array of shape (input_size, batch).
            param_index: flat index of the parameter to differentiate with
                respect to, or ``None`` for a plain forward pass.

        Returns:
            ``(a, da)`` where ``a`` is the activation and ``da`` is
            ``d a / d parameter`` with the same shape as ``a`` (``None`` when
            ``param_index`` is ``None``).

        Raises:
            ValueError: if ``input_data`` is not 2-D.
            IndexError: if ``param_index`` is outside ``[0, n_params)``.
        """
        self.input_data = input_data
        # A 1-D input would silently broadcast against the (output_size, 1)
        # biases into a square matrix, so reject it explicitly.
        if np.ndim(input_data) != 2:
            raise ValueError("input_data must be 2-D with shape (input_size, batch)")

        z = np.dot(self.weights, input_data) + self.biases
        a = self.activation_function(z)
        if param_index is None:
            return a, None

        # A negative index would otherwise wrap around and seed the wrong weight.
        if not 0 <= param_index < self.n_params:
            raise IndexError(
                f"param_index {param_index} out of range for {self.n_params} parameters"
            )

        # d z / d parameter has a single non-zero row, so fill it directly
        # instead of multiplying by a one-hot weight matrix.
        w_size = self.output_size * self.input_size
        dz = np.zeros_like(z, dtype=float)
        if param_index < w_size:
            row, col = divmod(param_index, self.input_size)
            dz[row, :] = input_data[col, :]      # d z[row] / d W[row, col] = x[col]
        else:
            dz[param_index - w_size, :] = 1.0    # d z[k] / d b[k] = 1

        da = self.activation_derivative(z) * dz
        return a, da

    def update_parameters(self, learning_rate, gradients):
        """Apply one gradient-descent step in place.

        Args:
            learning_rate: step size.
            gradients: flat sequence of ``n_params`` values laid out like the
                flat parameter index (weights row-major, then biases).
        """
        gradients = np.asarray(gradients, dtype=float)
        w_size = self.output_size * self.input_size

        dW = gradients[:w_size].reshape(self.output_size, self.input_size)
        db = gradients[w_size:].reshape(self.output_size, 1)

        self.weights -= learning_rate * dW
        self.biases -= learning_rate * db

class NeuralNetwork:
    """Sequential network whose gradients are computed by forward-mode AD."""

    def __init__(self, input_size):
        self.layers = []
        self.input_size = input_size

    def add_layer(self, layer):
        """Append ``layer`` to the end of the stack."""
        self.layers.append(layer)

    def forward(self, X):
        """Return the network output for ``X`` (layer tangents are discarded)."""
        input_data = X
        for layer in self.layers:
            input_data, _ = layer.forward(input_data)
        return input_data

    def get_total_params(self):
        """Total number of scalar parameters over all layers."""
        return sum(layer.n_params for layer in self.layers)

    def compute_gradients(self, X, Y):
        """Gradient of ``sum((output - Y)**2) / (2*m)`` w.r.t. every parameter.

        For each parameter the owning layer seeds a tangent
        (``layer.forward(x, param_idx)``), and that tangent is then pushed
        through every later layer with the chain rule
        ``t <- f'(z) * (W @ t)``. Without this propagation only the last
        layer's gradients are correct.

        Args:
            X: inputs, shape (features, m).
            Y: targets, shape (outputs, m).

        Returns:
            1-D array of length ``get_total_params()``, ordered layer by
            layer with each layer's flat parameter index.
        """
        if not self.layers:
            return np.array([])

        # One plain sweep records what does not depend on the seeded
        # parameter: each layer's input and its local activation slope f'(z).
        layer_inputs, slopes = [], []
        activation = X
        for layer in self.layers:
            layer_inputs.append(activation)
            z = np.dot(layer.weights, activation) + layer.biases
            slopes.append(layer.activation_derivative(z))
            activation = layer.activation_function(z)

        m = Y.shape[1]
        output_derivative = (activation - Y) / m  # dLoss / dOutput

        gradients = []
        for layer_idx, layer in enumerate(self.layers):
            downstream = list(zip(self.layers[layer_idx + 1:], slopes[layer_idx + 1:]))
            for param_idx in range(layer.n_params):
                # Seed: derivative of this layer's activation w.r.t. the parameter.
                _, tangent = layer.forward(layer_inputs[layer_idx], param_idx)
                # Carry the tangent through the remaining layers to the output.
                for later_layer, slope in downstream:
                    tangent = slope * np.dot(later_layer.weights, tangent)
                gradients.append(np.sum(output_derivative * tangent))

        return np.array(gradients)

In [None]:
# Build a 2 -> 10 -> 10 -> 1 network: two ReLU hidden layers and a linear output.
nn = NeuralNetwork(input_size=2)
for fan_in, fan_out, act in ((2, 10, 'relu'), (10, 10, 'relu'), (10, 1, 'linear')):
    nn.add_layer(Layer(input_size=fan_in, output_size=fan_out, activation=act))

# Two independent uniform draws from [0, 1) form the single input sample.
x = np.random.rand()
y = np.random.rand()

input_vector = np.array([[x], [y]])  # column vector, shape (2, 1)
output_vector = nn.forward(input_vector)
# The network's own output is passed as the target here.
gradients = nn.compute_gradients(input_vector, output_vector)

In [None]:
# average time to compute gradients
import time

# Average wall-clock time of one iteration. Each iteration builds a fresh
# network, runs one forward pass and one forward-mode gradient computation,
# so network construction is part of the measured time.
N_RUNS = 1000

# perf_counter is the high-resolution clock intended for measuring short
# durations; time.time() follows the system clock, which can be adjusted.
start_time = time.perf_counter()
for _ in range(N_RUNS):
    nn = NeuralNetwork(input_size=2)
    nn.add_layer(Layer(input_size=2, output_size=10, activation='relu'))  # First hidden layer
    nn.add_layer(Layer(input_size=10, output_size=10, activation='relu'))  # Second hidden layer
    nn.add_layer(Layer(input_size=10, output_size=1, activation='linear'))  # Output layer

    x = np.random.rand()
    y = np.random.rand()

    input_vector = np.array([x, y]).reshape(2, 1)
    output_vector = nn.forward(input_vector)
    gradients = nn.compute_gradients(input_vector, output_vector)
end_time = time.perf_counter()
print("Average Time:" + str((end_time - start_time) / N_RUNS))

Average Time:0.006320048332214356


In [None]:
# reverse mode AD

class Layer:
    """Dense layer with a cached forward pass and a reverse-mode backward pass."""

    def __init__(self, input_size, output_size, activation='relu'):
        """Create the layer with He-initialised weights and zero biases.

        Args:
            input_size: number of input features (columns of ``weights``).
            output_size: number of units (rows of ``weights``).
            activation: 'relu', 'sigmoid' or 'tanh'; anything else is linear.
        """
        self.input_size = input_size
        self.output_size = output_size
        self.activation = activation

        # He initialization: standard-normal draws scaled by sqrt(2 / fan_in).
        he_scale = np.sqrt(2. / input_size)
        self.weights = np.random.randn(output_size, input_size) * he_scale
        self.biases = np.zeros((output_size, 1))

        # Activation and its derivative are resolved once, by name.
        self.activation_function = self.get_activation_function(activation)
        self.activation_derivative = self.get_activation_derivative(activation)

    def get_activation_function(self, activation):
        """Return the elementwise activation callable selected by ``activation``."""
        if activation == 'relu':
            def relu(x):
                return np.maximum(0, x)
            return relu

        if activation == 'sigmoid':
            def sigmoid(x):
                return 1 / (1 + np.exp(-x))
            return sigmoid

        if activation == 'tanh':
            def tanh(x):
                return np.tanh(x)
            return tanh

        def identity(x):  # linear activation
            return x
        return identity

    def get_activation_derivative(self, activation):
        """Return the elementwise derivative of the selected activation."""
        if activation == 'relu':
            def relu_slope(x):
                return (x > 0).astype(float)
            return relu_slope

        if activation == 'sigmoid':
            def sigmoid_slope(x):
                # Looks up self.activation_function at call time.
                return self.activation_function(x) * (1 - self.activation_function(x))
            return sigmoid_slope

        if activation == 'tanh':
            def tanh_slope(x):
                return 1 - np.tanh(x)**2
            return tanh_slope

        def unit_slope(x):  # derivative of the linear activation is 1
            return np.ones_like(x)
        return unit_slope

    def forward(self, input_data):
        """Compute the activation and cache ``input_data``, ``z`` and ``a`` for backward."""
        self.input_data = input_data
        pre_activation = self.weights.dot(input_data) + self.biases
        self.z = pre_activation
        self.a = self.activation_function(pre_activation)
        return self.a

    def backward(self, dA):
        """Back-propagate ``dA`` (gradient w.r.t. this layer's activation).

        Stores ``dZ``, ``dW`` and ``db`` on the instance; ``dW`` and ``db``
        are divided by the batch size taken from the cached input.

        Returns:
            Gradient with respect to this layer's input.
        """
        batch = self.input_data.shape[1]
        slope = self.activation_derivative(self.z)
        self.dZ = dA * slope  # elementwise chain rule through the activation
        self.dW = self.dZ.dot(self.input_data.T) / batch
        self.db = self.dZ.sum(axis=1, keepdims=True) / batch
        return self.weights.T.dot(self.dZ)

class NeuralNetwork:
    """Sequential network whose gradients are computed by reverse-mode AD."""

    def __init__(self, input_size):
        self.layers = []
        self.input_size = input_size

    def add_layer(self, layer):
        """Append ``layer`` to the end of the stack."""
        self.layers.append(layer)

    def forward(self, X):
        """Pass ``X`` through every layer in order and return the final output."""
        input_data = X
        for layer in self.layers:
            input_data = layer.forward(input_data)
        return input_data

    def compute_gradients(self, Y, learning_rate=None):
        """Back-propagate the output error through all layers.

        Must be called after ``forward``: it reads the last layer's cached
        activation ``a``. Each layer's ``backward`` is invoked from last to
        first; the gradients themselves live wherever ``backward`` puts them.

        Args:
            Y: targets, same shape as the network output.
            learning_rate: unused. No parameter update happens here. It is
                accepted (and now optional) so existing calls keep working.
        """
        dA = self.layers[-1].a - Y  # loss derivative with respect to the output
        for layer in reversed(self.layers):
            dA = layer.backward(dA)

In [None]:
# Build a 2 -> 10 -> 10 -> 1 network: two ReLU hidden layers and a linear output.
nn = NeuralNetwork(input_size=2)
for fan_in, fan_out, act in ((2, 10, 'relu'), (10, 10, 'relu'), (10, 1, 'linear')):
    nn.add_layer(Layer(input_size=fan_in, output_size=fan_out, activation=act))

# Two independent uniform draws from [0, 1) form the single input sample.
x = np.random.rand()
y = np.random.rand()

input_vector = np.array([[x], [y]])  # column vector, shape (2, 1)
output_vector = nn.forward(input_vector)
# The network's own output is passed as the target here.
nn.compute_gradients(output_vector, learning_rate=0.01)

In [None]:
# average time
import time

# Average wall-clock time of one iteration. Each iteration builds a fresh
# network, runs one forward pass and one reverse-mode gradient computation,
# so network construction is part of the measured time.
N_RUNS = 1000

# perf_counter is the high-resolution clock intended for measuring short
# durations; time.time() follows the system clock, which can be adjusted.
start_time = time.perf_counter()
for _ in range(N_RUNS):
    nn = NeuralNetwork(input_size=2)
    nn.add_layer(Layer(input_size=2, output_size=10, activation='relu'))  # First hidden layer
    nn.add_layer(Layer(input_size=10, output_size=10, activation='relu'))  # Second hidden layer
    nn.add_layer(Layer(input_size=10, output_size=1, activation='linear'))  # Output layer

    x = np.random.rand()
    y = np.random.rand()

    input_vector = np.array([x, y]).reshape(2, 1)
    output_vector = nn.forward(input_vector)
    nn.compute_gradients(output_vector, learning_rate=0.01)
end_time = time.perf_counter()
print("Average Time:" + str((end_time - start_time) / N_RUNS))

Average Time:0.00015849041938781737


In [None]:
# reverse mode training

class Layer:
    """Dense layer that back-propagates and applies its own SGD update."""

    def __init__(self, input_size, output_size, activation='relu'):
        """Create the layer with He-initialised weights and zero biases.

        Args:
            input_size: number of input features (columns of ``weights``).
            output_size: number of units (rows of ``weights``).
            activation: 'relu', 'sigmoid' or 'tanh'; anything else is linear.
        """
        self.input_size = input_size
        self.output_size = output_size
        self.activation = activation

        # He initialization: standard-normal draws scaled by sqrt(2 / fan_in).
        he_scale = np.sqrt(2. / input_size)
        self.weights = np.random.randn(output_size, input_size) * he_scale
        self.biases = np.zeros((output_size, 1))

        # Activation and its derivative are resolved once, by name.
        self.activation_function = self.get_activation_function(activation)
        self.activation_derivative = self.get_activation_derivative(activation)

    def get_activation_function(self, activation):
        """Return the elementwise activation callable selected by ``activation``."""
        if activation == 'relu':
            def relu(x):
                return np.maximum(0, x)
            return relu

        if activation == 'sigmoid':
            def sigmoid(x):
                return 1 / (1 + np.exp(-x))
            return sigmoid

        if activation == 'tanh':
            def tanh(x):
                return np.tanh(x)
            return tanh

        def identity(x):  # linear activation
            return x
        return identity

    def get_activation_derivative(self, activation):
        """Return the elementwise derivative of the selected activation."""
        if activation == 'relu':
            def relu_slope(x):
                return (x > 0).astype(float)
            return relu_slope

        if activation == 'sigmoid':
            def sigmoid_slope(x):
                # Looks up self.activation_function at call time.
                return self.activation_function(x) * (1 - self.activation_function(x))
            return sigmoid_slope

        if activation == 'tanh':
            def tanh_slope(x):
                return 1 - np.tanh(x)**2
            return tanh_slope

        def unit_slope(x):  # derivative of the linear activation is 1
            return np.ones_like(x)
        return unit_slope

    def forward(self, input_data):
        """Compute the activation and cache ``input_data``, ``z`` and ``a`` for backward."""
        self.input_data = input_data
        pre_activation = self.weights.dot(input_data) + self.biases
        self.z = pre_activation
        self.a = self.activation_function(pre_activation)
        return self.a

    def backward(self, dA, learning_rate):
        """Back-propagate ``dA`` and take one SGD step on this layer.

        Args:
            dA: gradient with respect to this layer's activation.
            learning_rate: step size for the in-place parameter update.

        Returns:
            Gradient with respect to this layer's input, computed from the
            weights as they were before the update.
        """
        batch = self.input_data.shape[1]
        local_grad = dA * self.activation_derivative(self.z)  # chain rule through activation
        weight_grad = local_grad.dot(self.input_data.T) / batch
        bias_grad = local_grad.sum(axis=1, keepdims=True) / batch
        upstream = self.weights.T.dot(local_grad)  # must precede the weight update

        # SGD step, applied in place.
        self.weights -= learning_rate * weight_grad
        self.biases -= learning_rate * bias_grad

        return upstream

class NeuralNetwork:
    """Sequential network trained with back-propagation and per-layer SGD updates."""

    def __init__(self, input_size):
        self.layers = []
        self.input_size = input_size

    def add_layer(self, layer):
        """Append ``layer`` to the end of the stack."""
        self.layers.append(layer)

    def forward(self, X):
        """Pass ``X`` through every layer in order and return the final output."""
        input_data = X
        for layer in self.layers:
            input_data = layer.forward(input_data)
        return input_data

    def backward(self, Y, learning_rate):
        """Back-propagate the output error; each layer updates itself.

        Must be called after ``forward``: it reads the last layer's cached
        activation ``a``.
        """
        dA = self.layers[-1].a - Y  # loss derivative with respect to the output
        for layer in reversed(self.layers):
            dA = layer.backward(dA, learning_rate)

    def compute_loss(self, predictions, Y):
        """Half mean squared error: ``sum((predictions - Y)**2) / (2*m)``, m = ``Y.shape[1]``."""
        m = Y.shape[1]
        return np.sum((predictions - Y) ** 2) / (2 * m)

    def train(self, X, Y, epochs, learning_rate, log_every=100):
        """Run ``epochs`` full-batch gradient-descent steps.

        Args:
            X: inputs, shape (features, m).
            Y: targets, shape (outputs, m).
            epochs: number of forward/backward iterations.
            learning_rate: SGD step size handed to every layer.
            log_every: print the loss every this many epochs; 0 or ``None``
                disables printing. The reported loss is measured before that
                epoch's update.
        """
        for epoch in range(epochs):
            output = self.forward(X)
            loss = self.compute_loss(output, Y)
            self.backward(Y, learning_rate)

            if log_every and epoch % log_every == 0:
                print(f"Epoch {epoch}/{epochs} complete, Loss: {loss}")

In [None]:
def generate_data(num_samples=1000):
    """Sample the target surface f(x, y) = sin(2*pi*x*y) + 2*x*y**2 on the unit square.

    Args:
        num_samples: number of (x, y) points drawn uniformly from [0, 1).

    Returns:
        ``(inputs, targets)``: ``inputs`` stacks x over y with shape
        (2, num_samples); ``targets`` has shape (1, num_samples).
    """
    # x is drawn before y.
    x = np.random.uniform(0, 1, num_samples)
    y = np.random.uniform(0, 1, num_samples)

    product = x * y
    targets = np.sin(2 * np.pi * product) + 2 * x * y**2

    inputs = np.stack((x, y))
    return inputs, targets[np.newaxis, :]

# Define the neural network: 2 inputs -> three ReLU hidden layers of 10 units -> 1 linear output
nn = NeuralNetwork(input_size=2)
for fan_in in (2, 10, 10):
    nn.add_layer(Layer(fan_in, 10, activation='relu'))
nn.add_layer(Layer(10, 1, activation='linear'))

# Training set (1000 samples) and hyper-parameters
X, Y = generate_data(1000)
epochs = 5000
learning_rate = 0.01

nn.train(X, Y, epochs, learning_rate)

# Evaluate on a fresh draw of 100 samples from the same generator
test_X, test_Y = generate_data(100)
predictions = nn.forward(test_X)
test_loss = nn.compute_loss(predictions, test_Y)
print(f"Test Loss: {test_loss}")

Epoch 0/5000 complete, Loss: 0.2271385636525113
Epoch 100/5000 complete, Loss: 0.07404594731873411
Epoch 200/5000 complete, Loss: 0.0711372334580047
Epoch 300/5000 complete, Loss: 0.0700151479142721
Epoch 400/5000 complete, Loss: 0.06921742021911749
Epoch 500/5000 complete, Loss: 0.06850878051565168
Epoch 600/5000 complete, Loss: 0.06784962641899123
Epoch 700/5000 complete, Loss: 0.06729003899454157
Epoch 800/5000 complete, Loss: 0.06678812180843348
Epoch 900/5000 complete, Loss: 0.06633584935339971
Epoch 1000/5000 complete, Loss: 0.06592175858575816
Epoch 1100/5000 complete, Loss: 0.06553801177884182
Epoch 1200/5000 complete, Loss: 0.06517746579376489
Epoch 1300/5000 complete, Loss: 0.06483152278946122
Epoch 1400/5000 complete, Loss: 0.06448471301126978
Epoch 1500/5000 complete, Loss: 0.06414060234570222
Epoch 1600/5000 complete, Loss: 0.06381750133575206
Epoch 1700/5000 complete, Loss: 0.06349531827374334
Epoch 1800/5000 complete, Loss: 0.0631880571496179
Epoch 1900/5000 complete, Lo

In [None]:
# forward mode training

class Layer:
    """Dense layer that can also emit a forward-mode tangent for one of its own parameters.

    Parameters are addressed by a flat index: ``0 .. output_size*input_size - 1``
    are the weights in row-major order, and the following ``output_size``
    indices are the biases.
    """

    def __init__(self, input_size, output_size, activation='relu'):
        """Create the layer with He-initialised weights and zero biases.

        Args:
            input_size: number of input features (columns of ``weights``).
            output_size: number of units (rows of ``weights``).
            activation: 'relu', 'sigmoid' or 'tanh'; anything else is linear.
        """
        self.input_size = input_size
        self.output_size = output_size
        self.activation = activation

        # Initialize weights (He scaling) and biases
        self.weights = np.random.randn(output_size, input_size) * np.sqrt(2. / input_size)
        self.biases = np.zeros((output_size, 1))

        # Total number of parameters, i.e. the number of valid flat indices
        self.n_params = output_size * input_size + output_size

        # Store activation function and its derivative
        self.activation_function = self.get_activation_function(activation)
        self.activation_derivative = self.get_activation_derivative(activation)

    def get_activation_function(self, activation):
        """Return the elementwise activation callable selected by ``activation``."""
        if activation == 'relu':
            return lambda x: np.maximum(0, x)
        elif activation == 'sigmoid':
            return lambda x: 1 / (1 + np.exp(-x))
        elif activation == 'tanh':
            return lambda x: np.tanh(x)
        else:
            return lambda x: x

    def get_activation_derivative(self, activation):
        """Return the elementwise derivative of the selected activation."""
        if activation == 'relu':
            return lambda x: (x > 0).astype(float)
        elif activation == 'sigmoid':
            # Looks up self.activation_function at call time.
            return lambda x: self.activation_function(x) * (1 - self.activation_function(x))
        elif activation == 'tanh':
            return lambda x: 1 - np.tanh(x)**2
        else:
            return lambda x: np.ones_like(x)

    def forward(self, input_data, param_index=None):
        """Evaluate the layer and, optionally, its tangent w.r.t. one parameter.

        Args:
            input_data: 2-D array of shape (input_size, batch).
            param_index: flat index of the parameter to differentiate with
                respect to, or ``None`` for a plain forward pass.

        Returns:
            ``(a, da)`` where ``a`` is the activation and ``da`` is
            ``d a / d parameter`` with the same shape as ``a`` (``None`` when
            ``param_index`` is ``None``).

        Raises:
            ValueError: if ``input_data`` is not 2-D.
            IndexError: if ``param_index`` is outside ``[0, n_params)``.
        """
        self.input_data = input_data
        # A 1-D input would silently broadcast against the (output_size, 1)
        # biases into a square matrix, so reject it explicitly.
        if np.ndim(input_data) != 2:
            raise ValueError("input_data must be 2-D with shape (input_size, batch)")

        # Forward pass
        z = np.dot(self.weights, input_data) + self.biases
        a = self.activation_function(z)
        if param_index is None:
            return a, None

        # A negative index would otherwise wrap around and seed the wrong weight.
        if not 0 <= param_index < self.n_params:
            raise IndexError(
                f"param_index {param_index} out of range for {self.n_params} parameters"
            )

        # d z / d parameter has a single non-zero row, so fill it directly
        # instead of multiplying by a one-hot weight matrix.
        w_size = self.output_size * self.input_size
        dz = np.zeros_like(z, dtype=float)
        if param_index < w_size:
            row, col = divmod(param_index, self.input_size)
            dz[row, :] = input_data[col, :]      # d z[row] / d W[row, col] = x[col]
        else:
            dz[param_index - w_size, :] = 1.0    # d z[k] / d b[k] = 1

        # Chain rule through the activation
        da = self.activation_derivative(z) * dz
        return a, da

    def update_parameters(self, learning_rate, gradients):
        """Apply one gradient-descent step in place.

        Args:
            learning_rate: step size.
            gradients: flat sequence of ``n_params`` values laid out like the
                flat parameter index (weights row-major, then biases).
        """
        gradients = np.asarray(gradients, dtype=float)
        w_size = self.output_size * self.input_size

        # Reshape gradients for weights and biases
        dW = gradients[:w_size].reshape(self.output_size, self.input_size)
        db = gradients[w_size:].reshape(self.output_size, 1)

        # Update parameters
        self.weights -= learning_rate * dW
        self.biases -= learning_rate * db

class NeuralNetwork:
    """Sequential network trained with gradients from forward-mode AD."""

    def __init__(self, input_size):
        self.layers = []
        self.input_size = input_size

    def add_layer(self, layer):
        """Append ``layer`` to the end of the stack."""
        self.layers.append(layer)

    def forward(self, X):
        """Return the network output for ``X`` (layer tangents are discarded)."""
        input_data = X
        for layer in self.layers:
            input_data, _ = layer.forward(input_data)
        return input_data

    def get_total_params(self):
        """Total number of scalar parameters over all layers."""
        return sum(layer.n_params for layer in self.layers)

    def compute_gradients(self, X, Y):
        """Gradient of ``sum((output - Y)**2) / (2*m)`` w.r.t. every parameter.

        For each parameter the owning layer seeds a tangent
        (``layer.forward(x, param_idx)``), and that tangent is then pushed
        through every later layer with the chain rule
        ``t <- f'(z) * (W @ t)``. Without this propagation only the last
        layer's gradients are correct and training stalls.

        Args:
            X: inputs, shape (features, m).
            Y: targets, shape (outputs, m).

        Returns:
            1-D array of length ``get_total_params()``, ordered layer by
            layer with each layer's flat parameter index.
        """
        if not self.layers:
            return np.array([])

        # One plain sweep records what does not depend on the seeded
        # parameter: each layer's input and its local activation slope f'(z).
        layer_inputs, slopes = [], []
        activation = X
        for layer in self.layers:
            layer_inputs.append(activation)
            z = np.dot(layer.weights, activation) + layer.biases
            slopes.append(layer.activation_derivative(z))
            activation = layer.activation_function(z)

        m = Y.shape[1]
        output_derivative = (activation - Y) / m  # dLoss / dOutput

        gradients = []
        for layer_idx, layer in enumerate(self.layers):
            downstream = list(zip(self.layers[layer_idx + 1:], slopes[layer_idx + 1:]))
            for param_idx in range(layer.n_params):
                # Seed: derivative of this layer's activation w.r.t. the parameter.
                _, tangent = layer.forward(layer_inputs[layer_idx], param_idx)
                # Carry the tangent through the remaining layers to the output.
                for later_layer, slope in downstream:
                    tangent = slope * np.dot(later_layer.weights, tangent)
                gradients.append(np.sum(output_derivative * tangent))

        return np.array(gradients)

    def train_step(self, X, Y, learning_rate):
        """Compute all gradients, then hand each layer its slice for one update."""
        gradients = self.compute_gradients(X, Y)

        param_start = 0
        for layer in self.layers:
            param_end = param_start + layer.n_params
            layer.update_parameters(learning_rate, gradients[param_start:param_end])
            param_start = param_end

    def compute_loss(self, predictions, Y):
        """Half mean squared error: ``sum((predictions - Y)**2) / (2*m)``, m = ``Y.shape[1]``."""
        m = Y.shape[1]
        return np.sum((predictions - Y) ** 2) / (2 * m)

    def train(self, X, Y, epochs, learning_rate, log_every=100):
        """Run ``epochs`` full-batch gradient-descent steps.

        Args:
            X: inputs, shape (features, m).
            Y: targets, shape (outputs, m).
            epochs: number of training steps.
            learning_rate: step size for every parameter update.
            log_every: print the loss every this many epochs; 0 or ``None``
                disables printing. The reported loss is measured after that
                epoch's update.
        """
        for epoch in range(epochs):
            self.train_step(X, Y, learning_rate)

            output = self.forward(X)
            loss = self.compute_loss(output, Y)

            if log_every and epoch % log_every == 0:
                print(f"Epoch {epoch}/{epochs} complete, Loss: {loss}")

In [None]:
def generate_data(num_samples=1000):
    """Draw random points in the unit square and evaluate the regression target on them.

    The target is f(x, y) = sin(2*pi*x*y) + 2*x*y**2.

    Args:
        num_samples: how many (x, y) pairs to draw uniformly from [0, 1).

    Returns:
        A pair ``(features, labels)``: ``features`` has x in row 0 and y in
        row 1, shape (2, num_samples); ``labels`` has shape (1, num_samples).
    """
    # x is drawn before y.
    x = np.random.uniform(0, 1, num_samples)
    y = np.random.uniform(0, 1, num_samples)

    wave = np.sin(2 * np.pi * x * y)
    bump = 2 * x * y**2
    labels = wave + bump

    features = np.stack((x, y))
    return features, labels[np.newaxis, :]

# Define the neural network: 2 inputs -> three ReLU hidden layers of 10 units -> 1 linear output
nn = NeuralNetwork(input_size=2)
for fan_in in (2, 10, 10):
    nn.add_layer(Layer(fan_in, 10, activation='relu'))
nn.add_layer(Layer(10, 1, activation='linear'))

# Training set (1000 samples) and hyper-parameters
X, Y = generate_data(1000)
epochs = 5000
learning_rate = 0.01

nn.train(X, Y, epochs, learning_rate)

# Evaluate on a fresh draw of 100 samples from the same generator
test_X, test_Y = generate_data(100)
predictions = nn.forward(test_X)
test_loss = nn.compute_loss(predictions, test_Y)
print(f"Test Loss: {test_loss}")

Epoch 0/5000 complete, Loss: 0.3723659796208631
Epoch 100/5000 complete, Loss: 0.09071779304126253
Epoch 200/5000 complete, Loss: 0.09093226253849003
Epoch 300/5000 complete, Loss: 0.09110324583944299
Epoch 400/5000 complete, Loss: 0.09133402240723897
Epoch 500/5000 complete, Loss: 0.09154483128621708
Epoch 600/5000 complete, Loss: 0.09160357637130526
Epoch 700/5000 complete, Loss: 0.09156012650822834
Epoch 800/5000 complete, Loss: 0.09147719677056211
Epoch 900/5000 complete, Loss: 0.0913229611521464
Epoch 1000/5000 complete, Loss: 0.09113758881294831
Epoch 1100/5000 complete, Loss: 0.09091774053314575
Epoch 1200/5000 complete, Loss: 0.09067606068686922
Epoch 1300/5000 complete, Loss: 0.09044892454801591
Epoch 1400/5000 complete, Loss: 0.09038488537653026
Epoch 1500/5000 complete, Loss: 0.09079599106760092
Epoch 1600/5000 complete, Loss: 0.09086624793955517
Epoch 1700/5000 complete, Loss: 0.0907590646886624
Epoch 1800/5000 complete, Loss: 0.09062125830627724
Epoch 1900/5000 complete, L