In [3]:
import numpy as np

# Sigmoid activation function and its derivative
def sigmoid(x):
    """Logistic sigmoid 1 / (1 + exp(-x)), evaluated without overflow.

    Parameters
    ----------
    x : scalar or array_like
        Pre-activation value(s).

    Returns
    -------
    Scalar or ndarray with the same shape as ``x``, each value in [0, 1].

    The textbook form calls ``exp(-x)``, which overflows (and emits a
    RuntimeWarning) for large negative ``x``.  Here the exponential is only
    ever taken of a non-positive number, so it stays within (0, 1].
    """
    x = np.asarray(x)
    decay = np.exp(-np.abs(x))  # exp(-|x|) cannot overflow
    # x >= 0:  1 / (1 + exp(-x))
    # x <  0:  exp(x) / (1 + exp(x))   (same function, rewritten)
    numerator = np.where(x >= 0, np.ones_like(decay), decay)
    return numerator / (1 + decay)

def sigmoid_derivative(x):
    """Derivative of the logistic sigmoid, evaluated at pre-activation ``x``.

    Uses the identity s'(x) = s(x) * (1 - s(x)).
    """
    activated = sigmoid(x)
    return activated * (1 - activated)

# Mean Squared Error (MSE) loss function and its derivative
def mse_loss(y_pred, y_true):
    """Half the sum of squared errors between prediction and target.

    Despite the name no mean is taken: the value is
    0.5 * sum((y_pred - y_true) ** 2) over all elements.
    """
    residual = y_pred - y_true
    squared_error = residual ** 2
    return 0.5 * np.sum(squared_error)

def mse_loss_derivative(y_pred, y_true):
    """Gradient of 0.5 * sum((y_pred - y_true) ** 2) with respect to ``y_pred``.

    The factor 0.5 cancels the exponent, leaving the plain residual.
    """
    residual = y_pred - y_true
    return residual

# Initialize input, weights, and target output
x1, x2 = 0.5, 1.0
x = np.array([x1, x2], dtype=np.float64)  # Input vector (2,)

y_true = np.array([0.8], dtype=np.float64)  # True output value

# Initial parameters of the 2 -> 3 -> 3 -> 1 network (fixed values, not random)
w1 = np.array([[2.0, 1.0], [1.0, -1.0], [1.0, 1.0]], dtype=np.float64)  # (3,2)
b1 = np.array([2.0, 1.0, -1.0], dtype=np.float64)  # (3,)

w2 = np.array([[3.0, -1.0, 1.0], [-1.0, -2.0, 2.0], [1.0, -1.0, 1.0]], dtype=np.float64)  # (3,3)
b2 = np.array([-2.0, 3.0, -1.0], dtype=np.float64)  # (3,)

w3 = np.array([-1.0, 3.0, 2.0], dtype=np.float64)  # (3,)
b3 = np.array([1.0], dtype=np.float64)  # (1,)

# Hyperparameters
learning_rate = 0.1
epochs = 2000  # Number of training iterations


def forward_pass(x, w1, b1, w2, b2, w3, b3):
    """Run the three sigmoid layers on one input vector.

    Returns ``(z1, a1, z2, a2, z3, y_pred)``: the pre-activation and
    activation of every layer, all of which backpropagation needs.
    """
    z1 = np.dot(w1, x) + b1  # (3,)
    a1 = sigmoid(z1)  # (3,)

    z2 = np.dot(w2, a1) + b2  # (3,)
    a2 = sigmoid(z2)  # (3,)

    z3 = np.dot(w3, a2) + b3  # (1,)
    y_pred = sigmoid(z3)  # (1,)
    return z1, a1, z2, a2, z3, y_pred


# Training loop: plain gradient descent on the single training example
for epoch in range(epochs):
    # Forward propagation
    z1, a1, z2, a2, z3, y_pred = forward_pass(x, w1, b1, w2, b2, w3, b3)

    # Compute loss
    loss = mse_loss(y_pred, y_true)

    # Backward propagation (chain rule, output layer first)
    d_loss = mse_loss_derivative(y_pred, y_true)  # dL/dy_pred, (1,)

    d_z3 = d_loss * sigmoid_derivative(z3)  # dL/dz3, (1,)
    d_w3 = d_z3 * a2  # (3,) because z3 = w3 . a2 + b3
    d_b3 = d_z3  # (1,)

    d_a2 = w3 * d_z3  # (3,)
    d_z2 = d_a2 * sigmoid_derivative(z2)  # (3,)
    d_w2 = np.outer(d_z2, a1)  # (3,3)
    d_b2 = d_z2  # (3,)

    d_a1 = np.dot(w2.T, d_z2)  # (3,)
    d_z1 = d_a1 * sigmoid_derivative(z1)  # (3,)
    d_w1 = np.outer(d_z1, x)  # (3,2)
    d_b1 = d_z1  # (3,)

    # Update weights and biases using gradient descent (in place)
    w1 -= learning_rate * d_w1
    b1 -= learning_rate * d_b1
    w2 -= learning_rate * d_w2
    b2 -= learning_rate * d_b2
    w3 -= learning_rate * d_w3
    b3 -= learning_rate * d_b3

    # Print loss every 100 epochs
    if epoch % 100 == 0:
        print(f"Epoch {epoch}, Loss: {loss:.5f}")

# Inside the loop, loss and y_pred are computed *before* the parameter update,
# so after the last iteration they describe the previous step's parameters.
# Re-evaluate once so the summary matches the weights printed below (this
# also keeps the summary well-defined when epochs == 0).
z1, a1, z2, a2, z3, y_pred = forward_pass(x, w1, b1, w2, b2, w3, b3)
loss = mse_loss(y_pred, y_true)

# Final output after training
print(f"\nFinal Loss: {loss:.5f}")
print(f"Final Prediction: {y_pred[0]:.5f}")
print("\nFinal Weights and Biases:")
print("w1:", w1)
print("b1:", b1)
print("w2:", w2)
print("b2:", b2)
print("w3:", w3)
print("b3:", b3)

Epoch 0, Loss: 0.01629
Epoch 100, Loss: 0.01578
Epoch 200, Loss: 0.01514
Epoch 300, Loss: 0.01429
Epoch 400, Loss: 0.01317
Epoch 500, Loss: 0.01164
Epoch 600, Loss: 0.00958
Epoch 700, Loss: 0.00691
Epoch 800, Loss: 0.00395
Epoch 900, Loss: 0.00159
Epoch 1000, Loss: 0.00044
Epoch 1100, Loss: 0.00009
Epoch 1200, Loss: 0.00002
Epoch 1300, Loss: 0.00000
Epoch 1400, Loss: 0.00000
Epoch 1500, Loss: 0.00000
Epoch 1600, Loss: 0.00000
Epoch 1700, Loss: 0.00000
Epoch 1800, Loss: 0.00000
Epoch 1900, Loss: 0.00000

Final Loss: 0.00000
Final Prediction: 0.80000

Final Weights and Biases:
w1: [[ 2.00483185  1.00966369]
 [ 1.0816828  -0.83663441]
 [ 0.92477578  0.84955156]]
b1: [ 2.00966369  1.16336559 -1.15044844]
w2: [[ 3.1580082  -0.89306841  1.09334593]
 [-1.26692501 -2.18100758  1.84267995]
 [ 0.7128582  -1.19304396  0.82907689]]
b2: [-1.8391253   2.72823435 -1.29235703]
w3: [-1.50829326  2.44432113  1.74488292]
b3: [0.31817271]


以上代码先用 GPT-4 生成初始版本，再手写 forward 和 backward propagation，替换掉它原本直接调用的库。

In [2]:
# Verify the results above
import numpy as np

# Sigmoid activation function
def sigmoid(x):
    """Logistic sigmoid 1 / (1 + exp(-x)), evaluated without overflow.

    Accepts a scalar or array_like and returns values in [0, 1] with the
    same shape.  The exponential is only taken of a non-positive number, so
    large negative inputs do not overflow or emit a RuntimeWarning.
    """
    x = np.asarray(x)
    decay = np.exp(-np.abs(x))  # exp(-|x|) cannot overflow
    # x >= 0:  1 / (1 + exp(-x))
    # x <  0:  exp(x) / (1 + exp(x))   (same function, rewritten)
    numerator = np.where(x >= 0, np.ones_like(decay), decay)
    return numerator / (1 + decay)

# Input
X = np.array([0.5, 1])

# Weights and biases, copied from the "Final Weights and Biases" printout of
# the hand-written training run
w1 = np.array([[2.00483185, 1.00966369],
               [1.0816828, -0.83663441],
               [0.92477578, 0.84955156]])
b1 = np.array([ 2.00966369 , 1.16336559, -1.15044844])
w2 = np.array([[ 3.1580082 , -0.89306841,  1.09334593],
               [-1.26692501, -2.18100758,  1.84267995],
               [ 0.7128582 , -1.19304396 , 0.82907689]])
b2 = np.array([-1.8391253 ,  2.72823435, -1.29235703])
w3 = np.array([-1.50829326 , 2.44432113 , 1.74488292])
b3 = np.array([0.31817271])

# Calculate output: three linear layers, each followed by a sigmoid
z1 = np.dot(X, w1.T) + b1
a1 = sigmoid(z1)

z2 = np.dot(a1, w2.T) + b2
a2 = sigmoid(z2)

z3 = np.dot(a2, w3.T) + b3
output = sigmoid(z3)

print(output)

# Make the verification explicit instead of relying on reading the printout:
# the pasted parameters must reproduce the training target 0.8.  The absolute
# tolerance leaves room for the rounding of the printed weights.
np.testing.assert_allclose(output, 0.8, rtol=0, atol=1e-4)

[0.80000319]


以上验证 final weights and biases 正确
将上面的代码输入 GPT-4 并让它改用 PyTorch，调试完成后的代码如下：

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim

# Define the Neural Network class
class NeuralNetwork(nn.Module):
    """Fully connected 2 -> 3 -> 3 -> 1 network with a sigmoid after every layer."""

    def __init__(self):
        super().__init__()
        # The attribute names below become the parameter-name prefixes
        # reported by named_parameters() (fc1.weight, fc1.bias, ...).
        self.fc1 = nn.Linear(2, 3)  # input (2) -> hidden 1 (3)
        self.fc2 = nn.Linear(3, 3)  # hidden 1 (3) -> hidden 2 (3)
        self.fc3 = nn.Linear(3, 1)  # hidden 2 (3) -> output (1)
        self.activation = nn.Sigmoid()  # shared by all three layers

    def forward(self, x):
        """Apply each linear layer followed by the sigmoid, in order."""
        out = x
        for layer in (self.fc1, self.fc2, self.fc3):
            out = self.activation(layer(out))
        return out

# Seed the RNG before the model is built: nn.Linear draws its initial weights
# at random, so without a seed every kernel restart trains a different network
# and the printed results cannot be reproduced.
torch.manual_seed(0)

# Select device (use GPU if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize the network
model = NeuralNetwork().to(device)

# Training data: a single (input, target) pair, created directly on the device
x_train = torch.tensor([[0.5, 1.0]], dtype=torch.float32, device=device)  # Input
y_train = torch.tensor([[0.8]], dtype=torch.float32, device=device)  # Target output

# Loss function and optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.1)

# Training parameters
epochs = 2000

# Training loop
for epoch in range(epochs):
    optimizer.zero_grad()  # Clear gradients
    y_pred = model(x_train)  # Forward pass
    loss = criterion(y_pred, y_train)  # Compute loss
    loss.backward()  # Backpropagation
    optimizer.step()  # Update weights

    # Print loss every 100 epochs
    if epoch % 100 == 0:
        print(f"Epoch {epoch}, Loss: {loss.item():.5f}")

# Inside the loop, loss and y_pred are computed before optimizer.step(), so
# after the last iteration they describe the previous step's parameters.
# Re-evaluate once (no gradients needed) so the summary matches the weights
# printed below; this also keeps it well-defined when epochs == 0.
with torch.no_grad():
    y_pred = model(x_train)
    loss = criterion(y_pred, y_train)

# Final results
print(f"Final Loss: {loss.item():.5f}")
print(f"Final Prediction: {y_pred.item():.5f}")

# Print final weights and biases
print("\nFinal Weights and Biases:")
for name, param in model.named_parameters():
    print(f"{name}: {param.data}")


Epoch 0, Loss: 0.18135
Epoch 100, Loss: 0.00000
Epoch 200, Loss: 0.00000
Epoch 300, Loss: 0.00000
Epoch 400, Loss: 0.00000
Epoch 500, Loss: 0.00000
Epoch 600, Loss: 0.00000
Epoch 700, Loss: 0.00000
Epoch 800, Loss: 0.00000
Epoch 900, Loss: 0.00000
Epoch 1000, Loss: 0.00000
Epoch 1100, Loss: 0.00000
Epoch 1200, Loss: 0.00000
Epoch 1300, Loss: 0.00000
Epoch 1400, Loss: 0.00000
Epoch 1500, Loss: 0.00000
Epoch 1600, Loss: 0.00000
Epoch 1700, Loss: 0.00000
Epoch 1800, Loss: 0.00000
Epoch 1900, Loss: 0.00000
Final Loss: 0.00000
Final Prediction: 0.80000

Final Weights and Biases:
fc1.weight: tensor([[0.4970, 0.5740],
        [0.3243, 0.8622],
        [0.5269, 1.4981]])
fc1.bias: tensor([0.4515, 0.3853, 1.0734])
fc2.weight: tensor([[-1.1474, -0.8509, -0.9329],
        [ 0.1969,  0.7284,  1.2665],
        [ 1.4185,  1.4959,  1.4884]])
fc2.bias: tensor([-0.9211,  0.9989,  1.0929])
fc3.weight: tensor([[0.4140, 0.5326, 0.8230]])
fc3.bias: tensor([0.0506])


PyTorch 内置的 Adam 优化器比上面手写的梯度下降收敛得更快，得到的 final weights and biases 与手写版本不同，但同样正确：只有一个训练样本，而网络参数远多于约束，所以不同的初始值会收敛到满足要求的不同 final weights and biases。

In [4]:
# Verify the results above
import numpy as np

# Sigmoid activation function
def sigmoid(x):
    """Logistic sigmoid 1 / (1 + exp(-x)), evaluated without overflow.

    Accepts a scalar or array_like and returns values in [0, 1] with the
    same shape.  The exponential is only taken of a non-positive number, so
    large negative inputs do not overflow or emit a RuntimeWarning.
    """
    x = np.asarray(x)
    decay = np.exp(-np.abs(x))  # exp(-|x|) cannot overflow
    # x >= 0:  1 / (1 + exp(-x))
    # x <  0:  exp(x) / (1 + exp(x))   (same function, rewritten)
    numerator = np.where(x >= 0, np.ones_like(decay), decay)
    return numerator / (1 + decay)

# Input
X = np.array([0.5, 1])

# Weights and biases, copied from the PyTorch "Final Weights and Biases"
# printout (which shows only four decimal places)
w1 = np.array([[0.4970, 0.5740],
        [0.3243, 0.8622],
        [0.5269, 1.4981]])
b1 = np.array([0.4515, 0.3853, 1.0734])
w2 = np.array([[-1.1474, -0.8509, -0.9329],
        [ 0.1969,  0.7284,  1.2665],
        [ 1.4185,  1.4959,  1.4884]])
b2 = np.array([-0.9211,  0.9989,  1.0929])
w3 = np.array([[0.4140, 0.5326, 0.8230]])
b3 = np.array([0.0506])

# Calculate output: three linear layers, each followed by a sigmoid
z1 = np.dot(X, w1.T) + b1
a1 = sigmoid(z1)

z2 = np.dot(a1, w2.T) + b2
a2 = sigmoid(z2)

z3 = np.dot(a2, w3.T) + b3
output = sigmoid(z3)

print(output)

# Make the verification explicit instead of relying on reading the printout:
# the pasted parameters must reproduce the training target 0.8.  The absolute
# tolerance leaves room for the four-decimal rounding of the weights.
np.testing.assert_allclose(output, 0.8, rtol=0, atol=1e-4)

[0.79999391]
