# XOR 运算

In [1]:
import numpy as np

# 1. Data preparation
# The four XOR cases, enumerated in binary counting order.
X = np.array([[a, b] for a in (0, 1) for b in (0, 1)])   # inputs
y = np.array([[a ^ b] for a in (0, 1) for b in (0, 1)])  # targets (XOR)

# 2. Hyperparameters and parameter initialization
input_size, hidden_size, output_size = 2, 2, 1
learning_rate = 0.1

# Weights are drawn from a standard normal distribution; biases start at zero.
W1 = np.random.randn(input_size, hidden_size)
b1 = np.zeros((1, hidden_size))
W2 = np.random.randn(hidden_size, output_size)
b2 = np.zeros((1, output_size))

# 3. Activation function and its derivative
def sigmoid(x):
    """Apply the logistic function ``1 / (1 + exp(-x))`` element-wise."""
    denominator = 1 + np.exp(-x)
    return 1 / denominator

def sigmoid_derivative(x):
    """Return ``x * (1 - x)``, the sigmoid derivative expressed via its output.

    The result equals the derivative of the logistic function only when
    ``x`` is already an activation value ``sigmoid(z)``, not the
    pre-activation ``z`` itself.
    """
    complement = 1 - x
    return x * complement

# 4. Forward pass
def forward(X):
    """Run the network on ``X`` using the module-level ``W1, b1, W2, b2``.

    Returns the tuple ``(z1, a1, z2, a2)``: hidden pre-activation, hidden
    sigmoid activation, output pre-activation and output sigmoid activation.
    """
    hidden_pre = X @ W1 + b1
    hidden_act = sigmoid(hidden_pre)
    output_pre = hidden_act @ W2 + b2
    output_act = sigmoid(output_pre)
    return hidden_pre, hidden_act, output_pre, output_act

# 5. Loss function
def compute_loss(y, y_hat, eps=1e-12):
    """Return the mean binary cross-entropy between ``y`` and ``y_hat``.

    Parameters
    ----------
    y : array_like
        Target values.
    y_hat : array_like
        Predicted probabilities, broadcastable against ``y``.
    eps : float, optional
        Predictions are clipped to ``[eps, 1 - eps]`` before the logarithms
        are taken.

    Returns
    -------
    float
        ``-mean(y * log(y_hat) + (1 - y) * log(1 - y_hat))`` evaluated on
        the clipped predictions.
    """
    # A saturated sigmoid can return exactly 0.0 or 1.0; without clipping,
    # log(0) turns the loss into inf, or nan through 0 * -inf.
    y_hat = np.clip(y_hat, eps, 1 - eps)
    return -np.mean(y * np.log(y_hat) + (1 - y) * np.log(1 - y_hat))

# 6. Backward pass
def backward(X, y, z1, a1, z2, a2):
    """Compute gradients of the mean cross-entropy loss for all parameters.

    Parameters
    ----------
    X : ndarray
        Inputs that were fed to the forward pass.
    y : ndarray
        Targets; ``y.shape[0]`` is used as the sample count ``m``.
    z1, z2 : ndarray
        Pre-activations from the forward pass. Not used here; kept so the
        signature matches the tuple returned by ``forward``.
    a1, a2 : ndarray
        Hidden and output sigmoid activations from the forward pass.

    Returns
    -------
    tuple
        ``(dW1, db1, dW2, db2)``. Every gradient is averaged over the ``m``
        samples, and the bias gradients keep a leading axis of length 1.
    """
    m = y.shape[0]

    # With a sigmoid output and cross-entropy loss, dL/dz2 reduces to a2 - y.
    dz2 = a2 - y
    dW2 = np.dot(a1.T, dz2) / m
    # Bias gradients are averaged over the batch like the weight gradients,
    # so both match the mean-based loss.
    db2 = np.sum(dz2, axis=0, keepdims=True) / m

    da1 = np.dot(dz2, W2.T)
    # sigmoid_derivative expects the activation a1, not the pre-activation.
    dz1 = da1 * sigmoid_derivative(a1)
    dW1 = np.dot(X.T, dz1) / m
    db1 = np.sum(dz1, axis=0, keepdims=True) / m

    return dW1, db1, dW2, db2

# 7. Training loop
def train(X, y, epochs=5000):
    """Fit the module-level parameters with full-batch gradient descent.

    Each epoch runs one forward pass, evaluates the loss, backpropagates and
    updates ``W1``, ``b1``, ``W2`` and ``b2`` in place. The loss is printed
    every 500 epochs.
    """
    global W1, b1, W2, b2
    report_every = 500
    for epoch in range(epochs):
        activations = forward(X)
        loss = compute_loss(y, activations[-1])
        grad_W1, grad_b1, grad_W2, grad_b2 = backward(X, y, *activations)

        # Plain gradient-descent step on every parameter.
        W1 -= learning_rate * grad_W1
        b1 -= learning_rate * grad_b1
        W2 -= learning_rate * grad_W2
        b2 -= learning_rate * grad_b2

        if epoch % report_every == 0:
            print(f"Epoch {epoch}, Loss: {loss}")

# Entry point: train the network on the XOR data
train(X, y)

# Inference
def predict(X):
    """Return the network output for ``X`` after rounding with ``np.round``."""
    output_act = forward(X)[-1]
    return np.round(output_act)

# Show the rounded network outputs for the XOR inputs
print("Predictions:")
print(predict(X))


Epoch 0, Loss: 0.7129708765647914
Epoch 500, Loss: 0.6662447984628146
Epoch 1000, Loss: 0.6069400201110507
Epoch 1500, Loss: 0.5457049644088914
Epoch 2000, Loss: 0.4785422643363517
Epoch 2500, Loss: 0.30278341986065305
Epoch 3000, Loss: 0.15330024419141391
Epoch 3500, Loss: 0.09219490273970883
Epoch 4000, Loss: 0.06405129411766428
Epoch 4500, Loss: 0.04851722034181673
Predictions:
[[0.]
 [1.]
 [1.]
 [0.]]


下面是一个稍微复杂的两层神经网络，它具有L2正则化和动量优化：

1. **L2正则化**是防止过拟合的常用技术，它在损失函数中添加了一个与权重平方和成正比的惩罚项。

2. **动量优化**是一种模拟物理中的动量概念的优化技术，可以帮助加速学习。

In [2]:
import numpy as np

# 1. Data preparation
X = np.array([[a, b] for a in (0, 1) for b in (0, 1)])   # inputs
y = np.array([[a ^ b] for a in (0, 1) for b in (0, 1)])  # targets (XOR)

# 2. Hyperparameters and parameter initialization
input_size, hidden_size, output_size = 2, 3, 1
learning_rate = 0.1
reg_lambda = 0.01  # L2 regularization coefficient
momentum = 0.9     # momentum coefficient

W1 = np.random.randn(input_size, hidden_size)
b1 = np.zeros((1, hidden_size))
W2 = np.random.randn(hidden_size, output_size)
b2 = np.zeros((1, output_size))

# Previous update applied to each parameter (momentum state), initially zero.
dW1_prev, db1_prev, dW2_prev, db2_prev = (
    np.zeros_like(param) for param in (W1, b1, W2, b2)
)

# 3. Activation function and its derivative
def sigmoid(x):
    """Return the element-wise logistic function of ``x``."""
    exp_neg = np.exp(-x)
    return 1 / (1 + exp_neg)

def sigmoid_derivative(x):
    """Return ``x * (1 - x)`` element-wise.

    This is the derivative of the sigmoid written in terms of its output,
    so ``x`` has to be a sigmoid activation rather than a pre-activation.
    """
    one_minus_x = 1 - x
    return x * one_minus_x

# 4. Forward pass
def forward(X):
    """Propagate ``X`` through both layers using the module-level parameters.

    Returns ``(z1, a1, z2, a2)``: the pre-activation and sigmoid activation
    of the hidden layer followed by those of the output layer.
    """
    hidden_pre = X @ W1 + b1
    hidden_act = sigmoid(hidden_pre)
    output_pre = hidden_act @ W2 + b2
    output_act = sigmoid(output_pre)
    return hidden_pre, hidden_act, output_pre, output_act

# 5. Loss function (with L2 regularization)
def compute_loss(y, y_hat, eps=1e-12):
    """Return mean binary cross-entropy plus the L2 penalty on the weights.

    Parameters
    ----------
    y : ndarray
        Target values; ``y.shape[0]`` is used as the sample count ``m``.
    y_hat : array_like
        Predicted probabilities, broadcastable against ``y``.
    eps : float, optional
        Predictions are clipped to ``[eps, 1 - eps]`` before the logarithms
        are taken.

    Returns
    -------
    float
        Cross-entropy plus ``reg_lambda / (2 * m)`` times the sum of the
        squared entries of the module-level ``W1`` and ``W2``.
    """
    m = y.shape[0]
    # Clipping keeps log() finite when the sigmoid saturates to exactly 0 or 1.
    y_hat = np.clip(y_hat, eps, 1 - eps)
    cross_entropy = -np.mean(y * np.log(y_hat) + (1 - y) * np.log(1 - y_hat))
    squared_weights = np.sum(np.square(W1)) + np.sum(np.square(W2))
    L2_regularization = squared_weights * reg_lambda / (2 * m)
    return cross_entropy + L2_regularization

# 6. Backward pass (with L2 regularization)
def backward(X, y, z1, a1, z2, a2):
    """Compute gradients of the L2-regularized mean cross-entropy loss.

    Parameters
    ----------
    X : ndarray
        Inputs that were fed to the forward pass.
    y : ndarray
        Targets; ``y.shape[0]`` is used as the sample count ``m``.
    z1, z2 : ndarray
        Pre-activations from the forward pass. Not used here; kept so the
        signature matches the tuple returned by ``forward``.
    a1, a2 : ndarray
        Hidden and output sigmoid activations from the forward pass.

    Returns
    -------
    tuple
        ``(dW1, db1, dW2, db2)``. Data terms are averaged over the ``m``
        samples; the weight gradients also include ``reg_lambda / m * W``.
        Biases are not regularized.
    """
    m = y.shape[0]

    # With a sigmoid output and cross-entropy loss, dL/dz2 reduces to a2 - y.
    dz2 = a2 - y
    dW2 = np.dot(a1.T, dz2) / m + reg_lambda / m * W2
    # Bias gradients are averaged over the batch like the weight gradients,
    # so both match the mean-based loss.
    db2 = np.sum(dz2, axis=0, keepdims=True) / m

    da1 = np.dot(dz2, W2.T)
    # sigmoid_derivative expects the activation a1, not the pre-activation.
    dz1 = da1 * sigmoid_derivative(a1)
    dW1 = np.dot(X.T, dz1) / m + reg_lambda / m * W1
    db1 = np.sum(dz1, axis=0, keepdims=True) / m

    return dW1, db1, dW2, db2

# 7. Training loop (gradient descent with momentum)
def train(X, y, epochs=5000):
    """Fit the module-level parameters using momentum updates.

    For every parameter the applied step is
    ``momentum * previous_step + learning_rate * gradient``. The step is
    subtracted from the parameter in place and stored in the matching
    ``*_prev`` global for the next epoch. The loss is printed every 500
    epochs.
    """
    global W1, b1, W2, b2, dW1_prev, db1_prev, dW2_prev, db2_prev

    for epoch in range(epochs):
        activations = forward(X)
        loss = compute_loss(y, activations[-1])
        grad_W1, grad_b1, grad_W2, grad_b2 = backward(X, y, *activations)

        # Blend the previous step with the current gradient.
        step_W1 = momentum * dW1_prev + learning_rate * grad_W1
        step_b1 = momentum * db1_prev + learning_rate * grad_b1
        step_W2 = momentum * dW2_prev + learning_rate * grad_W2
        step_b2 = momentum * db2_prev + learning_rate * grad_b2

        W1 -= step_W1
        b1 -= step_b1
        W2 -= step_W2
        b2 -= step_b2

        # Remember this step for the next iteration.
        dW1_prev, db1_prev, dW2_prev, db2_prev = step_W1, step_b1, step_W2, step_b2

        if epoch % 500 == 0:
            print(f"Epoch {epoch}, Loss: {loss}")

# Entry point: train the regularized, momentum-based network on the XOR data
train(X, y)

# Inference
def predict(X):
    """Return the output-layer activation for ``X`` rounded with ``np.round``."""
    output_act = forward(X)[-1]
    return np.round(output_act)

# Show the rounded network outputs for the XOR inputs
print("Predictions:")
print(predict(X))

Epoch 0, Loss: 0.7380991039527407
Epoch 500, Loss: 0.29521127808968833
Epoch 1000, Loss: 0.2931867552712476
Epoch 1500, Loss: 0.2931153132380433
Epoch 2000, Loss: 0.2931094021843648
Epoch 2500, Loss: 0.2931089005887017
Epoch 3000, Loss: 0.2931088576785913
Epoch 3500, Loss: 0.2931088539846852
Epoch 4000, Loss: 0.2931088536649288
Epoch 4500, Loss: 0.29310885363710903
Predictions:
[[0.]
 [1.]
 [1.]
 [0.]]


# 手写数字识别

我们设计一个3层的神经网络（输入层、隐藏层、输出层）来解决手写数字识别问题，并使用经典的MNIST数据集进行训练。我们将在隐藏层使用ReLU激活函数，在输出层使用Softmax。

1. **数据准备**：我们首先使用keras的API来加载MNIST数据。
2. **神经网络设计**：使用3层网络。输入层、一个隐藏层和一个输出层。

步骤如下：



下面我们从头开始，逐段解析代码的内容和意义：

1. **导入必要的库**:
```python
import numpy as np
from keras.datasets import mnist
from keras.utils import np_utils
```
这部分代码从`numpy`和`keras`导入所需的库和函数。`numpy`是用于数学和线性代数运算的库，而`keras`用于导入MNIST数据集和执行数据预处理。

2. **加载和预处理数据**:
```python
(X_train, y_train), (X_test, y_test) = mnist.load_data()

X_train = X_train.reshape(60000, 784).astype('float32') / 255
X_test = X_test.reshape(10000, 784).astype('float32') / 255
y_train = np_utils.to_categorical(y_train, 10)
y_test = np_utils.to_categorical(y_test, 10)
```
在这里，我们首先使用`mnist.load_data()`加载MNIST数据集，该数据集包含手写数字的图像。接着，我们对数据进行预处理：将28x28的图像数据扁平化为784维向量，并将其归一化（除以255，使其在0到1的范围内）。最后，我们将目标变量（数字0-9）转化为“one-hot”编码。

3. **初始化网络参数**:
```python
input_size = 784
hidden_size = 128
output_size = 10
learning_rate = 0.01
epochs = 10
batch_size = 32

W1 = np.random.randn(input_size, hidden_size) * 0.01
b1 = np.zeros((1, hidden_size))
W2 = np.random.randn(hidden_size, output_size) * 0.01
b2 = np.zeros((1, output_size))
```
在这部分，我们定义了网络的结构和参数。该网络包含一个输入层、一个隐藏层和一个输出层。`input_size`、`hidden_size`和`output_size`定义了每一层的节点数。接着，我们初始化权重`W1`和`W2`及偏置`b1`和`b2`。权重是随机初始化的，而偏置则初始化为零。

4. **定义激活函数和它们的导数**:
```python
def relu(x):
    return np.maximum(0, x)

def relu_derivative(x):
    return np.where(x > 0, 1, 0)

def softmax(x):
    ...
```
这部分定义了两个激活函数：ReLU函数和Softmax函数。ReLU是隐藏层的激活函数，而Softmax则用于输出层。为了反向传播，我们还定义了ReLU函数的导数。

5. **定义前向传播和损失函数**:
```python
def forward(X):
    ...

def compute_loss(y, y_hat):
    ...
```
前向传播函数`forward`接受输入`X`并返回网络的输出。损失函数`compute_loss`计算了真实标签`y`和预测值`y_hat`之间的交叉熵损失。

6. **定义反向传播**:
```python
def backward(X, y, z1, a1, a2):
    ...
```
这个函数根据前向传播的结果和真实标签来计算权重和偏置的梯度。

7. **训练模型**:
```python
def train(X, y):
    ...
```
这个函数执行神经网络的训练过程。在每个epoch，它执行前向和反向传播，并更新权重和偏置。

8. **评估模型**:
```python
def evaluate(X, y):
    ...
```
这个函数评估了模型在给定数据上的性能，返回分类的准确率。

最后，通过调用`train`函数来训练模型，并使用`evaluate`函数来评估在测试集上的性能。

这就是代码的全面解释。希望这有助于您更好地理解它!

In [None]:
import numpy as np
from keras.datasets import mnist
from keras.utils import np_utils

# Load the MNIST data
(X_train, y_train), (X_test, y_test) = mnist.load_data()

# Preprocess: flatten every image into one row, scale by 1/255, and convert
# the labels to one-hot vectors. The row count is taken from the array itself
# rather than hard-coded, so subsets of the dataset work as well.
X_train = X_train.reshape(X_train.shape[0], -1).astype('float32') / 255
X_test = X_test.reshape(X_test.shape[0], -1).astype('float32') / 255
# NOTE(review): `keras.utils.np_utils` appears to be unavailable in recent
# Keras releases, where the function lives at `keras.utils.to_categorical`;
# confirm against the installed version.
y_train = np_utils.to_categorical(y_train, 10)
y_test = np_utils.to_categorical(y_test, 10)

# Network hyperparameters
input_size = 784        # input units (28*28 pixels)
hidden_size = 128       # hidden units
output_size = 10        # output units (digits 0-9)
learning_rate = 0.01    # gradient-descent step size
epochs = 10             # passes over the training set
batch_size = 32         # samples per mini-batch

# Small random weights, zero biases
W1 = np.random.randn(input_size, hidden_size) * 0.01
b1 = np.zeros((1, hidden_size))
W2 = np.random.randn(hidden_size, output_size) * 0.01
b2 = np.zeros((1, output_size))

# ReLU activation and its derivative
def relu(x):
    """Return the element-wise maximum of 0 and ``x``.

    Parameters
    ----------
    x : numpy array
        Input data or pre-activations.

    Returns
    -------
    numpy array
        ``x`` with every negative entry replaced by 0.
    """
    floor = 0
    rectified = np.maximum(floor, x)
    return rectified

def relu_derivative(x):
    """Return 1 where ``x`` is strictly positive and 0 elsewhere.

    Parameters
    ----------
    x : numpy array
        Input data or activations.

    Returns
    -------
    numpy array
        Integer array of 0/1 flags with the same shape as ``x``.
    """
    is_positive = np.asarray(x > 0)
    return is_positive.astype(int)

# Softmax function
def softmax(x):
    """Return the row-wise softmax of the 2-D array ``x``.

    Parameters
    ----------
    x : numpy array
        Logits; the normalization runs along axis 1.

    Returns
    -------
    numpy array
        Array of the same shape in which every row sums to 1.
    """
    # Subtracting the row maximum keeps exp() from overflowing and does not
    # change the result.
    row_max = np.max(x, axis=1, keepdims=True)
    exps = np.exp(x - row_max)
    row_totals = np.sum(exps, axis=1, keepdims=True)
    return exps / row_totals

# Forward pass
def forward(X):
    """Run the ReLU/softmax network on ``X`` with the module-level parameters.

    Returns ``(z1, a1, z2, a2)``: hidden pre-activation, hidden ReLU
    activation, output logits and output softmax probabilities.
    """
    hidden_pre = X @ W1 + b1           # affine map: input -> hidden
    hidden_act = relu(hidden_pre)      # hidden non-linearity
    logits = hidden_act @ W2 + b2      # affine map: hidden -> output
    probabilities = softmax(logits)    # output non-linearity
    return hidden_pre, hidden_act, logits, probabilities

# Cross-entropy loss
def compute_loss(y, y_hat, eps=1e-12):
    """Return the categorical cross-entropy averaged over the rows of ``y``.

    Parameters
    ----------
    y : ndarray
        One-hot targets; ``y.shape[0]`` is used as the sample count.
    y_hat : array_like
        Predicted class probabilities, same shape as ``y``.
    eps : float, optional
        Lower bound applied to ``y_hat`` before the logarithm is taken.

    Returns
    -------
    float
        ``-sum(y * log(y_hat)) / m`` evaluated on the clipped predictions.
    """
    m = y.shape[0]
    # Softmax can underflow to exactly 0.0; log(0) would make the loss inf,
    # or nan through 0 * -inf for the classes that are not the target.
    y_hat = np.clip(y_hat, eps, 1.0)
    return -np.sum(y * np.log(y_hat)) / m

# Backward pass
def backward(X, y, z1, a1, a2):
    """Compute gradients of the mean cross-entropy loss for all parameters.

    Parameters
    ----------
    X : numpy array
        Input batch that was fed to the forward pass.
    y : numpy array
        True labels in one-hot encoded format; ``y.shape[0]`` is used as the
        batch size ``m``.
    z1 : numpy array
        Hidden pre-activation from the forward pass. Not used here; kept for
        the caller's signature.
    a1, a2 : numpy arrays
        Hidden ReLU activation and output softmax probabilities.

    Returns
    -------
    tuple
        Gradients ``(dW1, db1, dW2, db2)``, each averaged over the ``m``
        samples of the batch.
    """
    m = y.shape[0]

    # Output layer: softmax with cross-entropy gives dL/dz2 = a2 - y.
    dz2 = a2 - y
    dW2 = np.dot(a1.T, dz2) / m
    # Bias gradients are averaged over the batch like the weight gradients;
    # otherwise the effective bias step would scale with the batch size.
    db2 = np.sum(dz2, axis=0, keepdims=True) / m

    # Hidden layer. a1 = relu(z1) is positive exactly where z1 is, so passing
    # a1 to relu_derivative yields the same mask as passing z1.
    da1 = np.dot(dz2, W2.T)
    dz1 = da1 * relu_derivative(a1)
    dW1 = np.dot(X.T, dz1) / m
    db1 = np.sum(dz1, axis=0, keepdims=True) / m

    return dW1, db1, dW2, db2

# Train the model
def train(X, y):
    """Train the module-level parameters with mini-batch gradient descent.

    Runs ``epochs`` passes over ``X`` and ``y`` in consecutive slices of
    ``batch_size`` rows (no shuffling), updates ``W1``, ``b1``, ``W2`` and
    ``b2`` in place after every batch, and prints the mean per-batch loss at
    the end of each epoch.

    Parameters
    ----------
    X : numpy array
        Training inputs, one sample per row.
    y : numpy array
        One-hot training labels aligned with the rows of ``X``.
    """
    global W1, b1, W2, b2

    for epoch in range(epochs):
        total_loss = 0
        num_batches = 0

        # Walk over the data one mini-batch at a time.
        for start in range(0, X.shape[0], batch_size):
            X_batch = X[start:start + batch_size]
            y_batch = y[start:start + batch_size]

            # Forward pass
            z1, a1, z2, a2 = forward(X_batch)
            total_loss += compute_loss(y_batch, a2)
            num_batches += 1

            # Backward pass
            dW1, db1, dW2, db2 = backward(X_batch, y_batch, z1, a1, a2)

            # Gradient-descent step
            W1 -= learning_rate * dW1
            b1 -= learning_rate * db1
            W2 -= learning_rate * dW2
            b2 -= learning_rate * db2

        # Average over the batches actually processed. X.shape[0] / batch_size
        # is fractional when the last batch is short, and zero for empty input.
        avg_loss = total_loss / num_batches if num_batches else float("nan")
        print(f"Epoch {epoch+1}/{epochs}, Loss: {avg_loss:.4f}")

# Train on the MNIST training split
train(X_train, y_train)

# Evaluate classification accuracy
def evaluate(X, y):
    """Return the fraction of rows of ``X`` that are classified correctly.

    Parameters
    ----------
    X : numpy array
        Input data.
    y : numpy array
        True labels in one-hot encoded format.

    Returns
    -------
    float
        Mean of the element-wise comparison between the arg-max of the
        network output and the arg-max of ``y`` along axis 1.
    """
    probabilities = forward(X)[-1]
    hits = np.argmax(probabilities, axis=1) == np.argmax(y, axis=1)
    return np.mean(hits)

# Report accuracy on the test split as a percentage
print(f"Accuracy: {evaluate(X_test, y_test) * 100:.2f}%")


下面给出一个使用L2正则化的两层全连接神经网络，其隐藏层使用sigmoid激活函数。


- 我们将使用两层全连接的神经网络，包括一个隐藏层。
- 使用L2正则化以减少过拟合。
- 隐藏层使用sigmoid激活函数，因为它是神经网络历史上的经典激活函数；输出层仍使用Softmax。
- 使用交叉熵作为损失函数。
- 执行前向传播、计算损失、后向传播，并更新权重。


In [None]:
import numpy as np
from keras.datasets import mnist
from keras.utils import np_utils

# Load the MNIST data
(X_train, y_train), (X_test, y_test) = mnist.load_data()

# Preprocess: flatten every image into one row, scale by 1/255, and convert
# the labels to one-hot vectors. The row count is taken from the array itself
# rather than hard-coded, so subsets of the dataset work as well.
X_train = X_train.reshape(X_train.shape[0], -1).astype('float32') / 255
X_test = X_test.reshape(X_test.shape[0], -1).astype('float32') / 255
# NOTE(review): `keras.utils.np_utils` appears to be unavailable in recent
# Keras releases, where the function lives at `keras.utils.to_categorical`;
# confirm against the installed version.
y_train = np_utils.to_categorical(y_train, 10)
y_test = np_utils.to_categorical(y_test, 10)

# Network hyperparameters
input_size = 784
hidden_size = 128
output_size = 10
learning_rate = 0.01
epochs = 10
batch_size = 32
lambd = 0.7  # L2 regularization coefficient

# Small random weights, zero biases
W1 = np.random.randn(input_size, hidden_size) * 0.01
b1 = np.zeros((1, hidden_size))
W2 = np.random.randn(hidden_size, output_size) * 0.01
b2 = np.zeros((1, output_size))

# Sigmoid activation and its derivative
def sigmoid(x):
    """Return ``1 / (1 + exp(-x))`` element-wise."""
    decay = np.exp(-x)
    return 1 / (1 + decay)

def sigmoid_derivative(x):
    """Return ``x * (1 - x)`` element-wise.

    This matches the sigmoid derivative only when ``x`` is a sigmoid
    activation, which is how ``backward`` calls it.
    """
    remainder = 1 - x
    return x * remainder

# Softmax function
def softmax(x):
    """Return the softmax of ``x`` computed independently for every row."""
    # Shift each row by its maximum so exp() cannot overflow.
    shifted = x - np.max(x, axis=1, keepdims=True)
    weights = np.exp(shifted)
    normalizer = np.sum(weights, axis=1, keepdims=True)
    return weights / normalizer

# Forward pass
def forward(X):
    """Run the sigmoid/softmax network on ``X`` with the module-level parameters.

    Returns ``(z1, a1, z2, a2)``: hidden pre-activation, hidden sigmoid
    activation, output logits and output softmax probabilities.
    """
    hidden_pre = X @ W1 + b1
    hidden_act = sigmoid(hidden_pre)
    logits = hidden_act @ W2 + b2
    probabilities = softmax(logits)
    return hidden_pre, hidden_act, logits, probabilities

# Loss with L2 regularization
def compute_loss(y, y_hat, eps=1e-12):
    """Return categorical cross-entropy plus the L2 penalty on the weights.

    Parameters
    ----------
    y : ndarray
        One-hot targets; ``y.shape[0]`` is used as the sample count ``m``.
    y_hat : array_like
        Predicted class probabilities, same shape as ``y``.
    eps : float, optional
        Lower bound applied to ``y_hat`` before the logarithm is taken.

    Returns
    -------
    float
        ``-sum(y * log(y_hat)) / m`` plus ``lambd / (2 * m)`` times the sum
        of the squared entries of the module-level ``W1`` and ``W2``.
    """
    m = y.shape[0]
    # Softmax can underflow to exactly 0.0; clipping keeps log() finite.
    y_hat = np.clip(y_hat, eps, 1.0)
    cross_entropy = -np.sum(y * np.log(y_hat)) / m
    squared_weights = np.sum(np.square(W1)) + np.sum(np.square(W2))
    L2_regularization_cost = (lambd / (2 * m)) * squared_weights
    return cross_entropy + L2_regularization_cost

# Backward pass
def backward(X, y, z1, a1, a2):
    """Compute gradients of the L2-regularized mean cross-entropy loss.

    Parameters
    ----------
    X : ndarray
        Input batch that was fed to the forward pass.
    y : ndarray
        One-hot targets; ``y.shape[0]`` is used as the batch size ``m``.
    z1 : ndarray
        Hidden pre-activation from the forward pass. Not used here; kept for
        the caller's signature.
    a1, a2 : ndarray
        Hidden sigmoid activation and output softmax probabilities.

    Returns
    -------
    tuple
        ``(dW1, db1, dW2, db2)``. Data terms are averaged over the ``m``
        samples; the weight gradients also include ``lambd / m * W``.
        Biases are not regularized.
    """
    m = y.shape[0]

    # Softmax with cross-entropy gives dL/dz2 = a2 - y.
    dz2 = a2 - y
    dW2 = np.dot(a1.T, dz2) / m + (lambd / m) * W2
    # Bias gradients are averaged over the batch like the weight gradients;
    # otherwise the effective bias step would scale with the batch size.
    db2 = np.sum(dz2, axis=0, keepdims=True) / m

    da1 = np.dot(dz2, W2.T)
    # sigmoid_derivative expects the activation a1, not the pre-activation.
    dz1 = da1 * sigmoid_derivative(a1)
    dW1 = np.dot(X.T, dz1) / m + (lambd / m) * W1
    db1 = np.sum(dz1, axis=0, keepdims=True) / m

    return dW1, db1, dW2, db2

# Train the network
def train(X, y):
    """Train the module-level parameters with mini-batch gradient descent.

    Runs ``epochs`` passes over ``X`` and ``y`` in consecutive slices of
    ``batch_size`` rows (no shuffling), updates ``W1``, ``b1``, ``W2`` and
    ``b2`` in place after every batch, and prints the mean per-batch loss at
    the end of each epoch.

    Parameters
    ----------
    X : ndarray
        Training inputs, one sample per row.
    y : ndarray
        One-hot training labels aligned with the rows of ``X``.
    """
    global W1, b1, W2, b2

    for epoch in range(epochs):
        total_loss = 0
        num_batches = 0
        for start in range(0, X.shape[0], batch_size):
            X_batch = X[start:start + batch_size]
            y_batch = y[start:start + batch_size]

            z1, a1, z2, a2 = forward(X_batch)
            total_loss += compute_loss(y_batch, a2)
            num_batches += 1

            dW1, db1, dW2, db2 = backward(X_batch, y_batch, z1, a1, a2)

            W1 -= learning_rate * dW1
            b1 -= learning_rate * db1
            W2 -= learning_rate * dW2
            b2 -= learning_rate * db2

        # Average over the batches actually processed. X.shape[0] / batch_size
        # is fractional when the last batch is short, and zero for empty input.
        avg_loss = total_loss / num_batches if num_batches else float("nan")
        print(f"Epoch {epoch+1}/{epochs}, Loss: {avg_loss:.4f}")

# Train on the MNIST training split
train(X_train, y_train)

# Evaluate the model
def evaluate(X, y):
    """Return the fraction of rows of ``X`` whose predicted class matches ``y``.

    Both the predicted class and the true class are obtained with an
    arg-max along axis 1: of the network output and of ``y`` respectively.
    """
    probabilities = forward(X)[-1]
    matches = np.argmax(probabilities, axis=1) == np.argmax(y, axis=1)
    return np.mean(matches)

# Report accuracy on the test split as a percentage
print(f"Accuracy: {evaluate(X_test, y_test) * 100:.2f}%")
