In [14]:
import numpy as np

from sklearn.model_selection import train_test_split
from tqdm import tqdm  # 用于显示训练进度


In [15]:
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def relu(x):
    return np.maximum(0, x)

def tanh(x):
    return np.tanh(x)

def softmax(x):
    exps = np.exp(x - np.max(x, axis=-1, keepdims=True))
    return exps / np.sum(exps, axis=-1, keepdims=True)

def softmax_crossentropy(y_pred, y_true):
    m = y_pred.shape[0]
    log_likelihood = -np.log(y_pred[range(m), y_true.argmax(axis=1)])
    loss = np.sum(log_likelihood) / m
    return loss

def sigmoid_derivative(x):
    return sigmoid(x) * (1 - sigmoid(x))

def relu_derivative(x):
    return np.where(x > 0, 1, 0)

def tanh_derivative(x):
    return 1 - np.tanh(x)**2

def l2_regularization(weights, lambda_param):
    return 0.5 * lambda_param * np.sum(weights**2)

def l1_regularization(weights, lambda_param):
    return lambda_param * np.sum(np.abs(weights))


In [16]:
class ConvLayer:
    def __init__(self, num_filters, filter_size, num_channels, stride=1, padding='same', weight_decay=0.0):
        self.num_filters = num_filters
        self.filter_size = filter_size
        self.num_channels = num_channels
        self.stride = stride
        self.padding = padding
        self.filters = np.random.randn(num_filters, filter_size, filter_size, num_channels) * 0.1
        self.weight_decay = weight_decay
        self.biases = np.zeros((num_filters, 1))

    def forward(self, inputs):
        self.inputs = inputs
        batch_size, input_height, input_width, input_channels = inputs.shape
        output_height = (input_height - self.filter_size + 2 * (self.padding == 'same')) // self.stride + 1
        output_width = (input_width - self.filter_size + 2 * (self.padding == 'same')) // self.stride + 1
        self.outputs = np.zeros((batch_size, output_height, output_width, self.num_filters))

        padded_inputs = np.pad(inputs, [(0, 0), (self.filter_size // 2, self.filter_size // 2), (self.filter_size // 2, self.filter_size // 2), (0, 0)], mode='constant')

        for y in range(output_height):
            for x in range(output_width):
                for f in range(self.num_filters):
                    y_start = y * self.stride
                    y_end = y_start + self.filter_size
                    x_start = x * self.stride
                    x_end = x_start + self.filter_size
                    self.outputs[:, y, x, f] = np.sum(padded_inputs[:, y_start:y_end, x_start:x_end, :] * self.filters[f, :, :, :], axis=(1, 2, 3)) + self.biases[f]

        return self.outputs

    def backward(self, dloss_dout, learning_rate):
        batch_size, input_height, input_width, input_channels = self.inputs.shape
        dloss_dfilters = np.zeros_like(self.filters)
        dloss_dbiases = np.zeros_like(self.biases)
        padded_inputs = np.pad(self.inputs, [(0, 0), (self.filter_size // 2, self.filter_size // 2), (self.filter_size // 2, self.filter_size // 2), (0, 0)], mode='constant')
        dloss_dinputs = np.zeros_like(padded_inputs)

        for y in range(self.outputs.shape[1]):
            for x in range(self.outputs.shape[2]):
                for f in range(self.num_filters):
                    y_start = y * self.stride
                    y_end = y_start + self.filter_size
                    x_start = x * self.stride
                    x_end = x_start + self.filter_size
                    dloss_dfilters[f, :, :, :] += np.sum(np.expand_dims(dloss_dout[:, y, x, f], axis=-1) * padded_inputs[:, y_start:y_end, x_start:x_end, :], axis=0)
                    dloss_dbiases[f] += np.sum(dloss_dout[:, y, x, f], axis=0)
                    dloss_dinputs[:, y_start:y_end, x_start:x_end, :] += np.expand_dims(dloss_dout[:, y, x, f], axis=-1) * self.filters[f, :, :, :]

        self.filters -= learning_rate * dloss_dfilters / batch_size
        self.biases -= learning_rate * dloss_dbiases / batch_size

        dloss_dinputs = dloss_dinputs[:, self.filter_size // 2:-self.filter_size // 2, self.filter_size // 2:-self.filter_size // 2, :]

        return dloss_dinputs
    
    def regularization_loss(self):
        return 0.5 * self.weight_decay * np.sum(self.filters ** 2)


In [17]:
class MaxPoolLayer:
    def __init__(self, pool_size=2, stride=2):
        self.pool_size = pool_size
        self.stride = stride

    def forward(self, inputs):
        self.inputs = inputs
        batch_size, input_height, input_width, input_channels = inputs.shape
        output_height = (input_height - self.pool_size) // self.stride + 1
        output_width = (input_width - self.pool_size) // self.stride + 1
        self.outputs = np.zeros((batch_size, output_height, output_width, input_channels))

        for y in range(output_height):
            for x in range(output_width):
                for c in range(input_channels):
                    y_start = y * self.stride
                    y_end = y_start + self.pool_size
                    x_start = x * self.stride
                    x_end = x_start + self.pool_size
                    self.outputs[:, y, x, c] = np.max(inputs[:, y_start:y_end, x_start:x_end, c], axis=(1, 2))

        return self.outputs

    def backward(self, dloss_dout, learning_rate):
        batch_size, input_height, input_width, input_channels = self.inputs.shape
        dloss_dinputs = np.zeros_like(self.inputs)

        for y in range(dloss_dout.shape[1]):
            for x in range(dloss_dout.shape[2]):
                for c in range(input_channels):
                    y_start = y * self.stride
                    y_end = y_start + self.pool_size
                    x_start = x * self.stride
                    x_end = x_start + self.pool_size
                    max_pool = np.max(self.inputs[:, y_start:y_end, x_start:x_end, c], axis=(1, 2), keepdims=True)
                    mask = (self.inputs[:, y_start:y_end, x_start:x_end, c] == max_pool)
                    dloss_dinputs[:, y_start:y_end, x_start:x_end, c] += mask * np.expand_dims(dloss_dout[:, y, x, c], axis=-1)

        return dloss_dinputs
    
    def regularization_loss(self):
        return 0


In [18]:
class DenseLayer:
    def __init__(self, input_size, output_size, weight_decay=0.0):
        self.weight_decay = weight_decay
        self.weights = np.random.randn(input_size, output_size) * np.sqrt(2.0 / input_size)
        self.biases = np.zeros((1, output_size))

    def forward(self, inputs):
        self.inputs = inputs
        self.outputs = np.dot(inputs, self.weights) + self.biases
        return self.outputs

    def backward(self, dloss_dout, learning_rate):
        dloss_dweights = np.dot(self.inputs.T, dloss_dout)
        dloss_dbiases = np.sum(dloss_dout, axis=0, keepdims=True)
        dloss_dinputs = np.dot(dloss_dout, self.weights.T)

        self.weights -= learning_rate * dloss_dweights
        self.biases -= learning_rate * dloss_dbiases

        return dloss_dinputs
    
    def regularization_loss(self):
        return 0.5 * self.weight_decay * np.sum(self.weights ** 2)


In [19]:
class BatchNormLayer:
    def __init__(self, num_features, epsilon=1e-5, momentum=0.9, weight_decay=0.0):
        self.weight_decay = weight_decay
        self.epsilon = epsilon
        self.momentum = momentum
        self.gamma = np.ones((1, num_features))
        self.beta = np.zeros((1, num_features))
        self.running_mean = np.zeros((1, num_features))
        self.running_var = np.ones((1, num_features))
        self.batch_size = None
        self.x_normalized = None

    def forward(self, inputs, mode='train'):
        self.inputs_shape = inputs.shape
        if mode == 'train':
            self.batch_size, self.num_features = inputs.shape[0], np.prod(inputs.shape[1:])
            self.mean = np.mean(inputs, axis=0)
            self.variance = np.var(inputs, axis=0)
            self.x_normalized = (inputs - self.mean) / np.sqrt(self.variance + self.epsilon)
            self.running_mean = self.momentum * self.running_mean + (1 - self.momentum) * self.mean
            self.running_var = self.momentum * self.running_var + (1 - self.momentum) * self.variance
        else:
            self.x_normalized = (inputs - self.running_mean) / np.sqrt(self.running_var + self.epsilon)

        self.outputs = self.gamma * self.x_normalized + self.beta
        return self.outputs

    def backward(self, dloss_dout, learning_rate):
        dgamma = np.sum(dloss_dout * self.x_normalized, axis=0, keepdims=True)
        dbeta = np.sum(dloss_dout, axis=0, keepdims=True)
        dx_normalized = dloss_dout * self.gamma
        dvariance = np.sum(dx_normalized * (self.inputs - self.mean) * -0.5 * np.power(self.variance + self.epsilon, -1.5), axis=0, keepdims=True)
        dmean = np.sum(dx_normalized * -1.0 / np.sqrt(self.variance + self.epsilon), axis=0, keepdims=True) + dvariance * np.mean(-2.0 * (self.inputs - self.mean), axis=0, keepdims=True)
        dinputs = (dx_normalized * 1.0 / np.sqrt(self.variance + self.epsilon)) + (dvariance * 2.0 * (self.inputs - self.mean) / self.batch_size) + (dmean / self.batch_size)
        self.gamma -= learning_rate * dgamma
        self.beta -= learning_rate * dbeta
        return dinputs.reshape(self.inputs_shape)
    
    def regularization_loss(self):
        reg_loss = 0.5 * self.weight_decay * (np.sum(self.gamma ** 2) + np.sum(self.beta ** 2))
        return reg_loss


In [20]:
class DropoutLayer:
    def __init__(self, dropout_prob):
        self.dropout_prob = dropout_prob
        self.mask = None

    def forward(self, inputs, mode='train'):
        if mode == 'train':
            self.mask = np.random.binomial(1, 1 - self.dropout_prob, size=inputs.shape) / (1 - self.dropout_prob)
            self.outputs = inputs * self.mask
        else:
            self.outputs = inputs
        return self.outputs

    def backward(self, dloss_dout, learning_rate):
        return dloss_dout * self.mask
    
    def regularization_loss(self):
        return 0


In [21]:
class ActivationLayer:
    def __init__(self, activation):
        self.activation = activation
        self.inputs = None

    def forward(self, inputs):
        self.inputs = inputs
        if self.activation == 'relu':
            return np.maximum(0, inputs)
        elif self.activation == 'sigmoid':
            return 1 / (1 + np.exp(-inputs))
        elif self.activation == 'tanh':
            return np.tanh(inputs)
        elif self.activation == "softmax":
            return self.softmax(inputs)
        else:
            raise ValueError(f'Unsupported activation function: {self.activation}')
        
    def softmax(self, inputs):
        exp_values = np.exp(inputs - np.max(inputs, axis=1, keepdims=True))
        probabilities = exp_values / np.sum(exp_values, axis=1, keepdims=True)
        return probabilities

    def backward(self, dloss_dout, learning_rate):
        if self.activation == 'relu':
            return dloss_dout * (self.inputs > 0)
        elif self.activation == 'sigmoid':
            sigmoid_output = 1 / (1 + np.exp(-self.inputs))
            return dloss_dout * sigmoid_output * (1 - sigmoid_output)
        elif self.activation == 'tanh':
            return
    def regularization_loss(self):
        return 0

In [22]:
class FlattenLayer:
    def __init__(self):
        self.input_shape = None

    def forward(self, inputs):
        self.input_shape = inputs.shape
        return inputs.reshape(self.input_shape[0], -1)

    def backward(self, dloss_dout, learning_rate):
        return dloss_dout.reshape(self.input_shape)
    
    def regularization_loss(self):
        return 0


In [23]:
class VGGNet:
    def __init__(self, input_shape, num_classes):
        self.layers = [
            ConvLayer(num_filters=32, filter_size=3, num_channels=input_shape[2]),
            BatchNormLayer(num_features=32),
            ActivationLayer(activation='relu'),
            ConvLayer(num_filters=32, filter_size=3, num_channels=32),
            BatchNormLayer(num_features=32),
            ActivationLayer(activation='relu'),
            MaxPoolLayer(pool_size=2, stride=2),
            DropoutLayer(dropout_prob=0.25),
            
            ConvLayer(num_filters=64, filter_size=3, num_channels=32),
            BatchNormLayer(num_features=64),
            ActivationLayer(activation='relu'),
            ConvLayer(num_filters=64, filter_size=3, num_channels=64),
            BatchNormLayer(num_features=64),
            ActivationLayer(activation='relu'),
            MaxPoolLayer(pool_size=2, stride=2),
            DropoutLayer(dropout_prob=0.25),
            
            FlattenLayer(),
            DenseLayer(input_size=64*8*8, output_size=512),
            BatchNormLayer(num_features=512),
            ActivationLayer(activation='relu'),
            DropoutLayer(dropout_prob=0.5),
            DenseLayer(input_size=512, output_size=num_classes),  # Output layer
            ActivationLayer(activation='softmax')  # Softmax activation for output layer
        ]

    def forward(self, X):
        for layer in self.layers:
            X = layer.forward(X)
        return X

    def backward(self, loss_grad, learning_rate):
        for layer in reversed(self.layers):
            loss_grad = layer.backward(loss_grad, learning_rate)
        return loss_grad

    def predict(self, X):
        logits = self.forward(X)
        return np.argmax(logits, axis=1)

    def evaluate(self, X, y):
        y_pred = self.predict(X)
        accuracy = np.mean(y_pred == np.argmax(y, axis=1))
        return accuracy
    
    def regularization_loss(self):
        reg_loss = 0.0
        for layer in self.layers:
            reg_loss += layer.regularization_loss()
        return reg_loss


In [24]:
def train(model, X_train, y_train, X_val, y_val, optimizer='sgd', learning_rate=0.01, epochs=10, batch_size=64, verbose=True):
    num_batches = X_train.shape[0] // batch_size
    train_losses = []
    val_losses = []

    if optimizer == 'sgd':
        optimizer = SGD()
    elif optimizer == 'adam':
        optimizer = Adam()
    elif optimizer == 'adagrad':
        optimizer = Adagrad()
    elif optimizer == 'gd':
        optimizer = GradientDescent()

    for epoch in range(epochs):
        epoch_train_loss = 0
        epoch_val_loss = 0

        for batch in tqdm(range(num_batches), disable=not verbose):
            start = batch * batch_size
            end = (batch + 1) * batch_size
            X_batch = X_train[start:end]
            y_batch = y_train[start:end]

            logits = model.forward(X_batch)
            loss = softmax_crossentropy(logits, y_batch)
            reg_loss = model.regularization_loss()
            total_loss = loss + reg_loss
            epoch_train_loss += total_loss

            grad = model.backward(loss_grad=1, learning_rate=learning_rate)
            optimizer.update(model.layers)

        train_losses.append(epoch_train_loss / num_batches)

        # Validation loss
        val_logits = model.forward(X_val, mode='test')
        val_loss = softmax_crossentropy(val_logits, y_val)
        val_losses.append(val_loss)

        if verbose:
            print(f"Epoch {epoch + 1}/{epochs}, Train Loss: {train_losses[-1]}, Val Loss: {val_losses[-1]}")

    return train_losses, val_losses

class SGD:
    def update(self, layers):
        for layer in layers:
            if hasattr(layer, 'weights'):
                layer.weights -= learning_rate * layer.weights_grad
                layer.biases -= learning_rate * layer.biases_grad

class Adam:
    def __init__(self, lr=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8):
        self.lr = lr
        self.beta1 = beta1
        self.beta2 = beta2
        self.epsilon = epsilon
        self.m = None
        self.v = None
        self.t = 0

    def update(self, layers):
        if self.m is None:
            self.m = [np.zeros_like(layer.weights) for layer in layers if hasattr(layer, 'weights')]
            self.v = [np.zeros_like(layer.weights) for layer in layers if hasattr(layer, 'weights')]

        self.t += 1
        for i, layer in enumerate(layers):
            if hasattr(layer, 'weights'):
                self.m[i] = self.beta1 * self.m[i] + (1 - self.beta1) * layer.weights_grad
                self.v[i] = self.beta2 * self.v[i] + (1 - self.beta2) * (layer.weights_grad ** 2)
                m_hat = self.m[i] / (1 - self.beta1 ** self.t)
                v_hat = self.v[i] / (1 - self.beta2 ** self.t)
                layer.weights -= self.lr * m_hat / (np.sqrt(v_hat) + self.epsilon)
                layer.biases -= self.lr * layer.biases_grad

class Adagrad:
    def __init__(self, lr=0.01, epsilon=1e-8):
        self.lr = lr
        self.epsilon = epsilon
        self.cache = None

    def update(self, layers):
        if self.cache is None:
            self.cache = [np.zeros_like(layer.weights) for layer in layers if hasattr(layer, 'weights')]

        for i, layer in enumerate(layers):
            if hasattr(layer, 'weights'):
                self.cache[i] += layer.weights_grad ** 2
                layer.weights -= self.lr * layer.weights_grad / (np.sqrt(self.cache[i]) + self.epsilon)
                layer.biases -= self.lr * layer.biases_grad


In [25]:
import torch
import torchvision
import torchvision.transforms as transforms

# 数据预处理和加载
transform = transforms.Compose(
    [transforms.ToTensor(),
     transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))])

trainset = torchvision.datasets.CIFAR10(root='../dataset/cifar10', train=True,
                                        download=True, transform=transform)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=128,
                                          shuffle=True, num_workers=2)

testset = torchvision.datasets.CIFAR10(root='../dataset/cifar10', train=False,
                                       download=True, transform=transform)
testloader = torch.utils.data.DataLoader(testset, batch_size=100,
                                         shuffle=False, num_workers=2)

classes = ('plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck')

# 转换数据为numpy格式
train_data = []
train_labels = []

for images, labels in trainloader:
    for i in range(len(images)):
        train_data.append(torch.transpose(images[i], 0, 2).numpy())
        train_labels.append(np.eye(10)[labels[i]])

X_train = np.array(train_data)
y_train = np.array(train_labels)

# Create VGG model
model = VGGNet(input_shape=X_train.shape[1:], num_classes=10)

# Train the model
train_losses, val_losses = train(model, X_train, y_train, None, None, optimizer='adam', learning_rate=0.001, epochs=10, batch_size=64, verbose=True)

# Evaluate on test set
# test_accuracy = model.evaluate(X_test, y_test)
# print(f"Test accuracy: {test_accuracy}")


Files already downloaded and verified
Files already downloaded and verified


  0%|          | 0/781 [00:01<?, ?it/s]

[[2.57265202 0.         0.         ... 0.         0.45247728 0.        ]
 [0.         0.82837421 0.         ... 0.81093453 0.         0.        ]
 [0.         0.         0.         ... 1.15720429 0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         1.83063317 0.        ]
 [0.         0.         0.         ... 0.         0.         3.3716495 ]]
None





TypeError: unsupported operand type(s) for *: 'float' and 'NoneType'