In [211]:
import numpy as np

# 定义激活函数
def relu(x):
    return np.maximum(0, x)

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def tanh(x):
    return np.tanh(x)

def softmax(x):
    exp_x = np.exp(x - np.max(x))
    return exp_x / np.sum(exp_x, axis=0)

In [212]:
# 激活函数的导数
def relu_derivative(x):
    return x > 0

def sigmoid_derivative(x):
    return sigmoid(x) * (1 - sigmoid(x))

def tanh_derivative(x):
    return 1 - np.tanh(x) ** 2

In [213]:
# 定义损失函数
def cross_entropy_loss(pred, label):
    return -np.log(pred[label])

def cross_entropy_derivative(pred, label):
    grad = pred.copy()
    grad[label] -= 1
    return grad

In [214]:

class ConvLayer:
    def __init__(self, num_filters, filter_size, input_depth):
        self.num_filters = num_filters
        self.filter_size = filter_size
        self.filters = np.random.randn(num_filters, filter_size, filter_size, input_depth) * 0.1
        self.biases = np.zeros(num_filters)

    def iterate_regions(self, image):
        h, w, d = image.shape
        for i in range(h - self.filter_size + 1):
            for j in range(w - self.filter_size + 1):
                im_region = image[i:(i + self.filter_size), j:(j + self.filter_size)]
                yield im_region, i, j

    def forward(self, input):
        self.last_input = input
        h, w, d = input.shape
        out_height = h - self.filter_size + 1
        out_width = w - self.filter_size + 1
        
        if out_height <= 0 or out_width <= 0:
            raise ValueError("Invalid output dimension. Check filter size and input shape.")
        
        output = np.zeros((out_height, out_width, self.num_filters))

        for im_region, i, j in self.iterate_regions(input):
            output[i, j] = np.sum(im_region * self.filters, axis=(1, 2, 3)) + self.biases

        return relu(output)

    def backward(self, d_L_d_out, learn_rate):
        d_L_d_filters = np.zeros(self.filters.shape)
        d_L_d_biases = np.zeros(self.biases.shape)
        h, w, d = self.last_input.shape

        for im_region, i, j in self.iterate_regions(self.last_input):
            for f in range(self.num_filters):
                d_L_d_filters[f] += d_L_d_out[i, j, f] * im_region
                d_L_d_biases[f] += d_L_d_out[i, j, f]

        self.filters -= learn_rate * d_L_d_filters
        self.biases -= learn_rate * d_L_d_biases

        return None

In [215]:
# 定义池化层
class MaxPoolLayer:
    def __init__(self, pool_size):
        self.pool_size = pool_size

    def iterate_regions(self, image):
        h, w, d = image.shape
        new_h = h // self.pool_size
        new_w = w // self.pool_size
        for i in range(new_h):
            for j in range(new_w):
                im_region = image[(i * self.pool_size):(i * self.pool_size + self.pool_size),
                                  (j * self.pool_size):(j * self.pool_size + self.pool_size)]
                yield im_region, i, j

    def forward(self, input):
        self.last_input = input
        h, w, d = input.shape
        output = np.zeros((h // self.pool_size, w // self.pool_size, d))

        for im_region, i, j in self.iterate_regions(input):
            output[i, j] = np.amax(im_region, axis=(0, 1))

        return output

    def backward(self, d_L_d_out):
        d_L_d_input = np.zeros(self.last_input.shape)

        for im_region, i, j in self.iterate_regions(self.last_input):
            h, w, f = im_region.shape
            amax = np.amax(im_region, axis=(0, 1))

            for i2 in range(h):
                for j2 in range(w):
                    for f2 in range(f):
                        if im_region[i2, j2, f2] == amax[f2]:
                            d_L_d_input[i * self.pool_size + i2, j * self.pool_size + j2, f2] = d_L_d_out[i, j, f2]

        return d_L_d_input

In [216]:
# 定义全连接层
class DenseLayer:
    def __init__(self, input_len, nodes):
        self.weights = np.random.randn(input_len, nodes) / input_len
        self.biases = np.zeros(nodes)

    def forward(self, input):
        self.last_input_shape = input.shape
        input = input.flatten()
        self.last_input = input
        input_len, nodes = self.weights.shape
        totals = np.dot(input, self.weights) + self.biases
        return relu(totals)

    def backward(self, d_L_d_out, learn_rate):
        d_L_d_w = np.dot(self.last_input[np.newaxis].T, d_L_d_out[np.newaxis] * relu_derivative(d_L_d_out))
        d_L_d_b = d_L_d_out
        d_L_d_input = np.dot(d_L_d_out, self.weights.T).reshape(self.last_input_shape)
        self.weights -= learn_rate * d_L_d_w
        self.biases -= learn_rate * d_L_d_b
        return d_L_d_input


In [217]:
# 定义BN层
class BatchNormLayer:
    def __init__(self, num_features, epsilon=1e-5, momentum=0.9):
        self.gamma = np.ones(num_features)
        self.beta = np.zeros(num_features)
        self.epsilon = epsilon
        self.momentum = momentum
        self.running_mean = np.zeros(num_features)
        self.running_var = np.zeros(num_features)

    def forward(self, input, training=True):
        if training:
            mean = np.mean(input, axis=0)
            var = np.var(input, axis=0)
            self.running_mean = self.momentum * self.running_mean + (1 - self.momentum) * mean
            self.running_var = self.momentum * self.running_var + (1 - self.momentum) * var
        else:
            mean = self.running_mean
            var = self.running_var

        self.input_normalized = (input - mean) / np.sqrt(var + self.epsilon)
        return self.gamma * self.input_normalized + self.beta

    def backward(self, d_L_d_out, learn_rate):
        N, D = d_L_d_out.shape
        self.gamma -= learn_rate * np.sum(d_L_d_out * self.input_normalized, axis=0)
        self.beta -= learn_rate * np.sum(d_L_d_out, axis=0)
        d_input_normalized = d_L_d_out * self.gamma
        d_var = np.sum(d_input_normalized * (self.last_input - self.mean) * -0.5 * (self.var + self.epsilon)**(-1.5), axis=0)
        d_mean = np.sum(d_input_normalized * -1 / np.sqrt(self.var + self.epsilon), axis=0) + d_var * np.sum(-2 * (self.last_input - self.mean), axis=0) / N
        return d_input_normalized / np.sqrt(self.var + self.epsilon) + d_var * 2 * (self.last_input - self.mean) / N + d_mean / N


In [218]:
# 定义Dropout层
class DropoutLayer:
    def __init__(self, dropout_rate):
        self.dropout_rate = dropout_rate

    def forward(self, input, training=True):
        if training:
            self.mask = np.random.binomial(1, 1 - self.dropout_rate, size=input.shape)
            return input * self.mask
        else:
            return input * (1 - self.dropout_rate)

    def backward(self, d_L_d_out):
        return d_L_d_out * self.mask

In [219]:
# 定义优化器
class SGD:
    def __init__(self, lr=0.01):
        self.lr = lr

    def update(self, params, grads):
        for param, grad in zip(params, grads):
            param -= self.lr * grad

class Adam:
    def __init__(self, lr=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8):
        self.lr = lr
        self.beta1 = beta1
        self.beta2 = beta2
        self.epsilon = epsilon
        self.m = []
        self.v = []
        self.t = 0

    def update(self, params, grads):
        if not self.m:
            self.m = [np.zeros_like(param) for param in params]
            self.v = [np.zeros_like(param) for param in params]

        self.t += 1
        lr_t = self.lr * (np.sqrt(1 - self.beta2 ** self.t) / (1 - self.beta1 ** self.t))

        for i, (param, grad) in enumerate(zip(params, grads)):
            self.m[i] = self.beta1 * self.m[i] + (1 - self.beta1) * grad
            self.v[i] = self.beta2 * self.v[i] + (1 - self.beta2) * grad
            m_hat = self.m[i] / (1 - self.beta1 ** self.t)

In [220]:
# CNN
class CNN:
    def __init__(self, layers):
        # 初始化网络结构
        self.layers = layers

    def forward(self, input):
        # 前向传播
        for layer in self.layers:
            input = layer.forward(input)
        return input

    def backward(self, grad, learn_rate):
        # 反向传播
        for layer in reversed(self.layers):
            grad = layer.backward(grad, learn_rate)

In [221]:
# 定义VGG网络结构
class VGG:
    def __init__(self):
        self.layers = [
            ConvLayer(64, 3, 32),  # input depth = 3 (RGB image)
            ConvLayer(64, 3, 64),
            MaxPoolLayer(2),
            ConvLayer(128, 3, 64),
            ConvLayer(128, 3, 128),
            MaxPoolLayer(2),
            ConvLayer(256, 3, 128),
            ConvLayer(256, 3, 256),
            ConvLayer(256, 3, 256),
            MaxPoolLayer(2),
            ConvLayer(512, 3, 256),
            ConvLayer(512, 3, 512),
            ConvLayer(512, 3, 512),
            MaxPoolLayer(2),
            ConvLayer(512, 3, 512),
            ConvLayer(512, 3, 512),
            ConvLayer(512, 3, 512),
            MaxPoolLayer(2),
            DenseLayer(512 * 1 * 1, 4096),  # Adjust according to final output size
            DenseLayer(4096, 4096),
            DenseLayer(4096, 10)  # 10 classes for CIFAR-10
        ]

    def forward(self, x):
        print(x.shape)
        for layer in self.layers:
            x = layer.forward(x)
        return x

    def backward(self, grad, learn_rate):
        for layer in reversed(self.layers):
            grad = layer.backward(grad, learn_rate)
        return grad

    def train(self, x, y, learn_rate=0.001, epochs=1, optimizer=None, batch_size=32):
        if optimizer is None:
            optimizer = SGD(lr=learn_rate)
        
        for epoch in range(epochs):
            for i in range(0, len(x), batch_size):
                x_batch = x[i:i + batch_size]
                y_batch = y[i:i + batch_size]

                # Forward pass
                outputs = np.array([self.forward(xi) for xi in x_batch])

                # Compute loss and gradient
                loss = np.mean([cross_entropy_loss(output, yi) for output, yi in zip(outputs, y_batch)])
                grads = np.array([cross_entropy_derivative(output, yi) for output, yi in zip(outputs, y_batch)])

                # Backward pass
                for grad in grads:
                    self.backward(grad, learn_rate)

                print(f'Epoch {epoch + 1}/{epochs}, Batch {i // batch_size + 1}/{len(x) // batch_size + 1}, Loss: {loss:.4f}')

# Loss function and its derivative
def cross_entropy_loss(predictions, labels):
    return -np.sum(labels * np.log(predictions + 1e-8))

def cross_entropy_derivative(predictions, labels):
    return predictions - labels

In [222]:
import torch
import torchvision
import torchvision.transforms as transforms

# 数据预处理和加载
transform = transforms.Compose(
    [transforms.ToTensor(),
     transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))])

trainset = torchvision.datasets.CIFAR10(root='../dataset/cifar10', train=True,
                                        download=True, transform=transform)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=128,
                                          shuffle=True, num_workers=2)

testset = torchvision.datasets.CIFAR10(root='../dataset/cifar10', train=False,
                                       download=True, transform=transform)
testloader = torch.utils.data.DataLoader(testset, batch_size=100,
                                         shuffle=False, num_workers=2)

classes = ('plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck')

# 转换数据为numpy格式
train_data = []
train_labels = []

for images, labels in trainloader:
    for i in range(len(images)):
        train_data.append(torch.transpose(images[i], 0, 2).numpy())
        train_labels.append(np.eye(10)[labels[i]])

train_data = np.array(train_data)
train_labels = np.array(train_labels)


# 实例化VGG-16网络
vgg = VGG()

# 训练网络
vgg.train(train_data, train_labels, learn_rate=0.001, epochs=10, batch_size=128)

Files already downloaded and verified
Files already downloaded and verified
(32, 32, 3)


ValueError: operands could not be broadcast together with shapes (3,3,3) (64,3,3,32) 

In [None]:
import numpy as np
from tqdm import tqdm

import torch
import torchvision
import torchvision.transforms as transforms

# 数据预处理和加载
transform = transforms.Compose(
    [transforms.ToTensor(),
     transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))])

trainset = torchvision.datasets.CIFAR10(root='../dataset/cifar10', train=True,
                                        download=True, transform=transform)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=128,
                                          shuffle=True, num_workers=2)

testset = torchvision.datasets.CIFAR10(root='../dataset/cifar10', train=False,
                                       download=True, transform=transform)
testloader = torch.utils.data.DataLoader(testset, batch_size=100,
                                         shuffle=False, num_workers=2)

classes = ('plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck')

# 转换数据为numpy格式
train_data = []
train_labels = []

for images, labels in trainloader:
    for i in range(len(images)):
        train_data.append(images[i].numpy())
        train_labels.append(np.eye(10)[labels[i]])

x_train = np.array(train_data)
y_train = np.array(train_labels)

# Helper functions

def relu(x):
    return np.maximum(0, x)

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def tanh(x):
    return np.tanh(x)

def softmax(x):
    exp_scores = np.exp(x - np.max(x, axis=-1, keepdims=True))
    return exp_scores / np.sum(exp_scores, axis=-1, keepdims=True)

def cross_entropy_loss(logits, labels):
    m = labels.shape[0]
    log_probs = -np.log(logits[np.arange(m), np.argmax(labels, axis=1)])
    loss = np.sum(log_probs) / m
    return loss

def softmax_backward(dout, cache):
    Z = cache
    s = np.exp(Z) / np.sum(np.exp(Z), axis=-1, keepdims=True)
    dZ = dout * s * (1 - s)
    return dZ

def relu_backward(dout, cache):
    dZ = np.array(dout, copy=True)
    dZ[cache <= 0] = 0
    return dZ

def sigmoid_backward(dout, cache):
    s = cache
    dZ = dout * s * (1 - s)
    return dZ

def tanh_backward(dout, cache):
    t = cache
    dZ = dout * (1 - np.square(t))
    return dZ

def initialize_parameters(layer_dims):
    np.random.seed(1)
    parameters = {}
    L = len(layer_dims)
    
    for l in range(1, L):
        parameters['W' + str(l)] = np.random.randn(layer_dims[l], layer_dims[l-1]) * np.sqrt(2 / layer_dims[l-1])
        parameters['b' + str(l)] = np.zeros((layer_dims[l], 1))
        
    return parameters

def linear_forward(A, W, b):
    Z = np.dot(W, A) + b
    cache = (A, W, b)
    return Z, cache

def batchnorm_forward(Z, gamma, beta, eps=1e-5):
    mu = np.mean(Z, axis=0, keepdims=True)
    var = np.var(Z, axis=0, keepdims=True)
    Z_norm = (Z - mu) / np.sqrt(var + eps)
    out = gamma * Z_norm + beta
    cache = (Z, Z_norm, gamma, beta, mu, var, eps)
    return out, cache

def batchnorm_backward(dout, cache):
    print(cache)
    Z, Z_norm, gamma, beta, mu, var, eps = cache
    m = Z.shape[0]
    
    dbeta = np.sum(dout, axis=0)
    dgamma = np.sum(dout * Z_norm, axis=0)
    dZ_norm = dout * gamma
    
    divar = np.sum(dZ_norm * (Z - mu), axis=0)
    dvar = -0.5 * np.sum(dZ_norm * (Z - mu) * np.power(var + eps, -1.5), axis=0)
    dmu = np.sum(dZ_norm * (-1 / np.sqrt(var + eps)), axis=0) + dvar * np.mean(-2 * (Z - mu), axis=0)
    
    dZ = dZ_norm / np.sqrt(var + eps) + dvar * 2 * (Z - mu) / m + dmu / m
    
    return dZ, dgamma, dbeta

def dropout_forward(A, dropout_prob=0.5, mode='train'):
    if mode == 'train':
        mask = (np.random.rand(*A.shape) < (1 - dropout_prob)) / (1 - dropout_prob)
        A = A * mask
        cache = mask
    else:
        cache = None
    return A, cache

def dropout_backward(dout, cache):
    mask = cache
    dA = dout * mask
    return dA

def linear_activation_forward(A_prev, W, b, activation):
    Z, linear_cache = linear_forward(A_prev, W, b)
    
    if activation == "relu":
        A, activation_cache = relu(Z), Z
    elif activation == "sigmoid":
        A, activation_cache = sigmoid(Z), Z
    elif activation == "tanh":
        A, activation_cache = tanh(Z), Z
    elif activation == "softmax":
        A, activation_cache = softmax(Z), Z
    
    cache = (linear_cache, activation_cache)
    return A, cache

def linear_activation_backward(dA, cache, activation):
    linear_cache, activation_cache = cache
    
    if activation == "relu":
        dZ = relu_backward(dA, activation_cache)
    elif activation == "sigmoid":
        dZ = sigmoid_backward(dA, activation_cache)
    elif activation == "tanh":
        dZ = tanh_backward(dA, activation_cache)
    elif activation == "softmax":
        dZ = softmax_backward(dA, activation_cache)
    
    dA_prev, dW, db = linear_backward(dZ, linear_cache)
    
    return dA_prev, dW, db

def linear_backward(dZ, cache):
    A_prev, W, b = cache
    m = A_prev.shape[1]
    
    dW = np.dot(dZ, A_prev.T) / m
    db = np.sum(dZ, axis=1, keepdims=True) / m
    dA_prev = np.dot(W.T, dZ)
    
    return dA_prev, dW, db

def update_parameters(parameters, grads, learning_rate):
    L = len(parameters) // 2
    
    for l in range(L):
        parameters["W" + str(l+1)] -= learning_rate * grads["dW" + str(l+1)]
        parameters["b" + str(l+1)] -= learning_rate * grads["db" + str(l+1)]
        
    return parameters

# CNN Model class
class CNNModel:
    def __init__(self, layer_dims, activations, dropout_probs, batchnorm=False):
        self.layer_dims = layer_dims
        self.activations = activations
        self.dropout_probs = dropout_probs
        self.batchnorm = batchnorm
        self.parameters = initialize_parameters(layer_dims)
    
    def forward_propagation(self, X, mode='train'):
        caches = []
        A = X
        L = len(self.layer_dims) - 1
        
        for l in range(1, L):
            A_prev = A
            W = self.parameters['W' + str(l)]
            b = self.parameters['b' + str(l)]
            activation = self.activations[l-1]
            
            A, cache = linear_activation_forward(A_prev, W, b, activation)
            
            if self.batchnorm:
                gamma = np.ones((self.layer_dims[l], 1))
                beta = np.zeros((self.layer_dims[l], 1))
                A, bn_cache = batchnorm_forward(A, gamma, beta)
                cache = (cache, bn_cache)
            
            if mode == 'train' and self.dropout_probs[l-1] > 0:
                A, dropout_cache = dropout_forward(A, self.dropout_probs[l-1], mode)
                cache = (cache, dropout_cache)
            
            caches.append(cache)
        
        # Output layer
        W = self.parameters['W' + str(L)]
        b = self.parameters['b' + str(L)]
        AL, cache = linear_activation_forward(A, W, b, "softmax")
        caches.append(cache)
        
        return AL, caches
    
    def compute_cost(self, AL, Y):
        m = Y.shape[1]
        cost = cross_entropy_loss(AL, Y)
        return cost
    
    def backward_propagation(self, AL, Y, caches):
        grads = {}
        L = len(self.parameters) // 2
        m = AL.shape[1]
        Y = Y.reshape(AL.shape)
        
        # Compute gradients
        dAL = - (np.divide(Y, AL) - np.divide(1 - Y, 1 - AL))
        
        # Backpropagate through layers
        current_cache = caches[-1]
        grads["dA" + str(L)], grads["dW" + str(L)], grads["db" + str(L)] = linear_activation_backward(dAL, current_cache, "sigmoid")
        
        for l in reversed(range(L-1)):
            current_cache = caches[l]
            
            if self.batchnorm:
                cache, dropout_cache = current_cache
                bn_cache, activation_cache = cache
                dA_prev_bn, dgamma, dbeta = batchnorm_backward(grads["dA" + str(l + 2)], bn_cache)
                grads["dA" + str(l + 2)] = dA_prev_bn
                grads["dgamma" + str(l + 1)] = dgamma
                grads["dbeta" + str(l + 1)] = dbeta
                dA_prev_dropout = dropout_backward(grads["dA" + str(l + 2)], dropout_cache)
                grads["dA" + str(l + 2)] = dA_prev_dropout
            else:
                grads["dA" + str(l + 2)], grads["dW" + str(l + 2)], grads["db" + str(l + 2)] = linear_activation_backward(grads["dA" + str(l + 2)], current_cache, self.activations[l])
        
        # First layer gradients
        current_cache = caches[0]
        grads["dA1"], grads["dW1"], grads["db1"] = linear_activation_backward(grads["dA2"], current_cache, self.activations[0])
        
        return grads
    
    def fit(self, X, Y, learning_rate=0.01, epochs=10, batch_size=64, optimizer='adam', print_cost=True):
        m = X.shape[1]
        costs = []
        
        for epoch in range(epochs):
            minibatches = random_mini_batches(X, Y, batch_size)
            minibatch_cost = 0
            
            for minibatch in minibatches:
                (minibatch_X, minibatch_Y) = minibatch
                AL, caches = self.forward_propagation(minibatch_X)
                minibatch_cost += self.compute_cost(AL, minibatch_Y)
                grads = self.backward_propagation(AL, minibatch_Y, caches)
                
                if optimizer == "adam":
                    self.parameters = update_parameters_with_adam(self.parameters, grads, learning_rate)
                elif optimizer == "gd":
                    self.parameters = update_parameters_with_gd(self.parameters, grads, learning_rate)
                    
            epoch_cost = minibatch_cost / m
            
            if print_cost and epoch % 10 == 0:
                print ("Cost after epoch %i: %f" % (epoch, epoch_cost))
            if print_cost and epoch % 1 == 0:
                costs.append(epoch_cost)
                
        return costs

    def predict(self, X):
        AL, _ = self.forward_propagation(X, mode='test')
        predictions = np.argmax(AL, axis=0)
        return predictions

    def evaluate(self, X, Y):
        predictions = self.predict(X)
        accuracy = np.mean(predictions == np.argmax(Y, axis=1))
        return accuracy * 100
    
# Utility functions

def random_mini_batches(X, Y, mini_batch_size=64):
    m = X.shape[1]
    mini_batches = []
    
    # Shuffle (X, Y)
    permutation = list(np.random.permutation(m))
    shuffled_X = X[:, permutation]
    shuffled_Y = Y[:, permutation].reshape((Y.shape[0], m))
    
    # Partition (shuffled_X, shuffled_Y). Minus the end case.
    num_complete_minibatches = m // mini_batch_size
    for k in range(0, num_complete_minibatches):
        mini_batch_X = shuffled_X[:, k * mini_batch_size : (k + 1) * mini_batch_size]
        mini_batch_Y = shuffled_Y[:, k * mini_batch_size : (k + 1) * mini_batch_size]
        mini_batch = (mini_batch_X, mini_batch_Y)
        mini_batches.append(mini_batch)
    
    # Handling the end case (last mini-batch < mini_batch_size)
    if m % mini_batch_size != 0:
        mini_batch_X = shuffled_X[:, num_complete_minibatches * mini_batch_size:]
        mini_batch_Y = shuffled_Y[:, num_complete_minibatches * mini_batch_size:]
        mini_batch = (mini_batch_X, mini_batch_Y)
        mini_batches.append(mini_batch)
    
    return mini_batches

def update_parameters_with_gd(parameters, grads, learning_rate):
    L = len(parameters) // 2
    
    for l in range(L):
        parameters["W" + str(l+1)] -= learning_rate * grads["dW" + str(l+1)]
        parameters["b" + str(l+1)] -= learning_rate * grads["db" + str(l+1)]
        
    return parameters

def update_parameters_with_adam(parameters, grads, learning_rate=0.01, beta1=0.9, beta2=0.999, epsilon=1e-8, t=1):
    L = len(parameters) // 2
    v_corrected = {}
    s_corrected = {}
    
    for l in range(L):
        v["dW" + str(l+1)] = beta1 * v["dW" + str(l+1)] + (1 - beta1) * grads['dW' + str(l + 1)]
        v_corrected["dW" + str(l+1)] = v["dW" + str(l+1)] / (1 - np.power(beta1, t))
        s["dW" + str(l+1)] = beta2 * s["dW" + str(l+1)] + (1 - beta2) * np.power(grads['dW' + str(l + 1)], 2)
        s_corrected["dW" + str(l+1)] = s["dW" + str(l+1)] / (1 - np.power(beta2, t))
        parameters["W" + str(l+1)] -= learning_rate * v_corrected["dW" + str(l+1)] / np.sqrt(s_corrected["dW" + str(l+1)] + epsilon)
        v["db" + str(l+1)] = beta1 * v["db" + str(l+1)] + (1 - beta1) * grads['db' + str(l + 1)]
        v_corrected["db" + str(l+1)] = v["db" + str(l+1)] / (1 - np.power(beta1, t))
        s["db" + str(l+1)] = beta2 * s["db" + str(l+1)] + (1 - beta2) * np.power(grads['db' + str(l + 1)], 2)
        s_corrected["db" + str(l+1)] = s["db" + str(l+1)] / (1 - np.power(beta2, t))
        parameters["b" + str(l+1)] -= learning_rate * v_corrected["db" + str(l+1)] / np.sqrt(s_corrected["db" + str(l+1)] + epsilon)
    
    return parameters

# Model architecture for VGG-like network
layer_dims = [32*32*3, 512, 512, 10]
activations = ["relu", "relu", "softmax"]
dropout_probs = [0.5, 0.5, 0]

# Initialize and train the model
model = CNNModel(layer_dims, activations, dropout_probs, batchnorm=True)
costs = model.fit(x_train.reshape(x_train.shape[0], -1).T, y_train.T, learning_rate=0.001, epochs=50, batch_size=128, optimizer='adam')

# Evaluate the model
accuracy = model.evaluate(x_test.reshape(x_test.shape[0], -1).T, y_test.T)
print(f'Test Accuracy: {accuracy}%')



Files already downloaded and verified
Files already downloaded and verified
((array([[-1.45429954,  4.04258727, -1.40327739, ..., -0.        ,
        -1.29217646,  0.02903511],
       [-0.        , -0.        ,  0.        , ..., -0.        ,
        -1.19897518, -0.84347416],
       [ 0.        ,  5.06377975, -1.29259632, ...,  0.46079516,
        -0.        , -1.39598981],
       ...,
       [-1.45429954, -1.3688808 , -0.        , ..., -1.4102685 ,
        -1.29217646, -1.39598981],
       [-1.24962541, -0.        , -0.        , ..., -1.4070241 ,
        -1.29217646, -0.        ],
       [-1.30292505,  0.59011771, -0.        , ...,  0.        ,
        -1.29217646, -0.        ]]), array([[ 0.06577336, -0.01200863,  0.0531466 , ..., -0.02077525,
         0.01268506,  0.06442468],
       [-0.03896336, -0.0441773 ,  0.15262379, ..., -0.11071828,
         0.08655519, -0.02646854],
       [-0.02948666, -0.02868987, -0.0902056 , ..., -0.01340982,
        -0.07400051,  0.02142804],
       .

ValueError: not enough values to unpack (expected 7, got 2)