In [7]:
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
from torchvision import transforms
import torchvision
# Define the custom neural network architecture
class CustomNeuralNetwork(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super(CustomNeuralNetwork, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.relu2 = nn.ReLU()
        self.fc3 = nn.Linear(hidden_size, hidden_size)
        self.relu3 = nn.ReLU()
        self.fc4 = nn.Linear(hidden_size, hidden_size)
        self.relu4 = nn.ReLU()
        self.fc5 = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        x = self.relu1(self.fc1(x))
        x = self.relu2(self.fc2(x))
        x = self.relu3(self.fc3(x))
        x = self.relu4(self.fc4(x))
        x = self.fc5(x)
        return x

# Custom dataset, DataLoader, and training loop are assumed to be available from the previous question.

# Initialize the model, loss function, and optimizer
input_size = 28 * 28  # Assuming input size for MNIST-like data
hidden_size = 32
num_classes = 10
learning_rate = 0.0003
epochs = 60

model = CustomNeuralNetwork(input_size, hidden_size, num_classes)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=learning_rate)

# Custom backpropagation implementation
def custom_backward(model, inputs, labels, learning_rate):
    # Forward pass
    logits = model(inputs)
    loss = criterion(logits, labels)

    # Backward pass
    model.zero_grad()

    # Calculate gradients manually
    grad_output = (logits.argmax(dim=1) == labels).float().view(-1, 1)
    grad_logits = (logits - grad_output) / inputs.size(0)

    # Backward pass through fully connected layers
    grad_fc5 = grad_logits.t() @ model.relu4(model.fc4.weight).t()
    grad_relu4 = (grad_logits @ model.fc5.weight) * (model.relu4(model.fc4.weight) > 0).float()

    # Adjust shape for matrix multiplication
    grad_fc4 = grad_relu4.t() @ model.relu3(model.fc3.weight).t()
    grad_relu3 = (grad_relu4 @ model.fc4.weight) * (model.relu3(model.fc3.weight) > 0).float()

    grad_fc3 = grad_relu3.t() @ model.relu2(model.fc2.weight).t()
    grad_relu2 = (grad_relu3 @ model.fc3.weight) * (model.relu2(model.fc2.weight) > 0).float()

    grad_fc2 = grad_relu2.t() @ model.relu1(model.fc1.weight).t()
    grad_relu1 = (grad_relu2 @ model.fc2.weight) * (model.relu1(model.fc1.weight) > 0).float()

    # Update weights
    model.fc1.weight.data -= learning_rate * grad_fc2
    model.fc2.weight.data -= learning_rate * grad_fc2
    model.fc3.weight.data -= learning_rate * grad_fc3
    model.fc4.weight.data -= learning_rate * grad_fc4
    model.fc5.weight.data -= learning_rate * grad_logits.t() @ model.fc5.weight

    return loss.item()

# Training loop
train_losses, val_losses, test_losses = [], [], []
train_accuracies, val_accuracies, test_accuracies = [], [], []
from torch.utils.data import DataLoader

# Define the dataset and data loaders
transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5,), (0.5,))])
train_dataset = torchvision.datasets.MNIST(root='./data', train=True, transform=transform, download=True)
test_dataset = torchvision.datasets.MNIST(root='./data', train=False, transform=transform, download=True)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

# Assuming you have datasets named train_dataset, val_dataset, and test_dataset
test_loader = DataLoader(test_dataset, batch_size= 64, shuffle=False)

for epoch in range(epochs):
    # Training
    model.train()
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        loss = custom_backward(model, inputs.view(inputs.size(0), -1), labels, learning_rate)
        loss.backward()
        optimizer.step()

    # Validation
    model.eval()
    with torch.no_grad():
        train_loss, train_accuracy = evaluate_model(model, train_loader, criterion)
        val_loss, val_accuracy = evaluate_model(model, val_loader, criterion)
        test_loss, test_accuracy = evaluate_model(model, test_loader, criterion)

        # Store metrics for plotting
        train_losses.append(train_loss)
        val_losses.append(val_loss)
        test_losses.append(test_loss)
        train_accuracies.append(train_accuracy)
        val_accuracies.append(val_accuracy)
        test_accuracies.append(test_accuracy)

        print(f'Epoch {epoch + 1}/{epochs}, '
              f'Training Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}, '
              f'Training Accuracy: {train_accuracy:.2f}%, Validation Accuracy: {val_accuracy:.2f}%')

# Plotting the graphs
plt.figure(figsize=(12, 5))

# Loss graphs
plt.subplot(1, 2, 1)
plt.plot(range(1, epochs + 1), train_losses, label='Training')
plt.plot(range(1, epochs + 1), val_losses, label='Validation')
plt.plot(range(1, epochs + 1), test_losses, label='Testing')
plt.title('Loss Over Epochs')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

# Accuracy graphs
plt.subplot(1, 2, 2)
plt.plot(range(1, epochs + 1), train_accuracies, label='Training')
plt.plot(range(1, epochs + 1), val_accuracies, label='Validation')
plt.plot(range(1, epochs + 1), test_accuracies, label='Testing')
plt.title('Accuracy Over Epochs')
plt.xlabel('Epochs')
plt.ylabel('Accuracy (%)')
plt.legend()

plt.show()


RuntimeError: mat1 and mat2 shapes cannot be multiplied (10x64 and 32x32)

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab

def load_mnist():
    path = 'mnist.npz'  
    f = np.load(path)
    x_train, y_train = f['x_train'], f['y_train']
    x_test, y_test = f['x_test'], f['y_test']
    f.close()
    return (x_train, y_train, x_test, y_test)

def calculate_loss(model, X, y): 
    W1, b1, W2, b2, W3, b3, W4, b4, W5, b5 = model['W1'], model['b1'], model['W2'], model['b2'], model['W3'], model[
        'b3'], model['W4'], model['b4'], model['W5'], model['b5']
    z1 = X.dot(W1) + b1  
    a1 = np.tanh(z1)
    z2 = a1.dot(W2) + b2
    a2 = np.tanh(z2)
    z3 = a2.dot(W3) + b3 
    a3 = np.tanh(z3)
    z4 = a3.dot(W4) + b4
    a4 = np.tanh(z4)
    z5 = a4.dot(W5) + b5
    exp_scores = np.exp(z5)
    probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)
    num_examples = X.shape[0]
    corect_logprobs = -np.log(probs[range(num_examples), y])
    data_loss = np.sum(corect_logprobs)
    # data_loss += reg_lambda / 2 * (
    #         np.sum(np.square(W1)) + np.sum(np.square(W2)) + np.sum(np.square(W3)) + np.sum(np.square(W4)))
    return 1. / num_examples * data_loss


def predict(model, x):  # Forward
    W1, b1, W2, b2, W3, b3, W4, b4, W5, b5 = \
        model['W1'], model['b1'], model['W2'], model['b2'], model['W3'], \
        model['b3'], model['W4'], model['b4'], model['W5'], model['b5']
    z1 = x.dot(W1) + b1
    a1 = np.tanh(z1)
    z2 = a1.dot(W2) + b2
    a2 = np.tanh(z2)
    z3 = a2.dot(W3) + b3
    a3 = np.tanh(z3)
    z4 = a3.dot(W4) + b4
    a4 = np.tanh(z4)
    z5 = a4.dot(W5) + b5
    exp_scores = np.exp(z5)
    probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)
    return np.argmax(probs, axis=1) 

def build_model(X, y, nn_hdim, epsilon, reg_lambda, num_passes=60, print_loss=False):
    np.random.seed(0)  
    num_examples = X.shape[0]
    nn_input_dim = nn_hdim[0]
    print('input dim', nn_input_dim)

    hdim1 = nn_hdim[1]
    W1 = np.random.randn(nn_input_dim, hdim1) / np.sqrt(hdim1)
    b1 = np.zeros((1, hdim1)) 
    print('fc: %d -> %d' % (nn_input_dim, hdim1))
    hdim2 = nn_hdim[2]
    W2 = np.random.randn(hdim1, hdim2) / np.sqrt(hdim2)
    b2 = np.zeros((1, hdim2))
    print('fc: %d -> %d' % (hdim1, hdim2))
    hdim3 = nn_hdim[3]
    W3 = np.random.randn(hdim2, hdim3) / np.sqrt(hdim3)
    b3 = np.zeros((1, hdim3))
    print('fc: %d -> %d' % (hdim2, hdim3))
    hdim4 = nn_hdim[4]
    W4 = np.random.randn(hdim3, hdim4) / np.sqrt(hdim4)
    b4 = np.zeros((1, hdim4))
    print('fc: %d -> %d' % (hdim3, hdim4))
    hdim5 = nn_hdim[5]
    W5 = np.random.randn(hdim4, hdim5) / np.sqrt(hdim5)
    b5 = np.zeros((1, hdim5))
    print('fc: %d -> %d' % (hdim4, hdim5))

    # train：
    model = {}
    bs = 128 #batchsize
    nbs_per_epoch = int(num_examples / bs)
    for i in range(0, num_passes):
        j = i % nbs_per_epoch
        if 0 == j:
            ridx = np.asarray(list(range(num_examples)))
            np.random.shuffle(ridx)
            X = X[ridx, :]
            y = y[ridx]
        Xb = X[j * bs:(j + 1) * bs, :]
        yb = y[j * bs:(j + 1) * bs]
        # Forward propagation
        z1 = Xb.dot(W1) + b1
        a1 = np.maximum(0, z1)  # ReLU activation
        z2 = a1.dot(W2) + b2
        a2 = np.maximum(0, z2)  # ReLU activation
        z3 = a2.dot(W3) + b3
        a3 = np.maximum(0, z3)  # ReLU activation
        z4 = a3.dot(W4) + b4
        a4 = np.maximum(0, z4)  # ReLU activation
        z5 = a4.dot(W5) + b5
        exp_scores = np.exp(z5)
        # Backpropagation
        delta_loss = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)
        delta_loss[range(bs), yb] -= 1
        dW5 = (a4.T).dot(delta_loss)
        db5 = np.sum(delta_loss, axis=0, keepdims=True)
        delta5 = delta_loss.dot(W5.T) * (a4 > 0)  # Derivative of ReLU
        dW4 = (a3.T).dot(delta5)
        db4 = np.sum(delta5, axis=0, keepdims=True)
        delta4 = delta5.dot(W4.T) * (a3 > 0)  # Derivative of ReLU
        dW3 = (a2.T).dot(delta4)
        db3 = np.sum(delta4, axis=0, keepdims=True)
        delta3 = delta4.dot(W3.T) * (a2 > 0)  # Derivative of ReLU
        dW2 = a1.T.dot(delta3)
        db2 = np.sum(delta3, axis=0, keepdims=True)
        delta2 = delta3.dot(W2.T) * (a1 > 0)  # Derivative of ReLU
        dW1 = (Xb.T).dot(delta2)
        db1 = np.sum(delta2, axis=0)
        # dW5 += reg_lambda * W5
        # dW4 += reg_lambda * W4
        # dW3 += reg_lambda * W3
        # dW2 += reg_lambda * W2
        # dW1 += reg_lambda * W1
        W1 += -epsilon * dW1
        b1 += -epsilon * db1
        W2 += -epsilon * dW2
        b2 += -epsilon * db2
        W3 += -epsilon * dW3
        b3 += -epsilon * db3
        W4 += -epsilon * dW4
        b4 += -epsilon * db4
        W5 += -epsilon * dW5
        b5 += -epsilon * db5
        model = {'W1': W1, 'b1': b1, 'W2': W2, 'b2': b2,
                 'W3': W3, 'b3': b3, 'W4': W4, 'b4': b4, 'W5': W5, 'b5': b5}

        if print_loss and i % 2000 == 0:
            epsilon *= 0.99
            y_pred = predict(model, X_test)
            accuracy = sum(0 == (y_pred - Y_test)) / Y_test.shape[0]
            print("loss after iteration {}: {:.2f}, testing accuracy: {:.2f}%".
                  format(i, calculate_loss(model, X, y), accuracy * 100))
    return model



(train_images, train_labels, test_images, test_labels) = load_mnist() 
n_train, w, h = train_images.shape  
X_train = train_images.reshape((n_train, w * h))  
Y_train = train_labels 
n_test, w, h = test_images.shape
X_test = test_images.reshape((n_test, w * h))
Y_test = test_labels
print(X_train.shape, Y_train.shape)
print(X_test.shape, Y_test.shape)
# train
X_train = (X_train.astype(float) - 128.0) / 128.0  
X_test = (X_test.astype(float) - 128.0) / 128.0
num_examples, input_dim = X_train.shape
epsilon = 0.0003
reg_lambda = 0.00
model = build_model(X_train, Y_train, [input_dim, 256, 128, 63, 32, 10], epsilon, reg_lambda, 60, print_loss=True)
# test output
X_test0=X_test[0:3,:]
y_pred0 = predict(model, X_test0)
print(y_pred0)
X_test0=X_test0.reshape(3,w,h)
plt.figure('第一张图预测')
plt.imshow(X_test0[0,:,:])
plt.figure('第二张图预测')
plt.imshow(X_test0[1,:,:])
plt.figure('第三张图预测')
plt.imshow(X_test0[2,:,:])
pylab.show()

FileNotFoundError: [Errno 2] No such file or directory: 'mnist.npz'

In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
from torchvision import transforms
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt

# Define the neural network architecture
class FeedForwardNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(FeedForwardNN, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.relu2 = nn.ReLU()
        self.fc3 = nn.Linear(hidden_size, hidden_size)
        self.relu3 = nn.ReLU()
        self.fc4 = nn.Linear(hidden_size, hidden_size)
        self.relu4 = nn.ReLU()
        self.fc5 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        x = x.view(x.size(0), -1)
        x = self.relu1(self.fc1(x))
        x = self.relu2(self.fc2(x))
        x = self.relu3(self.fc3(x))
        x = self.relu4(self.fc4(x))
        x = self.fc5(x)
        return x

# Define the dataset and data loaders
transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5,), (0.5,))])
train_dataset = torchvision.datasets.MNIST(root='./data', train=True, transform=transform, download=True)
test_dataset = torchvision.datasets.MNIST(root='./data', train=False, transform=transform, download=True)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

# Initialize the model, loss function, and optimizer
input_size = 28 * 28
hidden_size = 40
output_size = 10
model = FeedForwardNN(input_size, hidden_size, output_size)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.0003)
num_epochs = 60
train_losses, val_losses, test_losses = [], [], []
train_accuracies, val_accuracies, test_accuracies = [], [], []

# Training loop
for epoch in range(num_epochs):
    model.train()
    total_loss, correct_train = 0, 0

    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)

        # Manual backpropagation
        grads = {}
        grads['fc1.weight'] = torch.zeros_like(model.fc1.weight)
        grads['fc1.bias'] = torch.zeros_like(model.fc1.bias)
        grads['fc2.weight'] = torch.zeros_like(model.fc2.weight)
        grads['fc2.bias'] = torch.zeros_like(model.fc2.bias)
        grads['fc3.weight'] = torch.zeros_like(model.fc3.weight)
        grads['fc3.bias'] = torch.zeros_like(model.fc3.bias)
        grads['fc4.weight'] = torch.zeros_like(model.fc4.weight)
        grads['fc4.bias'] = torch.zeros_like(model.fc4.bias)
        grads['fc5.weight'] = torch.zeros_like(model.fc5.weight)
        grads['fc5.bias'] = torch.zeros_like(model.fc5.bias)

        # Backward pass
        # Backward pass
        # Backward pass
        d_loss_dy = 1.0  # Gradient of the loss with respect to the final output
        grads['fc5.weight'] += d_loss_dy * model.relu4(model.fc4(outputs)).t() @ torch.diag_embed(torch.ones_like(model.fc5.weight))
        grads['fc5.bias'] += d_loss_dy * torch.ones_like(model.fc5.bias)

        d_loss_dz4 = d_loss_dy * model.fc5(outputs).mm(torch.diag_embed(model.fc5.weight.t()))
        d_loss_dx4 = d_loss_dz4 * (outputs > 0).float()
        grads['fc4.weight'] += model.relu4(model.fc4(outputs)).t() @ d_loss_dx4
        grads['fc4.bias'] += d_loss_dx4.sum(dim=0)

        d_loss_dz3 = d_loss_dx4 @ model.fc4.weight.t()
        d_loss_dx3 = d_loss_dz3 * (model.fc4(outputs) > 0).float()
        grads['fc3.weight'] += model.relu3(model.fc3(inputs)).t() @ d_loss_dx3
        grads['fc3.bias'] += d_loss_dx3.sum(dim=0)

        d_loss_dz2 = d_loss_dx3 @ model.fc3.weight.t()
        d_loss_dx2 = d_loss_dz2 * (model.fc3(inputs) > 0).float()
        grads['fc2.weight'] += model.relu2(model.fc2(inputs)).t() @ d_loss_dx2
        grads['fc2.bias'] += d_loss_dx2.sum(dim=0)

        d_loss_dz1 = d_loss_dx2 @ model.fc2.weight.t()
        d_loss_dx1 = d_loss_dz1 * (model.fc2(inputs) > 0).float()
        grads['fc1.weight'] += inputs.t() @ d_loss_dx1
        grads['fc1.bias'] += d_loss_dx1.sum(dim=0)
# Update parameters manually
with torch.no_grad():
    for param_name, param in model.named_parameters():
        param -= 0.0003 * grads[param_name]  # Using the learning rate of 0.0003

        total_loss += loss.item()
        _, predicted = torch.max(outputs.data, 1)
        correct_train += (predicted == labels).sum().item()

    train_loss = total_loss / len(train_loader)
    train_accuracy = correct_train / len(train_loader.dataset)
    train_losses.append(train_loss)
    train_accuracies.append(train_accuracy)
    # Validation
    model.eval()
    val_loss, correct_val = 0, 0

    with torch.no_grad():
        for inputs, labels in test_loader:
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            val_loss += loss.item()
            _, predicted = torch.max(outputs.data, 1)
            correct_val += (predicted == labels).sum().item()

    val_loss /= len(test_loader)
    val_accuracy = correct_val / len(test_loader.dataset)
    val_losses.append(val_loss)
    val_accuracies.append(val_accuracy)

    # Testing
    model.eval()
    test_loss, correct_test = 0, 0

    with torch.no_grad():
        for inputs, labels in test_loader:
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            test_loss += loss.item()
            _, predicted = torch.max(outputs.data, 1)
            correct_test += (predicted == labels).sum().item()

    test_loss /= len(test_loader)
    test_accuracy = correct_test / len(test_loader.dataset)
    test_losses.append(test_loss)
    test_accuracies.append(test_accuracy)

    # Print progress
    print(f"Epoch {epoch+1}/{num_epochs}: "
          f"Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, Test Loss: {test_loss:.4f}, "
          f"Train Acc: {train_accuracy:.4f}, Val Acc: {val_accuracy:.4f}, Test Acc: {test_accuracy:.4f}")

# Plotting graphs
plt.figure(figsize=(12, 4))
plt.subplot(1, 2, 1)
plt.plot(train_losses, label='Train')
plt.plot(val_losses, label='Validation')
plt.plot(test_losses, label='Test')
plt.title('Loss Over Epochs')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

plt.subplot(1, 2, 2)
plt.plot(train_accuracies, label='Train')
plt.plot(val_accuracies, label='Validation')
plt.plot(test_accuracies, label='Test')
plt.title('Accuracy Over Epochs')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()

plt.tight_layout()
plt.show()


RuntimeError: mat1 and mat2 shapes cannot be multiplied (64x10 and 40x40)