In [None]:
# Colab-only step: mount Google Drive at /content/drive. The dataset CSVs read
# later in this notebook live under /content/drive/MyDrive/archive/.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import pandas as pd
import numpy as np
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
import scipy.special

In [None]:
# Select the compute device: CUDA when PyTorch can see a GPU, otherwise the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Read the Fashion-MNIST CSV exports from the mounted Drive folder
train_df = pd.read_csv('/content/drive/MyDrive/archive/fashion-mnist_train.csv')
test_df = pd.read_csv('/content/drive/MyDrive/archive/fashion-mnist_test.csv')

# Column 0 is taken as the class label; every remaining column is pixel data,
# reshaped to (N, 1, 28, 28) and divided by 255.0
y_train, y_test = train_df.iloc[:, 0].values, test_df.iloc[:, 0].values
x_train = train_df.iloc[:, 1:].values.reshape(-1, 1, 28, 28) / 255.0
x_test = test_df.iloc[:, 1:].values.reshape(-1, 1, 28, 28) / 255.0

# Build the tensors directly on the selected device (float32 images, int64 labels)
x_train_tensor = torch.tensor(x_train, dtype=torch.float32, device=device)
y_train_tensor = torch.tensor(y_train, dtype=torch.long, device=device)
x_test_tensor = torch.tensor(x_test, dtype=torch.float32, device=device)
y_test_tensor = torch.tensor(y_test, dtype=torch.long, device=device)

# Pair images with labels and batch them; only the training split is shuffled
train_dataset = TensorDataset(x_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

test_dataset = TensorDataset(x_test_tensor, y_test_tensor)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

Using device: cuda


Part (a) Basic CNN testing pooling layers

In [None]:
# Define CNN Model
class CNNModel(nn.Module):
    """Five-layer CNN classifier with a selectable pooling strategy.

    Architecture: five 3x3 'same' convolutions (1 -> 32 -> 64 -> 128 -> 256 -> 512
    channels), each followed by ReLU, then a 512 -> 256 -> 128 -> 10 MLP head.
    Pooling follows conv2..conv5 (conv1 is not pooled).

    Args:
        pooling_type: one of
            'max'        - 2x2 max pooling after conv2..conv5.
            'avg'        - 2x2 average pooling after conv2..conv5.
            'global_avg' - 2x2 average pooling after conv2..conv4 and a single
                           global average pool after conv5.

    Raises:
        ValueError: if ``pooling_type`` is not one of the values above.

    Note:
        ``fc1`` expects exactly 512 inputs, so the map after the last pool must be
        1x1. With 'max'/'avg' that holds for 28x28 inputs (28 -> 14 -> 7 -> 3 -> 1).
        With 'global_avg' the last pool always produces 1x1.
    """

    def __init__(self, pooling_type='max'):
        super(CNNModel, self).__init__()
        self.pooling_type = pooling_type

        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, padding='same')
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding='same')
        self.conv3 = nn.Conv2d(64, 128, kernel_size=3, padding='same')
        self.conv4 = nn.Conv2d(128, 256, kernel_size=3, padding='same')
        self.conv5 = nn.Conv2d(256, 512, kernel_size=3, padding='same')

        # `downsample` is applied after conv2..conv4; `pool` is applied after conv5.
        # For 'max' and 'avg' both are the same kind of 2x2 pooling.
        if pooling_type == 'max':
            self.downsample = nn.MaxPool2d(2, 2)
            self.pool = nn.MaxPool2d(2, 2)
        elif pooling_type == 'avg':
            self.downsample = nn.AvgPool2d(2, 2)
            self.pool = nn.AvgPool2d(2, 2)
        elif pooling_type == 'global_avg':
            # Global average pooling collapses the whole map to 1x1, so it must be
            # applied once, at the end. Using it between conv layers would leave
            # conv3..conv5 with a 1x1 input and no spatial information.
            self.downsample = nn.AvgPool2d(2, 2)
            self.pool = nn.AdaptiveAvgPool2d(1)
        else:
            raise ValueError("Invalid pooling type. Choose from 'max', 'avg', or 'global_avg'")

        self.fc1 = nn.Linear(512, 256)
        self.fc2 = nn.Linear(256, 128)
        self.fc3 = nn.Linear(128, 10)

    def forward(self, x):
        """Return class logits of shape (batch, 10) for an image batch ``x``."""
        x = F.relu(self.conv1(x))
        x = self.downsample(F.relu(self.conv2(x)))
        x = self.downsample(F.relu(self.conv3(x)))
        x = self.downsample(F.relu(self.conv4(x)))
        x = self.pool(F.relu(self.conv5(x)))

        # Flatten (batch, 512, 1, 1) -> (batch, 512) for the fully connected head
        x = x.view(x.size(0), -1)

        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

# Training function
def train_model(model, train_loader, test_loader, epochs=10, learning_rate=0.001):
    """Train ``model`` with Adam and cross-entropy, then measure test accuracy.

    The model and every batch are moved to the notebook-level ``device``.
    The mean training loss is printed after each epoch and the test accuracy
    is printed once training is complete.

    Args:
        model: an ``nn.Module`` mapping an image batch to class logits.
        train_loader: iterable of ``(images, labels)`` training batches.
        test_loader: iterable of ``(images, labels)`` evaluation batches.
        epochs: number of passes over ``train_loader``.
        learning_rate: Adam learning rate.

    Returns:
        float: test accuracy in percent (0.0 if ``test_loader`` yields no samples).
    """
    model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    for epoch in range(epochs):
        model.train()
        running_loss = 0.0
        num_batches = 0
        for images, labels in train_loader:
            images, labels = images.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            num_batches += 1

        # max(..., 1) avoids ZeroDivisionError when the loader yields no batches
        print(f"Epoch {epoch+1}, Loss: {running_loss/max(num_batches, 1)}")

    # Evaluate
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for images, labels in test_loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            predicted = outputs.argmax(dim=1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    accuracy = 100 * correct / total if total else 0.0
    print(f'Accuracy: {accuracy}%')
    # Returned as well as printed so callers can compare runs programmatically
    return accuracy

# Train with different pooling methods
# A fresh CNNModel is built for each pooling type, so no weights carry over
# between runs; every run uses the same loaders and 10 epochs.
for pooling in ['max', 'avg', 'global_avg']:
    print(f"Training with {pooling} pooling:")
    model = CNNModel(pooling_type=pooling)
    train_model(model, train_loader, test_loader, epochs=10)

Training with max pooling:
Epoch 1, Loss: 0.5437493671549917
Epoch 2, Loss: 0.3038203815407336
Epoch 3, Loss: 0.24829656190709518
Epoch 4, Loss: 0.21088486550046182
Epoch 5, Loss: 0.17996883856406662
Epoch 6, Loss: 0.15714192900981414
Epoch 7, Loss: 0.13471318129946525
Epoch 8, Loss: 0.11558979905065475
Epoch 9, Loss: 0.09618687289091808
Epoch 10, Loss: 0.08537116453651267
Accuracy: 92.92%
Training with avg pooling:
Epoch 1, Loss: 0.7426949795375246
Epoch 2, Loss: 0.4133508071494001
Epoch 3, Loss: 0.33038794565429563
Epoch 4, Loss: 0.28641402906478086
Epoch 5, Loss: 0.24663612279079872
Epoch 6, Loss: 0.22608193463242765
Epoch 7, Loss: 0.20681984317518753
Epoch 8, Loss: 0.19030366988499153
Epoch 9, Loss: 0.17493743482810348
Epoch 10, Loss: 0.15861067073399832
Accuracy: 92.82%
Training with global_avg pooling:
Epoch 1, Loss: 1.6014007796356673
Epoch 2, Loss: 0.9540242553392707
Epoch 3, Loss: 0.7963256694868938
Epoch 4, Loss: 0.7161462169084976
Epoch 5, Loss: 0.6845568948463082
Epoch 6, L

Part (b) CNN testing various combinations of kernels

In [None]:
from itertools import product

# Check for GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


class CNNBackbone(nn.Module):
    """CNN whose kernel size and per-layer filter counts are constructor arguments.

    Each entry in ``filters`` adds one stage: Conv2d('same' padding, stride 1)
    -> ReLU -> pooling. Every stage except the last uses 2x2 average pooling;
    the last uses global average pooling, so the convolutional part always
    outputs ``filters[-1]`` features per sample. A 256 -> 128 -> ``num_classes``
    MLP head follows.

    Args:
        num_classes: number of output logits.
        kernel_size: side length of every convolution kernel.
        filters: output channel count for each convolution stage, in order.

    Note:
        Each 2x2 pool halves the spatial size (rounding down), so the number of
        stages is limited by the input resolution. Five stages fit 28x28 inputs
        (28 -> 14 -> 7 -> 3 -> 1).
    """

    def __init__(self, num_classes=10, kernel_size=3, filters=(32, 64, 128, 256, 512)):
        super(CNNBackbone, self).__init__()

        stages = []
        in_channels = 1
        last_stage = len(filters) - 1
        for stage, out_channels in enumerate(filters):
            stages.append(nn.Conv2d(in_channels, out_channels, kernel_size=kernel_size,
                                    stride=1, padding='same'))
            stages.append(nn.ReLU())
            # Global pooling only at the very end; local 2x2 pooling everywhere else
            stages.append(nn.AdaptiveAvgPool2d(1) if stage == last_stage
                          else nn.AvgPool2d(kernel_size=2))
            in_channels = out_channels
        self.conv_layers = nn.Sequential(*stages)

        self.fc_layers = nn.Sequential(
            nn.Linear(in_channels, 256),
            nn.ReLU(),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Linear(128, num_classes)
        )

    def forward(self, x):
        """Return class logits of shape (batch, num_classes)."""
        x = self.conv_layers(x)
        x = torch.flatten(x, start_dim=1)
        x = self.fc_layers(x)
        return x


# Hyperparameter combinations
kernel_sizes = [3, 5]
filter_schemes = [[32, 32, 32, 32, 32], [32, 48, 64, 96, 128], [32, 64, 128, 256, 512]]
combinations = list(product(kernel_sizes, filter_schemes))

best_accuracy = 0
best_params = None
num_epochs = 10

for kernel_size, filters in combinations:
    # Fresh model and optimizer per combination so runs are independent
    model = CNNBackbone(num_classes=10, kernel_size=kernel_size, filters=filters).to(device)

    # Loss function and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    # Training loop
    for epoch in range(num_epochs):
        model.train()

        for images, labels in train_loader:
            images, labels = images.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

    # Evaluate model
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for images, labels in test_loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    accuracy = 100 * correct / total
    print(f"Kernel Size: {kernel_size}, Filters: {filters}, Test Accuracy: {accuracy:.2f}%")

    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_params = (kernel_size, filters)

# best_params stays None if no combination scored above 0%
if best_params is not None:
    print(f"Best Kernel Size: {best_params[0]}, Best Filters: {best_params[1]}, Best Accuracy: {best_accuracy:.2f}%")

Kernel Size: 3, Filters: [32, 32, 32, 32, 32], Test Accuracy: 87.53%
Kernel Size: 3, Filters: [32, 48, 64, 96, 128], Test Accuracy: 90.68%
Kernel Size: 3, Filters: [32, 64, 128, 256, 512], Test Accuracy: 92.45%
Kernel Size: 5, Filters: [32, 32, 32, 32, 32], Test Accuracy: 90.15%
Kernel Size: 5, Filters: [32, 48, 64, 96, 128], Test Accuracy: 91.92%
Kernel Size: 5, Filters: [32, 64, 128, 256, 512], Test Accuracy: 92.27%
Best Kernel Size: 3, Best Filters: [32, 64, 128, 256, 512], Best Accuracy: 92.45%


Part (c) CNN testing weight initialization methods

In [None]:
# Check for GPU
# Same expression as the `device` assignment in the data-loading cell; running
# it again simply rebinds the name and prints the choice.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Define CNN Model with Avg Pooling and Weight Initialization
class CNNModel(nn.Module):
    """Five-layer CNN with 2x2 average pooling and a selectable weight initialiser.

    NOTE: this class reuses the name ``CNNModel`` from Part (a) and therefore
    shadows that definition once this cell has run.

    Architecture: five 3x3 'same' convolutions (1 -> 32 -> 64 -> 128 -> 256 -> 512
    channels) with ReLU, average pooling after conv2..conv5, then a
    512 -> 256 -> 128 -> 10 MLP head. ``fc1`` expects 512 inputs, so the pooled
    map must be 1x1, which holds for 28x28 inputs (28 -> 14 -> 7 -> 3 -> 1).

    Args:
        init_type: weight initialisation applied to every Conv2d and Linear layer:
            'random' - uniform in [-0.1, 0.1]
            'xavier' - Xavier/Glorot uniform
            'he'     - Kaiming/He uniform for ReLU
            Biases are left at PyTorch's default initialisation.

    Raises:
        ValueError: if ``init_type`` is not one of the values above.
    """

    _INIT_TYPES = ('random', 'xavier', 'he')

    def __init__(self, init_type='random'):
        super(CNNModel, self).__init__()

        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, padding='same')
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding='same')
        self.conv3 = nn.Conv2d(64, 128, kernel_size=3, padding='same')
        self.conv4 = nn.Conv2d(128, 256, kernel_size=3, padding='same')
        self.conv5 = nn.Conv2d(256, 512, kernel_size=3, padding='same')

        self.pool = nn.AvgPool2d(2, 2)

        self.fc1 = nn.Linear(512, 256)
        self.fc2 = nn.Linear(256, 128)
        self.fc3 = nn.Linear(128, 10)

        self.initialize_weights(init_type)

    def initialize_weights(self, init_type):
        """Re-initialise the weights of all Conv2d/Linear layers in place.

        Raises:
            ValueError: for an unrecognised ``init_type``. A typo such as
                'kaiming' must not silently run the 'random' experiment.
        """
        if init_type not in self._INIT_TYPES:
            raise ValueError("Invalid init type. Choose from 'random', 'xavier', or 'he'")

        for m in self.modules():
            if isinstance(m, (nn.Conv2d, nn.Linear)):
                if init_type == 'xavier':
                    nn.init.xavier_uniform_(m.weight)
                elif init_type == 'he':
                    nn.init.kaiming_uniform_(m.weight, nonlinearity='relu')
                else:  # 'random'
                    nn.init.uniform_(m.weight, -0.1, 0.1)

    def forward(self, x):
        """Return class logits of shape (batch, 10) for an image batch ``x``."""
        x = F.relu(self.conv1(x))
        x = self.pool(F.relu(self.conv2(x)))
        x = self.pool(F.relu(self.conv3(x)))
        x = self.pool(F.relu(self.conv4(x)))
        x = self.pool(F.relu(self.conv5(x)))

        x = x.view(x.size(0), -1)  # Flatten the tensor
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

# Training function
def train_model(model, train_loader, test_loader, epochs=10, learning_rate=0.001):
    """Train ``model`` with Adam and cross-entropy, then measure test accuracy.

    The model and every batch are moved to the notebook-level ``device``.
    The mean training loss is printed after each epoch and the test accuracy
    is printed once training is complete.

    Args:
        model: an ``nn.Module`` mapping an image batch to class logits.
        train_loader: iterable of ``(images, labels)`` training batches.
        test_loader: iterable of ``(images, labels)`` evaluation batches.
        epochs: number of passes over ``train_loader``.
        learning_rate: Adam learning rate.

    Returns:
        float: test accuracy in percent (0.0 if ``test_loader`` yields no samples).
    """
    model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    for epoch in range(epochs):
        model.train()
        running_loss = 0.0
        num_batches = 0
        for images, labels in train_loader:
            images, labels = images.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            num_batches += 1

        # max(..., 1) avoids ZeroDivisionError when the loader yields no batches
        print(f"Epoch {epoch+1}, Loss: {running_loss/max(num_batches, 1)}")

    # Evaluate
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for images, labels in test_loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            predicted = outputs.argmax(dim=1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    accuracy = 100 * correct / total if total else 0.0
    print(f'Accuracy: {accuracy}%')
    # Returned as well as printed so callers can compare runs programmatically
    return accuracy

# Train with different weight initialization methods
# A new CNNModel is constructed per init scheme, so each run starts from freshly
# initialised weights; loaders and epoch count are identical across runs.
for init_type in ['random', 'xavier', 'he']:
    print(f"Training with {init_type} initialization:")
    model = CNNModel(init_type=init_type)
    train_model(model, train_loader, test_loader, epochs=10)


Using device: cuda
Training with random initialization:
Epoch 1, Loss: 0.6280890377536257
Epoch 2, Loss: 0.36096671830489435
Epoch 3, Loss: 0.2939871041171713
Epoch 4, Loss: 0.253052733647925
Epoch 5, Loss: 0.22233603148858175
Epoch 6, Loss: 0.19814434830250263
Epoch 7, Loss: 0.17855706330396728
Epoch 8, Loss: 0.16077871360520182
Epoch 9, Loss: 0.14041553848365476
Epoch 10, Loss: 0.12432889002554023
Accuracy: 93.01%
Training with xavier initialization:
Epoch 1, Loss: 0.6392001075022764
Epoch 2, Loss: 0.3734952395976479
Epoch 3, Loss: 0.30425750915366195
Epoch 4, Loss: 0.25702474469235587
Epoch 5, Loss: 0.2261471711138863
Epoch 6, Loss: 0.20126886491868287
Epoch 7, Loss: 0.18297598672820242
Epoch 8, Loss: 0.16262527127891208
Epoch 9, Loss: 0.1469826427574303
Epoch 10, Loss: 0.13254878617354485
Accuracy: 92.57%
Training with he initialization:
Epoch 1, Loss: 0.5210387435422015
Epoch 2, Loss: 0.31184293266965635
Epoch 3, Loss: 0.25017407046420487
Epoch 4, Loss: 0.21288156178174242
Epoch 5

Part (d) CNN extracted features with MLP classification

In [None]:
# Select the compute device: CUDA when PyTorch can see a GPU, otherwise the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Read the Fashion-MNIST CSV exports from the mounted Drive folder
train_df = pd.read_csv('/content/drive/MyDrive/archive/fashion-mnist_train.csv')
test_df = pd.read_csv('/content/drive/MyDrive/archive/fashion-mnist_test.csv')

# Column 0 is taken as the class label; every remaining column is pixel data,
# reshaped to (N, 1, 28, 28) and divided by 255.0
y_train, y_test = train_df.iloc[:, 0].values, test_df.iloc[:, 0].values
x_train = train_df.iloc[:, 1:].values.reshape(-1, 1, 28, 28) / 255.0
x_test = test_df.iloc[:, 1:].values.reshape(-1, 1, 28, 28) / 255.0

# Build the tensors directly on the selected device (float32 images, int64 labels)
x_train_tensor = torch.tensor(x_train, dtype=torch.float32, device=device)
y_train_tensor = torch.tensor(y_train, dtype=torch.long, device=device)
x_test_tensor = torch.tensor(x_test, dtype=torch.float32, device=device)
y_test_tensor = torch.tensor(y_test, dtype=torch.long, device=device)

# Pair images with labels and batch them; only the training split is shuffled
train_dataset = TensorDataset(x_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

test_dataset = TensorDataset(x_test_tensor, y_test_tensor)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

# Define CNN Feature Extractor
class CNNFeatureExtractor(nn.Module):
    """Convolutional trunk that turns an image batch into flat feature vectors.

    Five 3x3 'same' convolutions widen the channels 1 -> 32 -> 64 -> 128 -> 256
    -> 512, each followed by ReLU. 2x2 average pooling is applied after every
    convolution except the first. The output is the pooled activation map
    flattened per sample, with no classification head.
    """

    def __init__(self):
        super().__init__()

        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, padding='same')
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding='same')
        self.conv3 = nn.Conv2d(64, 128, kernel_size=3, padding='same')
        self.conv4 = nn.Conv2d(128, 256, kernel_size=3, padding='same')
        self.conv5 = nn.Conv2d(256, 512, kernel_size=3, padding='same')

        self.pool = nn.AvgPool2d(2, 2)

        self._initialize_weights()

    def _initialize_weights(self):
        """Kaiming-normal (fan_out, ReLU) conv weights; conv biases set to zero."""
        for layer in self.modules():
            if not isinstance(layer, nn.Conv2d):
                continue
            nn.init.kaiming_normal_(layer.weight, mode='fan_out', nonlinearity='relu')
            if layer.bias is not None:
                nn.init.zeros_(layer.bias)

    def forward(self, x):
        """Return features of shape (batch, C*H*W) taken from the last pooled map."""
        # The first convolution keeps full resolution (no pooling)
        x = F.relu(self.conv1(x))

        # Remaining stages: conv -> ReLU -> 2x2 average pool
        for conv in (self.conv2, self.conv3, self.conv4, self.conv5):
            x = self.pool(F.relu(conv(x)))

        # Collapse everything but the batch dimension
        return x.flatten(1)

# Train CNN Feature Extractor
def train_cnn(model, train_loader, epochs=10, learning_rate=0.001, num_classes=10):
    """Train a feature extractor on class labels through a temporary linear head.

    ``model`` outputs feature vectors, not class scores. Applying cross-entropy
    directly to those features would treat each feature dimension as a class
    logit. Instead, a ``Linear(feature_dim, num_classes)`` head is attached for
    training only: it is optimised jointly with ``model`` and discarded
    afterwards, so ``model`` itself still outputs features.

    The model, head and batches are placed on the notebook-level ``device``.
    The mean training loss is printed after each epoch.

    Args:
        model: ``nn.Module`` mapping an image batch to (batch, feature_dim) features.
        train_loader: iterable of ``(images, labels)`` batches.
        epochs: number of passes over ``train_loader``.
        learning_rate: Adam learning rate.
        num_classes: number of target classes for the temporary head.

    Returns:
        None. ``model`` is updated in place.
    """
    model.to(device)
    criterion = nn.CrossEntropyLoss()

    # One dry-run batch reveals the feature width, so the head fits any extractor
    try:
        sample_images, _ = next(iter(train_loader))
    except StopIteration:
        print("train_cnn: train_loader is empty; nothing to train.")
        return
    model.eval()
    with torch.no_grad():
        feature_dim = model(sample_images.to(device)).shape[1]

    head = nn.Linear(feature_dim, num_classes).to(device)
    optimizer = optim.Adam(list(model.parameters()) + list(head.parameters()),
                           lr=learning_rate)

    for epoch in range(epochs):
        model.train()
        running_loss = 0.0
        num_batches = 0
        for images, labels in train_loader:
            images, labels = images.to(device), labels.to(device)
            optimizer.zero_grad()
            logits = head(model(images))
            loss = criterion(logits, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            num_batches += 1

        print(f"Epoch {epoch+1}, Loss: {running_loss/max(num_batches, 1)}")

# Initialize and train CNN Feature Extractor
# train_cnn has no return value that is used here; the trained weights live in
# `feature_extractor`, which is reused below for feature extraction.
feature_extractor = CNNFeatureExtractor().to(device)
train_cnn(feature_extractor, train_loader, epochs=10)

# Extract features using CNN
def extract_features(model, data_loader):
    """Run ``model`` over every batch and gather outputs and labels as NumPy arrays.

    The model is switched to eval mode and gradients are disabled. Inputs are
    moved to the notebook-level ``device``; results are copied back to the CPU.

    Returns:
        (features, labels): per-batch model outputs and labels, concatenated
        along the first axis in the order the loader produced them.
    """
    model.eval()
    feature_chunks, label_chunks = [], []
    with torch.no_grad():
        for batch_images, batch_labels in data_loader:
            batch_output = model(batch_images.to(device))
            feature_chunks.append(batch_output.cpu().numpy())
            label_chunks.append(batch_labels.cpu().numpy())
    return np.concatenate(feature_chunks), np.concatenate(label_chunks)

# Push both splits through the trained extractor
train_features, train_labels = extract_features(feature_extractor, train_loader)
test_features, test_labels = extract_features(feature_extractor, test_loader)

# The NumPy MLP below multiplies weights on the left (W @ A), so it expects one
# column per sample: transpose features to (n_features, n_samples) ...
train_features, test_features = train_features.T, test_features.T

# ... and one-hot encode the labels, transposed to (10, n_samples)
train_labels = to_categorical(train_labels, num_classes=10).T
test_labels = to_categorical(test_labels, num_classes=10).T

# ======================================================
# MLP Classifier
# ======================================================

# Network shape: CNN features -> five hidden layers of 256 units -> 10 outputs
hidden_layers = 5
input_dim = train_features.shape[0]  # rows of the transposed feature matrix
hidden_neurons = [256, 256, 256, 256, 256]
output_dim = 10

# One activation name per layer. Hidden layers may use "relu", "leaky_relu",
# "tanh" or "gelu"; the output layer always uses "softmax".
activations = ['relu', 'relu', 'relu', 'relu', 'relu', 'softmax']

# Layer widths from input to output
layers_dims = [input_dim] + hidden_neurons + [output_dim]

# Weights are Gaussian, scaled by sqrt(2 / fan_in) for ReLU-family hidden layers
# and sqrt(1 / fan_in) otherwise; biases start at zero.
weights = []
biases = []
for i, (fan_in, fan_out) in enumerate(zip(layers_dims[:-1], layers_dims[1:])):
    relu_family = i < len(activations) - 1 and activations[i].lower() in ['relu', 'leaky_relu']
    gain = 2.0 if relu_family else 1.0
    W = np.random.randn(fan_out, fan_in) * np.sqrt(gain / fan_in)
    b = np.zeros((fan_out, 1))
    weights.append(W)
    biases.append(b)

# Activation Functions
def activation(x, func="relu", alpha=0.01):
    """Apply the named activation to ``x`` element-wise (softmax: per column).

    Args:
        x: NumPy array of pre-activations.
        func: case-insensitive name: "relu", "leaky_relu", "tanh", "gelu"
            or "softmax". Softmax normalises along axis 0.
        alpha: negative-side slope, used only by "leaky_relu".

    Raises:
        ValueError: if ``func`` is not one of the supported names.
    """
    kind = func.lower()

    def _softmax(v):
        # Subtract the column maximum before exponentiating to avoid overflow
        exps = np.exp(v - np.max(v, axis=0, keepdims=True))
        return exps / np.sum(exps, axis=0, keepdims=True)

    dispatch = {
        "relu": lambda v: np.maximum(0, v),
        "leaky_relu": lambda v: np.where(v > 0, v, alpha * v),
        "tanh": lambda v: np.tanh(v),
        "gelu": lambda v: 0.5 * v * (1 + scipy.special.erf(v / np.sqrt(2))),
        "softmax": _softmax,
    }
    if kind not in dispatch:
        raise ValueError("Unsupported activation function: " + kind)
    return dispatch[kind](x)

def activation_derivative(z, func="relu", alpha=0.01):
    """Element-wise derivative of the named hidden-layer activation at ``z``.

    Args:
        z: NumPy array of pre-activations.
        func: case-insensitive name: "relu", "leaky_relu", "tanh" or "gelu".
            "softmax" is not handled here.
        alpha: negative-side slope, used only by "leaky_relu".

    Raises:
        ValueError: if ``func`` is not one of the supported names.
    """
    kind = func.lower()

    if kind == "relu":
        return (z > 0).astype(float)

    if kind == "leaky_relu":
        return np.where(z > 0, 1.0, alpha)

    if kind == "tanh":
        squashed = np.tanh(z)
        return 1 - squashed**2

    if kind == "gelu":
        # d/dz [z * Phi(z)] = Phi(z) + z * phi(z), with Phi/phi the standard normal CDF/PDF
        cdf_term = 0.5 * (1 + scipy.special.erf(z / np.sqrt(2)))
        pdf_term = z * np.exp(-0.5 * z**2) / np.sqrt(2 * np.pi)
        return cdf_term + pdf_term

    raise ValueError("Unsupported activation function for derivative: " + kind)

# Forward Pass
def forward(X, weights, biases):
    """Forward pass of the MLP, caching every layer's input and pre-activation.

    Hidden layer ``i`` uses the activation named in the notebook-level
    ``activations[i]``; the last layer always uses softmax.

    Args:
        X: input matrix with one column per sample.
        weights: list of weight matrices, one per layer.
        biases: list of bias column vectors, one per layer.

    Returns:
        (output, activations_list, pre_activations_list) where
        ``activations_list[0]`` is ``X`` followed by each layer's output, and
        ``pre_activations_list`` holds each layer's ``W @ A + b``.
    """
    activations_list = [X]
    pre_activations_list = []

    def _run_layer(index, inputs, kind):
        # Affine transform, then non-linearity; both results are cached
        pre_activation = np.dot(weights[index], inputs) + biases[index]
        pre_activations_list.append(pre_activation)
        output = activation(pre_activation, kind)
        activations_list.append(output)
        return output

    current = X
    for index in range(len(weights) - 1):
        current = _run_layer(index, current, activations[index])
    current = _run_layer(-1, current, "softmax")

    return current, activations_list, pre_activations_list

# Loss Function (Cross-Entropy)
def CE_loss(y, y_pred):
    """Mean cross-entropy between targets ``y`` and predicted probabilities.

    Both arrays are laid out with classes along axis 0 (one column per sample).
    Predictions are clipped to [1e-12, 1 - 1e-12] so ``log`` never sees 0.
    """
    epsilon = 1e-12
    safe_pred = np.clip(y_pred, epsilon, 1.0 - epsilon)
    per_sample = -(y * np.log(safe_pred)).sum(axis=0)
    return per_sample.mean()

def loss(X, y, weights, biases):
    """Cross-entropy of the network's predictions for ``X`` against targets ``y``."""
    predictions = forward(X, weights, biases)[0]
    return CE_loss(y, predictions)

# Backpropagation
def backprop(weights, biases, X, y, learning_rate=0.01):
    """One gradient-descent step on a batch; updates the parameters in place.

    Args:
        weights, biases: per-layer parameter lists (the arrays are modified in place).
        X: batch inputs, one column per sample.
        y: one-hot targets, one column per sample.
        learning_rate: step size.

    Returns:
        The same ``weights`` and ``biases`` lists after the update.
    """
    m = X.shape[1]
    y_pred, activations_list, pre_activations_list = forward(X, weights, biases)

    num_layers = len(weights)
    grad_W = [None] * num_layers
    grad_b = [None] * num_layers

    # Softmax combined with cross-entropy: the output-layer error is prediction - target
    delta = y_pred - y

    for layer in range(num_layers - 1, -1, -1):
        grad_W[layer] = np.dot(delta, activations_list[layer].T) / m
        grad_b[layer] = np.sum(delta, axis=1, keepdims=True) / m

        if layer != 0:
            # Push the error through this layer's weights, then through the
            # previous layer's activation function
            upstream = np.dot(weights[layer].T, delta)
            delta = upstream * activation_derivative(pre_activations_list[layer - 1],
                                                     func=activations[layer - 1])

    # Apply all updates only after every gradient has been computed
    for layer in range(num_layers):
        weights[layer] -= learning_rate * grad_W[layer]
        biases[layer] -= learning_rate * grad_b[layer]

    return weights, biases

# Training Loop
def train(X, y, weights, biases, epochs=100, learning_rate=0.01, batch_size=128):
    """Mini-batch gradient descent for the NumPy MLP.

    Each epoch reshuffles the sample columns, then calls ``backprop`` on
    consecutive slices of ``batch_size`` columns. Every 10th epoch (starting
    at 0) the loss over the full ``X``/``y`` is printed.

    Returns:
        The trained ``(weights, biases)``.
    """
    m = X.shape[1]
    for epoch in range(epochs):
        order = np.random.permutation(m)
        X_shuffled, y_shuffled = X[:, order], y[:, order]

        for start in range(0, m, batch_size):
            batch = slice(start, start + batch_size)
            weights, biases = backprop(
                weights, biases, X_shuffled[:, batch], y_shuffled[:, batch], learning_rate
            )

        if epoch % 10 == 0:
            print(f"Epoch {epoch}, Loss: {loss(X, y, weights, biases):.4f}")

    return weights, biases

# Train MLP Classifier
# Fits the NumPy MLP on the transposed CNN features; `weights`/`biases` are
# rebound to the values returned by train().
weights, biases = train(train_features, train_labels, weights, biases, epochs=100, learning_rate=0.01, batch_size=128)

# Evaluate MLP Classifier
def evaluate_mlp(X, y, weights, biases):
    """Fraction of samples whose arg-max prediction matches the one-hot target.

    ``X`` and ``y`` hold one column per sample; classes are compared along axis 0.
    """
    probabilities = forward(X, weights, biases)[0]
    predicted_classes = np.argmax(probabilities, axis=0)
    true_classes = np.argmax(y, axis=0)
    return np.mean(predicted_classes == true_classes)

# evaluate_mlp returns a fraction in [0, 1]; it is scaled to percent for display
test_accuracy = evaluate_mlp(test_features, test_labels, weights, biases)
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")

Using device: cuda
Epoch 1, Loss: 1.340407866531852
Epoch 2, Loss: 1.0304028465231854
Epoch 3, Loss: 0.9625205767752011
Epoch 4, Loss: 0.9269135058370989
Epoch 5, Loss: 0.8972541771845014
Epoch 6, Loss: 0.8759593098466076
Epoch 7, Loss: 0.8544583037686246
Epoch 8, Loss: 0.8364871701896827
Epoch 9, Loss: 0.81949006724777
Epoch 10, Loss: 0.8075548625990018
Epoch 0, Loss: 0.2769
Epoch 10, Loss: 0.1508
Epoch 20, Loss: 0.1354
Epoch 30, Loss: 0.1351
Epoch 40, Loss: 0.1389
Epoch 50, Loss: 0.1353
Epoch 60, Loss: 0.1341
Epoch 70, Loss: 0.1304
Epoch 80, Loss: 0.1347
Epoch 90, Loss: 0.1294
Test Accuracy: 93.27%
