In [None]:
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
from IPython.core.debugger import set_trace
import warnings
warnings.filterwarnings('ignore')
import os
from typing import List
from tqdm import tqdm
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split

# Load data
# Paths are relative to the notebook's working directory.
train_df = pd.read_csv('archive/sign_mnist_train.csv')
test_df = pd.read_csv('archive/sign_mnist_test.csv')

# Split data into features and labels
# Features are every column except the first; labels come from the 'label' column
# (presumably that first column — confirm against the CSV header).
x_train = train_df.iloc[:, 1:].values
x_test = test_df.iloc[:, 1:].values
y_train = train_df['label'].values
y_test = test_df['label'].values

# Standardize data
# Per-column statistics are taken from the training set only and reused for the
# test set, so no test-set information enters the scaling.
mean = np.mean(x_train, axis=0)
std = np.std(x_train, axis=0)

x_train = (x_train - mean) / (std + 1e-8)  # Adding a small value to avoid division by zero
x_test = (x_test - mean) / (std + 1e-8)


# Reshape data for MLP
# One row of 28*28 = 784 values per sample, stored as float32.
x_train_mlp = x_train.reshape(-1, 28*28).astype(np.float32)
x_test_mlp = x_test.reshape(-1, 28*28).astype(np.float32)


# Convert labels to one-hot encoded format
# Indexing the identity matrix by label picks the matching one-hot row, so every
# label must be an integer in [0, num_classes).
num_classes = 26
y_train_encoded = np.eye(num_classes)[y_train]
y_test_encoded = np.eye(num_classes)[y_test]


In [None]:
class NeuralNetLayer:
    """Common base for all layers.

    Subclasses implement ``forward`` and ``backward``. Two attributes are shared:
    ``parameters`` (stays ``None`` unless a subclass assigns learnable arrays)
    and ``gradient`` (stays ``None`` until a subclass stores something in it).
    """

    def __init__(self):
        self.parameters = None
        self.gradient = None

    def forward(self, x):
        """Compute the layer output for ``x``; must be overridden."""
        raise NotImplementedError()

    def backward(self, gradient):
        """Propagate ``gradient`` back through the layer; must be overridden."""
        raise NotImplementedError()
    
class LinearLayer(NeuralNetLayer):
    """Fully connected layer computing ``x @ w.T + b``.

    ``w`` has shape ``(output_size, input_size)`` and ``b`` has shape
    ``(output_size,)``; both are exposed through ``parameters`` so an optimizer
    can update them in place.
    """

    def __init__(self, input_size, output_size):
        super().__init__()
        self.ni, self.no = input_size, output_size
        # He initialization: standard-normal draws scaled by sqrt(2 / fan_in).
        he_scale = np.sqrt(2. / input_size)
        self.w = np.random.randn(output_size, input_size) * he_scale
        # Biases are drawn from a standard normal as well.
        self.b = np.random.randn(output_size)
        self.cur_input = None
        self.parameters = [self.w, self.b]

    def forward(self, x):
        """Cache ``x`` for the backward pass and return the affine output."""
        self.cur_input = x
        projected = x @ self.w.T
        return projected + self.b

    def backward(self, gradient):
        """Store ``[dw, db]`` in ``self.gradient`` and return the gradient w.r.t. the input."""
        assert self.cur_input is not None, "Must call forward before backward"
        weight_grad = gradient.T @ self.cur_input
        bias_grad = gradient.sum(axis=0)
        self.gradient = [weight_grad, bias_grad]
        return gradient @ self.w
    
class ReLULayer(NeuralNetLayer):
    """Element-wise ``max(0, x)`` activation."""

    def __init__(self):
        super().__init__()

    def forward(self, x):
        """Return ``max(0, x)`` and remember the 0/1 derivative mask in ``self.gradient``."""
        is_positive = x > 0
        # Derivative is 1 where the input was strictly positive, 0 elsewhere.
        self.gradient = np.where(is_positive, 1.0, 0.0)
        return np.maximum(0, x)

    def backward(self, gradient):
        """Scale the incoming gradient by the mask stored during ``forward``."""
        assert self.gradient is not None, "Must call forward before backward"
        local_derivative = self.gradient
        return gradient * local_derivative
    
class SoftmaxOutputLayer(NeuralNetLayer):
    """Softmax over the last axis, used as the final layer of the network."""

    def __init__(self):
        super().__init__()
        self.cur_probs = None

    def forward(self, x):
        """Return the softmax of ``x`` along the last axis and cache it."""
        # Subtracting the per-row maximum keeps the exponentials from overflowing.
        shifted = x - np.max(x, axis=-1, keepdims=True)
        exps = np.exp(shifted)
        normaliser = np.sum(exps, axis=-1, keepdims=True)
        self.cur_probs = exps / normaliser
        return self.cur_probs

    def backward(self, target):
        """Return ``probs - target``, the value passed on to the preceding layer.

        Unlike the other layers, the argument here is the target array rather
        than an upstream gradient.
        """
        assert self.cur_probs is not None, "Must call forward before backward"
        probabilities = self.cur_probs
        return probabilities - target
    
class MLP:
    """Sequential network assembled from layer objects.

    Layers are applied in the given order by ``forward`` and in reverse order
    by ``backward``. Each layer is expected to provide ``forward``/``backward``
    and the ``parameters``/``gradient`` attributes read by the optimizers.
    """

    def __init__(self, *args: "NeuralNetLayer"):
        # Stored as the tuple of positional arguments; optimizers iterate over it.
        self.layers = args

    def forward(self, x):
        """Feed ``x`` through every layer in order and return the final output."""
        for layer in self.layers:
            x = layer.forward(x)
        return x

    def backward(self, target):
        """Back-propagate through the layers in reverse order.

        ``target`` is passed to the last layer's ``backward``; each return value
        becomes the argument for the layer before it. Gradients are left on the
        layers themselves and nothing is returned.
        """
        for layer in self.layers[::-1]:
            target = layer.backward(target)

    def fit(self, x, y, x_val, y_val, learning_rate, num_iterations, optimizer, batch_size=26, decaying_lr=True, verbose=True):
        """Train with shuffled mini-batches and track per-epoch metrics.

        Args:
            x, y: training inputs and targets; the class index of each target
                row is read with ``argmax`` over axis 1.
            x_val, y_val: validation inputs and targets in the same layout.
            learning_rate: base learning rate used by the decay schedule.
            num_iterations: number of epochs.
            optimizer: object with a ``step()`` method and an ``lr`` attribute.
            batch_size: number of samples per mini-batch.
            decaying_lr: when true, epoch ``e`` (0-based) runs with
                ``learning_rate * 0.95 ** e``; when false ``optimizer.lr`` is
                left untouched.
            verbose: print one summary line per epoch.

        Returns:
            ``(training_accuracy, training_acc_history, training_loss_history,
            val_acc_history, val_loss_history)`` where ``training_accuracy`` is
            the value from the last epoch (``0.0`` if no epoch ran). Both loss
            histories hold per-sample mean losses, so they share one scale.
        """
        sample_count = len(x)
        training_acc_history = []
        training_loss_history = []
        val_acc_history = []
        val_loss_history = []
        decay_rate = 0.95
        # Defined up front so the return statement is valid with zero epochs.
        training_accuracy = 0.0

        for epoch in range(num_iterations):
            # Set the rate before the epoch runs, so the first epoch uses the
            # undecayed rate and each later epoch is decayed exactly once.
            if decaying_lr:
                optimizer.lr = learning_rate * (decay_rate ** epoch)

            weighted_loss_sum = 0.0
            predictions = []

            indices = np.random.permutation(sample_count)
            random_x = x[indices]
            random_y = y[indices]

            for i in range(0, sample_count, batch_size):
                end = min(i + batch_size, sample_count)
                x_batch = random_x[i:end]
                y_batch = random_y[i:end]

                # Forward pass, loss, then gradient step on this batch.
                prediction = self.forward(x_batch)
                # compute_loss averages over the batch; weight it by the batch
                # size so a short final batch does not skew the epoch mean.
                weighted_loss_sum += compute_loss(prediction, y_batch) * len(x_batch)

                self.backward(y_batch)
                optimizer.step()

                predictions.extend(np.argmax(prediction, axis=-1))

            train_iter_loss = weighted_loss_sum / max(sample_count, 1)

            # Training accuracy from the predictions made while training.
            training_accuracy = self.evaluate_acc(np.array(predictions), np.argmax(random_y, axis=1))
            training_acc_history.append(training_accuracy)
            training_loss_history.append(train_iter_loss)

            # Validation is evaluated once per epoch with the updated weights;
            # a single forward pass supplies both the loss and the predictions.
            val_probabilities = self.forward(x_val)
            val_iter_loss = compute_loss(val_probabilities, y_val)
            val_loss_history.append(val_iter_loss)
            val_acc = self.evaluate_acc(np.argmax(val_probabilities, axis=1), np.argmax(y_val, axis=1))
            val_acc_history.append(val_acc)

            if verbose:
                print(f"Epoch {epoch + 1}/{num_iterations} - Training Loss: {train_iter_loss:.4f} - Training Accuracy: {training_accuracy:.4f} - Validation Loss: {val_iter_loss:.4f} - Validation Accuracy: {val_acc:.4f}")

        return training_accuracy, training_acc_history, training_loss_history, val_acc_history, val_loss_history

    def evaluate_acc(self, y_pred, y_true):
        """Return the fraction of positions where ``y_pred`` equals ``y_true``."""
        acc = np.mean(y_pred == y_true)
        return acc

    def predict(self, X):
        """Return the index of the largest output along axis 1 for each row of ``X``."""
        predictions = self.forward(X)
        return np.argmax(predictions, axis=1)

def compute_loss(y_pred, y_true):
    """Cross-entropy between ``y_pred`` and ``y_true``, averaged over ``len(y_true)``.

    A small constant is added to ``y_pred`` before the logarithm so that a
    predicted probability of exactly zero does not produce ``-inf``.
    """
    epsilon = 1e-8
    log_probs = np.log(y_pred + epsilon)
    weighted_total = np.sum(y_true * log_probs)
    return (-weighted_total) / len(y_true)

class Optimizer:
    """Base optimizer: walks the network's layers and delegates to ``update``."""

    def __init__(self, net: MLP):
        self.net = net

    def step(self):
        """Call ``update`` for every layer that has parameters, last layer first."""
        for layer in reversed(self.net.layers):
            if layer.parameters is None:
                # Parameter-free layers have nothing to update.
                continue
            self.update(layer.parameters, layer.gradient)

    def update(self, params, gradient):
        """Apply one update to ``params`` given ``gradient``; must be overridden."""
        raise NotImplementedError()


class GradientDescentOptimizer(Optimizer):
    """Plain gradient descent: ``p <- p - lr * g`` for every parameter."""

    def __init__(self, net: MLP, lr: float):
        super().__init__(net)
        self.lr = lr

    def update(self, params, gradient):
        """Subtract ``lr * gradient`` from each parameter in place."""
        for param, grad in zip(params, gradient):
            delta = self.lr * grad
            # In-place so the layer's own arrays are the ones modified.
            param -= delta

class L2GradientDescentOptimizer(Optimizer):
    """Gradient descent with an L2 penalty term added to every gradient."""

    def __init__(self, net: MLP, lr: float, l2_lambda=0.001):
        super().__init__(net)
        self.lr = lr
        self.l2_lambda = l2_lambda

    def update(self, params, gradient):
        """Update a ``(weights, biases)`` pair in place.

        Both the weights and the biases receive the ``l2_lambda * value``
        penalty term.
        """
        w, b = params
        grad_w, grad_b = gradient
        weight_step = self.lr * (grad_w + self.l2_lambda * w)
        w -= weight_step
        bias_step = self.lr * (grad_b + self.l2_lambda * b)
        b -= bias_step

class AdamOptimizer(Optimizer):
    """Adam optimizer with per-parameter first and second moment estimates.

    ``step`` advances the time step once and then updates each parameterised
    layer exactly once. ``update`` touches only the parameters it is given, and
    bias correction is applied a single time, through ``m_hat`` and ``v_hat``.
    """

    def __init__(self, net: MLP, lr: float, beta1=0.9, beta2=0.999, epsilon=1e-8):
        super().__init__(net)
        self.lr = lr
        self.beta1 = beta1
        self.beta2 = beta2
        self.epsilon = epsilon
        # Moment buffers, one pair per parameter array, shaped like the parameter.
        self.m = []
        self.v = []
        # Maps id(parameter array) -> index into self.m / self.v. The arrays are
        # updated in place and kept alive by their layers, so identity is stable.
        self._slot_of = {}
        for layer in self.net.layers:
            if layer.parameters is not None:
                for param in layer.parameters:
                    self._register(param)
        self.t = 0

    def _register(self, param):
        """Allocate zeroed moment buffers for ``param`` and return their index."""
        slot = len(self.m)
        self._slot_of[id(param)] = slot
        self.m.append(np.zeros_like(param))
        self.v.append(np.zeros_like(param))
        return slot

    def step(self):
        """Advance the time step once, then update every parameterised layer."""
        self.t += 1
        super().step()

    def update(self, params, gradient):
        """Apply one Adam update, in place, to the given parameters only.

        Args:
            params: parameter arrays of a single layer.
            gradient: matching gradients, in the same order.
        """
        # If called directly before any step(), treat this as the first step so
        # the bias-correction denominators are non-zero.
        t = max(self.t, 1)
        correction1 = 1 - self.beta1 ** t
        correction2 = 1 - self.beta2 ** t

        for p, g in zip(params, gradient):
            slot = self._slot_of.get(id(p))
            if slot is None:
                # Parameter added after construction: create its buffers lazily.
                slot = self._register(p)
            m = self.m[slot]
            v = self.v[slot]
            m[:] = self.beta1 * m + (1 - self.beta1) * g
            v[:] = self.beta2 * v + (1 - self.beta2) * (g ** 2)

            m_hat = m / correction1
            v_hat = v / correction2

            p -= self.lr * m_hat / (np.sqrt(v_hat) + self.epsilon)

            
# Widths of the first and second hidden layers; read by the Experiment 3 cell
# when it builds its two-hidden-layer MLP.
hidden_layer1 = 256
hidden_layer2 = 256

def _plot_training_curves(model_name, num_iterations, train_loss_history, val_loss_history, train_acc_history, val_acc_history):
    """Draw loss (left axis) and accuracy (right axis) per epoch and return the figure."""
    epochs = range(1, num_iterations + 1)
    fig, ax1 = plt.subplots()

    color = 'tab:red'
    ax1.set_xlabel('Epoch')
    ax1.set_ylabel('Loss', color=color)
    ax1.plot(epochs, train_loss_history, color=color, label='Training Loss')
    ax1.plot(epochs, val_loss_history, color='tab:orange', label='Validation Loss')
    ax1.tick_params(axis='y', labelcolor=color)

    ax2 = ax1.twinx()  # second y-axis sharing the same x-axis

    color = 'tab:blue'
    ax2.set_ylabel('Accuracy', color=color)
    ax2.plot(epochs, train_acc_history, color=color, label='Training Accuracy')
    ax2.plot(epochs, val_acc_history, color='tab:green', label='Validation Accuracy')
    ax2.tick_params(axis='y', labelcolor=color)

    # One combined legend for the curves on both axes.
    lines, labels = ax1.get_legend_handles_labels()
    lines2, labels2 = ax2.get_legend_handles_labels()
    ax2.legend(lines + lines2, labels + labels2, loc='upper left')

    plt.title(f'Training Loss and Accuracy - {model_name}')
    fig.tight_layout()
    return fig


def train_and_plot(mlp_model, model_name, x_train, y_train, x_test, y_test, **kwargs):
    """Train ``mlp_model``, evaluate it on the test set and plot its training curves.

    10% of the training data is held out as a validation set before training.

    Args:
        mlp_model: network exposing ``fit``, ``predict`` and ``evaluate_acc``.
        model_name: used in the plot title and as the PNG file name.
        x_train, y_train: training inputs and targets.
        x_test, y_test: test inputs and targets; the class index is read with
            ``argmax`` over axis 1 of ``y_test``.
        **kwargs: optional settings —
            ``num_iterations`` (default 50), ``learning_rate`` (0.01),
            ``optimizer`` (``"adam"``, ``"l2_gradient_descent"``; any other
            value selects plain gradient descent), ``decaying_lr`` (True),
            ``verbose`` (True), ``batch_size`` (26), ``l2_lambda`` (0.001),
            ``random_state`` (None; pass an int for a repeatable validation
            split).

    Returns:
        Test-set accuracy as a fraction.

    The figure is always created; it is saved to ``"<model_name>.png"`` and
    shown only when ``verbose`` is true.
    """
    num_iterations = kwargs.get('num_iterations', 50)
    learning_rate = kwargs.get('learning_rate', 0.01)
    optimizer_type = kwargs.get('optimizer', 'gradient_descent')
    decaying_lr = kwargs.get('decaying_lr', True)
    verbose = kwargs.get('verbose', True)
    batch_size = kwargs.get('batch_size', 26)
    l2_lambda = kwargs.get('l2_lambda', 0.001)
    random_state = kwargs.get('random_state', None)

    # Pick the optimizer from its name; unknown names fall back to plain gradient descent.
    if optimizer_type == "adam":
        optimizer = AdamOptimizer(mlp_model, learning_rate)
    elif optimizer_type == "l2_gradient_descent":
        optimizer = L2GradientDescentOptimizer(mlp_model, learning_rate, l2_lambda=l2_lambda)
    else:
        optimizer = GradientDescentOptimizer(mlp_model, learning_rate)

    x_train, x_val, y_train, y_val = train_test_split(
        x_train, y_train, test_size=0.1, random_state=random_state
    )

    # The first element of the returned tuple (final training accuracy) is not needed here.
    _, train_acc_history, train_loss_history, val_acc_history, val_loss_history = mlp_model.fit(
        x_train, y_train, x_val, y_val, learning_rate, num_iterations, optimizer,
        batch_size=batch_size, decaying_lr=decaying_lr, verbose=verbose
    )

    # Evaluate the model on the test set
    y_pred = mlp_model.predict(x_test)
    y_test_indices = np.argmax(y_test, axis=1)
    acc = mlp_model.evaluate_acc(y_pred, y_test_indices)

    if verbose:
        print(f"Test Accuracy: {acc*100}%")

    _plot_training_curves(
        model_name, num_iterations,
        train_loss_history, val_loss_history,
        train_acc_history, val_acc_history,
    )
    if verbose:
        plt.savefig(f"{model_name}.png")
        plt.show()

    return acc

def train_with_different_hidden_units(x_train, y_train, x_test, y_test, num_iterations, learning_rate):
    """Compare MLPs with zero, one and two hidden layers across several widths.

    The no-hidden-layer baseline does not depend on the width, so it is built
    and trained once. For each width in ``[32, 64, 128, 256]`` a fresh
    one-hidden-layer and two-hidden-layer model is trained. The best test
    accuracy and the accuracies of every trained model are printed.

    Args:
        x_train, y_train: training inputs and targets.
        x_test, y_test: test inputs and targets.
        num_iterations: number of epochs per model.
        learning_rate: learning rate per model.
    """
    hidden_units_options = [32, 64, 128, 256]
    best_acc = 0
    best_model = None
    best_hidden_units = 0
    # Collected across all widths so the final summary lists every model.
    accuracies = []

    # Baseline: trained exactly once, outside the width loop, so it receives
    # the same number of epochs as every other model.
    mlp_no_hidden_layer = MLP(
        LinearLayer(x_train.shape[-1], num_classes),
        SoftmaxOutputLayer()
    )
    print("\nTraining model with no hidden layer...")
    acc = train_and_plot(mlp_no_hidden_layer, "No Hidden Layer", x_train, y_train, x_test, y_test, num_iterations=num_iterations, learning_rate=learning_rate, verbose=False)
    accuracies.append((acc, "No Hidden Layer"))
    if acc > best_acc:
        best_acc = acc
        best_model = "No Hidden Layer"
        best_hidden_units = 0

    for units in hidden_units_options:
        print(f"Training models with {units} hidden units.")

        mlp_one_hidden_layer = MLP(
            LinearLayer(x_train.shape[-1], units),
            ReLULayer(),
            LinearLayer(units, num_classes),
            SoftmaxOutputLayer()
        )

        mlp_two_hidden_layer = MLP(
            LinearLayer(x_train.shape[-1], units),
            ReLULayer(),
            LinearLayer(units, units),  # second hidden layer has the same width as the first
            ReLULayer(),
            LinearLayer(units, num_classes),
            SoftmaxOutputLayer()
        )

        models = [
            (mlp_one_hidden_layer, f"One Hidden Layer with {units} Units"),
            (mlp_two_hidden_layer, f"Two Hidden Layers with {units} Units")
        ]

        for model, name in models:
            print(f"\nTraining {name}...")
            acc = train_and_plot(model, name, x_train, y_train, x_test, y_test, num_iterations=num_iterations, learning_rate=learning_rate, verbose=False)
            accuracies.append((acc, name))
            if acc > best_acc:
                best_acc = acc
                best_model = name
                best_hidden_units = units

    print(f"\nBest model: {best_model} with {best_hidden_units} hidden units. Test Accuracy: {best_acc*100}%")
    # print the accuracies in a column format
    print("\nAccuracies:")
    for acc, name in accuracies:
        print(f"{name}: {acc*100:.2f}%")
        
# Experiment 1: run the hidden-layer/width comparison on the flattened inputs
# and one-hot labels, 3 epochs per model at learning rate 0.01.
train_with_different_hidden_units(x_train_mlp, y_train_encoded, x_test_mlp, y_test_encoded, num_iterations=3, learning_rate=0.01)


In [None]:
# Experiment 2
# Define additional layers: Sigmoid and Leaky ReLU activations, plus a dropout layer

class SigmoidLayer(NeuralNetLayer):
    """Element-wise logistic sigmoid activation."""

    def __init__(self):
        super().__init__()
        # Output of the most recent forward pass; needed for the derivative.
        self.cur_output = None

    def forward(self, x):
        """Return ``1 / (1 + exp(-x))`` and cache it for ``backward``."""
        self.cur_output = 1 / (1 + np.exp(-x))
        return self.cur_output

    def backward(self, gradient):
        """Scale ``gradient`` by the sigmoid derivative ``s * (1 - s)``."""
        assert self.cur_output is not None, "Must call forward before backward"
        return gradient * self.cur_output * (1 - self.cur_output)

class LeakyReLULayer(NeuralNetLayer):
    """Leaky ReLU: ``x`` where ``x > 0``, ``alpha * x`` elsewhere."""

    def __init__(self, alpha=0.01):
        super().__init__()
        self.alpha = alpha
        # Input of the most recent forward pass; decides the slope in backward.
        self.cur_input = None

    def forward(self, x):
        """Apply the activation and cache ``x`` for ``backward``."""
        self.cur_input = x
        return np.where(x > 0, x, self.alpha * x)

    def backward(self, gradient):
        """Scale ``gradient`` by 1 where the input was positive, ``alpha`` elsewhere.

        The slope uses the same ``x > 0`` test as ``forward``, so the derivative
        matches the branch actually taken (including at exactly zero). It is
        built with float literals, so ``alpha`` is not truncated when the cached
        input has an integer dtype.
        """
        assert self.cur_input is not None, "Must call forward before backward"
        slope = np.where(self.cur_input > 0, 1.0, self.alpha)
        return gradient * slope
    
class DropoutLayer(NeuralNetLayer):
    """Inverted dropout: zeroes activations at random and rescales the rest.

    NOTE(review): ``MLP.forward`` in this notebook calls ``layer.forward(x)``
    without a ``training`` argument, so a dropout layer inside an ``MLP`` stays
    active during ``predict`` as well.
    """

    def __init__(self, dropout_rate):
        """
        Args:
            dropout_rate: probability of dropping a unit, in ``[0, 1)``.

        Raises:
            ValueError: if ``dropout_rate`` is outside ``[0, 1)``; a rate of 1
                would divide by zero when rescaling the kept units.
        """
        super().__init__()
        if not 0.0 <= dropout_rate < 1.0:
            raise ValueError(f"dropout_rate must be in [0, 1), got {dropout_rate}")
        self.dropout_rate = dropout_rate
        self.mask = None

    def forward(self, x, training=True):
        """Apply dropout when ``training`` is true; otherwise return ``x`` unchanged."""
        if training:
            keep = np.random.rand(*x.shape) > self.dropout_rate
            # Rescale kept units so the expected activation is unchanged.
            self.mask = keep / (1.0 - self.dropout_rate)
            return x * self.mask
        # Eval mode is the identity: drop any mask left from an earlier
        # training pass so backward does not reuse it.
        self.mask = None
        return x

    def backward(self, gradient):
        """Apply the mask from the last training-mode forward pass, if any."""
        if self.mask is None:
            return gradient
        return gradient * self.mask
    
class MLPWithActivation(MLP):
    """MLP built from a list of hidden sizes with a selectable activation."""

    def __init__(self, input_size, hidden_sizes, output_size, activation='relu', use_dropout=False):
        """
        Args:
            input_size: number of input features.
            hidden_sizes: width of each hidden layer, in order; may be empty,
                in which case the network is a single linear layer plus softmax.
            output_size: number of output classes.
            activation: ``'relu'``, ``'sigmoid'`` or ``'leakyrelu'``.
            use_dropout: add a ``DropoutLayer(0.5)`` after every activation.

        Raises:
            ValueError: if ``activation`` is not one of the supported names.
        """
        activation_classes = {
            'relu': ReLULayer,
            'sigmoid': SigmoidLayer,
            'leakyrelu': LeakyReLULayer,
        }
        # Validate before allocating any weights.
        if activation not in activation_classes:
            raise ValueError(f"Unsupported activation function: {activation}")

        layers = []
        prev_size = input_size
        for hidden_size in hidden_sizes:
            layers.append(LinearLayer(prev_size, hidden_size))
            layers.append(activation_classes[activation]())
            if use_dropout:
                layers.append(DropoutLayer(0.5))
            prev_size = hidden_size

        # prev_size is input_size when there are no hidden layers.
        layers.append(LinearLayer(prev_size, output_size))
        layers.append(SoftmaxOutputLayer())

        super().__init__(*layers)

# Create two models with different activation functions
# The hidden layers use Leaky ReLU and Sigmoid activation functions respectively
leaky_relu_model = MLPWithActivation(
    x_train_mlp.shape[-1],
    [256, 256],
    num_classes,
    activation='leakyrelu'
)

sigmoid_model = MLPWithActivation(
    x_train_mlp.shape[-1],
    [256, 256],
    num_classes,
    activation='sigmoid'
)

# Stored as a list of (model, name) pairs. A bare zip() is a one-shot iterator:
# it would be empty on a second pass, so a repeated comparison would silently
# train nothing.
models = list(zip(
    [leaky_relu_model, sigmoid_model],
    ["Leaky ReLU", "Sigmoid"]
))

def compare_activations(x_train, y_train, x_test, y_test, num_iterations, learning_rate, candidates=None):
    """Train each candidate model and print the one with the best test accuracy.

    Args:
        x_train, y_train: training inputs and targets.
        x_test, y_test: test inputs and targets.
        num_iterations: number of epochs per model.
        learning_rate: learning rate per model.
        candidates: iterable of ``(model, name)`` pairs. When omitted, the
            module-level ``models`` variable is used, as before.

    Raises:
        TypeError: if the candidates are not iterable. This happens when the
            global ``models`` has been rebound, e.g. by the later
            ``from tensorflow.keras import layers, models`` import.
    """
    if candidates is None:
        candidates = models
    try:
        # Materialise once so a one-shot iterator is consumed in a single place.
        candidates = list(candidates)
    except TypeError:
        raise TypeError(
            "compare_activations needs an iterable of (model, name) pairs; "
            f"got {type(candidates).__name__}"
        )

    best_acc = 0
    best_model = None

    for model, name in candidates:
        print(f"\nTraining {name}...")
        acc = train_and_plot(model, name, x_train, y_train, x_test, y_test, num_iterations=num_iterations, learning_rate=learning_rate, verbose=False)
        if acc > best_acc:
            best_acc = acc
            best_model = name

    print(f"\nBest model: {best_model}. Test Accuracy: {best_acc*100}%")
    
# Compare l2 values
# For every regularization strength a fresh two-hidden-layer ReLU network
# (256 units per hidden layer) is trained for 3 epochs with the L2 optimizer,
# and its test accuracy is recorded under that strength.
l2_values = [0.0001, 0.001, 0.01, 0.1, 0.5, 1]
results = {}
for l2 in l2_values:
    print(f"\nTraining models with L2 regularization value: {l2}")
    mlp_l2 = MLP(
        LinearLayer(x_train_mlp.shape[-1], 256),
        ReLULayer(),
        LinearLayer(256, 256),
        ReLULayer(),
        LinearLayer(256, num_classes),
        SoftmaxOutputLayer()
    )
    acc = train_and_plot(mlp_l2, f"L2 Regularization - {l2}", x_train_mlp, y_train_encoded, x_test_mlp, y_test_encoded, num_iterations=3, learning_rate=0.001, optimizer='l2_gradient_descent', l2_lambda=l2)
    results[l2] = acc
    
# Keys are converted to strings so the bars are placed as categories rather
# than at their numeric positions.
l2_keys = [str(l2) for l2 in results.keys()]
acc_values = list(results.values())

plt.figure(figsize=(10, 6))
bars = plt.bar(l2_keys, acc_values, color='skyblue', edgecolor='black')

# Write each bar's height (rounded to 3 decimals) just above the bar.
for bar in bars:
    yval = bar.get_height()
    plt.text(bar.get_x() + bar.get_width()/2.0, yval, round(yval, 3), va='bottom')

plt.xticks(l2_keys, rotation=45)
plt.xlabel('L2 Regularization Value', fontsize=14)
plt.ylabel('Test Accuracy', fontsize=14)
plt.title('Effect of L2 Regularization on Test Accuracy', fontsize=16)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
# Save before show().
plt.savefig("l2_regularization_values.png")
plt.show()
# The activation comparison (Leaky ReLU vs Sigmoid) runs after the L2 sweep and
# relies on the module-level `models` defined earlier in this cell.
compare_activations(x_train_mlp, y_train_encoded, x_test_mlp, y_test_encoded, num_iterations=3, learning_rate=0.001)

In [None]:
# Experiment 3        
# Plot the accuracy and loss for a model with 2 hidden ReLU layers and L2 regularization
# Two hidden ReLU layers whose widths come from hidden_layer1 / hidden_layer2.
mlp_two_hidden_layer_l2 = MLP(
    LinearLayer(x_train_mlp.shape[-1], hidden_layer1),
    ReLULayer(),
    LinearLayer(hidden_layer1, hidden_layer2),
    ReLULayer(),
    LinearLayer(hidden_layer2, num_classes),
    SoftmaxOutputLayer()
)

# No l2_lambda is passed, so train_and_plot's default of 0.001 applies; verbose
# is left at its default, so the figure is saved and shown.
train_and_plot(
    mlp_two_hidden_layer_l2, 
    "Two Hidden Layers with L2 Regularization", 
    x_train_mlp, y_train_encoded, 
    x_test_mlp, y_test_encoded, 
    num_iterations=3, 
    learning_rate=0.001, 
    optimizer="l2_gradient_descent")

In [None]:
# Experiment 4
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.datasets import mnist
import matplotlib.pyplot as plt

#(x_train, y_train), (x_test, y_test) = mnist.load_data()
#x_train = x_train.reshape((-1, 28, 28, 1)).astype('float32') / 255
#y_train = to_categorical(y_train)
#y_test = to_categorical(y_test)
# x_train / x_test here are the standardized arrays from the first cell (the
# commented-out MNIST loading above is not used). Each sample is reshaped to a
# 28x28 image with a single channel for the Conv2D layers.
x_train_cnn = x_train.reshape(-1, 28, 28, 1).astype('float32')
x_test_cnn = x_test.reshape(-1, 28, 28, 1).astype('float32')

def build_and_train_model(hidden_units):
    """Build, train and evaluate a three-block CNN with one dense hidden layer.

    Reads the module-level ``x_train_cnn``, ``y_train_encoded``, ``x_test_cnn``,
    ``y_test_encoded`` and ``num_classes``.

    Args:
        hidden_units: width of the dense layer placed before the softmax output.

    Returns:
        ``(history, test_acc)``: the object returned by ``model.fit`` and the
        accuracy reported by ``model.evaluate`` on the test set.
    """
    network_layers = [
        layers.Conv2D(128, (3, 3), activation='relu', input_shape=(28, 28, 1)),
        layers.MaxPooling2D((3, 3)),
        layers.Conv2D(64, (5, 5), activation='relu', padding='same'),
        layers.MaxPooling2D((2, 2)),
        layers.Conv2D(32, (3, 3), activation='relu', padding='same'),
        layers.MaxPooling2D((2, 2)),
        layers.Flatten(),
        layers.Dense(hidden_units, activation='relu'),
        layers.Dense(num_classes, activation='softmax'),
    ]
    model = models.Sequential(network_layers)

    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    # 20% of the training data is held out by Keras for validation.
    history = model.fit(x_train_cnn, y_train_encoded, epochs=5, batch_size=32, validation_split=0.2)
    evaluation = model.evaluate(x_test_cnn, y_test_encoded)
    test_loss, test_acc = evaluation
    return history, test_acc


# Train one CNN per dense-layer width, keeping the test accuracy (keyed by
# width) and the training history (in the same order as hidden_units_options).
hidden_units_options = [32, 64, 128, 256]
performances = {}
histories = []

for units in hidden_units_options:
    print(f"Training model with {units} hidden units")
    history, test_acc = build_and_train_model(units)
    performances[units] = test_acc
    histories.append(history)
    print(f"Accuracy with {units} hidden units: {test_acc}")

# Figure 1: test accuracy as a function of the dense-layer width.
accuracies = [performances[units] for units in hidden_units_options]
plt.figure(figsize=(10, 6))
plt.plot(hidden_units_options, accuracies, marker='o', linestyle='-', color='b')
plt.title('Model Performance by Number of Hidden Units')
plt.xlabel('Number of Hidden Units')
plt.ylabel('Accuracy')
plt.xticks(hidden_units_options)
plt.grid(True)
plt.show()

# Figure 2: one panel per width with training and validation loss per epoch;
# the panels share a y-axis so the curves can be compared directly.
fig, axs = plt.subplots(1, len(hidden_units_options), figsize=(20, 5), sharey=True)

for i, history in enumerate(histories):
    axs[i].plot(history.history['loss'], label='Train Loss')
    axs[i].plot(history.history['val_loss'], label='Validation Loss')
    axs[i].set_title(f'{hidden_units_options[i]} Units')
    axs[i].set_xlabel('Epoch')
    axs[i].set_ylabel('Loss')
    axs[i].legend()

plt.tight_layout()
plt.savefig('Hidden_units_comp_cnn.png')
plt.show()

# The "best" width is the one with the highest test accuracy.
best_units = max(performances, key=performances.get)
print(f"Best performing model used {best_units} hidden units with an accuracy of {performances[best_units]}")


In [None]:
# Experiment 7

import tensorflow as tf
from tensorflow.keras import layers, models

def build_and_train_model_with_filter_sizes(filter_sizes, x_train_cnn, y_train_encoded, x_test_cnn, y_test_encoded, num_classes):
    """Build, train and evaluate a three-block CNN with configurable kernel sizes.

    Args:
        filter_sizes: three kernel sizes, one per convolutional layer, in order.
        x_train_cnn, y_train_encoded: training images and one-hot labels.
        x_test_cnn, y_test_encoded: test images and one-hot labels.
        num_classes: number of units in the softmax output layer.

    Returns:
        ``(history, test_acc)``: the object returned by ``model.fit`` and the
        accuracy reported by ``model.evaluate`` on the test set.
    """
    first_kernel, second_kernel, third_kernel = filter_sizes[0], filter_sizes[1], filter_sizes[2]
    network_layers = [
        layers.Conv2D(32, first_kernel, activation='relu', input_shape=(28, 28, 1)),
        layers.MaxPooling2D((2, 2)),
        layers.Conv2D(64, second_kernel, activation='relu', padding='same'),
        layers.MaxPooling2D((2, 2)),
        layers.Conv2D(128, third_kernel, activation='relu', padding='same'),
        layers.MaxPooling2D((2, 2)),
        layers.Flatten(),
        # The dense width is fixed so only the kernel sizes vary between runs.
        layers.Dense(128, activation='relu'),
        layers.Dense(num_classes, activation='softmax'),
    ]
    model = models.Sequential(network_layers)

    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    history = model.fit(x_train_cnn, y_train_encoded, epochs=5, batch_size=64, validation_split=0.2)
    evaluation = model.evaluate(x_test_cnn, y_test_encoded)
    test_loss, test_acc = evaluation
    return history, test_acc

# Prepare to test different filter configurations: one kernel size per
# convolutional layer, in layer order.
filter_configurations = [((3, 3), (3, 3), (3, 3)),
                         ((5, 5), (5, 5), (5, 5)),
                         ((3, 3), (5, 5), (3, 3)),
                         ((7, 7), (5, 5), (3, 3))]

# Test accuracy keyed by a readable label such as "3x3, 5x5, 3x3".
performances = {}
for config in filter_configurations:
    config_str = ', '.join([f"{fs[0]}x{fs[1]}" for fs in config])  # String representation for plot labels
    print(f"\nTraining model with filter sizes: {config_str}")
    history, test_acc = build_and_train_model_with_filter_sizes(config, x_train_cnn, y_train_encoded, x_test_cnn, y_test_encoded, num_classes)
    performances[config_str] = test_acc
    print(f"Accuracy with filter sizes {config_str}: {test_acc:.4f}")

# Plotting the performance of different filter sizes
config_labels = list(performances.keys())
config_accuracies = list(performances.values())

plt.figure(figsize=(12, 8))
bars = plt.bar(config_labels, config_accuracies, color='lightblue', edgecolor='black')
plt.xlabel('Filter Sizes', fontsize=14)
plt.ylabel('Test Accuracy', fontsize=14)
plt.title('Impact of Filter Sizes on Model Accuracy for Sign Language MNIST', fontsize=16)
plt.xticks(rotation=45)
# Zoom the y-axis around the observed range so small differences stay visible.
plt.ylim(min(config_accuracies) - 0.05, max(config_accuracies) + 0.05)
plt.grid(axis='y', linestyle='--')

# Add accuracy values on top of bars
for rect in bars:
    height = rect.get_height()
    plt.text(rect.get_x() + rect.get_width() / 2.0, height, f'{height:.4f}', ha='center', va='bottom')

plt.tight_layout()
# savefig requires a target file name; calling it without one raises TypeError.
plt.savefig('Filter_sizes_comp_cnn.png')
plt.show()


In [None]:
# Experiment 5
# Doing our best to mimic the performance of the CNN model with a fully connected MLP


# Architecture: five hidden layers of 128 units each on the flattened inputs.
input_size = x_train_mlp.shape[-1]
num_hidden_layers = 5
units_per_hidden_layer = 128
output_size = 26  # same value as num_classes defined in the first cell

optimal_mlp = MLPWithActivation(
    input_size, 
    [units_per_hidden_layer] * num_hidden_layers, 
    output_size, 
    activation='relu'
)

# Trained with the L2 optimizer at l2_lambda=0.5 and a decaying learning rate;
# verbose=True prints per-epoch metrics and saves and shows the training curves.
train_and_plot(
    optimal_mlp, 
    "MLP with 5 Hidden Layers (CNN Approximation)",  
    x_train_mlp, y_train_encoded, 
    x_test_mlp, y_test_encoded, 
    num_iterations=3, 
    learning_rate=0.001, 
    optimizer="l2_gradient_descent",
    l2_lambda=0.5,
    decaying_lr=True,
    verbose=True
)

