# Implementation of the Backpropagation Algorithm

## Import Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE

import random
import copy
from datetime import datetime

import warnings
warnings.filterwarnings('ignore')

  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (


## Functions for the Multilayer Perceptron Neural Network

In [2]:
def xavier_init(fan_in, fan_out):
    limit = np.sqrt(6 / (fan_in + fan_out))
    return np.random.uniform(-limit, limit, size=(fan_in, fan_out))

In [3]:
def get_activation_potential(input_neurons, weight):
    return input_neurons @ weight.T

def augment_matrix(matrix):
    return np.hstack((np.ones((matrix.shape[0], 1)), matrix))

def compute_delta(input_, gradient, learning_rate):
    batch_size = input_.shape[0]
    i = 0
    overall_delta = []
    while i < batch_size:
        batch_delta = np.array([])
        for neuron in gradient[i]:
            batch_delta = np.concatenate((batch_delta, input_[i] * neuron * learning_rate))
        i += 1
        overall_delta.append(batch_delta)
    return np.mean(overall_delta, axis=0)

def choose_function(function_name, logistic, tanh, relu):
    if function_name == 'logistic':
        return logistic
    elif function_name == 'tanh':
        return tanh
    elif function_name == 'relu':
        return relu
    else: return 'Invalid Function Name'

def get_error(desired_output, output_neurons):
    return (1/2)*((desired_output - output_neurons)**2).sum()

In [4]:
def logistic_output(activation,logistic_slope_param):
    return 1/(1+np.exp(-1*logistic_slope_param*activation))

# Local gradient output of the output layer should have the same dimension as the output matrix of the output layer
def logistic_gradient_output(desired_output, output_neurons, logistic_slope_param):
    return logistic_slope_param * np.multiply(np.multiply((desired_output - output_neurons),
                                            output_neurons
                                            ),(1-output_neurons))

def logistic_gradient_hidden(hidden_neurons, output_gradient,
                                   output_weight, logistic_slope_param):

    summation = output_gradient @ output_weight
    summation = summation[:, 1:]

    return logistic_slope_param * np.multiply(np.multiply(hidden_neurons,
                                            (1-hidden_neurons)),
                                            summation)

In [5]:
def tanh_output(activation, a, b):
    return a * np.tanh(b * activation)

def tanh_gradient_output(desired_output, output_neurons, a, b):
    return np.multiply(
        (b / a) * (desired_output - output_neurons),
        np.multiply(a - output_neurons, a + output_neurons))

def tanh_gradient_hidden(hidden_neurons, output_gradient_tanh, output_weight, a, b):
    return (b / a) * (output_gradient_tanh @ output_weight)[:, 1:] * (a - hidden_neurons) * (a + hidden_neurons)

In [6]:
def softmax_output(x):
    exp_x = np.exp(x - np.max(x, axis=1, keepdims=True))  # For numerical stability
    return exp_x / np.sum(exp_x, axis=1, keepdims=True)


In [7]:
def relu_output(activation, leaky_param):
    return np.where(activation > 0, activation, activation * leaky_param)

def relu_gradient_output(output_neurons, leaky_param):
    return np.where(output_neurons >= 0, output_neurons, leaky_param)

def relu_gradient_hidden(hidden_neurons, output_gradient, output_weight, leaky_param):
    multiplier = np.where(hidden_neurons > 0, 1, leaky_param)
    return multiplier * ((output_gradient @ output_weight)[:, 1:])

## Functions for the plot and metrics

In [10]:
def plot_curves(training_results, validation_results, filename, display_to_screen=False, save_to_file=True):
    df_train = pd.DataFrame(training_results)
    df_val = pd.DataFrame(validation_results)

    fig, ax1 = plt.subplots(figsize=(10, 5))

    # Plot Training SSE on primary y-axis
    x_train = df_train['Epoch'] + (df_train['Iteration'] - 1) * 5
    ax1.plot(x_train, df_train['SSE'], label='Training SSE', color='tab:blue')
    ax1.set_xlabel('Epoch')
    ax1.set_ylabel('Training SSE', color='tab:blue')
    ax1.tick_params(axis='y', labelcolor='tab:blue')

    # Create secondary y-axis for Validation SSE
    ax2 = ax1.twinx()
    x_val = df_val['Iteration'] * 5
    ax2.plot(x_val, df_val['SSE'], label='Validation SSE', color='tab:orange', linestyle='--')
    ax2.set_ylabel('Validation SSE', color='tab:orange')
    ax2.tick_params(axis='y', labelcolor='tab:orange')

    # Title and grid
    plt.title('Training vs Validation SSE')
    ax1.grid(True)

    # Create a combined legend
    lines_1, labels_1 = ax1.get_legend_handles_labels()
    lines_2, labels_2 = ax2.get_legend_handles_labels()
    ax1.legend(lines_1 + lines_2, labels_1 + labels_2, loc='upper right')
    
    plt.tight_layout()
    if save_to_file:
        plt.savefig(filename)
    if display_to_screen:
        plt.show()
    plt.close()

def plot_curves_misclassifications(training_results, validation_results, filename, display_to_screen=False, save_to_file=True):
    df_train = pd.DataFrame(training_results)
    df_val = pd.DataFrame(validation_results)

    fig, ax1 = plt.subplots(figsize=(10, 5))

    # Plot Training SSE on primary y-axis
    x_train = df_train['Epoch'] + (df_train['Iteration'] - 1) * 5
    ax1.plot(x_train, df_train['Misclassifications'], label='Training Misclassifications', color='tab:blue')
    ax1.set_xlabel('Epoch')
    ax1.set_ylabel('Training Misclassifications', color='tab:blue')
    ax1.tick_params(axis='y', labelcolor='tab:blue')

    # Create secondary y-axis for Validation SSE
    ax2 = ax1.twinx()
    x_val = df_val['Iteration'] * 5
    ax2.plot(x_val, df_val['Misclassifications'], label='Validation Misclassifications', color='tab:orange', linestyle='--')
    ax2.set_ylabel('Validation Misclassifications', color='tab:orange')
    ax2.tick_params(axis='y', labelcolor='tab:orange')

    # Title and grid
    plt.title('Training vs Validation Misclassifications')
    ax1.grid(True)

    # Create a combined legend
    lines_1, labels_1 = ax1.get_legend_handles_labels()
    lines_2, labels_2 = ax2.get_legend_handles_labels()
    ax1.legend(lines_1 + lines_2, labels_1 + labels_2, loc='upper right')

    plt.tight_layout()
    if save_to_file:
        plt.savefig(filename)
    if display_to_screen:
        plt.show()
    plt.close()

def compute_confusion_matrix(predictions, labels, num_classes=8):
    matrix = np.zeros((num_classes, num_classes), dtype=int)

    for true, pred in zip(labels, predictions):
        matrix[true-1][pred-1] += 1  # row = true, column = predicted

    return matrix

def plot_confusion_matrix_pure_matplotlib(conf_matrix, filename, class_names=None, normalize=False,  display_to_screen=False, save_to_file=True):
    if normalize:
        conf_matrix = conf_matrix.astype('float') / conf_matrix.sum(axis=1, keepdims=True)
        conf_matrix = np.round(conf_matrix, 2)

    fig, ax = plt.subplots(figsize=(8, 6))
    cax = ax.imshow(conf_matrix, interpolation='nearest', cmap=plt.cm.Blues)
    plt.title("Confusion Matrix" + (" (Normalized)" if normalize else ""))
    fig.colorbar(cax)

    # Tick labels
    if class_names is None:
        class_names = [str(i) for i in range(conf_matrix.shape[0])]

    ax.set_xticks(np.arange(len(class_names)))
    ax.set_yticks(np.arange(len(class_names)))
    ax.set_xticklabels(class_names)
    ax.set_yticklabels(class_names)

    plt.xlabel("Predicted Label")
    plt.ylabel("True Label")

    # Rotate x-axis labels
    plt.xticks(rotation=45)

    # Display numbers inside the heatmap
    for i in range(conf_matrix.shape[0]):
        for j in range(conf_matrix.shape[1]):
            text = f"{conf_matrix[i, j]:.2f}" if normalize else int(conf_matrix[i, j])
            ax.text(j, i, text,
                    ha="center", va="center",
                    color="white" if conf_matrix[i, j] > conf_matrix.max() / 2 else "black")

    plt.tight_layout()
    if save_to_file:
        plt.savefig(filename)
    if display_to_screen:
        plt.show()
    plt.close()

def accuracy(conf_matrix):
    correct = np.trace(conf_matrix)
    total = np.sum(conf_matrix)
    return correct / total

def precision_recall_f1(conf_matrix):
    num_classes = conf_matrix.shape[0]
    precision = np.zeros(num_classes)
    recall = np.zeros(num_classes)
    f1 = np.zeros(num_classes)

    for i in range(num_classes):
        TP = conf_matrix[i, i]
        FP = conf_matrix[:, i].sum() - TP
        FN = conf_matrix[i, :].sum() - TP

        precision[i] = TP / (TP + FP) if (TP + FP) > 0 else 0.0
        recall[i] = TP / (TP + FN) if (TP + FN) > 0 else 0.0
        if (precision[i] + recall[i]) > 0:
            f1[i] = 2 * precision[i] * recall[i] / (precision[i] + recall[i])
        else:
            f1[i] = 0.0

    return precision, recall, f1

def matthews_corrcoef(conf_matrix):
    num_classes = conf_matrix.shape[0]
    mcc = np.zeros(num_classes)

    for i in range(num_classes):
        TP = conf_matrix[i, i]
        FP = conf_matrix[:, i].sum() - TP
        FN = conf_matrix[i, :].sum() - TP
        TN = conf_matrix.sum() - (TP + FP + FN)

        numerator = (TP * TN) - (FP * FN)
        denominator = np.sqrt((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN))

        mcc[i] = numerator / denominator if denominator > 0 else 0.0

    return mcc

def save_metrics_to_file(conf_matrix, filepath):
    acc = accuracy(conf_matrix)
    prec, rec, f1 = precision_recall_f1(conf_matrix)
    mcc = matthews_corrcoef(conf_matrix)

    with open(filepath, 'w') as f:
        f.write("=== Evaluation Metrics ===\n\n")
        f.write(f"Accuracy: {acc:.4f}\n\n")

        f.write(f"{'Class':<10}{'Precision':<12}{'Recall':<12}{'F1-Score':<12}{'MCC':<12}\n")
        f.write("-" * 58 + "\n")
        for i in range(len(prec)):
            f.write(f"{i:<10}{prec[i]:<12.4f}{rec[i]:<12.4f}{f1[i]:<12.4f}{mcc[i]:<12.4f}\n")

## Read the dataset

In [11]:
df = pd.read_csv('data.csv', header=None)
df_labels = pd.read_csv('data_labels.csv', header=None)
df_test = pd.read_csv('test_set.csv', header=None)

## Apply SMOTE to the imbalanced dataset, then split the dataset into training set and validation set.

In [12]:
epoch = 5

# Apply SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(df, df_labels)

print("After SMOTE - X:", X_resampled.shape)
print("After SMOTE - y:", y_resampled.shape)

# Train-test split
validation_size = 800
validation_indices = random.sample(range(len(X_resampled)), 800)

validation_set = X_resampled.loc[validation_indices]
training_set = X_resampled.drop(validation_indices)

validation_labels = y_resampled.loc[validation_indices]
training_labels = y_resampled.drop(validation_indices)

print("Train X:", training_set.shape, "Train y:", training_labels.shape)
print("Test X:", validation_set.shape, "Test y:", validation_labels.shape)

After SMOTE - X: (13000, 354)
After SMOTE - y: (13000, 1)
Train X: (12200, 354) Train y: (12200, 1)
Test X: (800, 354) Test y: (800, 1)


In [13]:
training_set.to_csv('training_set.csv')
validation_set.to_csv('validation_set.csv')
training_labels.to_csv('training_labels.csv')
validation_labels.to_csv('validation_labels.csv')

In [14]:
try:
    print(training_labels['0'].value_counts())
except:
    try:
        print(training_labels[0].value_counts())
    except:
        pass

0
5    1548
3    1532
4    1530
7    1526
8    1524
1    1519
6    1514
2    1507
Name: count, dtype: int64


In [15]:
try:
    print(validation_labels['0'].value_counts())
except:
    try:
        print(validation_labels[0].value_counts())
    except:
        pass

0
2    118
6    111
1    106
8    101
7     99
4     95
3     93
5     77
Name: count, dtype: int64


## Fixed Hyperparameters

In [16]:
batch_size = 8
momentum_term = 0.9
logistic_slope_param = 2

epoch = 5
patience = 10
min_delta = 0.001

input_layer_count = 354
output_layer_count = 8

### Network A Configuration

In [17]:
Network = 'A'

# tanh, logistic, relu
function_hidden1 = 'tanh'
function_hidden2 = 'tanh'
function_output = 'logistic'

tanh_param_a = 1.716
tanh_param_b = 0.6667

### Network B Configuration

In [18]:
# tanh, logistic, relu
function_hidden1 = 'relu'
function_hidden2 = 'relu'
function_output = 'logistic'

leaky_param = 0.01

In [19]:
# hyperparameter options
nodes_options = [[256,256],[256,128],[256,64],[256,32],[256,16],[128,64],[128,16],[32,16],[16,16]]
learning_rate_options = [0.001,0.005,0.01]

# Create the list of hyperparameter combinations
hyperparameter_configs = []

for option in nodes_options:
    for lr in learning_rate_options:
        config = {
            "hidden1": option[0],
            "hidden2": option[1],
            "learning_rate": lr
        }
        hyperparameter_configs.append(config)

print(len(hyperparameter_configs))

27


# Training Phase 1

In [20]:
def forward_pass_v2(function_hidden1, function_hidden2, function_output, Xb, W1, W2, W3, b1, b2, b3, Yb, tanh_param_a, tanh_param_b, logistic_slope_param, leaky_param):
    # — forward —
    z1 = Xb.dot(W1) + b1                                        # (mb×H1)
    if function_hidden1 == 'relu':
        a1 = relu_output(z1, leaky_param)                       # (mb×H1)
    elif function_hidden1 == 'logistic':
        a1 = logistic_output(z1, logistic_slope_param)         # (mb×H1)
    elif function_hidden1 == 'tanh':
        a1 = tanh_output(z1, tanh_param_a, tanh_param_b)
    
    z2 = a1.dot(W2) + b2                                        # (mb×H2)
    if function_hidden2 == 'relu':
        a2 = relu_output(z2, leaky_param)                       # (mb×H2)
    elif function_hidden2 == 'logistic':
        a2 = logistic_output(z2, logistic_slope_param)         # (mb×H2)
    elif function_hidden2 == 'tanh':
        a2 = tanh_output(z2, tanh_param_a, tanh_param_b)

    z3 = a2.dot(W3) + b3                                      # (mb×C)
    if function_output == 'relu':
        ŷ = relu_output(z3, leaky_param)                    # (mb×C)
    elif function_output == 'logistic':
        ŷ = logistic_output(z3, logistic_slope_param)            # (mb×C)
    elif function_output == 'tanh':
        ŷ = tanh_output(z3, tanh_param_a, tanh_param_b)            # (mb×C)
    if not Yb:
        return ŷ
    sse = get_error(Yb, ŷ)
    total_sse += sse

    classification_computed = np.argmax(ŷ, axis=1)
    classification_actual = np.argmax(Yb, axis=1)
    misclassification_count = np.sum(classification_computed != classification_actual)
    total_misclassifications += misclassification_count
    
    return sse, misclassification_count, classification_actual, classification_computed

def tanh_gradient_hidden_v2(hidden_neurons, output_gradient_tanh, output_weight, a, b):
    return (b / a) * (output_gradient_tanh @ output_weight.T) * (a - hidden_neurons) * (a + hidden_neurons)

def relu_gradient_hidden_v2(hidden_neurons, output_gradient, output_weight, leaky_param):
    multiplier = np.where(hidden_neurons > 0, 1, leaky_param)
    return multiplier * ((output_gradient @ output_weight.T))

def logistic_gradient_hidden_v2(hidden_neurons, output_gradient,
                                   output_weight, logistic_slope_param):

    summation = output_gradient @ output_weight.T
    summation = summation

    return logistic_slope_param * np.multiply(np.multiply(hidden_neurons,
                                            (1-hidden_neurons)),
                                            summation)

In [21]:
import numpy as np


def backprop_training_v2(df, df_labels, patience, hidden_layer_count1, hidden_layer_count2,
                         learning_rate, tanh_param_a, tanh_param_b, leaky_param, logistic_slope_param,
                         activation_function1="tanh", activation_function2="tanh"):
    now = datetime.now()
    start_time = now.strftime("%H:%M:%S")

    # 1) SMOTE
    smote = SMOTE(random_state=42)
    X_resampled, y_resampled = smote.fit_resample(df, df_labels)

    # 2) After SMOTE, convert to numpy arrays once:
    X = X_resampled            # shape (N, D)
    y = pd.get_dummies(y_resampled[0],
                    columns=list(range(1,9))).sort_index(axis=1)
                                        # shape (N, C), one-hot

    # 3) Single train/validation split:
    # Train-test split
    validation_size = 800
    validation_indices = random.sample(range(len(X_resampled)), validation_size)

    X_val = X.loc[validation_indices]
    X_train = X.drop(validation_indices)

    y_val = y.loc[validation_indices]
    y_train = y.drop(validation_indices)

    X_val = X_val.to_numpy()
    X_train = X_train.to_numpy()
    y_val = y_val.to_numpy()
    y_train = y_train.to_numpy()

    patience     = 10       # how many epochs to wait without improvement
    max_epochs   = 50        # a hard cap, in case validation never improves

    # 4) In your training loop, shuffle each epoch and batch by slicing:
    m, D = X_train.shape
    _, C = y_train.shape
    H1 = hidden_layer_count1
    H2 = hidden_layer_count2

    # 5) Initialize weights & biases once
    W1 = xavier_init(D, H1);  b1 = np.zeros((1, H1))
    W2 = xavier_init(H1, H2); b2 = np.zeros((1, H2))
    W3 = xavier_init(H2, C);  b3 = np.zeros((1, C))

    # 6) Initialize velocities for momentum
    vW1 = np.zeros_like(W1);  vb1 = np.zeros_like(b1)
    vW2 = np.zeros_like(W2);  vb2 = np.zeros_like(b2)
    vW3 = np.zeros_like(W3);  vb3 = np.zeros_like(b3)

    m = X_train.shape[0]

    patience_counter  = 0

    training_results = []
    validation_results = []
    iteration = 0

    min_sse = float('inf')
    min_misclassifications = float('inf')

    for epoch in range(1, max_epochs+1):
        # Collect all 
        training_all_actual = []
        training_all_computed = []
        validation_all_actual = []
        validation_all_computed = []

        # ———  Shuffle training set  —————————————
        perm    = np.random.permutation(m)
        X_shuff = X_train[perm]
        Y_shuff = y_train[perm]

        epoch_loss = 0
        epoch_acc  = 0
        total_sse = 0
        total_misclassifications = 0

        # ———  Mini-batch training  ——————————————
        for start in range(0, m, batch_size):
            end      = start + batch_size
            Xb, Yb   = X_shuff[start:end], Y_shuff[start:end]
            mb       = Xb.shape[0]              # last batch might be smaller

            z1 = Xb.dot(W1) + b1                                        # (mb×H1)
            if activation_function1 == "tanh":
                a1 = tanh_output(z1, tanh_param_a, tanh_param_b)
            elif activation_function1 == "relu":
                a1 = relu_output(z1, leaky_param)
            elif activation_function1 == "logistic":
                a1 = logistic_output(z1, logistic_slope_param)
            
            z2 = a1.dot(W2) + b2                                        # (mb×H2)
            if activation_function2 == "tanh":
                a2 = tanh_output(z2, tanh_param_a, tanh_param_b)
            elif activation_function2 == "relu":
                a2 = relu_output(z2, leaky_param)
            elif activation_function2 == "logistic":
                a2 = logistic_output(z2, logistic_slope_param)

            z3   = a2.dot(W3) + b3                                      # (mb×C)
            ŷ    = logistic_output(z3, logistic_slope_param)            # (mb×C)

            sse = get_error(Yb, ŷ)
            total_sse += sse

            classification_computed = np.argmax(ŷ, axis=1)
            classification_actual = np.argmax(Yb, axis=1)
            misclassification_count = np.sum(classification_computed != classification_actual)
            total_misclassifications += misclassification_count

            # inside your batch loop, replace the SSE/loss & E computation with:
            eps = 1e-8  # to avoid log(0)
            # — BINARY CROSS–ENTROPY LOSS —
            # Yb: (mb×C) one-hot / binary labels
            # y_hat: (mb×C) sigmoid outputs in (0,1)
            bce_loss = -np.sum(
                Yb * np.log(ŷ + eps) +
                (1 - Yb) * np.log(1 - ŷ + eps)
            ) / mb

            # track total loss/accuracy as before
            epoch_loss += bce_loss * mb
            preds = (ŷ >= 0.5).astype(int)           # threshold at 0.5
            labels = Yb
            acc   = np.mean(preds == labels)
            epoch_acc += acc * mb

            # — GRADIENT AT OUTPUT LAYER —  
            # For sigmoid + BCE, the upstream delta simplifies to:
            E = (ŷ - Yb)                        # shape (mb×C)

            # now proceed exactly as you already do:
            dW3 = (a2.T @ E) / mb                   # (H2×C)
            db3 =  E.sum(axis=0, keepdims=True) / mb

            # and then your tanh-backprop for the hidden layers:
            if activation_function2 == "tanh":
                d2 = tanh_gradient_hidden_v2(a2, E, W3, tanh_param_a, tanh_param_b)
            elif activation_function2 == "relu":
                d2 = relu_gradient_hidden_v2(a2, E, W3, leaky_param)
            elif activation_function2 == "logistic":
                d2 = logistic_gradient_hidden_v2(a2, E, W3, logistic_slope_param)
            dW2 = (a1.T @ d2) / mb
            db2 =  d2.sum(axis=0, keepdims=True) / mb

            if activation_function1 == "tanh":
                d1 = tanh_gradient_hidden_v2(a1, d2, W2, tanh_param_a, tanh_param_b)
            elif activation_function1 == "relu":
                d1 = relu_gradient_hidden_v2(a1, d2, W2, leaky_param)
            elif activation_function1 == "logistic":
                d1 = logistic_gradient_hidden_v2(a1, d2, W2, logistic_slope_param)
            dW1 = (Xb.T @ d1) / mb
            db1 =  d1.sum(axis=0, keepdims=True) / mb

            # — momentum updates —
            vW3 = momentum_term*vW3 + (1-momentum_term)*dW3
            vb3 = momentum_term*vb3 + (1-momentum_term)*db3
            W3 -= learning_rate * vW3
            b3 -= learning_rate * vb3

            vW2 = momentum_term*vW2 + (1-momentum_term)*dW2
            vb2 = momentum_term*vb2 + (1-momentum_term)*db2
            W2 -= learning_rate * vW2
            b2 -= learning_rate * vb2

            vW1 = momentum_term*vW1 + (1-momentum_term)*dW1
            vb1 = momentum_term*vb1 + (1-momentum_term)*db1
            W1 -= learning_rate * vW1
            b1 -= learning_rate * vb1

            training_all_actual.extend(np.argmax(Yb, axis=1))
            training_all_computed.extend(classification_computed)

        temp_dict = {}
        temp_dict['Iteration'] = iteration
        temp_dict['Epoch'] = epoch
        temp_dict['SSE'] = total_sse
        temp_dict['Misclassifications'] = total_misclassifications
        temp_dict['Training Loss'] = epoch_loss
        temp_dict['Training Accuracy'] = epoch_acc
        temp_dict['Training Misclassifications'] = total_misclassifications
        training_results.append(temp_dict)

        # ———  End of epoch metrics  ——————————————
        epoch_loss /= m
        epoch_acc  /= m

        # ———  Validation pass  ————————————————
        # (vectorized, no weight updates)
        # compute loss, acc, misclassifications
        total_sse = 0
        total_misclassifications = 0
        if epoch % 5 == 0:
            z1_val = X_val.dot(W1) + b1
            a1_val = tanh_output(z1_val, tanh_param_a, tanh_param_b)

            z2_val = a1_val.dot(W2) + b2
            a2_val = tanh_output(z2_val, tanh_param_a, tanh_param_b)

            z3_val = a2_val.dot(W3) + b3
            y_hat_val = logistic_output(z3_val, logistic_slope_param)

            val_loss = -np.sum(y_val * np.log(y_hat_val + 1e-8)) / X_val.shape[0]
            val_preds = np.argmax(y_hat_val, axis=1)
            val_labels = np.argmax(y_val, axis=1)
            val_acc = np.mean(val_preds == val_labels)
            val_mis = np.sum(val_preds != val_labels)
            sse = get_error(y_val, y_hat_val)
            min_sse = min(sse, min_sse)

            total_sse += sse
            total_misclassifications += val_mis

            # Edit here
            validation_all_actual.extend(val_labels)
            validation_all_computed.extend(val_preds)

            temp_dict = {}
            temp_dict['Iteration'] = iteration
            temp_dict['SSE'] = total_sse
            temp_dict['Misclassifications'] = total_misclassifications
            temp_dict['Validation Loss'] = val_loss
            temp_dict['Validation Accuracy'] = val_acc
            temp_dict['Validation Misclassifications'] = val_mis
            validation_results.append(temp_dict)
            iteration += 1

            print(f"Epoch {epoch:>2}: "
                f"train_loss={epoch_loss:.4f}, train_acc={epoch_acc:.4f} | "
                f"val_loss={val_loss:.4f}, val_acc={val_acc:.4f}, "
                f"val_mis={val_mis}")

            # — early stopping check —
            if total_misclassifications < min_misclassifications:
                patience_counter = 0
                validation_all_actual_curr = copy.deepcopy(validation_all_actual)
                validation_all_computed_curr = copy.deepcopy(validation_all_computed)
                training_all_actual_curr = copy.deepcopy(training_all_actual)
                training_all_computed_curr = copy.deepcopy(training_all_computed)
                min_misclassifications = total_misclassifications*1
                min_train_acc = epoch_acc*1
                min_train_loss = epoch_loss*1
                min_val_acc = val_acc*1
                min_val_loss = val_loss*1
                print('Min SSE:', min_sse)
                print('Min Misclassifications:', min_misclassifications)
                best_W1, best_b1 = W1.copy(), b1.copy()
                best_W2, best_b2 = W2.copy(), b2.copy()
                best_W3, best_b3 = W3.copy(), b3.copy()
            else:
                patience_counter += 1
                print(f"  → no improvement for {patience_counter}/{patience} epochs")

            # stop if we've gone too long without improving
            if patience_counter >= patience:
                print(f"Stopping early on epoch {epoch} (no val_loss improvement in last {patience} epochs).")
                break

    print('learning rate:', learning_rate)
    print('logistic slope:', logistic_slope_param)
    print('Final Min SSE:', min_sse)
    print('Final Min Misclassifications', min_misclassifications)

    now = datetime.now()
    end_time = now.strftime("%H:%M:%S")

    t1 = datetime.strptime(start_time, '%H:%M:%S')
    t2 = datetime.strptime(end_time, '%H:%M:%S')
    training_duration = t2-t1

    print("lengths")
    print(len(training_all_actual_curr))
    print(len(training_all_computed_curr))
    print(len(validation_all_actual_curr))
    print(len(validation_all_computed_curr))

    return (
        training_results,
        validation_results,
        training_all_actual_curr,
        training_all_computed_curr,
        validation_all_actual_curr,
        validation_all_computed_curr,
        best_W1,
        best_W2,
        best_W3,
        best_b1,
        best_b2,
        best_b3,
        start_time,
        end_time,
        training_duration,
        min_train_loss,
        min_train_acc,
        min_val_loss,
        min_val_acc
    )

In [22]:
import os

def calculate_and_save_metrics(training_results, validation_results, training_all_actual_curr,
                               training_all_computed_curr, validation_all_actual_curr,
                               validation_all_computed_curr, H1, H2, best_W1, best_W2, best_W3,
                               best_b1, best_b2, best_b3, learning_rate, activation1, activation2,
                               start_time, end_time, duration):
    
    # Create a folder for the current hyperparameter configuration
    # Use the hyperparameters as the folder name
    os.makedirs(os.path.join("artifacts", f"{H1}_{H2}_{activation1}_{activation2}_{learning_rate}"), exist_ok=True)
    folder_name = os.path.join("artifacts", f"{H1}_{H2}_{activation1}_{activation2}_{learning_rate}")
    filepath="hyperparameters.txt"

    with open(os.path.join(folder_name, filepath), 'w') as f:
        f.write(f"hidden_layer_count1: {H1}\n")
        f.write(f"hidden_layer_count2: {H2}\n")
        f.write(f"Network: {Network}\n")
        f.write(f"batch_size: {batch_size}\n")
        f.write(f"function_hidden1: {function_hidden1}\n")
        f.write(f"function_hidden2: {function_hidden2}\n")
        f.write(f"function_output: {function_output}\n")
        f.write(f"momentum_term: {momentum_term}\n")
        f.write(f"learning_rate: {learning_rate}\n")
        f.write(f"logistic_slope_param: {logistic_slope_param}\n")
        f.write(f"tanh_param_a: {tanh_param_a}\n")
        f.write(f"tanh_param_b: {tanh_param_b}\n")
        f.write(f"leaky_param: {leaky_param}\n")
        f.write(f"start:{start_time}\n")
        f.write(f"end:{end_time}\n")
        f.write(f"duration:{duration}\n")

    # Export the files needed
    training_actual_df = pd.DataFrame(training_all_actual_curr)
    training_computed_df = pd.DataFrame(training_all_computed_curr)
    validation_actual_df = pd.DataFrame(validation_all_actual_curr)
    validation_computed_df = pd.DataFrame(validation_all_computed_curr)

    best_W1_df = pd.DataFrame(best_W1)
    best_b1_df = pd.DataFrame(best_b1)
    best_W2_df = pd.DataFrame(best_W2)
    best_b2_df = pd.DataFrame(best_b2)
    best_W3_df = pd.DataFrame(best_W3)
    best_b3_df = pd.DataFrame(best_b3)

    # Includes the SSE and Total Misclassifications per Epoch
    training_results_df = pd.DataFrame(training_results)

    # Includes the SSE and Total Misclassifications per Iteration (5 epochs)
    validation_results_df = pd.DataFrame(validation_results)

    training_actual_df.to_csv(os.path.join(folder_name, 'Training_Actual.csv'), index=False)
    training_computed_df.to_csv(os.path.join(folder_name, 'Training_Computed.csv'), index=False)
    validation_actual_df.to_csv(os.path.join(folder_name, 'Validation_Actual.csv'), index=False)
    validation_computed_df.to_csv(os.path.join(folder_name, 'Validation_Computed.csv'), index=False)

    training_results_df.to_csv(os.path.join(folder_name, 'training_results.csv'), index=False)
    validation_results_df.to_csv(os.path.join(folder_name, 'validation_results.csv'), index=False)

    best_W1_df.to_csv(os.path.join(folder_name, 'hidden_weight1.csv'), index=False, header=None)
    best_b1_df.to_csv(os.path.join(folder_name, 'hidden_biases1.csv'), index=False, header=None)
    best_W2_df.to_csv(os.path.join(folder_name, 'hidden_weight2.csv'), index=False, header=None)
    best_b2_df.to_csv(os.path.join(folder_name, 'hidden_biases2.csv'), index=False, header=None)
    best_W3_df.to_csv(os.path.join(folder_name, 'output_weight.csv'), index=False, header=None)
    best_b3_df.to_csv(os.path.join(folder_name, 'output_biases.csv'), index=False, header=None)

    # Training and Validation SSE per Epoch
    plot_curves(training_results, validation_results, filename=os.path.join(folder_name,"training_curves.png"))
    plot_curves_misclassifications(training_results, validation_results, filename=os.path.join(folder_name,"training_curves_misc.png"))

    # Confusion Matrix for the Training Set
    training_cm = compute_confusion_matrix(training_all_computed_curr, training_all_actual_curr, num_classes=8)
    class_names = ['Class 1', 'Class 2', 'Class 3', 'Class 4',
                'Class 5', 'Class 6', 'Class 7', 'Class 8']

    plot_confusion_matrix_pure_matplotlib(training_cm, os.path.join(folder_name, "training_confusion_matrix.png"), class_names, normalize=False)

    # Export the Accuracy, Precision, Recall, F1 scores, Matthews Correlation Coefficient
    save_metrics_to_file(training_cm, os.path.join(folder_name, "training_metrics.txt"))

    validation_cm = compute_confusion_matrix(validation_all_computed_curr, validation_all_actual_curr, num_classes=8)
    plot_confusion_matrix_pure_matplotlib(validation_cm, os.path.join(folder_name, "validation_confusion_matrix.png"), class_names, normalize=False)

    # Export the Accuracy, Precision, Recall, F1 scores, Matthews Correlation Coefficient
    save_metrics_to_file(validation_cm, os.path.join(folder_name, "validation_metrics.txt"))


In [23]:
from datetime import timedelta

best_configuration_accuracy = {
    'hidden_layer_count1': 0,
    'hidden_layer_count2': 0,
    'learning_rate': 0,
    'min_sse': float('inf'),
    'min_misclassifications': float('inf'),
    'training_loss': 0,
    'training_accuracy': 0,
    'validation_loss': 0,
    'validation_accuracy': 0,
    'training_duration': timedelta(999999999),
}
best_configuration_time = {
    'hidden_layer_count1': 0,
    'hidden_layer_count2': 0,
    'learning_rate': 0,
    'min_sse': float('inf'),
    'min_misclassifications': float('inf'),
    'training_loss': 0,
    'training_accuracy': 0,
    'validation_loss': 0,
    'validation_accuracy': 0,
    'training_duration': timedelta(999999999),
}

training_summary = []

for i, config in enumerate(hyperparameter_configs):

    hidden_layer_count1 = config['hidden1']
    hidden_layer_count2 = config['hidden2']
    learning_rate = config['learning_rate']
    print(f"--------- Config {i+1} ---------")
    print(f"h1 = {hidden_layer_count1}")
    print(f"h2 = {hidden_layer_count2}")
    print(config)
    print("---------            ---------")
    
    # training accuracy, training loss, validation_accuracy, validation_loss
    (training_results, validation_results, training_all_actual_curr, training_all_computed_curr,
     validation_all_actual_curr, validation_all_computed_curr, best_W1, best_W2, best_W3, best_b1,
     best_b2, best_b3, start_time, end_time, duration, min_train_loss, min_train_acc,
        min_val_loss, min_val_acc) = backprop_training_v2(
        df=df,
        df_labels=df_labels,
        patience=2, 
        hidden_layer_count1=hidden_layer_count1,
        hidden_layer_count2=hidden_layer_count2,
        learning_rate=learning_rate,
        tanh_param_a=tanh_param_a,
        tanh_param_b=tanh_param_b,
        leaky_param=leaky_param,
        logistic_slope_param=logistic_slope_param,
        activation_function1='tanh',
        activation_function2='tanh')
    print(validation_all_actual_curr)
    
    training_summary.append(
        {'hidden1': hidden_layer_count1,
         'hidden2': hidden_layer_count2,
         'learning_rate': learning_rate,
         'logistic slope': logistic_slope_param,
         'duration': duration,
         'training_loss': min_train_loss,
         'training_accuracy': min_train_acc,
         'validation_loss': min_val_loss,
         'validation_accuracy': min_val_acc
        }
        )
    
    # Check if the current configuration has the best validation accuracy
    # and update the best configuration if necessary
    if validation_results[-1]['Validation Accuracy'] > best_configuration_accuracy['validation_accuracy']:
        print("New best validation accuracy: ", validation_results[-1]['Validation Accuracy'])
        print("Given hidden layer counts and learning rate: ", hidden_layer_count1, hidden_layer_count2, learning_rate)
        best_configuration_accuracy['hidden_layer_count1'] = hidden_layer_count1
        best_configuration_accuracy['hidden_layer_count2'] = hidden_layer_count2
        best_configuration_accuracy['learning_rate'] = learning_rate
        best_configuration_accuracy['min_sse'] = validation_results[-1]['SSE']
        best_configuration_accuracy['min_misclassifications'] = validation_results[-1]['Misclassifications']
        best_configuration_accuracy['training_loss'] = training_results[-1]['Training Loss']
        best_configuration_accuracy['training_accuracy'] = training_results[-1]['Training Accuracy']
        best_configuration_accuracy['validation_loss'] = validation_results[-1]['Validation Loss']
        best_configuration_accuracy['validation_accuracy'] = validation_results[-1]['Validation Accuracy']
        best_configuration_accuracy['training_duration'] = duration

    # Check if the current configuration has the best training duration
    # and update the best configuration if necessary
    if duration < best_configuration_time['training_duration']:
        print("New best training duration: ", duration)
        print("Given hidden layer counts and learning rate: ", hidden_layer_count1, hidden_layer_count2, learning_rate)
        best_configuration_time['hidden_layer_count1'] = hidden_layer_count1
        best_configuration_time['hidden_layer_count2'] = hidden_layer_count2
        best_configuration_time['learning_rate'] = learning_rate
        best_configuration_time['min_sse'] = validation_results[-1]['SSE']
        best_configuration_time['min_misclassifications'] = validation_results[-1]['Misclassifications']
        best_configuration_time['training_loss'] = training_results[-1]['Training Loss']
        best_configuration_time['training_accuracy'] = training_results[-1]['Training Accuracy']
        best_configuration_time['validation_loss'] = validation_results[-1]['Validation Loss']
        best_configuration_time['validation_accuracy'] = validation_results[-1]['Validation Accuracy']
        best_configuration_time['training_duration'] = duration
    
    calculate_and_save_metrics(training_results, validation_results, training_all_actual_curr,
                               training_all_computed_curr, validation_all_actual_curr,
                               validation_all_computed_curr, hidden_layer_count1, hidden_layer_count2,
                               best_W1, best_W2, best_W3, best_b1, best_b2, best_b3,
                               learning_rate, "tanh", "tanh", start_time, end_time, duration)

print("Best Configuration Accuracy:")
print(f"Hidden Layer 1: {best_configuration_accuracy['hidden_layer_count1']}")
print(f"Hidden Layer 2: {best_configuration_accuracy['hidden_layer_count2']}")
print(f"Learning Rate: {best_configuration_accuracy['learning_rate']}")
print(f"Minimum SSE: {best_configuration_accuracy['min_sse']}")
print(f"Minimum Misclassifications: {best_configuration_accuracy['min_misclassifications']}")
print(f"Training Loss: {best_configuration_accuracy['training_loss']}")
print(f"Training Accuracy: {best_configuration_accuracy['training_accuracy']}")
print(f"Validation Loss: {best_configuration_accuracy['validation_loss']}")
print(f"Validation Accuracy: {best_configuration_accuracy['validation_accuracy']}")
print(f"Training Duration: {best_configuration_accuracy['training_duration']}")

print("Best Configuration Time:")
print(f"Hidden Layer 1: {best_configuration_time['hidden_layer_count1']}")
print(f"Hidden Layer 2: {best_configuration_time['hidden_layer_count2']}")
print(f"Learning Rate: {best_configuration_time['learning_rate']}")
print(f"Minimum SSE: {best_configuration_time['min_sse']}")
print(f"Minimum Misclassifications: {best_configuration_time['min_misclassifications']}")
print(f"Training Loss: {best_configuration_time['training_loss']}")
print(f"Training Accuracy: {best_configuration_time['training_accuracy']}")
print(f"Validation Loss: {best_configuration_time['validation_loss']}")
print(f"Validation Accuracy: {best_configuration_time['validation_accuracy']}")
print(f"Training Duration: {best_configuration_time['training_duration']}")

--------- Config 1 ---------
h1 = 256
h2 = 256
{'hidden1': 256, 'hidden2': 256, 'learning_rate': 0.001}
---------            ---------
Epoch  5: train_loss=2.0836, train_acc=0.8957 | val_loss=1.2930, val_acc=0.7163, val_mis=227
Min SSE: 235.57618281603945
Min Misclassifications: 227
Epoch 10: train_loss=1.6119, train_acc=0.9195 | val_loss=0.9899, val_acc=0.7900, val_mis=168
Min SSE: 179.71446326455958
Min Misclassifications: 168
Epoch 15: train_loss=1.3364, train_acc=0.9345 | val_loss=0.7968, val_acc=0.8512, val_mis=119
Min SSE: 148.98095490750234
Min Misclassifications: 119
Epoch 20: train_loss=1.1416, train_acc=0.9433 | val_loss=0.6650, val_acc=0.8900, val_mis=88
Min SSE: 126.5322982715096
Min Misclassifications: 88
Epoch 25: train_loss=0.9896, train_acc=0.9511 | val_loss=0.5803, val_acc=0.8962, val_mis=83
Min SSE: 109.48574340478373
Min Misclassifications: 83
Epoch 30: train_loss=0.8711, train_acc=0.9589 | val_loss=0.5042, val_acc=0.9038, val_mis=77
Min SSE: 96.00387536823038
Min Mi

--------- Config 3 ---------
h1 = 256
h2 = 256
{'hidden1': 256, 'hidden2': 256, 'learning_rate': 0.01}
---------            ---------
Epoch  5: train_loss=0.6955, train_acc=0.9697 | val_loss=0.3768, val_acc=0.9300, val_mis=56
Min SSE: 70.04570391246659
Min Misclassifications: 56
Epoch 10: train_loss=0.4181, train_acc=0.9824 | val_loss=0.1920, val_acc=0.9700, val_mis=24
Min SSE: 38.20427388999166
Min Misclassifications: 24
Epoch 15: train_loss=0.3251, train_acc=0.9864 | val_loss=0.1657, val_acc=0.9812, val_mis=15
Min SSE: 30.80232409361986
Min Misclassifications: 15
Epoch 20: train_loss=0.2684, train_acc=0.9887 | val_loss=0.1208, val_acc=0.9850, val_mis=12
Min SSE: 21.474797851955465
Min Misclassifications: 12
Epoch 25: train_loss=0.2379, train_acc=0.9901 | val_loss=0.1346, val_acc=0.9700, val_mis=24
  → no improvement for 1/10 epochs
Epoch 30: train_loss=0.2101, train_acc=0.9915 | val_loss=0.0926, val_acc=0.9875, val_mis=10
Min SSE: 18.032395366209172
Min Misclassifications: 10
Epoch 3

--------- Config 5 ---------
h1 = 256
h2 = 128
{'hidden1': 256, 'hidden2': 128, 'learning_rate': 0.005}
---------            ---------
Epoch  5: train_loss=1.0794, train_acc=0.9463 | val_loss=0.6323, val_acc=0.8600, val_mis=112
Min SSE: 118.82427573563452
Min Misclassifications: 112
Epoch 10: train_loss=0.6566, train_acc=0.9719 | val_loss=0.3992, val_acc=0.9100, val_mis=72
Min SSE: 73.72457480904421
Min Misclassifications: 72
Epoch 15: train_loss=0.4811, train_acc=0.9808 | val_loss=0.2640, val_acc=0.9475, val_mis=42
Min SSE: 50.93889096749545
Min Misclassifications: 42
Epoch 20: train_loss=0.3798, train_acc=0.9849 | val_loss=0.1913, val_acc=0.9663, val_mis=27
Min SSE: 39.1224971785897
Min Misclassifications: 27
Epoch 25: train_loss=0.3134, train_acc=0.9883 | val_loss=0.1722, val_acc=0.9663, val_mis=27
  → no improvement for 1/10 epochs
Epoch 30: train_loss=0.2740, train_acc=0.9892 | val_loss=0.1819, val_acc=0.9788, val_mis=17
Min SSE: 28.0814737365713
Min Misclassifications: 17
Epoch 3

Epoch  5: train_loss=2.1198, train_acc=0.8947 | val_loss=1.3236, val_acc=0.6450, val_mis=284
Min SSE: 243.9249088229009
Min Misclassifications: 284
Epoch 10: train_loss=1.6486, train_acc=0.9179 | val_loss=1.0140, val_acc=0.7925, val_mis=166
Min SSE: 189.52429309904335
Min Misclassifications: 166
Epoch 15: train_loss=1.3641, train_acc=0.9348 | val_loss=0.8329, val_acc=0.8337, val_mis=133
Min SSE: 157.27089374382825
Min Misclassifications: 133
Epoch 20: train_loss=1.1666, train_acc=0.9430 | val_loss=0.7005, val_acc=0.8775, val_mis=98
Min SSE: 134.27900957587582
Min Misclassifications: 98
Epoch 25: train_loss=1.0109, train_acc=0.9498 | val_loss=0.5831, val_acc=0.8888, val_mis=89
Min SSE: 114.4305231764552
Min Misclassifications: 89
Epoch 30: train_loss=0.8851, train_acc=0.9584 | val_loss=0.5109, val_acc=0.9012, val_mis=79
Min SSE: 99.13935857455158
Min Misclassifications: 79
Epoch 35: train_loss=0.7861, train_acc=0.9653 | val_loss=0.4418, val_acc=0.9012, val_mis=79
  → no improvement for 

--------- Config 9 ---------
h1 = 256
h2 = 64
{'hidden1': 256, 'hidden2': 64, 'learning_rate': 0.01}
---------            ---------
Epoch  5: train_loss=0.7190, train_acc=0.9687 | val_loss=0.3652, val_acc=0.9525, val_mis=38
Min SSE: 61.16860329132834
Min Misclassifications: 38
Epoch 10: train_loss=0.4184, train_acc=0.9830 | val_loss=0.1954, val_acc=0.9637, val_mis=29
Min SSE: 35.91203972405043
Min Misclassifications: 29
Epoch 15: train_loss=0.3039, train_acc=0.9878 | val_loss=0.1184, val_acc=0.9600, val_mis=32
  → no improvement for 1/10 epochs
Epoch 20: train_loss=0.2626, train_acc=0.9891 | val_loss=0.0740, val_acc=0.9875, val_mis=10
Min SSE: 16.341516461403636
Min Misclassifications: 10
Epoch 25: train_loss=0.2069, train_acc=0.9919 | val_loss=0.0690, val_acc=0.9875, val_mis=10
  → no improvement for 1/10 epochs
Epoch 30: train_loss=0.1935, train_acc=0.9921 | val_loss=0.2634, val_acc=0.9413, val_mis=47
  → no improvement for 2/10 epochs
Epoch 35: train_loss=0.1617, train_acc=0.9938 | 

--------- Config 11 ---------
h1 = 256
h2 = 32
{'hidden1': 256, 'hidden2': 32, 'learning_rate': 0.005}
---------            ---------
Epoch  5: train_loss=1.1369, train_acc=0.9448 | val_loss=0.6193, val_acc=0.8750, val_mis=100
Min SSE: 117.66216316982366
Min Misclassifications: 100
Epoch 10: train_loss=0.6737, train_acc=0.9721 | val_loss=0.3941, val_acc=0.9450, val_mis=44
Min SSE: 68.22452133628586
Min Misclassifications: 44
Epoch 15: train_loss=0.4876, train_acc=0.9810 | val_loss=0.2635, val_acc=0.9625, val_mis=30
Min SSE: 46.050125001667084
Min Misclassifications: 30
Epoch 20: train_loss=0.3890, train_acc=0.9848 | val_loss=0.1718, val_acc=0.9613, val_mis=31
  → no improvement for 1/10 epochs
Epoch 25: train_loss=0.3200, train_acc=0.9879 | val_loss=0.1919, val_acc=0.9725, val_mis=22
Min SSE: 32.57924344893432
Min Misclassifications: 22
Epoch 30: train_loss=0.2761, train_acc=0.9893 | val_loss=0.1201, val_acc=0.9762, val_mis=19
Min SSE: 24.652410167774953
Min Misclassifications: 19
Epoc

--------- Config 13 ---------
h1 = 256
h2 = 16
{'hidden1': 256, 'hidden2': 16, 'learning_rate': 0.001}
---------            ---------
Epoch  5: train_loss=2.2949, train_acc=0.8908 | val_loss=1.4038, val_acc=0.6525, val_mis=278
Min SSE: 256.0351286747962
Min Misclassifications: 278
Epoch 10: train_loss=1.7941, train_acc=0.9110 | val_loss=1.0577, val_acc=0.7837, val_mis=173
Min SSE: 198.54899465367265
Min Misclassifications: 173
Epoch 15: train_loss=1.4915, train_acc=0.9277 | val_loss=0.8773, val_acc=0.8087, val_mis=153
Min SSE: 163.65849674686507
Min Misclassifications: 153
Epoch 20: train_loss=1.2666, train_acc=0.9395 | val_loss=0.7434, val_acc=0.8438, val_mis=125
Min SSE: 139.58935077013496
Min Misclassifications: 125
Epoch 25: train_loss=1.0889, train_acc=0.9475 | val_loss=0.6326, val_acc=0.8812, val_mis=95
Min SSE: 117.57874149255265
Min Misclassifications: 95
Epoch 30: train_loss=0.9473, train_acc=0.9553 | val_loss=0.5216, val_acc=0.9025, val_mis=78
Min SSE: 100.40806336301398
Min 

--------- Config 15 ---------
h1 = 256
h2 = 16
{'hidden1': 256, 'hidden2': 16, 'learning_rate': 0.01}
---------            ---------
Epoch  5: train_loss=0.7459, train_acc=0.9682 | val_loss=0.4432, val_acc=0.9113, val_mis=71
Min SSE: 80.70376309200873
Min Misclassifications: 71
Epoch 10: train_loss=0.4293, train_acc=0.9828 | val_loss=0.2051, val_acc=0.9525, val_mis=38
Min SSE: 44.65377616082422
Min Misclassifications: 38
Epoch 15: train_loss=0.3119, train_acc=0.9873 | val_loss=0.1781, val_acc=0.9563, val_mis=35
Min SSE: 34.497349077424886
Min Misclassifications: 35
Epoch 20: train_loss=0.2441, train_acc=0.9900 | val_loss=0.1438, val_acc=0.9613, val_mis=31
Min SSE: 28.151948299288485
Min Misclassifications: 31
Epoch 25: train_loss=0.2023, train_acc=0.9921 | val_loss=0.1533, val_acc=0.9725, val_mis=22
Min SSE: 26.45325977706988
Min Misclassifications: 22
Epoch 30: train_loss=0.1767, train_acc=0.9934 | val_loss=0.1054, val_acc=0.9775, val_mis=18
Min SSE: 20.1437723440162
Min Misclassifica

--------- Config 17 ---------
h1 = 128
h2 = 64
{'hidden1': 128, 'hidden2': 64, 'learning_rate': 0.005}
---------            ---------
Epoch  5: train_loss=1.1459, train_acc=0.9440 | val_loss=0.6100, val_acc=0.8988, val_mis=81
Min SSE: 119.68637068920378
Min Misclassifications: 81
Epoch 10: train_loss=0.6792, train_acc=0.9709 | val_loss=0.3558, val_acc=0.9313, val_mis=55
Min SSE: 69.79465564889409
Min Misclassifications: 55
Epoch 15: train_loss=0.4831, train_acc=0.9805 | val_loss=0.2392, val_acc=0.9413, val_mis=47
Min SSE: 49.09845991476793
Min Misclassifications: 47
Epoch 20: train_loss=0.3815, train_acc=0.9851 | val_loss=0.1700, val_acc=0.9637, val_mis=29
Min SSE: 36.262948219977474
Min Misclassifications: 29
Epoch 25: train_loss=0.3125, train_acc=0.9877 | val_loss=0.1773, val_acc=0.9725, val_mis=22
Min SSE: 30.55630379542127
Min Misclassifications: 22
Epoch 30: train_loss=0.2695, train_acc=0.9896 | val_loss=0.1639, val_acc=0.9738, val_mis=21
Min SSE: 27.3047510323086
Min Misclassific

--------- Config 19 ---------
h1 = 128
h2 = 16
{'hidden1': 128, 'hidden2': 16, 'learning_rate': 0.001}
---------            ---------
Epoch  5: train_loss=2.3292, train_acc=0.8905 | val_loss=1.4491, val_acc=0.6425, val_mis=286
Min SSE: 263.6842233658922
Min Misclassifications: 286
Epoch 10: train_loss=1.8861, train_acc=0.9036 | val_loss=1.1365, val_acc=0.7850, val_mis=172
Min SSE: 214.06794934683697
Min Misclassifications: 172
Epoch 15: train_loss=1.5562, train_acc=0.9224 | val_loss=0.9275, val_acc=0.8013, val_mis=159
Min SSE: 174.58199852439748
Min Misclassifications: 159
Epoch 20: train_loss=1.3132, train_acc=0.9368 | val_loss=0.7779, val_acc=0.8500, val_mis=120
Min SSE: 146.37292645616816
Min Misclassifications: 120
Epoch 25: train_loss=1.1272, train_acc=0.9453 | val_loss=0.6584, val_acc=0.8750, val_mis=100
Min SSE: 124.4556421878243
Min Misclassifications: 100
Epoch 30: train_loss=0.9799, train_acc=0.9528 | val_loss=0.5301, val_acc=0.9087, val_mis=73
Min SSE: 104.49794027209826
Min

--------- Config 21 ---------
h1 = 128
h2 = 16
{'hidden1': 128, 'hidden2': 16, 'learning_rate': 0.01}
---------            ---------
Epoch  5: train_loss=0.8282, train_acc=0.9631 | val_loss=0.4358, val_acc=0.9025, val_mis=78
Min SSE: 82.48727826359477
Min Misclassifications: 78
Epoch 10: train_loss=0.4467, train_acc=0.9817 | val_loss=0.2639, val_acc=0.9350, val_mis=52
Min SSE: 50.18404520129719
Min Misclassifications: 52
Epoch 15: train_loss=0.3082, train_acc=0.9882 | val_loss=0.2438, val_acc=0.9337, val_mis=53
  → no improvement for 1/10 epochs
Epoch 20: train_loss=0.2439, train_acc=0.9904 | val_loss=0.1603, val_acc=0.9650, val_mis=28
Min SSE: 28.16311705086367
Min Misclassifications: 28
Epoch 25: train_loss=0.2125, train_acc=0.9916 | val_loss=0.1418, val_acc=0.9762, val_mis=19
Min SSE: 23.400286244400082
Min Misclassifications: 19
Epoch 30: train_loss=0.1764, train_acc=0.9932 | val_loss=0.1600, val_acc=0.9587, val_mis=33
  → no improvement for 1/10 epochs
Epoch 35: train_loss=0.1584,

--------- Config 23 ---------
h1 = 32
h2 = 16
{'hidden1': 32, 'hidden2': 16, 'learning_rate': 0.005}
---------            ---------
Epoch  5: train_loss=1.3220, train_acc=0.9372 | val_loss=0.7501, val_acc=0.8425, val_mis=126
Min SSE: 142.42762239718166
Min Misclassifications: 126
Epoch 10: train_loss=0.7722, train_acc=0.9678 | val_loss=0.4515, val_acc=0.9237, val_mis=61
Min SSE: 82.0661256216851
Min Misclassifications: 61
Epoch 15: train_loss=0.5463, train_acc=0.9785 | val_loss=0.3270, val_acc=0.9550, val_mis=36
Min SSE: 58.39690995005908
Min Misclassifications: 36
Epoch 20: train_loss=0.4156, train_acc=0.9839 | val_loss=0.2653, val_acc=0.9675, val_mis=26
Min SSE: 45.83941198333619
Min Misclassifications: 26
Epoch 25: train_loss=0.3369, train_acc=0.9877 | val_loss=0.1917, val_acc=0.9688, val_mis=25
Min SSE: 35.7082975132877
Min Misclassifications: 25
Epoch 30: train_loss=0.2819, train_acc=0.9893 | val_loss=0.1868, val_acc=0.9700, val_mis=24
Min SSE: 28.973687560974604
Min Misclassifica

--------- Config 25 ---------
h1 = 16
h2 = 16
{'hidden1': 16, 'hidden2': 16, 'learning_rate': 0.001}
---------            ---------
Epoch  5: train_loss=2.7278, train_acc=0.8758 | val_loss=1.7838, val_acc=0.4300, val_mis=456
Min SSE: 317.23390849699115
Min Misclassifications: 456
Epoch 10: train_loss=2.3551, train_acc=0.8889 | val_loss=1.4810, val_acc=0.5112, val_mis=391
Min SSE: 273.3017346064304
Min Misclassifications: 391
Epoch 15: train_loss=2.0246, train_acc=0.8994 | val_loss=1.2361, val_acc=0.6975, val_mis=242
Min SSE: 231.97527728442043
Min Misclassifications: 242
Epoch 20: train_loss=1.7726, train_acc=0.9131 | val_loss=1.0726, val_acc=0.7650, val_mis=188
Min SSE: 200.24051286130845
Min Misclassifications: 188
Epoch 25: train_loss=1.5683, train_acc=0.9271 | val_loss=0.9414, val_acc=0.8087, val_mis=153
Min SSE: 174.30122746139193
Min Misclassifications: 153
Epoch 30: train_loss=1.3996, train_acc=0.9367 | val_loss=0.8318, val_acc=0.8337, val_mis=133
Min SSE: 155.07317585978421
Min

--------- Config 27 ---------
h1 = 16
h2 = 16
{'hidden1': 16, 'hidden2': 16, 'learning_rate': 0.01}
---------            ---------
Epoch  5: train_loss=0.9945, train_acc=0.9539 | val_loss=0.5220, val_acc=0.9087, val_mis=73
Min SSE: 94.27060362733687
Min Misclassifications: 73
Epoch 10: train_loss=0.5403, train_acc=0.9775 | val_loss=0.2676, val_acc=0.9575, val_mis=34
Min SSE: 45.51974236855209
Min Misclassifications: 34
Epoch 15: train_loss=0.3699, train_acc=0.9852 | val_loss=0.1599, val_acc=0.9700, val_mis=24
Min SSE: 28.474147320946592
Min Misclassifications: 24
Epoch 20: train_loss=0.2866, train_acc=0.9885 | val_loss=0.1055, val_acc=0.9750, val_mis=20
Min SSE: 23.611892402413794
Min Misclassifications: 20
Epoch 25: train_loss=0.2379, train_acc=0.9903 | val_loss=0.0861, val_acc=0.9812, val_mis=15
Min SSE: 18.72805163245666
Min Misclassifications: 15
Epoch 30: train_loss=0.1957, train_acc=0.9924 | val_loss=0.0839, val_acc=0.9862, val_mis=11
Min SSE: 14.619006804456767
Min Misclassifica

In [24]:
# save the training weights and biases into their own files
A_best_W1_df = pd.DataFrame(best_W1)
A_best_b1_df = pd.DataFrame(best_b1)
A_best_W2_df = pd.DataFrame(best_W2)
A_best_b2_df = pd.DataFrame(best_b2)
A_best_W3_df = pd.DataFrame(best_W3)
A_best_b3_df = pd.DataFrame(best_b3)

os.makedirs(os.path.join("best_weights", "A"), exist_ok=True)
A_best_W1_df.to_csv(os.path.join("best_weights", "A", "best_W1.csv"), index=False, header=None)
A_best_b1_df.to_csv(os.path.join("best_weights", "A", "best_b1.csv"), index=False, header=None)
A_best_W2_df.to_csv(os.path.join("best_weights", "A", "best_W2.csv"), index=False, header=None)
A_best_b2_df.to_csv(os.path.join("best_weights", "A", "best_b2.csv"), index=False, header=None)
A_best_W3_df.to_csv(os.path.join("best_weights", "A", "best_W3.csv"), index=False, header=None)
A_best_b3_df.to_csv(os.path.join("best_weights", "A", "best_b3.csv"), index=False, header=None)
# Save the best configuration to a file
os.makedirs(os.path.join("best_configuration","A"), exist_ok=True)
with open(os.path.join("best_configuration", "A", "best_configuration.txt"), 'w') as f:
    f.write("Best Configuration Accuracy:\n")
    for key, value in best_configuration_accuracy.items():
        f.write(f"{key}: {value}\n")
    f.write("\nBest Configuration Time:\n")
    for key, value in best_configuration_time.items():
        f.write(f"{key}: {value}\n")
        
training_summary_pda = pd.DataFrame(training_summary)
training_summary_pda.to_csv('training_summary_a.csv')

# Training Phase 2 using Network B Configuration

This takes the best configuration from the Network A runs, and runs 50 epochs of training.

In [25]:
Network = 'B'

Best Configuration based on both accuracy and speed.

In [35]:
print(f"--------- Config Network B ---------")
print(f"h1 = {32}")
print(f"h2 = {16}")
print(f"learning rate = {0.01}")
print("---------            ---------")
H1=32
H2=16
(training_results, validation_results, training_all_actual_curr,
 training_all_computed_curr, validation_all_actual_curr, validation_all_computed_curr,
 networkb_best_W1, networkb_best_W2, networkb_best_W3, networkb_best_b1, networkb_best_b2,
 networkb_best_b3, start_time, end_time, duration, min_train_loss, min_train_acc,
        min_val_loss, min_val_acc) = backprop_training_v2(
        df=df,
        df_labels=df_labels,
        patience=2, 
        hidden_layer_count1=32,
        hidden_layer_count2=16,
        learning_rate=0.01,
        tanh_param_a=tanh_param_a,
        tanh_param_b=tanh_param_b,
        leaky_param=leaky_param,
        logistic_slope_param=logistic_slope_param,
        activation_function1='relu',
        activation_function2='relu')
hidden_layer_count1=H1
hidden_layer_count2=H2
calculate_and_save_metrics(training_results, validation_results, training_all_actual_curr,
                           training_all_computed_curr, validation_all_actual_curr,
                           validation_all_computed_curr, hidden_layer_count1, hidden_layer_count2,
                           networkb_best_W1, networkb_best_W2, networkb_best_W3, networkb_best_b1,
                           networkb_best_b2, networkb_best_b3, learning_rate, "relu", "relu",
                           start_time, end_time, duration)

--------- Config Network B ---------
h1 = 32
h2 = 16
learning rate = 0.01
---------            ---------
Epoch  5: train_loss=1.3032, train_acc=0.9390 | val_loss=1.1689, val_acc=0.4963, val_mis=403
Min SSE: 283.8004227863593
Min Misclassifications: 403
Epoch 10: train_loss=0.6918, train_acc=0.9700 | val_loss=0.6959, val_acc=0.6088, val_mis=313
Min SSE: 283.8004227863593
Min Misclassifications: 313
Epoch 15: train_loss=0.4456, train_acc=0.9823 | val_loss=0.4807, val_acc=0.6438, val_mis=285
Min SSE: 283.8004227863593
Min Misclassifications: 285
Epoch 20: train_loss=0.3264, train_acc=0.9876 | val_loss=0.3603, val_acc=0.6913, val_mis=247
Min SSE: 283.8004227863593
Min Misclassifications: 247
Epoch 25: train_loss=0.2539, train_acc=0.9904 | val_loss=0.1593, val_acc=0.7350, val_mis=212
Min SSE: 283.8004227863593
Min Misclassifications: 212
Epoch 30: train_loss=0.2132, train_acc=0.9921 | val_loss=0.1095, val_acc=0.7675, val_mis=186
Min SSE: 283.8004227863593
Min Misclassifications: 186
Epoch 3

In [36]:
# save the training weights and biases into their own files
B_best_W1_df = pd.DataFrame(networkb_best_W1)
B_best_b1_df = pd.DataFrame(networkb_best_b1)
B_best_W2_df = pd.DataFrame(networkb_best_W2)
B_best_b2_df = pd.DataFrame(networkb_best_b2)
B_best_W3_df = pd.DataFrame(networkb_best_W3)
B_best_b3_df = pd.DataFrame(networkb_best_b3)

os.makedirs(os.path.join("best_weights", "B"), exist_ok=True)
B_best_W1_df.to_csv(os.path.join("best_weights", "B", "best_W1.csv"), index=False, header=None)
B_best_b1_df.to_csv(os.path.join("best_weights", "B", "best_b1.csv"), index=False, header=None)
B_best_W2_df.to_csv(os.path.join("best_weights", "B", "best_W2.csv"), index=False, header=None)
B_best_b2_df.to_csv(os.path.join("best_weights", "B", "best_b2.csv"), index=False, header=None)
B_best_W3_df.to_csv(os.path.join("best_weights", "B", "best_W3.csv"), index=False, header=None)
B_best_b3_df.to_csv(os.path.join("best_weights", "B", "best_b3.csv"), index=False, header=None)
# Save the best configuration to a file
os.makedirs(os.path.join("best_configuration","B"), exist_ok=True)
with open(os.path.join("best_configuration", "B", "best_configuration.txt"), 'w') as f:
    f.write("Best Configuration Accuracy:\n")
    f.write(str(min_val_acc))
    f.write("\nBest Configuration Time:\n")
    f.write(str(duration))
    f.write("\n")
    f.write("Hidden Layer 1: ")
    f.write(str(32))
    f.write("\n")
    f.write("Hidden Layer 2: ")
    f.write(str(16))
    f.write("\n")
    f.write("Learning Rate: ")
    f.write(str(0.01))
    f.write("\n")

# Run best weights from Networks A and B on Test Set

## Network A

In [37]:
function_hidden1 = 'tanh'
function_hidden2 = 'tanh'

# Retrieve the best weights and biases
best_W1 = pd.read_csv('best_weights/A/best_W1.csv', header=None).values
best_b1 = pd.read_csv('best_weights/A/best_b1.csv', header=None).values
best_W2 = pd.read_csv('best_weights/A/best_W2.csv', header=None).values
best_b2 = pd.read_csv('best_weights/A/best_b2.csv', header=None).values
best_W3 = pd.read_csv('best_weights/A/best_W3.csv', header=None).values
best_b3 = pd.read_csv('best_weights/A/best_b3.csv', header=None).values
# Load the test set
df_test = pd.read_csv('test_set.csv', header=None)
# Run forward_pass_v2 on the df_test which doesn't have labels
X_test = df_test.to_numpy()
# Perform forward pass
outputs = forward_pass_v2(
    function_hidden1, function_hidden2, function_output, X_test, best_W1, best_W2, best_W3,
    best_b1, best_b2, best_b3, None, tanh_param_a, tanh_param_b, logistic_slope_param,
    leaky_param)
classification_computed = np.argmax(outputs, axis=1) + 1
pd.DataFrame(classification_computed).to_csv('predictions_for_test_tanh.csv', index=False, header=None)
print("Test set predictions saved to 'predictions_for_test_tanh.csv")

Test set predictions saved to 'predictions_for_test_tanh.csv


## Network B

In [38]:
function_hidden1 = 'relu'
function_hidden2 = 'relu'

# Retrieve the best weights and biases
best_W1 = pd.read_csv('best_weights/B/best_W1.csv', header=None).values
best_b1 = pd.read_csv('best_weights/B/best_b1.csv', header=None).values
best_W2 = pd.read_csv('best_weights/B/best_W2.csv', header=None).values
best_b2 = pd.read_csv('best_weights/B/best_b2.csv', header=None).values
best_W3 = pd.read_csv('best_weights/B/best_W3.csv', header=None).values
best_b3 = pd.read_csv('best_weights/B/best_b3.csv', header=None).values
# Load the test set
df_test = pd.read_csv('test_set.csv', header=None)
# Run forward_pass_v2 on the df_test which doesn't have labels
X_test = df_test.to_numpy()
# Perform forward pass
outputs = forward_pass_v2(
    function_hidden1, function_hidden2, function_output, X_test, best_W1, best_W2, best_W3,
    best_b1, best_b2, best_b3, None, tanh_param_a, tanh_param_b, logistic_slope_param,
    leaky_param)
classification_computed = np.argmax(outputs, axis=1) + 1
pd.DataFrame(classification_computed).to_csv('predictions_for_test_leakyrelu.csv', index=False, header=None)
print("Test set predictions saved to 'predictions_for_test_tanh.csv")

Test set predictions saved to 'predictions_for_test_tanh.csv
