<a href="https://colab.research.google.com/github/arunangshudutta/DA6401_assignments/blob/main/assignment_1/Assignment_1_Q8.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from numpy import linalg as la
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore', category=RuntimeWarning)

from keras.datasets import fashion_mnist
from sklearn.model_selection import train_test_split

import wandb

In [None]:
def weights_initialization(num_neurons, initializer):
  """
  Create the initial weight matrices and bias vectors of a fully connected network.

  num_neurons = list of number of neurons at each layer starting from the input layer and ending at output layer
  initializer = 'random' (entries drawn from a standard normal distribution) or
                'Xavier' (entries drawn uniformly from +/- sqrt(6 / (rows + cols)));
                the name is matched case-insensitively

  Returns: initialized weight matrices and bias vectors
           W_matrices[i] has shape (num_neurons[i+1], num_neurons[i])
           b_vectors[i] is a zero column vector of shape (num_neurons[i+1], 1)

  Raises: ValueError if initializer is not a supported name
  """
  mean = 0
  std_dev = 1

  # Reject unknown names before building anything; without a valid branch
  # there would be no weight matrix to append inside the loop below.
  init_kind = str(initializer).lower()
  if init_kind not in ('random', 'xavier'):
    raise ValueError("initializer invalid: {!r} (expected 'random' or 'Xavier')".format(initializer))

  W_matrices = []
  b_vectors = []

  for i in range(len(num_neurons)-1):
    rows = num_neurons[i+1]
    cols = num_neurons[i]

    if init_kind == 'random':
      weight_matrix = np.random.normal(mean, std_dev, size=(rows, cols))
    else:
      # Xavier / Glorot uniform initialization
      upper_bound = np.sqrt(6.0/(rows + cols))
      lower_bound = -1*upper_bound
      weight_matrix = np.random.uniform(low = lower_bound, high = upper_bound, size = (rows, cols))

    bias_vector = np.zeros((rows,1))

    W_matrices.append(weight_matrix)
    b_vectors.append(bias_vector)

  return W_matrices, b_vectors

################################################################################

# ACTIVATION FUNCTIONS
def relu(x):
  """
  ReLU activation: element-wise maximum of 0 and x
  """
  rectified = np.maximum(0, x)
  return rectified

def sigmoid(x):
  """
  Logistic sigmoid activation: 1 / (1 + exp(-x)), applied element-wise
  """
  denominator = 1 + np.exp(-x)
  return 1 / denominator

def tanh(x):
  """
  tanh activation, applied element-wise (thin wrapper around np.tanh)
  """
  squashed = np.tanh(x)
  return squashed
def softmax(x):

  """
  Softmax function for output layer

  Normalizes along axis 0, i.e. every column of x (one sample per column)
  is turned into a probability vector.
  """
  # Softmax is invariant to subtracting a per-column constant. Shifting by the
  # column maximum keeps every exponent <= 0, so np.exp cannot overflow to inf
  # (which would otherwise produce inf/inf = nan for large logits).
  shifted = x - np.max(x, axis=0, keepdims=True)
  exps = np.exp(shifted)
  return exps / np.sum(exps, axis=0, keepdims=True)

def activation_output(x, activation_function):
  """
  Apply the named activation function to x.

  activation_function = 'ReLU', 'sigmoid', 'tanh', 'softmax'

  Raises: ValueError for any other name (instead of silently returning None,
          which would only fail later in an unrelated matrix operation)
  """
  if activation_function == 'ReLU':
    return relu(x)
  elif activation_function == 'sigmoid':
    return sigmoid(x)
  elif activation_function == 'tanh':
    return tanh(x)
  elif activation_function == 'softmax':
    return softmax(x)
  else:
    raise ValueError('activation function invalid: {!r}'.format(activation_function))

# DERIVATIVE OF ACTIVATION FUNCTION
def sigmoid_derivative(x):
  """
  Derivative of the sigmoid with respect to its input, evaluated at x
  """
  activation = sigmoid(x)
  return activation * (1 - activation)

def tanh_derivative(x):
  """
  Derivative of tanh with respect to its input, evaluated at x
  """
  activation = tanh(x)
  return 1 - activation**2

def relu_derivative(x):
  """
  Derivative of ReLU: 1 where x is strictly positive, 0 elsewhere
  """
  is_positive = x > 0
  return 1*is_positive

def activation_derivative(x, activation_function):
  """
  Evaluate the derivative of the named activation function at pre-activation x.

  activation_function = 'ReLU', 'sigmoid', 'tanh'

  Raises: ValueError for any other name (instead of silently returning None,
          which would only fail later when the result is multiplied)
  """
  if activation_function == 'ReLU':
    return relu_derivative(x)
  elif activation_function == 'sigmoid':
    return sigmoid_derivative(x)
  elif activation_function == 'tanh':
    return tanh_derivative(x)
  else:
    raise ValueError('activation function invalid: {!r}'.format(activation_function))

################################################################################

def layer_output_FP(x, weight_matrix, bias_vector, activation_function):
  """
  Forward pass through a single layer.

  Returns: (pre-activation W x + b, post-activation after applying activation_function)
  """
  weighted_input = np.matmul(weight_matrix, x)
  pre_activation = np.add(weighted_input, bias_vector)
  return pre_activation, activation_output(pre_activation, activation_function)

def forward_propagation(ip_data, W_matrices, b_vectors, activation_functions):
  """
  Run the input through every layer of the network.

  Returns: (layer_ip, layer_op)
           layer_ip[i] = pre-activation of layer i+1
           layer_op[0] = ip_data, layer_op[i+1] = output of layer i+1
  """
  layer_ip = []
  layer_op = [ip_data]

  for idx, weight_matrix in enumerate(W_matrices):
    # the most recently stored output is the input of the current layer
    pre_activation, post_activation = layer_output_FP(
        layer_op[idx], weight_matrix, b_vectors[idx], activation_functions[idx])

    layer_ip.append(pre_activation)
    layer_op.append(post_activation)

  return layer_ip, layer_op

################################################################################

def back_propagation(W_matrices, b_vectors, y_true, layer_ip, layer_op, activation_functions, batch_size, w_d, loss_function):
  """
  Back-propagate the loss through the network and return the parameter gradients.

  W_matrices, b_vectors = current parameters (b_vectors is not read here)
  y_true = one-hot targets, one sample per column
  layer_ip, layer_op = pre-activations / outputs as returned by forward_propagation
                       (layer_op[0] is the network input, layer_op[-1] the prediction)
  activation_functions = activation name of every layer
  batch_size = divisor used to average the gradients
  w_d = L2 weight decay coefficient; w_d*W is added before dividing by batch_size
  loss_function = 'cross_entropy' or 'squared_error'

  The output-layer gradient assumes the last layer is a softmax (train_model
  always appends one).

  Returns: (DWs, Dbs) ordered from the LAST layer to the FIRST layer

  Raises: ValueError if loss_function is not supported
  """
  if loss_function not in ('cross_entropy', 'squared_error'):
    raise ValueError('loss function invalid: {!r}'.format(loss_function))

  num_layers = len(W_matrices)
  DWs = []
  Dbs = []
  Da = None

  # walk the layers backwards: k = num_layers, ..., 1
  for k in range(num_layers, 0, -1):

    if k == num_layers:
      y_hat = layer_op[k]
      if loss_function == 'cross_entropy':
        # softmax + cross entropy: dL/da = y_hat - y
        Da = y_hat - y_true
      else:
        # softmax + squared error: the softmax Jacobian is not diagonal, so
        # dL/da_j = y_hat_j * ((y_hat_j - y_j) - sum_i (y_hat_i - y_i) * y_hat_i)
        err = y_hat - y_true
        Da = y_hat*(err - np.sum(err*y_hat, axis=0, keepdims=True))
    else:
      # gradient w.r.t. this layer's output, then through its activation
      Dh = np.matmul(W_matrices[k].T, Da)
      Dg = activation_derivative(layer_ip[k-1], activation_functions[k-1])
      Da = np.multiply(Dh, Dg)

    Dw = (np.matmul(Da, layer_op[k-1].T) + w_d*W_matrices[k-1])/batch_size
    Db = np.sum(Da, axis=1, keepdims=True)/batch_size

    DWs.append(Dw)
    Dbs.append(Db)

  return DWs, Dbs


################################################################################


def update_weights_gd(W_matrices, b_vectors, DWs, Dbs, learning_rate = 0.1):
  """
  Plain gradient descent step: parameter <- parameter - learning_rate * gradient.

  DWs and Dbs are reversed IN PLACE before use, so that their order matches
  the order of W_matrices / b_vectors. The parameter lists are updated in
  place and also returned.
  """
  DWs.reverse()
  Dbs.reverse()

  for layer, grad_w in enumerate(DWs):
    grad_b = Dbs[layer]
    W_matrices[layer] = W_matrices[layer] - learning_rate*grad_w
    b_vectors[layer] = b_vectors[layer] - learning_rate*grad_b

  return W_matrices, b_vectors

def update_weights_momentum(W_matrices, b_vectors, DWs, Dbs, u_past_w, u_past_b, learning_rate = 0.1, beta = 0.5):
  """
  Momentum step: u <- beta * u + gradient, parameter <- parameter - learning_rate * u.

  DWs and Dbs are reversed IN PLACE to match the layer order of the parameters.
  The history lists u_past_w / u_past_b are updated in place and returned as
  the new histories together with the updated parameters.
  """
  DWs.reverse()
  Dbs.reverse()

  # the returned histories are the very same list objects that were passed in
  u_w, u_b = u_past_w, u_past_b

  for layer, grad_w in enumerate(DWs):
    grad_b = Dbs[layer]

    u_w[layer] = beta*u_past_w[layer] + grad_w
    u_b[layer] = beta*u_past_b[layer] + grad_b

    W_matrices[layer] = W_matrices[layer] - learning_rate*u_w[layer]
    b_vectors[layer] = b_vectors[layer] - learning_rate*u_b[layer]

  return W_matrices, b_vectors, u_w, u_b

def update_weights_adagrad(W_matrices, b_vectors, DWs, Dbs, u_past_w, u_past_b, learning_rate = 0.1):
  """
  AdaGrad step: accumulate squared gradients in u and scale every update by
  1 / (sqrt(u) + eps).

  DWs and Dbs are reversed IN PLACE to match the layer order of the parameters.
  The accumulator lists u_past_w / u_past_b are updated in place and returned.
  """
  eps = 1e-8

  DWs.reverse()
  Dbs.reverse()

  # the returned accumulators are the same list objects that were passed in
  u_w, u_b = u_past_w, u_past_b

  for layer, grad_w in enumerate(DWs):
    grad_b = Dbs[layer]

    u_w[layer] = u_past_w[layer] + grad_w**2
    u_b[layer] = u_past_b[layer] + grad_b**2

    scale_w = np.sqrt(u_w[layer]) + eps
    scale_b = np.sqrt(u_b[layer]) + eps
    W_matrices[layer] = W_matrices[layer] - learning_rate*grad_w/scale_w
    b_vectors[layer] = b_vectors[layer] - learning_rate*grad_b/scale_b

  return W_matrices, b_vectors, u_w, u_b

def update_weights_rmsprop(W_matrices, b_vectors, DWs, Dbs, u_past_w, u_past_b, learning_rate = 0.1, beta = 0.5):
  """
  RMSProp step: u <- beta * u + (1 - beta) * gradient^2, then scale every
  update by 1 / (sqrt(u) + eps).

  DWs and Dbs are reversed IN PLACE to match the layer order of the parameters.
  The running-average lists u_past_w / u_past_b are updated in place and returned.
  """
  eps = 1e-8

  DWs.reverse()
  Dbs.reverse()

  # the returned averages are the same list objects that were passed in
  u_w, u_b = u_past_w, u_past_b

  for layer, grad_w in enumerate(DWs):
    grad_b = Dbs[layer]

    u_w[layer] = beta*u_past_w[layer] + (1-beta)*grad_w**2
    u_b[layer] = beta*u_past_b[layer] + (1-beta)*grad_b**2

    scale_w = np.sqrt(u_w[layer]) + eps
    scale_b = np.sqrt(u_b[layer]) + eps
    W_matrices[layer] = W_matrices[layer] - learning_rate*grad_w/scale_w
    b_vectors[layer] = b_vectors[layer] - learning_rate*grad_b/scale_b

  return W_matrices, b_vectors, u_w, u_b

def update_weights_adam(W_matrices, b_vectors, DWs, Dbs, mw_past, mb_past, vw_past, vb_past, t, learning_rate = 0.1, beta1 = 0.5, beta2 =0.5):
  """
  Adam step with bias-corrected first (m) and second (v) moment estimates.

  t = step counter used for the bias correction 1 - beta**t
  DWs and Dbs are reversed IN PLACE to match the layer order of the parameters.
  The moment lists (mw_past, mb_past, vw_past, vb_past) are updated in place
  and returned together with the updated parameters.
  """
  eps = 1e-8

  DWs.reverse()
  Dbs.reverse()

  # the returned moments are the same list objects that were passed in
  mw, mb, vw, vb = mw_past, mb_past, vw_past, vb_past

  for layer, grad_w in enumerate(DWs):
    grad_b = Dbs[layer]

    # first moment and its bias-corrected version
    mw[layer] = beta1*mw_past[layer] + (1-beta1)*grad_w
    mb[layer] = beta1*mb_past[layer] + (1-beta1)*grad_b
    mw_cap = mw[layer]/(1 - beta1**t)
    mb_cap = mb[layer]/(1 - beta1**t)

    # second moment and its bias-corrected version
    vw[layer] = beta2*vw_past[layer] + (1-beta2)*grad_w**2
    vb[layer] = beta2*vb_past[layer] + (1-beta2)*grad_b**2
    vw_cap = vw[layer]/(1 - beta2**t)
    vb_cap = vb[layer]/(1 - beta2**t)

    W_matrices[layer] = W_matrices[layer] - learning_rate*mw_cap/(np.sqrt(vw_cap) + eps)
    b_vectors[layer] = b_vectors[layer] - learning_rate*mb_cap/(np.sqrt(vb_cap) + eps)

  return W_matrices, b_vectors, mw, mb, vw, vb

def update_weights_nadam(W_matrices, b_vectors, DWs, Dbs, mw_past, mb_past, vw_past, vb_past,t,  learning_rate = 0.1, beta1 = 0.5, beta2 =0.5):
  """
  NAdam step: Adam moments combined with a Nesterov-style look-ahead on the
  first moment.

  t = step counter; the bias corrections here use the exponent t+1
  DWs and Dbs are reversed IN PLACE to match the layer order of the parameters.
  The moment lists (mw_past, mb_past, vw_past, vb_past) are updated in place
  and returned together with the updated parameters.
  """
  eps = 1e-8

  DWs.reverse()
  Dbs.reverse()

  # the returned moments are the same list objects that were passed in
  mw, mb, vw, vb = mw_past, mb_past, vw_past, vb_past

  for layer, grad_w in enumerate(DWs):
    grad_b = Dbs[layer]

    # first moment and its bias-corrected version
    mw[layer] = beta1*mw_past[layer] + (1-beta1)*grad_w
    mb[layer] = beta1*mb_past[layer] + (1-beta1)*grad_b
    mw_cap = mw[layer]/(1 - beta1**(t+1))
    mb_cap = mb[layer]/(1 - beta1**(t+1))

    # second moment and its bias-corrected version
    vw[layer] = beta2*vw_past[layer] + (1-beta2)*grad_w**2
    vb[layer] = beta2*vb_past[layer] + (1-beta2)*grad_b**2
    vw_cap = vw[layer]/(1 - beta2**(t+1))
    vb_cap = vb[layer]/(1 - beta2**(t+1))

    # Nesterov term: blend the corrected momentum with the current gradient
    step_w = beta1*mw_cap + ((1-beta1)/(1 - beta1**(t+1)))*grad_w
    step_b = beta1*mb_cap + ((1-beta1)/(1 - beta1**(t+1)))*grad_b

    W_matrices[layer] = W_matrices[layer] - learning_rate*step_w/(np.sqrt(vw_cap) + eps)
    b_vectors[layer] = b_vectors[layer] - learning_rate*step_b/(np.sqrt(vb_cap) + eps)

  return W_matrices, b_vectors, mw, mb, vw, vb

def look_ahead_nag(W_s, b_s, u_past_w, u_past_b, beta = 0.5):
  """
  Shift every parameter by -beta * (its update history).

  The entries of the lists W_s and b_s are replaced IN PLACE; the same two
  list objects are returned.
  """
  for layer, history_w in enumerate(u_past_w[:len(W_s)]):
    W_s[layer] = W_s[layer] - beta*history_w
    b_s[layer] = b_s[layer] - beta*u_past_b[layer]
  return W_s, b_s

################################################################################


def one_hot_encode(integers, num_classes=None):
  """
  One-hot encode integer labels: row i of the result is the indicator vector
  of integers[i]. When num_classes is None it is taken as max(integers) + 1.
  """
  n_classes = np.max(integers) + 1 if num_classes is None else num_classes
  identity = np.eye(n_classes)
  return identity[integers]

def cross_entropy_loss(y_true, y_pred, batch_size):
  """
  Cross entropy between y_true and y_pred (one sample per column),
  summed over all entries and divided by batch_size.
  """
  # keep predictions strictly inside (0, 1) so that log() stays finite
  clipped = np.clip(y_pred, 1e-15, 1 - 1e-15)
  per_sample = np.sum(y_true*np.log(clipped), axis=0)
  mean_log_likelihood = np.sum(per_sample)/batch_size
  return mean_log_likelihood*(-1)

def squared_error_loss(y_true, y_pred, batch_size):
  """
  Half of the squared Euclidean distance between y_true and y_pred
  (one sample per column), summed over the columns and divided by batch_size.
  """
  column_norms = la.norm(y_true-y_pred, axis=0)
  total_squared_error = np.sum(column_norms**2)
  return 0.5*total_squared_error/batch_size

def accuracy(y_true, y_pred, batch_size):
  """
  Percentage of the first batch_size samples whose predicted class matches
  the true class.

  y_true, y_pred = arrays with one sample per column; the class of a sample is
                   the argmax of its column
  batch_size = number of leading columns to score (also the divisor)

  Returns: accuracy in percent, as a float

  Raises: IndexError if batch_size exceeds the number of available columns
  """
  if batch_size > y_true.shape[1] or batch_size > y_pred.shape[1]:
    raise IndexError('batch_size exceeds the number of samples')

  # one vectorized argmax per array instead of a Python loop over every sample
  true_labels = y_true[:, :batch_size].argmax(axis=0)
  pred_labels = y_pred[:, :batch_size].argmax(axis=0)
  n_correct = int(np.count_nonzero(true_labels == pred_labels))
  return 100 * n_correct / batch_size

################################################################################

def load_split_dataset(test_ratio=0.3):
  """
  Load Fashion MNIST and prepare train / validation / test sets.

  The official training set is split into train and validation parts
  (test_ratio is the validation fraction, fixed random_state=42). Images are
  flattened to one column per sample and divided by 255; labels are one-hot
  encoded with 10 classes, one column per sample.

  Returns: X_train, Y_train, X_val, Y_val, X_test, Y_test
  """
  def as_columns(images):
    # (n_samples, ...) -> (n_features, n_samples), scaled by 1/255
    n_samples = images.shape[0]
    return (images.reshape(n_samples, -1).T)/255

  def as_one_hot_columns(labels):
    return one_hot_encode(labels, 10).T

  (train_images, train_labels), (test_images, test_labels) = fashion_mnist.load_data()

  # carve the validation set out of the official training set
  X_train, X_val, Y_train, Y_val = train_test_split(train_images, train_labels, test_size=test_ratio, random_state=42)

  X_train, Y_train = as_columns(X_train), as_one_hot_columns(Y_train)
  X_val, Y_val = as_columns(X_val), as_one_hot_columns(Y_val)
  X_test, Y_test = as_columns(test_images), as_one_hot_columns(test_labels)

  return X_train, Y_train, X_val, Y_val, X_test, Y_test


In [None]:

def _evaluate_split(X, Y, W_s, b_s, activation_functions):
  """
  Forward-propagate one full data split and score the network on it.

  Returns: (cross entropy loss, squared error loss, accuracy in percent)
  """
  _, op_all = forward_propagation(X, W_s, b_s, activation_functions)
  y_pred = op_all[-1]
  ce_loss = cross_entropy_loss(Y, y_pred, X.shape[1])
  se_loss = squared_error_loss(Y, y_pred, X.shape[1])
  acc = accuracy(Y, y_pred, Y.shape[1])
  return ce_loss, se_loss, acc


def _report_metrics(train_metrics, test_metrics):
  """
  Print the (CE loss, SE loss, accuracy) triples of both splits and log them to wandb.
  """
  ce_loss_tr, se_loss_tr, acc_tr = train_metrics
  ce_loss_ts, se_loss_ts, acc_ts = test_metrics

  print("CE Training Loss: ", ce_loss_tr, "SE Training Loss: ", se_loss_tr,"Training Accuracy: ", acc_tr, "CE Test Loss: ",
        ce_loss_ts, "SE Test Loss: ", se_loss_ts ,  "Testing Accuracy: ", acc_ts)

  wandb.log({'tr_loss_CE' : ce_loss_tr, 'tr_loss_SE' : se_loss_tr, 'tr_accuracy' : acc_tr, 'val_loss_CE' : ce_loss_ts, 'val_loss_SE' : se_loss_ts, 'val_accuracy' : acc_ts})


def train_model(X_train,Y_train, X_test, Y_test, epoch=1,batch_size=25, num_neurons_hidden = [10], activation_functions = ['sigmoid'], loss_function = 'cross_entropy',
                weights_init_type='random', optimizer = 'sgd', learning_rate = 0.1, opti_beta = [0.5, 0.5], w_d = 0, plot_acc_loss = False):

  """
  Train a fully connected network with mini-batch updates and log metrics to wandb.

  X has shape (number of features, number of samples in train data set)
  Y has shape (number of classes, number of samples in train data set)
  X_test, Y_test = held-out split evaluated before training and after every epoch

  epoch = number of passes over the training data
  batch_size = samples per update; samples beyond the last full batch are not used
  num_neurons_hidden = list of number of neurons at each hidden layer
  activation_functions = activation name per hidden layer; a 'softmax' output layer is appended
  loss_function = 'cross_entropy' or 'squared_error' (used for the gradients)
  weights_init_type = initializer name passed to weights_initialization
  optimizer = 'sgd', 'momentum', 'nag', 'adagrad', 'rmsprop', 'adam' or 'nadam'
  opti_beta = [beta or beta1, beta2]; the second entry is only used by adam / nadam
  w_d = L2 weight decay coefficient passed to back_propagation
  plot_acc_loss = if truthy, plot accuracy and cross entropy loss curves after training

  Raises: ValueError if optimizer is not one of the supported names
  """
  supported_optimizers = ('sgd', 'momentum', 'nag', 'adagrad', 'rmsprop', 'adam', 'nadam')
  if optimizer not in supported_optimizers:
    # an unknown name would otherwise run every epoch without updating any weight
    raise ValueError('optimizer invalid: {!r}'.format(optimizer))

  num_ip_neurons = X_train.shape[0]
  num_op_neurons = Y_train.shape[0]
  num_neurons = [num_ip_neurons] + num_neurons_hidden + [num_op_neurons]
  activation_functions = activation_functions + ['softmax']

  W_s, b_s = weights_initialization(num_neurons, weights_init_type)
  print('Hyper parameters: \n')
  print("Weight initialization type : ", weights_init_type)
  print("Optimizer : ", optimizer)
  print("Learning rate (initial): ", learning_rate)
  print("Batch size: ", batch_size)
  print("-------------------")
  print("Architecture Description:\n")

  for i in range(len(num_neurons)-1):
    print("Layer: ", i+1, " ; number of neurons: ", num_neurons[i+1], " ; activation function: ", activation_functions[i])
    print("Weight matrix dimention", W_s[i].shape, "Bias vector dimention", b_s[i].shape)
    print("----------------")

  num_batches = np.floor(X_train.shape[1]/batch_size)
  print(num_batches)

  # optimizer state, one zero array per parameter array
  if optimizer in ('momentum', 'nag', 'rmsprop', 'adagrad'):
    u_past_w = [x * 0 for x in W_s]
    u_past_b = [x * 0 for x in b_s]
  elif optimizer in ('adam', 'nadam'):
    mw_past = [x * 0 for x in W_s]
    mb_past = [x * 0 for x in b_s]
    vw_past = [x * 0 for x in W_s]
    vb_past = [x * 0 for x in b_s]
    t = 1

  print('\n Start of training')

  # metrics of the freshly initialized network (plotted as epoch 0)
  train_metrics = _evaluate_split(X_train, Y_train, W_s, b_s, activation_functions)
  test_metrics = _evaluate_split(X_test, Y_test, W_s, b_s, activation_functions)
  _report_metrics(train_metrics, test_metrics)

  train_loss = [train_metrics[0]]
  train_acc = [train_metrics[2]]
  val_loss = [test_metrics[0]]
  val_acc = [test_metrics[2]]

  for i in range(epoch):
    print('Epoch: ', i+1)

    for j in tqdm(range(int(num_batches))):
      batch_X = X_train[:,j*batch_size:(j+1)*batch_size]
      batch_Y = Y_train[:,j*batch_size:(j+1)*batch_size]

      if optimizer == 'nag':
        # Nesterov: evaluate the gradient at the look-ahead point. Shallow list
        # copies are passed because look_ahead_nag replaces list entries in
        # place and the real parameters must stay untouched. The momentum update
        # moves the weights by learning_rate*u, so the look-ahead shift is
        # learning_rate*beta*u.
        grad_W, grad_b = look_ahead_nag(list(W_s), list(b_s), u_past_w, u_past_b, learning_rate*opti_beta[0])
      else:
        grad_W, grad_b = W_s, b_s

      ip, op = forward_propagation(batch_X, grad_W, grad_b, activation_functions)
      DWs, Dbs = back_propagation(grad_W, grad_b, batch_Y, ip, op, activation_functions, batch_size, w_d, loss_function)

      if optimizer == 'sgd':
        W_s, b_s = update_weights_gd(W_s, b_s, DWs, Dbs, learning_rate)

      elif optimizer in ('momentum', 'nag'):
        W_s, b_s, u_past_w, u_past_b  = update_weights_momentum(W_s, b_s, DWs, Dbs, u_past_w, u_past_b, learning_rate, opti_beta[0])

      elif optimizer == 'adagrad':
        W_s, b_s, u_past_w, u_past_b  = update_weights_adagrad(W_s, b_s, DWs, Dbs, u_past_w, u_past_b, learning_rate)

      elif optimizer == 'rmsprop':
        W_s, b_s, u_past_w, u_past_b  = update_weights_rmsprop(W_s, b_s, DWs, Dbs, u_past_w, u_past_b, learning_rate, opti_beta[0])

      elif optimizer == 'adam':
        W_s, b_s, mw_past, mb_past, vw_past, vb_past = update_weights_adam(W_s, b_s, DWs, Dbs, mw_past, mb_past, vw_past, vb_past, t, learning_rate, opti_beta[0], opti_beta[1])
        t = t + 1

      elif optimizer == 'nadam':
        W_s, b_s, mw_past, mb_past, vw_past, vb_past = update_weights_nadam(W_s, b_s, DWs, Dbs, mw_past, mb_past, vw_past, vb_past, t, learning_rate, opti_beta[0], opti_beta[1])
        t = t + 1

    # end-of-epoch evaluation on the full train and held-out sets
    train_metrics = _evaluate_split(X_train, Y_train, W_s, b_s, activation_functions)
    test_metrics = _evaluate_split(X_test, Y_test, W_s, b_s, activation_functions)
    _report_metrics(train_metrics, test_metrics)

    train_loss.append(train_metrics[0])
    train_acc.append(train_metrics[2])
    val_loss.append(test_metrics[0])
    val_acc.append(test_metrics[2])

  if plot_acc_loss:
    epochs_axis = np.arange(0, epoch + 1, 1)

    fig, ax = plt.subplots()  # accuracy curves
    ax.plot(epochs_axis, train_acc, color='r', label='training')
    ax.plot(epochs_axis, val_acc, color='g', label='validation')
    ax.set_title("Accuracy")
    ax.legend()
    plt.grid()
    plt.show()

    fig, ax = plt.subplots()  # cross entropy loss curves
    ax.plot(epochs_axis, train_loss, color='r', label='training')
    ax.plot(epochs_axis, val_loss, color='g', label='validation')
    ax.set_title("Loss")
    ax.legend()
    plt.grid()
    plt.show()




In [None]:
# Grid sweep over optimizer, learning rate, weight decay, activation and loss
# function; every other hyper parameter is held at a single value.
sweep_config = {
    'method': 'grid',
    'metric': {
      # must match a key passed to wandb.log() in train_model
      'name': 'val_accuracy',
      'goal': 'maximize'
    },
    'parameters': {
        'epochs': {
            'values': [5]
        },
        'num_layers': {
            'values': [4]
        },
         'hidden_size': {
            'values': [64]
        },
        'weight_decay': {
            'values': [0, 0.0005, 0.5]
        },
         'learning_rate': {
            'values': [0.001, 0.0001]
        },
         'optimizer': {
            'values': ['rmsprop', 'adam', 'nadam']
        },
        'batch_size': {
            'values': [32]
        },
         'weight_init': {
            'values': ['Xavier']
        },
        'activation': {
            'values': ['tanh', 'ReLU']
        },
        'loss_function': {
            'values': ['cross_entropy', 'squared_error']
        },
    }
}

# Register the sweep with W&B; the returned id is what wandb.agent() consumes.
sweep_id = wandb.sweep(sweep = sweep_config, project = 'dl_assgn_1_q_8')

Create sweep with ID: 28yyx0vh
Sweep URL: https://wandb.ai/arunangshudutta218-iitm/dl_assgn_1_q_8/sweeps/28yyx0vh


In [None]:
def main():
  """
  Entry point for one sweep run: read the hyper parameters from wandb.config,
  name the run after them and train on the module-level train / validation data.
  """
  with wandb.init() as run:
    cfg = wandb.config

    epochs = cfg.epochs
    nhl = cfg.num_layers
    sz = cfg.hidden_size
    w_d = cfg.weight_decay
    lr = cfg.learning_rate
    optimizer = cfg.optimizer
    b_sz = cfg.batch_size
    weight_init = cfg.weight_init
    act_fun = cfg.activation
    loss_fun = cfg.loss_function

    # every hidden layer shares the same width and activation
    neuros_num = [sz]*nhl
    act_func = [act_fun]*nhl

    name_template = "e_{}_hl_{}_hs_{}_lr_{}_opt_{}_bs_{}_init_{}_ac_{}_l2_{}"
    wandb.run.name = name_template.format(epochs, nhl, sz, lr, optimizer, b_sz, weight_init, act_fun, w_d)

    train_model(
        X_train, Y_train, X_val, Y_val,
        epoch=epochs,
        batch_size=b_sz,
        num_neurons_hidden=neuros_num,
        activation_functions=act_func,
        loss_function=loss_fun,
        weights_init_type=weight_init,
        optimizer=optimizer,
        learning_rate=lr,
        opti_beta=[0.5, 0.5],
        w_d=w_d)


# Load the data once at module level; main() reads X_train, Y_train, X_val and
# Y_val as globals. X_test / Y_test are not used by the statements below.
X_train, Y_train, X_val, Y_val, X_test, Y_test = load_split_dataset()

# Run main() through the W&B agent for the sweep created above, then close the
# W&B session.
wandb.agent(sweep_id, function = main)
wandb.finish()

[34m[1mwandb[0m: Agent Starting Run: rrwycqoi with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss_function: cross_entropy
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: rmsprop
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  rmsprop
Learning rate (initial):  0.001
Batch size:  32
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 784) Bias vector dimention (64, 1)
----------------
Layer:  2  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  3  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  4  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  5  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 64) Bias vector dimention (10, 1)
----------------
1312.0

 Start of training
CE Training Loss:  2.270430577226135 SE Training Loss:  0.44609708

100%|██████████| 1312/1312 [00:04<00:00, 274.70it/s]


CE Training Loss:  0.47530398975837174 SE Training Loss:  0.12322470019385406 Training Accuracy:  82.95952380952382 CE Test Loss:  0.48625823532585055 SE Test Loss:  0.12592855079643367 Testing Accuracy:  82.43888888888888
Epoch:  2


100%|██████████| 1312/1312 [00:06<00:00, 206.42it/s]


CE Training Loss:  0.40385400231315993 SE Training Loss:  0.10413439567929757 Training Accuracy:  85.5047619047619 CE Test Loss:  0.42618932806884896 SE Test Loss:  0.10852818755662655 Testing Accuracy:  85.03333333333333
Epoch:  3


100%|██████████| 1312/1312 [00:05<00:00, 223.72it/s]


CE Training Loss:  0.37891867796074935 SE Training Loss:  0.09753425879380408 Training Accuracy:  86.38333333333334 CE Test Loss:  0.40992068194981807 SE Test Loss:  0.10354504095333375 Testing Accuracy:  85.7
Epoch:  4


100%|██████████| 1312/1312 [00:05<00:00, 241.79it/s]


CE Training Loss:  0.36321319948151254 SE Training Loss:  0.09339508325728828 Training Accuracy:  87.01904761904763 CE Test Loss:  0.401159297590101 SE Test Loss:  0.100775861701728 Testing Accuracy:  86.21666666666667
Epoch:  5


100%|██████████| 1312/1312 [00:05<00:00, 229.22it/s]


CE Training Loss:  0.34614661497715105 SE Training Loss:  0.08897782400327779 Training Accuracy:  87.6952380952381 CE Test Loss:  0.3939658056776319 SE Test Loss:  0.09855443196515486 Testing Accuracy:  86.60555555555555


0,1
tr_accuracy,▁█████
tr_loss_CE,█▁▁▁▁▁
tr_loss_SE,█▂▁▁▁▁
val_accuracy,▁█████
val_loss_CE,█▁▁▁▁▁
val_loss_SE,█▂▁▁▁▁

0,1
tr_accuracy,87.69524
tr_loss_CE,0.34615
tr_loss_SE,0.08898
val_accuracy,86.60556
val_loss_CE,0.39397
val_loss_SE,0.09855


[34m[1mwandb[0m: Agent Starting Run: ckd3otub with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss_function: cross_entropy
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: rmsprop
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  rmsprop
Learning rate (initial):  0.001
Batch size:  32
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 784) Bias vector dimention (64, 1)
----------------
Layer:  2  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  3  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  4  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  5  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 64) Bias vector dimention (10, 1)
----------------
1312.0

 Start of training
CE Training Loss:  2.3562718052467186 SE Training Loss:  0.4546680

100%|██████████| 1312/1312 [00:07<00:00, 172.63it/s]


CE Training Loss:  0.46385108773768047 SE Training Loss:  0.1195059314264847 Training Accuracy:  83.33095238095238 CE Test Loss:  0.47102494097145925 SE Test Loss:  0.12111717916551738 Testing Accuracy:  83.01666666666667
Epoch:  2


100%|██████████| 1312/1312 [00:04<00:00, 272.27it/s]


CE Training Loss:  0.4161964072735203 SE Training Loss:  0.1070613677880542 Training Accuracy:  85.00714285714285 CE Test Loss:  0.435227932315886 SE Test Loss:  0.11087746586759116 Testing Accuracy:  84.7
Epoch:  3


100%|██████████| 1312/1312 [00:07<00:00, 177.84it/s]


CE Training Loss:  0.386947004267527 SE Training Loss:  0.09977350226099783 Training Accuracy:  86.04047619047618 CE Test Loss:  0.41849099450180055 SE Test Loss:  0.10577896602395775 Testing Accuracy:  85.46111111111111
Epoch:  4


100%|██████████| 1312/1312 [00:04<00:00, 270.25it/s]


CE Training Loss:  0.3615634462964621 SE Training Loss:  0.0929263229211986 Training Accuracy:  87.07857142857142 CE Test Loss:  0.4027355609766588 SE Test Loss:  0.10120760555441126 Testing Accuracy:  86.04444444444445
Epoch:  5


100%|██████████| 1312/1312 [00:07<00:00, 175.84it/s]


CE Training Loss:  0.3488316388494272 SE Training Loss:  0.08962232289058411 Training Accuracy:  87.63095238095238 CE Test Loss:  0.39897979722149746 SE Test Loss:  0.09935173640947831 Testing Accuracy:  86.53888888888889


0,1
tr_accuracy,▁█████
tr_loss_CE,█▁▁▁▁▁
tr_loss_SE,█▂▁▁▁▁
val_accuracy,▁█████
val_loss_CE,█▁▁▁▁▁
val_loss_SE,█▁▁▁▁▁

0,1
tr_accuracy,87.63095
tr_loss_CE,0.34883
tr_loss_SE,0.08962
val_accuracy,86.53889
val_loss_CE,0.39898
val_loss_SE,0.09935


[34m[1mwandb[0m: Agent Starting Run: v3uq7sj3 with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss_function: cross_entropy
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: rmsprop
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  rmsprop
Learning rate (initial):  0.001
Batch size:  32
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 784) Bias vector dimention (64, 1)
----------------
Layer:  2  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  3  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  4  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  5  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 64) Bias vector dimention (10, 1)
----------------
1312.0

 Start of training
CE Training Loss:  2.2525407306586835 SE Training Loss:  0.4476082

100%|██████████| 1312/1312 [00:07<00:00, 180.12it/s]


CE Training Loss:  0.6016020347566335 SE Training Loss:  0.15123784074674046 Training Accuracy:  77.88333333333334 CE Test Loss:  0.6045471494760483 SE Test Loss:  0.15181610761543918 Testing Accuracy:  77.97222222222223
Epoch:  2


100%|██████████| 1312/1312 [00:04<00:00, 266.13it/s]


CE Training Loss:  0.5806935286002788 SE Training Loss:  0.14525673062576097 Training Accuracy:  79.34285714285714 CE Test Loss:  0.5834760495839533 SE Test Loss:  0.14572468584238277 Testing Accuracy:  79.43333333333334
Epoch:  3


100%|██████████| 1312/1312 [00:07<00:00, 170.92it/s]


CE Training Loss:  0.567312923784186 SE Training Loss:  0.14147453073712463 Training Accuracy:  80.23809523809524 CE Test Loss:  0.5705255589051859 SE Test Loss:  0.14211396103270016 Testing Accuracy:  80.24444444444444
Epoch:  4


100%|██████████| 1312/1312 [00:04<00:00, 269.25it/s]


CE Training Loss:  0.5671179612087084 SE Training Loss:  0.14116798452669047 Training Accuracy:  80.34761904761905 CE Test Loss:  0.570373764037257 SE Test Loss:  0.1418913907806816 Testing Accuracy:  80.2611111111111
Epoch:  5


100%|██████████| 1312/1312 [00:07<00:00, 177.61it/s]


CE Training Loss:  0.5580429035644924 SE Training Loss:  0.13831811492357401 Training Accuracy:  80.92142857142858 CE Test Loss:  0.5614491441179466 SE Test Loss:  0.1390243196455635 Testing Accuracy:  80.78888888888889


0,1
tr_accuracy,▁█████
tr_loss_CE,█▁▁▁▁▁
tr_loss_SE,█▁▁▁▁▁
val_accuracy,▁█████
val_loss_CE,█▁▁▁▁▁
val_loss_SE,█▁▁▁▁▁

0,1
tr_accuracy,80.92143
tr_loss_CE,0.55804
tr_loss_SE,0.13832
val_accuracy,80.78889
val_loss_CE,0.56145
val_loss_SE,0.13902


[34m[1mwandb[0m: Agent Starting Run: pf8btf8b with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss_function: cross_entropy
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  adam
Learning rate (initial):  0.001
Batch size:  32
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 784) Bias vector dimention (64, 1)
----------------
Layer:  2  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  3  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  4  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  5  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 64) Bias vector dimention (10, 1)
----------------
1312.0

 Start of training
CE Training Loss:  2.367334189832244 SE Training Loss:  0.45589999552

100%|██████████| 1312/1312 [00:08<00:00, 148.80it/s]


CE Training Loss:  0.4176161101232898 SE Training Loss:  0.10806549225363714 Training Accuracy:  84.88095238095238 CE Test Loss:  0.4322104419155502 SE Test Loss:  0.1104908600586063 Testing Accuracy:  84.63333333333334
Epoch:  2


100%|██████████| 1312/1312 [00:07<00:00, 184.39it/s]


CE Training Loss:  0.3686862624304778 SE Training Loss:  0.09430061793643474 Training Accuracy:  86.91666666666667 CE Test Loss:  0.395734003792096 SE Test Loss:  0.09962478886108446 Testing Accuracy:  86.29444444444445
Epoch:  3


100%|██████████| 1312/1312 [00:05<00:00, 226.97it/s]


CE Training Loss:  0.34267932886897456 SE Training Loss:  0.08830298303677683 Training Accuracy:  87.73095238095237 CE Test Loss:  0.37874463293391336 SE Test Loss:  0.09569626276784314 Testing Accuracy:  86.79444444444445
Epoch:  4


100%|██████████| 1312/1312 [00:08<00:00, 153.11it/s]


CE Training Loss:  0.35230783064238946 SE Training Loss:  0.09007750417532755 Training Accuracy:  87.54047619047618 CE Test Loss:  0.3929233889303621 SE Test Loss:  0.09818482106887785 Testing Accuracy:  86.61111111111111
Epoch:  5


100%|██████████| 1312/1312 [00:05<00:00, 222.78it/s]


CE Training Loss:  0.3399627601062014 SE Training Loss:  0.08730257731866672 Training Accuracy:  87.97142857142858 CE Test Loss:  0.3883433098491288 SE Test Loss:  0.09679102094239764 Testing Accuracy:  86.92222222222222


0,1
tr_accuracy,▁█████
tr_loss_CE,█▁▁▁▁▁
tr_loss_SE,█▁▁▁▁▁
val_accuracy,▁█████
val_loss_CE,█▁▁▁▁▁
val_loss_SE,█▁▁▁▁▁

0,1
tr_accuracy,87.97143
tr_loss_CE,0.33996
tr_loss_SE,0.0873
val_accuracy,86.92222
val_loss_CE,0.38834
val_loss_SE,0.09679


[34m[1mwandb[0m: Agent Starting Run: d1kckih0 with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss_function: cross_entropy
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  adam
Learning rate (initial):  0.001
Batch size:  32
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 784) Bias vector dimention (64, 1)
----------------
Layer:  2  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  3  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  4  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  5  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 64) Bias vector dimention (10, 1)
----------------
1312.0

 Start of training
CE Training Loss:  2.3251098566960775 SE Training Loss:  0.4513323882

100%|██████████| 1312/1312 [00:05<00:00, 224.59it/s]


CE Training Loss:  0.43914641197933 SE Training Loss:  0.1125218593946983 Training Accuracy:  84.16666666666667 CE Test Loss:  0.4483152346677464 SE Test Loss:  0.114240314472195 Testing Accuracy:  83.97777777777777
Epoch:  2


100%|██████████| 1312/1312 [00:08<00:00, 163.02it/s]


CE Training Loss:  0.3733752998517749 SE Training Loss:  0.09606954995563817 Training Accuracy:  86.55714285714286 CE Test Loss:  0.39623260117148074 SE Test Loss:  0.10075517413491711 Testing Accuracy:  85.9888888888889
Epoch:  3


100%|██████████| 1312/1312 [00:07<00:00, 170.82it/s]


CE Training Loss:  0.3455587203850888 SE Training Loss:  0.08897910710367682 Training Accuracy:  87.72380952380952 CE Test Loss:  0.38111008815078323 SE Test Loss:  0.09657316472572558 Testing Accuracy:  86.4888888888889
Epoch:  4


100%|██████████| 1312/1312 [00:05<00:00, 225.26it/s]


CE Training Loss:  0.33249420129466456 SE Training Loss:  0.0857675015138064 Training Accuracy:  88.17380952380952 CE Test Loss:  0.3783190510808772 SE Test Loss:  0.09540823422030477 Testing Accuracy:  86.7
Epoch:  5


100%|██████████| 1312/1312 [00:08<00:00, 157.59it/s]


CE Training Loss:  0.3298679347164885 SE Training Loss:  0.08561538562880915 Training Accuracy:  88.22619047619048 CE Test Loss:  0.3858333413471239 SE Test Loss:  0.09688089183055928 Testing Accuracy:  86.61666666666666


0,1
tr_accuracy,▁█████
tr_loss_CE,█▁▁▁▁▁
tr_loss_SE,█▂▁▁▁▁
val_accuracy,▁█████
val_loss_CE,█▁▁▁▁▁
val_loss_SE,█▁▁▁▁▁

0,1
tr_accuracy,88.22619
tr_loss_CE,0.32987
tr_loss_SE,0.08562
val_accuracy,86.61667
val_loss_CE,0.38583
val_loss_SE,0.09688


[34m[1mwandb[0m: Agent Starting Run: fui0p5uw with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss_function: cross_entropy
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  adam
Learning rate (initial):  0.001
Batch size:  32
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 784) Bias vector dimention (64, 1)
----------------
Layer:  2  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  3  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  4  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  5  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 64) Bias vector dimention (10, 1)
----------------
1312.0

 Start of training
CE Training Loss:  2.3738485648635 SE Training Loss:  0.4597866595143

100%|██████████| 1312/1312 [00:07<00:00, 169.91it/s]


CE Training Loss:  0.5841737202852851 SE Training Loss:  0.14214271988182262 Training Accuracy:  80.17619047619047 CE Test Loss:  0.5858672987264371 SE Test Loss:  0.14241802958341096 Testing Accuracy:  80.41111111111111
Epoch:  2


100%|██████████| 1312/1312 [00:07<00:00, 178.97it/s]


CE Training Loss:  0.5700971885332943 SE Training Loss:  0.13810698236864113 Training Accuracy:  80.9952380952381 CE Test Loss:  0.5725054297127723 SE Test Loss:  0.13851504780359325 Testing Accuracy:  81.06111111111112
Epoch:  3


100%|██████████| 1312/1312 [00:05<00:00, 223.65it/s]


CE Training Loss:  0.5648232748806461 SE Training Loss:  0.13628467995876617 Training Accuracy:  81.20476190476191 CE Test Loss:  0.5669639009823606 SE Test Loss:  0.13662981881060812 Testing Accuracy:  81.32222222222222
Epoch:  4


100%|██████████| 1312/1312 [00:08<00:00, 156.59it/s]


CE Training Loss:  0.5587295807375058 SE Training Loss:  0.1346023344744257 Training Accuracy:  81.52142857142857 CE Test Loss:  0.5613488258434622 SE Test Loss:  0.13504063342199393 Testing Accuracy:  81.58333333333333
Epoch:  5


100%|██████████| 1312/1312 [00:05<00:00, 225.41it/s]


CE Training Loss:  0.5584359351578857 SE Training Loss:  0.13434772231952596 Training Accuracy:  81.53333333333333 CE Test Loss:  0.5613768637837748 SE Test Loss:  0.13488703252332787 Testing Accuracy:  81.55555555555556


0,1
tr_accuracy,▁█████
tr_loss_CE,█▁▁▁▁▁
tr_loss_SE,█▁▁▁▁▁
val_accuracy,▁█████
val_loss_CE,█▁▁▁▁▁
val_loss_SE,█▁▁▁▁▁

0,1
tr_accuracy,81.53333
tr_loss_CE,0.55844
tr_loss_SE,0.13435
val_accuracy,81.55556
val_loss_CE,0.56138
val_loss_SE,0.13489


[34m[1mwandb[0m: Agent Starting Run: 6fmjsr1n with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss_function: cross_entropy
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  nadam
Learning rate (initial):  0.001
Batch size:  32
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 784) Bias vector dimention (64, 1)
----------------
Layer:  2  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  3  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  4  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  5  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 64) Bias vector dimention (10, 1)
----------------
1312.0

 Start of training
CE Training Loss:  2.4195202899101673 SE Training Loss:  0.469767235

100%|██████████| 1312/1312 [00:07<00:00, 177.65it/s]


CE Training Loss:  0.42866095755099837 SE Training Loss:  0.11095642960333445 Training Accuracy:  84.40714285714286 CE Test Loss:  0.43929940028983316 SE Test Loss:  0.11302366101080065 Testing Accuracy:  84.16111111111111
Epoch:  2


100%|██████████| 1312/1312 [00:06<00:00, 207.89it/s]


CE Training Loss:  0.3812920908452503 SE Training Loss:  0.09895473270958276 Training Accuracy:  86.17619047619047 CE Test Loss:  0.4050116002553513 SE Test Loss:  0.10397669603790238 Testing Accuracy:  85.53888888888889
Epoch:  3


100%|██████████| 1312/1312 [00:08<00:00, 153.53it/s]


CE Training Loss:  0.35918918368666153 SE Training Loss:  0.09325580500966482 Training Accuracy:  87.08333333333333 CE Test Loss:  0.3922691750341287 SE Test Loss:  0.09994918228382746 Testing Accuracy:  86.09444444444445
Epoch:  4


100%|██████████| 1312/1312 [00:06<00:00, 211.55it/s]


CE Training Loss:  0.3451560799300643 SE Training Loss:  0.08928090874231115 Training Accuracy:  87.6452380952381 CE Test Loss:  0.3852665044972165 SE Test Loss:  0.097513515181984 Testing Accuracy:  86.63888888888889
Epoch:  5


100%|██████████| 1312/1312 [00:07<00:00, 177.89it/s]


CE Training Loss:  0.33362262490353684 SE Training Loss:  0.08613746782921755 Training Accuracy:  88.18333333333334 CE Test Loss:  0.38219062119115127 SE Test Loss:  0.09611788509186285 Testing Accuracy:  86.92222222222222


0,1
tr_accuracy,▁█████
tr_loss_CE,█▁▁▁▁▁
tr_loss_SE,█▁▁▁▁▁
val_accuracy,▁█████
val_loss_CE,█▁▁▁▁▁
val_loss_SE,█▁▁▁▁▁

0,1
tr_accuracy,88.18333
tr_loss_CE,0.33362
tr_loss_SE,0.08614
val_accuracy,86.92222
val_loss_CE,0.38219
val_loss_SE,0.09612


[34m[1mwandb[0m: Agent Starting Run: ax9r1gak with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss_function: cross_entropy
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  nadam
Learning rate (initial):  0.001
Batch size:  32
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 784) Bias vector dimention (64, 1)
----------------
Layer:  2  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  3  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  4  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  5  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 64) Bias vector dimention (10, 1)
----------------
1312.0

 Start of training
CE Training Loss:  2.3412849579002635 SE Training Loss:  0.454074412

100%|██████████| 1312/1312 [00:06<00:00, 199.90it/s]


CE Training Loss:  0.42318330700551166 SE Training Loss:  0.10887378681483793 Training Accuracy:  84.77380952380952 CE Test Loss:  0.4359565746054125 SE Test Loss:  0.11165430766581523 Testing Accuracy:  84.5
Epoch:  2


100%|██████████| 1312/1312 [00:09<00:00, 143.36it/s]


CE Training Loss:  0.3829622797876951 SE Training Loss:  0.09858379758315977 Training Accuracy:  86.10952380952381 CE Test Loss:  0.40804430825206156 SE Test Loss:  0.1039415772708531 Testing Accuracy:  85.38333333333334
Epoch:  3


100%|██████████| 1312/1312 [00:06<00:00, 210.37it/s]


CE Training Loss:  0.3563074595082384 SE Training Loss:  0.09190549592395451 Training Accuracy:  87.11190476190477 CE Test Loss:  0.39323056492387665 SE Test Loss:  0.09940545904699932 Testing Accuracy:  86.08888888888889
Epoch:  4


100%|██████████| 1312/1312 [00:07<00:00, 175.71it/s]


CE Training Loss:  0.33916581847876537 SE Training Loss:  0.08758169982467556 Training Accuracy:  87.72619047619048 CE Test Loss:  0.38314512009420126 SE Test Loss:  0.09632806882758388 Testing Accuracy:  86.60555555555555
Epoch:  5


100%|██████████| 1312/1312 [00:08<00:00, 149.42it/s]


CE Training Loss:  0.3296421633197618 SE Training Loss:  0.08466731740643876 Training Accuracy:  88.25 CE Test Loss:  0.383326449134606 SE Test Loss:  0.095292391065642 Testing Accuracy:  86.79444444444445


0,1
tr_accuracy,▁█████
tr_loss_CE,█▁▁▁▁▁
tr_loss_SE,█▁▁▁▁▁
val_accuracy,▁█████
val_loss_CE,█▁▁▁▁▁
val_loss_SE,█▁▁▁▁▁

0,1
tr_accuracy,88.25
tr_loss_CE,0.32964
tr_loss_SE,0.08467
val_accuracy,86.79444
val_loss_CE,0.38333
val_loss_SE,0.09529


[34m[1mwandb[0m: Agent Starting Run: 9hy8ey08 with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss_function: cross_entropy
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  nadam
Learning rate (initial):  0.001
Batch size:  32
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 784) Bias vector dimention (64, 1)
----------------
Layer:  2  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  3  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  4  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  5  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 64) Bias vector dimention (10, 1)
----------------
1312.0

 Start of training
CE Training Loss:  2.27648483406902 SE Training Loss:  0.44742538534

100%|██████████| 1312/1312 [00:08<00:00, 148.72it/s]


CE Training Loss:  0.6506255382545798 SE Training Loss:  0.157803774384925 Training Accuracy:  77.33333333333333 CE Test Loss:  0.6528720846272498 SE Test Loss:  0.158162706031426 Testing Accuracy:  77.38888888888889
Epoch:  2


100%|██████████| 1312/1312 [00:08<00:00, 156.62it/s]


CE Training Loss:  0.6052791266322683 SE Training Loss:  0.1463809403788681 Training Accuracy:  79.51428571428572 CE Test Loss:  0.6081792398408884 SE Test Loss:  0.14698727347368507 Testing Accuracy:  79.5
Epoch:  3


100%|██████████| 1312/1312 [00:06<00:00, 196.53it/s]


CE Training Loss:  0.581083451122601 SE Training Loss:  0.1402091364405518 Training Accuracy:  80.67380952380952 CE Test Loss:  0.5841284796728144 SE Test Loss:  0.14089000170874674 Testing Accuracy:  80.51666666666667
Epoch:  4


100%|██████████| 1312/1312 [00:08<00:00, 149.33it/s]


CE Training Loss:  0.57001835514999 SE Training Loss:  0.137307639311927 Training Accuracy:  81.21190476190476 CE Test Loss:  0.57318853901708 SE Test Loss:  0.13804358097237562 Testing Accuracy:  81.1
Epoch:  5


100%|██████████| 1312/1312 [00:08<00:00, 160.60it/s]


CE Training Loss:  0.5631661004383862 SE Training Loss:  0.13541119886682337 Training Accuracy:  81.53809523809524 CE Test Loss:  0.5664637627260832 SE Test Loss:  0.13619137384947697 Testing Accuracy:  81.28333333333333


0,1
tr_accuracy,▁█████
tr_loss_CE,█▁▁▁▁▁
tr_loss_SE,█▂▁▁▁▁
val_accuracy,▁█████
val_loss_CE,█▁▁▁▁▁
val_loss_SE,█▁▁▁▁▁

0,1
tr_accuracy,81.5381
tr_loss_CE,0.56317
tr_loss_SE,0.13541
val_accuracy,81.28333
val_loss_CE,0.56646
val_loss_SE,0.13619


[34m[1mwandb[0m: Agent Starting Run: 3eoornkw with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss_function: squared_error
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: rmsprop
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  rmsprop
Learning rate (initial):  0.001
Batch size:  32
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 784) Bias vector dimention (64, 1)
----------------
Layer:  2  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  3  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  4  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  5  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 64) Bias vector dimention (10, 1)
----------------
1312.0

 Start of training
CE Training Loss:  2.3349029432090656 SE Training Loss:  0.4550832

100%|██████████| 1312/1312 [00:06<00:00, 195.97it/s]


CE Training Loss:  0.44321737777533954 SE Training Loss:  0.11062457418233845 Training Accuracy:  84.58571428571429 CE Test Loss:  0.45491920560916954 SE Test Loss:  0.1137633253026493 Testing Accuracy:  84.16666666666667
Epoch:  2


100%|██████████| 1312/1312 [00:05<00:00, 258.56it/s]


CE Training Loss:  0.39055974074232586 SE Training Loss:  0.09784168046980052 Training Accuracy:  86.45 CE Test Loss:  0.4079344734348397 SE Test Loss:  0.10257743347704656 Testing Accuracy:  85.80555555555556
Epoch:  3


100%|██████████| 1312/1312 [00:07<00:00, 173.86it/s]


CE Training Loss:  0.36615631807955173 SE Training Loss:  0.0909673943132271 Training Accuracy:  87.4452380952381 CE Test Loss:  0.3902840948238906 SE Test Loss:  0.0971093990207087 Testing Accuracy:  86.47222222222223
Epoch:  4


100%|██████████| 1312/1312 [00:05<00:00, 248.68it/s]


CE Training Loss:  0.35592677582978755 SE Training Loss:  0.08850366089177165 Training Accuracy:  87.87619047619047 CE Test Loss:  0.38625370184453545 SE Test Loss:  0.09581306179899875 Testing Accuracy:  86.83333333333333
Epoch:  5


100%|██████████| 1312/1312 [00:10<00:00, 123.76it/s]


CE Training Loss:  0.33704719787932974 SE Training Loss:  0.08356537508245919 Training Accuracy:  88.4047619047619 CE Test Loss:  0.3705100739905934 SE Test Loss:  0.0919112508962775 Testing Accuracy:  87.40555555555555


0,1
tr_accuracy,▁█████
tr_loss_CE,█▁▁▁▁▁
tr_loss_SE,█▂▁▁▁▁
val_accuracy,▁█████
val_loss_CE,█▁▁▁▁▁
val_loss_SE,█▁▁▁▁▁

0,1
tr_accuracy,88.40476
tr_loss_CE,0.33705
tr_loss_SE,0.08357
val_accuracy,87.40556
val_loss_CE,0.37051
val_loss_SE,0.09191


[34m[1mwandb[0m: Agent Starting Run: hmjplqjj with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss_function: squared_error
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: rmsprop
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  rmsprop
Learning rate (initial):  0.001
Batch size:  32
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 784) Bias vector dimention (64, 1)
----------------
Layer:  2  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  3  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  4  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  5  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 64) Bias vector dimention (10, 1)
----------------
1312.0

 Start of training
CE Training Loss:  2.465625241380099 SE Training Loss:  0.46266700

100%|██████████| 1312/1312 [00:07<00:00, 186.92it/s]


CE Training Loss:  0.4214778892967141 SE Training Loss:  0.10586004658919555 Training Accuracy:  85.25238095238095 CE Test Loss:  0.4297478177412615 SE Test Loss:  0.10832011460944363 Testing Accuracy:  85.02777777777777
Epoch:  2


100%|██████████| 1312/1312 [00:06<00:00, 192.51it/s]


CE Training Loss:  0.3907902208370213 SE Training Loss:  0.09840313960674008 Training Accuracy:  86.3 CE Test Loss:  0.4043211251813576 SE Test Loss:  0.10160418015001829 Testing Accuracy:  85.8
Epoch:  3


100%|██████████| 1312/1312 [00:05<00:00, 257.79it/s]


CE Training Loss:  0.3666242449335216 SE Training Loss:  0.09186791265107863 Training Accuracy:  87.2547619047619 CE Test Loss:  0.38999409004453883 SE Test Loss:  0.0976552900915566 Testing Accuracy:  86.42777777777778
Epoch:  4


100%|██████████| 1312/1312 [00:07<00:00, 169.90it/s]


CE Training Loss:  0.3682559258245019 SE Training Loss:  0.09225352617479854 Training Accuracy:  87.22619047619048 CE Test Loss:  0.39469767242385934 SE Test Loss:  0.09849281658775312 Testing Accuracy:  86.2611111111111
Epoch:  5


100%|██████████| 1312/1312 [00:05<00:00, 252.52it/s]


CE Training Loss:  0.35417355402913303 SE Training Loss:  0.08838225379612806 Training Accuracy:  87.7547619047619 CE Test Loss:  0.3882117441062125 SE Test Loss:  0.09667557902987614 Testing Accuracy:  86.57222222222222


0,1
tr_accuracy,▁█████
tr_loss_CE,█▁▁▁▁▁
tr_loss_SE,█▁▁▁▁▁
val_accuracy,▁█████
val_loss_CE,█▁▁▁▁▁
val_loss_SE,█▁▁▁▁▁

0,1
tr_accuracy,87.75476
tr_loss_CE,0.35417
tr_loss_SE,0.08838
val_accuracy,86.57222
val_loss_CE,0.38821
val_loss_SE,0.09668


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: phd0gtxf with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss_function: squared_error
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: rmsprop
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  rmsprop
Learning rate (initial):  0.001
Batch size:  32
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 784) Bias vector dimention (64, 1)
----------------
Layer:  2  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  3  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  4  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  5  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 64) Bias vector dimention (10, 1)
----------------
1312.0

 Start of training
CE Training Loss:  2.4093022488780114 SE Training Loss:  0.4607781

100%|██████████| 1312/1312 [00:06<00:00, 208.59it/s]


CE Training Loss:  0.990409021794853 SE Training Loss:  0.24170831420263053 Training Accuracy:  68.0952380952381 CE Test Loss:  0.9932854850235987 SE Test Loss:  0.24250353928399873 Testing Accuracy:  68.0
Epoch:  2


100%|██████████| 1312/1312 [00:07<00:00, 174.53it/s]


CE Training Loss:  1.017200428067201 SE Training Loss:  0.24690583392067894 Training Accuracy:  66.14285714285714 CE Test Loss:  1.0187898494872785 SE Test Loss:  0.24734347837124496 Testing Accuracy:  65.93333333333334
Epoch:  3


100%|██████████| 1312/1312 [00:05<00:00, 247.70it/s]


CE Training Loss:  1.0334892689264372 SE Training Loss:  0.2501073489638408 Training Accuracy:  64.93095238095238 CE Test Loss:  1.0344422501767983 SE Test Loss:  0.2504087305316545 Testing Accuracy:  64.89444444444445
Epoch:  4


100%|██████████| 1312/1312 [00:07<00:00, 171.18it/s]


CE Training Loss:  1.0412631392524845 SE Training Loss:  0.25144084128133204 Training Accuracy:  64.6952380952381 CE Test Loss:  1.041907744450334 SE Test Loss:  0.2516533008143025 Testing Accuracy:  64.84444444444445
Epoch:  5


100%|██████████| 1312/1312 [00:05<00:00, 250.24it/s]


CE Training Loss:  1.0511720661402113 SE Training Loss:  0.25353248569219683 Training Accuracy:  64.39285714285714 CE Test Loss:  1.0516055801405133 SE Test Loss:  0.25366786869811797 Testing Accuracy:  64.43333333333334


0,1
tr_accuracy,▁█████
tr_loss_CE,█▁▁▁▁▁
tr_loss_SE,█▁▁▁▁▁
val_accuracy,▁█████
val_loss_CE,█▁▁▁▁▁
val_loss_SE,█▁▁▁▁▁

0,1
tr_accuracy,64.39286
tr_loss_CE,1.05117
tr_loss_SE,0.25353
val_accuracy,64.43333
val_loss_CE,1.05161
val_loss_SE,0.25367


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: zrnst5z7 with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss_function: squared_error
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  adam
Learning rate (initial):  0.001
Batch size:  32
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 784) Bias vector dimention (64, 1)
----------------
Layer:  2  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  3  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  4  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  5  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 64) Bias vector dimention (10, 1)
----------------
1312.0

 Start of training
CE Training Loss:  2.377075392330591 SE Training Loss:  0.45724376304

100%|██████████| 1312/1312 [00:06<00:00, 194.16it/s]


CE Training Loss:  0.4267659075934204 SE Training Loss:  0.10635756664086406 Training Accuracy:  85.22142857142858 CE Test Loss:  0.4385942195380185 SE Test Loss:  0.10941814128675051 Testing Accuracy:  84.96666666666667
Epoch:  2


100%|██████████| 1312/1312 [00:08<00:00, 146.45it/s]


CE Training Loss:  0.3773534087243217 SE Training Loss:  0.0936612835173643 Training Accuracy:  87.03095238095239 CE Test Loss:  0.39933445098117853 SE Test Loss:  0.0991036711785086 Testing Accuracy:  86.3
Epoch:  3


100%|██████████| 1312/1312 [00:06<00:00, 217.09it/s]


CE Training Loss:  0.3612202523317352 SE Training Loss:  0.08991925953676731 Training Accuracy:  87.50952380952381 CE Test Loss:  0.3919920177553151 SE Test Loss:  0.09728719628072371 Testing Accuracy:  86.7
Epoch:  4


100%|██████████| 1312/1312 [00:08<00:00, 156.94it/s]


CE Training Loss:  0.34000958403675435 SE Training Loss:  0.08429187547560572 Training Accuracy:  88.4 CE Test Loss:  0.3767202337119373 SE Test Loss:  0.09292689434786622 Testing Accuracy:  87.33333333333333
Epoch:  5


100%|██████████| 1312/1312 [00:07<00:00, 168.57it/s]


CE Training Loss:  0.33820287754385514 SE Training Loss:  0.08328872108554423 Training Accuracy:  88.62142857142857 CE Test Loss:  0.3821822963804947 SE Test Loss:  0.09392259516694107 Testing Accuracy:  87.18333333333334


0,1
tr_accuracy,▁█████
tr_loss_CE,█▁▁▁▁▁
tr_loss_SE,█▁▁▁▁▁
val_accuracy,▁█████
val_loss_CE,█▁▁▁▁▁
val_loss_SE,█▁▁▁▁▁

0,1
tr_accuracy,88.62143
tr_loss_CE,0.3382
tr_loss_SE,0.08329
val_accuracy,87.18333
val_loss_CE,0.38218
val_loss_SE,0.09392


[34m[1mwandb[0m: Agent Starting Run: 996hzv1q with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss_function: squared_error
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  adam
Learning rate (initial):  0.001
Batch size:  32
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 784) Bias vector dimention (64, 1)
----------------
Layer:  2  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  3  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  4  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  5  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 64) Bias vector dimention (10, 1)
----------------
1312.0

 Start of training
CE Training Loss:  2.3513896369326073 SE Training Loss:  0.4556170940

100%|██████████| 1312/1312 [00:08<00:00, 153.11it/s]


CE Training Loss:  0.4388318450659608 SE Training Loss:  0.11072897580728336 Training Accuracy:  84.66904761904762 CE Test Loss:  0.4484429830761597 SE Test Loss:  0.1132645074564319 Testing Accuracy:  84.5
Epoch:  2


100%|██████████| 1312/1312 [00:06<00:00, 210.93it/s]


CE Training Loss:  0.3913542003645009 SE Training Loss:  0.09794054118030443 Training Accuracy:  86.37857142857143 CE Test Loss:  0.40820290549730204 SE Test Loss:  0.10234196589132748 Testing Accuracy:  85.86111111111111
Epoch:  3


100%|██████████| 1312/1312 [00:07<00:00, 178.93it/s]


CE Training Loss:  0.37326984030591825 SE Training Loss:  0.09246729101434788 Training Accuracy:  87.14285714285714 CE Test Loss:  0.39722672355770716 SE Test Loss:  0.09843790092101931 Testing Accuracy:  86.38333333333334
Epoch:  4


100%|██████████| 1312/1312 [00:08<00:00, 155.28it/s]


CE Training Loss:  0.35546209153680447 SE Training Loss:  0.08810412954707376 Training Accuracy:  87.88571428571429 CE Test Loss:  0.38755935568216743 SE Test Loss:  0.09592079769641866 Testing Accuracy:  86.73333333333333
Epoch:  5


100%|██████████| 1312/1312 [00:06<00:00, 216.57it/s]


CE Training Loss:  0.3566202766689391 SE Training Loss:  0.08791203676525154 Training Accuracy:  87.82857142857142 CE Test Loss:  0.39389231284610166 SE Test Loss:  0.09700532940836598 Testing Accuracy:  86.73333333333333


0,1
tr_accuracy,▁█████
tr_loss_CE,█▁▁▁▁▁
tr_loss_SE,█▁▁▁▁▁
val_accuracy,▁█████
val_loss_CE,█▁▁▁▁▁
val_loss_SE,█▁▁▁▁▁

0,1
tr_accuracy,87.82857
tr_loss_CE,0.35662
tr_loss_SE,0.08791
val_accuracy,86.73333
val_loss_CE,0.39389
val_loss_SE,0.09701


[34m[1mwandb[0m: Agent Starting Run: s6z7eetj with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss_function: squared_error
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  adam
Learning rate (initial):  0.001
Batch size:  32
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 784) Bias vector dimention (64, 1)
----------------
Layer:  2  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  3  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  4  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  5  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 64) Bias vector dimention (10, 1)
----------------
1312.0

 Start of training
CE Training Loss:  2.362133598778783 SE Training Loss:  0.45763536021

100%|██████████| 1312/1312 [00:06<00:00, 204.00it/s]


CE Training Loss:  1.0061749292119373 SE Training Loss:  0.24657241162527685 Training Accuracy:  66.98571428571428 CE Test Loss:  1.0071109318691465 SE Test Loss:  0.2471957021386424 Testing Accuracy:  66.88888888888889
Epoch:  2


100%|██████████| 1312/1312 [00:08<00:00, 163.16it/s]


CE Training Loss:  1.0276867617288108 SE Training Loss:  0.24970070399210803 Training Accuracy:  65.68333333333334 CE Test Loss:  1.0276964880115447 SE Test Loss:  0.2500846941955546 Testing Accuracy:  65.60555555555555
Epoch:  3


100%|██████████| 1312/1312 [00:08<00:00, 154.11it/s]


CE Training Loss:  1.1632405178237746 SE Training Loss:  0.2801697056091577 Training Accuracy:  55.57142857142857 CE Test Loss:  1.1617673539342095 SE Test Loss:  0.280034098396943 Testing Accuracy:  55.46111111111111
Epoch:  4


100%|██████████| 1312/1312 [00:06<00:00, 211.70it/s]


CE Training Loss:  1.1682950522545874 SE Training Loss:  0.2809834190655749 Training Accuracy:  55.72142857142857 CE Test Loss:  1.1666807990735628 SE Test Loss:  0.2807938088884057 Testing Accuracy:  55.61666666666667
Epoch:  5


100%|██████████| 1312/1312 [00:08<00:00, 149.15it/s]


CE Training Loss:  1.1698860481506697 SE Training Loss:  0.28105038539738 Training Accuracy:  55.65238095238095 CE Test Loss:  1.1682922401361266 SE Test Loss:  0.2808702261782303 Testing Accuracy:  55.56111111111111


0,1
tr_accuracy,▁██▇▇▇
tr_loss_CE,█▁▁▂▂▂
tr_loss_SE,█▁▁▂▂▂
val_accuracy,▁██▇▇▇
val_loss_CE,█▁▁▂▂▂
val_loss_SE,█▁▁▂▂▂

0,1
tr_accuracy,55.65238
tr_loss_CE,1.16989
tr_loss_SE,0.28105
val_accuracy,55.56111
val_loss_CE,1.16829
val_loss_SE,0.28087


[34m[1mwandb[0m: Agent Starting Run: etxbvniq with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss_function: squared_error
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  nadam
Learning rate (initial):  0.001
Batch size:  32
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 784) Bias vector dimention (64, 1)
----------------
Layer:  2  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  3  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  4  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  5  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 64) Bias vector dimention (10, 1)
----------------
1312.0

 Start of training
CE Training Loss:  2.329018458181603 SE Training Loss:  0.4514838050

100%|██████████| 1312/1312 [00:08<00:00, 146.53it/s]


CE Training Loss:  0.43221679731145085 SE Training Loss:  0.10766294277033031 Training Accuracy:  84.95952380952382 CE Test Loss:  0.4409688648567716 SE Test Loss:  0.11016685885825694 Testing Accuracy:  84.77777777777777
Epoch:  2


100%|██████████| 1312/1312 [00:08<00:00, 153.92it/s]


CE Training Loss:  0.39403649224655823 SE Training Loss:  0.0984715501209457 Training Accuracy:  86.28809523809524 CE Test Loss:  0.4123743184978772 SE Test Loss:  0.10318729232544045 Testing Accuracy:  85.55555555555556
Epoch:  3


100%|██████████| 1312/1312 [00:06<00:00, 209.36it/s]


CE Training Loss:  0.36652621744230784 SE Training Loss:  0.09170689648567473 Training Accuracy:  87.21428571428571 CE Test Loss:  0.39194496690335456 SE Test Loss:  0.09841712304676609 Testing Accuracy:  86.38888888888889
Epoch:  4


100%|██████████| 1312/1312 [00:09<00:00, 144.82it/s]


CE Training Loss:  0.3571742667449823 SE Training Loss:  0.08849050136654636 Training Accuracy:  87.74761904761905 CE Test Loss:  0.390789523314693 SE Test Loss:  0.096964409267209 Testing Accuracy:  86.63888888888889
Epoch:  5


100%|██████████| 1312/1312 [00:07<00:00, 175.60it/s]


CE Training Loss:  0.3390376070590004 SE Training Loss:  0.08416132152719234 Training Accuracy:  88.37142857142857 CE Test Loss:  0.37859842917577685 SE Test Loss:  0.09383149979655123 Testing Accuracy:  87.2


0,1
tr_accuracy,▁█████
tr_loss_CE,█▁▁▁▁▁
tr_loss_SE,█▁▁▁▁▁
val_accuracy,▁█████
val_loss_CE,█▁▁▁▁▁
val_loss_SE,█▁▁▁▁▁

0,1
tr_accuracy,88.37143
tr_loss_CE,0.33904
tr_loss_SE,0.08416
val_accuracy,87.2
val_loss_CE,0.3786
val_loss_SE,0.09383


[34m[1mwandb[0m: Agent Starting Run: yyc39dl6 with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss_function: squared_error
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  nadam
Learning rate (initial):  0.001
Batch size:  32
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 784) Bias vector dimention (64, 1)
----------------
Layer:  2  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  3  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  4  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  5  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 64) Bias vector dimention (10, 1)
----------------
1312.0

 Start of training
CE Training Loss:  2.4137351943015926 SE Training Loss:  0.460910705

100%|██████████| 1312/1312 [00:08<00:00, 152.58it/s]


CE Training Loss:  0.4314926213032638 SE Training Loss:  0.10758475502564195 Training Accuracy:  85.03571428571429 CE Test Loss:  0.4384222220196298 SE Test Loss:  0.10974966995995522 Testing Accuracy:  84.75555555555556
Epoch:  2


100%|██████████| 1312/1312 [00:06<00:00, 202.30it/s]


CE Training Loss:  0.390372839783606 SE Training Loss:  0.09737630312579361 Training Accuracy:  86.39761904761905 CE Test Loss:  0.40669724857468775 SE Test Loss:  0.10174686645407444 Testing Accuracy:  85.85555555555555
Epoch:  3


100%|██████████| 1312/1312 [00:08<00:00, 147.28it/s]


CE Training Loss:  0.3813760481008846 SE Training Loss:  0.09543531469932341 Training Accuracy:  86.55714285714286 CE Test Loss:  0.40427847471282496 SE Test Loss:  0.10123075856080814 Testing Accuracy:  85.87222222222222
Epoch:  4


100%|██████████| 1312/1312 [00:08<00:00, 151.67it/s]


CE Training Loss:  0.37489475550252654 SE Training Loss:  0.09328436860042262 Training Accuracy:  86.91190476190476 CE Test Loss:  0.4024269926864682 SE Test Loss:  0.09984374601221885 Testing Accuracy:  86.08888888888889
Epoch:  5


100%|██████████| 1312/1312 [00:06<00:00, 200.78it/s]


CE Training Loss:  0.35759876770831256 SE Training Loss:  0.08834327244876236 Training Accuracy:  87.78333333333333 CE Test Loss:  0.38857513221353424 SE Test Loss:  0.0961813962773384 Testing Accuracy:  86.58888888888889


0,1
tr_accuracy,▁█████
tr_loss_CE,█▁▁▁▁▁
tr_loss_SE,█▁▁▁▁▁
val_accuracy,▁█████
val_loss_CE,█▁▁▁▁▁
val_loss_SE,█▁▁▁▁▁

0,1
tr_accuracy,87.78333
tr_loss_CE,0.3576
tr_loss_SE,0.08834
val_accuracy,86.58889
val_loss_CE,0.38858
val_loss_SE,0.09618


[34m[1mwandb[0m: Agent Starting Run: 2yhc7y0l with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss_function: squared_error
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  nadam
Learning rate (initial):  0.001
Batch size:  32
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 784) Bias vector dimention (64, 1)
----------------
Layer:  2  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  3  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  4  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  5  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 64) Bias vector dimention (10, 1)
----------------
1312.0

 Start of training
CE Training Loss:  2.440388246947826 SE Training Loss:  0.4633101481

100%|██████████| 1312/1312 [00:08<00:00, 159.84it/s]


CE Training Loss:  1.0022822152799296 SE Training Loss:  0.24331526011285454 Training Accuracy:  66.62142857142857 CE Test Loss:  1.0025846056677954 SE Test Loss:  0.24347227431044954 Testing Accuracy:  66.66666666666667
Epoch:  2


100%|██████████| 1312/1312 [00:06<00:00, 198.94it/s]


CE Training Loss:  1.0085489708844269 SE Training Loss:  0.24290073705486104 Training Accuracy:  66.3952380952381 CE Test Loss:  1.0080126938210268 SE Test Loss:  0.24287110645392088 Testing Accuracy:  66.6
Epoch:  3


100%|██████████| 1312/1312 [00:09<00:00, 137.96it/s]


CE Training Loss:  1.020916462656238 SE Training Loss:  0.24512639108943646 Training Accuracy:  65.81190476190476 CE Test Loss:  1.0202128607560217 SE Test Loss:  0.24506768043369104 Testing Accuracy:  65.90555555555555
Epoch:  4


100%|██████████| 1312/1312 [00:07<00:00, 178.73it/s]


CE Training Loss:  1.0296065600318571 SE Training Loss:  0.2464741150431527 Training Accuracy:  65.45476190476191 CE Test Loss:  1.0287864076200335 SE Test Loss:  0.24639469083962276 Testing Accuracy:  65.66666666666667
Epoch:  5


100%|██████████| 1312/1312 [00:06<00:00, 199.24it/s]


CE Training Loss:  1.0358373358367499 SE Training Loss:  0.24738767210766885 Training Accuracy:  65.48571428571428 CE Test Loss:  1.035001286682921 SE Test Loss:  0.24731173788296806 Testing Accuracy:  65.66666666666667


0,1
tr_accuracy,▁█████
tr_loss_CE,█▁▁▁▁▁
tr_loss_SE,█▁▁▁▁▁
val_accuracy,▁█████
val_loss_CE,█▁▁▁▁▁
val_loss_SE,█▁▁▁▁▁

0,1
tr_accuracy,65.48571
tr_loss_CE,1.03584
tr_loss_SE,0.24739
val_accuracy,65.66667
val_loss_CE,1.035
val_loss_SE,0.24731


[34m[1mwandb[0m: Agent Starting Run: qrbolcnp with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss_function: cross_entropy
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: rmsprop
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  rmsprop
Learning rate (initial):  0.0001
Batch size:  32
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 784) Bias vector dimention (64, 1)
----------------
Layer:  2  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  3  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  4  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  5  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 64) Bias vector dimention (10, 1)
----------------
1312.0

 Start of training
CE Training Loss:  2.3845621082086033 SE Training Loss:  0.460027

100%|██████████| 1312/1312 [00:05<00:00, 249.51it/s]


CE Training Loss:  0.5180661552632856 SE Training Loss:  0.1282292645640433 Training Accuracy:  82.09285714285714 CE Test Loss:  0.5223526667252019 SE Test Loss:  0.12934865360009332 Testing Accuracy:  82.11666666666666
Epoch:  2


100%|██████████| 1312/1312 [00:06<00:00, 195.88it/s]


CE Training Loss:  0.44805058172585266 SE Training Loss:  0.11287320390082153 Training Accuracy:  84.00714285714285 CE Test Loss:  0.45394919717278437 SE Test Loss:  0.11406892552144642 Testing Accuracy:  84.2
Epoch:  3


100%|██████████| 1312/1312 [00:05<00:00, 223.60it/s]


CE Training Loss:  0.41921609511110175 SE Training Loss:  0.10616682644420225 Training Accuracy:  84.98333333333333 CE Test Loss:  0.42812351144846417 SE Test Loss:  0.10800097842350291 Testing Accuracy:  84.94444444444444
Epoch:  4


100%|██████████| 1312/1312 [00:05<00:00, 226.20it/s]


CE Training Loss:  0.4011338244636579 SE Training Loss:  0.10186162510170817 Training Accuracy:  85.62857142857143 CE Test Loss:  0.4129269696938328 SE Test Loss:  0.10440156015453847 Testing Accuracy:  85.49444444444444
Epoch:  5


100%|██████████| 1312/1312 [00:06<00:00, 197.39it/s]


CE Training Loss:  0.38816939738096 SE Training Loss:  0.09871900474718201 Training Accuracy:  86.09761904761905 CE Test Loss:  0.4026922336657548 SE Test Loss:  0.10193835922537259 Testing Accuracy:  85.89444444444445


0,1
tr_accuracy,▁█████
tr_loss_CE,█▁▁▁▁▁
tr_loss_SE,█▂▁▁▁▁
val_accuracy,▁█████
val_loss_CE,█▁▁▁▁▁
val_loss_SE,█▂▁▁▁▁

0,1
tr_accuracy,86.09762
tr_loss_CE,0.38817
tr_loss_SE,0.09872
val_accuracy,85.89444
val_loss_CE,0.40269
val_loss_SE,0.10194


[34m[1mwandb[0m: Agent Starting Run: 0ygt1nmd with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss_function: cross_entropy
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: rmsprop
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  rmsprop
Learning rate (initial):  0.0001
Batch size:  32
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 784) Bias vector dimention (64, 1)
----------------
Layer:  2  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  3  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  4  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  5  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 64) Bias vector dimention (10, 1)
----------------
1312.0

 Start of training
CE Training Loss:  2.3679805751389806 SE Training Loss:  0.455962

100%|██████████| 1312/1312 [00:07<00:00, 174.89it/s]


CE Training Loss:  0.5357867383426269 SE Training Loss:  0.13131773625694312 Training Accuracy:  81.92857142857143 CE Test Loss:  0.5409927917156977 SE Test Loss:  0.13276482912740123 Testing Accuracy:  81.7611111111111
Epoch:  2


100%|██████████| 1312/1312 [00:05<00:00, 239.41it/s]


CE Training Loss:  0.4558516638426443 SE Training Loss:  0.11409455000762828 Training Accuracy:  83.93333333333334 CE Test Loss:  0.46103501001193664 SE Test Loss:  0.11521898599405252 Testing Accuracy:  83.74444444444444
Epoch:  3


100%|██████████| 1312/1312 [00:07<00:00, 167.29it/s]


CE Training Loss:  0.42568210012302576 SE Training Loss:  0.10713224548043336 Training Accuracy:  84.91666666666667 CE Test Loss:  0.4323635790125069 SE Test Loss:  0.1086251814711691 Testing Accuracy:  84.68888888888888
Epoch:  4


100%|██████████| 1312/1312 [00:05<00:00, 249.53it/s]


CE Training Loss:  0.40737685231375986 SE Training Loss:  0.10273161857713872 Training Accuracy:  85.6047619047619 CE Test Loss:  0.4157189812862286 SE Test Loss:  0.10468682279344017 Testing Accuracy:  85.38888888888889
Epoch:  5


100%|██████████| 1312/1312 [00:07<00:00, 165.04it/s]


CE Training Loss:  0.3938780307653014 SE Training Loss:  0.09944797457334363 Training Accuracy:  86.06904761904762 CE Test Loss:  0.40390003108806183 SE Test Loss:  0.10183933039846159 Testing Accuracy:  85.83333333333333


0,1
tr_accuracy,▁█████
tr_loss_CE,█▂▁▁▁▁
tr_loss_SE,█▂▁▁▁▁
val_accuracy,▁█████
val_loss_CE,█▁▁▁▁▁
val_loss_SE,█▂▁▁▁▁

0,1
tr_accuracy,86.06905
tr_loss_CE,0.39388
tr_loss_SE,0.09945
val_accuracy,85.83333
val_loss_CE,0.4039
val_loss_SE,0.10184


[34m[1mwandb[0m: Agent Starting Run: i05czk4o with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss_function: cross_entropy
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: rmsprop
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  rmsprop
Learning rate (initial):  0.0001
Batch size:  32
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 784) Bias vector dimention (64, 1)
----------------
Layer:  2  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  3  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  4  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  5  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 64) Bias vector dimention (10, 1)
----------------
1312.0

 Start of training
CE Training Loss:  2.3589434378403467 SE Training Loss:  0.455569

100%|██████████| 1312/1312 [00:05<00:00, 219.53it/s]


CE Training Loss:  0.6152103234876427 SE Training Loss:  0.1482224755760092 Training Accuracy:  80.17142857142858 CE Test Loss:  0.620121003370948 SE Test Loss:  0.14948852402792687 Testing Accuracy:  80.13888888888889
Epoch:  2


100%|██████████| 1312/1312 [00:06<00:00, 189.56it/s]


CE Training Loss:  0.5559604915494714 SE Training Loss:  0.1337003673899773 Training Accuracy:  82.10952380952381 CE Test Loss:  0.5603617853882275 SE Test Loss:  0.1345772920079293 Testing Accuracy:  81.92777777777778
Epoch:  3


100%|██████████| 1312/1312 [00:05<00:00, 247.62it/s]


CE Training Loss:  0.5365133481087342 SE Training Loss:  0.12900168153207914 Training Accuracy:  82.73571428571428 CE Test Loss:  0.540928684849122 SE Test Loss:  0.1298477142179548 Testing Accuracy:  82.46111111111111
Epoch:  4


100%|██████████| 1312/1312 [00:07<00:00, 169.76it/s]


CE Training Loss:  0.5262248426981836 SE Training Loss:  0.12654014134905378 Training Accuracy:  82.98571428571428 CE Test Loss:  0.5307836703406426 SE Test Loss:  0.12742055465545038 Testing Accuracy:  82.78333333333333
Epoch:  5


100%|██████████| 1312/1312 [00:05<00:00, 250.79it/s]


CE Training Loss:  0.5198493036647668 SE Training Loss:  0.12500993561723137 Training Accuracy:  83.22142857142858 CE Test Loss:  0.5245806515387189 SE Test Loss:  0.12593424987027002 Testing Accuracy:  82.92777777777778


0,1
tr_accuracy,▁█████
tr_loss_CE,█▁▁▁▁▁
tr_loss_SE,█▁▁▁▁▁
val_accuracy,▁█████
val_loss_CE,█▁▁▁▁▁
val_loss_SE,█▂▁▁▁▁

0,1
tr_accuracy,83.22143
tr_loss_CE,0.51985
tr_loss_SE,0.12501
val_accuracy,82.92778
val_loss_CE,0.52458
val_loss_SE,0.12593


[34m[1mwandb[0m: Agent Starting Run: fdj71v56 with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss_function: cross_entropy
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  adam
Learning rate (initial):  0.0001
Batch size:  32
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 784) Bias vector dimention (64, 1)
----------------
Layer:  2  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  3  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  4  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  5  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 64) Bias vector dimention (10, 1)
----------------
1312.0

 Start of training
CE Training Loss:  2.3478152927279416 SE Training Loss:  0.453923151

100%|██████████| 1312/1312 [00:06<00:00, 213.94it/s]


CE Training Loss:  0.5199201823372571 SE Training Loss:  0.12895853529405993 Training Accuracy:  81.99047619047619 CE Test Loss:  0.5245151488464038 SE Test Loss:  0.1301256105317464 Testing Accuracy:  81.95555555555555
Epoch:  2


100%|██████████| 1312/1312 [00:08<00:00, 149.04it/s]


CE Training Loss:  0.4464388957291581 SE Training Loss:  0.11266566567079858 Training Accuracy:  84.08571428571429 CE Test Loss:  0.4519666287590097 SE Test Loss:  0.11378640836176869 Testing Accuracy:  84.05
Epoch:  3


100%|██████████| 1312/1312 [00:07<00:00, 171.64it/s]


CE Training Loss:  0.41426024721609694 SE Training Loss:  0.1051671013506127 Training Accuracy:  85.15 CE Test Loss:  0.42278441838429937 SE Test Loss:  0.10691273928124306 Testing Accuracy:  85.13888888888889
Epoch:  4


100%|██████████| 1312/1312 [00:06<00:00, 215.62it/s]


CE Training Loss:  0.3939333521296788 SE Training Loss:  0.10029790811250028 Training Accuracy:  85.88333333333334 CE Test Loss:  0.40566653076332804 SE Test Loss:  0.10279025664620142 Testing Accuracy:  85.7
Epoch:  5


100%|██████████| 1312/1312 [00:08<00:00, 148.74it/s]


CE Training Loss:  0.379357021422473 SE Training Loss:  0.09676692068131917 Training Accuracy:  86.41666666666667 CE Test Loss:  0.3940338749961743 SE Test Loss:  0.09994409817920231 Testing Accuracy:  86.07222222222222


0,1
tr_accuracy,▁█████
tr_loss_CE,█▁▁▁▁▁
tr_loss_SE,█▂▁▁▁▁
val_accuracy,▁█████
val_loss_CE,█▁▁▁▁▁
val_loss_SE,█▂▁▁▁▁

0,1
tr_accuracy,86.41667
tr_loss_CE,0.37936
tr_loss_SE,0.09677
val_accuracy,86.07222
val_loss_CE,0.39403
val_loss_SE,0.09994


[34m[1mwandb[0m: Agent Starting Run: d09wagut with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss_function: cross_entropy
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  adam
Learning rate (initial):  0.0001
Batch size:  32
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 784) Bias vector dimention (64, 1)
----------------
Layer:  2  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  3  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  4  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  5  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 64) Bias vector dimention (10, 1)
----------------
1312.0

 Start of training
CE Training Loss:  2.3878502989104495 SE Training Loss:  0.456311454

100%|██████████| 1312/1312 [00:06<00:00, 215.95it/s]


CE Training Loss:  0.5193610592769458 SE Training Loss:  0.12926796441186444 Training Accuracy:  82.07857142857142 CE Test Loss:  0.5237908800147865 SE Test Loss:  0.13026706916460862 Testing Accuracy:  81.89444444444445
Epoch:  2


100%|██████████| 1312/1312 [00:09<00:00, 144.46it/s]


CE Training Loss:  0.44169987216251616 SE Training Loss:  0.11114650819360503 Training Accuracy:  84.37142857142857 CE Test Loss:  0.4481715363574006 SE Test Loss:  0.1123878160168739 Testing Accuracy:  84.3
Epoch:  3


100%|██████████| 1312/1312 [00:06<00:00, 211.84it/s]


CE Training Loss:  0.40857507923641156 SE Training Loss:  0.10325564409192928 Training Accuracy:  85.41428571428571 CE Test Loss:  0.4183479077790888 SE Test Loss:  0.10524716742991304 Testing Accuracy:  85.40555555555555
Epoch:  4


100%|██████████| 1312/1312 [00:07<00:00, 180.93it/s]


CE Training Loss:  0.3882853772325131 SE Training Loss:  0.09839354557246134 Training Accuracy:  86.23095238095237 CE Test Loss:  0.40124478521780677 SE Test Loss:  0.10112403562434777 Testing Accuracy:  85.9
Epoch:  5


100%|██████████| 1312/1312 [00:08<00:00, 152.09it/s]


CE Training Loss:  0.3737969458523537 SE Training Loss:  0.09492432008146826 Training Accuracy:  86.76190476190476 CE Test Loss:  0.3898954533110584 SE Test Loss:  0.09839135544234107 Testing Accuracy:  86.31111111111112


0,1
tr_accuracy,▁█████
tr_loss_CE,█▂▁▁▁▁
tr_loss_SE,█▂▁▁▁▁
val_accuracy,▁█████
val_loss_CE,█▁▁▁▁▁
val_loss_SE,█▂▁▁▁▁

0,1
tr_accuracy,86.7619
tr_loss_CE,0.3738
tr_loss_SE,0.09492
val_accuracy,86.31111
val_loss_CE,0.3899
val_loss_SE,0.09839


[34m[1mwandb[0m: Agent Starting Run: g5s7ai9q with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss_function: cross_entropy
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  adam
Learning rate (initial):  0.0001
Batch size:  32
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 784) Bias vector dimention (64, 1)
----------------
Layer:  2  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  3  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  4  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  5  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 64) Bias vector dimention (10, 1)
----------------
1312.0

 Start of training
CE Training Loss:  2.4708325266602045 SE Training Loss:  0.465205311

100%|██████████| 1312/1312 [00:08<00:00, 150.84it/s]


CE Training Loss:  0.5893570901373724 SE Training Loss:  0.1418431516434776 Training Accuracy:  81.31666666666666 CE Test Loss:  0.5934445292820714 SE Test Loss:  0.1430202392196441 Testing Accuracy:  81.02777777777777
Epoch:  2


100%|██████████| 1312/1312 [00:07<00:00, 169.22it/s]


CE Training Loss:  0.5369983974886074 SE Training Loss:  0.12907292014253532 Training Accuracy:  82.79761904761905 CE Test Loss:  0.5410836373519093 SE Test Loss:  0.13002867085413486 Testing Accuracy:  82.63888888888889
Epoch:  3


100%|██████████| 1312/1312 [00:06<00:00, 202.15it/s]


CE Training Loss:  0.5206733043804145 SE Training Loss:  0.1251026917319215 Training Accuracy:  83.21904761904761 CE Test Loss:  0.5252891483124188 SE Test Loss:  0.12610682897693834 Testing Accuracy:  83.12222222222222
Epoch:  4


100%|██████████| 1312/1312 [00:08<00:00, 149.30it/s]


CE Training Loss:  0.5118643994819272 SE Training Loss:  0.12297681116831498 Training Accuracy:  83.43095238095238 CE Test Loss:  0.517036086773617 SE Test Loss:  0.12408110146305044 Testing Accuracy:  83.41666666666667
Epoch:  5


100%|██████████| 1312/1312 [00:06<00:00, 197.41it/s]


CE Training Loss:  0.5064785554399757 SE Training Loss:  0.12167921550950157 Training Accuracy:  83.62380952380953 CE Test Loss:  0.5121413398896794 SE Test Loss:  0.12288196445835729 Testing Accuracy:  83.54444444444445


0,1
tr_accuracy,▁█████
tr_loss_CE,█▁▁▁▁▁
tr_loss_SE,█▁▁▁▁▁
val_accuracy,▁█████
val_loss_CE,█▁▁▁▁▁
val_loss_SE,█▁▁▁▁▁

0,1
tr_accuracy,83.62381
tr_loss_CE,0.50648
tr_loss_SE,0.12168
val_accuracy,83.54444
val_loss_CE,0.51214
val_loss_SE,0.12288


[34m[1mwandb[0m: Agent Starting Run: ux8mi946 with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss_function: cross_entropy
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  nadam
Learning rate (initial):  0.0001
Batch size:  32
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 784) Bias vector dimention (64, 1)
----------------
Layer:  2  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  3  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  4  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  5  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 64) Bias vector dimention (10, 1)
----------------
1312.0

 Start of training
CE Training Loss:  2.3910652669285817 SE Training Loss:  0.45890601

100%|██████████| 1312/1312 [00:06<00:00, 194.26it/s]


CE Training Loss:  0.5111797464334453 SE Training Loss:  0.12696690681315626 Training Accuracy:  82.3452380952381 CE Test Loss:  0.5161846940772193 SE Test Loss:  0.12831307190285351 Testing Accuracy:  82.13333333333334
Epoch:  2


100%|██████████| 1312/1312 [00:08<00:00, 146.13it/s]


CE Training Loss:  0.44283411684617907 SE Training Loss:  0.1116113847495285 Training Accuracy:  84.32619047619048 CE Test Loss:  0.4496165531001552 SE Test Loss:  0.11324973359197937 Testing Accuracy:  84.14444444444445
Epoch:  3


100%|██████████| 1312/1312 [00:09<00:00, 138.10it/s]


CE Training Loss:  0.4145551488181208 SE Training Loss:  0.10489623868290573 Training Accuracy:  85.47142857142858 CE Test Loss:  0.4244410615135114 SE Test Loss:  0.10727008741467718 Testing Accuracy:  84.9888888888889
Epoch:  4


100%|██████████| 1312/1312 [00:06<00:00, 195.64it/s]


CE Training Loss:  0.3964567397505905 SE Training Loss:  0.10047855955191334 Training Accuracy:  86.01428571428572 CE Test Loss:  0.409329954370347 SE Test Loss:  0.10356185758115115 Testing Accuracy:  85.40555555555555
Epoch:  5


100%|██████████| 1312/1312 [00:08<00:00, 158.41it/s]


CE Training Loss:  0.38336539960127264 SE Training Loss:  0.09727099154783428 Training Accuracy:  86.45714285714286 CE Test Loss:  0.3991306410496553 SE Test Loss:  0.10101276722643925 Testing Accuracy:  85.7388888888889


0,1
tr_accuracy,▁█████
tr_loss_CE,█▁▁▁▁▁
tr_loss_SE,█▂▁▁▁▁
val_accuracy,▁█████
val_loss_CE,█▁▁▁▁▁
val_loss_SE,█▂▁▁▁▁

0,1
tr_accuracy,86.45714
tr_loss_CE,0.38337
tr_loss_SE,0.09727
val_accuracy,85.73889
val_loss_CE,0.39913
val_loss_SE,0.10101


[34m[1mwandb[0m: Agent Starting Run: cjbve4jb with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss_function: cross_entropy
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  nadam
Learning rate (initial):  0.0001
Batch size:  32
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 784) Bias vector dimention (64, 1)
----------------
Layer:  2  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  3  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  4  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  5  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 64) Bias vector dimention (10, 1)
----------------
1312.0

 Start of training
CE Training Loss:  2.3715117019076724 SE Training Loss:  0.45845550

100%|██████████| 1312/1312 [00:07<00:00, 178.42it/s]


CE Training Loss:  0.5121094340213437 SE Training Loss:  0.12709766160671965 Training Accuracy:  82.26904761904763 CE Test Loss:  0.5190185943362408 SE Test Loss:  0.12878878803265892 Testing Accuracy:  82.22222222222223
Epoch:  2


100%|██████████| 1312/1312 [00:06<00:00, 187.87it/s]


CE Training Loss:  0.4398344388662914 SE Training Loss:  0.11136408923783808 Training Accuracy:  84.25 CE Test Loss:  0.44865920852833274 SE Test Loss:  0.11327179366024077 Testing Accuracy:  83.97777777777777
Epoch:  3


100%|██████████| 1312/1312 [00:09<00:00, 138.59it/s]


CE Training Loss:  0.40955407757926676 SE Training Loss:  0.10427251271399669 Training Accuracy:  85.25952380952381 CE Test Loss:  0.4214524972850685 SE Test Loss:  0.10687157525316368 Testing Accuracy:  85.03888888888889
Epoch:  4


100%|██████████| 1312/1312 [00:07<00:00, 186.54it/s]


CE Training Loss:  0.3899308645431725 SE Training Loss:  0.0994849371141068 Training Accuracy:  85.92857142857143 CE Test Loss:  0.40465897811821355 SE Test Loss:  0.10273644435676957 Testing Accuracy:  85.60555555555555
Epoch:  5


100%|██████████| 1312/1312 [00:07<00:00, 175.52it/s]


CE Training Loss:  0.37553850311839404 SE Training Loss:  0.0959524535685152 Training Accuracy:  86.49285714285715 CE Test Loss:  0.392810309885806 SE Test Loss:  0.09976410625881095 Testing Accuracy:  86.00555555555556


0,1
tr_accuracy,▁█████
tr_loss_CE,█▁▁▁▁▁
tr_loss_SE,█▂▁▁▁▁
val_accuracy,▁█████
val_loss_CE,█▁▁▁▁▁
val_loss_SE,█▂▁▁▁▁

0,1
tr_accuracy,86.49286
tr_loss_CE,0.37554
tr_loss_SE,0.09595
val_accuracy,86.00556
val_loss_CE,0.39281
val_loss_SE,0.09976


[34m[1mwandb[0m: Agent Starting Run: btec7nlm with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss_function: cross_entropy
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  nadam
Learning rate (initial):  0.0001
Batch size:  32
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 784) Bias vector dimention (64, 1)
----------------
Layer:  2  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  3  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  4  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  5  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 64) Bias vector dimention (10, 1)
----------------
1312.0

 Start of training
CE Training Loss:  2.2847154191057615 SE Training Loss:  0.44553883

100%|██████████| 1312/1312 [00:06<00:00, 193.09it/s]


CE Training Loss:  0.598897377253706 SE Training Loss:  0.14424171510970504 Training Accuracy:  80.59285714285714 CE Test Loss:  0.6043982210887854 SE Test Loss:  0.14560792789190327 Testing Accuracy:  80.68333333333334
Epoch:  2


100%|██████████| 1312/1312 [00:07<00:00, 164.04it/s]


CE Training Loss:  0.5403699485584649 SE Training Loss:  0.12977237114741566 Training Accuracy:  82.47857142857143 CE Test Loss:  0.5447529128038021 SE Test Loss:  0.13071891085275145 Testing Accuracy:  82.42777777777778
Epoch:  3


100%|██████████| 1312/1312 [00:09<00:00, 133.26it/s]


CE Training Loss:  0.5218048054535447 SE Training Loss:  0.12517296284499407 Training Accuracy:  83.2452380952381 CE Test Loss:  0.5265743063889818 SE Test Loss:  0.1262131059678341 Testing Accuracy:  83.02222222222223
Epoch:  4


100%|██████████| 1312/1312 [00:06<00:00, 189.23it/s]


CE Training Loss:  0.5126313508298946 SE Training Loss:  0.12293504669556372 Training Accuracy:  83.56666666666666 CE Test Loss:  0.5179228881670285 SE Test Loss:  0.12411461746513786 Testing Accuracy:  83.27222222222223
Epoch:  5


100%|██████████| 1312/1312 [00:07<00:00, 165.99it/s]


CE Training Loss:  0.5070035435036289 SE Training Loss:  0.12157152523121624 Training Accuracy:  83.76428571428572 CE Test Loss:  0.5127599461844145 SE Test Loss:  0.1228723662843705 Testing Accuracy:  83.46111111111111


0,1
tr_accuracy,▁█████
tr_loss_CE,█▁▁▁▁▁
tr_loss_SE,█▁▁▁▁▁
val_accuracy,▁█████
val_loss_CE,█▁▁▁▁▁
val_loss_SE,█▁▁▁▁▁

0,1
tr_accuracy,83.76429
tr_loss_CE,0.507
tr_loss_SE,0.12157
val_accuracy,83.46111
val_loss_CE,0.51276
val_loss_SE,0.12287


[34m[1mwandb[0m: Agent Starting Run: 8cbe5uf1 with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss_function: squared_error
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: rmsprop
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  rmsprop
Learning rate (initial):  0.0001
Batch size:  32
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 784) Bias vector dimention (64, 1)
----------------
Layer:  2  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  3  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  4  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  5  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 64) Bias vector dimention (10, 1)
----------------
1312.0

 Start of training
CE Training Loss:  2.3539526523975765 SE Training Loss:  0.454796

100%|██████████| 1312/1312 [00:05<00:00, 223.11it/s]


CE Training Loss:  0.5560498098481899 SE Training Loss:  0.13377793059330584 Training Accuracy:  81.85952380952381 CE Test Loss:  0.5603541550016738 SE Test Loss:  0.13485467082331312 Testing Accuracy:  81.81666666666666
Epoch:  2


100%|██████████| 1312/1312 [00:08<00:00, 160.67it/s]


CE Training Loss:  0.4753447007922648 SE Training Loss:  0.11587733444068934 Training Accuracy:  84.05952380952381 CE Test Loss:  0.48118825221615 SE Test Loss:  0.11722340348921789 Testing Accuracy:  83.71111111111111
Epoch:  3


100%|██████████| 1312/1312 [00:05<00:00, 247.44it/s]


CE Training Loss:  0.4389934523745545 SE Training Loss:  0.10778029911842203 Training Accuracy:  85.11428571428571 CE Test Loss:  0.44667490827404993 SE Test Loss:  0.10966461764787598 Testing Accuracy:  84.91666666666667
Epoch:  4


100%|██████████| 1312/1312 [00:06<00:00, 190.30it/s]


CE Training Loss:  0.4155657454390006 SE Training Loss:  0.1024380102924916 Training Accuracy:  85.79047619047618 CE Test Loss:  0.4256494564440727 SE Test Loss:  0.10505123646688032 Testing Accuracy:  85.59444444444445
Epoch:  5


100%|██████████| 1312/1312 [00:06<00:00, 211.00it/s]


CE Training Loss:  0.39914419138653967 SE Training Loss:  0.09858443070481174 Training Accuracy:  86.31190476190476 CE Test Loss:  0.41157342066192104 SE Test Loss:  0.10196143312108219 Testing Accuracy:  85.88333333333334


0,1
tr_accuracy,▁█████
tr_loss_CE,█▂▁▁▁▁
tr_loss_SE,█▂▁▁▁▁
val_accuracy,▁█████
val_loss_CE,█▂▁▁▁▁
val_loss_SE,█▂▁▁▁▁

0,1
tr_accuracy,86.3119
tr_loss_CE,0.39914
tr_loss_SE,0.09858
val_accuracy,85.88333
val_loss_CE,0.41157
val_loss_SE,0.10196


[34m[1mwandb[0m: Agent Starting Run: xi5fov4w with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss_function: squared_error
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: rmsprop
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  rmsprop
Learning rate (initial):  0.0001
Batch size:  32
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 784) Bias vector dimention (64, 1)
----------------
Layer:  2  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  3  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  4  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  5  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 64) Bias vector dimention (10, 1)
----------------
1312.0

 Start of training
CE Training Loss:  2.420111360798009 SE Training Loss:  0.4616786

100%|██████████| 1312/1312 [00:07<00:00, 164.58it/s]


CE Training Loss:  0.5462848861810925 SE Training Loss:  0.1324819975877123 Training Accuracy:  81.99047619047619 CE Test Loss:  0.548836747489932 SE Test Loss:  0.1330451507257044 Testing Accuracy:  81.88888888888889
Epoch:  2


100%|██████████| 1312/1312 [00:05<00:00, 228.14it/s]


CE Training Loss:  0.4651829256925373 SE Training Loss:  0.1139842444458077 Training Accuracy:  84.29285714285714 CE Test Loss:  0.4701453010847761 SE Test Loss:  0.11494666340735375 Testing Accuracy:  84.24444444444444
Epoch:  3


100%|██████████| 1312/1312 [00:07<00:00, 181.70it/s]


CE Training Loss:  0.4329492817610025 SE Training Loss:  0.10631004808670527 Training Accuracy:  85.32619047619048 CE Test Loss:  0.4402797694333077 SE Test Loss:  0.10799256089765069 Testing Accuracy:  85.16111111111111
Epoch:  4


100%|██████████| 1312/1312 [00:06<00:00, 204.00it/s]


CE Training Loss:  0.4132982799601637 SE Training Loss:  0.10153860900241829 Training Accuracy:  85.8404761904762 CE Test Loss:  0.42275251224232935 SE Test Loss:  0.10394986000476006 Testing Accuracy:  85.69444444444444
Epoch:  5


100%|██████████| 1312/1312 [00:05<00:00, 229.44it/s]


CE Training Loss:  0.3993508783536034 SE Training Loss:  0.09813692112999789 Training Accuracy:  86.36428571428571 CE Test Loss:  0.41084100886704616 SE Test Loss:  0.10125148518457314 Testing Accuracy:  85.98333333333333


0,1
tr_accuracy,▁█████
tr_loss_CE,█▂▁▁▁▁
tr_loss_SE,█▂▁▁▁▁
val_accuracy,▁█████
val_loss_CE,█▁▁▁▁▁
val_loss_SE,█▂▁▁▁▁

0,1
tr_accuracy,86.36429
tr_loss_CE,0.39935
tr_loss_SE,0.09814
val_accuracy,85.98333
val_loss_CE,0.41084
val_loss_SE,0.10125


[34m[1mwandb[0m: Agent Starting Run: wgieygrp with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss_function: squared_error
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: rmsprop
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  rmsprop
Learning rate (initial):  0.0001
Batch size:  32
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 784) Bias vector dimention (64, 1)
----------------
Layer:  2  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  3  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  4  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  5  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 64) Bias vector dimention (10, 1)
----------------
1312.0

 Start of training
CE Training Loss:  2.395942901974305 SE Training Loss:  0.4600772

100%|██████████| 1312/1312 [00:05<00:00, 240.50it/s]


CE Training Loss:  1.1050393086086892 SE Training Loss:  0.2599838675412065 Training Accuracy:  66.32380952380953 CE Test Loss:  1.1067725866626006 SE Test Loss:  0.2604991737325729 Testing Accuracy:  66.18888888888888
Epoch:  2


100%|██████████| 1312/1312 [00:07<00:00, 164.29it/s]


CE Training Loss:  1.0667588927847487 SE Training Loss:  0.26000040934018087 Training Accuracy:  67.6047619047619 CE Test Loss:  1.067608299226459 SE Test Loss:  0.2605389625221885 Testing Accuracy:  67.2388888888889
Epoch:  3


100%|██████████| 1312/1312 [00:05<00:00, 248.02it/s]


CE Training Loss:  1.0844793143796594 SE Training Loss:  0.26848000978122355 Training Accuracy:  62.76190476190476 CE Test Loss:  1.0843867824646554 SE Test Loss:  0.2686942922758015 Testing Accuracy:  62.25555555555555
Epoch:  4


100%|██████████| 1312/1312 [00:08<00:00, 160.05it/s]


CE Training Loss:  1.0692328344713962 SE Training Loss:  0.26434400057096175 Training Accuracy:  64.49761904761905 CE Test Loss:  1.069606819875886 SE Test Loss:  0.2647101139183713 Testing Accuracy:  64.02222222222223
Epoch:  5


100%|██████████| 1312/1312 [00:05<00:00, 241.00it/s]


CE Training Loss:  1.0566126429376836 SE Training Loss:  0.26023316637553773 Training Accuracy:  65.77619047619048 CE Test Loss:  1.0571834537024156 SE Test Loss:  0.2606970417319863 Testing Accuracy:  65.41666666666667


0,1
tr_accuracy,▁██▇██
tr_loss_CE,█▁▁▁▁▁
tr_loss_SE,█▁▁▁▁▁
val_accuracy,▁██▇██
val_loss_CE,█▁▁▁▁▁
val_loss_SE,█▁▁▁▁▁

0,1
tr_accuracy,65.77619
tr_loss_CE,1.05661
tr_loss_SE,0.26023
val_accuracy,65.41667
val_loss_CE,1.05718
val_loss_SE,0.2607


[34m[1mwandb[0m: Agent Starting Run: 9ib25bh4 with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss_function: squared_error
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  adam
Learning rate (initial):  0.0001
Batch size:  32
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 784) Bias vector dimention (64, 1)
----------------
Layer:  2  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  3  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  4  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  5  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 64) Bias vector dimention (10, 1)
----------------
1312.0

 Start of training
CE Training Loss:  2.2711334783745962 SE Training Loss:  0.448782759

100%|██████████| 1312/1312 [00:09<00:00, 141.29it/s]


CE Training Loss:  0.527775025851288 SE Training Loss:  0.12751129239839407 Training Accuracy:  82.5547619047619 CE Test Loss:  0.5326174035729369 SE Test Loss:  0.12844728888712248 Testing Accuracy:  82.50555555555556
Epoch:  2


100%|██████████| 1312/1312 [00:06<00:00, 195.07it/s]


CE Training Loss:  0.45220444734931153 SE Training Loss:  0.11082536505860648 Training Accuracy:  84.61904761904762 CE Test Loss:  0.45978615944753326 SE Test Loss:  0.11256325980105868 Testing Accuracy:  84.47222222222223
Epoch:  3


100%|██████████| 1312/1312 [00:07<00:00, 181.74it/s]


CE Training Loss:  0.4214694192320429 SE Training Loss:  0.10355496566497108 Training Accuracy:  85.73095238095237 CE Test Loss:  0.4317488742242189 SE Test Loss:  0.1063402615024113 Testing Accuracy:  85.20555555555555
Epoch:  4


100%|██████████| 1312/1312 [00:09<00:00, 143.54it/s]


CE Training Loss:  0.4030775119722838 SE Training Loss:  0.0990075257391273 Training Accuracy:  86.3404761904762 CE Test Loss:  0.41538816465086836 SE Test Loss:  0.10256180619110161 Testing Accuracy:  85.74444444444444
Epoch:  5


100%|██████████| 1312/1312 [00:06<00:00, 209.04it/s]


CE Training Loss:  0.3900385008963522 SE Training Loss:  0.09575536324431144 Training Accuracy:  86.83333333333333 CE Test Loss:  0.40421054715479454 SE Test Loss:  0.099949644150858 Testing Accuracy:  86.22222222222223


0,1
tr_accuracy,▁█████
tr_loss_CE,█▂▁▁▁▁
tr_loss_SE,█▂▁▁▁▁
val_accuracy,▁█████
val_loss_CE,█▁▁▁▁▁
val_loss_SE,█▂▁▁▁▁

0,1
tr_accuracy,86.83333
tr_loss_CE,0.39004
tr_loss_SE,0.09576
val_accuracy,86.22222
val_loss_CE,0.40421
val_loss_SE,0.09995


[34m[1mwandb[0m: Agent Starting Run: gvy1tgaq with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss_function: squared_error
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  adam
Learning rate (initial):  0.0001
Batch size:  32
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 784) Bias vector dimention (64, 1)
----------------
Layer:  2  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  3  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  4  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  5  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 64) Bias vector dimention (10, 1)
----------------
1312.0

 Start of training
CE Training Loss:  2.345673591102198 SE Training Loss:  0.4559887275

100%|██████████| 1312/1312 [00:07<00:00, 171.03it/s]


CE Training Loss:  0.5381640235259785 SE Training Loss:  0.12937944029045628 Training Accuracy:  82.25952380952381 CE Test Loss:  0.5386218121185157 SE Test Loss:  0.12994162461113146 Testing Accuracy:  82.12777777777778
Epoch:  2


100%|██████████| 1312/1312 [00:06<00:00, 201.58it/s]


CE Training Loss:  0.4568149634489234 SE Training Loss:  0.11121753272550021 Training Accuracy:  84.72142857142858 CE Test Loss:  0.4587212054736489 SE Test Loss:  0.11203718198618377 Testing Accuracy:  84.68333333333334
Epoch:  3


100%|██████████| 1312/1312 [00:08<00:00, 145.95it/s]


CE Training Loss:  0.4243017289006071 SE Training Loss:  0.1036707153900267 Training Accuracy:  85.79761904761905 CE Test Loss:  0.42842367012336763 SE Test Loss:  0.10509111047049327 Testing Accuracy:  85.6
Epoch:  4


100%|██████████| 1312/1312 [00:06<00:00, 208.73it/s]


CE Training Loss:  0.40402256621086974 SE Training Loss:  0.09881580265498245 Training Accuracy:  86.45714285714286 CE Test Loss:  0.4108267816605887 SE Test Loss:  0.1010371258207459 Testing Accuracy:  86.05555555555556
Epoch:  5


100%|██████████| 1312/1312 [00:07<00:00, 169.43it/s]


CE Training Loss:  0.3893996697952567 SE Training Loss:  0.0952449786938918 Training Accuracy:  86.90238095238095 CE Test Loss:  0.39912508107871375 SE Test Loss:  0.09831685901118686 Testing Accuracy:  86.41666666666667


0,1
tr_accuracy,▁█████
tr_loss_CE,█▂▁▁▁▁
tr_loss_SE,█▂▁▁▁▁
val_accuracy,▁█████
val_loss_CE,█▂▁▁▁▁
val_loss_SE,█▂▁▁▁▁

0,1
tr_accuracy,86.90238
tr_loss_CE,0.3894
tr_loss_SE,0.09524
val_accuracy,86.41667
val_loss_CE,0.39913
val_loss_SE,0.09832


[34m[1mwandb[0m: Agent Starting Run: hr2egwra with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss_function: squared_error
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  adam
Learning rate (initial):  0.0001
Batch size:  32
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 784) Bias vector dimention (64, 1)
----------------
Layer:  2  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  3  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  4  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  5  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 64) Bias vector dimention (10, 1)
----------------
1312.0

 Start of training
CE Training Loss:  2.365708797518797 SE Training Loss:  0.4557571260

100%|██████████| 1312/1312 [00:06<00:00, 190.34it/s]


CE Training Loss:  1.205623193224547 SE Training Loss:  0.2778281225990744 Training Accuracy:  64.97619047619048 CE Test Loss:  1.2056416381280326 SE Test Loss:  0.2779656502771777 Testing Accuracy:  64.65
Epoch:  2


100%|██████████| 1312/1312 [00:07<00:00, 177.42it/s]


CE Training Loss:  1.06004628826089 SE Training Loss:  0.2505382905131375 Training Accuracy:  64.63571428571429 CE Test Loss:  1.0616692407982269 SE Test Loss:  0.2512031919053918 Testing Accuracy:  64.61111111111111
Epoch:  3


100%|██████████| 1312/1312 [00:09<00:00, 142.63it/s]


CE Training Loss:  1.0145088197724266 SE Training Loss:  0.2428507693411873 Training Accuracy:  65.57619047619048 CE Test Loss:  1.0167751038631287 SE Test Loss:  0.24371698122614407 Testing Accuracy:  65.41666666666667
Epoch:  4


100%|██████████| 1312/1312 [00:06<00:00, 203.52it/s]


CE Training Loss:  0.9900505166178568 SE Training Loss:  0.23861109858876758 Training Accuracy:  67.48809523809524 CE Test Loss:  0.9925570423377533 SE Test Loss:  0.23953070136841131 Testing Accuracy:  67.41111111111111
Epoch:  5


100%|██████████| 1312/1312 [00:08<00:00, 159.75it/s]


CE Training Loss:  0.976220382067332 SE Training Loss:  0.2361137840041632 Training Accuracy:  68.38809523809523 CE Test Loss:  0.9787789442458406 SE Test Loss:  0.23702433067305517 Testing Accuracy:  68.52222222222223


0,1
tr_accuracy,▁█████
tr_loss_CE,█▂▁▁▁▁
tr_loss_SE,█▂▁▁▁▁
val_accuracy,▁█████
val_loss_CE,█▂▁▁▁▁
val_loss_SE,█▂▁▁▁▁

0,1
tr_accuracy,68.3881
tr_loss_CE,0.97622
tr_loss_SE,0.23611
val_accuracy,68.52222
val_loss_CE,0.97878
val_loss_SE,0.23702


[34m[1mwandb[0m: Agent Starting Run: 2a7cmexs with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss_function: squared_error
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  nadam
Learning rate (initial):  0.0001
Batch size:  32
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 784) Bias vector dimention (64, 1)
----------------
Layer:  2  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  3  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  4  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  5  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 64) Bias vector dimention (10, 1)
----------------
1312.0

 Start of training
CE Training Loss:  2.3586003363771737 SE Training Loss:  0.45612149

100%|██████████| 1312/1312 [00:06<00:00, 189.50it/s]


CE Training Loss:  0.5469373028194796 SE Training Loss:  0.1315364165338536 Training Accuracy:  82.18809523809524 CE Test Loss:  0.5460511858408369 SE Test Loss:  0.13157072544770093 Testing Accuracy:  82.26666666666667
Epoch:  2


100%|██████████| 1312/1312 [00:08<00:00, 156.31it/s]


CE Training Loss:  0.46586621002294376 SE Training Loss:  0.11353919301300801 Training Accuracy:  84.32619047619048 CE Test Loss:  0.4675930988152486 SE Test Loss:  0.11453303445790547 Testing Accuracy:  84.17222222222222
Epoch:  3


100%|██████████| 1312/1312 [00:09<00:00, 133.45it/s]


CE Training Loss:  0.4320391527698891 SE Training Loss:  0.10563721140548572 Training Accuracy:  85.46190476190476 CE Test Loss:  0.43701302461587865 SE Test Loss:  0.10763006074152619 Testing Accuracy:  85.0111111111111
Epoch:  4


100%|██████████| 1312/1312 [00:06<00:00, 193.33it/s]


CE Training Loss:  0.4105875933853278 SE Training Loss:  0.10049928238349079 Training Accuracy:  86.1952380952381 CE Test Loss:  0.4183678017415146 SE Test Loss:  0.10331156201019437 Testing Accuracy:  85.60555555555555
Epoch:  5


100%|██████████| 1312/1312 [00:08<00:00, 156.99it/s]


CE Training Loss:  0.3950916497700774 SE Training Loss:  0.09674517947056087 Training Accuracy:  86.70238095238095 CE Test Loss:  0.40538104251152157 SE Test Loss:  0.10022370602700516 Testing Accuracy:  85.91111111111111


0,1
tr_accuracy,▁█████
tr_loss_CE,█▂▁▁▁▁
tr_loss_SE,█▂▁▁▁▁
val_accuracy,▁█████
val_loss_CE,█▂▁▁▁▁
val_loss_SE,█▂▁▁▁▁

0,1
tr_accuracy,86.70238
tr_loss_CE,0.39509
tr_loss_SE,0.09675
val_accuracy,85.91111
val_loss_CE,0.40538
val_loss_SE,0.10022


[34m[1mwandb[0m: Agent Starting Run: r59us7y8 with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss_function: squared_error
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  nadam
Learning rate (initial):  0.0001
Batch size:  32
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 784) Bias vector dimention (64, 1)
----------------
Layer:  2  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  3  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  4  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  5  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 64) Bias vector dimention (10, 1)
----------------
1312.0

 Start of training
CE Training Loss:  2.408888344839348 SE Training Loss:  0.462345259

100%|██████████| 1312/1312 [00:06<00:00, 190.11it/s]


CE Training Loss:  0.5370983032237969 SE Training Loss:  0.13113575032882163 Training Accuracy:  81.97857142857143 CE Test Loss:  0.5413485845798908 SE Test Loss:  0.1323340956098747 Testing Accuracy:  81.97222222222223
Epoch:  2


100%|██████████| 1312/1312 [00:09<00:00, 136.31it/s]


CE Training Loss:  0.4656424499672919 SE Training Loss:  0.11483711589040335 Training Accuracy:  83.92619047619047 CE Test Loss:  0.47145356676443195 SE Test Loss:  0.11656112701305907 Testing Accuracy:  83.63888888888889
Epoch:  3


100%|██████████| 1312/1312 [00:09<00:00, 137.81it/s]


CE Training Loss:  0.43271426901625626 SE Training Loss:  0.1071554273429025 Training Accuracy:  85.15 CE Test Loss:  0.4407326368610023 SE Test Loss:  0.10954937165146293 Testing Accuracy:  84.65
Epoch:  4


100%|██████████| 1312/1312 [00:06<00:00, 189.88it/s]


CE Training Loss:  0.4112436297630189 SE Training Loss:  0.10202294498578474 Training Accuracy:  85.94047619047619 CE Test Loss:  0.4213333947070492 SE Test Loss:  0.10503629947692349 Testing Accuracy:  85.49444444444444
Epoch:  5


100%|██████████| 1312/1312 [00:09<00:00, 136.11it/s]


CE Training Loss:  0.395677672970118 SE Training Loss:  0.09817036779215 Training Accuracy:  86.44047619047619 CE Test Loss:  0.4077347549469648 SE Test Loss:  0.10182372871777026 Testing Accuracy:  85.88333333333334


0,1
tr_accuracy,▁█████
tr_loss_CE,█▁▁▁▁▁
tr_loss_SE,█▂▁▁▁▁
val_accuracy,▁█████
val_loss_CE,█▁▁▁▁▁
val_loss_SE,█▂▁▁▁▁

0,1
tr_accuracy,86.44048
tr_loss_CE,0.39568
tr_loss_SE,0.09817
val_accuracy,85.88333
val_loss_CE,0.40773
val_loss_SE,0.10182


[34m[1mwandb[0m: Agent Starting Run: 7znbjkvo with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss_function: squared_error
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  nadam
Learning rate (initial):  0.0001
Batch size:  32
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 784) Bias vector dimention (64, 1)
----------------
Layer:  2  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  3  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  4  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  5  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 64) Bias vector dimention (10, 1)
----------------
1312.0

 Start of training
CE Training Loss:  2.4668177026155265 SE Training Loss:  0.46668261

100%|██████████| 1312/1312 [00:06<00:00, 189.29it/s]


CE Training Loss:  1.1037166852777396 SE Training Loss:  0.2563549210345856 Training Accuracy:  65.3952380952381 CE Test Loss:  1.1070640214742116 SE Test Loss:  0.25725473333576654 Testing Accuracy:  64.98333333333333
Epoch:  2


100%|██████████| 1312/1312 [00:09<00:00, 135.23it/s]


CE Training Loss:  1.020139371064406 SE Training Loss:  0.24611896128681351 Training Accuracy:  67.18809523809524 CE Test Loss:  1.023623740243421 SE Test Loss:  0.24724661391238983 Testing Accuracy:  66.94444444444444
Epoch:  3


100%|██████████| 1312/1312 [00:09<00:00, 141.61it/s]


CE Training Loss:  0.9709026846811926 SE Training Loss:  0.23559996694655216 Training Accuracy:  69.42142857142858 CE Test Loss:  0.9750556864710983 SE Test Loss:  0.23696313336129238 Testing Accuracy:  68.92777777777778
Epoch:  4


100%|██████████| 1312/1312 [00:06<00:00, 187.82it/s]


CE Training Loss:  0.9512028777341978 SE Training Loss:  0.23149722146017276 Training Accuracy:  70.1547619047619 CE Test Loss:  0.9557363535482487 SE Test Loss:  0.23296431279399854 Testing Accuracy:  69.72222222222223
Epoch:  5


100%|██████████| 1312/1312 [00:09<00:00, 134.79it/s]


CE Training Loss:  0.9420642416621274 SE Training Loss:  0.22964145690798665 Training Accuracy:  70.45952380952382 CE Test Loss:  0.9468082840319623 SE Test Loss:  0.23115253490593968 Testing Accuracy:  70.07777777777778


0,1
tr_accuracy,▁▇████
tr_loss_CE,█▂▁▁▁▁
tr_loss_SE,█▂▁▁▁▁
val_accuracy,▁▇████
val_loss_CE,█▂▁▁▁▁
val_loss_SE,█▂▁▁▁▁

0,1
tr_accuracy,70.45952
tr_loss_CE,0.94206
tr_loss_SE,0.22964
val_accuracy,70.07778
val_loss_CE,0.94681
val_loss_SE,0.23115


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: c76ubttf with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss_function: cross_entropy
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: rmsprop
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  rmsprop
Learning rate (initial):  0.001
Batch size:  32
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 784) Bias vector dimention (64, 1)
----------------
Layer:  2  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  3  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  4  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  5  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 64) Bias vector dimention (10, 1)
----------------
1312.0

 Start of training
CE Training Loss:  2.3106843221700775 SE Training Loss:  0.4509853

100%|██████████| 1312/1312 [00:04<00:00, 294.73it/s]


CE Training Loss:  0.5006927126205948 SE Training Loss:  0.12535455371002177 Training Accuracy:  82.57619047619048 CE Test Loss:  0.5128596085201874 SE Test Loss:  0.12739705536263615 Testing Accuracy:  82.2611111111111
Epoch:  2


100%|██████████| 1312/1312 [00:05<00:00, 255.67it/s]


CE Training Loss:  0.4853102343610469 SE Training Loss:  0.11741354707018159 Training Accuracy:  83.73809523809524 CE Test Loss:  0.5092118718755119 SE Test Loss:  0.1206562697636349 Testing Accuracy:  83.29444444444445
Epoch:  3


100%|██████████| 1312/1312 [00:03<00:00, 356.04it/s]


CE Training Loss:  0.5162198389078357 SE Training Loss:  0.11815995613762709 Training Accuracy:  83.41904761904762 CE Test Loss:  0.5567018139995835 SE Test Loss:  0.12431309866740944 Testing Accuracy:  82.45
Epoch:  4


100%|██████████| 1312/1312 [00:05<00:00, 260.63it/s]


CE Training Loss:  0.48040887426577616 SE Training Loss:  0.10610257051042453 Training Accuracy:  85.30952380952381 CE Test Loss:  0.5394752229145625 SE Test Loss:  0.11238686421859266 Testing Accuracy:  84.62777777777778
Epoch:  5


100%|██████████| 1312/1312 [00:04<00:00, 265.33it/s]


CE Training Loss:  0.49827291315510835 SE Training Loss:  0.11085302143240991 Training Accuracy:  84.5 CE Test Loss:  0.5592222345688689 SE Test Loss:  0.1177544332562356 Testing Accuracy:  83.68333333333334


0,1
tr_accuracy,▁█████
tr_loss_CE,█▁▁▁▁▁
tr_loss_SE,█▁▁▁▁▁
val_accuracy,▁█████
val_loss_CE,█▁▁▁▁▁
val_loss_SE,█▁▁▁▁▁

0,1
tr_accuracy,84.5
tr_loss_CE,0.49827
tr_loss_SE,0.11085
val_accuracy,83.68333
val_loss_CE,0.55922
val_loss_SE,0.11775


[34m[1mwandb[0m: Agent Starting Run: a4l3874t with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss_function: cross_entropy
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: rmsprop
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  rmsprop
Learning rate (initial):  0.001
Batch size:  32
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 784) Bias vector dimention (64, 1)
----------------
Layer:  2  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  3  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  4  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  5  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 64) Bias vector dimention (10, 1)
----------------
1312.0

 Start of training
CE Training Loss:  2.343811944172096 SE Training Loss:  0.45413222

100%|██████████| 1312/1312 [00:04<00:00, 288.51it/s]


CE Training Loss:  0.4862225333896783 SE Training Loss:  0.12191262748467957 Training Accuracy:  82.7547619047619 CE Test Loss:  0.5005415681832194 SE Test Loss:  0.12328618461074277 Testing Accuracy:  82.7
Epoch:  2


100%|██████████| 1312/1312 [00:13<00:00, 100.46it/s]


CE Training Loss:  0.5181062252631036 SE Training Loss:  0.12582603350140725 Training Accuracy:  82.43571428571428 CE Test Loss:  0.5432675259069518 SE Test Loss:  0.12940439313743962 Testing Accuracy:  81.60555555555555
Epoch:  3


100%|██████████| 1312/1312 [00:13<00:00, 100.86it/s]


CE Training Loss:  0.5355622666185356 SE Training Loss:  0.12321470682192086 Training Accuracy:  83.05952380952381 CE Test Loss:  0.5806175925379365 SE Test Loss:  0.1280969990235507 Testing Accuracy:  82.65
Epoch:  4


100%|██████████| 1312/1312 [00:12<00:00, 105.07it/s]


CE Training Loss:  0.5723851086840612 SE Training Loss:  0.12774406280312803 Training Accuracy:  82.60952380952381 CE Test Loss:  0.6294081028257024 SE Test Loss:  0.13267322494732717 Testing Accuracy:  82.0
Epoch:  5


100%|██████████| 1312/1312 [00:14<00:00, 89.01it/s]


CE Training Loss:  0.5516856252168352 SE Training Loss:  0.12095027323668156 Training Accuracy:  83.57380952380953 CE Test Loss:  0.6125414922844835 SE Test Loss:  0.1275620065336542 Testing Accuracy:  82.75555555555556


0,1
tr_accuracy,▁█████
tr_loss_CE,█▁▁▁▁▁
tr_loss_SE,█▁▁▁▁▁
val_accuracy,▁█████
val_loss_CE,█▁▁▁▁▁
val_loss_SE,█▁▁▁▁▁

0,1
tr_accuracy,83.57381
tr_loss_CE,0.55169
tr_loss_SE,0.12095
val_accuracy,82.75556
val_loss_CE,0.61254
val_loss_SE,0.12756


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: dghsxtmg with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss_function: cross_entropy
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: rmsprop
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  rmsprop
Learning rate (initial):  0.001
Batch size:  32
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 784) Bias vector dimention (64, 1)
----------------
Layer:  2  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  3  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  4  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  5  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 64) Bias vector dimention (10, 1)
----------------
1312.0

 Start of training
CE Training Loss:  2.314799915698758 SE Training Loss:  0.45098149

100%|██████████| 1312/1312 [00:06<00:00, 198.10it/s]


CE Training Loss:  0.6209734304428416 SE Training Loss:  0.15927172533871312 Training Accuracy:  76.94285714285714 CE Test Loss:  0.6225024167214759 SE Test Loss:  0.1592498911878455 Testing Accuracy:  76.94444444444444
Epoch:  2


100%|██████████| 1312/1312 [00:03<00:00, 366.77it/s]


CE Training Loss:  0.5939082511005433 SE Training Loss:  0.1516493239992606 Training Accuracy:  78.28571428571429 CE Test Loss:  0.5961574709325759 SE Test Loss:  0.15179049692725397 Testing Accuracy:  78.28888888888889
Epoch:  3


100%|██████████| 1312/1312 [00:04<00:00, 313.77it/s]


CE Training Loss:  0.5751561236514326 SE Training Loss:  0.14620483081706406 Training Accuracy:  79.1452380952381 CE Test Loss:  0.577991706732631 SE Test Loss:  0.14652379966982731 Testing Accuracy:  79.06111111111112
Epoch:  4


100%|██████████| 1312/1312 [00:05<00:00, 240.81it/s]


CE Training Loss:  0.5750130677725893 SE Training Loss:  0.14618249443757203 Training Accuracy:  79.20238095238095 CE Test Loss:  0.5781596887160094 SE Test Loss:  0.14669736896723 Testing Accuracy:  79.19444444444444
Epoch:  5


100%|██████████| 1312/1312 [00:03<00:00, 373.95it/s]


CE Training Loss:  0.5687877141680044 SE Training Loss:  0.1444089507991715 Training Accuracy:  79.48809523809524 CE Test Loss:  0.5728029397508071 SE Test Loss:  0.14518691233255454 Testing Accuracy:  79.35555555555555


0,1
tr_accuracy,▁█████
tr_loss_CE,█▁▁▁▁▁
tr_loss_SE,█▁▁▁▁▁
val_accuracy,▁█████
val_loss_CE,█▁▁▁▁▁
val_loss_SE,█▁▁▁▁▁

0,1
tr_accuracy,79.4881
tr_loss_CE,0.56879
tr_loss_SE,0.14441
val_accuracy,79.35556
val_loss_CE,0.5728
val_loss_SE,0.14519


[34m[1mwandb[0m: Agent Starting Run: ql2jllog with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss_function: cross_entropy
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  adam
Learning rate (initial):  0.001
Batch size:  32
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 784) Bias vector dimention (64, 1)
----------------
Layer:  2  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  3  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  4  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  5  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 64) Bias vector dimention (10, 1)
----------------
1312.0

 Start of training
CE Training Loss:  2.273887184637737 SE Training Loss:  0.44684772018

100%|██████████| 1312/1312 [00:04<00:00, 263.32it/s]


CE Training Loss:  0.437877132688473 SE Training Loss:  0.11178653345010961 Training Accuracy:  84.26666666666667 CE Test Loss:  0.4499653557314756 SE Test Loss:  0.11337015446630852 Testing Accuracy:  83.92222222222222
Epoch:  2


100%|██████████| 1312/1312 [00:06<00:00, 187.96it/s]


CE Training Loss:  0.4533169492936528 SE Training Loss:  0.10564988570399576 Training Accuracy:  85.36666666666666 CE Test Loss:  0.4828213544734416 SE Test Loss:  0.10927876807897725 Testing Accuracy:  84.94444444444444
Epoch:  3


100%|██████████| 1312/1312 [00:04<00:00, 290.36it/s]


CE Training Loss:  0.41666114563837947 SE Training Loss:  0.09966084138109953 Training Accuracy:  86.22142857142858 CE Test Loss:  0.438912166467687 SE Test Loss:  0.10217734911758215 Testing Accuracy:  86.07777777777778
Epoch:  4


100%|██████████| 1312/1312 [00:04<00:00, 292.28it/s]


CE Training Loss:  0.4236274062984078 SE Training Loss:  0.09432469302998193 Training Accuracy:  86.97380952380952 CE Test Loss:  0.4772595349893522 SE Test Loss:  0.10004384415253717 Testing Accuracy:  86.4
Epoch:  5


100%|██████████| 1312/1312 [00:07<00:00, 186.76it/s]


CE Training Loss:  nan SE Training Loss:  nan Training Accuracy:  10.073809523809524 CE Test Loss:  nan SE Test Loss:  nan Testing Accuracy:  9.827777777777778


0,1
tr_accuracy,▁████▁
tr_loss_CE,█▁▁▁▁
tr_loss_SE,█▁▁▁▁
val_accuracy,▁████▁
val_loss_CE,█▁▁▁▁
val_loss_SE,█▁▁▁▁

0,1
tr_accuracy,10.07381
tr_loss_CE,
tr_loss_SE,
val_accuracy,9.82778
val_loss_CE,
val_loss_SE,


[34m[1mwandb[0m: Agent Starting Run: 9rxekkbl with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss_function: cross_entropy
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  adam
Learning rate (initial):  0.001
Batch size:  32
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 784) Bias vector dimention (64, 1)
----------------
Layer:  2  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  3  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  4  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  5  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 64) Bias vector dimention (10, 1)
----------------
1312.0

 Start of training
CE Training Loss:  2.3145692298318417 SE Training Loss:  0.4511486516

100%|██████████| 1312/1312 [00:06<00:00, 193.53it/s]


CE Training Loss:  0.4478808762199854 SE Training Loss:  0.11478064082137088 Training Accuracy:  83.66190476190476 CE Test Loss:  0.45737031146421436 SE Test Loss:  0.11635789167864302 Testing Accuracy:  83.72777777777777
Epoch:  2


100%|██████████| 1312/1312 [00:06<00:00, 200.57it/s]


CE Training Loss:  0.4194587734072845 SE Training Loss:  0.10257531866756091 Training Accuracy:  85.66666666666667 CE Test Loss:  0.4415223897879802 SE Test Loss:  0.10564342077381765 Testing Accuracy:  85.33333333333333
Epoch:  3


100%|██████████| 1312/1312 [00:15<00:00, 86.52it/s] 


CE Training Loss:  0.4225944603100199 SE Training Loss:  0.10362477877135624 Training Accuracy:  85.44285714285714 CE Test Loss:  0.4546375822433824 SE Test Loss:  0.10747607593900108 Testing Accuracy:  84.96666666666667
Epoch:  4


100%|██████████| 1312/1312 [00:15<00:00, 85.28it/s] 


CE Training Loss:  0.39718643556786176 SE Training Loss:  0.09524204206274614 Training Accuracy:  86.75952380952381 CE Test Loss:  0.43903090017679003 SE Test Loss:  0.10151388899629221 Testing Accuracy:  85.90555555555555
Epoch:  5


100%|██████████| 1312/1312 [00:16<00:00, 78.52it/s]


CE Training Loss:  0.4632033214398019 SE Training Loss:  0.10885679311055001 Training Accuracy:  85.00714285714285 CE Test Loss:  0.5153054486970757 SE Test Loss:  0.11633518865904927 Testing Accuracy:  84.20555555555555


0,1
tr_accuracy,▁█████
tr_loss_CE,█▁▁▁▁▁
tr_loss_SE,█▁▁▁▁▁
val_accuracy,▁█████
val_loss_CE,█▁▁▁▁▁
val_loss_SE,█▁▁▁▁▁

0,1
tr_accuracy,85.00714
tr_loss_CE,0.4632
tr_loss_SE,0.10886
val_accuracy,84.20556
val_loss_CE,0.51531
val_loss_SE,0.11634


[34m[1mwandb[0m: Agent Starting Run: l1iy6ygf with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss_function: cross_entropy
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  adam
Learning rate (initial):  0.001
Batch size:  32
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 784) Bias vector dimention (64, 1)
----------------
Layer:  2  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  3  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  4  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  5  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 64) Bias vector dimention (10, 1)
----------------
1312.0

 Start of training
CE Training Loss:  2.306157412569674 SE Training Loss:  0.45018945593

100%|██████████| 1312/1312 [00:04<00:00, 276.58it/s]


CE Training Loss:  0.5709319684238825 SE Training Loss:  0.14268189861004374 Training Accuracy:  80.56428571428572 CE Test Loss:  0.57382235819119 SE Test Loss:  0.14301609245637079 Testing Accuracy:  80.5
Epoch:  2


100%|██████████| 1312/1312 [00:06<00:00, 208.61it/s]


CE Training Loss:  0.5489338891805278 SE Training Loss:  0.1369870745722707 Training Accuracy:  81.27380952380952 CE Test Loss:  0.5526746012082489 SE Test Loss:  0.13759625141858534 Testing Accuracy:  81.17222222222222
Epoch:  3


100%|██████████| 1312/1312 [00:05<00:00, 236.82it/s]


CE Training Loss:  0.5456371278773541 SE Training Loss:  0.13541540482673559 Training Accuracy:  81.43809523809524 CE Test Loss:  0.5502588163892274 SE Test Loss:  0.1363826851567611 Testing Accuracy:  81.51666666666667
Epoch:  4


100%|██████████| 1312/1312 [00:04<00:00, 266.56it/s]


CE Training Loss:  0.5371369274455149 SE Training Loss:  0.13362391946929095 Training Accuracy:  81.66428571428571 CE Test Loss:  0.5418817234783259 SE Test Loss:  0.1346710404500202 Testing Accuracy:  81.73333333333333
Epoch:  5


100%|██████████| 1312/1312 [00:07<00:00, 179.46it/s]


CE Training Loss:  0.5382362775229914 SE Training Loss:  0.13365773032089617 Training Accuracy:  81.61904761904762 CE Test Loss:  0.5433010860517886 SE Test Loss:  0.13481787720539776 Testing Accuracy:  81.63333333333334


0,1
tr_accuracy,▁█████
tr_loss_CE,█▁▁▁▁▁
tr_loss_SE,█▁▁▁▁▁
val_accuracy,▁█████
val_loss_CE,█▁▁▁▁▁
val_loss_SE,█▁▁▁▁▁

0,1
tr_accuracy,81.61905
tr_loss_CE,0.53824
tr_loss_SE,0.13366
val_accuracy,81.63333
val_loss_CE,0.5433
val_loss_SE,0.13482


[34m[1mwandb[0m: Agent Starting Run: rz09g8mv with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss_function: cross_entropy
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  nadam
Learning rate (initial):  0.001
Batch size:  32
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 784) Bias vector dimention (64, 1)
----------------
Layer:  2  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  3  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  4  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  5  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 64) Bias vector dimention (10, 1)
----------------
1312.0

 Start of training
CE Training Loss:  2.3152946053712995 SE Training Loss:  0.452105365

100%|██████████| 1312/1312 [00:07<00:00, 172.16it/s]


CE Training Loss:  0.438329836437935 SE Training Loss:  0.11009376909439306 Training Accuracy:  84.69761904761904 CE Test Loss:  0.4497497431422054 SE Test Loss:  0.11182687303279391 Testing Accuracy:  84.50555555555556
Epoch:  2


100%|██████████| 1312/1312 [00:05<00:00, 243.95it/s]


CE Training Loss:  0.41571590584640955 SE Training Loss:  0.1033152715354114 Training Accuracy:  85.57380952380953 CE Test Loss:  0.44484324632081995 SE Test Loss:  0.10795085092357407 Testing Accuracy:  85.17222222222222
Epoch:  3


100%|██████████| 1312/1312 [00:07<00:00, 165.87it/s]


CE Training Loss:  0.4618530235661234 SE Training Loss:  0.10633054832811033 Training Accuracy:  85.33809523809524 CE Test Loss:  0.4951810167561186 SE Test Loss:  0.11046707279849477 Testing Accuracy:  84.79444444444445
Epoch:  4


100%|██████████| 1312/1312 [00:04<00:00, 265.53it/s]


CE Training Loss:  0.4819377507202414 SE Training Loss:  0.1021215498136242 Training Accuracy:  85.94285714285714 CE Test Loss:  0.5468778423752488 SE Test Loss:  0.10820504667804699 Testing Accuracy:  85.24444444444444
Epoch:  5


100%|██████████| 1312/1312 [00:07<00:00, 165.02it/s]


CE Training Loss:  0.521389260960463 SE Training Loss:  0.10673867873150142 Training Accuracy:  85.40238095238095 CE Test Loss:  0.580576917323205 SE Test Loss:  0.11311852716015705 Testing Accuracy:  84.56666666666666


0,1
tr_accuracy,▁█████
tr_loss_CE,█▁▁▁▁▁
tr_loss_SE,█▁▁▁▁▁
val_accuracy,▁█████
val_loss_CE,█▁▁▁▁▂
val_loss_SE,█▁▁▁▁▁

0,1
tr_accuracy,85.40238
tr_loss_CE,0.52139
tr_loss_SE,0.10674
val_accuracy,84.56667
val_loss_CE,0.58058
val_loss_SE,0.11312


[34m[1mwandb[0m: Agent Starting Run: vaw3s35k with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss_function: cross_entropy
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  nadam
Learning rate (initial):  0.001
Batch size:  32
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 784) Bias vector dimention (64, 1)
----------------
Layer:  2  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  3  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  4  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  5  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 64) Bias vector dimention (10, 1)
----------------
1312.0

 Start of training
CE Training Loss:  2.334980995833987 SE Training Loss:  0.4532663692

100%|██████████| 1312/1312 [00:11<00:00, 116.01it/s]


CE Training Loss:  0.4442129824667082 SE Training Loss:  0.1133065511631198 Training Accuracy:  84.12380952380953 CE Test Loss:  0.45453362969935956 SE Test Loss:  0.11554306521372448 Testing Accuracy:  83.9
Epoch:  2


100%|██████████| 1312/1312 [00:20<00:00, 63.61it/s]


CE Training Loss:  0.41470226542219374 SE Training Loss:  0.10197178383361154 Training Accuracy:  85.87857142857143 CE Test Loss:  0.43794974932794345 SE Test Loss:  0.10624699025059949 Testing Accuracy:  85.35
Epoch:  3


100%|██████████| 1312/1312 [00:18<00:00, 72.64it/s]


CE Training Loss:  0.4466039250280581 SE Training Loss:  0.10422322348603322 Training Accuracy:  85.6547619047619 CE Test Loss:  0.4871789703814274 SE Test Loss:  0.11002803627610808 Testing Accuracy:  85.12222222222222
Epoch:  4


100%|██████████| 1312/1312 [00:17<00:00, 73.69it/s]


CE Training Loss:  0.4189734652505151 SE Training Loss:  0.1020096135886444 Training Accuracy:  85.69047619047619 CE Test Loss:  0.46859184599450077 SE Test Loss:  0.10962613786427779 Testing Accuracy:  84.64444444444445
Epoch:  5


100%|██████████| 1312/1312 [00:21<00:00, 60.18it/s]


CE Training Loss:  0.41963551481991723 SE Training Loss:  0.098683290915159 Training Accuracy:  86.12619047619047 CE Test Loss:  0.47731346402514657 SE Test Loss:  0.1063833556226901 Testing Accuracy:  85.35


0,1
tr_accuracy,▁█████
tr_loss_CE,█▁▁▁▁▁
tr_loss_SE,█▁▁▁▁▁
val_accuracy,▁█████
val_loss_CE,█▁▁▁▁▁
val_loss_SE,█▁▁▁▁▁

0,1
tr_accuracy,86.12619
tr_loss_CE,0.41964
tr_loss_SE,0.09868
val_accuracy,85.35
val_loss_CE,0.47731
val_loss_SE,0.10638


[34m[1mwandb[0m: Agent Starting Run: pxdv26ol with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss_function: cross_entropy
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  nadam
Learning rate (initial):  0.001
Batch size:  32
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 784) Bias vector dimention (64, 1)
----------------
Layer:  2  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  3  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  4  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  5  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 64) Bias vector dimention (10, 1)
----------------
1312.0

 Start of training
CE Training Loss:  2.3141230318283683 SE Training Loss:  0.451158362

100%|██████████| 1312/1312 [00:06<00:00, 196.74it/s]


CE Training Loss:  0.5677021912221735 SE Training Loss:  0.14220749218780498 Training Accuracy:  80.4095238095238 CE Test Loss:  0.570554786702519 SE Test Loss:  0.14255894167114191 Testing Accuracy:  80.63888888888889
Epoch:  2


100%|██████████| 1312/1312 [00:05<00:00, 226.60it/s]


CE Training Loss:  0.5539465286050201 SE Training Loss:  0.13899603020456897 Training Accuracy:  80.85 CE Test Loss:  0.5576041482045422 SE Test Loss:  0.1395050039369456 Testing Accuracy:  80.92222222222222
Epoch:  3


100%|██████████| 1312/1312 [00:06<00:00, 214.05it/s]


CE Training Loss:  0.5504722615001935 SE Training Loss:  0.13800599545364745 Training Accuracy:  81.02857142857142 CE Test Loss:  0.5548431841050995 SE Test Loss:  0.13872327615972507 Testing Accuracy:  81.02222222222223
Epoch:  4


100%|██████████| 1312/1312 [00:06<00:00, 192.24it/s]


CE Training Loss:  0.5438052694199367 SE Training Loss:  0.13609856950016433 Training Accuracy:  81.25714285714285 CE Test Loss:  0.5488455901081079 SE Test Loss:  0.1370785552673392 Testing Accuracy:  81.27222222222223
Epoch:  5


100%|██████████| 1312/1312 [00:05<00:00, 260.64it/s]


CE Training Loss:  0.5350770482500847 SE Training Loss:  0.1337583051220952 Training Accuracy:  81.74285714285715 CE Test Loss:  0.5410362438213756 SE Test Loss:  0.13508207433270847 Testing Accuracy:  81.7611111111111


0,1
tr_accuracy,▁█████
tr_loss_CE,█▁▁▁▁▁
tr_loss_SE,█▁▁▁▁▁
val_accuracy,▁█████
val_loss_CE,█▁▁▁▁▁
val_loss_SE,█▁▁▁▁▁

0,1
tr_accuracy,81.74286
tr_loss_CE,0.53508
tr_loss_SE,0.13376
val_accuracy,81.76111
val_loss_CE,0.54104
val_loss_SE,0.13508


[34m[1mwandb[0m: Agent Starting Run: xzlskivv with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss_function: squared_error
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: rmsprop
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  rmsprop
Learning rate (initial):  0.001
Batch size:  32
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 784) Bias vector dimention (64, 1)
----------------
Layer:  2  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  3  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  4  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  5  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 64) Bias vector dimention (10, 1)
----------------
1312.0

 Start of training
CE Training Loss:  2.314106523572923 SE Training Loss:  0.45188962

100%|██████████| 1312/1312 [00:04<00:00, 274.66it/s]


CE Training Loss:  0.5085272715995395 SE Training Loss:  0.12563792495290055 Training Accuracy:  82.61190476190477 CE Test Loss:  0.519388234008961 SE Test Loss:  0.1281235492058002 Testing Accuracy:  82.33333333333333
Epoch:  2


100%|██████████| 1312/1312 [00:05<00:00, 235.95it/s]


CE Training Loss:  nan SE Training Loss:  nan Training Accuracy:  10.073809523809524 CE Test Loss:  nan SE Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  3


100%|██████████| 1312/1312 [00:03<00:00, 351.27it/s]


CE Training Loss:  nan SE Training Loss:  nan Training Accuracy:  10.073809523809524 CE Test Loss:  nan SE Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  4


100%|██████████| 1312/1312 [00:04<00:00, 281.00it/s]


CE Training Loss:  nan SE Training Loss:  nan Training Accuracy:  10.073809523809524 CE Test Loss:  nan SE Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  5


100%|██████████| 1312/1312 [00:05<00:00, 256.99it/s]


CE Training Loss:  nan SE Training Loss:  nan Training Accuracy:  10.073809523809524 CE Test Loss:  nan SE Test Loss:  nan Testing Accuracy:  9.827777777777778


0,1
tr_accuracy,▁█▁▁▁▁
tr_loss_CE,█▁
tr_loss_SE,█▁
val_accuracy,▁█▁▁▁▁
val_loss_CE,█▁
val_loss_SE,█▁

0,1
tr_accuracy,10.07381
tr_loss_CE,
tr_loss_SE,
val_accuracy,9.82778
val_loss_CE,
val_loss_SE,


[34m[1mwandb[0m: Agent Starting Run: bbkb7zxh with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss_function: squared_error
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: rmsprop
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  rmsprop
Learning rate (initial):  0.001
Batch size:  32
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 784) Bias vector dimention (64, 1)
----------------
Layer:  2  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  3  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  4  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  5  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 64) Bias vector dimention (10, 1)
----------------
1312.0

 Start of training
CE Training Loss:  2.341312248833033 SE Training Loss:  0.45291623

100%|██████████| 1312/1312 [00:06<00:00, 195.30it/s]


CE Training Loss:  0.5185616666625974 SE Training Loss:  0.13128664057198297 Training Accuracy:  81.71190476190476 CE Test Loss:  0.530434383082801 SE Test Loss:  0.13394475988565088 Testing Accuracy:  81.27222222222223
Epoch:  2


100%|██████████| 1312/1312 [00:14<00:00, 92.96it/s] 


CE Training Loss:  nan SE Training Loss:  nan Training Accuracy:  10.073809523809524 CE Test Loss:  nan SE Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  3


100%|██████████| 1312/1312 [00:03<00:00, 342.28it/s]


CE Training Loss:  nan SE Training Loss:  nan Training Accuracy:  10.073809523809524 CE Test Loss:  nan SE Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  4


100%|██████████| 1312/1312 [00:04<00:00, 264.93it/s]


CE Training Loss:  nan SE Training Loss:  nan Training Accuracy:  10.073809523809524 CE Test Loss:  nan SE Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  5


100%|██████████| 1312/1312 [00:05<00:00, 258.32it/s]


CE Training Loss:  nan SE Training Loss:  nan Training Accuracy:  10.073809523809524 CE Test Loss:  nan SE Test Loss:  nan Testing Accuracy:  9.827777777777778


0,1
tr_accuracy,▁█▁▁▁▁
tr_loss_CE,█▁
tr_loss_SE,█▁
val_accuracy,▁█▁▁▁▁
val_loss_CE,█▁
val_loss_SE,█▁

0,1
tr_accuracy,10.07381
tr_loss_CE,
tr_loss_SE,
val_accuracy,9.82778
val_loss_CE,
val_loss_SE,


[34m[1mwandb[0m: Agent Starting Run: kjw2c92o with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss_function: squared_error
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: rmsprop
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  rmsprop
Learning rate (initial):  0.001
Batch size:  32
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 784) Bias vector dimention (64, 1)
----------------
Layer:  2  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  3  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  4  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  5  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 64) Bias vector dimention (10, 1)
----------------
1312.0

 Start of training
CE Training Loss:  2.298359601946076 SE Training Loss:  0.45014726

100%|██████████| 1312/1312 [00:06<00:00, 201.07it/s]


CE Training Loss:  1.108433659105615 SE Training Loss:  0.2766111608011072 Training Accuracy:  54.11190476190476 CE Test Loss:  1.1089382393187581 SE Test Loss:  0.27686930250503233 Testing Accuracy:  54.272222222222226
Epoch:  2


100%|██████████| 1312/1312 [00:04<00:00, 311.12it/s]


CE Training Loss:  1.5406332441215722 SE Training Loss:  0.3036206929887955 Training Accuracy:  47.26190476190476 CE Test Loss:  1.515274333211717 SE Test Loss:  0.30294772341778214 Testing Accuracy:  47.46666666666667
Epoch:  3


100%|██████████| 1312/1312 [00:05<00:00, 227.01it/s]


CE Training Loss:  nan SE Training Loss:  nan Training Accuracy:  10.073809523809524 CE Test Loss:  nan SE Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  4


100%|██████████| 1312/1312 [00:04<00:00, 274.10it/s]


CE Training Loss:  nan SE Training Loss:  nan Training Accuracy:  10.073809523809524 CE Test Loss:  nan SE Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  5


100%|██████████| 1312/1312 [00:03<00:00, 341.33it/s]


CE Training Loss:  nan SE Training Loss:  nan Training Accuracy:  10.073809523809524 CE Test Loss:  nan SE Test Loss:  nan Testing Accuracy:  9.827777777777778


0,1
tr_accuracy,▁█▇▁▁▁
tr_loss_CE,█▁▄
tr_loss_SE,█▁▂
val_accuracy,▁█▇▁▁▁
val_loss_CE,█▁▃
val_loss_SE,█▁▂

0,1
tr_accuracy,10.07381
tr_loss_CE,
tr_loss_SE,
val_accuracy,9.82778
val_loss_CE,
val_loss_SE,


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: ejzlritb with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss_function: squared_error
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  adam
Learning rate (initial):  0.001
Batch size:  32
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 784) Bias vector dimention (64, 1)
----------------
Layer:  2  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  3  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  4  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  5  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 64) Bias vector dimention (10, 1)
----------------
1312.0

 Start of training
CE Training Loss:  2.3244354406798062 SE Training Loss:  0.4522635479

100%|██████████| 1312/1312 [00:06<00:00, 189.30it/s]


CE Training Loss:  0.4236705107519774 SE Training Loss:  0.10585679804246982 Training Accuracy:  85.06904761904762 CE Test Loss:  0.4346377830680312 SE Test Loss:  0.10770010525211574 Testing Accuracy:  84.83888888888889
Epoch:  2


100%|██████████| 1312/1312 [00:04<00:00, 279.32it/s]


CE Training Loss:  nan SE Training Loss:  nan Training Accuracy:  10.073809523809524 CE Test Loss:  nan SE Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  3


100%|██████████| 1312/1312 [00:07<00:00, 172.73it/s]


CE Training Loss:  nan SE Training Loss:  nan Training Accuracy:  10.073809523809524 CE Test Loss:  nan SE Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  4


100%|██████████| 1312/1312 [00:04<00:00, 280.14it/s]


CE Training Loss:  nan SE Training Loss:  nan Training Accuracy:  10.073809523809524 CE Test Loss:  nan SE Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  5


100%|██████████| 1312/1312 [00:06<00:00, 214.18it/s]


CE Training Loss:  nan SE Training Loss:  nan Training Accuracy:  10.073809523809524 CE Test Loss:  nan SE Test Loss:  nan Testing Accuracy:  9.827777777777778


0,1
tr_accuracy,▁█▁▁▁▁
tr_loss_CE,█▁
tr_loss_SE,█▁
val_accuracy,▁█▁▁▁▁
val_loss_CE,█▁
val_loss_SE,█▁

0,1
tr_accuracy,10.07381
tr_loss_CE,
tr_loss_SE,
val_accuracy,9.82778
val_loss_CE,
val_loss_SE,


[34m[1mwandb[0m: Agent Starting Run: 3mp1rrlz with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss_function: squared_error
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  adam
Learning rate (initial):  0.001
Batch size:  32
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 784) Bias vector dimention (64, 1)
----------------
Layer:  2  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  3  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  4  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  5  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 64) Bias vector dimention (10, 1)
----------------
1312.0

 Start of training
CE Training Loss:  2.2950563806166193 SE Training Loss:  0.4490769406

100%|██████████| 1312/1312 [00:05<00:00, 249.56it/s]


CE Training Loss:  0.4656794176358535 SE Training Loss:  0.11717991335558935 Training Accuracy:  83.46428571428571 CE Test Loss:  0.4788141643120804 SE Test Loss:  0.11935800200862941 Testing Accuracy:  83.32777777777778
Epoch:  2


100%|██████████| 1312/1312 [00:10<00:00, 121.67it/s]


CE Training Loss:  0.4779296804094183 SE Training Loss:  0.11583293369442446 Training Accuracy:  84.21428571428571 CE Test Loss:  0.4995082558139709 SE Test Loss:  0.11979478514391045 Testing Accuracy:  83.70555555555555
Epoch:  3


100%|██████████| 1312/1312 [00:10<00:00, 120.79it/s]


CE Training Loss:  nan SE Training Loss:  nan Training Accuracy:  10.073809523809524 CE Test Loss:  nan SE Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  4


100%|██████████| 1312/1312 [00:07<00:00, 180.48it/s]


CE Training Loss:  nan SE Training Loss:  nan Training Accuracy:  10.073809523809524 CE Test Loss:  nan SE Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  5


100%|██████████| 1312/1312 [00:04<00:00, 273.23it/s]


CE Training Loss:  nan SE Training Loss:  nan Training Accuracy:  10.073809523809524 CE Test Loss:  nan SE Test Loss:  nan Testing Accuracy:  9.827777777777778


0,1
tr_accuracy,▂██▁▁▁
tr_loss_CE,█▁▁
tr_loss_SE,█▁▁
val_accuracy,▂██▁▁▁
val_loss_CE,█▁▁
val_loss_SE,█▁▁

0,1
tr_accuracy,10.07381
tr_loss_CE,
tr_loss_SE,
val_accuracy,9.82778
val_loss_CE,
val_loss_SE,


[34m[1mwandb[0m: Agent Starting Run: q4p2iet0 with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss_function: squared_error
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  adam
Learning rate (initial):  0.001
Batch size:  32
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 784) Bias vector dimention (64, 1)
----------------
Layer:  2  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  3  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  4  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  5  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 64) Bias vector dimention (10, 1)
----------------
1312.0

 Start of training
CE Training Loss:  2.3117845221007793 SE Training Loss:  0.4507098010

100%|██████████| 1312/1312 [00:04<00:00, 288.15it/s]


CE Training Loss:  1.1316941554690967 SE Training Loss:  0.2629707920413136 Training Accuracy:  60.50952380952381 CE Test Loss:  1.1431071096698455 SE Test Loss:  0.26432092164726967 Testing Accuracy:  60.18888888888889
Epoch:  2


100%|██████████| 1312/1312 [00:06<00:00, 200.96it/s]


CE Training Loss:  1.3007088120296844 SE Training Loss:  0.2571202576476905 Training Accuracy:  62.31666666666667 CE Test Loss:  1.3092083834081993 SE Test Loss:  0.25568185607502025 Testing Accuracy:  62.6
Epoch:  3


100%|██████████| 1312/1312 [00:05<00:00, 243.81it/s]


CE Training Loss:  1.3424914364584006 SE Training Loss:  0.28431021722775307 Training Accuracy:  56.83095238095238 CE Test Loss:  1.3602221843887488 SE Test Loss:  0.2851638922273067 Testing Accuracy:  56.93333333333333
Epoch:  4


100%|██████████| 1312/1312 [00:04<00:00, 268.48it/s]


CE Training Loss:  1.1046390496456016 SE Training Loss:  0.26643665006931044 Training Accuracy:  57.82857142857143 CE Test Loss:  1.0956793571714163 SE Test Loss:  0.26522170399681777 Testing Accuracy:  58.21666666666667
Epoch:  5


100%|██████████| 1312/1312 [00:07<00:00, 175.52it/s]


CE Training Loss:  1.7207633893107652 SE Training Loss:  0.2965735493126206 Training Accuracy:  55.65952380952381 CE Test Loss:  1.7526254912692858 SE Test Loss:  0.2982542965034539 Testing Accuracy:  55.33888888888889


0,1
tr_accuracy,▁██▇▇▇
tr_loss_CE,█▁▂▂▁▅
tr_loss_SE,█▁▁▂▁▂
val_accuracy,▁██▇▇▇
val_loss_CE,█▁▂▃▁▅
val_loss_SE,█▁▁▂▁▃

0,1
tr_accuracy,55.65952
tr_loss_CE,1.72076
tr_loss_SE,0.29657
val_accuracy,55.33889
val_loss_CE,1.75263
val_loss_SE,0.29825


[34m[1mwandb[0m: Agent Starting Run: kk6w2l9p with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss_function: squared_error
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  nadam
Learning rate (initial):  0.001
Batch size:  32
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 784) Bias vector dimention (64, 1)
----------------
Layer:  2  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  3  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  4  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  5  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 64) Bias vector dimention (10, 1)
----------------
1312.0

 Start of training
CE Training Loss:  2.3190242892685404 SE Training Loss:  0.451525330

100%|██████████| 1312/1312 [00:08<00:00, 159.23it/s]


CE Training Loss:  0.4805156189943286 SE Training Loss:  0.11822645283160071 Training Accuracy:  83.3404761904762 CE Test Loss:  0.4885109693424717 SE Test Loss:  0.11902000427470082 Testing Accuracy:  83.31666666666666
Epoch:  2


100%|██████████| 1312/1312 [00:05<00:00, 257.99it/s]


CE Training Loss:  0.4174600023257962 SE Training Loss:  0.09985286625440347 Training Accuracy:  86.18571428571428 CE Test Loss:  0.43612981335219114 SE Test Loss:  0.10297585465912824 Testing Accuracy:  85.86666666666666
Epoch:  3


100%|██████████| 1312/1312 [00:08<00:00, 153.71it/s]


CE Training Loss:  nan SE Training Loss:  nan Training Accuracy:  10.073809523809524 CE Test Loss:  nan SE Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  4


100%|██████████| 1312/1312 [00:05<00:00, 249.27it/s]


CE Training Loss:  nan SE Training Loss:  nan Training Accuracy:  10.073809523809524 CE Test Loss:  nan SE Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  5


100%|██████████| 1312/1312 [00:07<00:00, 166.57it/s]


CE Training Loss:  nan SE Training Loss:  nan Training Accuracy:  10.073809523809524 CE Test Loss:  nan SE Test Loss:  nan Testing Accuracy:  9.827777777777778


0,1
tr_accuracy,▁██▁▁▁
tr_loss_CE,█▁▁
tr_loss_SE,█▁▁
val_accuracy,▁██▁▁▁
val_loss_CE,█▁▁
val_loss_SE,█▁▁

0,1
tr_accuracy,10.07381
tr_loss_CE,
tr_loss_SE,
val_accuracy,9.82778
val_loss_CE,
val_loss_SE,


[34m[1mwandb[0m: Agent Starting Run: e6tl2rkw with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss_function: squared_error
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  nadam
Learning rate (initial):  0.001
Batch size:  32
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 784) Bias vector dimention (64, 1)
----------------
Layer:  2  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  3  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  4  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  5  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 64) Bias vector dimention (10, 1)
----------------
1312.0

 Start of training
CE Training Loss:  2.3440621202744274 SE Training Loss:  0.453748444

100%|██████████| 1312/1312 [00:13<00:00, 96.43it/s]


CE Training Loss:  0.46391574175838374 SE Training Loss:  0.11655201401682964 Training Accuracy:  83.68333333333334 CE Test Loss:  0.4739725414858237 SE Test Loss:  0.11880751534618288 Testing Accuracy:  83.42222222222222
Epoch:  2


100%|██████████| 1312/1312 [00:16<00:00, 78.76it/s] 


CE Training Loss:  nan SE Training Loss:  nan Training Accuracy:  10.073809523809524 CE Test Loss:  nan SE Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  3


100%|██████████| 1312/1312 [00:05<00:00, 246.64it/s]


CE Training Loss:  nan SE Training Loss:  nan Training Accuracy:  10.073809523809524 CE Test Loss:  nan SE Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  4


100%|██████████| 1312/1312 [00:07<00:00, 166.96it/s]


CE Training Loss:  nan SE Training Loss:  nan Training Accuracy:  10.073809523809524 CE Test Loss:  nan SE Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  5


100%|██████████| 1312/1312 [00:05<00:00, 253.24it/s]


CE Training Loss:  nan SE Training Loss:  nan Training Accuracy:  10.073809523809524 CE Test Loss:  nan SE Test Loss:  nan Testing Accuracy:  9.827777777777778


0,1
tr_accuracy,▁█▁▁▁▁
tr_loss_CE,█▁
tr_loss_SE,█▁
val_accuracy,▁█▁▁▁▁
val_loss_CE,█▁
val_loss_SE,█▁

0,1
tr_accuracy,10.07381
tr_loss_CE,
tr_loss_SE,
val_accuracy,9.82778
val_loss_CE,
val_loss_SE,


[34m[1mwandb[0m: Agent Starting Run: 1ufnpo6t with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	loss_function: squared_error
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  nadam
Learning rate (initial):  0.001
Batch size:  32
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 784) Bias vector dimention (64, 1)
----------------
Layer:  2  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  3  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  4  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  5  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 64) Bias vector dimention (10, 1)
----------------
1312.0

 Start of training
CE Training Loss:  2.3232170673751713 SE Training Loss:  0.452581881

100%|██████████| 1312/1312 [00:06<00:00, 196.55it/s]


CE Training Loss:  1.2348555345933119 SE Training Loss:  0.29243825654731836 Training Accuracy:  51.44761904761905 CE Test Loss:  1.2336914307049702 SE Test Loss:  0.29191985044960267 Testing Accuracy:  51.18888888888889
Epoch:  2


100%|██████████| 1312/1312 [00:05<00:00, 255.46it/s]


CE Training Loss:  nan SE Training Loss:  nan Training Accuracy:  10.073809523809524 CE Test Loss:  nan SE Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  3


100%|██████████| 1312/1312 [00:08<00:00, 160.34it/s]


CE Training Loss:  nan SE Training Loss:  nan Training Accuracy:  10.073809523809524 CE Test Loss:  nan SE Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  4


100%|██████████| 1312/1312 [00:05<00:00, 239.16it/s]


CE Training Loss:  nan SE Training Loss:  nan Training Accuracy:  10.073809523809524 CE Test Loss:  nan SE Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  5


100%|██████████| 1312/1312 [00:08<00:00, 158.82it/s]


CE Training Loss:  nan SE Training Loss:  nan Training Accuracy:  10.073809523809524 CE Test Loss:  nan SE Test Loss:  nan Testing Accuracy:  9.827777777777778


0,1
tr_accuracy,▁█▁▁▁▁
tr_loss_CE,█▁
tr_loss_SE,█▁
val_accuracy,▁█▁▁▁▁
val_loss_CE,█▁
val_loss_SE,█▁

0,1
tr_accuracy,10.07381
tr_loss_CE,
tr_loss_SE,
val_accuracy,9.82778
val_loss_CE,
val_loss_SE,


[34m[1mwandb[0m: Agent Starting Run: 96dycp1w with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss_function: cross_entropy
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: rmsprop
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  rmsprop
Learning rate (initial):  0.0001
Batch size:  32
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 784) Bias vector dimention (64, 1)
----------------
Layer:  2  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  3  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  4  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  5  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 64) Bias vector dimention (10, 1)
----------------
1312.0

 Start of training
CE Training Loss:  2.3145639913390537 SE Training Loss:  0.451219

100%|██████████| 1312/1312 [00:07<00:00, 176.25it/s]


CE Training Loss:  0.5801191355881611 SE Training Loss:  0.14449368648797167 Training Accuracy:  80.18571428571428 CE Test Loss:  0.5835982656771219 SE Test Loss:  0.14550690676278483 Testing Accuracy:  80.12777777777778
Epoch:  2


100%|██████████| 1312/1312 [00:04<00:00, 303.69it/s]


CE Training Loss:  0.4889711360459225 SE Training Loss:  0.12208187367831323 Training Accuracy:  83.23095238095237 CE Test Loss:  0.4926707845975739 SE Test Loss:  0.1229072364463887 Testing Accuracy:  83.32777777777778
Epoch:  3


100%|██████████| 1312/1312 [00:04<00:00, 289.36it/s]


CE Training Loss:  0.4476760173749173 SE Training Loss:  0.11213250731471709 Training Accuracy:  84.53095238095239 CE Test Loss:  0.4545262636408938 SE Test Loss:  0.11352150440765742 Testing Accuracy:  84.38888888888889
Epoch:  4


100%|██████████| 1312/1312 [00:06<00:00, 218.05it/s]


CE Training Loss:  0.4230621663019236 SE Training Loss:  0.10622786328215122 Training Accuracy:  85.25238095238095 CE Test Loss:  0.43202346019341037 SE Test Loss:  0.10792761858612032 Testing Accuracy:  85.11111111111111
Epoch:  5


100%|██████████| 1312/1312 [00:03<00:00, 328.39it/s]


CE Training Loss:  0.4049778178755969 SE Training Loss:  0.10216406739583228 Training Accuracy:  85.74047619047619 CE Test Loss:  0.41595259853804206 SE Test Loss:  0.10411208083590741 Testing Accuracy:  85.63888888888889


0,1
tr_accuracy,▁█████
tr_loss_CE,█▂▁▁▁▁
tr_loss_SE,█▂▁▁▁▁
val_accuracy,▁█████
val_loss_CE,█▂▁▁▁▁
val_loss_SE,█▂▁▁▁▁

0,1
tr_accuracy,85.74048
tr_loss_CE,0.40498
tr_loss_SE,0.10216
val_accuracy,85.63889
val_loss_CE,0.41595
val_loss_SE,0.10411


[34m[1mwandb[0m: Agent Starting Run: 1300qhrf with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss_function: cross_entropy
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: rmsprop
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  rmsprop
Learning rate (initial):  0.0001
Batch size:  32
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 784) Bias vector dimention (64, 1)
----------------
Layer:  2  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  3  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  4  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  5  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 64) Bias vector dimention (10, 1)
----------------
1312.0

 Start of training
CE Training Loss:  2.303130340460058 SE Training Loss:  0.4499415

100%|██████████| 1312/1312 [00:04<00:00, 325.65it/s]


CE Training Loss:  0.5860299257926184 SE Training Loss:  0.14765528617856438 Training Accuracy:  80.0 CE Test Loss:  0.5913462557867093 SE Test Loss:  0.14922356282566887 Testing Accuracy:  79.83888888888889
Epoch:  2


100%|██████████| 1312/1312 [00:04<00:00, 314.93it/s]


CE Training Loss:  0.4894764793884502 SE Training Loss:  0.12331608766485323 Training Accuracy:  83.16428571428571 CE Test Loss:  0.49669349648758454 SE Test Loss:  0.12484607206498771 Testing Accuracy:  83.09444444444445
Epoch:  3


100%|██████████| 1312/1312 [00:07<00:00, 182.67it/s]


CE Training Loss:  0.4540777563438371 SE Training Loss:  0.11427485187648533 Training Accuracy:  84.31666666666666 CE Test Loss:  0.46315701993454794 SE Test Loss:  0.11623289683035674 Testing Accuracy:  84.14444444444445
Epoch:  4


100%|██████████| 1312/1312 [00:04<00:00, 287.81it/s]


CE Training Loss:  0.4314371915417285 SE Training Loss:  0.1084872342142187 Training Accuracy:  85.08333333333333 CE Test Loss:  0.44249574757981586 SE Test Loss:  0.11085212464772717 Testing Accuracy:  84.78888888888889
Epoch:  5


100%|██████████| 1312/1312 [00:09<00:00, 132.22it/s]


CE Training Loss:  0.4141056744455395 SE Training Loss:  0.10416692996509477 Training Accuracy:  85.6 CE Test Loss:  0.42686449098572393 SE Test Loss:  0.1069837792884886 Testing Accuracy:  85.28888888888889


0,1
tr_accuracy,▁▇████
tr_loss_CE,█▂▁▁▁▁
tr_loss_SE,█▂▁▁▁▁
val_accuracy,▁▇████
val_loss_CE,█▂▁▁▁▁
val_loss_SE,█▂▁▁▁▁

0,1
tr_accuracy,85.6
tr_loss_CE,0.41411
tr_loss_SE,0.10417
val_accuracy,85.28889
val_loss_CE,0.42686
val_loss_SE,0.10698


[34m[1mwandb[0m: Agent Starting Run: d1ku9bpt with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss_function: cross_entropy
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: rmsprop
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  rmsprop
Learning rate (initial):  0.0001
Batch size:  32
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 784) Bias vector dimention (64, 1)
----------------
Layer:  2  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  3  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  4  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  5  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 64) Bias vector dimention (10, 1)
----------------
1312.0

 Start of training
CE Training Loss:  2.3099345170848533 SE Training Loss:  0.450734

100%|██████████| 1312/1312 [00:04<00:00, 266.65it/s]


CE Training Loss:  0.6709951467109656 SE Training Loss:  0.1668186677206692 Training Accuracy:  77.54523809523809 CE Test Loss:  0.6739103859580505 SE Test Loss:  0.16791623160347854 Testing Accuracy:  77.41111111111111
Epoch:  2


100%|██████████| 1312/1312 [00:04<00:00, 321.83it/s]


CE Training Loss:  0.5654890966784203 SE Training Loss:  0.1397450190172275 Training Accuracy:  81.4095238095238 CE Test Loss:  0.5701095006083752 SE Test Loss:  0.14082290323875668 Testing Accuracy:  81.65555555555555
Epoch:  3


100%|██████████| 1312/1312 [00:06<00:00, 197.10it/s]


CE Training Loss:  0.5283217381825667 SE Training Loss:  0.1306846449950514 Training Accuracy:  82.57380952380953 CE Test Loss:  0.5332919226136192 SE Test Loss:  0.13161832324230854 Testing Accuracy:  82.65
Epoch:  4


100%|██████████| 1312/1312 [00:04<00:00, 287.31it/s]


CE Training Loss:  0.5102610302108098 SE Training Loss:  0.12625746390101242 Training Accuracy:  83.13095238095238 CE Test Loss:  0.5156856210900252 SE Test Loss:  0.1271992299639496 Testing Accuracy:  83.07222222222222
Epoch:  5


100%|██████████| 1312/1312 [00:03<00:00, 341.28it/s]


CE Training Loss:  0.49909957179240744 SE Training Loss:  0.12352729473800717 Training Accuracy:  83.42857142857143 CE Test Loss:  0.5050427555910136 SE Test Loss:  0.12455962965552506 Testing Accuracy:  83.49444444444444


0,1
tr_accuracy,▁▇████
tr_loss_CE,█▂▁▁▁▁
tr_loss_SE,█▂▁▁▁▁
val_accuracy,▁▇████
val_loss_CE,█▂▁▁▁▁
val_loss_SE,█▂▁▁▁▁

0,1
tr_accuracy,83.42857
tr_loss_CE,0.4991
tr_loss_SE,0.12353
val_accuracy,83.49444
val_loss_CE,0.50504
val_loss_SE,0.12456


[34m[1mwandb[0m: Agent Starting Run: l13jrclx with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss_function: cross_entropy
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  adam
Learning rate (initial):  0.0001
Batch size:  32
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 784) Bias vector dimention (64, 1)
----------------
Layer:  2  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  3  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  4  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  5  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 64) Bias vector dimention (10, 1)
----------------
1312.0

 Start of training
CE Training Loss:  2.2990617834462723 SE Training Loss:  0.449729264

100%|██████████| 1312/1312 [00:05<00:00, 261.85it/s]


CE Training Loss:  0.5691467844479541 SE Training Loss:  0.1418667771437092 Training Accuracy:  80.30952380952381 CE Test Loss:  0.5781589653690985 SE Test Loss:  0.14398300720597473 Testing Accuracy:  79.73333333333333
Epoch:  2


100%|██████████| 1312/1312 [00:07<00:00, 170.54it/s]


CE Training Loss:  0.47370507648327864 SE Training Loss:  0.11918137917764224 Training Accuracy:  83.57619047619048 CE Test Loss:  0.48393564144143886 SE Test Loss:  0.12116850189401866 Testing Accuracy:  83.29444444444445
Epoch:  3


100%|██████████| 1312/1312 [00:05<00:00, 250.19it/s]


CE Training Loss:  0.4363891616665694 SE Training Loss:  0.10982094004190808 Training Accuracy:  84.79523809523809 CE Test Loss:  0.4482040647098219 SE Test Loss:  0.11196767442310832 Testing Accuracy:  84.53333333333333
Epoch:  4


100%|██████████| 1312/1312 [00:08<00:00, 163.59it/s]


CE Training Loss:  0.41473042427967605 SE Training Loss:  0.10450823244573483 Training Accuracy:  85.51666666666667 CE Test Loss:  0.42941059976199725 SE Test Loss:  0.107279669595533 Testing Accuracy:  85.28333333333333
Epoch:  5


100%|██████████| 1312/1312 [00:05<00:00, 252.91it/s]


CE Training Loss:  0.39848010158872943 SE Training Loss:  0.10039238997166604 Training Accuracy:  86.17619047619047 CE Test Loss:  0.4160515706697696 SE Test Loss:  0.10370526743712387 Testing Accuracy:  85.82222222222222


0,1
tr_accuracy,▁▇████
tr_loss_CE,█▂▁▁▁▁
tr_loss_SE,█▂▁▁▁▁
val_accuracy,▁▇████
val_loss_CE,█▂▁▁▁▁
val_loss_SE,█▂▁▁▁▁

0,1
tr_accuracy,86.17619
tr_loss_CE,0.39848
tr_loss_SE,0.10039
val_accuracy,85.82222
val_loss_CE,0.41605
val_loss_SE,0.10371


[34m[1mwandb[0m: Agent Starting Run: qlmov3ii with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss_function: cross_entropy
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  adam
Learning rate (initial):  0.0001
Batch size:  32
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 784) Bias vector dimention (64, 1)
----------------
Layer:  2  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  3  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  4  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  5  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 64) Bias vector dimention (10, 1)
----------------
1312.0

 Start of training
CE Training Loss:  2.3125105809387323 SE Training Loss:  0.450836771

100%|██████████| 1312/1312 [00:04<00:00, 267.72it/s]


CE Training Loss:  0.5841613980942911 SE Training Loss:  0.14611782907422563 Training Accuracy:  79.5047619047619 CE Test Loss:  0.5894600781017507 SE Test Loss:  0.1477561276639735 Testing Accuracy:  79.17222222222222
Epoch:  2


100%|██████████| 1312/1312 [00:07<00:00, 175.08it/s]


CE Training Loss:  0.48812288847289953 SE Training Loss:  0.12379412489316428 Training Accuracy:  82.73571428571428 CE Test Loss:  0.4937235843484643 SE Test Loss:  0.12490253194524868 Testing Accuracy:  82.52222222222223
Epoch:  3


100%|██████████| 1312/1312 [00:06<00:00, 211.75it/s]


CE Training Loss:  0.44669519766345617 SE Training Loss:  0.11334936211953685 Training Accuracy:  84.40714285714286 CE Test Loss:  0.4543009869424703 SE Test Loss:  0.11453945569078496 Testing Accuracy:  84.23333333333333
Epoch:  4


100%|██████████| 1312/1312 [00:13<00:00, 100.02it/s]


CE Training Loss:  0.418841125067751 SE Training Loss:  0.1060627336760807 Training Accuracy:  85.4 CE Test Loss:  0.4291494220308214 SE Test Loss:  0.10765214522449192 Testing Accuracy:  85.23333333333333
Epoch:  5


100%|██████████| 1312/1312 [00:15<00:00, 86.41it/s] 


CE Training Loss:  0.3990921154855205 SE Training Loss:  0.10088798570700283 Training Accuracy:  86.11190476190477 CE Test Loss:  0.41267962702288835 SE Test Loss:  0.10317518969452895 Testing Accuracy:  85.66666666666667


0,1
tr_accuracy,▁▇████
tr_loss_CE,█▂▁▁▁▁
tr_loss_SE,█▂▁▁▁▁
val_accuracy,▁▇████
val_loss_CE,█▂▁▁▁▁
val_loss_SE,█▂▁▁▁▁

0,1
tr_accuracy,86.1119
tr_loss_CE,0.39909
tr_loss_SE,0.10089
val_accuracy,85.66667
val_loss_CE,0.41268
val_loss_SE,0.10318


[34m[1mwandb[0m: Agent Starting Run: bwx46kcm with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss_function: cross_entropy
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  adam
Learning rate (initial):  0.0001
Batch size:  32
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 784) Bias vector dimention (64, 1)
----------------
Layer:  2  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  3  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  4  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  5  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 64) Bias vector dimention (10, 1)
----------------
1312.0

 Start of training
CE Training Loss:  2.324090870439341 SE Training Loss:  0.4523526320

100%|██████████| 1312/1312 [00:07<00:00, 165.09it/s]


CE Training Loss:  0.7072916788808027 SE Training Loss:  0.175859502404441 Training Accuracy:  77.03333333333333 CE Test Loss:  0.710477917784066 SE Test Loss:  0.17722067498602068 Testing Accuracy:  76.62777777777778
Epoch:  2


100%|██████████| 1312/1312 [00:04<00:00, 263.23it/s]


CE Training Loss:  0.5910628365804591 SE Training Loss:  0.14709738191053695 Training Accuracy:  80.32619047619048 CE Test Loss:  0.5958327572636016 SE Test Loss:  0.14873533171511105 Testing Accuracy:  80.10555555555555
Epoch:  3


100%|██████████| 1312/1312 [00:06<00:00, 207.20it/s]


CE Training Loss:  0.549803016410469 SE Training Loss:  0.1367104060956438 Training Accuracy:  81.5 CE Test Loss:  0.5543602550803561 SE Test Loss:  0.13796583278442096 Testing Accuracy:  81.51666666666667
Epoch:  4


100%|██████████| 1312/1312 [00:06<00:00, 210.78it/s]


CE Training Loss:  0.5282920235149079 SE Training Loss:  0.13131553928558257 Training Accuracy:  82.20714285714286 CE Test Loss:  0.532942980189114 SE Test Loss:  0.13242066931663357 Testing Accuracy:  82.22777777777777
Epoch:  5


100%|██████████| 1312/1312 [00:04<00:00, 263.19it/s]


CE Training Loss:  0.515099818012323 SE Training Loss:  0.1281388594261516 Training Accuracy:  82.53333333333333 CE Test Loss:  0.520236565216644 SE Test Loss:  0.12925515073834584 Testing Accuracy:  82.52222222222223


0,1
tr_accuracy,▁▇████
tr_loss_CE,█▂▁▁▁▁
tr_loss_SE,█▂▁▁▁▁
val_accuracy,▁▇████
val_loss_CE,█▂▁▁▁▁
val_loss_SE,█▂▁▁▁▁

0,1
tr_accuracy,82.53333
tr_loss_CE,0.5151
tr_loss_SE,0.12814
val_accuracy,82.52222
val_loss_CE,0.52024
val_loss_SE,0.12926


[34m[1mwandb[0m: Agent Starting Run: ljqa8tna with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss_function: cross_entropy
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  nadam
Learning rate (initial):  0.0001
Batch size:  32
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 784) Bias vector dimention (64, 1)
----------------
Layer:  2  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  3  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  4  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  5  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 64) Bias vector dimention (10, 1)
----------------
1312.0

 Start of training
CE Training Loss:  2.306876769891243 SE Training Loss:  0.450265223

100%|██████████| 1312/1312 [00:06<00:00, 203.05it/s]


CE Training Loss:  0.5802058071069252 SE Training Loss:  0.1434775611454182 Training Accuracy:  80.41904761904762 CE Test Loss:  0.5932481447255692 SE Test Loss:  0.14660171475808798 Testing Accuracy:  79.90555555555555
Epoch:  2


100%|██████████| 1312/1312 [00:07<00:00, 179.13it/s]


CE Training Loss:  0.484637472919229 SE Training Loss:  0.12144602949628194 Training Accuracy:  83.3404761904762 CE Test Loss:  0.49687267542392977 SE Test Loss:  0.12392240932413377 Testing Accuracy:  83.05555555555556
Epoch:  3


100%|██████████| 1312/1312 [00:06<00:00, 191.09it/s]


CE Training Loss:  0.44581277929410024 SE Training Loss:  0.11193423135426117 Training Accuracy:  84.62142857142857 CE Test Loss:  0.45855165223218913 SE Test Loss:  0.11448125432843373 Testing Accuracy:  84.36666666666666
Epoch:  4


100%|██████████| 1312/1312 [00:07<00:00, 170.89it/s]


CE Training Loss:  0.422979887377557 SE Training Loss:  0.1061107497606501 Training Accuracy:  85.41904761904762 CE Test Loss:  0.4366106384791791 SE Test Loss:  0.10880503676061087 Testing Accuracy:  85.01666666666667
Epoch:  5


100%|██████████| 1312/1312 [00:05<00:00, 223.55it/s]


CE Training Loss:  0.40532221097945687 SE Training Loss:  0.10176733212976426 Training Accuracy:  85.99047619047619 CE Test Loss:  0.4209409360890109 SE Test Loss:  0.10473220570686104 Testing Accuracy:  85.7


0,1
tr_accuracy,▁▇████
tr_loss_CE,█▂▁▁▁▁
tr_loss_SE,█▂▁▁▁▁
val_accuracy,▁▇████
val_loss_CE,█▂▁▁▁▁
val_loss_SE,█▂▁▁▁▁

0,1
tr_accuracy,85.99048
tr_loss_CE,0.40532
tr_loss_SE,0.10177
val_accuracy,85.7
val_loss_CE,0.42094
val_loss_SE,0.10473


[34m[1mwandb[0m: Agent Starting Run: pu5m5v1y with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss_function: cross_entropy
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  nadam
Learning rate (initial):  0.0001
Batch size:  32
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 784) Bias vector dimention (64, 1)
----------------
Layer:  2  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  3  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  4  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  5  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 64) Bias vector dimention (10, 1)
----------------
1312.0

 Start of training
CE Training Loss:  2.357043629082565 SE Training Loss:  0.455618078

100%|██████████| 1312/1312 [00:06<00:00, 202.89it/s]


CE Training Loss:  0.5880457778285662 SE Training Loss:  0.1460636762506565 Training Accuracy:  79.82142857142857 CE Test Loss:  0.5929437068090124 SE Test Loss:  0.14731433274061842 Testing Accuracy:  79.83333333333333
Epoch:  2


100%|██████████| 1312/1312 [00:06<00:00, 189.86it/s]


CE Training Loss:  0.4894691791444253 SE Training Loss:  0.1237010991262226 Training Accuracy:  82.95238095238095 CE Test Loss:  0.4971390689998785 SE Test Loss:  0.12489519378604225 Testing Accuracy:  82.78888888888889
Epoch:  3


100%|██████████| 1312/1312 [00:05<00:00, 246.79it/s]


CE Training Loss:  0.4511237422625864 SE Training Loss:  0.11467248364808048 Training Accuracy:  84.1047619047619 CE Test Loss:  0.46116983752391005 SE Test Loss:  0.11606902998747402 Testing Accuracy:  83.9
Epoch:  4


100%|██████████| 1312/1312 [00:08<00:00, 152.14it/s]


CE Training Loss:  0.42705003122201307 SE Training Loss:  0.10888085392786927 Training Accuracy:  84.82857142857142 CE Test Loss:  0.43931043303427963 SE Test Loss:  0.11064784917734177 Testing Accuracy:  84.71111111111111
Epoch:  5


100%|██████████| 1312/1312 [00:10<00:00, 124.03it/s]


CE Training Loss:  0.40929039625308117 SE Training Loss:  0.1043545878859306 Training Accuracy:  85.45952380952382 CE Test Loss:  0.4246894455227188 SE Test Loss:  0.10678680620449041 Testing Accuracy:  85.28888888888889


0,1
tr_accuracy,▁▇████
tr_loss_CE,█▂▁▁▁▁
tr_loss_SE,█▂▁▁▁▁
val_accuracy,▁▇████
val_loss_CE,█▂▁▁▁▁
val_loss_SE,█▂▁▁▁▁

0,1
tr_accuracy,85.45952
tr_loss_CE,0.40929
tr_loss_SE,0.10435
val_accuracy,85.28889
val_loss_CE,0.42469
val_loss_SE,0.10679


[34m[1mwandb[0m: Agent Starting Run: 78hkbteo with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss_function: cross_entropy
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  nadam
Learning rate (initial):  0.0001
Batch size:  32
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 784) Bias vector dimention (64, 1)
----------------
Layer:  2  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  3  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  4  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  5  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 64) Bias vector dimention (10, 1)
----------------
1312.0

 Start of training
CE Training Loss:  2.3150178616152206 SE Training Loss:  0.45102483

100%|██████████| 1312/1312 [00:08<00:00, 160.67it/s]


CE Training Loss:  0.6886257143505083 SE Training Loss:  0.1740549880408423 Training Accuracy:  76.30714285714286 CE Test Loss:  0.6922231027652834 SE Test Loss:  0.17519516212825617 Testing Accuracy:  75.97777777777777
Epoch:  2


100%|██████████| 1312/1312 [00:05<00:00, 231.48it/s]


CE Training Loss:  0.5746148499526051 SE Training Loss:  0.14324701264532563 Training Accuracy:  80.95 CE Test Loss:  0.5793320534505734 SE Test Loss:  0.14436481360287398 Testing Accuracy:  81.09444444444445
Epoch:  3


100%|██████████| 1312/1312 [00:08<00:00, 158.42it/s]


CE Training Loss:  0.5326658779103486 SE Training Loss:  0.13284828033081006 Training Accuracy:  82.14761904761905 CE Test Loss:  0.5370389751523352 SE Test Loss:  0.13364341439971822 Testing Accuracy:  82.16111111111111
Epoch:  4


100%|██████████| 1312/1312 [00:05<00:00, 240.76it/s]


CE Training Loss:  0.515452574163739 SE Training Loss:  0.12850631449077818 Training Accuracy:  82.6952380952381 CE Test Loss:  0.5196471437010174 SE Test Loss:  0.12920689016541023 Testing Accuracy:  82.72222222222223
Epoch:  5


100%|██████████| 1312/1312 [00:07<00:00, 177.32it/s]


CE Training Loss:  0.5062567509153664 SE Training Loss:  0.126123939449946 Training Accuracy:  82.99047619047619 CE Test Loss:  0.5105849209094429 SE Test Loss:  0.12683782669147212 Testing Accuracy:  83.10555555555555


0,1
tr_accuracy,▁▇████
tr_loss_CE,█▂▁▁▁▁
tr_loss_SE,█▂▁▁▁▁
val_accuracy,▁▇████
val_loss_CE,█▂▁▁▁▁
val_loss_SE,█▂▁▁▁▁

0,1
tr_accuracy,82.99048
tr_loss_CE,0.50626
tr_loss_SE,0.12612
val_accuracy,83.10556
val_loss_CE,0.51058
val_loss_SE,0.12684


[34m[1mwandb[0m: Agent Starting Run: nhnpu2bv with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss_function: squared_error
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: rmsprop
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  rmsprop
Learning rate (initial):  0.0001
Batch size:  32
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 784) Bias vector dimention (64, 1)
----------------
Layer:  2  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  3  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  4  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  5  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 64) Bias vector dimention (10, 1)
----------------
1312.0

 Start of training
CE Training Loss:  2.3076635981603166 SE Training Loss:  0.450302

100%|██████████| 1312/1312 [00:04<00:00, 268.70it/s]


CE Training Loss:  0.6801980260063525 SE Training Loss:  0.15970103775325922 Training Accuracy:  79.10714285714286 CE Test Loss:  0.6817786316912385 SE Test Loss:  0.1616021344822566 Testing Accuracy:  78.71666666666667
Epoch:  2


100%|██████████| 1312/1312 [00:05<00:00, 230.69it/s]


CE Training Loss:  0.5801217733566162 SE Training Loss:  0.13529806939149727 Training Accuracy:  81.83809523809524 CE Test Loss:  0.5814252788420645 SE Test Loss:  0.13605235310568686 Testing Accuracy:  81.95
Epoch:  3


100%|██████████| 1312/1312 [00:03<00:00, 328.91it/s]


CE Training Loss:  0.528245457914436 SE Training Loss:  0.12312717403256798 Training Accuracy:  83.3404761904762 CE Test Loss:  0.532564437983719 SE Test Loss:  0.12425078436814156 Testing Accuracy:  83.31111111111112
Epoch:  4


100%|██████████| 1312/1312 [00:05<00:00, 229.62it/s]


CE Training Loss:  0.4962752834302408 SE Training Loss:  0.11586747790940258 Training Accuracy:  84.15 CE Test Loss:  0.5026739755787927 SE Test Loss:  0.11757507143931788 Testing Accuracy:  84.09444444444445
Epoch:  5


100%|██████████| 1312/1312 [00:04<00:00, 269.43it/s]


CE Training Loss:  0.47724779662103495 SE Training Loss:  0.11091075564330097 Training Accuracy:  84.89047619047619 CE Test Loss:  0.4860540903882024 SE Test Loss:  0.11327013820613825 Testing Accuracy:  84.57777777777778


0,1
tr_accuracy,▁▇████
tr_loss_CE,█▂▁▁▁▁
tr_loss_SE,█▂▂▁▁▁
val_accuracy,▁▇████
val_loss_CE,█▂▁▁▁▁
val_loss_SE,█▂▁▁▁▁

0,1
tr_accuracy,84.89048
tr_loss_CE,0.47725
tr_loss_SE,0.11091
val_accuracy,84.57778
val_loss_CE,0.48605
val_loss_SE,0.11327


[34m[1mwandb[0m: Agent Starting Run: s0811kmd with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss_function: squared_error
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: rmsprop
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  rmsprop
Learning rate (initial):  0.0001
Batch size:  32
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 784) Bias vector dimention (64, 1)
----------------
Layer:  2  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  3  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  4  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  5  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 64) Bias vector dimention (10, 1)
----------------
1312.0

 Start of training
CE Training Loss:  2.3382020625191964 SE Training Loss:  0.454007

100%|██████████| 1312/1312 [00:06<00:00, 200.85it/s]


CE Training Loss:  0.6262438902653682 SE Training Loss:  0.14936736005271942 Training Accuracy:  80.04523809523809 CE Test Loss:  0.62811552784769 SE Test Loss:  0.15030548297380353 Testing Accuracy:  80.14444444444445
Epoch:  2


100%|██████████| 1312/1312 [00:04<00:00, 320.93it/s]


CE Training Loss:  0.5367420305926529 SE Training Loss:  0.12841905593667463 Training Accuracy:  82.58095238095238 CE Test Loss:  0.5395556518848832 SE Test Loss:  0.12927672919632238 Testing Accuracy:  82.5
Epoch:  3


100%|██████████| 1312/1312 [00:05<00:00, 245.70it/s]


CE Training Loss:  0.4999450117438569 SE Training Loss:  0.119986267720283 Training Accuracy:  83.57619047619048 CE Test Loss:  0.505552029783532 SE Test Loss:  0.12142368395990072 Testing Accuracy:  83.41111111111111
Epoch:  4


100%|██████████| 1312/1312 [00:05<00:00, 229.53it/s]


CE Training Loss:  0.47790294803303357 SE Training Loss:  0.11520040612628145 Training Accuracy:  84.21428571428571 CE Test Loss:  0.48595546914380083 SE Test Loss:  0.11712194114359967 Testing Accuracy:  83.81111111111112
Epoch:  5


100%|██████████| 1312/1312 [00:07<00:00, 169.01it/s]


CE Training Loss:  0.46084415183066074 SE Training Loss:  0.11124725761507194 Training Accuracy:  84.69761904761904 CE Test Loss:  0.4701067910105353 SE Test Loss:  0.11342536536445426 Testing Accuracy:  84.27777777777777


0,1
tr_accuracy,▁█████
tr_loss_CE,█▂▁▁▁▁
tr_loss_SE,█▂▁▁▁▁
val_accuracy,▁█████
val_loss_CE,█▂▁▁▁▁
val_loss_SE,█▂▁▁▁▁

0,1
tr_accuracy,84.69762
tr_loss_CE,0.46084
tr_loss_SE,0.11125
val_accuracy,84.27778
val_loss_CE,0.47011
val_loss_SE,0.11343


[34m[1mwandb[0m: Agent Starting Run: h7rugowh with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss_function: squared_error
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: rmsprop
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  rmsprop
Learning rate (initial):  0.0001
Batch size:  32
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 784) Bias vector dimention (64, 1)
----------------
Layer:  2  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  3  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  4  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  5  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 64) Bias vector dimention (10, 1)
----------------
1312.0

 Start of training
CE Training Loss:  2.343015587221127 SE Training Loss:  0.4541112

100%|██████████| 1312/1312 [00:03<00:00, 335.57it/s]


CE Training Loss:  1.543137690296258 SE Training Loss:  0.34880204542031246 Training Accuracy:  38.21666666666667 CE Test Loss:  1.5440583703842552 SE Test Loss:  0.3487010887032845 Testing Accuracy:  38.522222222222226
Epoch:  2


100%|██████████| 1312/1312 [00:06<00:00, 195.29it/s]


CE Training Loss:  1.2203902684813115 SE Training Loss:  0.28820185818330124 Training Accuracy:  54.98571428571429 CE Test Loss:  1.2190932669495738 SE Test Loss:  0.28799473227171213 Testing Accuracy:  54.82222222222222
Epoch:  3


100%|██████████| 1312/1312 [00:04<00:00, 299.30it/s]


CE Training Loss:  1.79676304799623 SE Training Loss:  0.306292763809768 Training Accuracy:  52.93333333333333 CE Test Loss:  1.793444353056985 SE Test Loss:  0.304169146836703 Testing Accuracy:  53.59444444444444
Epoch:  4


100%|██████████| 1312/1312 [00:04<00:00, 311.29it/s]


CE Training Loss:  nan SE Training Loss:  nan Training Accuracy:  10.073809523809524 CE Test Loss:  nan SE Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  5


100%|██████████| 1312/1312 [00:06<00:00, 216.00it/s]


CE Training Loss:  nan SE Training Loss:  nan Training Accuracy:  10.073809523809524 CE Test Loss:  nan SE Test Loss:  nan Testing Accuracy:  9.827777777777778


0,1
tr_accuracy,▁▆██▁▁
tr_loss_CE,█▃▁▅
tr_loss_SE,█▄▁▂
val_accuracy,▁▆██▁▁
val_loss_CE,█▃▁▅
val_loss_SE,█▄▁▂

0,1
tr_accuracy,10.07381
tr_loss_CE,
tr_loss_SE,
val_accuracy,9.82778
val_loss_CE,
val_loss_SE,


[34m[1mwandb[0m: Agent Starting Run: fm6adyi8 with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss_function: squared_error
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  adam
Learning rate (initial):  0.0001
Batch size:  32
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 784) Bias vector dimention (64, 1)
----------------
Layer:  2  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  3  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  4  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  5  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 64) Bias vector dimention (10, 1)
----------------
1312.0

 Start of training
CE Training Loss:  2.3388721187860932 SE Training Loss:  0.454168319

100%|██████████| 1312/1312 [00:08<00:00, 162.30it/s]


CE Training Loss:  0.6596591814751939 SE Training Loss:  0.1494238499912976 Training Accuracy:  80.60238095238095 CE Test Loss:  0.6608592574695558 SE Test Loss:  0.15113466212600954 Testing Accuracy:  80.30555555555556
Epoch:  2


100%|██████████| 1312/1312 [00:05<00:00, 228.00it/s]


CE Training Loss:  0.5438896298558569 SE Training Loss:  0.12680112080027617 Training Accuracy:  83.03809523809524 CE Test Loss:  0.5494686091966261 SE Test Loss:  0.12851900436443348 Testing Accuracy:  82.68888888888888
Epoch:  3


100%|██████████| 1312/1312 [00:07<00:00, 170.26it/s]


CE Training Loss:  0.48400118134628534 SE Training Loss:  0.11603644533189611 Training Accuracy:  84.31190476190476 CE Test Loss:  0.49271746301380903 SE Test Loss:  0.11802273247338448 Testing Accuracy:  83.97777777777777
Epoch:  4


100%|██████████| 1312/1312 [00:05<00:00, 261.43it/s]


CE Training Loss:  0.45534403462039136 SE Training Loss:  0.10991443935605083 Training Accuracy:  84.9 CE Test Loss:  0.4649579650193109 SE Test Loss:  0.11202515612315568 Testing Accuracy:  84.63888888888889
Epoch:  5


100%|██████████| 1312/1312 [00:06<00:00, 199.48it/s]


CE Training Loss:  0.4411477507075665 SE Training Loss:  0.10611451050556167 Training Accuracy:  85.37380952380953 CE Test Loss:  0.45181628179985805 SE Test Loss:  0.10869266226550273 Testing Accuracy:  85.06666666666666


0,1
tr_accuracy,▁█████
tr_loss_CE,█▂▁▁▁▁
tr_loss_SE,█▂▁▁▁▁
val_accuracy,▁█████
val_loss_CE,█▂▁▁▁▁
val_loss_SE,█▂▁▁▁▁

0,1
tr_accuracy,85.37381
tr_loss_CE,0.44115
tr_loss_SE,0.10611
val_accuracy,85.06667
val_loss_CE,0.45182
val_loss_SE,0.10869


[34m[1mwandb[0m: Agent Starting Run: toljlz6w with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss_function: squared_error
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  adam
Learning rate (initial):  0.0001
Batch size:  32
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 784) Bias vector dimention (64, 1)
----------------
Layer:  2  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  3  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  4  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  5  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 64) Bias vector dimention (10, 1)
----------------
1312.0

 Start of training
CE Training Loss:  2.3291812855531133 SE Training Loss:  0.452579210

100%|██████████| 1312/1312 [00:07<00:00, 166.99it/s]


CE Training Loss:  0.6252462423308617 SE Training Loss:  0.14656092337437646 Training Accuracy:  80.95238095238095 CE Test Loss:  0.6289508404505473 SE Test Loss:  0.14774394156829593 Testing Accuracy:  80.86111111111111
Epoch:  2


100%|██████████| 1312/1312 [00:04<00:00, 262.56it/s]


CE Training Loss:  0.5198016722807881 SE Training Loss:  0.12506944834407088 Training Accuracy:  83.02619047619048 CE Test Loss:  0.5261331166941072 SE Test Loss:  0.12653309817591557 Testing Accuracy:  82.93888888888888
Epoch:  3


100%|██████████| 1312/1312 [00:08<00:00, 151.76it/s]


CE Training Loss:  0.4759301451052786 SE Training Loss:  0.11563929089145644 Training Accuracy:  84.21904761904761 CE Test Loss:  0.48527329745955405 SE Test Loss:  0.1179357176376658 Testing Accuracy:  83.88333333333334
Epoch:  4


100%|██████████| 1312/1312 [00:13<00:00, 93.97it/s]


CE Training Loss:  0.4504704038341324 SE Training Loss:  0.10975168277728599 Training Accuracy:  84.91904761904762 CE Test Loss:  0.46223181565438365 SE Test Loss:  0.11255474058987829 Testing Accuracy:  84.5111111111111
Epoch:  5


100%|██████████| 1312/1312 [00:16<00:00, 78.05it/s]


CE Training Loss:  0.43049870862784606 SE Training Loss:  0.10505763227447942 Training Accuracy:  85.5547619047619 CE Test Loss:  0.4445200042360181 SE Test Loss:  0.10825747446418212 Testing Accuracy:  85.09444444444445


0,1
tr_accuracy,▁█████
tr_loss_CE,█▂▁▁▁▁
tr_loss_SE,█▂▁▁▁▁
val_accuracy,▁█████
val_loss_CE,█▂▁▁▁▁
val_loss_SE,█▂▁▁▁▁

0,1
tr_accuracy,85.55476
tr_loss_CE,0.4305
tr_loss_SE,0.10506
val_accuracy,85.09444
val_loss_CE,0.44452
val_loss_SE,0.10826


[34m[1mwandb[0m: Agent Starting Run: blky62ua with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss_function: squared_error
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  adam
Learning rate (initial):  0.0001
Batch size:  32
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 784) Bias vector dimention (64, 1)
----------------
Layer:  2  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  3  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  4  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  5  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 64) Bias vector dimention (10, 1)
----------------
1312.0

 Start of training
CE Training Loss:  2.3322815121241747 SE Training Loss:  0.452506881

100%|██████████| 1312/1312 [00:09<00:00, 134.58it/s]


CE Training Loss:  1.1997981202272874 SE Training Loss:  0.27565848256989745 Training Accuracy:  58.25238095238095 CE Test Loss:  1.1995187204048723 SE Test Loss:  0.2757085426758021 Testing Accuracy:  58.13333333333333
Epoch:  2


100%|██████████| 1312/1312 [00:05<00:00, 254.78it/s]


CE Training Loss:  1.186782674509197 SE Training Loss:  0.25984935892233324 Training Accuracy:  61.65952380952381 CE Test Loss:  1.1849481339445167 SE Test Loss:  0.25995577344147053 Testing Accuracy:  61.50555555555555
Epoch:  3


100%|██████████| 1312/1312 [00:06<00:00, 191.13it/s]


CE Training Loss:  1.0263654940919822 SE Training Loss:  0.2434346367348259 Training Accuracy:  63.885714285714286 CE Test Loss:  1.0375864564688715 SE Test Loss:  0.24528975088843374 Testing Accuracy:  63.672222222222224
Epoch:  4


100%|██████████| 1312/1312 [00:05<00:00, 240.03it/s]


CE Training Loss:  0.999957228986211 SE Training Loss:  0.23993944381931692 Training Accuracy:  63.67142857142857 CE Test Loss:  1.0011466816514811 SE Test Loss:  0.24091293502187358 Testing Accuracy:  63.55555555555556
Epoch:  5


100%|██████████| 1312/1312 [00:05<00:00, 255.57it/s]


CE Training Loss:  0.9905877903440037 SE Training Loss:  0.23923342213520693 Training Accuracy:  63.50476190476191 CE Test Loss:  0.9901332146724411 SE Test Loss:  0.23987108706116866 Testing Accuracy:  63.40555555555556


0,1
tr_accuracy,▁▇████
tr_loss_CE,█▂▂▁▁▁
tr_loss_SE,█▂▂▁▁▁
val_accuracy,▁▇████
val_loss_CE,█▂▂▁▁▁
val_loss_SE,█▂▂▁▁▁

0,1
tr_accuracy,63.50476
tr_loss_CE,0.99059
tr_loss_SE,0.23923
val_accuracy,63.40556
val_loss_CE,0.99013
val_loss_SE,0.23987


[34m[1mwandb[0m: Agent Starting Run: 8bcr6kms with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss_function: squared_error
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  nadam
Learning rate (initial):  0.0001
Batch size:  32
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 784) Bias vector dimention (64, 1)
----------------
Layer:  2  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  3  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  4  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  5  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 64) Bias vector dimention (10, 1)
----------------
1312.0

 Start of training
CE Training Loss:  2.331874515176169 SE Training Loss:  0.453723588

100%|██████████| 1312/1312 [00:07<00:00, 175.11it/s]


CE Training Loss:  0.6146289955678039 SE Training Loss:  0.14979913556257063 Training Accuracy:  80.07857142857142 CE Test Loss:  0.6170266282057493 SE Test Loss:  0.151158159589349 Testing Accuracy:  79.82777777777778
Epoch:  2


100%|██████████| 1312/1312 [00:05<00:00, 232.82it/s]


CE Training Loss:  0.5322974152104174 SE Training Loss:  0.12855143295565388 Training Accuracy:  82.48333333333333 CE Test Loss:  0.5340491951855743 SE Test Loss:  0.129811920683882 Testing Accuracy:  82.38333333333334
Epoch:  3


100%|██████████| 1312/1312 [00:06<00:00, 206.33it/s]


CE Training Loss:  0.4937162856452683 SE Training Loss:  0.11932438916797174 Training Accuracy:  83.66428571428571 CE Test Loss:  0.49708509462906203 SE Test Loss:  0.12098897818916614 Testing Accuracy:  83.52222222222223
Epoch:  4


100%|██████████| 1312/1312 [00:06<00:00, 188.79it/s]


CE Training Loss:  0.4686659484932438 SE Training Loss:  0.11339816581535082 Training Accuracy:  84.49285714285715 CE Test Loss:  0.47323212616085836 SE Test Loss:  0.11539883399413257 Testing Accuracy:  84.3
Epoch:  5


100%|██████████| 1312/1312 [00:06<00:00, 207.43it/s]


CE Training Loss:  0.4515193155591474 SE Training Loss:  0.10975877536158322 Training Accuracy:  84.89761904761905 CE Test Loss:  0.4570323010901757 SE Test Loss:  0.11203881695524434 Testing Accuracy:  84.60555555555555


0,1
tr_accuracy,▁█████
tr_loss_CE,█▂▁▁▁▁
tr_loss_SE,█▂▁▁▁▁
val_accuracy,▁█████
val_loss_CE,█▂▁▁▁▁
val_loss_SE,█▂▁▁▁▁

0,1
tr_accuracy,84.89762
tr_loss_CE,0.45152
tr_loss_SE,0.10976
val_accuracy,84.60556
val_loss_CE,0.45703
val_loss_SE,0.11204


[34m[1mwandb[0m: Agent Starting Run: oe2i4155 with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss_function: squared_error
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  nadam
Learning rate (initial):  0.0001
Batch size:  32
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 784) Bias vector dimention (64, 1)
----------------
Layer:  2  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  3  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  4  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  5  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 64) Bias vector dimention (10, 1)
----------------
1312.0

 Start of training
CE Training Loss:  2.3097716613609065 SE Training Loss:  0.45088921

100%|██████████| 1312/1312 [00:08<00:00, 161.97it/s]


CE Training Loss:  0.6222921426789847 SE Training Loss:  0.14927013553449486 Training Accuracy:  79.34761904761905 CE Test Loss:  0.6274061046284999 SE Test Loss:  0.15137675753436122 Testing Accuracy:  78.92777777777778
Epoch:  2


100%|██████████| 1312/1312 [00:05<00:00, 220.42it/s]


CE Training Loss:  0.5400617808134951 SE Training Loss:  0.12884747970031157 Training Accuracy:  82.6952380952381 CE Test Loss:  0.5426735293035165 SE Test Loss:  0.13026590965489615 Testing Accuracy:  82.18333333333334
Epoch:  3


100%|██████████| 1312/1312 [00:08<00:00, 153.60it/s]


CE Training Loss:  0.5089491115961443 SE Training Loss:  0.11913180570988828 Training Accuracy:  83.83571428571429 CE Test Loss:  0.5124599552624268 SE Test Loss:  0.12081323075241547 Testing Accuracy:  83.52777777777777
Epoch:  4


100%|██████████| 1312/1312 [00:08<00:00, 153.23it/s]


CE Training Loss:  0.4918959058854925 SE Training Loss:  0.11303522835855988 Training Accuracy:  84.65 CE Test Loss:  0.4980207601791633 SE Test Loss:  0.1153421892305584 Testing Accuracy:  84.23333333333333
Epoch:  5


100%|██████████| 1312/1312 [00:15<00:00, 86.09it/s]


CE Training Loss:  0.4809061770526807 SE Training Loss:  0.10922189684982093 Training Accuracy:  85.11190476190477 CE Test Loss:  0.4897331997053455 SE Test Loss:  0.11202239017824837 Testing Accuracy:  84.47777777777777


0,1
tr_accuracy,▁▇████
tr_loss_CE,█▂▁▁▁▁
tr_loss_SE,█▂▁▁▁▁
val_accuracy,▁▇████
val_loss_CE,█▂▁▁▁▁
val_loss_SE,█▂▁▁▁▁

0,1
tr_accuracy,85.1119
tr_loss_CE,0.48091
tr_loss_SE,0.10922
val_accuracy,84.47778
val_loss_CE,0.48973
val_loss_SE,0.11202


[34m[1mwandb[0m: Agent Starting Run: s2bpmigp with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	loss_function: squared_error
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  nadam
Learning rate (initial):  0.0001
Batch size:  32
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 784) Bias vector dimention (64, 1)
----------------
Layer:  2  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  3  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  4  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  5  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 64) Bias vector dimention (10, 1)
----------------
1312.0

 Start of training
CE Training Loss:  2.3106319928769254 SE Training Loss:  0.45084934

100%|██████████| 1312/1312 [00:06<00:00, 196.05it/s]


CE Training Loss:  1.423761169723001 SE Training Loss:  0.2602777012080058 Training Accuracy:  64.50238095238095 CE Test Loss:  1.4267404205848975 SE Test Loss:  0.26098599670685374 Testing Accuracy:  64.40555555555555
Epoch:  2


100%|██████████| 1312/1312 [00:06<00:00, 194.58it/s]


CE Training Loss:  1.287912153785446 SE Training Loss:  0.24857124134878372 Training Accuracy:  63.18333333333333 CE Test Loss:  1.2855698113897758 SE Test Loss:  0.2474417570736898 Testing Accuracy:  63.48888888888889
Epoch:  3


100%|██████████| 1312/1312 [00:06<00:00, 207.20it/s]


CE Training Loss:  nan SE Training Loss:  nan Training Accuracy:  10.073809523809524 CE Test Loss:  nan SE Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  4


100%|██████████| 1312/1312 [00:06<00:00, 191.95it/s]


CE Training Loss:  nan SE Training Loss:  nan Training Accuracy:  10.073809523809524 CE Test Loss:  nan SE Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  5


100%|██████████| 1312/1312 [00:05<00:00, 226.71it/s]


CE Training Loss:  nan SE Training Loss:  nan Training Accuracy:  10.073809523809524 CE Test Loss:  nan SE Test Loss:  nan Testing Accuracy:  9.827777777777778


0,1
tr_accuracy,▁██▁▁▁
tr_loss_CE,█▂▁
tr_loss_SE,█▁▁
val_accuracy,▁██▁▁▁
val_loss_CE,█▂▁
val_loss_SE,█▁▁

0,1
tr_accuracy,10.07381
tr_loss_CE,
tr_loss_SE,
val_accuracy,9.82778
val_loss_CE,
val_loss_SE,


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Sweep Agent: Exiting.
