<a href="https://colab.research.google.com/github/arunangshudutta/DA6401_assignments/blob/main/assignment_1/Assignment_1_Q4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore', category=RuntimeWarning)

from keras.datasets import fashion_mnist
from sklearn.model_selection import train_test_split

import wandb

# Functions

In [None]:
def weights_initialization(num_neurons, initializer):
  """
  Create the initial weight matrices and bias vectors of a fully connected network.

  num_neurons = list of number of neurons at each layer starting from the input layer and ending at output layer
  initializer = 'random' (normal distribution, mean 0 and standard deviation 1) or
                'Xavier' (uniform in [-sqrt(6/(rows + cols)), +sqrt(6/(rows + cols))]);
                the lowercase spelling 'xavier' is accepted as well

  Returns: (W_matrices, b_vectors) where W_matrices[i] has shape
           (num_neurons[i+1], num_neurons[i]) and b_vectors[i] is a zero column
           vector of shape (num_neurons[i+1], 1)

  Raises: ValueError if a layer has to be initialized with an unknown initializer
  """
  mean = 0
  std_dev = 1

  W_matrices = []
  b_vectors = []

  # Walk over consecutive (input size, output size) pairs, one pair per layer.
  for cols, rows in zip(num_neurons[:-1], num_neurons[1:]):

    if initializer == 'random':
      weight_matrix = np.random.normal(mean, std_dev, size=(rows, cols))

    elif initializer in ('Xavier', 'xavier'):
      upper_bound = np.sqrt(6.0/(rows + cols))
      weight_matrix = np.random.uniform(low=-upper_bound, high=upper_bound, size=(rows, cols))

    else:
      # Fail loudly: previously this only printed a message and then appended an
      # undefined (or stale) weight matrix.
      raise ValueError("initializer invalid: {!r} (expected 'random' or 'Xavier')".format(initializer))

    W_matrices.append(weight_matrix)
    b_vectors.append(np.zeros((rows, 1)))

  return W_matrices, b_vectors

################################################################################

# ACTIVATION FUNCTIONS
def relu(x):
  """
  ReLU activation: element-wise maximum of 0 and x
  """
  floor = 0
  return np.maximum(floor, x)

def sigmoid(x):
  """
  Logistic sigmoid activation: 1 / (1 + exp(-x)), applied element-wise
  """
  denominator = 1 + np.exp(-x)
  return 1 / denominator

def tanh(x):
  """
  tanh activation: element-wise hyperbolic tangent of x (delegates to np.tanh)
  """
  squashed = np.tanh(x)
  return squashed
def softmax(x):
  """
  Softmax function for output layer

  Normalizes along axis 0, so for a 2-D input every column sums to 1.

  The maximum along axis 0 is subtracted before exponentiating. This leaves the
  result mathematically unchanged (softmax is shift invariant) but keeps
  np.exp from overflowing to inf, which previously produced inf/inf = nan for
  large pre-activations.
  """
  shifted = x - np.max(x, axis=0, keepdims=True)
  exponentials = np.exp(shifted)
  return exponentials / np.sum(exponentials, axis=0, keepdims=True)

def activation_output(x, activation_function):
  """
  Apply the activation selected by name to x.

  activation_function = 'ReLU', 'sigmoid', 'tanh' or 'softmax'

  Returns: the activated values

  Raises: ValueError for an unknown name (previously a message was printed and
          None was returned, which only failed later in an unrelated place)
  """
  if activation_function == 'ReLU':
    return relu(x)
  if activation_function == 'sigmoid':
    return sigmoid(x)
  if activation_function == 'tanh':
    return tanh(x)
  if activation_function == 'softmax':
    return softmax(x)
  raise ValueError("activation function invalid: {!r}".format(activation_function))

# DERIVATIVE OF ACTIVATION FUNCTION
def sigmoid_derivative(x):
  """
  Derivative of the sigmoid evaluated at x: sigmoid(x) * (1 - sigmoid(x))
  """
  activated = sigmoid(x)
  complement = 1 - activated
  return activated * complement

def tanh_derivative(x):
  """
  Derivative of tanh evaluated at x: 1 - tanh(x)^2
  """
  activated = tanh(x)
  squared = activated**2
  return 1 - squared

def relu_derivative(x):
  """
  Derivative of ReLU evaluated at x: 1 where x > 0, otherwise 0
  """
  is_positive = x > 0
  # Multiplying by 1 turns the boolean mask into integers.
  return 1 * is_positive

def activation_derivative(x, activation_function):
  """
  Evaluate the derivative of the activation selected by name at x.

  activation_function = 'ReLU', 'sigmoid', 'tanh'

  Returns: the element-wise derivative values

  Raises: ValueError for an unknown name (previously a message was printed and
          None was returned, which only failed later in an unrelated place)
  """
  if activation_function == 'ReLU':
    return relu_derivative(x)
  if activation_function == 'sigmoid':
    return sigmoid_derivative(x)
  if activation_function == 'tanh':
    return tanh_derivative(x)
  raise ValueError("activation function invalid: {!r}".format(activation_function))

################################################################################

def layer_output_FP(x, weight_matrix, bias_vector, activation_function):
  """
  Forward pass through a single layer.

  Returns: (pre_activation, post_activation) where pre_activation is
           weight_matrix @ x + bias_vector and post_activation is the named
           activation applied to it
  """
  linear_part = np.matmul(weight_matrix, x)
  pre_activation = np.add(linear_part, bias_vector)
  return pre_activation, activation_output(pre_activation, activation_function)

def forward_propagation(ip_data, W_matrices, b_vectors, activation_functions):
  """
  Run the input through every layer in order.

  Returns: (layer_ip, layer_op)
    layer_ip[i] = pre-activation of layer i
    layer_op[0] = ip_data, layer_op[i+1] = post-activation of layer i
  """
  layer_ip = []
  layer_op = [ip_data]

  for idx, weight_matrix in enumerate(W_matrices):
    # The most recent output is the input of the current layer.
    pre_activation, post_activation = layer_output_FP(
        layer_op[-1], weight_matrix, b_vectors[idx], activation_functions[idx])

    layer_op.append(post_activation)
    layer_ip.append(pre_activation)

  return layer_ip, layer_op

################################################################################

def back_propagation(W_matrices, b_vectors, y_true, layer_ip, layer_op, activation_functions, batch_size, w_d):
  """
  Compute the gradients of every layer, walking from the output layer backwards.

  The output-layer error is taken as (prediction - y_true); hidden-layer errors
  are obtained by pushing the error through the next layer's weights and
  multiplying by the derivative of the layer's activation. Each weight gradient
  also receives the term w_d * W before everything is divided by batch_size.
  b_vectors is accepted but not used.

  Returns: (DWs, Dbs) ordered from the LAST layer to the FIRST layer
  """
  n_layers = len(W_matrices)
  DWs, Dbs = [], []
  delta = None

  for k in range(n_layers, 0, -1):
    if k == n_layers:
      # Output layer: error between prediction and target.
      delta = -(y_true - layer_op[k])
    else:
      # Hidden layer: reuse the error of the layer that follows it.
      upstream = np.matmul(W_matrices[k].T, delta)
      slope = activation_derivative(layer_ip[k-1], activation_functions[k-1])
      delta = np.multiply(upstream, slope)

    DWs.append((np.matmul(delta, layer_op[k-1].T) + w_d*W_matrices[k-1])/batch_size)
    Dbs.append(np.sum(delta, axis=1, keepdims=True)/batch_size)

  return DWs, Dbs


################################################################################


def update_weights_gd(W_matrices, b_vectors, DWs, Dbs, learning_rate = 0.1):
  """
  Plain gradient descent step: parameter = parameter - learning_rate * gradient.

  DWs and Dbs are reversed in place first, which puts them in first-to-last
  layer order. W_matrices and b_vectors are updated in place and returned.
  """
  DWs.reverse()
  Dbs.reverse()

  for layer, grad_w in enumerate(DWs):
    grad_b = Dbs[layer]
    W_matrices[layer] = W_matrices[layer] - learning_rate*grad_w
    b_vectors[layer] = b_vectors[layer] - learning_rate*grad_b

  return W_matrices, b_vectors

def update_weights_momentum(W_matrices, b_vectors, DWs, Dbs, u_past_w, u_past_b, learning_rate = 0.1, beta = 0.5):
  """
  Momentum step: u = beta*u + gradient, parameter = parameter - learning_rate*u.

  DWs and Dbs are reversed in place first, which puts them in first-to-last
  layer order. The parameter lists and the history lists u_past_w / u_past_b
  are updated in place; the returned histories are those same list objects.
  """
  DWs.reverse()
  Dbs.reverse()

  for layer, grad_w in enumerate(DWs):
    grad_b = Dbs[layer]

    u_past_w[layer] = beta*u_past_w[layer] + grad_w
    u_past_b[layer] = beta*u_past_b[layer] + grad_b

    W_matrices[layer] = W_matrices[layer] - learning_rate*u_past_w[layer]
    b_vectors[layer] = b_vectors[layer] - learning_rate*u_past_b[layer]

  return W_matrices, b_vectors, u_past_w, u_past_b

def update_weights_adagrad(W_matrices, b_vectors, DWs, Dbs, u_past_w, u_past_b, learning_rate = 0.1):
  """
  Adagrad step: accumulate squared gradients in u and scale each update by
  1/(sqrt(u) + 1e-8).

  DWs and Dbs are reversed in place first, which puts them in first-to-last
  layer order. The parameter lists and the history lists u_past_w / u_past_b
  are updated in place; the returned histories are those same list objects.
  """
  DWs.reverse()
  Dbs.reverse()

  eps = 1e-8
  for layer, grad_w in enumerate(DWs):
    grad_b = Dbs[layer]

    u_past_w[layer] = u_past_w[layer] + grad_w**2
    u_past_b[layer] = u_past_b[layer] + grad_b**2

    scale_w = np.sqrt(u_past_w[layer]) + eps
    scale_b = np.sqrt(u_past_b[layer]) + eps

    W_matrices[layer] = W_matrices[layer] - learning_rate*grad_w/scale_w
    b_vectors[layer] = b_vectors[layer] - learning_rate*grad_b/scale_b

  return W_matrices, b_vectors, u_past_w, u_past_b

def update_weights_rmsprop(W_matrices, b_vectors, DWs, Dbs, u_past_w, u_past_b, learning_rate = 0.1, beta = 0.5):
  """
  RMSprop step: u = beta*u + (1-beta)*gradient^2 and each update is scaled by
  1/(sqrt(u) + 1e-8).

  DWs and Dbs are reversed in place first, which puts them in first-to-last
  layer order. The parameter lists and the history lists u_past_w / u_past_b
  are updated in place; the returned histories are those same list objects.
  """
  DWs.reverse()
  Dbs.reverse()

  eps = 1e-8
  for layer, grad_w in enumerate(DWs):
    grad_b = Dbs[layer]

    u_past_w[layer] = beta*u_past_w[layer] + (1-beta)*grad_w**2
    u_past_b[layer] = beta*u_past_b[layer] + (1-beta)*grad_b**2

    scale_w = np.sqrt(u_past_w[layer]) + eps
    scale_b = np.sqrt(u_past_b[layer]) + eps

    W_matrices[layer] = W_matrices[layer] - learning_rate*grad_w/scale_w
    b_vectors[layer] = b_vectors[layer] - learning_rate*grad_b/scale_b

  return W_matrices, b_vectors, u_past_w, u_past_b

def update_weights_adam(W_matrices, b_vectors, DWs, Dbs, mw_past, mb_past, vw_past, vb_past, t, learning_rate = 0.1, beta1 = 0.5, beta2 =0.5):
  """
  Adam step.

  m = beta1*m + (1-beta1)*gradient      (first moment)
  v = beta2*v + (1-beta2)*gradient^2    (second moment)
  Both are divided by (1 - beta**t) before the update
  parameter = parameter - learning_rate * m_hat / (sqrt(v_hat) + 1e-8).

  DWs and Dbs are reversed in place first, which puts them in first-to-last
  layer order. The parameter lists and the four moment lists are updated in
  place; the returned moments are those same list objects.
  """
  DWs.reverse()
  Dbs.reverse()

  eps = 1e-8
  for layer, grad_w in enumerate(DWs):
    grad_b = Dbs[layer]

    # First moments and their bias-corrected versions.
    mw_past[layer] = beta1*mw_past[layer] + (1-beta1)*grad_w
    mb_past[layer] = beta1*mb_past[layer] + (1-beta1)*grad_b
    mw_hat = mw_past[layer]/(1 - beta1**t)
    mb_hat = mb_past[layer]/(1 - beta1**t)

    # Second moments and their bias-corrected versions.
    vw_past[layer] = beta2*vw_past[layer] + (1-beta2)*grad_w**2
    vb_past[layer] = beta2*vb_past[layer] + (1-beta2)*grad_b**2
    vw_hat = vw_past[layer]/(1 - beta2**t)
    vb_hat = vb_past[layer]/(1 - beta2**t)

    W_matrices[layer] = W_matrices[layer] - learning_rate*mw_hat/(np.sqrt(vw_hat) + eps)
    b_vectors[layer] = b_vectors[layer] - learning_rate*mb_hat/(np.sqrt(vb_hat) + eps)

  return W_matrices, b_vectors, mw_past, mb_past, vw_past, vb_past

def update_weights_nadam(W_matrices, b_vectors, DWs, Dbs, mw_past, mb_past, vw_past, vb_past,t,  learning_rate = 0.1, beta1 = 0.5, beta2 =0.5):
  """
  Nadam step.

  The moments are updated as in Adam but are divided by (1 - beta**(t+1)).
  The update direction is
    beta1*m_hat + ((1-beta1)/(1 - beta1**(t+1)))*gradient
  divided by (sqrt(v_hat) + 1e-8) and scaled by learning_rate.

  DWs and Dbs are reversed in place first, which puts them in first-to-last
  layer order. The parameter lists and the four moment lists are updated in
  place; the returned moments are those same list objects.
  """
  DWs.reverse()
  Dbs.reverse()

  eps = 1e-8
  for layer, grad_w in enumerate(DWs):
    grad_b = Dbs[layer]

    # First moments and their bias-corrected versions.
    mw_past[layer] = beta1*mw_past[layer] + (1-beta1)*grad_w
    mb_past[layer] = beta1*mb_past[layer] + (1-beta1)*grad_b
    mw_hat = mw_past[layer]/(1 - beta1**(t+1))
    mb_hat = mb_past[layer]/(1 - beta1**(t+1))

    # Second moments and their bias-corrected versions.
    vw_past[layer] = beta2*vw_past[layer] + (1-beta2)*grad_w**2
    vb_past[layer] = beta2*vb_past[layer] + (1-beta2)*grad_b**2
    vw_hat = vw_past[layer]/(1 - beta2**(t+1))
    vb_hat = vb_past[layer]/(1 - beta2**(t+1))

    # Mix of the corrected momentum and the corrected current gradient.
    step_w = beta1*mw_hat + ((1-beta1)/(1 - beta1**(t+1)))*grad_w
    step_b = beta1*mb_hat + ((1-beta1)/(1 - beta1**(t+1)))*grad_b

    W_matrices[layer] = W_matrices[layer] - learning_rate*step_w/(np.sqrt(vw_hat) + eps)
    b_vectors[layer] = b_vectors[layer] - learning_rate*step_b/(np.sqrt(vb_hat) + eps)

  return W_matrices, b_vectors, mw_past, mb_past, vw_past, vb_past

def look_ahead_nag(W_s, b_s, u_past_w, u_past_b, beta = 0.5):
  """
  Shift every parameter by -beta * (its momentum history).

  NOTE: the entries of the W_s and b_s lists are replaced in place, so the
  returned lists are the very same list objects that were passed in.
  """
  n_layers = len(W_s)
  for layer in range(n_layers):
    shift_w = beta*u_past_w[layer]
    shift_b = beta*u_past_b[layer]
    W_s[layer] = W_s[layer] - shift_w
    b_s[layer] = b_s[layer] - shift_b
  return W_s, b_s

################################################################################


def one_hot_encode(integers, num_classes=None):
  """
  One-hot encode integer class labels.

  integers = integer labels used to pick rows of an identity matrix
  num_classes = number of classes; defaults to (largest label + 1)

  Returns: array with one row per label and num_classes columns
  """
  if num_classes is None:
    num_classes = np.max(integers) + 1
  identity = np.eye(num_classes)
  return identity[integers]

def cross_entropy_loss(y_true, y_pred, batch_size):
  """
  Cross-entropy loss: -sum(y_true * log(y_pred)) / batch_size.

  Predictions are clipped to [1e-15, 1 - 1e-15] first so that log never
  receives exactly 0.
  """
  safe_pred = np.clip(y_pred, 1e-15, 1 - 1e-15)
  # Sum over axis 0 first, then over the remaining values.
  per_column = np.sum(y_true*np.log(safe_pred), axis=0)
  mean_log_likelihood = np.sum(per_column)/batch_size
  return -mean_log_likelihood



def accuracy(y_true, y_pred, batch_size):
  """
  Percentage of the first batch_size columns whose argmax agrees between
  y_true and y_pred.
  """
  n_correct = sum(
      1
      for col in range(batch_size)
      if y_true[:, col].argmax() == y_pred[:, col].argmax()
  )
  return 100 * n_correct / batch_size

################################################################################

def load_split_dataset(test_ratio=0.3):
  """
  Load Fashion MNIST and return it as column-major matrices.

  The official training set is split (random_state=42) into a training part and
  a validation part; test_ratio is the fraction that goes to the validation
  part. Images are flattened to one column per sample and divided by 255;
  labels are one-hot encoded with 10 classes, also one column per sample.

  Returns: X_train, Y_train, X_val, Y_val, X_test, Y_test
  """
  def to_columns(images, labels):
    # One flattened, rescaled image per column and one one-hot label per column.
    n_samples = images.shape[0]
    features = (images.reshape(n_samples, -1).T)/255
    targets = one_hot_encode(labels, 10).T
    return features, targets

  (train_images, train_labels), (test_images, test_labels) = fashion_mnist.load_data()

  # Hold out part of the official training set for validation.
  X_train, X_val, Y_train, Y_val = train_test_split(train_images, train_labels, test_size=test_ratio, random_state=42)

  X_train, Y_train = to_columns(X_train, Y_train)
  X_val, Y_val = to_columns(X_val, Y_val)
  X_test, Y_test = to_columns(test_images, test_labels)

  return X_train, Y_train, X_val, Y_val, X_test, Y_test


In [None]:

def train_model(X_train,Y_train, X_test, Y_test, epoch=1,batch_size=25, num_neurons_hidden = [10], activation_functions = ['sigmoid'],
                weights_init_type='random', optimizer = 'sgd', learning_rate = 0.1, opti_beta = [0.5, 0.5], w_d = 0, plot_acc_loss = False):

  """
  Train a fully connected network with a softmax output layer using mini-batch
  updates, printing and logging (wandb) loss/accuracy before training and after
  every epoch.

  X has shape (number of features, number of samples in the data set)
  Y has shape (number of classes, number of samples in the data set)
  X_test / Y_test are the held-out split that is evaluated alongside the
  training split (logged as 'val_loss' / 'val_accuracy').

  epoch = number of passes over the training data
  batch_size = samples per update; a trailing partial batch is not used
  num_neurons_hidden = list of number of neurons at each hidden layer
  activation_functions = list with the activation name of each hidden layer
                         ('softmax' is appended for the output layer)
  weights_init_type = 'random' or 'Xavier'
  optimizer = 'sgd', 'momentum', 'nag', 'adagrad', 'rmsprop', 'adam' or 'nadam'
  learning_rate = step size passed to the optimizer
  opti_beta = [beta, beta2]; beta is used by momentum / nag / rmsprop and as
              beta1 of adam / nadam, beta2 only by adam / nadam
  w_d = weight decay coefficient forwarded to back_propagation
  plot_acc_loss = when truthy, plot accuracy and loss curves after training

  Raises: ValueError if optimizer is not one of the supported names
          (previously an unknown name silently skipped every update)

  The list defaults are never mutated inside this function.
  """
  supported_optimizers = ('sgd', 'momentum', 'nag', 'adagrad', 'rmsprop', 'adam', 'nadam')
  if optimizer not in supported_optimizers:
    raise ValueError("optimizer invalid: {!r} (expected one of {})".format(optimizer, supported_optimizers))

  num_ip_neurons = X_train.shape[0]
  num_op_neurons = Y_train.shape[0]
  num_neurons = [num_ip_neurons] + num_neurons_hidden + [num_op_neurons]
  activation_functions = activation_functions + ['softmax']

  W_s, b_s = weights_initialization(num_neurons, weights_init_type)
  print('Hyper parameters: \n')
  print("Weight initialization type : ", weights_init_type)
  print("Optimizer : ", optimizer)
  print("Learning rate (initial): ", learning_rate)
  print("Batch size: ", batch_size)
  print("-------------------")
  print("Architecture Description:\n")

  for i in range(len(num_neurons)-1):
    print("Layer: ", i+1, " ; number of neurons: ", num_neurons[i+1], " ; activation function: ", activation_functions[i])
    print("Weight matrix dimention", W_s[i].shape, "Bias vector dimention", b_s[i].shape)
    print("----------------")

  num_batches = np.floor(X_train.shape[1]/batch_size)
  print(num_batches)

  # Optimizer state: one history per parameter, all starting at zero.
  if optimizer in ('momentum', 'nag', 'rmsprop', 'adagrad'):
    u_past_w = [x * 0 for x in W_s]
    u_past_b = [x * 0 for x in b_s]
  elif optimizer in ('adam', 'nadam'):
    mw_past = [x * 0 for x in W_s]
    mb_past = [x * 0 for x in b_s]
    vw_past = [x * 0 for x in W_s]
    vb_past = [x * 0 for x in b_s]
    t = 1

  print('\n Start of training')

  # Metrics of the untrained network form the first point of every curve.
  loss_tr, acc_tr = _evaluate_split(X_train, Y_train, W_s, b_s, activation_functions)
  loss_ts, acc_ts = _evaluate_split(X_test, Y_test, W_s, b_s, activation_functions)

  print("Training Loss: ", loss_tr, "Training Accuracy: ", acc_tr, "Test Loss: ", loss_ts, "Testing Accuracy: ", acc_ts)

  wandb.log({'tr_loss' : loss_tr, 'tr_accuracy' : acc_tr, 'val_loss' : loss_ts, 'val_accuracy' : acc_ts})

  train_loss = [loss_tr]
  train_acc = [acc_tr]
  val_loss = [loss_ts]
  val_acc = [acc_ts]

  for i in range(epoch):
    print('Epoch: ', i+1)

    for j in tqdm(range(int(num_batches))):
      batch_X = X_train[:,j*batch_size:(j+1)*batch_size]
      batch_Y = Y_train[:,j*batch_size:(j+1)*batch_size]

      if optimizer == 'nag':
        # Gradients are evaluated at the look-ahead point W - learning_rate*beta*u.
        # The histories do not include the learning rate (the update is
        # W - learning_rate*u), so it is folded into the look-ahead factor.
        # look_ahead_nag replaces the entries of the lists it receives, so it
        # gets shallow copies: W_s / b_s must stay at the current parameters,
        # otherwise the momentum update below is applied on top of the shift.
        grad_W, grad_b = look_ahead_nag(list(W_s), list(b_s), u_past_w, u_past_b, learning_rate*opti_beta[0])
      else:
        grad_W, grad_b = W_s, b_s

      ip, op = forward_propagation(batch_X, grad_W, grad_b, activation_functions)
      DWs, Dbs = back_propagation(grad_W, grad_b, batch_Y, ip, op, activation_functions, batch_size, w_d)

      if optimizer == 'sgd':
        W_s, b_s = update_weights_gd(W_s, b_s, DWs, Dbs, learning_rate)

      elif optimizer in ('momentum', 'nag'):
        W_s, b_s, u_past_w, u_past_b = update_weights_momentum(W_s, b_s, DWs, Dbs, u_past_w, u_past_b, learning_rate, opti_beta[0])

      elif optimizer == 'adagrad':
        W_s, b_s, u_past_w, u_past_b = update_weights_adagrad(W_s, b_s, DWs, Dbs, u_past_w, u_past_b, learning_rate)

      elif optimizer == 'rmsprop':
        W_s, b_s, u_past_w, u_past_b = update_weights_rmsprop(W_s, b_s, DWs, Dbs, u_past_w, u_past_b, learning_rate, opti_beta[0])

      elif optimizer == 'adam':
        W_s, b_s, mw_past, mb_past, vw_past, vb_past = update_weights_adam(W_s, b_s, DWs, Dbs, mw_past, mb_past, vw_past, vb_past, t, learning_rate, opti_beta[0], opti_beta[1])
        t = t + 1

      elif optimizer == 'nadam':
        W_s, b_s, mw_past, mb_past, vw_past, vb_past = update_weights_nadam(W_s, b_s, DWs, Dbs, mw_past, mb_past, vw_past, vb_past, t, learning_rate, opti_beta[0], opti_beta[1])
        t = t + 1

    loss_tr, acc_tr = _evaluate_split(X_train, Y_train, W_s, b_s, activation_functions)
    loss_ts, acc_ts = _evaluate_split(X_test, Y_test, W_s, b_s, activation_functions)

    print("Training Loss: ", loss_tr, "Training Accuracy: ", acc_tr, "Test Loss: ", loss_ts, "Testing Accuracy: ", acc_ts)

    train_loss.append(loss_tr)
    train_acc.append(acc_tr)
    val_loss.append(loss_ts)
    val_acc.append(acc_ts)

    wandb.log({'tr_loss' : loss_tr, 'tr_accuracy' : acc_tr, 'val_loss' : loss_ts, 'val_accuracy' : acc_ts})

  if plot_acc_loss:
    _plot_history("Accuracy", train_acc, val_acc)
    _plot_history("Loss", train_loss, val_loss)


def _evaluate_split(X, Y, W_s, b_s, activation_functions):
  # Loss and accuracy of the given parameters on one complete data split.
  _, layer_outputs = forward_propagation(X, W_s, b_s, activation_functions)
  predictions = layer_outputs[-1]
  loss = cross_entropy_loss(Y, predictions, X.shape[1])
  acc = accuracy(Y, predictions, Y.shape[1])
  return loss, acc


def _plot_history(title, train_values, val_values):
  # Plot the training and validation curve of one metric; index 0 is the value before training.
  fig, ax = plt.subplots()
  x_axis = np.arange(0, len(train_values), 1)
  ax.plot(x_axis, train_values, color='r', label='training')
  ax.plot(x_axis, val_values, color='g', label='validation')
  ax.set_title(title)
  ax.legend()
  plt.grid()
  plt.show()




# Sweep

In [None]:
# Bayesian hyper-parameter sweep definition.
# The metric name must be a key that the training code passes to wandb.log;
# the training loop logs 'val_accuracy' (the former 'valid accuracy' was never
# logged, so the sweep had no objective to optimise).
sweep_config = {
    'method': 'bayes',
    'name' : 'Bayesian_sweep_cross_entropy',
    'metric': {
      'name': 'val_accuracy',
      'goal': 'maximize'
    },
    'parameters': {
        'epochs': {
            'values': [5, 10]
        },
        # number of hidden layers
        'num_layers': {
            'values': [3, 4, 5]
        },
        # neurons per hidden layer
        'hidden_size': {
            'values': [32, 64, 128]
        },
        'weight_decay': {
            'values': [0, 0.0005, 0.5]
        },
        'learning_rate': {
            'values': [0.001, 0.0001]
        },
        'optimizer': {
            'values': ['sgd', 'momentum', 'nag', 'rmsprop', 'adam', 'nadam']
        },
        'batch_size': {
            'values': [16, 32, 64]
        },
        'weight_init': {
            'values': ['random', 'Xavier']
        },
        'activation': {
            'values': ['sigmoid', 'tanh', 'ReLU']
        },
    }
}

# Register the sweep; the returned id is what wandb.agent needs to pull runs.
sweep_id = wandb.sweep(sweep = sweep_config, project = 'dl_assgn_1_q_4')

Create sweep with ID: 1dhp5ky6
Sweep URL: https://wandb.ai/arunangshudutta218-iitm/dl_assgn_1_q_4/sweeps/1dhp5ky6


In [None]:
def main():
  """
  Entry point of a single sweep run: read the hyper-parameters chosen for this
  run from wandb.config, name the run after them and train on the module-level
  X_train / Y_train with X_val / Y_val as the evaluation split.
  """
  with wandb.init() as run:

    cfg = wandb.config
    epochs = cfg.epochs
    nhl = cfg.num_layers
    sz = cfg.hidden_size
    w_d = cfg.weight_decay
    lr = cfg.learning_rate
    optimizer = cfg.optimizer
    b_sz = cfg.batch_size
    weight_init = cfg.weight_init
    act_fun = cfg.activation

    # Every hidden layer has the same width and the same activation.
    neuros_num = [sz] * nhl
    act_func = [act_fun] * nhl

    wandb.run.name = "e_{}_hl_{}_hs_{}_lr_{}_opt_{}_bs_{}_init_{}_ac_{}_l2_{}".format(epochs, nhl, sz, lr, optimizer, b_sz, weight_init, act_fun, w_d)

    train_model(
        X_train, Y_train, X_val, Y_val,
        epoch=epochs,
        batch_size=b_sz,
        num_neurons_hidden=neuros_num,
        activation_functions=act_func,
        weights_init_type=weight_init,
        optimizer=optimizer,
        learning_rate=lr,
        opti_beta=[0.5, 0.5],
        w_d=w_d,
    )


# Load the data once at module level; main() reads X_train / Y_train / X_val /
# Y_val as globals. X_test / Y_test are loaded but not used by the sweep.
X_train, Y_train, X_val, Y_val, X_test, Y_test = load_split_dataset()

# Let the agent execute main() for the registered sweep (count = 150 runs
# requested), then close the wandb session.
wandb.agent(sweep_id, function = main, count = 150)
wandb.finish()

[34m[1mwandb[0m: Agent Starting Run: kqmm9pgr with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  sgd
Learning rate (initial):  0.0001
Batch size:  16
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  128  ; activation function:  tanh
Weight matrix dimention (128, 784) Bias vector dimention (128, 1)
----------------
Layer:  2  ; number of neurons:  128  ; activation function:  tanh
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  3  ; number of neurons:  128  ; activation function:  tanh
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  4  ; number of neurons:  128  ; activation function:  tanh
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  5  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 128) Bias vector dimention (10, 1)
----------------
2625.0

 Start of training
Training Loss:  2.3620488931413965 Training Accuracy:

100%|██████████| 2625/2625 [00:12<00:00, 210.17it/s]


Training Loss:  1.721154003803654 Training Accuracy:  51.404761904761905 Test Loss:  1.7206536553215745 Testing Accuracy:  51.62222222222222
Epoch:  2


100%|██████████| 2625/2625 [00:13<00:00, 194.52it/s]


Training Loss:  1.4199340461747267 Training Accuracy:  60.87380952380953 Test Loss:  1.4208397848028294 Testing Accuracy:  60.59444444444444
Epoch:  3


100%|██████████| 2625/2625 [00:12<00:00, 215.28it/s]


Training Loss:  1.2503125144988751 Training Accuracy:  64.02619047619048 Test Loss:  1.252096188823511 Testing Accuracy:  63.92777777777778
Epoch:  4


100%|██████████| 2625/2625 [00:12<00:00, 212.93it/s]


Training Loss:  1.14227386989184 Training Accuracy:  65.4452380952381 Test Loss:  1.1444927202640365 Testing Accuracy:  65.53888888888889
Epoch:  5


100%|██████████| 2625/2625 [00:11<00:00, 220.88it/s]


Training Loss:  1.0668924775304833 Training Accuracy:  66.45952380952382 Test Loss:  1.0693090908358294 Testing Accuracy:  66.46666666666667


0,1
tr_accuracy,▁▆▇███
tr_loss,█▅▃▂▁▁
val_accuracy,▁▆▇███
val_loss,█▅▃▂▁▁

0,1
tr_accuracy,66.45952
tr_loss,1.06689
val_accuracy,66.46667
val_loss,1.06931


[34m[1mwandb[0m: Agent Starting Run: des1otxo with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: random


Hyper parameters: 

Weight initialization type :  random
Optimizer :  adam
Learning rate (initial):  0.0001
Batch size:  16
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  32  ; activation function:  ReLU
Weight matrix dimention (32, 784) Bias vector dimention (32, 1)
----------------
Layer:  2  ; number of neurons:  32  ; activation function:  ReLU
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  3  ; number of neurons:  32  ; activation function:  ReLU
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  4  ; number of neurons:  32  ; activation function:  ReLU
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  5  ; number of neurons:  32  ; activation function:  ReLU
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  6  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 

100%|██████████| 2625/2625 [00:04<00:00, 549.25it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  2


100%|██████████| 2625/2625 [00:06<00:00, 376.78it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  3


100%|██████████| 2625/2625 [00:04<00:00, 583.34it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  4


100%|██████████| 2625/2625 [00:06<00:00, 434.64it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  5


100%|██████████| 2625/2625 [00:04<00:00, 554.39it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  6


100%|██████████| 2625/2625 [00:04<00:00, 562.37it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  7


100%|██████████| 2625/2625 [00:07<00:00, 359.68it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  8


100%|██████████| 2625/2625 [00:04<00:00, 583.39it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  9


100%|██████████| 2625/2625 [00:07<00:00, 370.97it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  10


100%|██████████| 2625/2625 [00:04<00:00, 568.11it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778


0,1
tr_accuracy,▁██████████
val_accuracy,█▁▁▁▁▁▁▁▁▁▁

0,1
tr_accuracy,10.07381
tr_loss,
val_accuracy,9.82778
val_loss,


[34m[1mwandb[0m: Agent Starting Run: lcvmmber with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  adam
Learning rate (initial):  0.0001
Batch size:  32
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  128  ; activation function:  sigmoid
Weight matrix dimention (128, 784) Bias vector dimention (128, 1)
----------------
Layer:  2  ; number of neurons:  128  ; activation function:  sigmoid
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  3  ; number of neurons:  128  ; activation function:  sigmoid
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  4  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 128) Bias vector dimention (10, 1)
----------------
1312.0

 Start of training
Training Loss:  2.784593050664683 Training Accuracy:  9.969047619047618 Test Loss:  2.7808400534966746 Testing Accuracy:  10.072222222222223
Epoch:  1


100%|██████████| 1312/1312 [00:07<00:00, 177.49it/s]


Training Loss:  1.1877731534194327 Training Accuracy:  63.18809523809524 Test Loss:  1.1891087352269987 Testing Accuracy:  62.977777777777774
Epoch:  2


100%|██████████| 1312/1312 [00:08<00:00, 148.42it/s]


Training Loss:  0.8106531850942679 Training Accuracy:  72.41904761904762 Test Loss:  0.8140633630009685 Testing Accuracy:  71.88333333333334
Epoch:  3


100%|██████████| 1312/1312 [00:10<00:00, 129.45it/s]


Training Loss:  0.6475722958989861 Training Accuracy:  76.42142857142858 Test Loss:  0.6515482505514768 Testing Accuracy:  76.3
Epoch:  4


100%|██████████| 1312/1312 [00:09<00:00, 140.70it/s]


Training Loss:  0.5729919218353101 Training Accuracy:  78.95952380952382 Test Loss:  0.5764011343788581 Testing Accuracy:  79.0111111111111
Epoch:  5


100%|██████████| 1312/1312 [00:07<00:00, 177.91it/s]


Training Loss:  0.5294324711577127 Training Accuracy:  81.11428571428571 Test Loss:  0.5325619816727941 Testing Accuracy:  80.97777777777777


0,1
tr_accuracy,▁▆▇███
tr_loss,█▃▂▁▁▁
val_accuracy,▁▆▇███
val_loss,█▃▂▁▁▁

0,1
tr_accuracy,81.11429
tr_loss,0.52943
val_accuracy,80.97778
val_loss,0.53256


[34m[1mwandb[0m: Agent Starting Run: j6ljxbq0 with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: rmsprop
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: random


Hyper parameters: 

Weight initialization type :  random
Optimizer :  rmsprop
Learning rate (initial):  0.001
Batch size:  16
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  32  ; activation function:  tanh
Weight matrix dimention (32, 784) Bias vector dimention (32, 1)
----------------
Layer:  2  ; number of neurons:  32  ; activation function:  tanh
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  3  ; number of neurons:  32  ; activation function:  tanh
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  4  ; number of neurons:  32  ; activation function:  tanh
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  5  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 32) Bias vector dimention (10, 1)
----------------
2625.0

 Start of training
Training Loss:  10.73087750710737 Training Accuracy:  11.021428571

100%|██████████| 2625/2625 [00:04<00:00, 591.57it/s]


Training Loss:  0.7323988373086111 Training Accuracy:  76.32619047619048 Test Loss:  0.7377299729155486 Testing Accuracy:  76.22777777777777
Epoch:  2


100%|██████████| 2625/2625 [00:05<00:00, 462.76it/s]


Training Loss:  0.7439235984522955 Training Accuracy:  75.42142857142858 Test Loss:  0.7502826607485729 Testing Accuracy:  75.43888888888888
Epoch:  3


100%|██████████| 2625/2625 [00:03<00:00, 671.12it/s]


Training Loss:  0.7394900569353224 Training Accuracy:  75.72142857142858 Test Loss:  0.7461699782566494 Testing Accuracy:  75.66111111111111
Epoch:  4


100%|██████████| 2625/2625 [00:03<00:00, 706.05it/s]


Training Loss:  0.735800088067281 Training Accuracy:  75.71428571428571 Test Loss:  0.7426247776719745 Testing Accuracy:  75.5
Epoch:  5


100%|██████████| 2625/2625 [00:06<00:00, 395.27it/s]


Training Loss:  0.7341281079399932 Training Accuracy:  75.81428571428572 Test Loss:  0.7410560741219407 Testing Accuracy:  75.77777777777777


0,1
tr_accuracy,▁█████
tr_loss,█▁▁▁▁▁
val_accuracy,▁█████
val_loss,█▁▁▁▁▁

0,1
tr_accuracy,75.81429
tr_loss,0.73413
val_accuracy,75.77778
val_loss,0.74106


[34m[1mwandb[0m: Agent Starting Run: spi2sy4e with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  adam
Learning rate (initial):  0.0001
Batch size:  32
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  64  ; activation function:  sigmoid
Weight matrix dimention (64, 784) Bias vector dimention (64, 1)
----------------
Layer:  2  ; number of neurons:  64  ; activation function:  sigmoid
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  3  ; number of neurons:  64  ; activation function:  sigmoid
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  4  ; number of neurons:  64  ; activation function:  sigmoid
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  5  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 64) Bias vector dimention (10, 1)
----------------
1312.0

 Start of training
Training Loss:  2.5256072585482965 Training Accuracy:  9

100%|██████████| 1312/1312 [00:06<00:00, 192.92it/s]


Training Loss:  1.7338388450189977 Training Accuracy:  25.785714285714285 Test Loss:  1.735910919159919 Testing Accuracy:  26.511111111111113
Epoch:  2


100%|██████████| 1312/1312 [00:04<00:00, 303.79it/s]


Training Loss:  1.4794986359807836 Training Accuracy:  44.0 Test Loss:  1.480611895115562 Testing Accuracy:  44.53888888888889
Epoch:  3


100%|██████████| 1312/1312 [00:05<00:00, 223.28it/s]


Training Loss:  1.2683448990959074 Training Accuracy:  48.70238095238095 Test Loss:  1.2696352395451331 Testing Accuracy:  48.766666666666666
Epoch:  4


100%|██████████| 1312/1312 [00:04<00:00, 306.80it/s]


Training Loss:  1.1249192570931923 Training Accuracy:  52.24761904761905 Test Loss:  1.1244564597606865 Testing Accuracy:  52.833333333333336
Epoch:  5


100%|██████████| 1312/1312 [00:04<00:00, 280.15it/s]


Training Loss:  1.018637431883004 Training Accuracy:  61.10476190476191 Test Loss:  1.0174761712993003 Testing Accuracy:  61.08888888888889


0,1
tr_accuracy,▁▃▆▆▇█
tr_loss,█▄▃▂▁▁
val_accuracy,▁▃▆▆▇█
val_loss,█▄▃▂▁▁

0,1
tr_accuracy,61.10476
tr_loss,1.01864
val_accuracy,61.08889
val_loss,1.01748


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: bmsb6be7 with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  sgd
Learning rate (initial):  0.001
Batch size:  64
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  32  ; activation function:  sigmoid
Weight matrix dimention (32, 784) Bias vector dimention (32, 1)
----------------
Layer:  2  ; number of neurons:  32  ; activation function:  sigmoid
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  3  ; number of neurons:  32  ; activation function:  sigmoid
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  4  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 32) Bias vector dimention (10, 1)
----------------
656.0

 Start of training
Training Loss:  2.465894634635392 Training Accuracy:  9.969047619047618 Test Loss:  2.47143810928384 Testing Accuracy:  10.072222222222223
Epoch:  1


100%|██████████| 656/656 [00:00<00:00, 710.74it/s]


Training Loss:  2.348960413699746 Training Accuracy:  9.969047619047618 Test Loss:  2.351931148965011 Testing Accuracy:  10.072222222222223
Epoch:  2


100%|██████████| 656/656 [00:00<00:00, 742.42it/s]


Training Loss:  2.3144778119855323 Training Accuracy:  11.092857142857143 Test Loss:  2.3160993899691698 Testing Accuracy:  11.161111111111111
Epoch:  3


100%|██████████| 656/656 [00:00<00:00, 734.78it/s]


Training Loss:  2.304299559534609 Training Accuracy:  14.05952380952381 Test Loss:  2.305220712967914 Testing Accuracy:  13.755555555555556
Epoch:  4


100%|██████████| 656/656 [00:00<00:00, 723.57it/s]


Training Loss:  2.3011118615981476 Training Accuracy:  18.04047619047619 Test Loss:  2.3016710880145874 Testing Accuracy:  17.738888888888887
Epoch:  5


100%|██████████| 656/656 [00:00<00:00, 727.16it/s]


Training Loss:  2.2998335061533592 Training Accuracy:  19.135714285714286 Test Loss:  2.3002071766720853 Testing Accuracy:  18.8
Epoch:  6


100%|██████████| 656/656 [00:00<00:00, 752.54it/s]


Training Loss:  2.29904923251053 Training Accuracy:  19.988095238095237 Test Loss:  2.299328910340058 Testing Accuracy:  19.666666666666668
Epoch:  7


100%|██████████| 656/656 [00:00<00:00, 730.31it/s]


Training Loss:  2.298383332837855 Training Accuracy:  20.68095238095238 Test Loss:  2.298616393636196 Testing Accuracy:  20.288888888888888
Epoch:  8


100%|██████████| 656/656 [00:02<00:00, 310.93it/s]


Training Loss:  2.297738891267641 Training Accuracy:  20.95952380952381 Test Loss:  2.2979497965107125 Testing Accuracy:  20.544444444444444
Epoch:  9


100%|██████████| 656/656 [00:02<00:00, 300.14it/s]


Training Loss:  2.2970909456053956 Training Accuracy:  21.169047619047618 Test Loss:  2.2972923156703526 Testing Accuracy:  20.85
Epoch:  10


100%|██████████| 656/656 [00:00<00:00, 719.43it/s]


Training Loss:  2.296432624783391 Training Accuracy:  21.55952380952381 Test Loss:  2.2966309756187644 Testing Accuracy:  21.255555555555556


0,1
tr_accuracy,▁▁▂▃▆▇▇▇███
tr_loss,█▃▂▁▁▁▁▁▁▁▁
val_accuracy,▁▁▂▃▆▆▇▇███
val_loss,█▃▂▁▁▁▁▁▁▁▁

0,1
tr_accuracy,21.55952
tr_loss,2.29643
val_accuracy,21.25556
val_loss,2.29663


[34m[1mwandb[0m: Agent Starting Run: 532jjj71 with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: random


Hyper parameters: 

Weight initialization type :  random
Optimizer :  adam
Learning rate (initial):  0.001
Batch size:  16
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  128  ; activation function:  tanh
Weight matrix dimention (128, 784) Bias vector dimention (128, 1)
----------------
Layer:  2  ; number of neurons:  128  ; activation function:  tanh
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  3  ; number of neurons:  128  ; activation function:  tanh
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  4  ; number of neurons:  128  ; activation function:  tanh
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  5  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 128) Bias vector dimention (10, 1)
----------------
2625.0

 Start of training
Training Loss:  15.004716905481631 Training Accuracy:

100%|██████████| 2625/2625 [00:20<00:00, 128.70it/s]


Training Loss:  2.376422634101403 Training Accuracy:  49.076190476190476 Test Loss:  2.414137635791269 Testing Accuracy:  48.7
Epoch:  2


100%|██████████| 2625/2625 [00:22<00:00, 119.19it/s]


Training Loss:  1.2102415441431884 Training Accuracy:  61.84285714285714 Test Loss:  1.207257833095189 Testing Accuracy:  61.92777777777778
Epoch:  3


100%|██████████| 2625/2625 [00:20<00:00, 130.81it/s]


Training Loss:  1.039140117543935 Training Accuracy:  65.44761904761904 Test Loss:  1.0569139085559691 Testing Accuracy:  65.05555555555556
Epoch:  4


100%|██████████| 2625/2625 [00:20<00:00, 128.86it/s]


Training Loss:  0.9047869733845258 Training Accuracy:  70.21190476190476 Test Loss:  0.9182010384145356 Testing Accuracy:  69.96666666666667
Epoch:  5


100%|██████████| 2625/2625 [00:22<00:00, 119.18it/s]


Training Loss:  0.8374287528829524 Training Accuracy:  71.63571428571429 Test Loss:  0.8629220546473145 Testing Accuracy:  70.89444444444445
Epoch:  6


100%|██████████| 2625/2625 [00:19<00:00, 131.83it/s]


Training Loss:  0.835942218038087 Training Accuracy:  72.34285714285714 Test Loss:  0.8651242581623407 Testing Accuracy:  72.33888888888889
Epoch:  7


100%|██████████| 2625/2625 [00:20<00:00, 128.79it/s]


Training Loss:  0.7666184063489804 Training Accuracy:  74.26904761904763 Test Loss:  0.8028159716633059 Testing Accuracy:  73.71666666666667
Epoch:  8


100%|██████████| 2625/2625 [00:21<00:00, 119.95it/s]


Training Loss:  0.7297370920938886 Training Accuracy:  75.91428571428571 Test Loss:  0.7543463572127118 Testing Accuracy:  75.61666666666666
Epoch:  9


100%|██████████| 2625/2625 [00:19<00:00, 136.45it/s]


Training Loss:  0.695681910502822 Training Accuracy:  78.39761904761905 Test Loss:  0.7033351401045118 Testing Accuracy:  78.13888888888889
Epoch:  10


100%|██████████| 2625/2625 [00:22<00:00, 118.96it/s]


Training Loss:  0.6409494308835414 Training Accuracy:  79.2547619047619 Test Loss:  0.6681716892993392 Testing Accuracy:  79.06111111111112


0,1
tr_accuracy,▁▅▆▇▇▇▇▇███
tr_loss,█▂▁▁▁▁▁▁▁▁▁
val_accuracy,▁▅▆▇▇▇▇▇███
val_loss,█▂▁▁▁▁▁▁▁▁▁

0,1
tr_accuracy,79.25476
tr_loss,0.64095
val_accuracy,79.06111
val_loss,0.66817


[34m[1mwandb[0m: Agent Starting Run: z4yak340 with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: momentum
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: random


Hyper parameters: 

Weight initialization type :  random
Optimizer :  momentum
Learning rate (initial):  0.0001
Batch size:  64
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  32  ; activation function:  sigmoid
Weight matrix dimention (32, 784) Bias vector dimention (32, 1)
----------------
Layer:  2  ; number of neurons:  32  ; activation function:  sigmoid
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  3  ; number of neurons:  32  ; activation function:  sigmoid
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  4  ; number of neurons:  32  ; activation function:  sigmoid
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  5  ; number of neurons:  32  ; activation function:  sigmoid
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  6  ; number of neurons:  10  ; activation function:  softmax
Weight mat

100%|██████████| 656/656 [00:02<00:00, 258.96it/s]


Training Loss:  5.313310872420036 Training Accuracy:  6.79047619047619 Test Loss:  5.305280654054661 Testing Accuracy:  6.888888888888889
Epoch:  2


100%|██████████| 656/656 [00:01<00:00, 485.24it/s]


Training Loss:  4.476672332268963 Training Accuracy:  8.928571428571429 Test Loss:  4.472094253024557 Testing Accuracy:  9.066666666666666
Epoch:  3


100%|██████████| 656/656 [00:01<00:00, 477.13it/s]


Training Loss:  3.8970202873301787 Training Accuracy:  11.64047619047619 Test Loss:  3.89493608689235 Testing Accuracy:  11.566666666666666
Epoch:  4


100%|██████████| 656/656 [00:01<00:00, 502.71it/s]


Training Loss:  3.4827388008736646 Training Accuracy:  12.345238095238095 Test Loss:  3.4822795816524885 Testing Accuracy:  12.422222222222222
Epoch:  5


100%|██████████| 656/656 [00:01<00:00, 487.30it/s]


Training Loss:  3.1845521021759624 Training Accuracy:  11.302380952380952 Test Loss:  3.185007778867187 Testing Accuracy:  11.266666666666667


0,1
tr_accuracy,▃▁▄▇█▇
tr_loss,█▅▄▂▂▁
val_accuracy,▃▁▄▇█▇
val_loss,█▅▄▂▂▁

0,1
tr_accuracy,11.30238
tr_loss,3.18455
val_accuracy,11.26667
val_loss,3.18501


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: fj6fpi5w with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: random


Hyper parameters: 

Weight initialization type :  random
Optimizer :  nadam
Learning rate (initial):  0.0001
Batch size:  32
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 784) Bias vector dimention (64, 1)
----------------
Layer:  2  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  3  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  4  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  5  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  6  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10,

100%|██████████| 1312/1312 [00:06<00:00, 207.33it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  2


100%|██████████| 1312/1312 [00:04<00:00, 280.81it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  3


100%|██████████| 1312/1312 [00:07<00:00, 171.53it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  4


100%|██████████| 1312/1312 [00:04<00:00, 271.42it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  5


100%|██████████| 1312/1312 [00:07<00:00, 177.78it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  6


100%|██████████| 1312/1312 [00:05<00:00, 262.21it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  7


100%|██████████| 1312/1312 [00:07<00:00, 179.49it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  8


100%|██████████| 1312/1312 [00:04<00:00, 268.30it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  9


100%|██████████| 1312/1312 [00:05<00:00, 235.24it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  10


100%|██████████| 1312/1312 [00:05<00:00, 230.13it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778


0,1
tr_accuracy,▁██████████
val_accuracy,▁██████████

0,1
tr_accuracy,10.07381
tr_loss,
val_accuracy,9.82778
val_loss,


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: w4xv8iot with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  sgd
Learning rate (initial):  0.0001
Batch size:  64
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 784) Bias vector dimention (64, 1)
----------------
Layer:  2  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  3  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  4  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  5  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  6  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 6

100%|██████████| 656/656 [00:01<00:00, 388.38it/s]


Training Loss:  2.3222138421336234 Training Accuracy:  10.185714285714285 Test Loss:  2.322396380333019 Testing Accuracy:  10.227777777777778
Epoch:  2


100%|██████████| 656/656 [00:02<00:00, 233.99it/s]


Training Loss:  2.3085882746815414 Training Accuracy:  10.535714285714286 Test Loss:  2.308779437824651 Testing Accuracy:  10.527777777777779
Epoch:  3


100%|██████████| 656/656 [00:02<00:00, 255.28it/s]


Training Loss:  2.297647764505708 Training Accuracy:  11.34047619047619 Test Loss:  2.297845673389029 Testing Accuracy:  11.411111111111111
Epoch:  4


100%|██████████| 656/656 [00:01<00:00, 393.60it/s]


Training Loss:  2.288550346352683 Training Accuracy:  12.866666666666667 Test Loss:  2.2887240531634827 Testing Accuracy:  12.883333333333333
Epoch:  5


100%|██████████| 656/656 [00:01<00:00, 385.69it/s]


Training Loss:  2.2806059340346643 Training Accuracy:  14.804761904761905 Test Loss:  2.280754910147442 Testing Accuracy:  14.955555555555556


0,1
tr_accuracy,▁▁▂▃▅█
tr_loss,█▆▄▃▂▁
val_accuracy,▁▁▂▃▅█
val_loss,█▆▄▃▂▁

0,1
tr_accuracy,14.80476
tr_loss,2.28061
val_accuracy,14.95556
val_loss,2.28075


[34m[1mwandb[0m: Agent Starting Run: t8mgmvc6 with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: rmsprop
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: random


Hyper parameters: 

Weight initialization type :  random
Optimizer :  rmsprop
Learning rate (initial):  0.001
Batch size:  32
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 784) Bias vector dimention (64, 1)
----------------
Layer:  2  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  3  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  4  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 64) Bias vector dimention (10, 1)
----------------
1312.0

 Start of training
Training Loss:  nan Training Accuracy:  8.34047619047619 Test Loss:  nan Testing Accuracy:  8.433333333333334
Epoch:  1


100%|██████████| 1312/1312 [00:02<00:00, 504.67it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  2


100%|██████████| 1312/1312 [00:02<00:00, 456.51it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  3


100%|██████████| 1312/1312 [00:02<00:00, 511.52it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  4


100%|██████████| 1312/1312 [00:05<00:00, 253.82it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  5


100%|██████████| 1312/1312 [00:02<00:00, 505.28it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  6


100%|██████████| 1312/1312 [00:02<00:00, 498.22it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  7


100%|██████████| 1312/1312 [00:03<00:00, 375.60it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  8


100%|██████████| 1312/1312 [00:04<00:00, 323.59it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  9


100%|██████████| 1312/1312 [00:02<00:00, 496.86it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  10


100%|██████████| 1312/1312 [00:02<00:00, 492.73it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778


0,1
tr_accuracy,▁██████████
val_accuracy,▁██████████

0,1
tr_accuracy,10.07381
tr_loss,
val_accuracy,9.82778
val_loss,


[34m[1mwandb[0m: Agent Starting Run: pkjv1xhe with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: random


Hyper parameters: 

Weight initialization type :  random
Optimizer :  nadam
Learning rate (initial):  0.0001
Batch size:  64
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  128  ; activation function:  ReLU
Weight matrix dimention (128, 784) Bias vector dimention (128, 1)
----------------
Layer:  2  ; number of neurons:  128  ; activation function:  ReLU
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  3  ; number of neurons:  128  ; activation function:  ReLU
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  4  ; number of neurons:  128  ; activation function:  ReLU
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  5  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 128) Bias vector dimention (10, 1)
----------------
656.0

 Start of training
Training Loss:  nan Training Accuracy:  5.8523809523

100%|██████████| 656/656 [00:05<00:00, 111.51it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  2


100%|██████████| 656/656 [00:06<00:00, 107.72it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  3


100%|██████████| 656/656 [00:05<00:00, 112.57it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  4


100%|██████████| 656/656 [00:06<00:00, 101.17it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  5


100%|██████████| 656/656 [00:05<00:00, 117.81it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  6


100%|██████████| 656/656 [00:06<00:00, 105.89it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  7


100%|██████████| 656/656 [00:05<00:00, 110.44it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  8


100%|██████████| 656/656 [00:06<00:00, 108.51it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  9


100%|██████████| 656/656 [00:06<00:00, 104.98it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  10


100%|██████████| 656/656 [00:05<00:00, 119.49it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778


0,1
tr_accuracy,▁██████████
val_accuracy,▁██████████

0,1
tr_accuracy,10.07381
tr_loss,
val_accuracy,9.82778
val_loss,


[34m[1mwandb[0m: Agent Starting Run: ure4mjvy with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: random


Hyper parameters: 

Weight initialization type :  random
Optimizer :  adam
Learning rate (initial):  0.001
Batch size:  32
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 784) Bias vector dimention (64, 1)
----------------
Layer:  2  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  3  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  4  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  5  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  6  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 6

100%|██████████| 1312/1312 [00:05<00:00, 249.94it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  2


100%|██████████| 1312/1312 [00:04<00:00, 312.80it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  3


100%|██████████| 1312/1312 [00:06<00:00, 192.52it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  4


100%|██████████| 1312/1312 [00:04<00:00, 308.46it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  5


100%|██████████| 1312/1312 [00:07<00:00, 175.55it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778


0,1
tr_accuracy,▁█████
val_accuracy,█▁▁▁▁▁

0,1
tr_accuracy,10.07381
tr_loss,
val_accuracy,9.82778
val_loss,


[34m[1mwandb[0m: Agent Starting Run: 62t8ya7w with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: nag
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: random


Hyper parameters: 

Weight initialization type :  random
Optimizer :  nag
Learning rate (initial):  0.0001
Batch size:  64
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  64  ; activation function:  sigmoid
Weight matrix dimention (64, 784) Bias vector dimention (64, 1)
----------------
Layer:  2  ; number of neurons:  64  ; activation function:  sigmoid
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  3  ; number of neurons:  64  ; activation function:  sigmoid
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  4  ; number of neurons:  64  ; activation function:  sigmoid
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  5  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 64) Bias vector dimention (10, 1)
----------------
656.0

 Start of training
Training Loss:  5.6552827926960605 Training Accuracy:  11.

100%|██████████| 656/656 [00:04<00:00, 145.89it/s]


Training Loss:  0.608564602318777 Training Accuracy:  78.3547619047619 Test Loss:  0.6259679574678212 Testing Accuracy:  77.45555555555555
Epoch:  2


100%|██████████| 656/656 [00:02<00:00, 296.26it/s]


Training Loss:  0.5294480705506752 Training Accuracy:  80.94285714285714 Test Loss:  0.5523208228083171 Testing Accuracy:  80.41666666666667
Epoch:  3


100%|██████████| 656/656 [00:02<00:00, 291.92it/s]


Training Loss:  0.4784529950568467 Training Accuracy:  82.78809523809524 Test Loss:  0.513169281004348 Testing Accuracy:  82.03333333333333
Epoch:  4


100%|██████████| 656/656 [00:02<00:00, 302.20it/s]


Training Loss:  0.45021413697600554 Training Accuracy:  83.78809523809524 Test Loss:  0.49228193234770123 Testing Accuracy:  82.72222222222223
Epoch:  5


100%|██████████| 656/656 [00:04<00:00, 138.96it/s]


Training Loss:  0.43061099982693996 Training Accuracy:  84.47142857142858 Test Loss:  0.47731330616570167 Testing Accuracy:  83.08333333333333


0,1
tr_accuracy,▁▇████
tr_loss,█▁▁▁▁▁
val_accuracy,▁▇████
val_loss,█▁▁▁▁▁

0,1
tr_accuracy,84.47143
tr_loss,0.43061
val_accuracy,83.08333
val_loss,0.47731


[34m[1mwandb[0m: Agent Starting Run: jbm2yzwz with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: rmsprop
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  rmsprop
Learning rate (initial):  0.0001
Batch size:  32
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  128  ; activation function:  ReLU
Weight matrix dimention (128, 784) Bias vector dimention (128, 1)
----------------
Layer:  2  ; number of neurons:  128  ; activation function:  ReLU
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  3  ; number of neurons:  128  ; activation function:  ReLU
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  4  ; number of neurons:  128  ; activation function:  ReLU
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  5  ; number of neurons:  128  ; activation function:  ReLU
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  6  ; number of neurons:  10  ; activation function:  softmax
Weight 

100%|██████████| 1312/1312 [00:09<00:00, 138.84it/s]


Training Loss:  0.5943853201048216 Training Accuracy:  80.66904761904762 Test Loss:  0.5983481089743176 Testing Accuracy:  80.53333333333333
Epoch:  2


100%|██████████| 1312/1312 [00:07<00:00, 170.36it/s]


Training Loss:  0.5274463501849921 Training Accuracy:  82.69285714285714 Test Loss:  0.5318553586873203 Testing Accuracy:  82.54444444444445
Epoch:  3


100%|██████████| 1312/1312 [00:09<00:00, 134.41it/s]


Training Loss:  0.5078098643853888 Training Accuracy:  83.11190476190477 Test Loss:  0.5126214043000369 Testing Accuracy:  83.02222222222223
Epoch:  4


100%|██████████| 1312/1312 [00:09<00:00, 137.32it/s]


Training Loss:  0.49787048991212796 Training Accuracy:  83.37619047619047 Test Loss:  0.5032427355448184 Testing Accuracy:  83.36111111111111
Epoch:  5


100%|██████████| 1312/1312 [00:07<00:00, 174.99it/s]


Training Loss:  0.49263270662744 Training Accuracy:  83.51904761904763 Test Loss:  0.4984750363195868 Testing Accuracy:  83.38333333333334
Epoch:  6


100%|██████████| 1312/1312 [00:07<00:00, 164.12it/s]


Training Loss:  0.48857603349684825 Training Accuracy:  83.67857142857143 Test Loss:  0.49477952949499415 Testing Accuracy:  83.53333333333333
Epoch:  7


100%|██████████| 1312/1312 [00:09<00:00, 134.68it/s]


Training Loss:  0.4849177816631534 Training Accuracy:  83.77619047619048 Test Loss:  0.4913080964514938 Testing Accuracy:  83.65555555555555
Epoch:  8


100%|██████████| 1312/1312 [00:08<00:00, 151.25it/s]


Training Loss:  0.48113929388122173 Training Accuracy:  83.8952380952381 Test Loss:  0.4878992153932288 Testing Accuracy:  83.7611111111111
Epoch:  9


100%|██████████| 1312/1312 [00:07<00:00, 181.29it/s]


Training Loss:  0.4784002164322463 Training Accuracy:  84.02619047619048 Test Loss:  0.485468661403487 Testing Accuracy:  83.84444444444445
Epoch:  10


100%|██████████| 1312/1312 [00:09<00:00, 134.62it/s]


Training Loss:  0.47574321455585944 Training Accuracy:  84.11904761904762 Test Loss:  0.48321894972856544 Testing Accuracy:  83.82777777777778


0,1
tr_accuracy,▁██████████
tr_loss,█▁▁▁▁▁▁▁▁▁▁
val_accuracy,▁██████████
val_loss,█▁▁▁▁▁▁▁▁▁▁

0,1
tr_accuracy,84.11905
tr_loss,0.47574
val_accuracy,83.82778
val_loss,0.48322


[34m[1mwandb[0m: Agent Starting Run: em2hhn83 with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: nag
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  nag
Learning rate (initial):  0.001
Batch size:  64
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  128  ; activation function:  sigmoid
Weight matrix dimention (128, 784) Bias vector dimention (128, 1)
----------------
Layer:  2  ; number of neurons:  128  ; activation function:  sigmoid
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  3  ; number of neurons:  128  ; activation function:  sigmoid
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  4  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 128) Bias vector dimention (10, 1)
----------------
656.0

 Start of training
Training Loss:  2.424474623249198 Training Accuracy:  10.073809523809524 Test Loss:  2.426950524470001 Testing Accuracy:  9.827777777777778
Epoch:  1


100%|██████████| 656/656 [00:06<00:00, 103.71it/s]


Training Loss:  1.4111578729276368 Training Accuracy:  44.378571428571426 Test Loss:  1.4106551575123296 Testing Accuracy:  44.45
Epoch:  2


100%|██████████| 656/656 [00:04<00:00, 159.53it/s]


Training Loss:  1.3511264080478873 Training Accuracy:  47.77142857142857 Test Loss:  1.3505385151771807 Testing Accuracy:  47.90555555555556
Epoch:  3


100%|██████████| 656/656 [00:05<00:00, 116.46it/s]


Training Loss:  1.1973642818390908 Training Accuracy:  55.088095238095235 Test Loss:  1.1954122354006316 Testing Accuracy:  55.144444444444446
Epoch:  4


100%|██████████| 656/656 [00:03<00:00, 178.93it/s]


Training Loss:  1.0709288394039962 Training Accuracy:  61.892857142857146 Test Loss:  1.071419379832775 Testing Accuracy:  61.56666666666667
Epoch:  5


100%|██████████| 656/656 [00:03<00:00, 175.23it/s]


Training Loss:  1.0622472486876036 Training Accuracy:  61.56190476190476 Test Loss:  1.0634177885220353 Testing Accuracy:  61.15555555555556
Epoch:  6


100%|██████████| 656/656 [00:05<00:00, 117.54it/s]


Training Loss:  1.0569048371071936 Training Accuracy:  61.60476190476191 Test Loss:  1.0584379387601128 Testing Accuracy:  61.233333333333334
Epoch:  7


100%|██████████| 656/656 [00:03<00:00, 165.00it/s]


Training Loss:  1.0521414374653115 Training Accuracy:  61.84047619047619 Test Loss:  1.0538488316939285 Testing Accuracy:  61.361111111111114
Epoch:  8


100%|██████████| 656/656 [00:06<00:00, 99.31it/s] 


Training Loss:  1.0487682016155664 Training Accuracy:  61.90952380952381 Test Loss:  1.0505858228848306 Testing Accuracy:  61.46111111111111
Epoch:  9


100%|██████████| 656/656 [00:03<00:00, 177.06it/s]


Training Loss:  1.0467131839223334 Training Accuracy:  61.983333333333334 Test Loss:  1.048589654618271 Testing Accuracy:  61.55555555555556
Epoch:  10


100%|██████████| 656/656 [00:04<00:00, 133.06it/s]


Training Loss:  1.0454535770526772 Training Accuracy:  62.00952380952381 Test Loss:  1.0473685762560514 Testing Accuracy:  61.611111111111114


0,1
tr_accuracy,▁▆▆▇███████
tr_loss,█▃▃▂▁▁▁▁▁▁▁
val_accuracy,▁▆▆▇███████
val_loss,█▃▃▂▁▁▁▁▁▁▁

0,1
tr_accuracy,62.00952
tr_loss,1.04545
val_accuracy,61.61111
val_loss,1.04737


[34m[1mwandb[0m: Agent Starting Run: rxkmn4ha with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: momentum
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: random


Hyper parameters: 

Weight initialization type :  random
Optimizer :  momentum
Learning rate (initial):  0.0001
Batch size:  64
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 784) Bias vector dimention (64, 1)
----------------
Layer:  2  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  3  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  4  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  5  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  6  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (

100%|██████████| 656/656 [00:06<00:00, 107.25it/s]


Training Loss:  10.564393890794372 Training Accuracy:  11.31904761904762 Test Loss:  10.674369650045412 Testing Accuracy:  10.805555555555555
Epoch:  2


100%|██████████| 656/656 [00:03<00:00, 167.46it/s]


Training Loss:  9.947935903674782 Training Accuracy:  12.45 Test Loss:  10.087977041439434 Testing Accuracy:  11.738888888888889
Epoch:  3


100%|██████████| 656/656 [00:05<00:00, 125.33it/s]


Training Loss:  9.503587697613465 Training Accuracy:  13.345238095238095 Test Loss:  9.641773632266029 Testing Accuracy:  12.605555555555556
Epoch:  4


100%|██████████| 656/656 [00:03<00:00, 165.28it/s]


Training Loss:  9.046172902914766 Training Accuracy:  14.195238095238095 Test Loss:  9.226733247870127 Testing Accuracy:  13.438888888888888
Epoch:  5


100%|██████████| 656/656 [00:04<00:00, 142.68it/s]


Training Loss:  8.622138360689698 Training Accuracy:  15.464285714285714 Test Loss:  8.8438849371093 Testing Accuracy:  14.705555555555556


0,1
tr_accuracy,▁▃▄▅▆█
tr_loss,█▆▄▃▂▁
val_accuracy,▁▂▄▅▆█
val_loss,█▆▄▃▂▁

0,1
tr_accuracy,15.46429
tr_loss,8.62214
val_accuracy,14.70556
val_loss,8.84388


[34m[1mwandb[0m: Agent Starting Run: zqbwpggi with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: nag
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  nag
Learning rate (initial):  0.001
Batch size:  64
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  32  ; activation function:  ReLU
Weight matrix dimention (32, 784) Bias vector dimention (32, 1)
----------------
Layer:  2  ; number of neurons:  32  ; activation function:  ReLU
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  3  ; number of neurons:  32  ; activation function:  ReLU
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  4  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 32) Bias vector dimention (10, 1)
----------------
656.0

 Start of training
Training Loss:  2.2941718325170726 Training Accuracy:  10.035714285714286 Test Loss:  2.2932702167000762 Testing Accuracy:  10.03888888888889
Epoch:  1


100%|██████████| 656/656 [00:00<00:00, 723.81it/s]


Training Loss:  1.7138057800668633 Training Accuracy:  20.0 Test Loss:  1.716856637873455 Testing Accuracy:  19.66111111111111
Epoch:  2


100%|██████████| 656/656 [00:00<00:00, 664.25it/s]


Training Loss:  1.8008791326654991 Training Accuracy:  19.788095238095238 Test Loss:  1.8032817024496497 Testing Accuracy:  19.433333333333334
Epoch:  3


100%|██████████| 656/656 [00:00<00:00, 709.73it/s]


Training Loss:  2.305589264811448 Training Accuracy:  10.869047619047619 Test Loss:  2.307797395258612 Testing Accuracy:  10.455555555555556
Epoch:  4


100%|██████████| 656/656 [00:00<00:00, 750.97it/s]


Training Loss:  2.3025766359816355 Training Accuracy:  10.088095238095239 Test Loss:  2.302345322239113 Testing Accuracy:  9.994444444444444
Epoch:  5


100%|██████████| 656/656 [00:00<00:00, 748.21it/s]


Training Loss:  2.3054037844682074 Training Accuracy:  9.973809523809523 Test Loss:  2.305513140386596 Testing Accuracy:  10.066666666666666
Epoch:  6


100%|██████████| 656/656 [00:01<00:00, 344.52it/s]


Training Loss:  2.305197655910285 Training Accuracy:  10.023809523809524 Test Loss:  2.3049442855935407 Testing Accuracy:  9.938888888888888
Epoch:  7


100%|██████████| 656/656 [00:01<00:00, 363.75it/s]


Training Loss:  2.3051159333375155 Training Accuracy:  10.023809523809524 Test Loss:  2.304838728449292 Testing Accuracy:  9.938888888888888
Epoch:  8


100%|██████████| 656/656 [00:00<00:00, 748.47it/s]


Training Loss:  2.3050598050896465 Training Accuracy:  10.023809523809524 Test Loss:  2.3047830816310664 Testing Accuracy:  9.938888888888888
Epoch:  9


100%|██████████| 656/656 [00:00<00:00, 714.35it/s]


Training Loss:  2.3050193973907382 Training Accuracy:  10.023809523809524 Test Loss:  2.304755302308204 Testing Accuracy:  9.938888888888888
Epoch:  10


100%|██████████| 656/656 [00:00<00:00, 742.60it/s]


Training Loss:  2.3049894291739057 Training Accuracy:  10.023809523809524 Test Loss:  2.304743920744502 Testing Accuracy:  9.938888888888888


0,1
tr_accuracy,▁██▂▁▁▁▁▁▁▁
tr_loss,█▁▂████████
val_accuracy,▁██▁▁▁▁▁▁▁▁
val_loss,█▁▂████████

0,1
tr_accuracy,10.02381
tr_loss,2.30499
val_accuracy,9.93889
val_loss,2.30474


[34m[1mwandb[0m: Agent Starting Run: ev2azmdo with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  sgd
Learning rate (initial):  0.001
Batch size:  16
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  128  ; activation function:  ReLU
Weight matrix dimention (128, 784) Bias vector dimention (128, 1)
----------------
Layer:  2  ; number of neurons:  128  ; activation function:  ReLU
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  3  ; number of neurons:  128  ; activation function:  ReLU
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  4  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 128) Bias vector dimention (10, 1)
----------------
2625.0

 Start of training
Training Loss:  2.3182408306721762 Training Accuracy:  10.761904761904763 Test Loss:  2.315485141612039 Testing Accuracy:  11.4
Epoch:  1


100%|██████████| 2625/2625 [00:08<00:00, 297.37it/s]


Training Loss:  0.9962037800862239 Training Accuracy:  68.69285714285714 Test Loss:  0.9985301307704236 Testing Accuracy:  68.5111111111111
Epoch:  2


100%|██████████| 2625/2625 [00:07<00:00, 364.20it/s]


Training Loss:  0.7959180393926167 Training Accuracy:  73.61904761904762 Test Loss:  0.7996414583227501 Testing Accuracy:  73.23333333333333
Epoch:  3


100%|██████████| 2625/2625 [00:06<00:00, 378.46it/s]


Training Loss:  0.7345881536753478 Training Accuracy:  75.76190476190476 Test Loss:  0.73888551864484 Testing Accuracy:  75.64444444444445
Epoch:  4


100%|██████████| 2625/2625 [00:09<00:00, 285.62it/s]


Training Loss:  0.7016962019303392 Training Accuracy:  77.24047619047619 Test Loss:  0.706345835553064 Testing Accuracy:  77.07222222222222
Epoch:  5


100%|██████████| 2625/2625 [00:06<00:00, 429.65it/s]


Training Loss:  0.6799261533860524 Training Accuracy:  78.20714285714286 Test Loss:  0.6847843603819453 Testing Accuracy:  78.18333333333334


0,1
tr_accuracy,▁▇████
tr_loss,█▂▁▁▁▁
val_accuracy,▁▇▇███
val_loss,█▂▁▁▁▁

0,1
tr_accuracy,78.20714
tr_loss,0.67993
val_accuracy,78.18333
val_loss,0.68478


[34m[1mwandb[0m: Agent Starting Run: 8nq4sgno with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: random


Hyper parameters: 

Weight initialization type :  random
Optimizer :  nadam
Learning rate (initial):  0.0001
Batch size:  16
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  128  ; activation function:  ReLU
Weight matrix dimention (128, 784) Bias vector dimention (128, 1)
----------------
Layer:  2  ; number of neurons:  128  ; activation function:  ReLU
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  3  ; number of neurons:  128  ; activation function:  ReLU
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  4  ; number of neurons:  128  ; activation function:  ReLU
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  5  ; number of neurons:  128  ; activation function:  ReLU
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  6  ; number of neurons:  10  ; activation function:  softmax
Weight ma

100%|██████████| 2625/2625 [00:20<00:00, 125.97it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  2


100%|██████████| 2625/2625 [00:22<00:00, 117.91it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  3


100%|██████████| 2625/2625 [00:23<00:00, 113.30it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  4


100%|██████████| 2625/2625 [00:20<00:00, 128.39it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  5


100%|██████████| 2625/2625 [00:22<00:00, 114.20it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778


0,1
tr_accuracy,█▁▁▁▁▁
val_accuracy,█▁▁▁▁▁

0,1
tr_accuracy,10.07381
tr_loss,
val_accuracy,9.82778
val_loss,


[34m[1mwandb[0m: Agent Starting Run: 0t5dvruh with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: random


Hyper parameters: 

Weight initialization type :  random
Optimizer :  nadam
Learning rate (initial):  0.0001
Batch size:  64
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 784) Bias vector dimention (64, 1)
----------------
Layer:  2  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  3  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  4  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  5  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 64) Bias vector dimention (10, 1)
----------------
656.0

 Start of training
Training Loss:  12.694240591934705 Training Accuracy:  8.97142857142

100%|██████████| 656/656 [00:06<00:00, 99.09it/s]


Training Loss:  8.800606652914363 Training Accuracy:  15.14047619047619 Test Loss:  9.050376602907193 Testing Accuracy:  14.25
Epoch:  2


100%|██████████| 656/656 [00:04<00:00, 146.54it/s]


Training Loss:  6.5917144011310675 Training Accuracy:  21.63095238095238 Test Loss:  6.918869779029102 Testing Accuracy:  20.555555555555557
Epoch:  3


100%|██████████| 656/656 [00:06<00:00, 106.24it/s]


Training Loss:  5.265102962309955 Training Accuracy:  27.902380952380952 Test Loss:  5.547209976822084 Testing Accuracy:  27.022222222222222
Epoch:  4


100%|██████████| 656/656 [00:04<00:00, 153.15it/s]


Training Loss:  4.346028550867609 Training Accuracy:  33.00476190476191 Test Loss:  4.6508385854189775 Testing Accuracy:  31.555555555555557
Epoch:  5


100%|██████████| 656/656 [00:05<00:00, 122.45it/s]


Training Loss:  3.712888342525745 Training Accuracy:  37.385714285714286 Test Loss:  4.024565486632513 Testing Accuracy:  35.52777777777778
Epoch:  6


100%|██████████| 656/656 [00:04<00:00, 152.65it/s]


Training Loss:  3.2503249427387235 Training Accuracy:  40.621428571428574 Test Loss:  3.547498939909322 Testing Accuracy:  38.205555555555556
Epoch:  7


100%|██████████| 656/656 [00:04<00:00, 147.91it/s]


Training Loss:  2.903944224398557 Training Accuracy:  43.14523809523809 Test Loss:  3.2013895222472235 Testing Accuracy:  40.28888888888889
Epoch:  8


100%|██████████| 656/656 [00:05<00:00, 124.27it/s]


Training Loss:  2.6333339777987317 Training Accuracy:  45.11666666666667 Test Loss:  2.8866742703859627 Testing Accuracy:  42.638888888888886
Epoch:  9


100%|██████████| 656/656 [00:04<00:00, 154.01it/s]


Training Loss:  2.399478849328182 Training Accuracy:  46.52142857142857 Test Loss:  2.636475036330265 Testing Accuracy:  44.05
Epoch:  10


100%|██████████| 656/656 [00:06<00:00, 107.05it/s]


Training Loss:  2.2015593124074955 Training Accuracy:  48.28809523809524 Test Loss:  2.4259495056610865 Testing Accuracy:  45.91111111111111


0,1
tr_accuracy,▁▂▃▄▅▆▇▇▇██
tr_loss,█▅▄▃▂▂▂▁▁▁▁
val_accuracy,▁▂▃▄▅▆▇▇▇██
val_loss,█▅▄▃▃▂▂▂▁▁▁

0,1
tr_accuracy,48.2881
tr_loss,2.20156
val_accuracy,45.91111
val_loss,2.42595


[34m[1mwandb[0m: Agent Starting Run: 8nsz76yl with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: nag
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: random


Hyper parameters: 

Weight initialization type :  random
Optimizer :  nag
Learning rate (initial):  0.0001
Batch size:  16
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  128  ; activation function:  tanh
Weight matrix dimention (128, 784) Bias vector dimention (128, 1)
----------------
Layer:  2  ; number of neurons:  128  ; activation function:  tanh
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  3  ; number of neurons:  128  ; activation function:  tanh
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  4  ; number of neurons:  128  ; activation function:  tanh
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  5  ; number of neurons:  128  ; activation function:  tanh
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  6  ; number of neurons:  10  ; activation function:  softmax
Weight matr

100%|██████████| 2625/2625 [00:18<00:00, 141.04it/s]


Training Loss:  28.309896293343684 Training Accuracy:  9.973809523809523 Test Loss:  28.366945544072724 Testing Accuracy:  10.061111111111112
Epoch:  2


100%|██████████| 2625/2625 [00:18<00:00, 140.34it/s]


Training Loss:  25.947094926214916 Training Accuracy:  10.040476190476191 Test Loss:  25.90944862784706 Testing Accuracy:  9.905555555555555
Epoch:  3


100%|██████████| 2625/2625 [00:17<00:00, 146.68it/s]


Training Loss:  23.445064170217456 Training Accuracy:  9.964285714285714 Test Loss:  23.32580726794751 Testing Accuracy:  10.083333333333334
Epoch:  4


100%|██████████| 2625/2625 [00:19<00:00, 136.15it/s]


Training Loss:  25.739744279183615 Training Accuracy:  9.94047619047619 Test Loss:  25.76542015244354 Testing Accuracy:  10.13888888888889
Epoch:  5


100%|██████████| 2625/2625 [00:17<00:00, 146.88it/s]


Training Loss:  26.496147186538405 Training Accuracy:  9.973809523809523 Test Loss:  26.459568148011844 Testing Accuracy:  10.061111111111112
Epoch:  6


100%|██████████| 2625/2625 [00:20<00:00, 130.99it/s]


Training Loss:  22.217971323334968 Training Accuracy:  9.969047619047618 Test Loss:  22.299867243003227 Testing Accuracy:  10.072222222222223
Epoch:  7


100%|██████████| 2625/2625 [00:18<00:00, 143.87it/s]


Training Loss:  22.801231305240496 Training Accuracy:  9.973809523809523 Test Loss:  22.710743577914158 Testing Accuracy:  10.061111111111112
Epoch:  8


100%|██████████| 2625/2625 [00:21<00:00, 122.79it/s]


Training Loss:  20.17181391362519 Training Accuracy:  9.969047619047618 Test Loss:  20.142056493269546 Testing Accuracy:  10.072222222222223
Epoch:  9


100%|██████████| 2625/2625 [00:18<00:00, 141.47it/s]


Training Loss:  23.985771827967437 Training Accuracy:  9.916666666666666 Test Loss:  23.887536973767787 Testing Accuracy:  10.194444444444445
Epoch:  10


100%|██████████| 2625/2625 [00:18<00:00, 140.16it/s]


Training Loss:  23.445975049955994 Training Accuracy:  9.997619047619047 Test Loss:  23.50756346931823 Testing Accuracy:  10.005555555555556


0,1
tr_accuracy,█▂▂▂▁▂▂▂▂▁▂
tr_loss,▁█▇▅▇▇▄▅▃▅▅
val_accuracy,█▂▁▂▃▂▂▂▂▃▂
val_loss,▁█▇▅▇▇▅▅▃▅▅

0,1
tr_accuracy,9.99762
tr_loss,23.44598
val_accuracy,10.00556
val_loss,23.50756


[34m[1mwandb[0m: Agent Starting Run: 1n5mt9tv with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: momentum
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  momentum
Learning rate (initial):  0.001
Batch size:  64
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  128  ; activation function:  sigmoid
Weight matrix dimention (128, 784) Bias vector dimention (128, 1)
----------------
Layer:  2  ; number of neurons:  128  ; activation function:  sigmoid
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  3  ; number of neurons:  128  ; activation function:  sigmoid
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  4  ; number of neurons:  128  ; activation function:  sigmoid
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  5  ; number of neurons:  128  ; activation function:  sigmoid
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  6  ; number of neurons:  10  ; activation function:  

100%|██████████| 656/656 [00:05<00:00, 126.29it/s]


Training Loss:  2.302607272744578 Training Accuracy:  12.066666666666666 Test Loss:  2.302603359912173 Testing Accuracy:  11.988888888888889
Epoch:  2


100%|██████████| 656/656 [00:07<00:00, 86.23it/s] 


Training Loss:  2.3025995370835712 Training Accuracy:  10.040476190476191 Test Loss:  2.302583865445479 Testing Accuracy:  9.905555555555555
Epoch:  3


100%|██████████| 656/656 [00:05<00:00, 127.67it/s]


Training Loss:  2.3025932140071115 Training Accuracy:  10.040476190476191 Test Loss:  2.302577500063366 Testing Accuracy:  9.905555555555555
Epoch:  4


100%|██████████| 656/656 [00:06<00:00, 105.45it/s]


Training Loss:  2.302586869000891 Training Accuracy:  10.040476190476191 Test Loss:  2.302571170252949 Testing Accuracy:  9.905555555555555
Epoch:  5


100%|██████████| 656/656 [00:06<00:00, 98.66it/s]


Training Loss:  2.3025805224474816 Training Accuracy:  10.040476190476191 Test Loss:  2.302564839236862 Testing Accuracy:  9.905555555555555
Epoch:  6


100%|██████████| 656/656 [00:05<00:00, 125.83it/s]


Training Loss:  2.302574173894093 Training Accuracy:  10.040476190476191 Test Loss:  2.3025585062830203 Testing Accuracy:  9.905555555555555
Epoch:  7


100%|██████████| 656/656 [00:07<00:00, 86.04it/s] 


Training Loss:  2.302567822812861 Training Accuracy:  10.040476190476191 Test Loss:  2.302552170863542 Testing Accuracy:  9.905555555555555
Epoch:  8


100%|██████████| 656/656 [00:05<00:00, 129.72it/s]


Training Loss:  2.302561468675439 Training Accuracy:  10.040476190476191 Test Loss:  2.302545832451444 Testing Accuracy:  9.905555555555555
Epoch:  9


100%|██████████| 656/656 [00:07<00:00, 87.02it/s] 


Training Loss:  2.3025551109530396 Training Accuracy:  10.040476190476191 Test Loss:  2.302539490519314 Testing Accuracy:  9.905555555555555
Epoch:  10


100%|██████████| 656/656 [00:05<00:00, 127.38it/s]


Training Loss:  2.3025487491163097 Training Accuracy:  10.040476190476191 Test Loss:  2.302533144539185 Testing Accuracy:  9.905555555555555


0,1
tr_accuracy,▁█▁▁▁▁▁▁▁▁▁
tr_loss,█▁▁▁▁▁▁▁▁▁▁
val_accuracy,▁█▁▁▁▁▁▁▁▁▁
val_loss,█▁▁▁▁▁▁▁▁▁▁

0,1
tr_accuracy,10.04048
tr_loss,2.30255
val_accuracy,9.90556
val_loss,2.30253


[34m[1mwandb[0m: Agent Starting Run: 30iqkqvj with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: random


Hyper parameters: 

Weight initialization type :  random
Optimizer :  adam
Learning rate (initial):  0.0001
Batch size:  16
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  128  ; activation function:  tanh
Weight matrix dimention (128, 784) Bias vector dimention (128, 1)
----------------
Layer:  2  ; number of neurons:  128  ; activation function:  tanh
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  3  ; number of neurons:  128  ; activation function:  tanh
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  4  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 128) Bias vector dimention (10, 1)
----------------
2625.0

 Start of training
Training Loss:  15.975041473773 Training Accuracy:  10.345238095238095 Test Loss:  15.914957928875804 Testing Accuracy:  10.416666666666666
Epoch:  1


100%|██████████| 2625/2625 [00:17<00:00, 150.13it/s]


Training Loss:  5.692579751811437 Training Accuracy:  37.82380952380952 Test Loss:  5.814180164552639 Testing Accuracy:  37.07222222222222
Epoch:  2


100%|██████████| 2625/2625 [00:18<00:00, 139.12it/s]


Training Loss:  3.5039963807235224 Training Accuracy:  50.80952380952381 Test Loss:  3.632638978952625 Testing Accuracy:  50.111111111111114
Epoch:  3


100%|██████████| 2625/2625 [00:18<00:00, 143.44it/s]


Training Loss:  2.4051827079376484 Training Accuracy:  57.99761904761905 Test Loss:  2.518812689332088 Testing Accuracy:  56.96111111111111
Epoch:  4


100%|██████████| 2625/2625 [00:20<00:00, 129.60it/s]


Training Loss:  1.7068249270088702 Training Accuracy:  62.38095238095238 Test Loss:  1.8329232097401873 Testing Accuracy:  61.666666666666664
Epoch:  5


100%|██████████| 2625/2625 [00:18<00:00, 140.92it/s]


Training Loss:  1.2401803674701826 Training Accuracy:  66.76428571428572 Test Loss:  1.3415427762618295 Testing Accuracy:  65.35555555555555
Epoch:  6


100%|██████████| 2625/2625 [00:21<00:00, 124.40it/s]


Training Loss:  0.923995437651024 Training Accuracy:  71.39761904761905 Test Loss:  0.9859011105836196 Testing Accuracy:  69.66666666666667
Epoch:  7


100%|██████████| 2625/2625 [00:18<00:00, 143.64it/s]


Training Loss:  0.7209073432682109 Training Accuracy:  75.45714285714286 Test Loss:  0.7569745213958118 Testing Accuracy:  74.36111111111111
Epoch:  8


100%|██████████| 2625/2625 [00:21<00:00, 123.12it/s]


Training Loss:  0.576634554493332 Training Accuracy:  79.96666666666667 Test Loss:  0.603380822893965 Testing Accuracy:  78.99444444444444
Epoch:  9


100%|██████████| 2625/2625 [00:19<00:00, 136.45it/s]


Training Loss:  0.5052061851518129 Training Accuracy:  82.70238095238095 Test Loss:  0.5248599850392535 Testing Accuracy:  81.93333333333334
Epoch:  10


100%|██████████| 2625/2625 [00:20<00:00, 125.65it/s]


Training Loss:  0.5078557257034252 Training Accuracy:  83.44047619047619 Test Loss:  0.5165222116148321 Testing Accuracy:  83.42222222222222


0,1
tr_accuracy,▁▄▅▆▆▆▇▇███
tr_loss,█▃▂▂▂▁▁▁▁▁▁
val_accuracy,▁▄▅▅▆▆▇▇███
val_loss,█▃▂▂▂▁▁▁▁▁▁

0,1
tr_accuracy,83.44048
tr_loss,0.50786
val_accuracy,83.42222
val_loss,0.51652


[34m[1mwandb[0m: Agent Starting Run: p8cmjf80 with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  nadam
Learning rate (initial):  0.001
Batch size:  32
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  128  ; activation function:  sigmoid
Weight matrix dimention (128, 784) Bias vector dimention (128, 1)
----------------
Layer:  2  ; number of neurons:  128  ; activation function:  sigmoid
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  3  ; number of neurons:  128  ; activation function:  sigmoid
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  4  ; number of neurons:  128  ; activation function:  sigmoid
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  5  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 128) Bias vector dimention (10, 1)
----------------
1312.0

 Start of training
Training Loss:  2.533978313377783 Traini

100%|██████████| 1312/1312 [00:12<00:00, 104.53it/s]


Training Loss:  0.5651351593229039 Training Accuracy:  79.53571428571429 Test Loss:  0.5658333628662008 Testing Accuracy:  79.46666666666667
Epoch:  2


100%|██████████| 1312/1312 [00:12<00:00, 103.42it/s]


Training Loss:  0.4678702172741944 Training Accuracy:  84.28095238095239 Test Loss:  0.47501321425443954 Testing Accuracy:  83.95555555555555
Epoch:  3


100%|██████████| 1312/1312 [00:12<00:00, 101.49it/s]


Training Loss:  0.43293449043607635 Training Accuracy:  85.45476190476191 Test Loss:  0.44609857556678556 Testing Accuracy:  85.06111111111112
Epoch:  4


100%|██████████| 1312/1312 [00:12<00:00, 103.24it/s]


Training Loss:  0.4157502835758853 Training Accuracy:  86.08333333333333 Test Loss:  0.435603608158967 Testing Accuracy:  85.53888888888889
Epoch:  5


100%|██████████| 1312/1312 [00:13<00:00, 97.95it/s] 


Training Loss:  0.40713509764329164 Training Accuracy:  86.49047619047619 Test Loss:  0.4338116613669807 Testing Accuracy:  85.83888888888889
Epoch:  6


100%|██████████| 1312/1312 [00:13<00:00, 100.80it/s]


Training Loss:  0.40176034401442545 Training Accuracy:  86.83333333333333 Test Loss:  0.43505835968395956 Testing Accuracy:  86.02777777777777
Epoch:  7


100%|██████████| 1312/1312 [00:12<00:00, 104.46it/s]


Training Loss:  0.3871875219648528 Training Accuracy:  87.42380952380952 Test Loss:  0.42666869083657216 Testing Accuracy:  86.54444444444445
Epoch:  8


100%|██████████| 1312/1312 [00:12<00:00, 105.01it/s]


Training Loss:  0.3773523251785653 Training Accuracy:  87.86666666666666 Test Loss:  0.4237213639634707 Testing Accuracy:  86.62222222222222
Epoch:  9


100%|██████████| 1312/1312 [00:12<00:00, 104.42it/s]


Training Loss:  0.3751173241202813 Training Accuracy:  88.07857142857142 Test Loss:  0.4258715036419917 Testing Accuracy:  86.8
Epoch:  10


100%|██████████| 1312/1312 [00:12<00:00, 103.68it/s]


Training Loss:  0.3643019536384709 Training Accuracy:  88.43333333333334 Test Loss:  0.4197458890377567 Testing Accuracy:  87.03333333333333


0,1
tr_accuracy,▁▇█████████
tr_loss,█▂▁▁▁▁▁▁▁▁▁
val_accuracy,▁▇█████████
val_loss,█▁▁▁▁▁▁▁▁▁▁

0,1
tr_accuracy,88.43333
tr_loss,0.3643
val_accuracy,87.03333
val_loss,0.41975


[34m[1mwandb[0m: Agent Starting Run: q13br6x7 with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: random


Hyper parameters: 

Weight initialization type :  random
Optimizer :  sgd
Learning rate (initial):  0.0001
Batch size:  32
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  32  ; activation function:  sigmoid
Weight matrix dimention (32, 784) Bias vector dimention (32, 1)
----------------
Layer:  2  ; number of neurons:  32  ; activation function:  sigmoid
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  3  ; number of neurons:  32  ; activation function:  sigmoid
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  4  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 32) Bias vector dimention (10, 1)
----------------
1312.0

 Start of training
Training Loss:  5.804278559819152 Training Accuracy:  10.183333333333334 Test Loss:  5.822921504982788 Testing Accuracy:  10.333333333333334
Epoch:  1


100%|██████████| 1312/1312 [00:02<00:00, 481.50it/s]


Training Loss:  4.356707694040138 Training Accuracy:  12.952380952380953 Test Loss:  4.3772141741343304 Testing Accuracy:  13.083333333333334
Epoch:  2


100%|██████████| 1312/1312 [00:01<00:00, 883.92it/s]


Training Loss:  3.7094060992057805 Training Accuracy:  15.58095238095238 Test Loss:  3.728853602023289 Testing Accuracy:  15.783333333333333
Epoch:  3


100%|██████████| 1312/1312 [00:01<00:00, 935.09it/s]


Training Loss:  3.4121584812695454 Training Accuracy:  15.783333333333333 Test Loss:  3.4300622662802103 Testing Accuracy:  15.883333333333333
Epoch:  4


100%|██████████| 1312/1312 [00:01<00:00, 941.26it/s]


Training Loss:  3.221809746263887 Training Accuracy:  15.621428571428572 Test Loss:  3.238272936395502 Testing Accuracy:  15.583333333333334
Epoch:  5


100%|██████████| 1312/1312 [00:01<00:00, 792.37it/s]


Training Loss:  3.0767179579072566 Training Accuracy:  15.388095238095238 Test Loss:  3.091850289247276 Testing Accuracy:  15.238888888888889
Epoch:  6


100%|██████████| 1312/1312 [00:02<00:00, 624.64it/s]


Training Loss:  2.9597962637522675 Training Accuracy:  15.202380952380953 Test Loss:  2.973691318339994 Testing Accuracy:  14.977777777777778
Epoch:  7


100%|██████████| 1312/1312 [00:02<00:00, 458.76it/s]


Training Loss:  2.8626507824948333 Training Accuracy:  15.057142857142857 Test Loss:  2.875396468967593 Testing Accuracy:  14.894444444444444
Epoch:  8


100%|██████████| 1312/1312 [00:01<00:00, 936.24it/s]


Training Loss:  2.7796458147605025 Training Accuracy:  15.123809523809523 Test Loss:  2.7913309178585233 Testing Accuracy:  14.738888888888889
Epoch:  9


100%|██████████| 1312/1312 [00:01<00:00, 953.76it/s] 


Training Loss:  2.707061975806702 Training Accuracy:  15.107142857142858 Test Loss:  2.7177792642562175 Testing Accuracy:  14.694444444444445
Epoch:  10


100%|██████████| 1312/1312 [00:01<00:00, 947.70it/s]


Training Loss:  2.6425977394194073 Training Accuracy:  15.14047619047619 Test Loss:  2.652445130446869 Testing Accuracy:  14.755555555555556


0,1
tr_accuracy,▁▄████▇▇▇▇▇
tr_loss,█▅▃▃▂▂▂▁▁▁▁
val_accuracy,▁▄███▇▇▇▇▇▇
val_loss,█▅▃▃▂▂▂▁▁▁▁

0,1
tr_accuracy,15.14048
tr_loss,2.6426
val_accuracy,14.75556
val_loss,2.65245


[34m[1mwandb[0m: Agent Starting Run: m08aby8l with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: random


Hyper parameters: 

Weight initialization type :  random
Optimizer :  sgd
Learning rate (initial):  0.0001
Batch size:  16
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  32  ; activation function:  ReLU
Weight matrix dimention (32, 784) Bias vector dimention (32, 1)
----------------
Layer:  2  ; number of neurons:  32  ; activation function:  ReLU
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  3  ; number of neurons:  32  ; activation function:  ReLU
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  4  ; number of neurons:  32  ; activation function:  ReLU
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  5  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 32) Bias vector dimention (10, 1)
----------------
2625.0

 Start of training
Training Loss:  nan Training Accuracy:  10.228571428571428 Test Loss:

100%|██████████| 2625/2625 [00:02<00:00, 1133.70it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  2


100%|██████████| 2625/2625 [00:02<00:00, 1144.37it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  3


100%|██████████| 2625/2625 [00:02<00:00, 1146.55it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  4


100%|██████████| 2625/2625 [00:03<00:00, 716.77it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  5


100%|██████████| 2625/2625 [00:04<00:00, 642.50it/s] 


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778


0,1
tr_accuracy,█▁▁▁▁▁
val_accuracy,█▁▁▁▁▁

0,1
tr_accuracy,10.07381
tr_loss,
val_accuracy,9.82778
val_loss,


[34m[1mwandb[0m: Agent Starting Run: l6q2zvrj with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  adam
Learning rate (initial):  0.001
Batch size:  64
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  32  ; activation function:  ReLU
Weight matrix dimention (32, 784) Bias vector dimention (32, 1)
----------------
Layer:  2  ; number of neurons:  32  ; activation function:  ReLU
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  3  ; number of neurons:  32  ; activation function:  ReLU
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  4  ; number of neurons:  32  ; activation function:  ReLU
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  5  ; number of neurons:  32  ; activation function:  ReLU
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  6  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 3

100%|██████████| 656/656 [00:03<00:00, 203.59it/s]


Training Loss:  0.4706620235416984 Training Accuracy:  83.42380952380952 Test Loss:  0.47770083374881084 Testing Accuracy:  83.42777777777778
Epoch:  2


100%|██████████| 656/656 [00:02<00:00, 257.86it/s]


Training Loss:  0.4141176123708728 Training Accuracy:  85.17619047619047 Test Loss:  0.4333530904325024 Testing Accuracy:  84.78888888888889
Epoch:  3


100%|██████████| 656/656 [00:01<00:00, 389.79it/s]


Training Loss:  0.38744174619479294 Training Accuracy:  86.09285714285714 Test Loss:  0.4138417775892463 Testing Accuracy:  85.49444444444444
Epoch:  4


100%|██████████| 656/656 [00:01<00:00, 382.07it/s]


Training Loss:  0.35602811818973634 Training Accuracy:  87.20238095238095 Test Loss:  0.39096736968563656 Testing Accuracy:  86.3
Epoch:  5


100%|██████████| 656/656 [00:01<00:00, 383.13it/s]


Training Loss:  0.3435963682418061 Training Accuracy:  87.55 Test Loss:  0.38269765756696156 Testing Accuracy:  86.77222222222223


0,1
tr_accuracy,▁█████
tr_loss,█▁▁▁▁▁
val_accuracy,▁█████
val_loss,█▁▁▁▁▁

0,1
tr_accuracy,87.55
tr_loss,0.3436
val_accuracy,86.77222
val_loss,0.3827


[34m[1mwandb[0m: Agent Starting Run: u7u49aph with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: rmsprop
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  rmsprop
Learning rate (initial):  0.001
Batch size:  32
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  32  ; activation function:  ReLU
Weight matrix dimention (32, 784) Bias vector dimention (32, 1)
----------------
Layer:  2  ; number of neurons:  32  ; activation function:  ReLU
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  3  ; number of neurons:  32  ; activation function:  ReLU
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  4  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 32) Bias vector dimention (10, 1)
----------------
1312.0

 Start of training
Training Loss:  2.3058474929604045 Training Accuracy:  9.35 Test Loss:  2.30426591936039 Testing Accuracy:  9.222222222222221
Epoch:  1


100%|██████████| 1312/1312 [00:01<00:00, 782.83it/s]


Training Loss:  0.4676137975871308 Training Accuracy:  83.99761904761905 Test Loss:  0.4815362113170707 Testing Accuracy:  83.67222222222222
Epoch:  2


100%|██████████| 1312/1312 [00:01<00:00, 720.75it/s]


Training Loss:  0.4150635078945896 Training Accuracy:  85.56666666666666 Test Loss:  0.4371569053063601 Testing Accuracy:  85.25
Epoch:  3


100%|██████████| 1312/1312 [00:01<00:00, 783.37it/s]


Training Loss:  0.40865967853848095 Training Accuracy:  86.24761904761905 Test Loss:  0.44197329484416575 Testing Accuracy:  85.56111111111112
Epoch:  4


100%|██████████| 1312/1312 [00:02<00:00, 442.63it/s]


Training Loss:  0.40335214481346754 Training Accuracy:  86.9047619047619 Test Loss:  0.449319225292129 Testing Accuracy:  86.04444444444445
Epoch:  5


100%|██████████| 1312/1312 [00:03<00:00, 387.68it/s]


Training Loss:  0.42686626623555174 Training Accuracy:  86.57619047619048 Test Loss:  0.48176388798183184 Testing Accuracy:  85.61111111111111


0,1
tr_accuracy,▁█████
tr_loss,█▁▁▁▁▁
val_accuracy,▁█████
val_loss,█▁▁▁▁▁

0,1
tr_accuracy,86.57619
tr_loss,0.42687
val_accuracy,85.61111
val_loss,0.48176


[34m[1mwandb[0m: Agent Starting Run: 6qd315ij with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: nag
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  nag
Learning rate (initial):  0.001
Batch size:  32
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  32  ; activation function:  sigmoid
Weight matrix dimention (32, 784) Bias vector dimention (32, 1)
----------------
Layer:  2  ; number of neurons:  32  ; activation function:  sigmoid
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  3  ; number of neurons:  32  ; activation function:  sigmoid
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  4  ; number of neurons:  32  ; activation function:  sigmoid
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  5  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 32) Bias vector dimention (10, 1)
----------------
1312.0

 Start of training
Training Loss:  2.592902290837553 Training Accuracy:  10.0

100%|██████████| 1312/1312 [00:02<00:00, 499.12it/s]


Training Loss:  0.7614684441573959 Training Accuracy:  70.54047619047618 Test Loss:  0.7650784933649901 Testing Accuracy:  70.03333333333333
Epoch:  2


100%|██████████| 1312/1312 [00:04<00:00, 310.58it/s]


Training Loss:  0.5689006337841354 Training Accuracy:  79.73809523809524 Test Loss:  0.578656131814344 Testing Accuracy:  79.59444444444445
Epoch:  3


100%|██████████| 1312/1312 [00:01<00:00, 657.86it/s]


Training Loss:  0.4930000480960016 Training Accuracy:  81.67619047619047 Test Loss:  0.5091802665324285 Testing Accuracy:  81.38333333333334
Epoch:  4


100%|██████████| 1312/1312 [00:01<00:00, 692.01it/s]


Training Loss:  0.4589410399695208 Training Accuracy:  83.13333333333334 Test Loss:  0.4825515739802781 Testing Accuracy:  82.69444444444444
Epoch:  5


100%|██████████| 1312/1312 [00:02<00:00, 654.31it/s]


Training Loss:  0.42340295106372927 Training Accuracy:  84.94761904761904 Test Loss:  0.45258886477499344 Testing Accuracy:  84.21666666666667


0,1
tr_accuracy,▁▇████
tr_loss,█▂▁▁▁▁
val_accuracy,▁▇████
val_loss,█▂▁▁▁▁

0,1
tr_accuracy,84.94762
tr_loss,0.4234
val_accuracy,84.21667
val_loss,0.45259


[34m[1mwandb[0m: Agent Starting Run: xmis0fex with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: rmsprop
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: random


Hyper parameters: 

Weight initialization type :  random
Optimizer :  rmsprop
Learning rate (initial):  0.0001
Batch size:  32
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  32  ; activation function:  tanh
Weight matrix dimention (32, 784) Bias vector dimention (32, 1)
----------------
Layer:  2  ; number of neurons:  32  ; activation function:  tanh
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  3  ; number of neurons:  32  ; activation function:  tanh
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  4  ; number of neurons:  32  ; activation function:  tanh
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  5  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 32) Bias vector dimention (10, 1)
----------------
1312.0

 Start of training
Training Loss:  8.662554322942153 Training Accuracy:  7.445238095

100%|██████████| 1312/1312 [00:02<00:00, 478.61it/s]


Training Loss:  5.462490377863767 Training Accuracy:  13.852380952380953 Test Loss:  5.481447778307901 Testing Accuracy:  13.588888888888889
Epoch:  2


100%|██████████| 1312/1312 [00:02<00:00, 473.34it/s]


Training Loss:  3.9850303067429755 Training Accuracy:  21.692857142857143 Test Loss:  4.015243263034286 Testing Accuracy:  21.655555555555555
Epoch:  3


100%|██████████| 1312/1312 [00:03<00:00, 343.29it/s]


Training Loss:  3.061612513063537 Training Accuracy:  28.511904761904763 Test Loss:  3.0902833090334036 Testing Accuracy:  28.394444444444446
Epoch:  4


100%|██████████| 1312/1312 [00:03<00:00, 331.15it/s]


Training Loss:  2.5340057830172897 Training Accuracy:  34.81666666666667 Test Loss:  2.5681242082652767 Testing Accuracy:  34.62777777777778
Epoch:  5


100%|██████████| 1312/1312 [00:02<00:00, 486.80it/s]


Training Loss:  2.1550803870815414 Training Accuracy:  40.35 Test Loss:  2.194219402759283 Testing Accuracy:  39.93333333333333


0,1
tr_accuracy,▁▂▄▅▇█
tr_loss,█▅▃▂▁▁
val_accuracy,▁▂▄▆▇█
val_loss,█▅▃▂▁▁

0,1
tr_accuracy,40.35
tr_loss,2.15508
val_accuracy,39.93333
val_loss,2.19422


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: h9qyatnv with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: random


Hyper parameters: 

Weight initialization type :  random
Optimizer :  sgd
Learning rate (initial):  0.001
Batch size:  16
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  128  ; activation function:  sigmoid
Weight matrix dimention (128, 784) Bias vector dimention (128, 1)
----------------
Layer:  2  ; number of neurons:  128  ; activation function:  sigmoid
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  3  ; number of neurons:  128  ; activation function:  sigmoid
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  4  ; number of neurons:  128  ; activation function:  sigmoid
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  5  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 128) Bias vector dimention (10, 1)
----------------
2625.0

 Start of training
Training Loss:  15.164056430738702 Trainin

100%|██████████| 2625/2625 [00:11<00:00, 223.15it/s]


Training Loss:  1.7307140101435576 Training Accuracy:  39.81190476190476 Test Loss:  1.7305010624634523 Testing Accuracy:  39.34444444444444
Epoch:  2


100%|██████████| 2625/2625 [00:11<00:00, 230.29it/s]


Training Loss:  1.3611780308605597 Training Accuracy:  51.33095238095238 Test Loss:  1.3630873258599516 Testing Accuracy:  50.85
Epoch:  3


100%|██████████| 2625/2625 [00:11<00:00, 234.85it/s]


Training Loss:  1.1967651807369928 Training Accuracy:  58.121428571428574 Test Loss:  1.2002288182921796 Testing Accuracy:  57.46111111111111
Epoch:  4


100%|██████████| 2625/2625 [00:11<00:00, 232.96it/s]


Training Loss:  1.1072148965423194 Training Accuracy:  62.34285714285714 Test Loss:  1.1107371847729859 Testing Accuracy:  61.84444444444444
Epoch:  5


100%|██████████| 2625/2625 [00:09<00:00, 270.31it/s]


Training Loss:  1.0566566663002177 Training Accuracy:  65.13809523809523 Test Loss:  1.0597646320065406 Testing Accuracy:  65.05


0,1
tr_accuracy,▁▅▆▇██
tr_loss,█▁▁▁▁▁
val_accuracy,▁▅▆▇██
val_loss,█▁▁▁▁▁

0,1
tr_accuracy,65.1381
tr_loss,1.05666
val_accuracy,65.05
val_loss,1.05976


[34m[1mwandb[0m: Agent Starting Run: c7erdopp with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: nag
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: random


Hyper parameters: 

Weight initialization type :  random
Optimizer :  nag
Learning rate (initial):  0.0001
Batch size:  32
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  128  ; activation function:  tanh
Weight matrix dimention (128, 784) Bias vector dimention (128, 1)
----------------
Layer:  2  ; number of neurons:  128  ; activation function:  tanh
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  3  ; number of neurons:  128  ; activation function:  tanh
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  4  ; number of neurons:  128  ; activation function:  tanh
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  5  ; number of neurons:  128  ; activation function:  tanh
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  6  ; number of neurons:  10  ; activation function:  softmax
Weight matr

100%|██████████| 1312/1312 [00:14<00:00, 92.89it/s] 


Training Loss:  25.127334322742925 Training Accuracy:  10.026190476190477 Test Loss:  25.101844424120326 Testing Accuracy:  9.938888888888888
Epoch:  2


100%|██████████| 1312/1312 [00:13<00:00, 99.21it/s] 


Training Loss:  27.914551172007346 Training Accuracy:  9.973809523809523 Test Loss:  27.847360562791746 Testing Accuracy:  10.061111111111112
Epoch:  3


100%|██████████| 1312/1312 [00:13<00:00, 98.14it/s]


Training Loss:  23.561753410399433 Training Accuracy:  10.073809523809524 Test Loss:  23.599171046262995 Testing Accuracy:  9.827777777777778
Epoch:  4


100%|██████████| 1312/1312 [00:13<00:00, 94.32it/s] 


Training Loss:  26.72551440700035 Training Accuracy:  9.916666666666666 Test Loss:  26.72660379254029 Testing Accuracy:  10.194444444444445
Epoch:  5


100%|██████████| 1312/1312 [00:17<00:00, 76.39it/s]


Training Loss:  22.668490614545217 Training Accuracy:  9.964285714285714 Test Loss:  22.587698979780324 Testing Accuracy:  10.083333333333334


0,1
tr_accuracy,█▂▁▂▁▁
tr_loss,▁▆█▆▇▅
val_accuracy,█▁▂▁▃▂
val_loss,▁▆█▆▇▅

0,1
tr_accuracy,9.96429
tr_loss,22.66849
val_accuracy,10.08333
val_loss,22.5877


[34m[1mwandb[0m: Agent Starting Run: j4u50dkv with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: rmsprop
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  rmsprop
Learning rate (initial):  0.0001
Batch size:  32
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  128  ; activation function:  ReLU
Weight matrix dimention (128, 784) Bias vector dimention (128, 1)
----------------
Layer:  2  ; number of neurons:  128  ; activation function:  ReLU
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  3  ; number of neurons:  128  ; activation function:  ReLU
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  4  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 128) Bias vector dimention (10, 1)
----------------
1312.0

 Start of training
Training Loss:  2.3513566579390774 Training Accuracy:  3.533333333333333 Test Loss:  2.350486189492554 Testing Accuracy:  3.522222222222222
Epoch:  1


100%|██████████| 1312/1312 [00:05<00:00, 229.86it/s]


Training Loss:  0.5061622122249056 Training Accuracy:  82.89285714285714 Test Loss:  0.5106962873809092 Testing Accuracy:  82.7388888888889
Epoch:  2


100%|██████████| 1312/1312 [00:08<00:00, 162.87it/s]


Training Loss:  0.4383178036384064 Training Accuracy:  84.83095238095238 Test Loss:  0.443567679375854 Testing Accuracy:  84.91666666666667
Epoch:  3


100%|██████████| 1312/1312 [00:05<00:00, 218.79it/s]


Training Loss:  0.4067673632546786 Training Accuracy:  85.8047619047619 Test Loss:  0.41462179276650396 Testing Accuracy:  85.72777777777777
Epoch:  4


100%|██████████| 1312/1312 [00:07<00:00, 168.46it/s]


Training Loss:  0.38475304162689045 Training Accuracy:  86.53095238095239 Test Loss:  0.3956138508006673 Testing Accuracy:  86.21666666666667
Epoch:  5


100%|██████████| 1312/1312 [00:11<00:00, 111.27it/s]


Training Loss:  0.3695975601855677 Training Accuracy:  86.98095238095237 Test Loss:  0.38372770390301103 Testing Accuracy:  86.71111111111111


0,1
tr_accuracy,▁█████
tr_loss,█▁▁▁▁▁
val_accuracy,▁█████
val_loss,█▁▁▁▁▁

0,1
tr_accuracy,86.98095
tr_loss,0.3696
val_accuracy,86.71111
val_loss,0.38373


[34m[1mwandb[0m: Agent Starting Run: vrkqe6tn with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: momentum
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  momentum
Learning rate (initial):  0.0001
Batch size:  16
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  32  ; activation function:  ReLU
Weight matrix dimention (32, 784) Bias vector dimention (32, 1)
----------------
Layer:  2  ; number of neurons:  32  ; activation function:  ReLU
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  3  ; number of neurons:  32  ; activation function:  ReLU
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  4  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 32) Bias vector dimention (10, 1)
----------------
2625.0

 Start of training
Training Loss:  2.313477532809444 Training Accuracy:  8.516666666666667 Test Loss:  2.312887838296707 Testing Accuracy:  8.977777777777778
Epoch:  1


100%|██████████| 2625/2625 [00:02<00:00, 1152.67it/s]


Training Loss:  2.1099509243160224 Training Accuracy:  31.364285714285714 Test Loss:  2.108417522282884 Testing Accuracy:  31.3
Epoch:  2


100%|██████████| 2625/2625 [00:02<00:00, 1187.90it/s]


Training Loss:  1.8641071099179438 Training Accuracy:  38.364285714285714 Test Loss:  1.8612102034904354 Testing Accuracy:  38.37222222222222
Epoch:  3


100%|██████████| 2625/2625 [00:02<00:00, 1142.02it/s]


Training Loss:  1.5730545446333477 Training Accuracy:  48.916666666666664 Test Loss:  1.570477262843432 Testing Accuracy:  48.73888888888889
Epoch:  4


100%|██████████| 2625/2625 [00:04<00:00, 578.68it/s]


Training Loss:  1.3322477349812734 Training Accuracy:  59.42857142857143 Test Loss:  1.3311299033225938 Testing Accuracy:  59.611111111111114
Epoch:  5


100%|██████████| 2625/2625 [00:02<00:00, 1084.07it/s]


Training Loss:  1.1648991743619166 Training Accuracy:  63.02857142857143 Test Loss:  1.165018227712875 Testing Accuracy:  63.044444444444444
Epoch:  6


100%|██████████| 2625/2625 [00:02<00:00, 1091.71it/s]


Training Loss:  1.0488136728568584 Training Accuracy:  65.16666666666667 Test Loss:  1.049669097494028 Testing Accuracy:  65.09444444444445
Epoch:  7


100%|██████████| 2625/2625 [00:02<00:00, 1142.40it/s]


Training Loss:  0.9668270284029997 Training Accuracy:  66.88095238095238 Test Loss:  0.96800174483042 Testing Accuracy:  66.71666666666667
Epoch:  8


100%|██████████| 2625/2625 [00:03<00:00, 804.33it/s]


Training Loss:  0.9065902366321869 Training Accuracy:  68.34285714285714 Test Loss:  0.9079070779213213 Testing Accuracy:  68.2388888888889
Epoch:  9


100%|██████████| 2625/2625 [00:05<00:00, 524.52it/s] 


Training Loss:  0.8601774698299525 Training Accuracy:  69.84761904761905 Test Loss:  0.861597229571346 Testing Accuracy:  69.69444444444444
Epoch:  10


100%|██████████| 2625/2625 [00:02<00:00, 1169.94it/s]


Training Loss:  0.8229378769055801 Training Accuracy:  70.97380952380952 Test Loss:  0.8243979488996003 Testing Accuracy:  71.0111111111111


0,1
tr_accuracy,▁▄▄▆▇▇▇████
tr_loss,█▇▆▅▃▃▂▂▁▁▁
val_accuracy,▁▄▄▅▇▇▇████
val_loss,█▇▆▅▃▃▂▂▁▁▁

0,1
tr_accuracy,70.97381
tr_loss,0.82294
val_accuracy,71.01111
val_loss,0.8244


[34m[1mwandb[0m: Agent Starting Run: f1ct4jpi with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: rmsprop
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  rmsprop
Learning rate (initial):  0.001
Batch size:  64
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  128  ; activation function:  ReLU
Weight matrix dimention (128, 784) Bias vector dimention (128, 1)
----------------
Layer:  2  ; number of neurons:  128  ; activation function:  ReLU
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  3  ; number of neurons:  128  ; activation function:  ReLU
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  4  ; number of neurons:  128  ; activation function:  ReLU
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  5  ; number of neurons:  128  ; activation function:  ReLU
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  6  ; number of neurons:  10  ; activation function:  softmax
Weight m

100%|██████████| 656/656 [00:05<00:00, 116.20it/s]


Training Loss:  0.6049879802374815 Training Accuracy:  78.44761904761904 Test Loss:  0.6112035243639531 Testing Accuracy:  78.4
Epoch:  2


100%|██████████| 656/656 [00:04<00:00, 138.24it/s]


Training Loss:  0.5884125696825777 Training Accuracy:  78.94761904761904 Test Loss:  0.5947137376901308 Testing Accuracy:  79.07777777777778
Epoch:  3


100%|██████████| 656/656 [00:06<00:00, 94.66it/s] 


Training Loss:  0.542478204261984 Training Accuracy:  80.82142857142857 Test Loss:  0.5511448420767167 Testing Accuracy:  80.71111111111111
Epoch:  4


100%|██████████| 656/656 [00:04<00:00, 139.32it/s]


Training Loss:  0.5119516517774283 Training Accuracy:  81.90714285714286 Test Loss:  0.5208823018528355 Testing Accuracy:  81.72222222222223
Epoch:  5


100%|██████████| 656/656 [00:06<00:00, 95.53it/s] 


Training Loss:  0.4693665297276624 Training Accuracy:  83.50952380952381 Test Loss:  0.47750666711610534 Testing Accuracy:  83.35555555555555
Epoch:  6


100%|██████████| 656/656 [00:04<00:00, 140.63it/s]


Training Loss:  0.4577225588610363 Training Accuracy:  84.00714285714285 Test Loss:  0.46698078574309987 Testing Accuracy:  83.85
Epoch:  7


100%|██████████| 656/656 [00:07<00:00, 86.88it/s] 


Training Loss:  0.45050445782375365 Training Accuracy:  84.37619047619047 Test Loss:  0.460079287168315 Testing Accuracy:  84.07222222222222
Epoch:  8


100%|██████████| 656/656 [00:04<00:00, 136.79it/s]


Training Loss:  0.4542886571678982 Training Accuracy:  84.35 Test Loss:  0.46339835689724834 Testing Accuracy:  84.21111111111111
Epoch:  9


100%|██████████| 656/656 [00:07<00:00, 89.21it/s] 


Training Loss:  0.4618476692662124 Training Accuracy:  84.1595238095238 Test Loss:  0.47140672276202594 Testing Accuracy:  83.87777777777778
Epoch:  10


100%|██████████| 656/656 [00:04<00:00, 137.32it/s]


Training Loss:  0.4562104771057193 Training Accuracy:  84.36428571428571 Test Loss:  0.4659634055457473 Testing Accuracy:  84.20555555555555


0,1
tr_accuracy,▁▇▇████████
tr_loss,█▂▂▁▁▁▁▁▁▁▁
val_accuracy,▁▇█████████
val_loss,█▂▂▁▁▁▁▁▁▁▁

0,1
tr_accuracy,84.36429
tr_loss,0.45621
val_accuracy,84.20556
val_loss,0.46596


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 3d7jajlw with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: random


Hyper parameters: 

Weight initialization type :  random
Optimizer :  sgd
Learning rate (initial):  0.001
Batch size:  16
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  64  ; activation function:  sigmoid
Weight matrix dimention (64, 784) Bias vector dimention (64, 1)
----------------
Layer:  2  ; number of neurons:  64  ; activation function:  sigmoid
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  3  ; number of neurons:  64  ; activation function:  sigmoid
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  4  ; number of neurons:  64  ; activation function:  sigmoid
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  5  ; number of neurons:  64  ; activation function:  sigmoid
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  6  ; number of neurons:  10  ; activation function:  softmax
Weight matrix di

100%|██████████| 2625/2625 [00:05<00:00, 516.85it/s]


Training Loss:  2.0855657897501776 Training Accuracy:  23.821428571428573 Test Loss:  2.0886374007935333 Testing Accuracy:  23.833333333333332
Epoch:  2


100%|██████████| 2625/2625 [00:08<00:00, 327.42it/s]


Training Loss:  1.826364906026737 Training Accuracy:  34.74523809523809 Test Loss:  1.8302879606779752 Testing Accuracy:  34.24444444444445
Epoch:  3


100%|██████████| 2625/2625 [00:05<00:00, 498.04it/s]


Training Loss:  1.6359560598823046 Training Accuracy:  42.98095238095238 Test Loss:  1.641032848667294 Testing Accuracy:  42.333333333333336
Epoch:  4


100%|██████████| 2625/2625 [00:07<00:00, 330.97it/s]


Training Loss:  1.4990912933356884 Training Accuracy:  49.43333333333333 Test Loss:  1.5055615107462572 Testing Accuracy:  49.044444444444444
Epoch:  5


100%|██████████| 2625/2625 [00:04<00:00, 535.07it/s]


Training Loss:  1.39145161372169 Training Accuracy:  53.7047619047619 Test Loss:  1.3996193203368097 Testing Accuracy:  53.28333333333333


0,1
tr_accuracy,▁▄▅▆▇█
tr_loss,█▂▂▁▁▁
val_accuracy,▁▄▅▆▇█
val_loss,█▂▂▁▁▁

0,1
tr_accuracy,53.70476
tr_loss,1.39145
val_accuracy,53.28333
val_loss,1.39962


[34m[1mwandb[0m: Agent Starting Run: 9et4ah25 with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: momentum
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: random


Hyper parameters: 

Weight initialization type :  random
Optimizer :  momentum
Learning rate (initial):  0.0001
Batch size:  16
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  128  ; activation function:  ReLU
Weight matrix dimention (128, 784) Bias vector dimention (128, 1)
----------------
Layer:  2  ; number of neurons:  128  ; activation function:  ReLU
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  3  ; number of neurons:  128  ; activation function:  ReLU
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  4  ; number of neurons:  128  ; activation function:  ReLU
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  5  ; number of neurons:  128  ; activation function:  ReLU
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  6  ; number of neurons:  10  ; activation function:  softmax
Weight

100%|██████████| 2625/2625 [00:13<00:00, 187.81it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  2


100%|██████████| 2625/2625 [00:13<00:00, 201.19it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  3


100%|██████████| 2625/2625 [00:13<00:00, 201.23it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  4


100%|██████████| 2625/2625 [00:12<00:00, 212.05it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  5


100%|██████████| 2625/2625 [00:11<00:00, 223.07it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  6


100%|██████████| 2625/2625 [00:10<00:00, 240.22it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  7


100%|██████████| 2625/2625 [00:11<00:00, 220.16it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  8


100%|██████████| 2625/2625 [00:12<00:00, 202.64it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  9


100%|██████████| 2625/2625 [00:13<00:00, 199.16it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  10


100%|██████████| 2625/2625 [00:13<00:00, 197.81it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778


0,1
tr_accuracy,█▁▁▁▁▁▁▁▁▁▁
val_accuracy,█▁▁▁▁▁▁▁▁▁▁

0,1
tr_accuracy,10.07381
tr_loss,
val_accuracy,9.82778
val_loss,


[34m[1mwandb[0m: Agent Starting Run: wb9yt9q7 with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: random


Hyper parameters: 

Weight initialization type :  random
Optimizer :  adam
Learning rate (initial):  0.0001
Batch size:  16
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 784) Bias vector dimention (64, 1)
----------------
Layer:  2  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  3  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  4  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 64) Bias vector dimention (10, 1)
----------------
2625.0

 Start of training
Training Loss:  11.00198692951802 Training Accuracy:  11.630952380952381 Test Loss:  11.022097937398204 Testing Accuracy:  11.516666666666667
Epoch:  1


100%|██████████| 2625/2625 [00:08<00:00, 321.64it/s]


Training Loss:  4.079752331935752 Training Accuracy:  33.392857142857146 Test Loss:  4.198302894961299 Testing Accuracy:  32.605555555555554
Epoch:  2


100%|██████████| 2625/2625 [00:09<00:00, 289.17it/s]


Training Loss:  2.4830710794098665 Training Accuracy:  46.34047619047619 Test Loss:  2.5652664617796797 Testing Accuracy:  45.522222222222226
Epoch:  3


100%|██████████| 2625/2625 [00:10<00:00, 251.58it/s]


Training Loss:  1.7288983489441931 Training Accuracy:  53.36190476190476 Test Loss:  1.7843456336328976 Testing Accuracy:  52.76111111111111
Epoch:  4


100%|██████████| 2625/2625 [00:09<00:00, 273.07it/s]


Training Loss:  1.2953127082237559 Training Accuracy:  59.82857142857143 Test Loss:  1.3202896244365179 Testing Accuracy:  59.23888888888889
Epoch:  5


100%|██████████| 2625/2625 [00:07<00:00, 334.41it/s]


Training Loss:  1.0130533105920625 Training Accuracy:  65.62380952380953 Test Loss:  1.0358414180160387 Testing Accuracy:  65.1


0,1
tr_accuracy,▁▄▅▆▇█
tr_loss,█▃▂▂▁▁
val_accuracy,▁▄▅▆▇█
val_loss,█▃▂▂▁▁

0,1
tr_accuracy,65.62381
tr_loss,1.01305
val_accuracy,65.1
val_loss,1.03584


[34m[1mwandb[0m: Agent Starting Run: u80tenbu with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: momentum
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  momentum
Learning rate (initial):  0.0001
Batch size:  16
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  32  ; activation function:  tanh
Weight matrix dimention (32, 784) Bias vector dimention (32, 1)
----------------
Layer:  2  ; number of neurons:  32  ; activation function:  tanh
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  3  ; number of neurons:  32  ; activation function:  tanh
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  4  ; number of neurons:  32  ; activation function:  tanh
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  5  ; number of neurons:  32  ; activation function:  tanh
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  6  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (

100%|██████████| 2625/2625 [00:04<00:00, 644.53it/s]


Training Loss:  1.8624160405242005 Training Accuracy:  40.13095238095238 Test Loss:  1.861138629640957 Testing Accuracy:  40.144444444444446
Epoch:  2


100%|██████████| 2625/2625 [00:07<00:00, 370.37it/s]


Training Loss:  1.5909212050381147 Training Accuracy:  52.930952380952384 Test Loss:  1.5903655223200015 Testing Accuracy:  52.96111111111111
Epoch:  3


100%|██████████| 2625/2625 [00:04<00:00, 651.23it/s]


Training Loss:  1.405513731528012 Training Accuracy:  57.01428571428571 Test Loss:  1.4053733506103 Testing Accuracy:  57.01111111111111
Epoch:  4


100%|██████████| 2625/2625 [00:05<00:00, 440.86it/s]


Training Loss:  1.2684610582078604 Training Accuracy:  60.96666666666667 Test Loss:  1.2684490967065545 Testing Accuracy:  60.81111111111111
Epoch:  5


100%|██████████| 2625/2625 [00:03<00:00, 658.94it/s]


Training Loss:  1.162590877768401 Training Accuracy:  64.74761904761905 Test Loss:  1.1625839977716577 Testing Accuracy:  64.58888888888889


0,1
tr_accuracy,▁▅▇▇██
tr_loss,█▅▄▂▂▁
val_accuracy,▁▅▇▇██
val_loss,█▅▄▂▂▁

0,1
tr_accuracy,64.74762
tr_loss,1.16259
val_accuracy,64.58889
val_loss,1.16258


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 90avkvk1 with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: rmsprop
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  rmsprop
Learning rate (initial):  0.0001
Batch size:  32
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  32  ; activation function:  tanh
Weight matrix dimention (32, 784) Bias vector dimention (32, 1)
----------------
Layer:  2  ; number of neurons:  32  ; activation function:  tanh
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  3  ; number of neurons:  32  ; activation function:  tanh
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  4  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 32) Bias vector dimention (10, 1)
----------------
1312.0

 Start of training
Training Loss:  2.353609125497706 Training Accuracy:  7.430952380952381 Test Loss:  2.3563989123647304 Testing Accuracy:  7.35
Epoch:  1


100%|██████████| 1312/1312 [00:02<00:00, 512.63it/s]


Training Loss:  0.7550008923586335 Training Accuracy:  78.9 Test Loss:  0.760530224155239 Testing Accuracy:  78.45555555555555
Epoch:  2


100%|██████████| 1312/1312 [00:04<00:00, 302.00it/s]


Training Loss:  0.6273426809421163 Training Accuracy:  81.15238095238095 Test Loss:  0.6320646720695404 Testing Accuracy:  80.98333333333333
Epoch:  3


100%|██████████| 1312/1312 [00:02<00:00, 561.16it/s]


Training Loss:  0.5870386773368161 Training Accuracy:  81.84761904761905 Test Loss:  0.5915474165725673 Testing Accuracy:  81.79444444444445
Epoch:  4


100%|██████████| 1312/1312 [00:02<00:00, 548.42it/s]


Training Loss:  0.5679027361213084 Training Accuracy:  82.22619047619048 Test Loss:  0.572527475653136 Testing Accuracy:  82.21666666666667
Epoch:  5


100%|██████████| 1312/1312 [00:02<00:00, 572.60it/s]


Training Loss:  0.5564021698507726 Training Accuracy:  82.49761904761905 Test Loss:  0.5612482924989436 Testing Accuracy:  82.59444444444445


0,1
tr_accuracy,▁█████
tr_loss,█▂▁▁▁▁
val_accuracy,▁█████
val_loss,█▂▁▁▁▁

0,1
tr_accuracy,82.49762
tr_loss,0.5564
val_accuracy,82.59444
val_loss,0.56125


[34m[1mwandb[0m: Agent Starting Run: j2zm1h19 with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: random


Hyper parameters: 

Weight initialization type :  random
Optimizer :  adam
Learning rate (initial):  0.001
Batch size:  32
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  32  ; activation function:  sigmoid
Weight matrix dimention (32, 784) Bias vector dimention (32, 1)
----------------
Layer:  2  ; number of neurons:  32  ; activation function:  sigmoid
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  3  ; number of neurons:  32  ; activation function:  sigmoid
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  4  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 32) Bias vector dimention (10, 1)
----------------
1312.0

 Start of training
Training Loss:  3.9475626253062335 Training Accuracy:  11.133333333333333 Test Loss:  3.968842763978122 Testing Accuracy:  11.022222222222222
Epoch:  1


100%|██████████| 1312/1312 [00:02<00:00, 515.90it/s]


Training Loss:  1.31287945058173 Training Accuracy:  69.82142857142857 Test Loss:  1.3139479656899005 Testing Accuracy:  69.90555555555555
Epoch:  2


100%|██████████| 1312/1312 [00:02<00:00, 566.31it/s]


Training Loss:  1.4181221571938287 Training Accuracy:  57.826190476190476 Test Loss:  1.418391345503164 Testing Accuracy:  57.74444444444445
Epoch:  3


100%|██████████| 1312/1312 [00:05<00:00, 261.41it/s]


Training Loss:  1.4243591302499035 Training Accuracy:  52.97857142857143 Test Loss:  1.4234900459792572 Testing Accuracy:  53.05
Epoch:  4


100%|██████████| 1312/1312 [00:02<00:00, 524.79it/s]


Training Loss:  1.4170419365094833 Training Accuracy:  50.95 Test Loss:  1.415745655775778 Testing Accuracy:  51.22222222222222
Epoch:  5


100%|██████████| 1312/1312 [00:02<00:00, 535.50it/s]


Training Loss:  1.4129682622744526 Training Accuracy:  49.911904761904765 Test Loss:  1.4114215883802053 Testing Accuracy:  49.827777777777776
Epoch:  6


100%|██████████| 1312/1312 [00:02<00:00, 551.06it/s]


Training Loss:  1.4102814873984861 Training Accuracy:  49.07142857142857 Test Loss:  1.408591403691514 Testing Accuracy:  48.672222222222224
Epoch:  7


100%|██████████| 1312/1312 [00:04<00:00, 299.77it/s]


Training Loss:  1.407858418926539 Training Accuracy:  49.18571428571428 Test Loss:  1.4060841070661065 Testing Accuracy:  49.1
Epoch:  8


100%|██████████| 1312/1312 [00:03<00:00, 436.39it/s]


Training Loss:  1.4058444858537236 Training Accuracy:  50.98571428571429 Test Loss:  1.4040202274386646 Testing Accuracy:  51.016666666666666
Epoch:  9


100%|██████████| 1312/1312 [00:02<00:00, 527.50it/s]


Training Loss:  1.40415897059548 Training Accuracy:  50.88095238095238 Test Loss:  1.4023047908896382 Testing Accuracy:  51.1
Epoch:  10


100%|██████████| 1312/1312 [00:02<00:00, 541.75it/s]


Training Loss:  1.4027562886177207 Training Accuracy:  50.81666666666667 Test Loss:  1.4008838010641373 Testing Accuracy:  51.233333333333334


0,1
tr_accuracy,▁█▇▆▆▆▆▆▆▆▆
tr_loss,█▁▁▁▁▁▁▁▁▁▁
val_accuracy,▁█▇▆▆▆▅▆▆▆▆
val_loss,█▁▁▁▁▁▁▁▁▁▁

0,1
tr_accuracy,50.81667
tr_loss,1.40276
val_accuracy,51.23333
val_loss,1.40088


[34m[1mwandb[0m: Agent Starting Run: pn68oav3 with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: nag
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: random


Hyper parameters: 

Weight initialization type :  random
Optimizer :  nag
Learning rate (initial):  0.0001
Batch size:  64
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  128  ; activation function:  sigmoid
Weight matrix dimention (128, 784) Bias vector dimention (128, 1)
----------------
Layer:  2  ; number of neurons:  128  ; activation function:  sigmoid
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  3  ; number of neurons:  128  ; activation function:  sigmoid
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  4  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 128) Bias vector dimention (10, 1)
----------------
656.0

 Start of training
Training Loss:  12.350601382298823 Training Accuracy:  10.67142857142857 Test Loss:  12.356120205691463 Testing Accuracy:  10.411111111111111
Epoch:  1


100%|██████████| 656/656 [00:04<00:00, 158.21it/s]


Training Loss:  0.9351051769281497 Training Accuracy:  66.31428571428572 Test Loss:  0.9417446267098346 Testing Accuracy:  65.2611111111111
Epoch:  2


100%|██████████| 656/656 [00:06<00:00, 107.20it/s]


Training Loss:  0.9065960130084849 Training Accuracy:  67.58333333333333 Test Loss:  0.9128327620539335 Testing Accuracy:  66.7388888888889
Epoch:  3


100%|██████████| 656/656 [00:04<00:00, 151.81it/s]


Training Loss:  0.8965692692687243 Training Accuracy:  68.00714285714285 Test Loss:  0.9025976948095339 Testing Accuracy:  67.36111111111111
Epoch:  4


100%|██████████| 656/656 [00:04<00:00, 155.93it/s]


Training Loss:  0.8916779277874675 Training Accuracy:  68.24761904761905 Test Loss:  0.8975414434971647 Testing Accuracy:  67.59444444444445
Epoch:  5


100%|██████████| 656/656 [00:05<00:00, 125.43it/s]


Training Loss:  0.889123947659943 Training Accuracy:  68.34761904761905 Test Loss:  0.8948716306396372 Testing Accuracy:  67.66111111111111
Epoch:  6


100%|██████████| 656/656 [00:03<00:00, 166.68it/s]


Training Loss:  0.8874357839889184 Training Accuracy:  68.4095238095238 Test Loss:  0.8930862315558422 Testing Accuracy:  67.75555555555556
Epoch:  7


100%|██████████| 656/656 [00:07<00:00, 91.92it/s] 


Training Loss:  0.8860529024599193 Training Accuracy:  68.46904761904761 Test Loss:  0.8916212719511617 Testing Accuracy:  67.78888888888889
Epoch:  8


100%|██████████| 656/656 [00:03<00:00, 166.17it/s]


Training Loss:  0.8852141702011225 Training Accuracy:  68.53333333333333 Test Loss:  0.8907302829378163 Testing Accuracy:  67.81111111111112
Epoch:  9


100%|██████████| 656/656 [00:05<00:00, 111.52it/s]


Training Loss:  0.884670227819539 Training Accuracy:  68.54285714285714 Test Loss:  0.8901496052526897 Testing Accuracy:  67.85
Epoch:  10


100%|██████████| 656/656 [00:03<00:00, 171.80it/s]


Training Loss:  0.8840476721578798 Training Accuracy:  68.53571428571429 Test Loss:  0.8894844585126598 Testing Accuracy:  67.9


0,1
tr_accuracy,▁██████████
tr_loss,█▁▁▁▁▁▁▁▁▁▁
val_accuracy,▁██████████
val_loss,█▁▁▁▁▁▁▁▁▁▁

0,1
tr_accuracy,68.53571
tr_loss,0.88405
val_accuracy,67.9
val_loss,0.88948


[34m[1mwandb[0m: Agent Starting Run: pt3d0xrd with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: nag
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: random


Hyper parameters: 

Weight initialization type :  random
Optimizer :  nag
Learning rate (initial):  0.0001
Batch size:  64
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 784) Bias vector dimention (64, 1)
----------------
Layer:  2  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  3  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  4  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  5  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 64) Bias vector dimention (10, 1)
----------------
656.0

 Start of training
Training Loss:  13.517491749999515 Training Accuracy:  10.097619047619

100%|██████████| 656/656 [00:04<00:00, 145.43it/s]


Training Loss:  18.40043110504032 Training Accuracy:  9.964285714285714 Test Loss:  18.493179420129902 Testing Accuracy:  10.094444444444445
Epoch:  2


100%|██████████| 656/656 [00:03<00:00, 178.88it/s]


Training Loss:  6.107298397444025 Training Accuracy:  20.10952380952381 Test Loss:  6.186708840569287 Testing Accuracy:  19.544444444444444
Epoch:  3


100%|██████████| 656/656 [00:06<00:00, 105.88it/s]


Training Loss:  16.591964424291433 Training Accuracy:  10.073809523809524 Test Loss:  16.65931813933163 Testing Accuracy:  9.827777777777778
Epoch:  4


100%|██████████| 656/656 [00:03<00:00, 181.82it/s]


Training Loss:  17.514024288169697 Training Accuracy:  9.973809523809523 Test Loss:  17.623861242384166 Testing Accuracy:  10.061111111111112
Epoch:  5


100%|██████████| 656/656 [00:04<00:00, 147.73it/s]


Training Loss:  16.949037118487436 Training Accuracy:  9.973809523809523 Test Loss:  16.99699096691415 Testing Accuracy:  10.061111111111112


0,1
tr_accuracy,▁▁█▁▁▁
tr_loss,▅█▁▇▇▇
val_accuracy,▁▁█▁▁▁
val_loss,▅█▁▇█▇

0,1
tr_accuracy,9.97381
tr_loss,16.94904
val_accuracy,10.06111
val_loss,16.99699


[34m[1mwandb[0m: Agent Starting Run: 40iffh5w with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: rmsprop
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: random


Hyper parameters: 

Weight initialization type :  random
Optimizer :  rmsprop
Learning rate (initial):  0.0001
Batch size:  32
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  128  ; activation function:  ReLU
Weight matrix dimention (128, 784) Bias vector dimention (128, 1)
----------------
Layer:  2  ; number of neurons:  128  ; activation function:  ReLU
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  3  ; number of neurons:  128  ; activation function:  ReLU
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  4  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 128) Bias vector dimention (10, 1)
----------------
1312.0

 Start of training
Training Loss:  nan Training Accuracy:  17.535714285714285 Test Loss:  nan Testing Accuracy:  17.67222222222222
Epoch:  1


100%|██████████| 1312/1312 [00:08<00:00, 151.57it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  2


100%|██████████| 1312/1312 [00:05<00:00, 237.22it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  3


100%|██████████| 1312/1312 [00:09<00:00, 144.79it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  4


100%|██████████| 1312/1312 [00:05<00:00, 224.67it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  5


100%|██████████| 1312/1312 [00:08<00:00, 154.96it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  6


100%|██████████| 1312/1312 [00:06<00:00, 217.33it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  7


100%|██████████| 1312/1312 [00:07<00:00, 169.93it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  8


100%|██████████| 1312/1312 [00:06<00:00, 193.26it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  9


100%|██████████| 1312/1312 [00:06<00:00, 201.95it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  10


100%|██████████| 1312/1312 [00:06<00:00, 191.90it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778


0,1
tr_accuracy,█▁▁▁▁▁▁▁▁▁▁
val_accuracy,█▁▁▁▁▁▁▁▁▁▁

0,1
tr_accuracy,10.07381
tr_loss,
val_accuracy,9.82778
val_loss,


[34m[1mwandb[0m: Agent Starting Run: cvsvrsmq with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: momentum
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  momentum
Learning rate (initial):  0.001
Batch size:  64
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  32  ; activation function:  ReLU
Weight matrix dimention (32, 784) Bias vector dimention (32, 1)
----------------
Layer:  2  ; number of neurons:  32  ; activation function:  ReLU
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  3  ; number of neurons:  32  ; activation function:  ReLU
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  4  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 32) Bias vector dimention (10, 1)
----------------
656.0

 Start of training
Training Loss:  2.3351935903683336 Training Accuracy:  9.383333333333333 Test Loss:  2.3344073399958747 Testing Accuracy:  9.45
Epoch:  1


100%|██████████| 656/656 [00:00<00:00, 727.26it/s]


Training Loss:  1.9265877460955785 Training Accuracy:  42.38333333333333 Test Loss:  1.92703437205917 Testing Accuracy:  42.15555555555556
Epoch:  2


100%|██████████| 656/656 [00:00<00:00, 722.02it/s]


Training Loss:  1.357430196350505 Training Accuracy:  55.94047619047619 Test Loss:  1.359114104042043 Testing Accuracy:  55.205555555555556
Epoch:  3


100%|██████████| 656/656 [00:01<00:00, 618.58it/s]


Training Loss:  0.9791508226221181 Training Accuracy:  65.96428571428571 Test Loss:  0.9800801962855 Testing Accuracy:  65.69444444444444
Epoch:  4


100%|██████████| 656/656 [00:01<00:00, 336.50it/s]


Training Loss:  0.850254365585281 Training Accuracy:  68.23333333333333 Test Loss:  0.8523328853782053 Testing Accuracy:  67.77777777777777
Epoch:  5


100%|██████████| 656/656 [00:02<00:00, 310.57it/s]


Training Loss:  0.7866385879370166 Training Accuracy:  70.18571428571428 Test Loss:  0.7901829471443574 Testing Accuracy:  69.86111111111111


0,1
tr_accuracy,▁▅▆███
tr_loss,█▆▄▂▁▁
val_accuracy,▁▅▆███
val_loss,█▆▄▂▁▁

0,1
tr_accuracy,70.18571
tr_loss,0.78664
val_accuracy,69.86111
val_loss,0.79018


[34m[1mwandb[0m: Agent Starting Run: 2jpmuztg with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: momentum
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: random


Hyper parameters: 

Weight initialization type :  random
Optimizer :  momentum
Learning rate (initial):  0.0001
Batch size:  64
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  128  ; activation function:  sigmoid
Weight matrix dimention (128, 784) Bias vector dimention (128, 1)
----------------
Layer:  2  ; number of neurons:  128  ; activation function:  sigmoid
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  3  ; number of neurons:  128  ; activation function:  sigmoid
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  4  ; number of neurons:  128  ; activation function:  sigmoid
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  5  ; number of neurons:  128  ; activation function:  sigmoid
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  6  ; number of neurons:  10  ; activation function: 

100%|██████████| 656/656 [00:07<00:00, 83.31it/s] 


Training Loss:  5.687427342354977 Training Accuracy:  13.395238095238096 Test Loss:  5.678318945109152 Testing Accuracy:  13.316666666666666
Epoch:  2


100%|██████████| 656/656 [00:05<00:00, 122.02it/s]


Training Loss:  4.526432175216481 Training Accuracy:  16.38095238095238 Test Loss:  4.522659813113472 Testing Accuracy:  16.416666666666668
Epoch:  3


100%|██████████| 656/656 [00:08<00:00, 78.20it/s] 


Training Loss:  3.7080554291819454 Training Accuracy:  18.364285714285714 Test Loss:  3.7077042498128923 Testing Accuracy:  18.316666666666666
Epoch:  4


100%|██████████| 656/656 [00:05<00:00, 114.82it/s]


Training Loss:  3.0351250322477217 Training Accuracy:  20.864285714285714 Test Loss:  3.0385971074980604 Testing Accuracy:  20.927777777777777
Epoch:  5


100%|██████████| 656/656 [00:06<00:00, 101.25it/s]


Training Loss:  2.6225502867372286 Training Accuracy:  23.976190476190474 Test Loss:  2.62743953367514 Testing Accuracy:  23.7
Epoch:  6


100%|██████████| 656/656 [00:07<00:00, 82.30it/s]


Training Loss:  2.377701518324886 Training Accuracy:  25.676190476190477 Test Loss:  2.380941486440807 Testing Accuracy:  25.333333333333332
Epoch:  7


100%|██████████| 656/656 [00:05<00:00, 115.45it/s]


Training Loss:  2.210253474856851 Training Accuracy:  26.99047619047619 Test Loss:  2.211186743101183 Testing Accuracy:  26.583333333333332
Epoch:  8


100%|██████████| 656/656 [00:08<00:00, 74.52it/s] 


Training Loss:  2.109538134805136 Training Accuracy:  28.083333333333332 Test Loss:  2.109006995814087 Testing Accuracy:  27.95
Epoch:  9


100%|██████████| 656/656 [00:05<00:00, 119.45it/s]


Training Loss:  2.046665078846809 Training Accuracy:  29.176190476190477 Test Loss:  2.045568142902532 Testing Accuracy:  29.244444444444444
Epoch:  10


100%|██████████| 656/656 [00:06<00:00, 95.25it/s] 


Training Loss:  1.9975453432347492 Training Accuracy:  30.283333333333335 Test Loss:  1.996367171334857 Testing Accuracy:  30.43888888888889


0,1
tr_accuracy,▁▂▃▄▅▆▆▇▇██
tr_loss,█▅▄▃▂▂▁▁▁▁▁
val_accuracy,▁▂▃▄▅▆▆▇▇██
val_loss,█▅▄▃▂▂▁▁▁▁▁

0,1
tr_accuracy,30.28333
tr_loss,1.99755
val_accuracy,30.43889
val_loss,1.99637


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 2nrz7ivx with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  adam
Learning rate (initial):  0.001
Batch size:  16
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  128  ; activation function:  sigmoid
Weight matrix dimention (128, 784) Bias vector dimention (128, 1)
----------------
Layer:  2  ; number of neurons:  128  ; activation function:  sigmoid
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  3  ; number of neurons:  128  ; activation function:  sigmoid
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  4  ; number of neurons:  128  ; activation function:  sigmoid
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  5  ; number of neurons:  128  ; activation function:  sigmoid
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  6  ; number of neurons:  10  ; activation function:  soft

100%|██████████| 2625/2625 [00:26<00:00, 97.62it/s] 


Training Loss:  2.313777980456623 Training Accuracy:  9.916666666666666 Test Loss:  2.3135891772840225 Testing Accuracy:  10.194444444444445
Epoch:  2


100%|██████████| 2625/2625 [00:25<00:00, 103.66it/s]


Training Loss:  2.308757918518721 Training Accuracy:  9.973809523809523 Test Loss:  2.3088406135192883 Testing Accuracy:  10.061111111111112
Epoch:  3


100%|██████████| 2625/2625 [00:23<00:00, 113.29it/s]


Training Loss:  2.3046564792855078 Training Accuracy:  9.973809523809523 Test Loss:  2.3047784341221975 Testing Accuracy:  10.061111111111112
Epoch:  4


100%|██████████| 2625/2625 [00:23<00:00, 110.10it/s]


Training Loss:  2.303381578548793 Training Accuracy:  9.973809523809523 Test Loss:  2.3034493789287844 Testing Accuracy:  10.061111111111112
Epoch:  5


100%|██████████| 2625/2625 [00:24<00:00, 106.01it/s]


Training Loss:  2.3029613877520156 Training Accuracy:  9.973809523809523 Test Loss:  2.3030163344113266 Testing Accuracy:  10.061111111111112
Epoch:  6


100%|██████████| 2625/2625 [00:22<00:00, 114.58it/s]


Training Loss:  2.302794218091196 Training Accuracy:  9.973809523809523 Test Loss:  2.3028485721384158 Testing Accuracy:  10.061111111111112
Epoch:  7


100%|██████████| 2625/2625 [00:23<00:00, 113.56it/s]


Training Loss:  2.3027184008066275 Training Accuracy:  9.973809523809523 Test Loss:  2.302769841594593 Testing Accuracy:  10.061111111111112
Epoch:  8


100%|██████████| 2625/2625 [00:25<00:00, 102.90it/s]


Training Loss:  2.302683085459609 Training Accuracy:  9.973809523809523 Test Loss:  2.302729161034319 Testing Accuracy:  10.061111111111112
Epoch:  9


100%|██████████| 2625/2625 [00:22<00:00, 117.18it/s]


Training Loss:  2.302665146629138 Training Accuracy:  9.916666666666666 Test Loss:  2.302706048712537 Testing Accuracy:  10.194444444444445
Epoch:  10


100%|██████████| 2625/2625 [00:23<00:00, 113.19it/s]


Training Loss:  2.3026550791977285 Training Accuracy:  9.916666666666666 Test Loss:  2.3026913575409846 Testing Accuracy:  10.194444444444445


0,1
tr_accuracy,▁▁███████▁▁
tr_loss,█▁▁▁▁▁▁▁▁▁▁
val_accuracy,██▁▁▁▁▁▁▁██
val_loss,█▁▁▁▁▁▁▁▁▁▁

0,1
tr_accuracy,9.91667
tr_loss,2.30266
val_accuracy,10.19444
val_loss,2.30269


[34m[1mwandb[0m: Agent Starting Run: 8xsu28s9 with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: rmsprop
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: random


Hyper parameters: 

Weight initialization type :  random
Optimizer :  rmsprop
Learning rate (initial):  0.001
Batch size:  16
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  128  ; activation function:  tanh
Weight matrix dimention (128, 784) Bias vector dimention (128, 1)
----------------
Layer:  2  ; number of neurons:  128  ; activation function:  tanh
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  3  ; number of neurons:  128  ; activation function:  tanh
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  4  ; number of neurons:  128  ; activation function:  tanh
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  5  ; number of neurons:  128  ; activation function:  tanh
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  6  ; number of neurons:  10  ; activation function:  softmax
Weight m

100%|██████████| 2625/2625 [00:25<00:00, 103.60it/s]


Training Loss:  4.3922900522232675 Training Accuracy:  39.733333333333334 Test Loss:  4.338285021262083 Testing Accuracy:  39.827777777777776
Epoch:  2


100%|██████████| 2625/2625 [00:22<00:00, 116.26it/s]


Training Loss:  2.1543826985860015 Training Accuracy:  49.24523809523809 Test Loss:  2.2110804475295827 Testing Accuracy:  49.25
Epoch:  3


100%|██████████| 2625/2625 [00:21<00:00, 121.90it/s]


Training Loss:  1.4507725416676593 Training Accuracy:  55.03095238095238 Test Loss:  1.4670405428680944 Testing Accuracy:  55.077777777777776
Epoch:  4


100%|██████████| 2625/2625 [00:24<00:00, 107.60it/s]


Training Loss:  1.2182535904250833 Training Accuracy:  58.83095238095238 Test Loss:  1.2347763872055781 Testing Accuracy:  58.25
Epoch:  5


100%|██████████| 2625/2625 [00:24<00:00, 108.19it/s]


Training Loss:  1.1056093373591582 Training Accuracy:  62.95238095238095 Test Loss:  1.1172544174779702 Testing Accuracy:  62.81666666666667
Epoch:  6


100%|██████████| 2625/2625 [00:21<00:00, 119.67it/s]


Training Loss:  1.0298199996979196 Training Accuracy:  65.72619047619048 Test Loss:  1.062801316928783 Testing Accuracy:  65.24444444444444
Epoch:  7


100%|██████████| 2625/2625 [00:22<00:00, 116.13it/s]


Training Loss:  0.9743482329698333 Training Accuracy:  66.91428571428571 Test Loss:  1.0186063998111057 Testing Accuracy:  65.88333333333334
Epoch:  8


100%|██████████| 2625/2625 [00:24<00:00, 108.20it/s]


Training Loss:  0.965726902162753 Training Accuracy:  68.32142857142857 Test Loss:  0.9916137338014984 Testing Accuracy:  67.71666666666667
Epoch:  9


100%|██████████| 2625/2625 [00:23<00:00, 109.74it/s]


Training Loss:  0.9250188493975227 Training Accuracy:  68.73095238095237 Test Loss:  0.9453465411655247 Testing Accuracy:  68.20555555555555
Epoch:  10


100%|██████████| 2625/2625 [00:22<00:00, 117.14it/s]


Training Loss:  0.9120986531017805 Training Accuracy:  69.14285714285714 Test Loss:  0.9349092264449654 Testing Accuracy:  68.04444444444445


0,1
tr_accuracy,▁▅▆▆▇▇█████
tr_loss,█▃▂▁▁▁▁▁▁▁▁
val_accuracy,▁▅▆▆▇▇█████
val_loss,█▃▂▁▁▁▁▁▁▁▁

0,1
tr_accuracy,69.14286
tr_loss,0.9121
val_accuracy,68.04444
val_loss,0.93491


[34m[1mwandb[0m: Agent Starting Run: f90ct891 with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: random


Hyper parameters: 

Weight initialization type :  random
Optimizer :  adam
Learning rate (initial):  0.0001
Batch size:  64
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  64  ; activation function:  sigmoid
Weight matrix dimention (64, 784) Bias vector dimention (64, 1)
----------------
Layer:  2  ; number of neurons:  64  ; activation function:  sigmoid
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  3  ; number of neurons:  64  ; activation function:  sigmoid
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  4  ; number of neurons:  64  ; activation function:  sigmoid
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  5  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 64) Bias vector dimention (10, 1)
----------------
656.0

 Start of training
Training Loss:  4.390239345490793 Training Accuracy:  10.

100%|██████████| 656/656 [00:03<00:00, 165.25it/s]


Training Loss:  1.5231167049640566 Training Accuracy:  47.98809523809524 Test Loss:  1.5389991085149108 Testing Accuracy:  47.34444444444444
Epoch:  2


100%|██████████| 656/656 [00:04<00:00, 140.97it/s]


Training Loss:  1.1028198228575625 Training Accuracy:  61.27857142857143 Test Loss:  1.118657904301443 Testing Accuracy:  60.73888888888889
Epoch:  3


100%|██████████| 656/656 [00:03<00:00, 175.47it/s]


Training Loss:  0.9177521351509493 Training Accuracy:  67.36428571428571 Test Loss:  0.9296818703734605 Testing Accuracy:  66.66666666666667
Epoch:  4


100%|██████████| 656/656 [00:03<00:00, 169.33it/s]


Training Loss:  0.817031634441504 Training Accuracy:  70.61666666666666 Test Loss:  0.8297707279580322 Testing Accuracy:  70.17222222222222
Epoch:  5


100%|██████████| 656/656 [00:04<00:00, 137.15it/s]


Training Loss:  0.7549635743770388 Training Accuracy:  72.72380952380952 Test Loss:  0.7697140385904795 Testing Accuracy:  72.40555555555555
Epoch:  6


100%|██████████| 656/656 [00:03<00:00, 202.02it/s]


Training Loss:  0.7123267376514764 Training Accuracy:  74.23809523809524 Test Loss:  0.7289678097548598 Testing Accuracy:  73.81666666666666
Epoch:  7


100%|██████████| 656/656 [00:03<00:00, 201.41it/s]


Training Loss:  0.68007277123784 Training Accuracy:  75.38095238095238 Test Loss:  0.6985069632301437 Testing Accuracy:  74.95
Epoch:  8


100%|██████████| 656/656 [00:05<00:00, 125.13it/s]


Training Loss:  0.6543816288741363 Training Accuracy:  76.36428571428571 Test Loss:  0.674464278463611 Testing Accuracy:  75.86111111111111
Epoch:  9


100%|██████████| 656/656 [00:03<00:00, 192.61it/s]


Training Loss:  0.6332783277761073 Training Accuracy:  77.21190476190476 Test Loss:  0.6548006031359712 Testing Accuracy:  76.58888888888889
Epoch:  10


100%|██████████| 656/656 [00:03<00:00, 202.76it/s]


Training Loss:  0.6155881295931176 Training Accuracy:  77.94285714285714 Test Loss:  0.6382282093184537 Testing Accuracy:  77.13333333333334


0,1
tr_accuracy,▁▅▆▇▇▇█████
tr_loss,█▃▂▂▁▁▁▁▁▁▁
val_accuracy,▁▅▆▇▇██████
val_loss,█▃▂▂▁▁▁▁▁▁▁

0,1
tr_accuracy,77.94286
tr_loss,0.61559
val_accuracy,77.13333
val_loss,0.63823


[34m[1mwandb[0m: Agent Starting Run: ouuk8z1g with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: nag
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: random


Hyper parameters: 

Weight initialization type :  random
Optimizer :  nag
Learning rate (initial):  0.001
Batch size:  32
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  64  ; activation function:  sigmoid
Weight matrix dimention (64, 784) Bias vector dimention (64, 1)
----------------
Layer:  2  ; number of neurons:  64  ; activation function:  sigmoid
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  3  ; number of neurons:  64  ; activation function:  sigmoid
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  4  ; number of neurons:  64  ; activation function:  sigmoid
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  5  ; number of neurons:  64  ; activation function:  sigmoid
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  6  ; number of neurons:  10  ; activation function:  softmax
Weight matrix di

100%|██████████| 1312/1312 [00:04<00:00, 310.00it/s]


Training Loss:  0.6018322128073912 Training Accuracy:  78.63333333333334 Test Loss:  0.6211721520459369 Testing Accuracy:  78.12777777777778
Epoch:  2


100%|██████████| 1312/1312 [00:07<00:00, 186.87it/s]


Training Loss:  0.5348941088064005 Training Accuracy:  81.15714285714286 Test Loss:  0.5594296032895021 Testing Accuracy:  80.58333333333333
Epoch:  3


100%|██████████| 1312/1312 [00:04<00:00, 292.19it/s]


Training Loss:  0.4874951558457265 Training Accuracy:  82.55714285714286 Test Loss:  0.517321568773185 Testing Accuracy:  81.87777777777778
Epoch:  4


100%|██████████| 1312/1312 [00:05<00:00, 225.09it/s]


Training Loss:  0.45496951140029185 Training Accuracy:  83.71428571428571 Test Loss:  0.4923006362678429 Testing Accuracy:  83.2611111111111
Epoch:  5


100%|██████████| 1312/1312 [00:04<00:00, 280.49it/s]


Training Loss:  0.4326298199914993 Training Accuracy:  84.83333333333333 Test Loss:  0.47763120898983197 Testing Accuracy:  83.75555555555556


0,1
tr_accuracy,▁▇████
tr_loss,█▁▁▁▁▁
val_accuracy,▁▇████
val_loss,█▁▁▁▁▁

0,1
tr_accuracy,84.83333
tr_loss,0.43263
val_accuracy,83.75556
val_loss,0.47763


[34m[1mwandb[0m: Agent Starting Run: v8jl35hc with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: rmsprop
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  rmsprop
Learning rate (initial):  0.001
Batch size:  32
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  128  ; activation function:  tanh
Weight matrix dimention (128, 784) Bias vector dimention (128, 1)
----------------
Layer:  2  ; number of neurons:  128  ; activation function:  tanh
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  3  ; number of neurons:  128  ; activation function:  tanh
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  4  ; number of neurons:  128  ; activation function:  tanh
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  5  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 128) Bias vector dimention (10, 1)
----------------
1312.0

 Start of training
Training Loss:  2.2389281994101586 Training Accura

100%|██████████| 1312/1312 [00:13<00:00, 94.06it/s] 


Training Loss:  0.6326933598513909 Training Accuracy:  75.25238095238095 Test Loss:  0.634526168587043 Testing Accuracy:  75.51666666666667
Epoch:  2


100%|██████████| 1312/1312 [00:13<00:00, 98.39it/s] 


Training Loss:  0.6055153871236718 Training Accuracy:  77.18095238095238 Test Loss:  0.6079162341690958 Testing Accuracy:  77.43888888888888
Epoch:  3


100%|██████████| 1312/1312 [00:13<00:00, 97.41it/s] 


Training Loss:  0.5913256582734381 Training Accuracy:  78.53809523809524 Test Loss:  0.5936883115332723 Testing Accuracy:  78.77222222222223
Epoch:  4


100%|██████████| 1312/1312 [00:13<00:00, 98.43it/s] 


Training Loss:  0.5839618435209354 Training Accuracy:  79.0952380952381 Test Loss:  0.5866682353243258 Testing Accuracy:  79.35555555555555
Epoch:  5


100%|██████████| 1312/1312 [00:13<00:00, 99.85it/s] 


Training Loss:  0.5801860738482343 Training Accuracy:  79.23333333333333 Test Loss:  0.58359247889929 Testing Accuracy:  79.39444444444445
Epoch:  6


100%|██████████| 1312/1312 [00:12<00:00, 101.01it/s]


Training Loss:  0.5745869408109125 Training Accuracy:  79.71190476190476 Test Loss:  0.5777759628784168 Testing Accuracy:  79.87222222222222
Epoch:  7


100%|██████████| 1312/1312 [00:12<00:00, 108.32it/s]


Training Loss:  0.5665010600883579 Training Accuracy:  80.17142857142858 Test Loss:  0.5699911038166922 Testing Accuracy:  80.2388888888889
Epoch:  8


100%|██████████| 1312/1312 [00:12<00:00, 103.27it/s]


Training Loss:  0.5661633326842759 Training Accuracy:  80.05952380952381 Test Loss:  0.5700937942046898 Testing Accuracy:  80.10555555555555
Epoch:  9


100%|██████████| 1312/1312 [00:13<00:00, 97.98it/s] 


Training Loss:  0.5683453122696448 Training Accuracy:  80.02619047619048 Test Loss:  0.5722727480576382 Testing Accuracy:  80.12222222222222
Epoch:  10


100%|██████████| 1312/1312 [00:13<00:00, 96.09it/s] 


Training Loss:  0.5643141074622654 Training Accuracy:  80.24761904761905 Test Loss:  0.5684353957176072 Testing Accuracy:  80.25555555555556


0,1
tr_accuracy,▁▇█████████
tr_loss,█▁▁▁▁▁▁▁▁▁▁
val_accuracy,▁▇█████████
val_loss,█▁▁▁▁▁▁▁▁▁▁

0,1
tr_accuracy,80.24762
tr_loss,0.56431
val_accuracy,80.25556
val_loss,0.56844


[34m[1mwandb[0m: Agent Starting Run: gbpk4c79 with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: rmsprop
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  rmsprop
Learning rate (initial):  0.001
Batch size:  16
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  128  ; activation function:  tanh
Weight matrix dimention (128, 784) Bias vector dimention (128, 1)
----------------
Layer:  2  ; number of neurons:  128  ; activation function:  tanh
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  3  ; number of neurons:  128  ; activation function:  tanh
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  4  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 128) Bias vector dimention (10, 1)
----------------
2625.0

 Start of training
Training Loss:  2.4215043279724706 Training Accuracy:  8.176190476190476 Test Loss:  2.4208142850925896 Testing Accuracy:  7.944444444444445
Epoch:  1


100%|██████████| 2625/2625 [00:15<00:00, 164.50it/s]


Training Loss:  0.7135813428331852 Training Accuracy:  75.0547619047619 Test Loss:  0.7201814990735398 Testing Accuracy:  75.07222222222222
Epoch:  2


100%|██████████| 2625/2625 [00:15<00:00, 169.65it/s]


Training Loss:  0.7073445242907084 Training Accuracy:  75.45238095238095 Test Loss:  0.7142348206355446 Testing Accuracy:  75.47777777777777
Epoch:  3


100%|██████████| 2625/2625 [00:15<00:00, 164.17it/s]


Training Loss:  0.7052029821185972 Training Accuracy:  75.63571428571429 Test Loss:  0.7119878220120398 Testing Accuracy:  75.57222222222222
Epoch:  4


100%|██████████| 2625/2625 [00:15<00:00, 167.33it/s]


Training Loss:  0.704615355811432 Training Accuracy:  75.8047619047619 Test Loss:  0.7113252497036027 Testing Accuracy:  75.81111111111112
Epoch:  5


100%|██████████| 2625/2625 [00:16<00:00, 160.03it/s]


Training Loss:  0.7050333104119505 Training Accuracy:  75.79523809523809 Test Loss:  0.7117303600338674 Testing Accuracy:  75.78888888888889


0,1
tr_accuracy,▁█████
tr_loss,█▁▁▁▁▁
val_accuracy,▁█████
val_loss,█▁▁▁▁▁

0,1
tr_accuracy,75.79524
tr_loss,0.70503
val_accuracy,75.78889
val_loss,0.71173


[34m[1mwandb[0m: Agent Starting Run: jjxp0btk with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: rmsprop
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  rmsprop
Learning rate (initial):  0.001
Batch size:  32
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  32  ; activation function:  sigmoid
Weight matrix dimention (32, 784) Bias vector dimention (32, 1)
----------------
Layer:  2  ; number of neurons:  32  ; activation function:  sigmoid
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  3  ; number of neurons:  32  ; activation function:  sigmoid
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  4  ; number of neurons:  32  ; activation function:  sigmoid
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  5  ; number of neurons:  32  ; activation function:  sigmoid
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  6  ; number of neurons:  10  ; activation function:  softmax
Weight matri

100%|██████████| 1312/1312 [00:02<00:00, 492.01it/s]


Training Loss:  2.304053861902207 Training Accuracy:  9.973809523809523 Test Loss:  2.3039856648153028 Testing Accuracy:  10.061111111111112
Epoch:  2


100%|██████████| 1312/1312 [00:05<00:00, 251.79it/s]


Training Loss:  2.3037820887667797 Training Accuracy:  9.973809523809523 Test Loss:  2.303715992416434 Testing Accuracy:  10.061111111111112
Epoch:  3


100%|██████████| 1312/1312 [00:02<00:00, 493.83it/s]


Training Loss:  2.303636779720581 Training Accuracy:  9.973809523809523 Test Loss:  2.303570510223257 Testing Accuracy:  10.061111111111112
Epoch:  4


100%|██████████| 1312/1312 [00:02<00:00, 479.21it/s]


Training Loss:  2.303547953578309 Training Accuracy:  9.973809523809523 Test Loss:  2.303479805975363 Testing Accuracy:  10.061111111111112
Epoch:  5


100%|██████████| 1312/1312 [00:02<00:00, 487.17it/s]


Training Loss:  2.3034540357367868 Training Accuracy:  9.973809523809523 Test Loss:  2.3033804644792557 Testing Accuracy:  10.061111111111112


0,1
tr_accuracy,█▁▁▁▁▁
tr_loss,█▁▁▁▁▁
val_accuracy,▁█████
val_loss,█▁▁▁▁▁

0,1
tr_accuracy,9.97381
tr_loss,2.30345
val_accuracy,10.06111
val_loss,2.30338


[34m[1mwandb[0m: Agent Starting Run: luusvot7 with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: rmsprop
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: random


Hyper parameters: 

Weight initialization type :  random
Optimizer :  rmsprop
Learning rate (initial):  0.001
Batch size:  64
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  128  ; activation function:  sigmoid
Weight matrix dimention (128, 784) Bias vector dimention (128, 1)
----------------
Layer:  2  ; number of neurons:  128  ; activation function:  sigmoid
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  3  ; number of neurons:  128  ; activation function:  sigmoid
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  4  ; number of neurons:  128  ; activation function:  sigmoid
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  5  ; number of neurons:  128  ; activation function:  sigmoid
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  6  ; number of neurons:  10  ; activation function:  s

100%|██████████| 656/656 [00:08<00:00, 79.53it/s]


Training Loss:  0.6291469068069748 Training Accuracy:  77.94285714285714 Test Loss:  0.6709986034381978 Testing Accuracy:  76.92222222222222
Epoch:  2


100%|██████████| 656/656 [00:06<00:00, 103.43it/s]


Training Loss:  0.5267349993179232 Training Accuracy:  81.29523809523809 Test Loss:  0.5789673770748869 Testing Accuracy:  79.55555555555556
Epoch:  3


100%|██████████| 656/656 [00:08<00:00, 76.72it/s] 


Training Loss:  0.4837106967016685 Training Accuracy:  82.75238095238095 Test Loss:  0.5471534822730063 Testing Accuracy:  80.90555555555555
Epoch:  4


100%|██████████| 656/656 [00:07<00:00, 86.98it/s]


Training Loss:  0.4582351950823694 Training Accuracy:  83.68333333333334 Test Loss:  0.5289161141509517 Testing Accuracy:  81.91111111111111
Epoch:  5


100%|██████████| 656/656 [00:06<00:00, 102.88it/s]


Training Loss:  0.4329221476401359 Training Accuracy:  84.71666666666667 Test Loss:  0.5129249200786677 Testing Accuracy:  82.88333333333334
Epoch:  6


100%|██████████| 656/656 [00:08<00:00, 74.50it/s] 


Training Loss:  0.4200940113498983 Training Accuracy:  85.18571428571428 Test Loss:  0.5040328502107869 Testing Accuracy:  83.32222222222222
Epoch:  7


100%|██████████| 656/656 [00:07<00:00, 93.22it/s]


Training Loss:  0.40247026117962126 Training Accuracy:  85.79761904761905 Test Loss:  0.4954127109673877 Testing Accuracy:  83.59444444444445
Epoch:  8


100%|██████████| 656/656 [00:06<00:00, 102.47it/s]


Training Loss:  0.39402346940868527 Training Accuracy:  86.28333333333333 Test Loss:  0.49509944654635246 Testing Accuracy:  83.68333333333334
Epoch:  9


100%|██████████| 656/656 [00:08<00:00, 74.77it/s] 


Training Loss:  0.3895548504387319 Training Accuracy:  86.43571428571428 Test Loss:  0.49588413612332627 Testing Accuracy:  84.04444444444445
Epoch:  10


100%|██████████| 656/656 [00:06<00:00, 96.92it/s]


Training Loss:  0.37733802718576687 Training Accuracy:  86.89285714285714 Test Loss:  0.4900825268802693 Testing Accuracy:  84.2611111111111


0,1
tr_accuracy,▁▇▇████████
tr_loss,█▁▁▁▁▁▁▁▁▁▁
val_accuracy,▁▇█████████
val_loss,█▁▁▁▁▁▁▁▁▁▁

0,1
tr_accuracy,86.89286
tr_loss,0.37734
val_accuracy,84.26111
val_loss,0.49008


[34m[1mwandb[0m: Agent Starting Run: ejlccq0r with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: rmsprop
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  rmsprop
Learning rate (initial):  0.0001
Batch size:  64
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 784) Bias vector dimention (64, 1)
----------------
Layer:  2  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  3  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  4  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 64) Bias vector dimention (10, 1)
----------------
656.0

 Start of training
Training Loss:  2.4525799931530874 Training Accuracy:  14.295238095238096 Test Loss:  2.4486113415409347 Testing Accuracy:  14.15
Epoch:  1


100%|██████████| 656/656 [00:04<00:00, 156.16it/s]


Training Loss:  0.618971980971181 Training Accuracy:  79.80714285714286 Test Loss:  0.6249278328968608 Testing Accuracy:  79.37777777777778
Epoch:  2


100%|██████████| 656/656 [00:03<00:00, 190.42it/s]


Training Loss:  0.49444407425012193 Training Accuracy:  82.81190476190476 Test Loss:  0.4999450030396544 Testing Accuracy:  82.52777777777777
Epoch:  3


100%|██████████| 656/656 [00:03<00:00, 194.05it/s]


Training Loss:  0.4505234572036532 Training Accuracy:  84.10238095238095 Test Loss:  0.4565785274473924 Testing Accuracy:  83.66666666666667
Epoch:  4


100%|██████████| 656/656 [00:05<00:00, 130.51it/s]


Training Loss:  0.42624158738646223 Training Accuracy:  84.94285714285714 Test Loss:  0.4335803894628163 Testing Accuracy:  84.35
Epoch:  5


100%|██████████| 656/656 [00:03<00:00, 180.27it/s]


Training Loss:  0.40926420909024197 Training Accuracy:  85.41666666666667 Test Loss:  0.41813201103503544 Testing Accuracy:  84.9
Epoch:  6


100%|██████████| 656/656 [00:03<00:00, 195.49it/s]


Training Loss:  0.39603657920928326 Training Accuracy:  85.8952380952381 Test Loss:  0.406524929753046 Testing Accuracy:  85.33888888888889
Epoch:  7


100%|██████████| 656/656 [00:05<00:00, 122.97it/s]


Training Loss:  0.3851636947266711 Training Accuracy:  86.2 Test Loss:  0.39731922296326794 Testing Accuracy:  85.71666666666667
Epoch:  8


100%|██████████| 656/656 [00:03<00:00, 194.99it/s]


Training Loss:  0.37593938769558305 Training Accuracy:  86.5547619047619 Test Loss:  0.3897936135497003 Testing Accuracy:  85.97777777777777
Epoch:  9


100%|██████████| 656/656 [00:03<00:00, 184.51it/s]


Training Loss:  0.367941060216552 Training Accuracy:  86.87619047619047 Test Loss:  0.38351354600281967 Testing Accuracy:  86.17222222222222
Epoch:  10


100%|██████████| 656/656 [00:04<00:00, 151.77it/s]


Training Loss:  0.3608952110631559 Training Accuracy:  87.08571428571429 Test Loss:  0.3781924834095848 Testing Accuracy:  86.38333333333334


0,1
tr_accuracy,▁▇█████████
tr_loss,█▂▁▁▁▁▁▁▁▁▁
val_accuracy,▁▇█████████
val_loss,█▂▁▁▁▁▁▁▁▁▁

0,1
tr_accuracy,87.08571
tr_loss,0.3609
val_accuracy,86.38333
val_loss,0.37819


[34m[1mwandb[0m: Agent Starting Run: gv6rxt2l with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: random


Hyper parameters: 

Weight initialization type :  random
Optimizer :  nadam
Learning rate (initial):  0.0001
Batch size:  64
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  64  ; activation function:  sigmoid
Weight matrix dimention (64, 784) Bias vector dimention (64, 1)
----------------
Layer:  2  ; number of neurons:  64  ; activation function:  sigmoid
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  3  ; number of neurons:  64  ; activation function:  sigmoid
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  4  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 64) Bias vector dimention (10, 1)
----------------
656.0

 Start of training
Training Loss:  6.909578441191924 Training Accuracy:  10.861904761904762 Test Loss:  6.93724963331244 Testing Accuracy:  11.177777777777777
Epoch:  1


100%|██████████| 656/656 [00:05<00:00, 115.62it/s]


Training Loss:  1.6860010855312704 Training Accuracy:  47.06428571428572 Test Loss:  1.694345142652787 Testing Accuracy:  46.48888888888889
Epoch:  2


100%|██████████| 656/656 [00:02<00:00, 224.51it/s]


Training Loss:  1.1453094009492533 Training Accuracy:  62.695238095238096 Test Loss:  1.1495731339102522 Testing Accuracy:  62.43888888888889
Epoch:  3


100%|██████████| 656/656 [00:03<00:00, 214.14it/s]


Training Loss:  0.9429773685814861 Training Accuracy:  68.13809523809523 Test Loss:  0.9474135368726836 Testing Accuracy:  67.85555555555555
Epoch:  4


100%|██████████| 656/656 [00:04<00:00, 148.52it/s]


Training Loss:  0.8434505325392027 Training Accuracy:  70.74047619047619 Test Loss:  0.848746617722746 Testing Accuracy:  70.6
Epoch:  5


100%|██████████| 656/656 [00:03<00:00, 194.33it/s]


Training Loss:  0.7836425829661808 Training Accuracy:  72.56904761904762 Test Loss:  0.7900326219292665 Testing Accuracy:  72.12222222222222


0,1
tr_accuracy,▁▅▇▇██
tr_loss,█▂▁▁▁▁
val_accuracy,▁▅▇███
val_loss,█▂▁▁▁▁

0,1
tr_accuracy,72.56905
tr_loss,0.78364
val_accuracy,72.12222
val_loss,0.79003


[34m[1mwandb[0m: Agent Starting Run: w9ej4z1c with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: random


Hyper parameters: 

Weight initialization type :  random
Optimizer :  nadam
Learning rate (initial):  0.0001
Batch size:  64
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 784) Bias vector dimention (64, 1)
----------------
Layer:  2  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  3  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  4  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  5  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 64) Bias vector dimention (10, 1)
----------------
656.0

 Start of training
Training Loss:  11.017639719919691 Training Accuracy:  11.5142857142

100%|██████████| 656/656 [00:06<00:00, 100.08it/s]


Training Loss:  7.692948956627791 Training Accuracy:  17.426190476190477 Test Loss:  7.913574752554765 Testing Accuracy:  16.583333333333332
Epoch:  2


100%|██████████| 656/656 [00:04<00:00, 134.15it/s]


Training Loss:  6.0706093595240365 Training Accuracy:  23.652380952380952 Test Loss:  6.2752093701483185 Testing Accuracy:  22.694444444444443
Epoch:  3


100%|██████████| 656/656 [00:06<00:00, 102.41it/s]


Training Loss:  5.000265952482501 Training Accuracy:  29.547619047619047 Test Loss:  5.206688150112857 Testing Accuracy:  28.538888888888888
Epoch:  4


100%|██████████| 656/656 [00:04<00:00, 132.66it/s]


Training Loss:  4.2428067778889975 Training Accuracy:  34.207142857142856 Test Loss:  4.43517161062822 Testing Accuracy:  33.233333333333334
Epoch:  5


100%|██████████| 656/656 [00:06<00:00, 99.15it/s] 


Training Loss:  3.7061975551617077 Training Accuracy:  37.75238095238095 Test Loss:  3.8973882637584896 Testing Accuracy:  36.44444444444444


0,1
tr_accuracy,▁▃▄▆▇█
tr_loss,█▅▃▂▂▁
val_accuracy,▁▂▄▆▇█
val_loss,█▅▃▂▂▁

0,1
tr_accuracy,37.75238
tr_loss,3.7062
val_accuracy,36.44444
val_loss,3.89739


[34m[1mwandb[0m: Agent Starting Run: 3brjpqcu with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  nadam
Learning rate (initial):  0.001
Batch size:  16
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 784) Bias vector dimention (64, 1)
----------------
Layer:  2  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  3  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  4  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  5  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 64) Bias vector dimention (10, 1)
----------------
2625.0

 Start of training
Training Loss:  2.3059358241032664 Training Accuracy:  11.3904761904

100%|██████████| 2625/2625 [00:11<00:00, 227.23it/s]


Training Loss:  0.4425433121391524 Training Accuracy:  84.31190476190476 Test Loss:  0.4587839326047231 Testing Accuracy:  83.80555555555556
Epoch:  2


100%|██████████| 2625/2625 [00:11<00:00, 223.15it/s]


Training Loss:  0.41188461139451127 Training Accuracy:  85.59285714285714 Test Loss:  0.4405655615539453 Testing Accuracy:  84.86111111111111
Epoch:  3


100%|██████████| 2625/2625 [00:12<00:00, 211.15it/s]


Training Loss:  0.39907556089716306 Training Accuracy:  86.15 Test Loss:  0.43508444656569634 Testing Accuracy:  85.36666666666666
Epoch:  4


100%|██████████| 2625/2625 [00:12<00:00, 204.29it/s]


Training Loss:  0.3993327956230427 Training Accuracy:  86.39047619047619 Test Loss:  0.44358166930883763 Testing Accuracy:  85.72777777777777
Epoch:  5


100%|██████████| 2625/2625 [00:13<00:00, 201.80it/s]


Training Loss:  0.3705974569888309 Training Accuracy:  87.4047619047619 Test Loss:  0.42439865941508215 Testing Accuracy:  86.31666666666666
Epoch:  6


100%|██████████| 2625/2625 [00:13<00:00, 197.97it/s]


Training Loss:  0.36215941339371155 Training Accuracy:  87.88095238095238 Test Loss:  0.4257859971294269 Testing Accuracy:  86.43888888888888
Epoch:  7


100%|██████████| 2625/2625 [00:13<00:00, 197.07it/s]


Training Loss:  0.3457104827252604 Training Accuracy:  88.31190476190476 Test Loss:  0.41150028875049155 Testing Accuracy:  86.7
Epoch:  8


100%|██████████| 2625/2625 [00:13<00:00, 201.81it/s]


Training Loss:  0.340762934893722 Training Accuracy:  88.61190476190477 Test Loss:  0.4112879458223116 Testing Accuracy:  86.93333333333334
Epoch:  9


100%|██████████| 2625/2625 [00:12<00:00, 207.05it/s]


Training Loss:  0.32734723640426494 Training Accuracy:  88.95952380952382 Test Loss:  0.40753943967740697 Testing Accuracy:  87.24444444444444
Epoch:  10


100%|██████████| 2625/2625 [00:13<00:00, 198.14it/s]


Training Loss:  0.3374855626121123 Training Accuracy:  88.5952380952381 Test Loss:  0.42114553398776533 Testing Accuracy:  87.0111111111111


0,1
tr_accuracy,▁██████████
tr_loss,█▁▁▁▁▁▁▁▁▁▁
val_accuracy,▁██████████
val_loss,█▁▁▁▁▁▁▁▁▁▁

0,1
tr_accuracy,88.59524
tr_loss,0.33749
val_accuracy,87.01111
val_loss,0.42115


[34m[1mwandb[0m: Agent Starting Run: 4qyb5gvn with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: random


Hyper parameters: 

Weight initialization type :  random
Optimizer :  sgd
Learning rate (initial):  0.001
Batch size:  16
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  128  ; activation function:  tanh
Weight matrix dimention (128, 784) Bias vector dimention (128, 1)
----------------
Layer:  2  ; number of neurons:  128  ; activation function:  tanh
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  3  ; number of neurons:  128  ; activation function:  tanh
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  4  ; number of neurons:  128  ; activation function:  tanh
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  5  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 128) Bias vector dimention (10, 1)
----------------
2625.0

 Start of training
Training Loss:  15.377686030403595 Training Accuracy: 

100%|██████████| 2625/2625 [00:14<00:00, 178.10it/s]


Training Loss:  5.9969424663327136 Training Accuracy:  37.10476190476191 Test Loss:  6.0190687299677945 Testing Accuracy:  37.016666666666666
Epoch:  2


100%|██████████| 2625/2625 [00:14<00:00, 185.02it/s]


Training Loss:  4.362061575542057 Training Accuracy:  44.75238095238095 Test Loss:  4.5254795920530935 Testing Accuracy:  43.855555555555554
Epoch:  3


100%|██████████| 2625/2625 [00:14<00:00, 182.33it/s]


Training Loss:  3.527139023260702 Training Accuracy:  48.635714285714286 Test Loss:  3.674919811615769 Testing Accuracy:  47.88333333333333
Epoch:  4


100%|██████████| 2625/2625 [00:14<00:00, 181.79it/s]


Training Loss:  2.9480394797896574 Training Accuracy:  49.91904761904762 Test Loss:  3.1115590520071876 Testing Accuracy:  48.87222222222222
Epoch:  5


100%|██████████| 2625/2625 [00:14<00:00, 186.57it/s]


Training Loss:  2.552084491265766 Training Accuracy:  51.91428571428571 Test Loss:  2.672402560656989 Testing Accuracy:  50.516666666666666


0,1
tr_accuracy,▁▅▇▇██
tr_loss,█▃▂▂▁▁
val_accuracy,▁▆▇███
val_loss,█▃▂▂▁▁

0,1
tr_accuracy,51.91429
tr_loss,2.55208
val_accuracy,50.51667
val_loss,2.6724


[34m[1mwandb[0m: Agent Starting Run: 41dy1ali with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  sgd
Learning rate (initial):  0.0001
Batch size:  32
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  32  ; activation function:  ReLU
Weight matrix dimention (32, 784) Bias vector dimention (32, 1)
----------------
Layer:  2  ; number of neurons:  32  ; activation function:  ReLU
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  3  ; number of neurons:  32  ; activation function:  ReLU
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  4  ; number of neurons:  32  ; activation function:  ReLU
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  5  ; number of neurons:  32  ; activation function:  ReLU
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  6  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 3

100%|██████████| 1312/1312 [00:01<00:00, 784.28it/s]


Training Loss:  2.311418949253533 Training Accuracy:  10.033333333333333 Test Loss:  2.3119393276687688 Testing Accuracy:  10.227777777777778
Epoch:  2


100%|██████████| 1312/1312 [00:02<00:00, 535.32it/s]


Training Loss:  2.2934214784031792 Training Accuracy:  10.209523809523809 Test Loss:  2.2940383417806562 Testing Accuracy:  10.272222222222222
Epoch:  3


100%|██████████| 1312/1312 [00:03<00:00, 361.50it/s]


Training Loss:  2.2772956109093534 Training Accuracy:  10.626190476190477 Test Loss:  2.2779153125182945 Testing Accuracy:  10.61111111111111
Epoch:  4


100%|██████████| 1312/1312 [00:01<00:00, 780.51it/s]


Training Loss:  2.261284438323963 Training Accuracy:  11.08095238095238 Test Loss:  2.261870331803698 Testing Accuracy:  11.105555555555556
Epoch:  5


100%|██████████| 1312/1312 [00:01<00:00, 768.06it/s]


Training Loss:  2.244942067231868 Training Accuracy:  11.542857142857143 Test Loss:  2.2453906638101446 Testing Accuracy:  11.61111111111111
Epoch:  6


100%|██████████| 1312/1312 [00:01<00:00, 756.19it/s]


Training Loss:  2.2284335220871285 Training Accuracy:  11.91904761904762 Test Loss:  2.228733588856112 Testing Accuracy:  12.055555555555555
Epoch:  7


100%|██████████| 1312/1312 [00:01<00:00, 767.49it/s]


Training Loss:  2.211198160508571 Training Accuracy:  12.335714285714285 Test Loss:  2.211307333597625 Testing Accuracy:  12.505555555555556
Epoch:  8


100%|██████████| 1312/1312 [00:02<00:00, 559.21it/s]


Training Loss:  2.192451617696058 Training Accuracy:  13.952380952380953 Test Loss:  2.1923786792696904 Testing Accuracy:  14.011111111111111
Epoch:  9


100%|██████████| 1312/1312 [00:03<00:00, 405.49it/s]


Training Loss:  2.171754861410731 Training Accuracy:  17.185714285714287 Test Loss:  2.171522331165176 Testing Accuracy:  17.483333333333334
Epoch:  10


100%|██████████| 1312/1312 [00:01<00:00, 777.92it/s]


Training Loss:  2.148924804085666 Training Accuracy:  20.95 Test Loss:  2.1485103177904525 Testing Accuracy:  21.105555555555554


0,1
tr_accuracy,▁▁▁▁▂▂▂▃▄▆█
tr_loss,█▇▆▆▅▅▄▃▃▂▁
val_accuracy,▁▁▁▁▂▂▂▃▄▆█
val_loss,█▇▆▆▅▅▄▃▃▂▁

0,1
tr_accuracy,20.95
tr_loss,2.14892
val_accuracy,21.10556
val_loss,2.14851


[34m[1mwandb[0m: Agent Starting Run: 97vapir1 with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  nadam
Learning rate (initial):  0.001
Batch size:  32
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  32  ; activation function:  tanh
Weight matrix dimention (32, 784) Bias vector dimention (32, 1)
----------------
Layer:  2  ; number of neurons:  32  ; activation function:  tanh
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  3  ; number of neurons:  32  ; activation function:  tanh
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  4  ; number of neurons:  32  ; activation function:  tanh
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  5  ; number of neurons:  32  ; activation function:  tanh
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  6  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 

100%|██████████| 1312/1312 [00:06<00:00, 207.49it/s]


Training Loss:  0.4738616385046094 Training Accuracy:  83.48095238095237 Test Loss:  0.48386591974869186 Testing Accuracy:  83.16666666666667
Epoch:  2


100%|██████████| 1312/1312 [00:04<00:00, 295.90it/s]


Training Loss:  0.41834996516303885 Training Accuracy:  85.34285714285714 Test Loss:  0.4346727379799429 Testing Accuracy:  85.12777777777778
Epoch:  3


100%|██████████| 1312/1312 [00:07<00:00, 186.77it/s]


Training Loss:  0.39765751511688907 Training Accuracy:  86.27619047619048 Test Loss:  0.42295780568591224 Testing Accuracy:  85.76666666666667
Epoch:  4


100%|██████████| 1312/1312 [00:04<00:00, 301.48it/s]


Training Loss:  0.3772426615968127 Training Accuracy:  86.94761904761904 Test Loss:  0.41256234556929317 Testing Accuracy:  86.21111111111111
Epoch:  5


100%|██████████| 1312/1312 [00:05<00:00, 235.43it/s]


Training Loss:  0.36831834091504156 Training Accuracy:  87.36428571428571 Test Loss:  0.4124560377343174 Testing Accuracy:  86.36111111111111
Epoch:  6


100%|██████████| 1312/1312 [00:04<00:00, 271.75it/s]


Training Loss:  0.3520560486221611 Training Accuracy:  87.90714285714286 Test Loss:  0.40396745964982833 Testing Accuracy:  86.62222222222222
Epoch:  7


100%|██████████| 1312/1312 [00:04<00:00, 297.95it/s]


Training Loss:  0.3353110943053031 Training Accuracy:  88.45714285714286 Test Loss:  0.39144875856928124 Testing Accuracy:  87.07222222222222
Epoch:  8


100%|██████████| 1312/1312 [00:06<00:00, 201.97it/s]


Training Loss:  0.32731448439291677 Training Accuracy:  88.65 Test Loss:  0.39062321352271645 Testing Accuracy:  87.2388888888889
Epoch:  9


100%|██████████| 1312/1312 [00:04<00:00, 300.02it/s]


Training Loss:  0.32665069374794303 Training Accuracy:  88.5952380952381 Test Loss:  0.3963563128737718 Testing Accuracy:  86.82777777777778
Epoch:  10


100%|██████████| 1312/1312 [00:06<00:00, 194.23it/s]


Training Loss:  0.3195903744478541 Training Accuracy:  88.81666666666666 Test Loss:  0.39622405470470246 Testing Accuracy:  86.84444444444445


0,1
tr_accuracy,▁██████████
tr_loss,█▂▁▁▁▁▁▁▁▁▁
val_accuracy,▁██████████
val_loss,█▁▁▁▁▁▁▁▁▁▁

0,1
tr_accuracy,88.81667
tr_loss,0.31959
val_accuracy,86.84444
val_loss,0.39622


[34m[1mwandb[0m: Agent Starting Run: 4qtffujt with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: random


Hyper parameters: 

Weight initialization type :  random
Optimizer :  nadam
Learning rate (initial):  0.001
Batch size:  64
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  128  ; activation function:  tanh
Weight matrix dimention (128, 784) Bias vector dimention (128, 1)
----------------
Layer:  2  ; number of neurons:  128  ; activation function:  tanh
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  3  ; number of neurons:  128  ; activation function:  tanh
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  4  ; number of neurons:  128  ; activation function:  tanh
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  5  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 128) Bias vector dimention (10, 1)
----------------
656.0

 Start of training
Training Loss:  16.466310000630052 Training Accuracy:

100%|██████████| 656/656 [00:09<00:00, 65.67it/s]


Training Loss:  4.14487348454895 Training Accuracy:  45.72380952380952 Test Loss:  4.1539686886458025 Testing Accuracy:  45.672222222222224
Epoch:  2


100%|██████████| 656/656 [00:10<00:00, 64.66it/s]


Training Loss:  2.3242148380439596 Training Accuracy:  52.25476190476191 Test Loss:  2.4316024871166237 Testing Accuracy:  51.105555555555554
Epoch:  3


100%|██████████| 656/656 [00:10<00:00, 62.34it/s]


Training Loss:  1.5102725615281896 Training Accuracy:  57.5452380952381 Test Loss:  1.5612219911708343 Testing Accuracy:  56.40555555555556
Epoch:  4


100%|██████████| 656/656 [00:10<00:00, 62.26it/s]


Training Loss:  1.1763718873949298 Training Accuracy:  61.82857142857143 Test Loss:  1.2484650813316787 Testing Accuracy:  60.17777777777778
Epoch:  5


100%|██████████| 656/656 [00:10<00:00, 59.74it/s]


Training Loss:  1.0228270672401107 Training Accuracy:  65.50238095238095 Test Loss:  1.05841899633911 Testing Accuracy:  64.72222222222223
Epoch:  6


100%|██████████| 656/656 [00:11<00:00, 56.04it/s]


Training Loss:  0.9436347041650931 Training Accuracy:  67.34761904761905 Test Loss:  0.9872372211585326 Testing Accuracy:  66.3
Epoch:  7


100%|██████████| 656/656 [00:11<00:00, 54.83it/s]


Training Loss:  0.878408005117327 Training Accuracy:  69.74047619047619 Test Loss:  0.9241818609107274 Testing Accuracy:  68.4888888888889
Epoch:  8


100%|██████████| 656/656 [00:12<00:00, 54.51it/s]


Training Loss:  0.8297056168351673 Training Accuracy:  71.77380952380952 Test Loss:  0.861944879576071 Testing Accuracy:  71.29444444444445
Epoch:  9


100%|██████████| 656/656 [00:11<00:00, 55.51it/s]


Training Loss:  0.7848120090434727 Training Accuracy:  72.47857142857143 Test Loss:  0.8159015529277134 Testing Accuracy:  71.63888888888889
Epoch:  10


100%|██████████| 656/656 [00:12<00:00, 54.46it/s]


Training Loss:  0.7840686482512256 Training Accuracy:  72.84285714285714 Test Loss:  0.8230067473782453 Testing Accuracy:  71.77777777777777


0,1
tr_accuracy,▁▅▆▆▇▇▇████
tr_loss,█▃▂▁▁▁▁▁▁▁▁
val_accuracy,▁▅▆▆▇▇▇████
val_loss,█▃▂▁▁▁▁▁▁▁▁

0,1
tr_accuracy,72.84286
tr_loss,0.78407
val_accuracy,71.77778
val_loss,0.82301


[34m[1mwandb[0m: Agent Starting Run: 2sz95g7h with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  nadam
Learning rate (initial):  0.001
Batch size:  64
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 784) Bias vector dimention (64, 1)
----------------
Layer:  2  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  3  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  4  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  5  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 64) Bias vector dimention (10, 1)
----------------
656.0

 Start of training
Training Loss:  2.431689105825553 Training Accuracy:  9.35 Test Loss:

100%|██████████| 656/656 [00:04<00:00, 135.40it/s]


Training Loss:  0.4277199249390672 Training Accuracy:  84.67857142857143 Test Loss:  0.4376748335082063 Testing Accuracy:  84.34444444444445
Epoch:  2


100%|██████████| 656/656 [00:07<00:00, 88.49it/s] 


Training Loss:  0.38123726686343146 Training Accuracy:  86.31666666666666 Test Loss:  0.4039845547231814 Testing Accuracy:  85.53888888888889
Epoch:  3


100%|██████████| 656/656 [00:04<00:00, 131.37it/s]


Training Loss:  0.35372021839469514 Training Accuracy:  87.26190476190476 Test Loss:  0.3871305418313282 Testing Accuracy:  86.2388888888889
Epoch:  4


100%|██████████| 656/656 [00:07<00:00, 92.50it/s]


Training Loss:  0.3392281852902315 Training Accuracy:  87.78571428571429 Test Loss:  0.3809651129412062 Testing Accuracy:  86.45
Epoch:  5


100%|██████████| 656/656 [00:04<00:00, 132.51it/s]


Training Loss:  0.3246171490226923 Training Accuracy:  88.28809523809524 Test Loss:  0.3745133853522593 Testing Accuracy:  86.75


0,1
tr_accuracy,▁█████
tr_loss,█▁▁▁▁▁
val_accuracy,▁█████
val_loss,█▁▁▁▁▁

0,1
tr_accuracy,88.2881
tr_loss,0.32462
val_accuracy,86.75
val_loss,0.37451


[34m[1mwandb[0m: Agent Starting Run: vzttxs4a with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  nadam
Learning rate (initial):  0.0001
Batch size:  16
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  32  ; activation function:  tanh
Weight matrix dimention (32, 784) Bias vector dimention (32, 1)
----------------
Layer:  2  ; number of neurons:  32  ; activation function:  tanh
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  3  ; number of neurons:  32  ; activation function:  tanh
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  4  ; number of neurons:  32  ; activation function:  tanh
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  5  ; number of neurons:  32  ; activation function:  tanh
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  6  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10,

100%|██████████| 2625/2625 [00:09<00:00, 283.04it/s]


Training Loss:  0.5868406739784402 Training Accuracy:  79.68809523809524 Test Loss:  0.5931916014980242 Testing Accuracy:  79.42222222222222
Epoch:  2


100%|██████████| 2625/2625 [00:07<00:00, 343.80it/s]


Training Loss:  0.49605169168690755 Training Accuracy:  82.98333333333333 Test Loss:  0.505144442510358 Testing Accuracy:  82.50555555555556
Epoch:  3


100%|██████████| 2625/2625 [00:10<00:00, 249.88it/s]


Training Loss:  0.4520418677607957 Training Accuracy:  84.18333333333334 Test Loss:  0.4651390889685931 Testing Accuracy:  83.88333333333334
Epoch:  4


100%|██████████| 2625/2625 [00:09<00:00, 286.07it/s]


Training Loss:  0.4246922811369501 Training Accuracy:  85.15714285714286 Test Loss:  0.44102653620331844 Testing Accuracy:  84.67777777777778
Epoch:  5


100%|██████████| 2625/2625 [00:07<00:00, 351.68it/s]


Training Loss:  0.4076075776275522 Training Accuracy:  85.81904761904762 Test Loss:  0.42644926953092405 Testing Accuracy:  85.18333333333334
Epoch:  6


100%|██████████| 2625/2625 [00:10<00:00, 248.61it/s]


Training Loss:  0.39616046149260503 Training Accuracy:  86.18333333333334 Test Loss:  0.41706009518580694 Testing Accuracy:  85.54444444444445
Epoch:  7


100%|██████████| 2625/2625 [00:09<00:00, 264.46it/s]


Training Loss:  0.3877817347000688 Training Accuracy:  86.59285714285714 Test Loss:  0.4104394231628889 Testing Accuracy:  85.8
Epoch:  8


100%|██████████| 2625/2625 [00:07<00:00, 352.55it/s]


Training Loss:  0.3811590615849294 Training Accuracy:  86.77857142857142 Test Loss:  0.4053238087720384 Testing Accuracy:  86.0111111111111
Epoch:  9


100%|██████████| 2625/2625 [00:10<00:00, 258.08it/s]


Training Loss:  0.3756160827884436 Training Accuracy:  87.05 Test Loss:  0.4011564931188237 Testing Accuracy:  86.15
Epoch:  10


100%|██████████| 2625/2625 [00:09<00:00, 279.74it/s]


Training Loss:  0.37082102799422634 Training Accuracy:  87.19047619047619 Test Loss:  0.39769546009007606 Testing Accuracy:  86.36111111111111


0,1
tr_accuracy,▁▇█████████
tr_loss,█▂▁▁▁▁▁▁▁▁▁
val_accuracy,▁▇█████████
val_loss,█▂▁▁▁▁▁▁▁▁▁

0,1
tr_accuracy,87.19048
tr_loss,0.37082
val_accuracy,86.36111
val_loss,0.3977


[34m[1mwandb[0m: Agent Starting Run: 51wm591d with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  adam
Learning rate (initial):  0.001
Batch size:  16
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  128  ; activation function:  sigmoid
Weight matrix dimention (128, 784) Bias vector dimention (128, 1)
----------------
Layer:  2  ; number of neurons:  128  ; activation function:  sigmoid
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  3  ; number of neurons:  128  ; activation function:  sigmoid
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  4  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 128) Bias vector dimention (10, 1)
----------------
2625.0

 Start of training
Training Loss:  2.4887233228573358 Training Accuracy:  10.073809523809524 Test Loss:  2.4878606610263274 Testing Accuracy:  9.827777777777778
Epoch:  1


100%|██████████| 2625/2625 [00:19<00:00, 133.49it/s]


Training Loss:  2.3137498746389635 Training Accuracy:  9.916666666666666 Test Loss:  2.313551827656126 Testing Accuracy:  10.194444444444445
Epoch:  2


100%|██████████| 2625/2625 [00:19<00:00, 136.36it/s]


Training Loss:  2.3090069775189996 Training Accuracy:  9.973809523809523 Test Loss:  2.3090817314582286 Testing Accuracy:  10.061111111111112
Epoch:  3


100%|██████████| 2625/2625 [00:18<00:00, 142.14it/s]


Training Loss:  2.3047594931745246 Training Accuracy:  9.973809523809523 Test Loss:  2.3048827371017104 Testing Accuracy:  10.061111111111112
Epoch:  4


100%|██████████| 2625/2625 [00:18<00:00, 141.45it/s]


Training Loss:  2.303424551108258 Training Accuracy:  9.973809523809523 Test Loss:  2.3034920616529204 Testing Accuracy:  10.061111111111112
Epoch:  5


100%|██████████| 2625/2625 [00:19<00:00, 132.27it/s]


Training Loss:  2.302975531516867 Training Accuracy:  9.973809523809523 Test Loss:  2.3030299623744517 Testing Accuracy:  10.061111111111112
Epoch:  6


100%|██████████| 2625/2625 [00:18<00:00, 142.80it/s]


Training Loss:  2.302797213270239 Training Accuracy:  9.973809523809523 Test Loss:  2.3028514094246435 Testing Accuracy:  10.061111111111112
Epoch:  7


100%|██████████| 2625/2625 [00:18<00:00, 143.14it/s]


Training Loss:  2.3027192175277182 Training Accuracy:  9.973809523809523 Test Loss:  2.302770529806373 Testing Accuracy:  10.061111111111112
Epoch:  8


100%|██████████| 2625/2625 [00:19<00:00, 135.07it/s]


Training Loss:  2.302683539861708 Training Accuracy:  9.973809523809523 Test Loss:  2.302729586482403 Testing Accuracy:  10.061111111111112
Epoch:  9


100%|██████████| 2625/2625 [00:17<00:00, 147.44it/s]


Training Loss:  2.3026653634548255 Training Accuracy:  9.916666666666666 Test Loss:  2.302706265489029 Testing Accuracy:  10.194444444444445
Epoch:  10


100%|██████████| 2625/2625 [00:18<00:00, 145.03it/s]


Training Loss:  2.3026552164817065 Training Accuracy:  9.916666666666666 Test Loss:  2.3026915646827906 Testing Accuracy:  10.194444444444445


0,1
tr_accuracy,█▁▄▄▄▄▄▄▄▁▁
tr_loss,█▁▁▁▁▁▁▁▁▁▁
val_accuracy,▁█▅▅▅▅▅▅▅██
val_loss,█▁▁▁▁▁▁▁▁▁▁

0,1
tr_accuracy,9.91667
tr_loss,2.30266
val_accuracy,10.19444
val_loss,2.30269


[34m[1mwandb[0m: Agent Starting Run: nki02p9a with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: momentum
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: random


Hyper parameters: 

Weight initialization type :  random
Optimizer :  momentum
Learning rate (initial):  0.001
Batch size:  16
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  128  ; activation function:  sigmoid
Weight matrix dimention (128, 784) Bias vector dimention (128, 1)
----------------
Layer:  2  ; number of neurons:  128  ; activation function:  sigmoid
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  3  ; number of neurons:  128  ; activation function:  sigmoid
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  4  ; number of neurons:  128  ; activation function:  sigmoid
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  5  ; number of neurons:  128  ; activation function:  sigmoid
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  6  ; number of neurons:  10  ; activation function:  

100%|██████████| 2625/2625 [00:16<00:00, 162.25it/s]


Training Loss:  1.4907537988918858 Training Accuracy:  48.964285714285715 Test Loss:  1.4963823422677849 Testing Accuracy:  48.84444444444444
Epoch:  2


100%|██████████| 2625/2625 [00:15<00:00, 167.93it/s]


Training Loss:  1.2030655292159511 Training Accuracy:  58.29047619047619 Test Loss:  1.2156559595199676 Testing Accuracy:  57.80555555555556
Epoch:  3


100%|██████████| 2625/2625 [00:16<00:00, 161.28it/s]


Training Loss:  1.0649503624934518 Training Accuracy:  62.56428571428572 Test Loss:  1.0810249570662258 Testing Accuracy:  62.27777777777778
Epoch:  4


100%|██████████| 2625/2625 [00:15<00:00, 165.96it/s]


Training Loss:  0.9800855337575068 Training Accuracy:  65.37619047619047 Test Loss:  0.9976975725060375 Testing Accuracy:  64.96666666666667
Epoch:  5


100%|██████████| 2625/2625 [00:16<00:00, 160.84it/s]


Training Loss:  0.9224343383535916 Training Accuracy:  67.40714285714286 Test Loss:  0.9407082030582979 Testing Accuracy:  66.78333333333333


0,1
tr_accuracy,▁▆▇▇██
tr_loss,█▁▁▁▁▁
val_accuracy,▁▆▇▇██
val_loss,█▁▁▁▁▁

0,1
tr_accuracy,67.40714
tr_loss,0.92243
val_accuracy,66.78333
val_loss,0.94071


[34m[1mwandb[0m: Agent Starting Run: 9xr1swwy with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: random


Hyper parameters: 

Weight initialization type :  random
Optimizer :  nadam
Learning rate (initial):  0.001
Batch size:  32
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  32  ; activation function:  ReLU
Weight matrix dimention (32, 784) Bias vector dimention (32, 1)
----------------
Layer:  2  ; number of neurons:  32  ; activation function:  ReLU
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  3  ; number of neurons:  32  ; activation function:  ReLU
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  4  ; number of neurons:  32  ; activation function:  ReLU
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  5  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 32) Bias vector dimention (10, 1)
----------------
1312.0

 Start of training
Training Loss:  nan Training Accuracy:  10.076190476190476 Test Loss

100%|██████████| 1312/1312 [00:05<00:00, 253.21it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  2


100%|██████████| 1312/1312 [00:03<00:00, 387.35it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  3


100%|██████████| 1312/1312 [00:03<00:00, 436.90it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  4


100%|██████████| 1312/1312 [00:03<00:00, 436.19it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  5


100%|██████████| 1312/1312 [00:05<00:00, 236.81it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778


0,1
tr_accuracy,█▁▁▁▁▁
val_accuracy,▁█████

0,1
tr_accuracy,10.07381
tr_loss,
val_accuracy,9.82778
val_loss,


[34m[1mwandb[0m: Agent Starting Run: jhjy8xpw with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: random


Hyper parameters: 

Weight initialization type :  random
Optimizer :  adam
Learning rate (initial):  0.001
Batch size:  64
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  128  ; activation function:  ReLU
Weight matrix dimention (128, 784) Bias vector dimention (128, 1)
----------------
Layer:  2  ; number of neurons:  128  ; activation function:  ReLU
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  3  ; number of neurons:  128  ; activation function:  ReLU
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  4  ; number of neurons:  128  ; activation function:  ReLU
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  5  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 128) Bias vector dimention (10, 1)
----------------
656.0

 Start of training
Training Loss:  nan Training Accuracy:  8.892857142857

100%|██████████| 656/656 [00:08<00:00, 78.47it/s] 


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  2


100%|██████████| 656/656 [00:05<00:00, 112.68it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  3


100%|██████████| 656/656 [00:08<00:00, 80.18it/s] 


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  4


100%|██████████| 656/656 [00:06<00:00, 96.18it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  5


100%|██████████| 656/656 [00:06<00:00, 95.13it/s] 


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  6


100%|██████████| 656/656 [00:07<00:00, 87.58it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  7


100%|██████████| 656/656 [00:05<00:00, 112.12it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  8


100%|██████████| 656/656 [00:07<00:00, 85.28it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  9


100%|██████████| 656/656 [00:05<00:00, 113.13it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  10


100%|██████████| 656/656 [00:08<00:00, 77.11it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778


0,1
tr_accuracy,▁██████████
val_accuracy,▁██████████

0,1
tr_accuracy,10.07381
tr_loss,
val_accuracy,9.82778
val_loss,


[34m[1mwandb[0m: Agent Starting Run: 4pda5tqj with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: momentum
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  momentum
Learning rate (initial):  0.0001
Batch size:  32
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  128  ; activation function:  sigmoid
Weight matrix dimention (128, 784) Bias vector dimention (128, 1)
----------------
Layer:  2  ; number of neurons:  128  ; activation function:  sigmoid
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  3  ; number of neurons:  128  ; activation function:  sigmoid
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  4  ; number of neurons:  128  ; activation function:  sigmoid
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  5  ; number of neurons:  128  ; activation function:  sigmoid
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  6  ; number of neurons:  10  ; activation function: 

100%|██████████| 1312/1312 [00:11<00:00, 115.05it/s]


Training Loss:  2.3099038558768576 Training Accuracy:  10.040476190476191 Test Loss:  2.309188293655703 Testing Accuracy:  9.905555555555555
Epoch:  2


100%|██████████| 1312/1312 [00:11<00:00, 115.82it/s]


Training Loss:  2.3036297942044737 Training Accuracy:  10.040476190476191 Test Loss:  2.3034136359692154 Testing Accuracy:  9.905555555555555
Epoch:  3


100%|██████████| 1312/1312 [00:11<00:00, 115.35it/s]


Training Loss:  2.3027252080936567 Training Accuracy:  10.040476190476191 Test Loss:  2.302701787675926 Testing Accuracy:  9.905555555555555
Epoch:  4


100%|██████████| 1312/1312 [00:09<00:00, 133.70it/s]


Training Loss:  2.302590921846519 Training Accuracy:  10.040476190476191 Test Loss:  2.302639276522392 Testing Accuracy:  9.905555555555555
Epoch:  5


100%|██████████| 1312/1312 [00:08<00:00, 149.16it/s]


Training Loss:  2.3025683084104434 Training Accuracy:  10.040476190476191 Test Loss:  2.3026430398912368 Testing Accuracy:  9.905555555555555
Epoch:  6


100%|██████████| 1312/1312 [00:10<00:00, 126.63it/s]


Training Loss:  2.30256320418779 Training Accuracy:  10.040476190476191 Test Loss:  2.3026475709582552 Testing Accuracy:  9.905555555555555
Epoch:  7


100%|██████████| 1312/1312 [00:11<00:00, 115.90it/s]


Training Loss:  2.3025613836355574 Training Accuracy:  10.040476190476191 Test Loss:  2.302649244214487 Testing Accuracy:  9.905555555555555
Epoch:  8


100%|██████████| 1312/1312 [00:11<00:00, 116.72it/s]


Training Loss:  2.302560373934787 Training Accuracy:  10.045238095238096 Test Loss:  2.3026494775283024 Testing Accuracy:  9.916666666666666
Epoch:  9


100%|██████████| 1312/1312 [00:10<00:00, 121.67it/s]


Training Loss:  2.302559635449081 Training Accuracy:  10.066666666666666 Test Loss:  2.302649157062065 Testing Accuracy:  9.933333333333334
Epoch:  10


100%|██████████| 1312/1312 [00:08<00:00, 152.76it/s]


Training Loss:  2.3025590180691444 Training Accuracy:  10.07857142857143 Test Loss:  2.3026486554889085 Testing Accuracy:  9.944444444444445


0,1
tr_accuracy,▁▁▁▁▁▁▁▁▂▆█
tr_loss,█▂▁▁▁▁▁▁▁▁▁
val_accuracy,▁▁▁▁▁▁▁▁▃▆█
val_loss,█▂▁▁▁▁▁▁▁▁▁

0,1
tr_accuracy,10.07857
tr_loss,2.30256
val_accuracy,9.94444
val_loss,2.30265


[34m[1mwandb[0m: Agent Starting Run: vu99x2pa with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  adam
Learning rate (initial):  0.001
Batch size:  32
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  128  ; activation function:  tanh
Weight matrix dimention (128, 784) Bias vector dimention (128, 1)
----------------
Layer:  2  ; number of neurons:  128  ; activation function:  tanh
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  3  ; number of neurons:  128  ; activation function:  tanh
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  4  ; number of neurons:  128  ; activation function:  tanh
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  5  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 128) Bias vector dimention (10, 1)
----------------
1312.0

 Start of training
Training Loss:  2.4313414638809534 Training Accuracy:

100%|██████████| 1312/1312 [00:16<00:00, 79.19it/s]


Training Loss:  0.5915565728142715 Training Accuracy:  79.52857142857142 Test Loss:  0.5963449409287688 Testing Accuracy:  79.31111111111112
Epoch:  2


100%|██████████| 1312/1312 [00:16<00:00, 81.26it/s]


Training Loss:  0.5679378824874467 Training Accuracy:  80.90238095238095 Test Loss:  0.5724019904382872 Testing Accuracy:  80.86111111111111
Epoch:  3


100%|██████████| 1312/1312 [00:16<00:00, 80.31it/s]


Training Loss:  0.567060529673595 Training Accuracy:  80.91190476190476 Test Loss:  0.5713076501372899 Testing Accuracy:  81.03888888888889
Epoch:  4


100%|██████████| 1312/1312 [00:15<00:00, 82.80it/s] 


Training Loss:  0.5682380640599295 Training Accuracy:  80.79285714285714 Test Loss:  0.5719546080234531 Testing Accuracy:  80.94444444444444
Epoch:  5


100%|██████████| 1312/1312 [00:16<00:00, 81.07it/s]


Training Loss:  0.5647023116950626 Training Accuracy:  80.91190476190476 Test Loss:  0.5684005423926648 Testing Accuracy:  81.13888888888889
Epoch:  6


100%|██████████| 1312/1312 [00:16<00:00, 81.50it/s]


Training Loss:  0.5661258749956305 Training Accuracy:  80.89047619047619 Test Loss:  0.5696512073357863 Testing Accuracy:  80.93333333333334
Epoch:  7


100%|██████████| 1312/1312 [00:16<00:00, 78.71it/s]


Training Loss:  0.56708626306839 Training Accuracy:  80.79523809523809 Test Loss:  0.5705268222206057 Testing Accuracy:  80.86111111111111
Epoch:  8


100%|██████████| 1312/1312 [00:16<00:00, 80.75it/s]


Training Loss:  0.5667073297604811 Training Accuracy:  80.68571428571428 Test Loss:  0.5699401091731561 Testing Accuracy:  80.73333333333333
Epoch:  9


100%|██████████| 1312/1312 [00:17<00:00, 76.59it/s]


Training Loss:  0.5643820006283168 Training Accuracy:  80.8047619047619 Test Loss:  0.5678046825892481 Testing Accuracy:  80.82222222222222
Epoch:  10


100%|██████████| 1312/1312 [00:16<00:00, 79.52it/s]


Training Loss:  0.55998446127563 Training Accuracy:  80.95714285714286 Test Loss:  0.5636020207643749 Testing Accuracy:  80.93888888888888


0,1
tr_accuracy,▁██████████
tr_loss,█▁▁▁▁▁▁▁▁▁▁
val_accuracy,▁██████████
val_loss,█▁▁▁▁▁▁▁▁▁▁

0,1
tr_accuracy,80.95714
tr_loss,0.55998
val_accuracy,80.93889
val_loss,0.5636


[34m[1mwandb[0m: Agent Starting Run: 727fmnju with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  sgd
Learning rate (initial):  0.0001
Batch size:  32
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  32  ; activation function:  ReLU
Weight matrix dimention (32, 784) Bias vector dimention (32, 1)
----------------
Layer:  2  ; number of neurons:  32  ; activation function:  ReLU
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  3  ; number of neurons:  32  ; activation function:  ReLU
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  4  ; number of neurons:  32  ; activation function:  ReLU
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  5  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 32) Bias vector dimention (10, 1)
----------------
1312.0

 Start of training
Training Loss:  2.3279800331756193 Training Accuracy:  8.885714285714

100%|██████████| 1312/1312 [00:01<00:00, 839.14it/s]


Training Loss:  2.3008812834696206 Training Accuracy:  10.083333333333334 Test Loss:  2.29998172378838 Testing Accuracy:  10.122222222222222
Epoch:  2


100%|██████████| 1312/1312 [00:01<00:00, 768.35it/s]


Training Loss:  2.2819741486963134 Training Accuracy:  12.066666666666666 Test Loss:  2.2810127333866452 Testing Accuracy:  12.38888888888889
Epoch:  3


100%|██████████| 1312/1312 [00:01<00:00, 772.69it/s]


Training Loss:  2.263256171951175 Training Accuracy:  14.49047619047619 Test Loss:  2.26236650948802 Testing Accuracy:  14.855555555555556
Epoch:  4


100%|██████████| 1312/1312 [00:01<00:00, 749.81it/s]


Training Loss:  2.243083389667436 Training Accuracy:  15.89047619047619 Test Loss:  2.2422065570955656 Testing Accuracy:  16.288888888888888
Epoch:  5


100%|██████████| 1312/1312 [00:03<00:00, 408.67it/s]


Training Loss:  2.2214355943410955 Training Accuracy:  16.68095238095238 Test Loss:  2.22050831765462 Testing Accuracy:  17.094444444444445


0,1
tr_accuracy,▁▂▄▆▇█
tr_loss,█▆▅▄▂▁
val_accuracy,▁▂▄▆▇█
val_loss,█▆▅▄▂▁

0,1
tr_accuracy,16.68095
tr_loss,2.22144
val_accuracy,17.09444
val_loss,2.22051


[34m[1mwandb[0m: Agent Starting Run: qgh72px0 with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: momentum
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: random


Hyper parameters: 

Weight initialization type :  random
Optimizer :  momentum
Learning rate (initial):  0.0001
Batch size:  32
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 784) Bias vector dimention (64, 1)
----------------
Layer:  2  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  3  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  4  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 64) Bias vector dimention (10, 1)
----------------
1312.0

 Start of training
Training Loss:  nan Training Accuracy:  11.911904761904761 Test Loss:  nan Testing Accuracy:  12.077777777777778
Epoch:  1


100%|██████████| 1312/1312 [00:03<00:00, 425.50it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  2


100%|██████████| 1312/1312 [00:04<00:00, 291.65it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  3


100%|██████████| 1312/1312 [00:02<00:00, 508.03it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  4


100%|██████████| 1312/1312 [00:02<00:00, 491.08it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  5


100%|██████████| 1312/1312 [00:02<00:00, 439.06it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778


0,1
tr_accuracy,█▁▁▁▁▁
val_accuracy,█▁▁▁▁▁

0,1
tr_accuracy,10.07381
tr_loss,
val_accuracy,9.82778
val_loss,


[34m[1mwandb[0m: Agent Starting Run: y4agyph1 with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: rmsprop
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: random


Hyper parameters: 

Weight initialization type :  random
Optimizer :  rmsprop
Learning rate (initial):  0.001
Batch size:  16
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  128  ; activation function:  tanh
Weight matrix dimention (128, 784) Bias vector dimention (128, 1)
----------------
Layer:  2  ; number of neurons:  128  ; activation function:  tanh
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  3  ; number of neurons:  128  ; activation function:  tanh
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  4  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 128) Bias vector dimention (10, 1)
----------------
2625.0

 Start of training
Training Loss:  17.506048835907308 Training Accuracy:  11.338095238095239 Test Loss:  17.580037555719247 Testing Accuracy:  10.966666666666667
Epoch:  1


100%|██████████| 2625/2625 [00:17<00:00, 153.39it/s]


Training Loss:  1.5843018028209304 Training Accuracy:  60.40714285714286 Test Loss:  1.6178950456664727 Testing Accuracy:  59.327777777777776
Epoch:  2


100%|██████████| 2625/2625 [00:18<00:00, 145.40it/s]


Training Loss:  1.0783080625008732 Training Accuracy:  69.61428571428571 Test Loss:  1.1208745764484318 Testing Accuracy:  68.56111111111112
Epoch:  3


100%|██████████| 2625/2625 [00:17<00:00, 153.71it/s]


Training Loss:  0.9717880492527705 Training Accuracy:  71.99761904761905 Test Loss:  1.010696041414412 Testing Accuracy:  71.16111111111111
Epoch:  4


100%|██████████| 2625/2625 [00:17<00:00, 154.25it/s]


Training Loss:  0.9217099066516449 Training Accuracy:  73.71428571428571 Test Loss:  0.981610234237485 Testing Accuracy:  72.85
Epoch:  5


100%|██████████| 2625/2625 [00:17<00:00, 150.57it/s]


Training Loss:  0.8307759921505371 Training Accuracy:  76.2547619047619 Test Loss:  0.8984987502600447 Testing Accuracy:  74.68333333333334


0,1
tr_accuracy,▁▆▇███
tr_loss,█▁▁▁▁▁
val_accuracy,▁▆▇███
val_loss,█▁▁▁▁▁

0,1
tr_accuracy,76.25476
tr_loss,0.83078
val_accuracy,74.68333
val_loss,0.8985


[34m[1mwandb[0m: Agent Starting Run: vuoz8b5o with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  sgd
Learning rate (initial):  0.001
Batch size:  16
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  32  ; activation function:  sigmoid
Weight matrix dimention (32, 784) Bias vector dimention (32, 1)
----------------
Layer:  2  ; number of neurons:  32  ; activation function:  sigmoid
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  3  ; number of neurons:  32  ; activation function:  sigmoid
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  4  ; number of neurons:  32  ; activation function:  sigmoid
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  5  ; number of neurons:  32  ; activation function:  sigmoid
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  6  ; number of neurons:  10  ; activation function:  softmax
Weight matrix di

100%|██████████| 2625/2625 [00:03<00:00, 715.63it/s]


Training Loss:  2.306212919190527 Training Accuracy:  10.040476190476191 Test Loss:  2.3072532054739967 Testing Accuracy:  9.905555555555555
Epoch:  2


100%|██████████| 2625/2625 [00:04<00:00, 546.25it/s]


Training Loss:  2.302460501291313 Training Accuracy:  10.040476190476191 Test Loss:  2.3025806058789255 Testing Accuracy:  9.905555555555555
Epoch:  3


100%|██████████| 2625/2625 [00:05<00:00, 494.83it/s]


Training Loss:  2.302456091877452 Training Accuracy:  10.040476190476191 Test Loss:  2.3025052424295573 Testing Accuracy:  9.905555555555555
Epoch:  4


100%|██████████| 2625/2625 [00:03<00:00, 703.21it/s]


Training Loss:  2.3024527775272676 Training Accuracy:  10.040476190476191 Test Loss:  2.3024968993734753 Testing Accuracy:  9.905555555555555
Epoch:  5


100%|██████████| 2625/2625 [00:04<00:00, 543.16it/s]


Training Loss:  2.302447739462719 Training Accuracy:  10.040476190476191 Test Loss:  2.3024915265337595 Testing Accuracy:  9.905555555555555
Epoch:  6


100%|██████████| 2625/2625 [00:04<00:00, 547.86it/s]


Training Loss:  2.3024425374909328 Training Accuracy:  10.040476190476191 Test Loss:  2.302486325149604 Testing Accuracy:  9.905555555555555
Epoch:  7


100%|██████████| 2625/2625 [00:03<00:00, 699.54it/s]


Training Loss:  2.3024372932164052 Training Accuracy:  10.040476190476191 Test Loss:  2.302481105915679 Testing Accuracy:  9.905555555555555
Epoch:  8


100%|██████████| 2625/2625 [00:05<00:00, 520.93it/s]


Training Loss:  2.302432014302546 Training Accuracy:  10.040476190476191 Test Loss:  2.302475854166643 Testing Accuracy:  9.905555555555555
Epoch:  9


100%|██████████| 2625/2625 [00:04<00:00, 566.67it/s]


Training Loss:  2.302426700006941 Training Accuracy:  10.040476190476191 Test Loss:  2.302470567554767 Testing Accuracy:  9.905555555555555
Epoch:  10


100%|██████████| 2625/2625 [00:03<00:00, 712.69it/s]


Training Loss:  2.3024213489452743 Training Accuracy:  10.040476190476191 Test Loss:  2.302465244587108 Testing Accuracy:  9.905555555555555


0,1
tr_accuracy,▁▁▁▁▁▁▁▁▁▁▁
tr_loss,█▁▁▁▁▁▁▁▁▁▁
val_accuracy,▁▁▁▁▁▁▁▁▁▁▁
val_loss,█▁▁▁▁▁▁▁▁▁▁

0,1
tr_accuracy,10.04048
tr_loss,2.30242
val_accuracy,9.90556
val_loss,2.30247


[34m[1mwandb[0m: Agent Starting Run: yj23nyxm with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: nag
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: random


Hyper parameters: 

Weight initialization type :  random
Optimizer :  nag
Learning rate (initial):  0.0001
Batch size:  64
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  128  ; activation function:  sigmoid
Weight matrix dimention (128, 784) Bias vector dimention (128, 1)
----------------
Layer:  2  ; number of neurons:  128  ; activation function:  sigmoid
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  3  ; number of neurons:  128  ; activation function:  sigmoid
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  4  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 128) Bias vector dimention (10, 1)
----------------
656.0

 Start of training
Training Loss:  14.635845817077465 Training Accuracy:  10.185714285714285 Test Loss:  14.707033169963657 Testing Accuracy:  9.911111111111111
Epoch:  1


100%|██████████| 656/656 [00:04<00:00, 137.58it/s]


Training Loss:  0.8056689791594384 Training Accuracy:  73.17142857142858 Test Loss:  0.822275234912437 Testing Accuracy:  72.55555555555556
Epoch:  2


100%|██████████| 656/656 [00:06<00:00, 101.00it/s]


Training Loss:  0.5816863799584018 Training Accuracy:  80.40714285714286 Test Loss:  0.6090700152957853 Testing Accuracy:  79.95555555555555
Epoch:  3


100%|██████████| 656/656 [00:04<00:00, 135.34it/s]


Training Loss:  0.4883033854818265 Training Accuracy:  82.81666666666666 Test Loss:  0.5284837271974278 Testing Accuracy:  81.96111111111111
Epoch:  4


100%|██████████| 656/656 [00:06<00:00, 95.67it/s] 


Training Loss:  0.4376449742607314 Training Accuracy:  84.32142857142857 Test Loss:  0.48483363729153534 Testing Accuracy:  83.24444444444444
Epoch:  5


100%|██████████| 656/656 [00:04<00:00, 135.05it/s]


Training Loss:  0.39833034848279825 Training Accuracy:  85.40238095238095 Test Loss:  0.4524775871268432 Testing Accuracy:  84.13333333333334
Epoch:  6


100%|██████████| 656/656 [00:07<00:00, 87.87it/s] 


Training Loss:  0.3823895350436111 Training Accuracy:  86.00714285714285 Test Loss:  0.4455347113836978 Testing Accuracy:  84.47222222222223
Epoch:  7


100%|██████████| 656/656 [00:04<00:00, 133.72it/s]


Training Loss:  0.36659298303320714 Training Accuracy:  86.64761904761905 Test Loss:  0.438677429889828 Testing Accuracy:  84.82222222222222
Epoch:  8


100%|██████████| 656/656 [00:07<00:00, 83.51it/s] 


Training Loss:  0.35786739209419094 Training Accuracy:  87.02857142857142 Test Loss:  0.4373872499379682 Testing Accuracy:  85.07222222222222
Epoch:  9


100%|██████████| 656/656 [00:04<00:00, 141.61it/s]


Training Loss:  0.35248947776596634 Training Accuracy:  87.06666666666666 Test Loss:  0.43786829936412064 Testing Accuracy:  84.98333333333333
Epoch:  10


100%|██████████| 656/656 [00:07<00:00, 88.48it/s] 


Training Loss:  0.34419691340002195 Training Accuracy:  87.39285714285714 Test Loss:  0.435420869227129 Testing Accuracy:  85.11666666666666


0,1
tr_accuracy,▁▇▇████████
tr_loss,█▁▁▁▁▁▁▁▁▁▁
val_accuracy,▁▇█████████
val_loss,█▁▁▁▁▁▁▁▁▁▁

0,1
tr_accuracy,87.39286
tr_loss,0.3442
val_accuracy,85.11667
val_loss,0.43542


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: lczuaoad with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: rmsprop
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  rmsprop
Learning rate (initial):  0.001
Batch size:  64
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 784) Bias vector dimention (64, 1)
----------------
Layer:  2  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  3  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  4  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 64) Bias vector dimention (10, 1)
----------------
656.0

 Start of training
Training Loss:  2.3973647360928365 Training Accuracy:  1.4714285714285715 Test Loss:  2.3970557296050155 Testing Accuracy:  1.4333333333333333
Epoch:  1


100%|██████████| 656/656 [00:02<00:00, 289.61it/s]


Training Loss:  0.4360711483180769 Training Accuracy:  84.5547619047619 Test Loss:  0.44276612554420863 Testing Accuracy:  84.81111111111112
Epoch:  2


100%|██████████| 656/656 [00:02<00:00, 299.33it/s]


Training Loss:  0.3878886406162212 Training Accuracy:  86.08095238095238 Test Loss:  0.40851471545301954 Testing Accuracy:  85.71111111111111
Epoch:  3


100%|██████████| 656/656 [00:03<00:00, 174.37it/s]


Training Loss:  0.3631629676977186 Training Accuracy:  86.99761904761905 Test Loss:  0.39702770386103475 Testing Accuracy:  86.4888888888889
Epoch:  4


100%|██████████| 656/656 [00:02<00:00, 288.13it/s]


Training Loss:  0.3419515671920881 Training Accuracy:  87.72619047619048 Test Loss:  0.3849494260446944 Testing Accuracy:  87.06666666666666
Epoch:  5


100%|██████████| 656/656 [00:02<00:00, 299.67it/s]


Training Loss:  0.358996761963908 Training Accuracy:  87.27619047619048 Test Loss:  0.4165089357793078 Testing Accuracy:  86.21666666666667


0,1
tr_accuracy,▁█████
tr_loss,█▁▁▁▁▁
val_accuracy,▁█████
val_loss,█▁▁▁▁▁

0,1
tr_accuracy,87.27619
tr_loss,0.359
val_accuracy,86.21667
val_loss,0.41651


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: vfd4co8n with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: random


Hyper parameters: 

Weight initialization type :  random
Optimizer :  adam
Learning rate (initial):  0.001
Batch size:  32
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  64  ; activation function:  sigmoid
Weight matrix dimention (64, 784) Bias vector dimention (64, 1)
----------------
Layer:  2  ; number of neurons:  64  ; activation function:  sigmoid
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  3  ; number of neurons:  64  ; activation function:  sigmoid
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  4  ; number of neurons:  64  ; activation function:  sigmoid
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  5  ; number of neurons:  64  ; activation function:  sigmoid
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  6  ; number of neurons:  10  ; activation function:  softmax
Weight matrix d

100%|██████████| 1312/1312 [00:09<00:00, 143.37it/s]


Training Loss:  0.5886273418260154 Training Accuracy:  78.52142857142857 Test Loss:  0.6136154965240781 Testing Accuracy:  77.86111111111111
Epoch:  2


100%|██████████| 1312/1312 [00:06<00:00, 193.89it/s]


Training Loss:  0.5152855787236632 Training Accuracy:  81.60952380952381 Test Loss:  0.5447157605209094 Testing Accuracy:  80.7388888888889
Epoch:  3


100%|██████████| 1312/1312 [00:08<00:00, 150.05it/s]


Training Loss:  0.47452500155375327 Training Accuracy:  83.03095238095239 Test Loss:  0.5102525910665773 Testing Accuracy:  82.0111111111111
Epoch:  4


100%|██████████| 1312/1312 [00:07<00:00, 178.01it/s]


Training Loss:  0.44124114367954836 Training Accuracy:  84.20238095238095 Test Loss:  0.482436806529041 Testing Accuracy:  82.95
Epoch:  5


100%|██████████| 1312/1312 [00:07<00:00, 173.66it/s]


Training Loss:  0.422759630059503 Training Accuracy:  84.8452380952381 Test Loss:  0.4697753040259788 Testing Accuracy:  83.80555555555556


0,1
tr_accuracy,▁▇████
tr_loss,█▁▁▁▁▁
val_accuracy,▁▇████
val_loss,█▁▁▁▁▁

0,1
tr_accuracy,84.84524
tr_loss,0.42276
val_accuracy,83.80556
val_loss,0.46978


[34m[1mwandb[0m: Agent Starting Run: 37yydb3i with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: random


Hyper parameters: 

Weight initialization type :  random
Optimizer :  sgd
Learning rate (initial):  0.0001
Batch size:  32
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  32  ; activation function:  sigmoid
Weight matrix dimention (32, 784) Bias vector dimention (32, 1)
----------------
Layer:  2  ; number of neurons:  32  ; activation function:  sigmoid
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  3  ; number of neurons:  32  ; activation function:  sigmoid
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  4  ; number of neurons:  32  ; activation function:  sigmoid
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  5  ; number of neurons:  32  ; activation function:  sigmoid
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  6  ; number of neurons:  10  ; activation function:  softmax
Weight matrix d

100%|██████████| 1312/1312 [00:04<00:00, 309.44it/s]


Training Loss:  3.778315844589689 Training Accuracy:  9.383333333333333 Test Loss:  3.7835759253856476 Testing Accuracy:  9.427777777777777
Epoch:  2


100%|██████████| 1312/1312 [00:02<00:00, 526.66it/s]


Training Loss:  3.265057368692911 Training Accuracy:  8.526190476190477 Test Loss:  3.271947358913797 Testing Accuracy:  8.722222222222221
Epoch:  3


100%|██████████| 1312/1312 [00:02<00:00, 522.79it/s]


Training Loss:  2.9347452061427717 Training Accuracy:  7.4523809523809526 Test Loss:  2.942763244306774 Testing Accuracy:  7.616666666666666
Epoch:  4


100%|██████████| 1312/1312 [00:02<00:00, 528.19it/s]


Training Loss:  2.7370530781636835 Training Accuracy:  6.066666666666666 Test Loss:  2.7456395335819694 Testing Accuracy:  6.011111111111111
Epoch:  5


100%|██████████| 1312/1312 [00:03<00:00, 328.56it/s]


Training Loss:  2.615520454751153 Training Accuracy:  6.011904761904762 Test Loss:  2.6242148959339384 Testing Accuracy:  5.944444444444445
Epoch:  6


100%|██████████| 1312/1312 [00:02<00:00, 460.91it/s]


Training Loss:  2.535198059716942 Training Accuracy:  6.416666666666667 Test Loss:  2.543755088497395 Testing Accuracy:  6.416666666666667
Epoch:  7


100%|██████████| 1312/1312 [00:02<00:00, 552.27it/s]


Training Loss:  2.4789099703955357 Training Accuracy:  6.826190476190476 Test Loss:  2.4872333038428978 Testing Accuracy:  6.783333333333333
Epoch:  8


100%|██████████| 1312/1312 [00:02<00:00, 554.77it/s]


Training Loss:  2.4378438124923814 Training Accuracy:  7.211904761904762 Test Loss:  2.4459114573230125 Testing Accuracy:  7.277777777777778
Epoch:  9


100%|██████████| 1312/1312 [00:04<00:00, 308.58it/s]


Training Loss:  2.4069289543844508 Training Accuracy:  7.514285714285714 Test Loss:  2.4147499744981675 Testing Accuracy:  7.661111111111111
Epoch:  10


100%|██████████| 1312/1312 [00:03<00:00, 399.31it/s]


Training Loss:  2.382992719516344 Training Accuracy:  7.871428571428571 Test Loss:  2.390586414755985 Testing Accuracy:  8.03888888888889


0,1
tr_accuracy,██▆▄▁▁▂▃▃▄▅
tr_loss,█▆▄▃▂▂▁▁▁▁▁
val_accuracy,██▆▄▁▁▂▃▄▄▅
val_loss,█▆▄▃▂▂▂▁▁▁▁

0,1
tr_accuracy,7.87143
tr_loss,2.38299
val_accuracy,8.03889
val_loss,2.39059


[34m[1mwandb[0m: Agent Starting Run: guvexpvg with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: momentum
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: random


Hyper parameters: 

Weight initialization type :  random
Optimizer :  momentum
Learning rate (initial):  0.0001
Batch size:  32
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  32  ; activation function:  ReLU
Weight matrix dimention (32, 784) Bias vector dimention (32, 1)
----------------
Layer:  2  ; number of neurons:  32  ; activation function:  ReLU
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  3  ; number of neurons:  32  ; activation function:  ReLU
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  4  ; number of neurons:  32  ; activation function:  ReLU
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  5  ; number of neurons:  32  ; activation function:  ReLU
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  6  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (

100%|██████████| 1312/1312 [00:03<00:00, 358.54it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  2


100%|██████████| 1312/1312 [00:04<00:00, 317.20it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  3


100%|██████████| 1312/1312 [00:02<00:00, 639.73it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  4


100%|██████████| 1312/1312 [00:02<00:00, 631.63it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  5


100%|██████████| 1312/1312 [00:02<00:00, 632.81it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778


0,1
tr_accuracy,█▁▁▁▁▁
val_accuracy,█▁▁▁▁▁

0,1
tr_accuracy,10.07381
tr_loss,
val_accuracy,9.82778
val_loss,


[34m[1mwandb[0m: Agent Starting Run: mj5na8s2 with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: momentum
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  momentum
Learning rate (initial):  0.0001
Batch size:  64
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  32  ; activation function:  ReLU
Weight matrix dimention (32, 784) Bias vector dimention (32, 1)
----------------
Layer:  2  ; number of neurons:  32  ; activation function:  ReLU
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  3  ; number of neurons:  32  ; activation function:  ReLU
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  4  ; number of neurons:  32  ; activation function:  ReLU
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  5  ; number of neurons:  32  ; activation function:  ReLU
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  6  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (

100%|██████████| 656/656 [00:01<00:00, 472.67it/s]


Training Loss:  2.287838590886942 Training Accuracy:  16.642857142857142 Test Loss:  2.2876193575998744 Testing Accuracy:  16.75
Epoch:  2


100%|██████████| 656/656 [00:01<00:00, 457.64it/s]


Training Loss:  2.269795349880711 Training Accuracy:  17.51904761904762 Test Loss:  2.2694841737558775 Testing Accuracy:  17.63888888888889
Epoch:  3


100%|██████████| 656/656 [00:01<00:00, 333.30it/s]


Training Loss:  2.254136562748236 Training Accuracy:  17.830952380952382 Test Loss:  2.253720242085707 Testing Accuracy:  17.994444444444444
Epoch:  4


100%|██████████| 656/656 [00:02<00:00, 271.90it/s]


Training Loss:  2.238108324663292 Training Accuracy:  18.076190476190476 Test Loss:  2.237526629638524 Testing Accuracy:  18.11111111111111
Epoch:  5


100%|██████████| 656/656 [00:02<00:00, 320.72it/s]


Training Loss:  2.2200977870548306 Training Accuracy:  18.485714285714284 Test Loss:  2.219350750881715 Testing Accuracy:  18.42222222222222
Epoch:  6


100%|██████████| 656/656 [00:01<00:00, 358.71it/s]


Training Loss:  2.199675910363701 Training Accuracy:  19.414285714285715 Test Loss:  2.1987559154226806 Testing Accuracy:  19.122222222222224
Epoch:  7


100%|██████████| 656/656 [00:01<00:00, 454.94it/s]


Training Loss:  2.1764701596733866 Training Accuracy:  21.507142857142856 Test Loss:  2.1754458715544405 Testing Accuracy:  21.45
Epoch:  8


100%|██████████| 656/656 [00:01<00:00, 452.63it/s]


Training Loss:  2.1507343041267486 Training Accuracy:  23.542857142857144 Test Loss:  2.1496439863225216 Testing Accuracy:  23.483333333333334
Epoch:  9


100%|██████████| 656/656 [00:01<00:00, 464.59it/s]


Training Loss:  2.1233179837968286 Training Accuracy:  24.76904761904762 Test Loss:  2.1221224462399957 Testing Accuracy:  24.727777777777778
Epoch:  10


100%|██████████| 656/656 [00:02<00:00, 299.81it/s]


Training Loss:  2.094947831550576 Training Accuracy:  25.63809523809524 Test Loss:  2.0936805787461847 Testing Accuracy:  25.488888888888887


0,1
tr_accuracy,▁▃▃▃▄▄▄▆▇▇█
tr_loss,█▇▇▆▆▅▄▄▃▂▁
val_accuracy,▁▃▃▃▄▄▄▆▇██
val_loss,█▇▇▆▆▅▄▄▃▂▁

0,1
tr_accuracy,25.6381
tr_loss,2.09495
val_accuracy,25.48889
val_loss,2.09368


[34m[1mwandb[0m: Agent Starting Run: imrercv0 with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: rmsprop
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  rmsprop
Learning rate (initial):  0.001
Batch size:  64
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  128  ; activation function:  ReLU
Weight matrix dimention (128, 784) Bias vector dimention (128, 1)
----------------
Layer:  2  ; number of neurons:  128  ; activation function:  ReLU
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  3  ; number of neurons:  128  ; activation function:  ReLU
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  4  ; number of neurons:  128  ; activation function:  ReLU
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  5  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 128) Bias vector dimention (10, 1)
----------------
656.0

 Start of training
Training Loss:  2.311822208661992 Training Accuracy

100%|██████████| 656/656 [00:07<00:00, 90.52it/s]


Training Loss:  0.4317281191439566 Training Accuracy:  84.81904761904762 Test Loss:  0.44822796753381083 Testing Accuracy:  84.47222222222223
Epoch:  2


100%|██████████| 656/656 [00:13<00:00, 48.14it/s]


Training Loss:  0.39114765344703484 Training Accuracy:  85.94761904761904 Test Loss:  0.4174447406620292 Testing Accuracy:  85.47777777777777
Epoch:  3


100%|██████████| 656/656 [00:13<00:00, 48.59it/s]


Training Loss:  0.44120375006949536 Training Accuracy:  85.70714285714286 Test Loss:  0.4904846219499154 Testing Accuracy:  84.93888888888888
Epoch:  4


100%|██████████| 656/656 [00:13<00:00, 50.44it/s]


Training Loss:  0.4682234620859696 Training Accuracy:  85.01190476190476 Test Loss:  0.5198730857699847 Testing Accuracy:  84.30555555555556
Epoch:  5


100%|██████████| 656/656 [00:13<00:00, 48.12it/s]


Training Loss:  0.47842106239288684 Training Accuracy:  84.53095238095239 Test Loss:  0.5284353803775198 Testing Accuracy:  84.2388888888889


0,1
tr_accuracy,▁█████
tr_loss,█▁▁▁▁▁
val_accuracy,▁█████
val_loss,█▁▁▁▁▁

0,1
tr_accuracy,84.53095
tr_loss,0.47842
val_accuracy,84.23889
val_loss,0.52844


[34m[1mwandb[0m: Agent Starting Run: 10y8ugxf with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: rmsprop
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  rmsprop
Learning rate (initial):  0.001
Batch size:  32
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  128  ; activation function:  tanh
Weight matrix dimention (128, 784) Bias vector dimention (128, 1)
----------------
Layer:  2  ; number of neurons:  128  ; activation function:  tanh
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  3  ; number of neurons:  128  ; activation function:  tanh
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  4  ; number of neurons:  128  ; activation function:  tanh
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  5  ; number of neurons:  128  ; activation function:  tanh
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  6  ; number of neurons:  10  ; activation function:  softmax
Weight m

100%|██████████| 1312/1312 [00:17<00:00, 75.54it/s]


Training Loss:  0.5082266912519089 Training Accuracy:  81.71428571428571 Test Loss:  0.5165988184359834 Testing Accuracy:  81.38333333333334
Epoch:  2


100%|██████████| 1312/1312 [00:16<00:00, 77.80it/s]


Training Loss:  0.451534366827819 Training Accuracy:  83.99285714285715 Test Loss:  0.47269088236314766 Testing Accuracy:  83.52222222222223
Epoch:  3


100%|██████████| 1312/1312 [00:16<00:00, 78.05it/s]


Training Loss:  0.392053680758074 Training Accuracy:  85.92142857142858 Test Loss:  0.4262180296167831 Testing Accuracy:  85.11666666666666
Epoch:  4


100%|██████████| 1312/1312 [00:16<00:00, 78.40it/s]


Training Loss:  0.3639651454284852 Training Accuracy:  86.99761904761905 Test Loss:  0.414322567328866 Testing Accuracy:  85.84444444444445
Epoch:  5


100%|██████████| 1312/1312 [00:16<00:00, 77.39it/s]


Training Loss:  0.37307549940781287 Training Accuracy:  86.85238095238095 Test Loss:  0.4310183542549482 Testing Accuracy:  85.5


0,1
tr_accuracy,▁█████
tr_loss,█▁▁▁▁▁
val_accuracy,▁█████
val_loss,█▁▁▁▁▁

0,1
tr_accuracy,86.85238
tr_loss,0.37308
val_accuracy,85.5
val_loss,0.43102


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: f15jxasm with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  nadam
Learning rate (initial):  0.001
Batch size:  64
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  32  ; activation function:  sigmoid
Weight matrix dimention (32, 784) Bias vector dimention (32, 1)
----------------
Layer:  2  ; number of neurons:  32  ; activation function:  sigmoid
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  3  ; number of neurons:  32  ; activation function:  sigmoid
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  4  ; number of neurons:  32  ; activation function:  sigmoid
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  5  ; number of neurons:  32  ; activation function:  sigmoid
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  6  ; number of neurons:  10  ; activation function:  softmax
Weight matrix 

100%|██████████| 656/656 [00:02<00:00, 240.71it/s]


Training Loss:  1.4571250990561304 Training Accuracy:  36.35 Test Loss:  1.4557625634855467 Testing Accuracy:  36.955555555555556
Epoch:  2


100%|██████████| 656/656 [00:04<00:00, 156.52it/s]


Training Loss:  1.0298648275529196 Training Accuracy:  58.52619047619048 Test Loss:  1.0282906103497929 Testing Accuracy:  58.34444444444444
Epoch:  3


100%|██████████| 656/656 [00:02<00:00, 243.91it/s]


Training Loss:  0.866703576258806 Training Accuracy:  65.15714285714286 Test Loss:  0.8656222679236995 Testing Accuracy:  64.44444444444444
Epoch:  4


100%|██████████| 656/656 [00:02<00:00, 262.91it/s]


Training Loss:  0.805016092654209 Training Accuracy:  64.57619047619048 Test Loss:  0.8051627971058847 Testing Accuracy:  64.18888888888888
Epoch:  5


100%|██████████| 656/656 [00:02<00:00, 262.20it/s]


Training Loss:  0.7768658860638271 Training Accuracy:  66.55 Test Loss:  0.7804536860879419 Testing Accuracy:  65.87222222222222


0,1
tr_accuracy,▁▄▇███
tr_loss,█▄▂▁▁▁
val_accuracy,▁▄▇███
val_loss,█▄▂▁▁▁

0,1
tr_accuracy,66.55
tr_loss,0.77687
val_accuracy,65.87222
val_loss,0.78045


[34m[1mwandb[0m: Agent Starting Run: yt96vyyc with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: random


Hyper parameters: 

Weight initialization type :  random
Optimizer :  adam
Learning rate (initial):  0.0001
Batch size:  16
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  32  ; activation function:  ReLU
Weight matrix dimention (32, 784) Bias vector dimention (32, 1)
----------------
Layer:  2  ; number of neurons:  32  ; activation function:  ReLU
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  3  ; number of neurons:  32  ; activation function:  ReLU
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  4  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 32) Bias vector dimention (10, 1)
----------------
2625.0

 Start of training
Training Loss:  nan Training Accuracy:  6.469047619047619 Test Loss:  nan Testing Accuracy:  6.455555555555556
Epoch:  1


100%|██████████| 2625/2625 [00:04<00:00, 565.83it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  2


100%|██████████| 2625/2625 [00:07<00:00, 349.57it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  3


100%|██████████| 2625/2625 [00:05<00:00, 514.91it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  4


100%|██████████| 2625/2625 [00:06<00:00, 413.24it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  5


100%|██████████| 2625/2625 [00:05<00:00, 449.02it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  6


100%|██████████| 2625/2625 [00:04<00:00, 545.44it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  7


100%|██████████| 2625/2625 [00:07<00:00, 336.60it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  8


100%|██████████| 2625/2625 [00:04<00:00, 537.21it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  9


100%|██████████| 2625/2625 [00:06<00:00, 435.99it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  10


100%|██████████| 2625/2625 [00:05<00:00, 443.00it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778


0,1
tr_accuracy,▁██████████
val_accuracy,▁██████████

0,1
tr_accuracy,10.07381
tr_loss,
val_accuracy,9.82778
val_loss,


[34m[1mwandb[0m: Agent Starting Run: 06bpqtud with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  sgd
Learning rate (initial):  0.001
Batch size:  64
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  128  ; activation function:  sigmoid
Weight matrix dimention (128, 784) Bias vector dimention (128, 1)
----------------
Layer:  2  ; number of neurons:  128  ; activation function:  sigmoid
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  3  ; number of neurons:  128  ; activation function:  sigmoid
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  4  ; number of neurons:  128  ; activation function:  sigmoid
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  5  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 128) Bias vector dimention (10, 1)
----------------
656.0

 Start of training
Training Loss:  2.3393907023510767 Training

100%|██████████| 656/656 [00:05<00:00, 125.24it/s]


Training Loss:  2.3030228687326977 Training Accuracy:  8.721428571428572 Test Loss:  2.302932520963614 Testing Accuracy:  9.022222222222222
Epoch:  2


100%|██████████| 656/656 [00:07<00:00, 84.16it/s] 


Training Loss:  2.302742059115225 Training Accuracy:  7.545238095238095 Test Loss:  2.302769746854703 Testing Accuracy:  7.4944444444444445
Epoch:  3


100%|██████████| 656/656 [00:05<00:00, 126.58it/s]


Training Loss:  2.302684678080605 Training Accuracy:  9.333333333333334 Test Loss:  2.302722120567055 Testing Accuracy:  9.255555555555556
Epoch:  4


100%|██████████| 656/656 [00:07<00:00, 87.10it/s] 


Training Loss:  2.302628280267797 Training Accuracy:  9.478571428571428 Test Loss:  2.302666649818644 Testing Accuracy:  9.38888888888889
Epoch:  5


100%|██████████| 656/656 [00:05<00:00, 128.83it/s]


Training Loss:  2.3025719637971673 Training Accuracy:  9.55952380952381 Test Loss:  2.302610526599171 Testing Accuracy:  9.422222222222222


0,1
tr_accuracy,█▄▁▆▇▇
tr_loss,█▁▁▁▁▁
val_accuracy,█▅▁▆▆▆
val_loss,█▁▁▁▁▁

0,1
tr_accuracy,9.55952
tr_loss,2.30257
val_accuracy,9.42222
val_loss,2.30261


[34m[1mwandb[0m: Agent Starting Run: h2sqnph6 with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  sgd
Learning rate (initial):  0.0001
Batch size:  32
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  32  ; activation function:  ReLU
Weight matrix dimention (32, 784) Bias vector dimention (32, 1)
----------------
Layer:  2  ; number of neurons:  32  ; activation function:  ReLU
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  3  ; number of neurons:  32  ; activation function:  ReLU
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  4  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 32) Bias vector dimention (10, 1)
----------------
1312.0

 Start of training
Training Loss:  2.2977142096828027 Training Accuracy:  11.185714285714285 Test Loss:  2.297741793709569 Testing Accuracy:  11.5
Epoch:  1


100%|██████████| 1312/1312 [00:01<00:00, 800.09it/s]


Training Loss:  2.244374136158302 Training Accuracy:  17.542857142857144 Test Loss:  2.245249627793213 Testing Accuracy:  18.011111111111113
Epoch:  2


100%|██████████| 1312/1312 [00:01<00:00, 809.89it/s]


Training Loss:  2.197962337198259 Training Accuracy:  23.495238095238093 Test Loss:  2.198948554118311 Testing Accuracy:  23.961111111111112
Epoch:  3


100%|██████████| 1312/1312 [00:02<00:00, 593.04it/s]


Training Loss:  2.1517318811102237 Training Accuracy:  29.695238095238096 Test Loss:  2.152760702741623 Testing Accuracy:  29.805555555555557
Epoch:  4


100%|██████████| 1312/1312 [00:03<00:00, 436.90it/s]


Training Loss:  2.1040710111768965 Training Accuracy:  33.23095238095238 Test Loss:  2.104887404211873 Testing Accuracy:  33.355555555555554
Epoch:  5


100%|██████████| 1312/1312 [00:01<00:00, 864.35it/s]


Training Loss:  2.0546665614997024 Training Accuracy:  36.01904761904762 Test Loss:  2.0553569171236723 Testing Accuracy:  36.17777777777778
Epoch:  6


100%|██████████| 1312/1312 [00:01<00:00, 843.99it/s]


Training Loss:  2.001304301178161 Training Accuracy:  38.33571428571429 Test Loss:  2.0018137067520425 Testing Accuracy:  38.56111111111111
Epoch:  7


100%|██████████| 1312/1312 [00:01<00:00, 850.42it/s]


Training Loss:  1.9439473022584186 Training Accuracy:  40.11904761904762 Test Loss:  1.9441992932028191 Testing Accuracy:  40.388888888888886
Epoch:  8


100%|██████████| 1312/1312 [00:01<00:00, 708.69it/s]


Training Loss:  1.8829928271935434 Training Accuracy:  41.77857142857143 Test Loss:  1.8830189058958147 Testing Accuracy:  42.016666666666666
Epoch:  9


100%|██████████| 1312/1312 [00:01<00:00, 855.64it/s]


Training Loss:  1.8192122314832055 Training Accuracy:  43.41428571428571 Test Loss:  1.8191165189211 Testing Accuracy:  43.611111111111114
Epoch:  10


100%|██████████| 1312/1312 [00:02<00:00, 458.12it/s]


Training Loss:  1.7555001963654098 Training Accuracy:  45.06190476190476 Test Loss:  1.7551873812994503 Testing Accuracy:  45.31666666666667


0,1
tr_accuracy,▁▂▄▅▆▆▇▇▇██
tr_loss,█▇▇▆▆▅▄▃▃▂▁
val_accuracy,▁▂▄▅▆▆▇▇▇██
val_loss,█▇▇▆▆▅▄▃▃▂▁

0,1
tr_accuracy,45.0619
tr_loss,1.7555
val_accuracy,45.31667
val_loss,1.75519


[34m[1mwandb[0m: Agent Starting Run: povqvpda with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  nadam
Learning rate (initial):  0.001
Batch size:  32
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  128  ; activation function:  sigmoid
Weight matrix dimention (128, 784) Bias vector dimention (128, 1)
----------------
Layer:  2  ; number of neurons:  128  ; activation function:  sigmoid
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  3  ; number of neurons:  128  ; activation function:  sigmoid
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  4  ; number of neurons:  128  ; activation function:  sigmoid
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  5  ; number of neurons:  128  ; activation function:  sigmoid
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  6  ; number of neurons:  10  ; activation function:  sof

100%|██████████| 1312/1312 [00:17<00:00, 76.27it/s]


Training Loss:  0.654833800843917 Training Accuracy:  76.83571428571429 Test Loss:  0.6516483600578268 Testing Accuracy:  76.75555555555556
Epoch:  2


100%|██████████| 1312/1312 [00:16<00:00, 80.01it/s]


Training Loss:  0.5531550548123058 Training Accuracy:  80.71190476190476 Test Loss:  0.5593197743446874 Testing Accuracy:  80.31111111111112
Epoch:  3


100%|██████████| 1312/1312 [00:16<00:00, 79.96it/s]


Training Loss:  0.49557315704935356 Training Accuracy:  83.53809523809524 Test Loss:  0.5085407220781919 Testing Accuracy:  82.95555555555555
Epoch:  4


100%|██████████| 1312/1312 [00:16<00:00, 79.40it/s]


Training Loss:  0.4936775461150041 Training Accuracy:  83.86190476190477 Test Loss:  0.5126722533239371 Testing Accuracy:  83.4888888888889
Epoch:  5


100%|██████████| 1312/1312 [00:16<00:00, 79.21it/s]


Training Loss:  0.48033711332641016 Training Accuracy:  84.71190476190476 Test Loss:  0.5067368637133987 Testing Accuracy:  84.04444444444445


0,1
tr_accuracy,▁▇████
tr_loss,█▂▁▁▁▁
val_accuracy,▁▇████
val_loss,█▁▁▁▁▁

0,1
tr_accuracy,84.7119
tr_loss,0.48034
val_accuracy,84.04444
val_loss,0.50674


[34m[1mwandb[0m: Agent Starting Run: pdpfubk0 with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: random


Hyper parameters: 

Weight initialization type :  random
Optimizer :  sgd
Learning rate (initial):  0.001
Batch size:  64
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  32  ; activation function:  ReLU
Weight matrix dimention (32, 784) Bias vector dimention (32, 1)
----------------
Layer:  2  ; number of neurons:  32  ; activation function:  ReLU
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  3  ; number of neurons:  32  ; activation function:  ReLU
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  4  ; number of neurons:  32  ; activation function:  ReLU
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  5  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 32) Bias vector dimention (10, 1)
----------------
656.0

 Start of training
Training Loss:  nan Training Accuracy:  11.433333333333334 Test Loss:  

100%|██████████| 656/656 [00:01<00:00, 532.30it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  2


100%|██████████| 656/656 [00:01<00:00, 505.23it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  3


100%|██████████| 656/656 [00:01<00:00, 538.75it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  4


100%|██████████| 656/656 [00:01<00:00, 383.66it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  5


100%|██████████| 656/656 [00:01<00:00, 524.33it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  6


100%|██████████| 656/656 [00:01<00:00, 357.61it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  7


100%|██████████| 656/656 [00:01<00:00, 333.66it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  8


100%|██████████| 656/656 [00:01<00:00, 345.14it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  9


100%|██████████| 656/656 [00:01<00:00, 545.93it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  10


100%|██████████| 656/656 [00:01<00:00, 504.87it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778


0,1
tr_accuracy,█▁▁▁▁▁▁▁▁▁▁
val_accuracy,█▁▁▁▁▁▁▁▁▁▁

0,1
tr_accuracy,10.07381
tr_loss,
val_accuracy,9.82778
val_loss,


[34m[1mwandb[0m: Agent Starting Run: mv10nva4 with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: random


Hyper parameters: 

Weight initialization type :  random
Optimizer :  sgd
Learning rate (initial):  0.001
Batch size:  64
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  128  ; activation function:  ReLU
Weight matrix dimention (128, 784) Bias vector dimention (128, 1)
----------------
Layer:  2  ; number of neurons:  128  ; activation function:  ReLU
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  3  ; number of neurons:  128  ; activation function:  ReLU
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  4  ; number of neurons:  128  ; activation function:  ReLU
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  5  ; number of neurons:  128  ; activation function:  ReLU
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  6  ; number of neurons:  10  ; activation function:  softmax
Weight matri

100%|██████████| 656/656 [00:07<00:00, 89.16it/s] 


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  2


100%|██████████| 656/656 [00:04<00:00, 135.81it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  3


100%|██████████| 656/656 [00:06<00:00, 94.45it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  4


100%|██████████| 656/656 [00:04<00:00, 143.92it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  5


100%|██████████| 656/656 [00:05<00:00, 122.98it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778


0,1
tr_accuracy,█▁▁▁▁▁
val_accuracy,█▁▁▁▁▁

0,1
tr_accuracy,10.07381
tr_loss,
val_accuracy,9.82778
val_loss,


[34m[1mwandb[0m: Agent Starting Run: 6ctkucqm with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: rmsprop
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: random


Hyper parameters: 

Weight initialization type :  random
Optimizer :  rmsprop
Learning rate (initial):  0.0001
Batch size:  32
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  128  ; activation function:  ReLU
Weight matrix dimention (128, 784) Bias vector dimention (128, 1)
----------------
Layer:  2  ; number of neurons:  128  ; activation function:  ReLU
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  3  ; number of neurons:  128  ; activation function:  ReLU
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  4  ; number of neurons:  128  ; activation function:  ReLU
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  5  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 128) Bias vector dimention (10, 1)
----------------
1312.0

 Start of training
Training Loss:  nan Training Accuracy:  9.8452380

100%|██████████| 1312/1312 [00:11<00:00, 116.03it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  2


100%|██████████| 1312/1312 [00:10<00:00, 130.18it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  3


100%|██████████| 1312/1312 [00:08<00:00, 153.84it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  4


100%|██████████| 1312/1312 [00:10<00:00, 122.15it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  5


100%|██████████| 1312/1312 [00:10<00:00, 121.08it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  6


100%|██████████| 1312/1312 [00:08<00:00, 158.83it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  7


100%|██████████| 1312/1312 [00:09<00:00, 134.16it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  8


100%|██████████| 1312/1312 [00:11<00:00, 118.77it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  9


100%|██████████| 1312/1312 [00:08<00:00, 146.83it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  10


100%|██████████| 1312/1312 [00:08<00:00, 147.42it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778


0,1
tr_accuracy,▁██████████
val_accuracy,█▁▁▁▁▁▁▁▁▁▁

0,1
tr_accuracy,10.07381
tr_loss,
val_accuracy,9.82778
val_loss,


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 8u0615m1 with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: rmsprop
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: random


Hyper parameters: 

Weight initialization type :  random
Optimizer :  rmsprop
Learning rate (initial):  0.0001
Batch size:  16
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 784) Bias vector dimention (64, 1)
----------------
Layer:  2  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  3  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  4  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  5  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  6  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (1

100%|██████████| 2625/2625 [00:09<00:00, 271.59it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  2


100%|██████████| 2625/2625 [00:10<00:00, 240.49it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  3


100%|██████████| 2625/2625 [00:08<00:00, 325.60it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  4


100%|██████████| 2625/2625 [00:10<00:00, 255.03it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  5


100%|██████████| 2625/2625 [00:10<00:00, 242.56it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778


0,1
tr_accuracy,▁█████
val_accuracy,▁█████

0,1
tr_accuracy,10.07381
tr_loss,
val_accuracy,9.82778
val_loss,


[34m[1mwandb[0m: Agent Starting Run: si0t09uv with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: nag
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: random


Hyper parameters: 

Weight initialization type :  random
Optimizer :  nag
Learning rate (initial):  0.0001
Batch size:  32
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  64  ; activation function:  sigmoid
Weight matrix dimention (64, 784) Bias vector dimention (64, 1)
----------------
Layer:  2  ; number of neurons:  64  ; activation function:  sigmoid
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  3  ; number of neurons:  64  ; activation function:  sigmoid
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  4  ; number of neurons:  64  ; activation function:  sigmoid
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  5  ; number of neurons:  64  ; activation function:  sigmoid
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  6  ; number of neurons:  10  ; activation function:  softmax
Weight matrix d

100%|██████████| 1312/1312 [00:07<00:00, 179.51it/s]


Training Loss:  1.827149254725775 Training Accuracy:  22.00952380952381 Test Loss:  1.829502940066639 Testing Accuracy:  21.961111111111112
Epoch:  2


100%|██████████| 1312/1312 [00:04<00:00, 270.40it/s]


Training Loss:  1.8162994667236996 Training Accuracy:  25.335714285714285 Test Loss:  1.8184671827156746 Testing Accuracy:  25.33888888888889
Epoch:  3


100%|██████████| 1312/1312 [00:05<00:00, 223.13it/s]


Training Loss:  1.8123377192831696 Training Accuracy:  26.726190476190474 Test Loss:  1.8144625586006236 Testing Accuracy:  26.47222222222222
Epoch:  4


100%|██████████| 1312/1312 [00:05<00:00, 229.44it/s]


Training Loss:  1.8104526869781392 Training Accuracy:  26.97142857142857 Test Loss:  1.812562027063003 Testing Accuracy:  26.716666666666665
Epoch:  5


100%|██████████| 1312/1312 [00:05<00:00, 262.31it/s]


Training Loss:  1.809473879719045 Training Accuracy:  26.87142857142857 Test Loss:  1.8115773821800096 Testing Accuracy:  26.555555555555557
Epoch:  6


100%|██████████| 1312/1312 [00:06<00:00, 197.09it/s]


Training Loss:  1.8088559934066417 Training Accuracy:  26.807142857142857 Test Loss:  1.8109565549540154 Testing Accuracy:  26.516666666666666
Epoch:  7


100%|██████████| 1312/1312 [00:04<00:00, 269.21it/s]


Training Loss:  1.8083958370798816 Training Accuracy:  26.788095238095238 Test Loss:  1.8104936261299742 Testing Accuracy:  26.433333333333334
Epoch:  8


100%|██████████| 1312/1312 [00:07<00:00, 177.35it/s]


Training Loss:  1.808033134053058 Training Accuracy:  26.75952380952381 Test Loss:  1.8101280900757108 Testing Accuracy:  26.42222222222222
Epoch:  9


100%|██████████| 1312/1312 [00:04<00:00, 267.60it/s]


Training Loss:  1.8077386947579674 Training Accuracy:  26.711904761904762 Test Loss:  1.809830855053884 Testing Accuracy:  26.322222222222223
Epoch:  10


100%|██████████| 1312/1312 [00:07<00:00, 185.05it/s]


Training Loss:  1.8074953626874242 Training Accuracy:  26.702380952380953 Test Loss:  1.809584870103114 Testing Accuracy:  26.322222222222223


0,1
tr_accuracy,▁▅▇████████
tr_loss,█▁▁▁▁▁▁▁▁▁▁
val_accuracy,▁▆▇████████
val_loss,█▁▁▁▁▁▁▁▁▁▁

0,1
tr_accuracy,26.70238
tr_loss,1.8075
val_accuracy,26.32222
val_loss,1.80958


[34m[1mwandb[0m: Agent Starting Run: op4vrjxn with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: momentum
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  momentum
Learning rate (initial):  0.001
Batch size:  32
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  32  ; activation function:  ReLU
Weight matrix dimention (32, 784) Bias vector dimention (32, 1)
----------------
Layer:  2  ; number of neurons:  32  ; activation function:  ReLU
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  3  ; number of neurons:  32  ; activation function:  ReLU
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  4  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 32) Bias vector dimention (10, 1)
----------------
1312.0

 Start of training
Training Loss:  2.291783441106885 Training Accuracy:  10.4 Test Loss:  2.289613027571238 Testing Accuracy:  10.561111111111112
Epoch:  1


100%|██████████| 1312/1312 [00:02<00:00, 485.74it/s]


Training Loss:  1.1843371365424349 Training Accuracy:  63.4547619047619 Test Loss:  1.1860966141433908 Testing Accuracy:  64.06666666666666
Epoch:  2


100%|██████████| 1312/1312 [00:02<00:00, 497.36it/s]


Training Loss:  0.8152834308126753 Training Accuracy:  71.55714285714286 Test Loss:  0.8193065189646622 Testing Accuracy:  71.57222222222222
Epoch:  3


100%|██████████| 1312/1312 [00:02<00:00, 653.86it/s]


Training Loss:  0.7061076585769898 Training Accuracy:  75.8547619047619 Test Loss:  0.7101900547745205 Testing Accuracy:  75.89444444444445
Epoch:  4


100%|██████████| 1312/1312 [00:01<00:00, 735.00it/s]


Training Loss:  0.6408453798083431 Training Accuracy:  78.31190476190476 Test Loss:  0.6451599827361658 Testing Accuracy:  78.25555555555556
Epoch:  5


100%|██████████| 1312/1312 [00:01<00:00, 732.87it/s]


Training Loss:  0.5983325423491346 Training Accuracy:  79.56666666666666 Test Loss:  0.6026497551998449 Testing Accuracy:  79.51666666666667
Epoch:  6


100%|██████████| 1312/1312 [00:01<00:00, 792.36it/s]


Training Loss:  0.5672228200093383 Training Accuracy:  80.48333333333333 Test Loss:  0.5716295214092442 Testing Accuracy:  80.34444444444445
Epoch:  7


100%|██████████| 1312/1312 [00:02<00:00, 625.11it/s]


Training Loss:  0.5439706805899706 Training Accuracy:  81.11190476190477 Test Loss:  0.5486233705788227 Testing Accuracy:  80.97222222222223
Epoch:  8


100%|██████████| 1312/1312 [00:02<00:00, 508.92it/s]


Training Loss:  0.5259025343145247 Training Accuracy:  81.75 Test Loss:  0.5308154342399108 Testing Accuracy:  81.49444444444444
Epoch:  9


100%|██████████| 1312/1312 [00:02<00:00, 541.28it/s]


Training Loss:  0.5109842371858304 Training Accuracy:  82.14761904761905 Test Loss:  0.5163007817051628 Testing Accuracy:  82.04444444444445
Epoch:  10


100%|██████████| 1312/1312 [00:01<00:00, 800.89it/s]


Training Loss:  0.4983317975385233 Training Accuracy:  82.54523809523809 Test Loss:  0.5044607834461277 Testing Accuracy:  82.41666666666667


0,1
tr_accuracy,▁▆▇▇███████
tr_loss,█▄▂▂▂▁▁▁▁▁▁
val_accuracy,▁▆▇▇███████
val_loss,█▄▂▂▂▁▁▁▁▁▁

0,1
tr_accuracy,82.54524
tr_loss,0.49833
val_accuracy,82.41667
val_loss,0.50446


[34m[1mwandb[0m: Agent Starting Run: kkago02z with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: nag
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: random


Hyper parameters: 

Weight initialization type :  random
Optimizer :  nag
Learning rate (initial):  0.0001
Batch size:  16
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  128  ; activation function:  ReLU
Weight matrix dimention (128, 784) Bias vector dimention (128, 1)
----------------
Layer:  2  ; number of neurons:  128  ; activation function:  ReLU
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  3  ; number of neurons:  128  ; activation function:  ReLU
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  4  ; number of neurons:  128  ; activation function:  ReLU
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  5  ; number of neurons:  128  ; activation function:  ReLU
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  6  ; number of neurons:  10  ; activation function:  softmax
Weight matr

100%|██████████| 2625/2625 [00:16<00:00, 158.58it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  2


100%|██████████| 2625/2625 [00:17<00:00, 150.02it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  3


100%|██████████| 2625/2625 [00:16<00:00, 157.47it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  4


100%|██████████| 2625/2625 [00:17<00:00, 154.12it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  5


100%|██████████| 2625/2625 [00:17<00:00, 151.99it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  6


100%|██████████| 2625/2625 [00:17<00:00, 152.53it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  7


100%|██████████| 2625/2625 [00:18<00:00, 144.81it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  8


100%|██████████| 2625/2625 [00:17<00:00, 153.08it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  9


100%|██████████| 2625/2625 [00:16<00:00, 154.58it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  10


100%|██████████| 2625/2625 [00:17<00:00, 153.33it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778


0,1
tr_accuracy,█▁▁▁▁▁▁▁▁▁▁
val_accuracy,█▁▁▁▁▁▁▁▁▁▁

0,1
tr_accuracy,10.07381
tr_loss,
val_accuracy,9.82778
val_loss,


[34m[1mwandb[0m: Agent Starting Run: gdil7wv0 with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: momentum
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: random


Hyper parameters: 

Weight initialization type :  random
Optimizer :  momentum
Learning rate (initial):  0.0001
Batch size:  32
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  32  ; activation function:  sigmoid
Weight matrix dimention (32, 784) Bias vector dimention (32, 1)
----------------
Layer:  2  ; number of neurons:  32  ; activation function:  sigmoid
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  3  ; number of neurons:  32  ; activation function:  sigmoid
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  4  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 32) Bias vector dimention (10, 1)
----------------
1312.0

 Start of training
Training Loss:  5.591303266428694 Training Accuracy:  7.059523809523809 Test Loss:  5.6202118272845505 Testing Accuracy:  6.7555555555555555
Epoch:  1


100%|██████████| 1312/1312 [00:01<00:00, 662.00it/s]


Training Loss:  4.125602185260072 Training Accuracy:  4.185714285714286 Test Loss:  4.14232613410666 Testing Accuracy:  4.35
Epoch:  2


100%|██████████| 1312/1312 [00:02<00:00, 648.65it/s]


Training Loss:  3.255945441474424 Training Accuracy:  5.552380952380952 Test Loss:  3.2651312643039265 Testing Accuracy:  5.6
Epoch:  3


100%|██████████| 1312/1312 [00:02<00:00, 637.32it/s]


Training Loss:  2.8171822380741474 Training Accuracy:  6.635714285714286 Test Loss:  2.8226608680512393 Testing Accuracy:  6.477777777777778
Epoch:  4


100%|██████████| 1312/1312 [00:03<00:00, 370.94it/s]


Training Loss:  2.6168023818162403 Training Accuracy:  9.073809523809524 Test Loss:  2.620833917715248 Testing Accuracy:  8.994444444444444
Epoch:  5


100%|██████████| 1312/1312 [00:02<00:00, 475.51it/s]


Training Loss:  2.508766823744815 Training Accuracy:  10.716666666666667 Test Loss:  2.512258511828035 Testing Accuracy:  10.71111111111111
Epoch:  6


100%|██████████| 1312/1312 [00:02<00:00, 529.15it/s]


Training Loss:  2.4396085215743653 Training Accuracy:  11.904761904761905 Test Loss:  2.442905985173697 Testing Accuracy:  11.838888888888889
Epoch:  7


100%|██████████| 1312/1312 [00:02<00:00, 631.28it/s]


Training Loss:  2.3912650293989786 Training Accuracy:  12.616666666666667 Test Loss:  2.394504709131567 Testing Accuracy:  12.761111111111111
Epoch:  8


100%|██████████| 1312/1312 [00:01<00:00, 667.62it/s]


Training Loss:  2.3557095022986765 Training Accuracy:  13.276190476190477 Test Loss:  2.358946086770968 Testing Accuracy:  13.488888888888889
Epoch:  9


100%|██████████| 1312/1312 [00:03<00:00, 404.63it/s]


Training Loss:  2.328407756142627 Training Accuracy:  13.678571428571429 Test Loss:  2.3316646046006655 Testing Accuracy:  14.027777777777779
Epoch:  10


100%|██████████| 1312/1312 [00:02<00:00, 486.76it/s]


Training Loss:  2.306510330659271 Training Accuracy:  13.938095238095238 Test Loss:  2.3097990038175347 Testing Accuracy:  14.177777777777777


0,1
tr_accuracy,▃▁▂▃▅▆▇▇███
tr_loss,█▅▃▂▂▁▁▁▁▁▁
val_accuracy,▃▁▂▃▄▆▆▇███
val_loss,█▅▃▂▂▁▁▁▁▁▁

0,1
tr_accuracy,13.9381
tr_loss,2.30651
val_accuracy,14.17778
val_loss,2.3098


[34m[1mwandb[0m: Agent Starting Run: ee5jdpo0 with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: random


Hyper parameters: 

Weight initialization type :  random
Optimizer :  adam
Learning rate (initial):  0.0001
Batch size:  64
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  32  ; activation function:  sigmoid
Weight matrix dimention (32, 784) Bias vector dimention (32, 1)
----------------
Layer:  2  ; number of neurons:  32  ; activation function:  sigmoid
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  3  ; number of neurons:  32  ; activation function:  sigmoid
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  4  ; number of neurons:  32  ; activation function:  sigmoid
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  5  ; number of neurons:  32  ; activation function:  sigmoid
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  6  ; number of neurons:  10  ; activation function:  softmax
Weight matrix 

100%|██████████| 656/656 [00:04<00:00, 159.95it/s]


Training Loss:  2.043342842281402 Training Accuracy:  28.792857142857144 Test Loss:  2.0454628191658015 Testing Accuracy:  28.86111111111111
Epoch:  2


100%|██████████| 656/656 [00:02<00:00, 235.20it/s]


Training Loss:  1.431695768166463 Training Accuracy:  51.08571428571429 Test Loss:  1.4350067279649183 Testing Accuracy:  51.11666666666667
Epoch:  3


100%|██████████| 656/656 [00:02<00:00, 268.91it/s]


Training Loss:  1.2055192218764599 Training Accuracy:  60.51904761904762 Test Loss:  1.2122657580441711 Testing Accuracy:  60.394444444444446
Epoch:  4


100%|██████████| 656/656 [00:02<00:00, 262.53it/s]


Training Loss:  1.0565897600302192 Training Accuracy:  65.5547619047619 Test Loss:  1.066258308547276 Testing Accuracy:  64.9
Epoch:  5


100%|██████████| 656/656 [00:04<00:00, 159.23it/s]


Training Loss:  0.955416935312439 Training Accuracy:  68.3547619047619 Test Loss:  0.9654482909059015 Testing Accuracy:  67.9
Epoch:  6


100%|██████████| 656/656 [00:03<00:00, 206.64it/s]


Training Loss:  0.8832941673430162 Training Accuracy:  70.17142857142858 Test Loss:  0.8932187102931314 Testing Accuracy:  69.71666666666667
Epoch:  7


100%|██████████| 656/656 [00:02<00:00, 256.64it/s]


Training Loss:  0.8301725961204757 Training Accuracy:  71.60952380952381 Test Loss:  0.8399323058722719 Testing Accuracy:  71.03333333333333
Epoch:  8


100%|██████████| 656/656 [00:02<00:00, 270.57it/s]


Training Loss:  0.7896610734450048 Training Accuracy:  72.63333333333334 Test Loss:  0.7994548157099891 Testing Accuracy:  72.17222222222222
Epoch:  9


100%|██████████| 656/656 [00:03<00:00, 206.72it/s]


Training Loss:  0.7576154976836351 Training Accuracy:  73.5952380952381 Test Loss:  0.7676269706895453 Testing Accuracy:  73.0
Epoch:  10


100%|██████████| 656/656 [00:03<00:00, 168.40it/s]


Training Loss:  0.7313307019122418 Training Accuracy:  74.3452380952381 Test Loss:  0.7416289235876019 Testing Accuracy:  73.94444444444444


0,1
tr_accuracy,▁▃▅▆▇▇█████
tr_loss,█▂▂▁▁▁▁▁▁▁▁
val_accuracy,▁▃▆▇▇▇█████
val_loss,█▂▂▁▁▁▁▁▁▁▁

0,1
tr_accuracy,74.34524
tr_loss,0.73133
val_accuracy,73.94444
val_loss,0.74163


[34m[1mwandb[0m: Agent Starting Run: 4mxjr8u1 with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: random


Hyper parameters: 

Weight initialization type :  random
Optimizer :  nadam
Learning rate (initial):  0.001
Batch size:  16
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 784) Bias vector dimention (64, 1)
----------------
Layer:  2  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  3  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  4  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 64) Bias vector dimention (10, 1)
----------------
2625.0

 Start of training
Training Loss:  12.164924543555125 Training Accuracy:  9.2 Test Loss:  12.036903606584783 Testing Accuracy:  9.222222222222221
Epoch:  1


100%|██████████| 2625/2625 [00:13<00:00, 200.44it/s]


Training Loss:  1.2396370781550268 Training Accuracy:  63.46904761904762 Test Loss:  1.264075359813349 Testing Accuracy:  63.41111111111111
Epoch:  2


100%|██████████| 2625/2625 [00:13<00:00, 190.82it/s]


Training Loss:  0.89606464159639 Training Accuracy:  71.25714285714285 Test Loss:  0.9316848090993907 Testing Accuracy:  70.74444444444444
Epoch:  3


100%|██████████| 2625/2625 [00:13<00:00, 195.36it/s]


Training Loss:  0.8141178524047419 Training Accuracy:  73.82857142857142 Test Loss:  0.8493952255007993 Testing Accuracy:  73.0
Epoch:  4


100%|██████████| 2625/2625 [00:12<00:00, 218.17it/s]


Training Loss:  0.7381630818955885 Training Accuracy:  76.34761904761905 Test Loss:  0.7729253509182489 Testing Accuracy:  75.35
Epoch:  5


100%|██████████| 2625/2625 [00:11<00:00, 234.76it/s]


Training Loss:  0.6957375788630875 Training Accuracy:  77.37857142857143 Test Loss:  0.7372813564522785 Testing Accuracy:  76.87777777777778


0,1
tr_accuracy,▁▇▇███
tr_loss,█▁▁▁▁▁
val_accuracy,▁▇▇███
val_loss,█▁▁▁▁▁

0,1
tr_accuracy,77.37857
tr_loss,0.69574
val_accuracy,76.87778
val_loss,0.73728


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 29o3ykn2 with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: rmsprop
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: random


Hyper parameters: 

Weight initialization type :  random
Optimizer :  rmsprop
Learning rate (initial):  0.0001
Batch size:  64
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  128  ; activation function:  sigmoid
Weight matrix dimention (128, 784) Bias vector dimention (128, 1)
----------------
Layer:  2  ; number of neurons:  128  ; activation function:  sigmoid
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  3  ; number of neurons:  128  ; activation function:  sigmoid
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  4  ; number of neurons:  128  ; activation function:  sigmoid
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  5  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 128) Bias vector dimention (10, 1)
----------------
656.0

 Start of training
Training Loss:  17.63375871642769 Trai

100%|██████████| 656/656 [00:06<00:00, 103.50it/s]


Training Loss:  1.4559950685614875 Training Accuracy:  51.35476190476191 Test Loss:  1.4742271389853643 Testing Accuracy:  50.766666666666666
Epoch:  2


100%|██████████| 656/656 [00:07<00:00, 86.17it/s]


Training Loss:  1.0066880467746475 Training Accuracy:  65.14047619047619 Test Loss:  1.0278686800429921 Testing Accuracy:  64.67222222222222
Epoch:  3


100%|██████████| 656/656 [00:08<00:00, 81.94it/s]


Training Loss:  0.8549695395621566 Training Accuracy:  69.93809523809524 Test Loss:  0.8825879792900644 Testing Accuracy:  69.29444444444445
Epoch:  4


100%|██████████| 656/656 [00:06<00:00, 103.11it/s]


Training Loss:  0.7734083263519335 Training Accuracy:  72.72142857142858 Test Loss:  0.8051072118728607 Testing Accuracy:  71.93888888888888
Epoch:  5


100%|██████████| 656/656 [00:08<00:00, 73.50it/s] 


Training Loss:  0.7199935168237781 Training Accuracy:  74.58571428571429 Test Loss:  0.754563276432013 Testing Accuracy:  73.39444444444445


0,1
tr_accuracy,▁▅▇▇██
tr_loss,█▁▁▁▁▁
val_accuracy,▁▅▇███
val_loss,█▁▁▁▁▁

0,1
tr_accuracy,74.58571
tr_loss,0.71999
val_accuracy,73.39444
val_loss,0.75456


[34m[1mwandb[0m: Agent Starting Run: 46rg26oo with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: rmsprop
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: random


Hyper parameters: 

Weight initialization type :  random
Optimizer :  rmsprop
Learning rate (initial):  0.001
Batch size:  64
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  128  ; activation function:  ReLU
Weight matrix dimention (128, 784) Bias vector dimention (128, 1)
----------------
Layer:  2  ; number of neurons:  128  ; activation function:  ReLU
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  3  ; number of neurons:  128  ; activation function:  ReLU
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  4  ; number of neurons:  128  ; activation function:  ReLU
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  5  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 128) Bias vector dimention (10, 1)
----------------
656.0

 Start of training
Training Loss:  nan Training Accuracy:  11.20952380

100%|██████████| 656/656 [00:07<00:00, 82.61it/s] 


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  2


100%|██████████| 656/656 [00:05<00:00, 124.19it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  3


100%|██████████| 656/656 [00:07<00:00, 84.90it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  4


100%|██████████| 656/656 [00:05<00:00, 122.92it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  5


100%|██████████| 656/656 [00:06<00:00, 94.85it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778


0,1
tr_accuracy,█▁▁▁▁▁
val_accuracy,█▁▁▁▁▁

0,1
tr_accuracy,10.07381
tr_loss,
val_accuracy,9.82778
val_loss,


[34m[1mwandb[0m: Agent Starting Run: f6zkbmiy with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  adam
Learning rate (initial):  0.001
Batch size:  64
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  64  ; activation function:  sigmoid
Weight matrix dimention (64, 784) Bias vector dimention (64, 1)
----------------
Layer:  2  ; number of neurons:  64  ; activation function:  sigmoid
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  3  ; number of neurons:  64  ; activation function:  sigmoid
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  4  ; number of neurons:  64  ; activation function:  sigmoid
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  5  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 64) Bias vector dimention (10, 1)
----------------
656.0

 Start of training
Training Loss:  2.4540457510762725 Training Accuracy:  9.9

100%|██████████| 656/656 [00:03<00:00, 166.92it/s]


Training Loss:  0.799021506461019 Training Accuracy:  69.87380952380953 Test Loss:  0.8065770898125036 Testing Accuracy:  69.46111111111111
Epoch:  2


100%|██████████| 656/656 [00:05<00:00, 116.69it/s]


Training Loss:  0.5372477721417319 Training Accuracy:  81.57380952380953 Test Loss:  0.5440683791419616 Testing Accuracy:  81.7
Epoch:  3


100%|██████████| 656/656 [00:03<00:00, 173.86it/s]


Training Loss:  0.4628985309407967 Training Accuracy:  84.0952380952381 Test Loss:  0.47095886331084097 Testing Accuracy:  84.17777777777778
Epoch:  4


100%|██████████| 656/656 [00:04<00:00, 139.63it/s]


Training Loss:  0.4251729236214609 Training Accuracy:  85.47380952380952 Test Loss:  0.4392371973910733 Testing Accuracy:  85.22777777777777
Epoch:  5


100%|██████████| 656/656 [00:04<00:00, 132.40it/s]


Training Loss:  0.39927620865327507 Training Accuracy:  86.34761904761905 Test Loss:  0.42049742452004457 Testing Accuracy:  85.67222222222222
Epoch:  6


100%|██████████| 656/656 [00:03<00:00, 166.84it/s]


Training Loss:  0.3798473736844757 Training Accuracy:  86.91904761904762 Test Loss:  0.40749660950635697 Testing Accuracy:  86.11666666666666
Epoch:  7


100%|██████████| 656/656 [00:04<00:00, 137.81it/s]


Training Loss:  0.36501081730037316 Training Accuracy:  87.32142857142857 Test Loss:  0.3984070981334296 Testing Accuracy:  86.44444444444444
Epoch:  8


100%|██████████| 656/656 [00:04<00:00, 143.13it/s]


Training Loss:  0.3532870269619145 Training Accuracy:  87.72857142857143 Test Loss:  0.39206668574194886 Testing Accuracy:  86.8
Epoch:  9


100%|██████████| 656/656 [00:03<00:00, 169.63it/s]


Training Loss:  0.3435565541925731 Training Accuracy:  88.08333333333333 Test Loss:  0.38732148146185624 Testing Accuracy:  86.92777777777778
Epoch:  10


100%|██████████| 656/656 [00:04<00:00, 141.80it/s]


Training Loss:  0.33517255671689 Training Accuracy:  88.32619047619048 Test Loss:  0.3835636912550714 Testing Accuracy:  87.02777777777777


0,1
tr_accuracy,▁▆▇████████
tr_loss,█▃▂▁▁▁▁▁▁▁▁
val_accuracy,▁▆█████████
val_loss,█▂▂▁▁▁▁▁▁▁▁

0,1
tr_accuracy,88.32619
tr_loss,0.33517
val_accuracy,87.02778
val_loss,0.38356


[34m[1mwandb[0m: Agent Starting Run: jslc2ouo with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: momentum
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  momentum
Learning rate (initial):  0.001
Batch size:  32
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  64  ; activation function:  sigmoid
Weight matrix dimention (64, 784) Bias vector dimention (64, 1)
----------------
Layer:  2  ; number of neurons:  64  ; activation function:  sigmoid
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  3  ; number of neurons:  64  ; activation function:  sigmoid
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  4  ; number of neurons:  64  ; activation function:  sigmoid
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  5  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 64) Bias vector dimention (10, 1)
----------------
1312.0

 Start of training
Training Loss:  2.4762600619926696 Training Accuracy:

100%|██████████| 1312/1312 [00:03<00:00, 328.34it/s]


Training Loss:  2.302601474534567 Training Accuracy:  6.0738095238095235 Test Loss:  2.302597948464279 Testing Accuracy:  5.955555555555556
Epoch:  2


100%|██████████| 1312/1312 [00:06<00:00, 198.19it/s]


Training Loss:  2.3024447756686843 Training Accuracy:  10.038095238095238 Test Loss:  2.302437547888682 Testing Accuracy:  9.905555555555555
Epoch:  3


100%|██████████| 1312/1312 [00:04<00:00, 308.66it/s]


Training Loss:  2.3022811654973654 Training Accuracy:  10.040476190476191 Test Loss:  2.3022740757388895 Testing Accuracy:  9.905555555555555
Epoch:  4


100%|██████████| 1312/1312 [00:04<00:00, 291.75it/s]


Training Loss:  2.302115550296349 Training Accuracy:  10.040476190476191 Test Loss:  2.302108603908476 Testing Accuracy:  9.905555555555555
Epoch:  5


100%|██████████| 1312/1312 [00:05<00:00, 224.47it/s]


Training Loss:  2.3019473761824605 Training Accuracy:  10.040476190476191 Test Loss:  2.3019405855313986 Testing Accuracy:  9.905555555555555


0,1
tr_accuracy,█▁████
tr_loss,█▁▁▁▁▁
val_accuracy,█▁████
val_loss,█▁▁▁▁▁

0,1
tr_accuracy,10.04048
tr_loss,2.30195
val_accuracy,9.90556
val_loss,2.30194


[34m[1mwandb[0m: Agent Starting Run: p337kwli with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: random


Hyper parameters: 

Weight initialization type :  random
Optimizer :  adam
Learning rate (initial):  0.0001
Batch size:  64
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  128  ; activation function:  sigmoid
Weight matrix dimention (128, 784) Bias vector dimention (128, 1)
----------------
Layer:  2  ; number of neurons:  128  ; activation function:  sigmoid
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  3  ; number of neurons:  128  ; activation function:  sigmoid
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  4  ; number of neurons:  128  ; activation function:  sigmoid
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  5  ; number of neurons:  128  ; activation function:  sigmoid
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  6  ; number of neurons:  10  ; activation function:  sof

100%|██████████| 656/656 [00:11<00:00, 56.82it/s]


Training Loss:  1.2861812424608694 Training Accuracy:  55.542857142857144 Test Loss:  1.2999869950548961 Testing Accuracy:  55.37777777777778
Epoch:  2


100%|██████████| 656/656 [00:11<00:00, 58.15it/s]


Training Loss:  0.9533133170042153 Training Accuracy:  66.78571428571429 Test Loss:  0.9771237133740646 Testing Accuracy:  66.06111111111112
Epoch:  3


100%|██████████| 656/656 [00:09<00:00, 67.72it/s]


Training Loss:  0.8184590069458209 Training Accuracy:  70.97380952380952 Test Loss:  0.848390146301195 Testing Accuracy:  69.87777777777778
Epoch:  4


100%|██████████| 656/656 [00:08<00:00, 75.15it/s]


Training Loss:  0.7436338530708474 Training Accuracy:  73.34285714285714 Test Loss:  0.7776423631543318 Testing Accuracy:  72.17222222222222
Epoch:  5


100%|██████████| 656/656 [00:10<00:00, 64.93it/s]


Training Loss:  0.6943236426302147 Training Accuracy:  75.01904761904763 Test Loss:  0.7322459295591446 Testing Accuracy:  73.67777777777778
Epoch:  6


100%|██████████| 656/656 [00:11<00:00, 56.75it/s]


Training Loss:  0.6581505241807307 Training Accuracy:  76.35714285714286 Test Loss:  0.6999496214109471 Testing Accuracy:  74.83333333333333
Epoch:  7


100%|██████████| 656/656 [00:10<00:00, 60.13it/s]


Training Loss:  0.6293631426062263 Training Accuracy:  77.27857142857142 Test Loss:  0.6748520740308992 Testing Accuracy:  75.88888888888889
Epoch:  8


100%|██████████| 656/656 [00:09<00:00, 70.58it/s]


Training Loss:  0.6056533503132628 Training Accuracy:  78.05714285714286 Test Loss:  0.6549118185676984 Testing Accuracy:  76.64444444444445
Epoch:  9


100%|██████████| 656/656 [00:08<00:00, 74.92it/s]


Training Loss:  0.5856788596381673 Training Accuracy:  78.75714285714285 Test Loss:  0.6387860935033788 Testing Accuracy:  77.08333333333333
Epoch:  10


100%|██████████| 656/656 [00:11<00:00, 58.95it/s]


Training Loss:  0.5684430884165411 Training Accuracy:  79.43095238095238 Test Loss:  0.6254091118612373 Testing Accuracy:  77.6


0,1
tr_accuracy,▁▆▇▇▇██████
tr_loss,█▂▁▁▁▁▁▁▁▁▁
val_accuracy,▁▆▇▇▇██████
val_loss,█▂▁▁▁▁▁▁▁▁▁

0,1
tr_accuracy,79.43095
tr_loss,0.56844
val_accuracy,77.6
val_loss,0.62541


[34m[1mwandb[0m: Agent Starting Run: r7kalexp with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: momentum
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  momentum
Learning rate (initial):  0.001
Batch size:  32
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  128  ; activation function:  tanh
Weight matrix dimention (128, 784) Bias vector dimention (128, 1)
----------------
Layer:  2  ; number of neurons:  128  ; activation function:  tanh
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  3  ; number of neurons:  128  ; activation function:  tanh
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  4  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 128) Bias vector dimention (10, 1)
----------------
1312.0

 Start of training
Training Loss:  2.3326082147616596 Training Accuracy:  12.297619047619047 Test Loss:  2.3322549759417863 Testing Accuracy:  12.194444444444445
Epoch:  1


100%|██████████| 1312/1312 [00:12<00:00, 108.04it/s]


Training Loss:  0.8223660486754653 Training Accuracy:  73.47857142857143 Test Loss:  0.8258762105263403 Testing Accuracy:  73.47222222222223
Epoch:  2


100%|██████████| 1312/1312 [00:11<00:00, 114.89it/s]


Training Loss:  0.663339409052754 Training Accuracy:  77.43333333333334 Test Loss:  0.6678477455384547 Testing Accuracy:  77.55555555555556
Epoch:  3


100%|██████████| 1312/1312 [00:10<00:00, 129.81it/s]


Training Loss:  0.5912520202557602 Training Accuracy:  79.6595238095238 Test Loss:  0.5956582125748188 Testing Accuracy:  79.67777777777778
Epoch:  4


100%|██████████| 1312/1312 [00:09<00:00, 143.12it/s]


Training Loss:  0.5474647042144232 Training Accuracy:  80.96666666666667 Test Loss:  0.5517141183702339 Testing Accuracy:  80.81111111111112
Epoch:  5


100%|██████████| 1312/1312 [00:10<00:00, 127.58it/s]


Training Loss:  0.5179500581689346 Training Accuracy:  81.92857142857143 Test Loss:  0.5222947410412867 Testing Accuracy:  81.72777777777777


0,1
tr_accuracy,▁▇████
tr_loss,█▂▂▁▁▁
val_accuracy,▁▇████
val_loss,█▂▂▁▁▁

0,1
tr_accuracy,81.92857
tr_loss,0.51795
val_accuracy,81.72778
val_loss,0.52229


[34m[1mwandb[0m: Agent Starting Run: wgu8q2ye with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: momentum
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: random


Hyper parameters: 

Weight initialization type :  random
Optimizer :  momentum
Learning rate (initial):  0.001
Batch size:  32
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 784) Bias vector dimention (64, 1)
----------------
Layer:  2  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  3  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  4  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  5  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 64) Bias vector dimention (10, 1)
----------------
1312.0

 Start of training
Training Loss:  10.54713484534887 Training Accuracy:  11.00714285

100%|██████████| 1312/1312 [00:06<00:00, 203.68it/s]


Training Loss:  4.377992706385726 Training Accuracy:  31.802380952380954 Test Loss:  4.426541837139446 Testing Accuracy:  32.19444444444444
Epoch:  2


100%|██████████| 1312/1312 [00:06<00:00, 217.69it/s]


Training Loss:  2.981687315893484 Training Accuracy:  38.864285714285714 Test Loss:  3.0563252735337016 Testing Accuracy:  38.69444444444444
Epoch:  3


100%|██████████| 1312/1312 [00:07<00:00, 178.57it/s]


Training Loss:  2.358468216204222 Training Accuracy:  44.24047619047619 Test Loss:  2.4533134736330333 Testing Accuracy:  43.21666666666667
Epoch:  4


100%|██████████| 1312/1312 [00:05<00:00, 233.31it/s]


Training Loss:  2.014524330821251 Training Accuracy:  46.99285714285714 Test Loss:  2.096206652531362 Testing Accuracy:  46.227777777777774
Epoch:  5


100%|██████████| 1312/1312 [00:07<00:00, 171.89it/s]


Training Loss:  1.8008560830305975 Training Accuracy:  49.55 Test Loss:  1.9029355921903641 Testing Accuracy:  48.18333333333333
Epoch:  6


100%|██████████| 1312/1312 [00:05<00:00, 231.20it/s]


Training Loss:  1.6636192239448577 Training Accuracy:  51.46190476190476 Test Loss:  1.7705774348465755 Testing Accuracy:  49.93888888888889
Epoch:  7


100%|██████████| 1312/1312 [00:07<00:00, 165.66it/s]


Training Loss:  1.5622785933727756 Training Accuracy:  52.82142857142857 Test Loss:  1.6751325583082566 Testing Accuracy:  51.35
Epoch:  8


100%|██████████| 1312/1312 [00:05<00:00, 240.03it/s]


Training Loss:  1.4744753805740245 Training Accuracy:  54.29047619047619 Test Loss:  1.5987788161743761 Testing Accuracy:  52.78888888888889
Epoch:  9


100%|██████████| 1312/1312 [00:08<00:00, 161.11it/s]


Training Loss:  1.4037929470913422 Training Accuracy:  55.86190476190476 Test Loss:  1.5383276691752645 Testing Accuracy:  53.65555555555556
Epoch:  10


100%|██████████| 1312/1312 [00:05<00:00, 235.10it/s]


Training Loss:  1.3450599088734665 Training Accuracy:  57.035714285714285 Test Loss:  1.497277377400292 Testing Accuracy:  54.922222222222224


0,1
tr_accuracy,▁▄▅▆▆▇▇▇███
tr_loss,█▃▂▂▂▁▁▁▁▁▁
val_accuracy,▁▄▅▆▇▇▇▇███
val_loss,█▃▂▂▁▁▁▁▁▁▁

0,1
tr_accuracy,57.03571
tr_loss,1.34506
val_accuracy,54.92222
val_loss,1.49728


[34m[1mwandb[0m: Agent Starting Run: uhi25mgp with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: momentum
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  momentum
Learning rate (initial):  0.0001
Batch size:  16
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  128  ; activation function:  sigmoid
Weight matrix dimention (128, 784) Bias vector dimention (128, 1)
----------------
Layer:  2  ; number of neurons:  128  ; activation function:  sigmoid
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  3  ; number of neurons:  128  ; activation function:  sigmoid
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  4  ; number of neurons:  128  ; activation function:  sigmoid
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  5  ; number of neurons:  128  ; activation function:  sigmoid
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  6  ; number of neurons:  10  ; activation function: 

100%|██████████| 2625/2625 [00:17<00:00, 147.99it/s]


Training Loss:  2.3100503967127897 Training Accuracy:  9.997619047619047 Test Loss:  2.3108253585096894 Testing Accuracy:  10.005555555555556
Epoch:  2


100%|██████████| 2625/2625 [00:17<00:00, 153.33it/s]


Training Loss:  2.302822797810591 Training Accuracy:  8.242857142857142 Test Loss:  2.302982429291264 Testing Accuracy:  8.205555555555556
Epoch:  3


100%|██████████| 2625/2625 [00:17<00:00, 150.14it/s]


Training Loss:  2.302657801466389 Training Accuracy:  9.92142857142857 Test Loss:  2.3027297610841115 Testing Accuracy:  9.827777777777778
Epoch:  4


100%|██████████| 2625/2625 [00:17<00:00, 152.73it/s]


Training Loss:  2.3026515597723627 Training Accuracy:  9.926190476190476 Test Loss:  2.302712155786377 Testing Accuracy:  9.827777777777778
Epoch:  5


100%|██████████| 2625/2625 [00:17<00:00, 152.00it/s]


Training Loss:  2.3026479097261916 Training Accuracy:  9.923809523809524 Test Loss:  2.3027070563543095 Testing Accuracy:  9.827777777777778
Epoch:  6


100%|██████████| 2625/2625 [00:17<00:00, 149.06it/s]


Training Loss:  2.3026442777896023 Training Accuracy:  9.923809523809524 Test Loss:  2.302703247263418 Testing Accuracy:  9.827777777777778
Epoch:  7


100%|██████████| 2625/2625 [00:17<00:00, 153.71it/s]


Training Loss:  2.30264064288936 Training Accuracy:  9.926190476190476 Test Loss:  2.302699598660361 Testing Accuracy:  9.827777777777778
Epoch:  8


100%|██████████| 2625/2625 [00:17<00:00, 151.56it/s]


Training Loss:  2.3026370087938868 Training Accuracy:  9.928571428571429 Test Loss:  2.3026959719192552 Testing Accuracy:  9.827777777777778
Epoch:  9


100%|██████████| 2625/2625 [00:18<00:00, 145.27it/s]


Training Loss:  2.3026333762318436 Training Accuracy:  9.933333333333334 Test Loss:  2.302692349428127 Testing Accuracy:  9.833333333333334
Epoch:  10


100%|██████████| 2625/2625 [00:17<00:00, 149.80it/s]


Training Loss:  2.3026297452722826 Training Accuracy:  9.938095238095238 Test Loss:  2.302688728886257 Testing Accuracy:  9.833333333333334


0,1
tr_accuracy,██▁████████
tr_loss,█▁▁▁▁▁▁▁▁▁▁
val_accuracy,██▁▇▇▇▇▇▇▇▇
val_loss,█▁▁▁▁▁▁▁▁▁▁

0,1
tr_accuracy,9.9381
tr_loss,2.30263
val_accuracy,9.83333
val_loss,2.30269


[34m[1mwandb[0m: Agent Starting Run: sqvbfph6 with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: random


Hyper parameters: 

Weight initialization type :  random
Optimizer :  sgd
Learning rate (initial):  0.001
Batch size:  64
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  32  ; activation function:  ReLU
Weight matrix dimention (32, 784) Bias vector dimention (32, 1)
----------------
Layer:  2  ; number of neurons:  32  ; activation function:  ReLU
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  3  ; number of neurons:  32  ; activation function:  ReLU
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  4  ; number of neurons:  32  ; activation function:  ReLU
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  5  ; number of neurons:  32  ; activation function:  ReLU
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  6  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 32

100%|██████████| 656/656 [00:01<00:00, 480.51it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  2


100%|██████████| 656/656 [00:01<00:00, 469.93it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  3


100%|██████████| 656/656 [00:01<00:00, 443.46it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  4


100%|██████████| 656/656 [00:01<00:00, 459.77it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  5


100%|██████████| 656/656 [00:01<00:00, 353.75it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  6


100%|██████████| 656/656 [00:02<00:00, 256.71it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  7


100%|██████████| 656/656 [00:02<00:00, 318.26it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  8


100%|██████████| 656/656 [00:01<00:00, 345.19it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  9


100%|██████████| 656/656 [00:01<00:00, 473.13it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  10


100%|██████████| 656/656 [00:01<00:00, 459.81it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778


0,1
tr_accuracy,▁██████████
val_accuracy,█▁▁▁▁▁▁▁▁▁▁

0,1
tr_accuracy,10.07381
tr_loss,
val_accuracy,9.82778
val_loss,


[34m[1mwandb[0m: Agent Starting Run: ftny1set with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: momentum
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  momentum
Learning rate (initial):  0.001
Batch size:  32
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  128  ; activation function:  ReLU
Weight matrix dimention (128, 784) Bias vector dimention (128, 1)
----------------
Layer:  2  ; number of neurons:  128  ; activation function:  ReLU
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  3  ; number of neurons:  128  ; activation function:  ReLU
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  4  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 128) Bias vector dimention (10, 1)
----------------
1312.0

 Start of training
Training Loss:  2.335242834306699 Training Accuracy:  10.795238095238096 Test Loss:  2.33135407255811 Testing Accuracy:  11.11111111111111
Epoch:  1


100%|██████████| 1312/1312 [00:05<00:00, 232.22it/s]


Training Loss:  0.873353487615652 Training Accuracy:  71.27380952380952 Test Loss:  0.8781574820538409 Testing Accuracy:  71.08333333333333
Epoch:  2


100%|██████████| 1312/1312 [00:08<00:00, 154.68it/s]


Training Loss:  0.6892439583382646 Training Accuracy:  76.4452380952381 Test Loss:  0.6954454136712405 Testing Accuracy:  76.11666666666666
Epoch:  3


100%|██████████| 1312/1312 [00:06<00:00, 213.00it/s]


Training Loss:  0.6087907573397815 Training Accuracy:  79.21428571428571 Test Loss:  0.6152519946620664 Testing Accuracy:  78.95
Epoch:  4


100%|██████████| 1312/1312 [00:07<00:00, 164.18it/s]


Training Loss:  0.5591768231807694 Training Accuracy:  80.87380952380953 Test Loss:  0.5658771133618442 Testing Accuracy:  80.63888888888889
Epoch:  5


100%|██████████| 1312/1312 [00:06<00:00, 213.31it/s]


Training Loss:  0.5260816257085899 Training Accuracy:  81.9095238095238 Test Loss:  0.5332280073160137 Testing Accuracy:  81.63888888888889


0,1
tr_accuracy,▁▇▇███
tr_loss,█▂▂▁▁▁
val_accuracy,▁▇▇███
val_loss,█▂▂▁▁▁

0,1
tr_accuracy,81.90952
tr_loss,0.52608
val_accuracy,81.63889
val_loss,0.53323


[34m[1mwandb[0m: Agent Starting Run: rm0l0809 with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: nag
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: random


Hyper parameters: 

Weight initialization type :  random
Optimizer :  nag
Learning rate (initial):  0.0001
Batch size:  16
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  32  ; activation function:  sigmoid
Weight matrix dimention (32, 784) Bias vector dimention (32, 1)
----------------
Layer:  2  ; number of neurons:  32  ; activation function:  sigmoid
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  3  ; number of neurons:  32  ; activation function:  sigmoid
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  4  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 32) Bias vector dimention (10, 1)
----------------
2625.0

 Start of training
Training Loss:  5.616644282525587 Training Accuracy:  8.407142857142857 Test Loss:  5.592767514435363 Testing Accuracy:  8.38888888888889
Epoch:  1


100%|██████████| 2625/2625 [00:05<00:00, 504.95it/s]


Training Loss:  0.6419980607241778 Training Accuracy:  76.76666666666667 Test Loss:  0.6505412757540421 Testing Accuracy:  76.57222222222222
Epoch:  2


100%|██████████| 2625/2625 [00:05<00:00, 506.32it/s]


Training Loss:  0.5327908318736442 Training Accuracy:  80.77857142857142 Test Loss:  0.5484502829559421 Testing Accuracy:  80.51666666666667
Epoch:  3


100%|██████████| 2625/2625 [00:03<00:00, 687.81it/s]


Training Loss:  0.4872981962248998 Training Accuracy:  82.59761904761905 Test Loss:  0.5132640976762182 Testing Accuracy:  82.17222222222222
Epoch:  4


100%|██████████| 2625/2625 [00:04<00:00, 574.23it/s]


Training Loss:  0.48671497723258184 Training Accuracy:  82.70238095238095 Test Loss:  0.5169401599675016 Testing Accuracy:  82.0111111111111
Epoch:  5


100%|██████████| 2625/2625 [00:05<00:00, 489.78it/s]


Training Loss:  0.4650867069171524 Training Accuracy:  82.66428571428571 Test Loss:  0.49510424840058437 Testing Accuracy:  81.86666666666666


0,1
tr_accuracy,▁▇████
tr_loss,█▁▁▁▁▁
val_accuracy,▁▇████
val_loss,█▁▁▁▁▁

0,1
tr_accuracy,82.66429
tr_loss,0.46509
val_accuracy,81.86667
val_loss,0.4951


[34m[1mwandb[0m: Agent Starting Run: 3k4i676e with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: random


Hyper parameters: 

Weight initialization type :  random
Optimizer :  nadam
Learning rate (initial):  0.001
Batch size:  64
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 784) Bias vector dimention (64, 1)
----------------
Layer:  2  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  3  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  4  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  5  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 64) Bias vector dimention (10, 1)
----------------
656.0

 Start of training
Training Loss:  nan Training Accuracy:  9.411904761904761 Test Loss: 

100%|██████████| 656/656 [00:04<00:00, 138.34it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  2


100%|██████████| 656/656 [00:03<00:00, 176.99it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  3


100%|██████████| 656/656 [00:03<00:00, 181.85it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  4


100%|██████████| 656/656 [00:05<00:00, 122.89it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  5


100%|██████████| 656/656 [00:03<00:00, 166.16it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  6


100%|██████████| 656/656 [00:03<00:00, 183.61it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  7


100%|██████████| 656/656 [00:05<00:00, 118.07it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  8


100%|██████████| 656/656 [00:03<00:00, 180.28it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  9


100%|██████████| 656/656 [00:03<00:00, 185.67it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  10


100%|██████████| 656/656 [00:04<00:00, 145.32it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778


0,1
tr_accuracy,▁██████████
val_accuracy,▁██████████

0,1
tr_accuracy,10.07381
tr_loss,
val_accuracy,9.82778
val_loss,


[34m[1mwandb[0m: Agent Starting Run: ylfyvx6b with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: nag
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: random


Hyper parameters: 

Weight initialization type :  random
Optimizer :  nag
Learning rate (initial):  0.0001
Batch size:  64
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  32  ; activation function:  tanh
Weight matrix dimention (32, 784) Bias vector dimention (32, 1)
----------------
Layer:  2  ; number of neurons:  32  ; activation function:  tanh
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  3  ; number of neurons:  32  ; activation function:  tanh
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  4  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 32) Bias vector dimention (10, 1)
----------------
656.0

 Start of training
Training Loss:  7.053495688168806 Training Accuracy:  15.630952380952381 Test Loss:  7.082410204600746 Testing Accuracy:  15.372222222222222
Epoch:  1


100%|██████████| 656/656 [00:02<00:00, 277.60it/s]


Training Loss:  0.6671903048971741 Training Accuracy:  76.13809523809523 Test Loss:  0.6727188709951962 Testing Accuracy:  76.07222222222222
Epoch:  2


100%|██████████| 656/656 [00:02<00:00, 244.97it/s]


Training Loss:  0.6991335462869323 Training Accuracy:  74.79285714285714 Test Loss:  0.7051551706658072 Testing Accuracy:  74.5
Epoch:  3


100%|██████████| 656/656 [00:03<00:00, 181.16it/s]


Training Loss:  0.7382429009661388 Training Accuracy:  72.20714285714286 Test Loss:  0.7438076588000814 Testing Accuracy:  72.19444444444444
Epoch:  4


100%|██████████| 656/656 [00:02<00:00, 293.00it/s]


Training Loss:  0.6819393569926995 Training Accuracy:  75.52380952380952 Test Loss:  0.6869184951972364 Testing Accuracy:  75.31111111111112
Epoch:  5


100%|██████████| 656/656 [00:02<00:00, 295.60it/s]


Training Loss:  0.7009790596778941 Training Accuracy:  74.89285714285714 Test Loss:  0.7074282717288637 Testing Accuracy:  74.81666666666666
Epoch:  6


100%|██████████| 656/656 [00:02<00:00, 305.70it/s]


Training Loss:  0.6874045317291265 Training Accuracy:  74.86904761904762 Test Loss:  0.6952304573549155 Testing Accuracy:  74.53888888888889
Epoch:  7


100%|██████████| 656/656 [00:03<00:00, 186.69it/s]


Training Loss:  0.7112244476380744 Training Accuracy:  73.42857142857143 Test Loss:  0.7187632787529383 Testing Accuracy:  73.28888888888889
Epoch:  8


100%|██████████| 656/656 [00:03<00:00, 208.07it/s]


Training Loss:  0.7324861623860475 Training Accuracy:  72.85 Test Loss:  0.740003994196581 Testing Accuracy:  72.62777777777778
Epoch:  9


100%|██████████| 656/656 [00:02<00:00, 296.26it/s]


Training Loss:  0.6892205508785976 Training Accuracy:  75.37142857142857 Test Loss:  0.6954735114614297 Testing Accuracy:  74.84444444444445
Epoch:  10


100%|██████████| 656/656 [00:02<00:00, 296.31it/s]


Training Loss:  0.785006351413409 Training Accuracy:  70.58095238095238 Test Loss:  0.7933464583541322 Testing Accuracy:  70.24444444444444


0,1
tr_accuracy,▁█████████▇
tr_loss,█▁▁▁▁▁▁▁▁▁▁
val_accuracy,▁█████████▇
val_loss,█▁▁▁▁▁▁▁▁▁▁

0,1
tr_accuracy,70.58095
tr_loss,0.78501
val_accuracy,70.24444
val_loss,0.79335


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 5k8d8t5v with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  sgd
Learning rate (initial):  0.0001
Batch size:  64
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  32  ; activation function:  tanh
Weight matrix dimention (32, 784) Bias vector dimention (32, 1)
----------------
Layer:  2  ; number of neurons:  32  ; activation function:  tanh
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  3  ; number of neurons:  32  ; activation function:  tanh
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  4  ; number of neurons:  32  ; activation function:  tanh
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  5  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 32) Bias vector dimention (10, 1)
----------------
656.0

 Start of training
Training Loss:  2.3716026508601313 Training Accuracy:  13.45 Test Loss

100%|██████████| 656/656 [00:03<00:00, 191.27it/s]


Training Loss:  2.2595012532021244 Training Accuracy:  14.376190476190477 Test Loss:  2.2585316044496326 Testing Accuracy:  14.377777777777778
Epoch:  2


100%|██████████| 656/656 [00:02<00:00, 272.39it/s]


Training Loss:  2.1867272515792946 Training Accuracy:  16.697619047619046 Test Loss:  2.1863451584554423 Testing Accuracy:  16.733333333333334
Epoch:  3


100%|██████████| 656/656 [00:02<00:00, 278.46it/s]


Training Loss:  2.124381887060463 Training Accuracy:  19.183333333333334 Test Loss:  2.124492028993275 Testing Accuracy:  19.66111111111111
Epoch:  4


100%|██████████| 656/656 [00:02<00:00, 276.70it/s]


Training Loss:  2.0657858554411908 Training Accuracy:  22.995238095238093 Test Loss:  2.0662661028649727 Testing Accuracy:  23.372222222222224
Epoch:  5


100%|██████████| 656/656 [00:03<00:00, 169.23it/s]


Training Loss:  2.0097762853701724 Training Accuracy:  27.811904761904763 Test Loss:  2.010536481889588 Testing Accuracy:  28.02777777777778
Epoch:  6


100%|██████████| 656/656 [00:02<00:00, 270.78it/s]


Training Loss:  1.9564007167778015 Training Accuracy:  33.114285714285714 Test Loss:  1.957385541350706 Testing Accuracy:  33.266666666666666
Epoch:  7


100%|██████████| 656/656 [00:02<00:00, 271.32it/s]


Training Loss:  1.9059026879467613 Training Accuracy:  39.21190476190476 Test Loss:  1.9070875474433107 Testing Accuracy:  39.233333333333334
Epoch:  8


100%|██████████| 656/656 [00:02<00:00, 273.24it/s]


Training Loss:  1.858418892004717 Training Accuracy:  44.34761904761905 Test Loss:  1.859797619577671 Testing Accuracy:  44.71666666666667
Epoch:  9


100%|██████████| 656/656 [00:03<00:00, 171.51it/s]


Training Loss:  1.8139319708552353 Training Accuracy:  47.78095238095238 Test Loss:  1.8155045565086017 Testing Accuracy:  48.272222222222226
Epoch:  10


100%|██████████| 656/656 [00:02<00:00, 263.90it/s]


Training Loss:  1.772302875861151 Training Accuracy:  50.14047619047619 Test Loss:  1.774068760729091 Testing Accuracy:  50.577777777777776


0,1
tr_accuracy,▁▁▂▂▃▄▅▆▇██
tr_loss,█▇▆▅▄▄▃▃▂▁▁
val_accuracy,▁▁▂▂▃▄▅▆▇██
val_loss,█▇▆▅▄▄▃▃▂▁▁

0,1
tr_accuracy,50.14048
tr_loss,1.7723
val_accuracy,50.57778
val_loss,1.77407


[34m[1mwandb[0m: Agent Starting Run: qro2gq55 with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  nadam
Learning rate (initial):  0.0001
Batch size:  32
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 784) Bias vector dimention (64, 1)
----------------
Layer:  2  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  3  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  4  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  5  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 64) Bias vector dimention (10, 1)
----------------
1312.0

 Start of training
Training Loss:  2.332302871601959 Training Accuracy:  11.5809523809

100%|██████████| 1312/1312 [00:08<00:00, 150.62it/s]


Training Loss:  0.6013787438639187 Training Accuracy:  79.75 Test Loss:  0.6066407053587759 Testing Accuracy:  79.4
Epoch:  2


100%|██████████| 1312/1312 [00:06<00:00, 214.74it/s]


Training Loss:  0.4962917281776892 Training Accuracy:  82.81428571428572 Test Loss:  0.502664029914052 Testing Accuracy:  82.69444444444444
Epoch:  3


100%|██████████| 1312/1312 [00:08<00:00, 151.21it/s]


Training Loss:  0.4555719401397304 Training Accuracy:  84.1 Test Loss:  0.4634293115547184 Testing Accuracy:  83.98333333333333
Epoch:  4


100%|██████████| 1312/1312 [00:06<00:00, 217.01it/s]


Training Loss:  0.4308802451328732 Training Accuracy:  84.86904761904762 Test Loss:  0.4403192506469307 Testing Accuracy:  84.82777777777778
Epoch:  5


100%|██████████| 1312/1312 [00:08<00:00, 151.80it/s]


Training Loss:  0.41457807505625344 Training Accuracy:  85.39285714285714 Test Loss:  0.4260373128293379 Testing Accuracy:  85.22777777777777
Epoch:  6


100%|██████████| 1312/1312 [00:06<00:00, 214.57it/s]


Training Loss:  0.40202079472753915 Training Accuracy:  85.86190476190477 Test Loss:  0.4150566300788992 Testing Accuracy:  85.70555555555555
Epoch:  7


100%|██████████| 1312/1312 [00:08<00:00, 162.64it/s]


Training Loss:  0.39086797027591835 Training Accuracy:  86.18809523809524 Test Loss:  0.40650458756188707 Testing Accuracy:  86.06666666666666
Epoch:  8


100%|██████████| 1312/1312 [00:06<00:00, 203.71it/s]


Training Loss:  0.3815464344492147 Training Accuracy:  86.5547619047619 Test Loss:  0.4005996211664166 Testing Accuracy:  86.29444444444445
Epoch:  9


100%|██████████| 1312/1312 [00:08<00:00, 160.72it/s]


Training Loss:  0.37332490996595147 Training Accuracy:  86.82380952380953 Test Loss:  0.39545946011394634 Testing Accuracy:  86.55
Epoch:  10


100%|██████████| 1312/1312 [00:06<00:00, 207.12it/s]


Training Loss:  0.366790941299885 Training Accuracy:  87.10952380952381 Test Loss:  0.39218637372828463 Testing Accuracy:  86.57777777777778


0,1
tr_accuracy,▁▇█████████
tr_loss,█▂▁▁▁▁▁▁▁▁▁
val_accuracy,▁▇█████████
val_loss,█▂▁▁▁▁▁▁▁▁▁

0,1
tr_accuracy,87.10952
tr_loss,0.36679
val_accuracy,86.57778
val_loss,0.39219


[34m[1mwandb[0m: Agent Starting Run: i46l057r with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: rmsprop
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  rmsprop
Learning rate (initial):  0.0001
Batch size:  64
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  128  ; activation function:  sigmoid
Weight matrix dimention (128, 784) Bias vector dimention (128, 1)
----------------
Layer:  2  ; number of neurons:  128  ; activation function:  sigmoid
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  3  ; number of neurons:  128  ; activation function:  sigmoid
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  4  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 128) Bias vector dimention (10, 1)
----------------
656.0

 Start of training
Training Loss:  2.4902003132593813 Training Accuracy:  9.94047619047619 Test Loss:  2.48744667439275 Testing Accuracy:  10.13888888888889
Epoch:  1


100%|██████████| 656/656 [00:06<00:00, 105.15it/s]


Training Loss:  1.5491004169288543 Training Accuracy:  52.19761904761905 Test Loss:  1.550365130612638 Testing Accuracy:  52.105555555555554
Epoch:  2


100%|██████████| 656/656 [00:06<00:00, 97.56it/s]


Training Loss:  1.081760621544544 Training Accuracy:  60.016666666666666 Test Loss:  1.0830482681600506 Testing Accuracy:  59.61666666666667
Epoch:  3


100%|██████████| 656/656 [00:05<00:00, 113.73it/s]


Training Loss:  0.8853707014444148 Training Accuracy:  67.58809523809524 Test Loss:  0.8869447745280554 Testing Accuracy:  67.31111111111112
Epoch:  4


100%|██████████| 656/656 [00:07<00:00, 91.91it/s]


Training Loss:  0.7631461502739743 Training Accuracy:  73.07142857142857 Test Loss:  0.7657643133711257 Testing Accuracy:  72.66111111111111
Epoch:  5


100%|██████████| 656/656 [00:05<00:00, 117.29it/s]


Training Loss:  0.6767557265117125 Training Accuracy:  76.25 Test Loss:  0.6799208825411841 Testing Accuracy:  76.21666666666667


0,1
tr_accuracy,▁▅▆▇██
tr_loss,█▄▃▂▁▁
val_accuracy,▁▅▆▇██
val_loss,█▄▃▂▁▁

0,1
tr_accuracy,76.25
tr_loss,0.67676
val_accuracy,76.21667
val_loss,0.67992


[34m[1mwandb[0m: Agent Starting Run: azea2jjo with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: nag
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  nag
Learning rate (initial):  0.0001
Batch size:  32
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  32  ; activation function:  tanh
Weight matrix dimention (32, 784) Bias vector dimention (32, 1)
----------------
Layer:  2  ; number of neurons:  32  ; activation function:  tanh
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  3  ; number of neurons:  32  ; activation function:  tanh
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  4  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 32) Bias vector dimention (10, 1)
----------------
1312.0

 Start of training
Training Loss:  2.364229164083521 Training Accuracy:  8.961904761904762 Test Loss:  2.36727762352258 Testing Accuracy:  8.544444444444444
Epoch:  1


100%|██████████| 1312/1312 [00:02<00:00, 443.11it/s]


Training Loss:  0.7868825082825847 Training Accuracy:  71.60714285714286 Test Loss:  0.7987494579529503 Testing Accuracy:  71.16666666666667
Epoch:  2


100%|██████████| 1312/1312 [00:03<00:00, 345.23it/s]


Training Loss:  0.6044646972300364 Training Accuracy:  77.3547619047619 Test Loss:  0.611677693725467 Testing Accuracy:  77.31666666666666
Epoch:  3


100%|██████████| 1312/1312 [00:04<00:00, 292.10it/s]


Training Loss:  0.7552643995566475 Training Accuracy:  74.1452380952381 Test Loss:  0.7738301606217808 Testing Accuracy:  73.81111111111112
Epoch:  4


100%|██████████| 1312/1312 [00:02<00:00, 438.98it/s]


Training Loss:  0.6845506164586459 Training Accuracy:  76.4 Test Loss:  0.6937549288027004 Testing Accuracy:  76.55
Epoch:  5


100%|██████████| 1312/1312 [00:02<00:00, 447.86it/s]


Training Loss:  0.6745406371073622 Training Accuracy:  77.35238095238095 Test Loss:  0.6875870806859679 Testing Accuracy:  77.33888888888889


0,1
tr_accuracy,▁▇████
tr_loss,█▂▁▂▁▁
val_accuracy,▁▇████
val_loss,█▂▁▂▁▁

0,1
tr_accuracy,77.35238
tr_loss,0.67454
val_accuracy,77.33889
val_loss,0.68759


[34m[1mwandb[0m: Agent Starting Run: t455za2s with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: random


Hyper parameters: 

Weight initialization type :  random
Optimizer :  sgd
Learning rate (initial):  0.001
Batch size:  64
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  32  ; activation function:  ReLU
Weight matrix dimention (32, 784) Bias vector dimention (32, 1)
----------------
Layer:  2  ; number of neurons:  32  ; activation function:  ReLU
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  3  ; number of neurons:  32  ; activation function:  ReLU
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  4  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 32) Bias vector dimention (10, 1)
----------------
656.0

 Start of training
Training Loss:  nan Training Accuracy:  11.166666666666666 Test Loss:  nan Testing Accuracy:  11.177777777777777
Epoch:  1


100%|██████████| 656/656 [00:01<00:00, 572.64it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  2


100%|██████████| 656/656 [00:01<00:00, 550.73it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  3


100%|██████████| 656/656 [00:01<00:00, 423.82it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  4


100%|██████████| 656/656 [00:02<00:00, 310.35it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  5


100%|██████████| 656/656 [00:01<00:00, 328.31it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778


0,1
tr_accuracy,█▁▁▁▁▁
val_accuracy,█▁▁▁▁▁

0,1
tr_accuracy,10.07381
tr_loss,
val_accuracy,9.82778
val_loss,


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: j3ubxtqt with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: nag
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: random


Hyper parameters: 

Weight initialization type :  random
Optimizer :  nag
Learning rate (initial):  0.0001
Batch size:  32
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  128  ; activation function:  tanh
Weight matrix dimention (128, 784) Bias vector dimention (128, 1)
----------------
Layer:  2  ; number of neurons:  128  ; activation function:  tanh
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  3  ; number of neurons:  128  ; activation function:  tanh
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  4  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 128) Bias vector dimention (10, 1)
----------------
1312.0

 Start of training
Training Loss:  14.419668778148367 Training Accuracy:  11.452380952380953 Test Loss:  14.48449445369274 Testing Accuracy:  11.016666666666667
Epoch:  1


100%|██████████| 1312/1312 [00:13<00:00, 100.02it/s]


Training Loss:  26.173370586694322 Training Accuracy:  9.964285714285714 Test Loss:  26.14739715140299 Testing Accuracy:  10.083333333333334
Epoch:  2


100%|██████████| 1312/1312 [00:11<00:00, 111.45it/s]


Training Loss:  29.863507691588058 Training Accuracy:  10.026190476190477 Test Loss:  29.923731354521788 Testing Accuracy:  9.938888888888888
Epoch:  3


100%|██████████| 1312/1312 [00:11<00:00, 113.83it/s]


Training Loss:  25.325642375684986 Training Accuracy:  10.040476190476191 Test Loss:  25.332736722391164 Testing Accuracy:  9.905555555555555
Epoch:  4


100%|██████████| 1312/1312 [00:10<00:00, 126.64it/s]


Training Loss:  28.69578798392343 Training Accuracy:  9.964285714285714 Test Loss:  28.671218552024325 Testing Accuracy:  10.083333333333334
Epoch:  5


100%|██████████| 1312/1312 [00:09<00:00, 131.76it/s]


Training Loss:  19.68348180132084 Training Accuracy:  10.097619047619048 Test Loss:  19.785680430095614 Testing Accuracy:  9.772222222222222
Epoch:  6


100%|██████████| 1312/1312 [00:10<00:00, 123.81it/s]


Training Loss:  23.972422846060905 Training Accuracy:  10.026190476190477 Test Loss:  23.853589710070068 Testing Accuracy:  9.944444444444445
Epoch:  7


100%|██████████| 1312/1312 [00:11<00:00, 114.90it/s]


Training Loss:  21.871804883745526 Training Accuracy:  10.026190476190477 Test Loss:  21.762645013890104 Testing Accuracy:  9.938888888888888
Epoch:  8


100%|██████████| 1312/1312 [00:11<00:00, 112.77it/s]


Training Loss:  26.620980094970225 Training Accuracy:  9.964285714285714 Test Loss:  26.676112145483515 Testing Accuracy:  10.083333333333334
Epoch:  9


100%|██████████| 1312/1312 [00:12<00:00, 108.76it/s]


Training Loss:  27.983362970205395 Training Accuracy:  10.097619047619048 Test Loss:  28.058989776652165 Testing Accuracy:  9.772222222222222
Epoch:  10


100%|██████████| 1312/1312 [00:12<00:00, 104.33it/s]


Training Loss:  25.31677622009681 Training Accuracy:  9.964285714285714 Test Loss:  25.321910584678317 Testing Accuracy:  10.083333333333334


0,1
tr_accuracy,█▁▁▁▁▂▁▁▁▂▁
tr_loss,▁▆█▆▇▃▅▄▇▇▆
val_accuracy,█▃▂▂▃▁▂▂▃▁▃
val_loss,▁▆█▆▇▃▅▄▇▇▆

0,1
tr_accuracy,9.96429
tr_loss,25.31678
val_accuracy,10.08333
val_loss,25.32191


[34m[1mwandb[0m: Agent Starting Run: 3yqaluj5 with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: random


Hyper parameters: 

Weight initialization type :  random
Optimizer :  sgd
Learning rate (initial):  0.0001
Batch size:  32
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 784) Bias vector dimention (64, 1)
----------------
Layer:  2  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  3  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  4  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  5  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 64) Bias vector dimention (10, 1)
----------------
1312.0

 Start of training
Training Loss:  nan Training Accuracy:  8.66904761904762 Test Loss:  

100%|██████████| 1312/1312 [00:03<00:00, 414.44it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  2


100%|██████████| 1312/1312 [00:03<00:00, 365.06it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  3


100%|██████████| 1312/1312 [00:05<00:00, 243.91it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  4


100%|██████████| 1312/1312 [00:03<00:00, 418.14it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  5


100%|██████████| 1312/1312 [00:03<00:00, 390.48it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778


0,1
tr_accuracy,▁█████
val_accuracy,▁█████

0,1
tr_accuracy,10.07381
tr_loss,
val_accuracy,9.82778
val_loss,


[34m[1mwandb[0m: Agent Starting Run: 9e2atsuq with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: random


Hyper parameters: 

Weight initialization type :  random
Optimizer :  nadam
Learning rate (initial):  0.001
Batch size:  16
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 784) Bias vector dimention (64, 1)
----------------
Layer:  2  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  3  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  4  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  5  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  6  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 

100%|██████████| 2625/2625 [00:14<00:00, 187.38it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  2


100%|██████████| 2625/2625 [00:14<00:00, 178.79it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  3


100%|██████████| 2625/2625 [00:14<00:00, 181.52it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  4


100%|██████████| 2625/2625 [00:14<00:00, 186.12it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  5


100%|██████████| 2625/2625 [00:13<00:00, 191.78it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778


0,1
tr_accuracy,▁█████
val_accuracy,▁█████

0,1
tr_accuracy,10.07381
tr_loss,
val_accuracy,9.82778
val_loss,


[34m[1mwandb[0m: Agent Starting Run: rwkt2lpq with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  sgd
Learning rate (initial):  0.001
Batch size:  16
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  128  ; activation function:  tanh
Weight matrix dimention (128, 784) Bias vector dimention (128, 1)
----------------
Layer:  2  ; number of neurons:  128  ; activation function:  tanh
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  3  ; number of neurons:  128  ; activation function:  tanh
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  4  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 128) Bias vector dimention (10, 1)
----------------
2625.0

 Start of training
Training Loss:  2.4122667058587903 Training Accuracy:  12.433333333333334 Test Loss:  2.414147092452736 Testing Accuracy:  12.377777777777778
Epoch:  1


100%|██████████| 2625/2625 [00:15<00:00, 173.97it/s]


Training Loss:  0.8156940538217258 Training Accuracy:  74.63095238095238 Test Loss:  0.8193005984709053 Testing Accuracy:  74.5
Epoch:  2


100%|██████████| 2625/2625 [00:15<00:00, 173.59it/s]


Training Loss:  0.6500495337166029 Training Accuracy:  78.56428571428572 Test Loss:  0.6542320602413503 Testing Accuracy:  78.38888888888889
Epoch:  3


100%|██████████| 2625/2625 [00:14<00:00, 185.32it/s]


Training Loss:  0.5786358623076697 Training Accuracy:  80.28095238095239 Test Loss:  0.5827862247000325 Testing Accuracy:  80.25
Epoch:  4


100%|██████████| 2625/2625 [00:15<00:00, 174.80it/s]


Training Loss:  0.5371283675141905 Training Accuracy:  81.47142857142858 Test Loss:  0.5412589933008775 Testing Accuracy:  81.40555555555555
Epoch:  5


100%|██████████| 2625/2625 [00:14<00:00, 181.61it/s]


Training Loss:  0.5096193892380774 Training Accuracy:  82.21190476190476 Test Loss:  0.5139178885329021 Testing Accuracy:  82.22777777777777
Epoch:  6


100%|██████████| 2625/2625 [00:14<00:00, 178.87it/s]


Training Loss:  0.48980427093764217 Training Accuracy:  82.79523809523809 Test Loss:  0.49446549077288515 Testing Accuracy:  82.82777777777778
Epoch:  7


100%|██████████| 2625/2625 [00:14<00:00, 179.99it/s]


Training Loss:  0.4745814072578978 Training Accuracy:  83.25714285714285 Test Loss:  0.47975770934926854 Testing Accuracy:  83.22222222222223
Epoch:  8


100%|██████████| 2625/2625 [00:14<00:00, 176.79it/s]


Training Loss:  0.4622793058817379 Training Accuracy:  83.70952380952382 Test Loss:  0.4680769853411657 Testing Accuracy:  83.66666666666667
Epoch:  9


100%|██████████| 2625/2625 [00:15<00:00, 174.93it/s]


Training Loss:  0.45194865268014445 Training Accuracy:  84.04523809523809 Test Loss:  0.4584374733619158 Testing Accuracy:  83.95
Epoch:  10


100%|██████████| 2625/2625 [00:14<00:00, 183.36it/s]


Training Loss:  0.44302337828035204 Training Accuracy:  84.29761904761905 Test Loss:  0.4502472527150068 Testing Accuracy:  84.12222222222222


0,1
tr_accuracy,▁▇▇████████
tr_loss,█▂▂▁▁▁▁▁▁▁▁
val_accuracy,▁▇▇████████
val_loss,█▂▂▁▁▁▁▁▁▁▁

0,1
tr_accuracy,84.29762
tr_loss,0.44302
val_accuracy,84.12222
val_loss,0.45025


[34m[1mwandb[0m: Agent Starting Run: 83hphti7 with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: momentum
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  momentum
Learning rate (initial):  0.0001
Batch size:  64
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 784) Bias vector dimention (64, 1)
----------------
Layer:  2  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  3  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  4  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  5  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  6  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (

100%|██████████| 656/656 [00:05<00:00, 114.91it/s]


Training Loss:  2.0336800249988563 Training Accuracy:  35.90714285714286 Test Loss:  2.034593196003162 Testing Accuracy:  35.888888888888886
Epoch:  2


100%|██████████| 656/656 [00:05<00:00, 121.91it/s]


Training Loss:  1.8364783372213092 Training Accuracy:  47.21666666666667 Test Loss:  1.836918468103708 Testing Accuracy:  47.638888888888886
Epoch:  3


100%|██████████| 656/656 [00:06<00:00, 104.04it/s]


Training Loss:  1.6855207240482122 Training Accuracy:  50.680952380952384 Test Loss:  1.686031450945518 Testing Accuracy:  50.81111111111111
Epoch:  4


100%|██████████| 656/656 [00:05<00:00, 127.83it/s]


Training Loss:  1.567043726268646 Training Accuracy:  53.77857142857143 Test Loss:  1.5678124713698938 Testing Accuracy:  53.66111111111111
Epoch:  5


100%|██████████| 656/656 [00:06<00:00, 106.93it/s]


Training Loss:  1.4703022560698633 Training Accuracy:  56.65952380952381 Test Loss:  1.4713805375057754 Testing Accuracy:  56.583333333333336
Epoch:  6


100%|██████████| 656/656 [00:05<00:00, 126.09it/s]


Training Loss:  1.388949156863585 Training Accuracy:  59.08095238095238 Test Loss:  1.3903457288602807 Testing Accuracy:  59.00555555555555
Epoch:  7


100%|██████████| 656/656 [00:06<00:00, 102.85it/s]


Training Loss:  1.3192797334926794 Training Accuracy:  61.054761904761904 Test Loss:  1.3209859333594618 Testing Accuracy:  61.06111111111111
Epoch:  8


100%|██████████| 656/656 [00:05<00:00, 122.03it/s]


Training Loss:  1.2589106208163654 Training Accuracy:  62.56666666666667 Test Loss:  1.260909373254795 Testing Accuracy:  62.95
Epoch:  9


100%|██████████| 656/656 [00:06<00:00, 95.15it/s]


Training Loss:  1.2061574552683612 Training Accuracy:  63.93809523809524 Test Loss:  1.208427872639665 Testing Accuracy:  64.20555555555555
Epoch:  10


100%|██████████| 656/656 [00:05<00:00, 127.32it/s]


Training Loss:  1.1597505766554843 Training Accuracy:  65.12380952380953 Test Loss:  1.16227049380885 Testing Accuracy:  65.25555555555556


0,1
tr_accuracy,▁▄▅▆▆▇▇▇███
tr_loss,█▆▅▄▄▃▂▂▂▁▁
val_accuracy,▁▄▅▆▆▇▇▇███
val_loss,█▆▅▄▄▃▂▂▂▁▁

0,1
tr_accuracy,65.12381
tr_loss,1.15975
val_accuracy,65.25556
val_loss,1.16227


[34m[1mwandb[0m: Agent Starting Run: 27x3bnwa with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: momentum
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  momentum
Learning rate (initial):  0.001
Batch size:  16
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  128  ; activation function:  ReLU
Weight matrix dimention (128, 784) Bias vector dimention (128, 1)
----------------
Layer:  2  ; number of neurons:  128  ; activation function:  ReLU
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  3  ; number of neurons:  128  ; activation function:  ReLU
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  4  ; number of neurons:  128  ; activation function:  ReLU
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  5  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 128) Bias vector dimention (10, 1)
----------------
2625.0

 Start of training
Training Loss:  2.3101648124845786 Training Accur

100%|██████████| 2625/2625 [00:15<00:00, 172.83it/s]


Training Loss:  0.8187225981979643 Training Accuracy:  72.37857142857143 Test Loss:  0.8229089361662989 Testing Accuracy:  72.4
Epoch:  2


100%|██████████| 2625/2625 [00:15<00:00, 174.49it/s]


Training Loss:  0.7072943282902967 Training Accuracy:  76.92142857142858 Test Loss:  0.7126754445978056 Testing Accuracy:  77.05
Epoch:  3


100%|██████████| 2625/2625 [00:14<00:00, 180.28it/s]


Training Loss:  0.6661967350413356 Training Accuracy:  78.54761904761905 Test Loss:  0.6721258791594489 Testing Accuracy:  78.52222222222223
Epoch:  4


100%|██████████| 2625/2625 [00:13<00:00, 190.82it/s]


Training Loss:  0.6461507994758423 Training Accuracy:  79.28095238095239 Test Loss:  0.6521225714076194 Testing Accuracy:  79.30555555555556
Epoch:  5


100%|██████████| 2625/2625 [00:13<00:00, 190.09it/s]


Training Loss:  0.6346271361728489 Training Accuracy:  79.64047619047619 Test Loss:  0.6404817956221968 Testing Accuracy:  79.78888888888889
Epoch:  6


100%|██████████| 2625/2625 [00:14<00:00, 182.75it/s]


Training Loss:  0.6270410562457244 Training Accuracy:  79.9452380952381 Test Loss:  0.6327625561590398 Testing Accuracy:  80.16666666666667
Epoch:  7


100%|██████████| 2625/2625 [00:14<00:00, 179.87it/s]


Training Loss:  0.6214240833006963 Training Accuracy:  80.0547619047619 Test Loss:  0.6269941662002418 Testing Accuracy:  80.3
Epoch:  8


100%|██████████| 2625/2625 [00:13<00:00, 192.02it/s]


Training Loss:  0.6168135340348354 Training Accuracy:  80.21190476190476 Test Loss:  0.6222390470091048 Testing Accuracy:  80.37777777777778
Epoch:  9


100%|██████████| 2625/2625 [00:13<00:00, 192.89it/s]


Training Loss:  0.6128316332587007 Training Accuracy:  80.37142857142857 Test Loss:  0.6180946546148823 Testing Accuracy:  80.52222222222223
Epoch:  10


100%|██████████| 2625/2625 [00:14<00:00, 185.56it/s]


Training Loss:  0.6091315824675443 Training Accuracy:  80.53095238095239 Test Loss:  0.6142451089745439 Testing Accuracy:  80.49444444444444


0,1
tr_accuracy,▁▇█████████
tr_loss,█▂▁▁▁▁▁▁▁▁▁
val_accuracy,▁▇█████████
val_loss,█▂▁▁▁▁▁▁▁▁▁

0,1
tr_accuracy,80.53095
tr_loss,0.60913
val_accuracy,80.49444
val_loss,0.61425


[34m[1mwandb[0m: Agent Starting Run: tq7e7d3b with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  nadam
Learning rate (initial):  0.0001
Batch size:  16
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  32  ; activation function:  sigmoid
Weight matrix dimention (32, 784) Bias vector dimention (32, 1)
----------------
Layer:  2  ; number of neurons:  32  ; activation function:  sigmoid
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  3  ; number of neurons:  32  ; activation function:  sigmoid
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  4  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 32) Bias vector dimention (10, 1)
----------------
2625.0

 Start of training
Training Loss:  2.716361435517697 Training Accuracy:  10.097619047619048 Test Loss:  2.7128126524853142 Testing Accuracy:  9.772222222222222
Epoch:  1


100%|██████████| 2625/2625 [00:08<00:00, 301.44it/s]


Training Loss:  2.302864633512572 Training Accuracy:  9.916666666666666 Test Loss:  2.3027203889948242 Testing Accuracy:  10.194444444444445
Epoch:  2


100%|██████████| 2625/2625 [00:06<00:00, 387.67it/s]


Training Loss:  2.302779426359393 Training Accuracy:  9.916666666666666 Test Loss:  2.3027024829154548 Testing Accuracy:  10.194444444444445
Epoch:  3


100%|██████████| 2625/2625 [00:08<00:00, 303.57it/s]


Training Loss:  2.3027481542148096 Training Accuracy:  9.916666666666666 Test Loss:  2.302692750527441 Testing Accuracy:  10.194444444444445
Epoch:  4


100%|██████████| 2625/2625 [00:06<00:00, 389.94it/s]


Training Loss:  2.3027358906213484 Training Accuracy:  10.040476190476191 Test Loss:  2.3026895485423635 Testing Accuracy:  9.905555555555555
Epoch:  5


100%|██████████| 2625/2625 [00:08<00:00, 293.66it/s]


Training Loss:  2.3027287711963274 Training Accuracy:  10.040476190476191 Test Loss:  2.3026878145739422 Testing Accuracy:  9.905555555555555
Epoch:  6


100%|██████████| 2625/2625 [00:06<00:00, 393.90it/s]


Training Loss:  2.3027238091424698 Training Accuracy:  10.040476190476191 Test Loss:  2.302687141928261 Testing Accuracy:  9.905555555555555
Epoch:  7


100%|██████████| 2625/2625 [00:08<00:00, 291.84it/s]


Training Loss:  2.3027200286957363 Training Accuracy:  10.040476190476191 Test Loss:  2.3026870596933224 Testing Accuracy:  9.905555555555555
Epoch:  8


100%|██████████| 2625/2625 [00:06<00:00, 381.25it/s]


Training Loss:  2.3027170660729803 Training Accuracy:  10.040476190476191 Test Loss:  2.3026872855411997 Testing Accuracy:  9.905555555555555
Epoch:  9


100%|██████████| 2625/2625 [00:09<00:00, 267.53it/s]


Training Loss:  2.302714754073628 Training Accuracy:  10.040476190476191 Test Loss:  2.3026876868249304 Testing Accuracy:  9.905555555555555
Epoch:  10


100%|██████████| 2625/2625 [00:06<00:00, 403.92it/s]


Training Loss:  2.302712974942394 Training Accuracy:  10.040476190476191 Test Loss:  2.3026881722743884 Testing Accuracy:  9.905555555555555


0,1
tr_accuracy,█▁▁▁▆▆▆▆▆▆▆
tr_loss,█▁▁▁▁▁▁▁▁▁▁
val_accuracy,▁███▃▃▃▃▃▃▃
val_loss,█▁▁▁▁▁▁▁▁▁▁

0,1
tr_accuracy,10.04048
tr_loss,2.30271
val_accuracy,9.90556
val_loss,2.30269


[34m[1mwandb[0m: Agent Starting Run: 71w7orwt with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: nag
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  nag
Learning rate (initial):  0.0001
Batch size:  64
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 784) Bias vector dimention (64, 1)
----------------
Layer:  2  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  3  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  4  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  5  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 64) Bias vector dimention (10, 1)
----------------
656.0

 Start of training
Training Loss:  2.3881775624159305 Training Accuracy:  5.6476190476190

100%|██████████| 656/656 [00:04<00:00, 141.71it/s]


Training Loss:  9.094056841213156 Training Accuracy:  10.545238095238096 Test Loss:  9.04647368064477 Testing Accuracy:  10.683333333333334
Epoch:  2


100%|██████████| 656/656 [00:07<00:00, 88.92it/s] 


Training Loss:  6.529189708182311 Training Accuracy:  16.03095238095238 Test Loss:  6.560139750914558 Testing Accuracy:  16.066666666666666
Epoch:  3


100%|██████████| 656/656 [00:04<00:00, 143.01it/s]


Training Loss:  16.995293855531045 Training Accuracy:  9.94047619047619 Test Loss:  16.92858728874768 Testing Accuracy:  10.13888888888889
Epoch:  4


100%|██████████| 656/656 [00:05<00:00, 114.57it/s]


Training Loss:  20.922182403377683 Training Accuracy:  10.073809523809524 Test Loss:  21.081314317804388 Testing Accuracy:  9.827777777777778
Epoch:  5


100%|██████████| 656/656 [00:04<00:00, 142.01it/s]


Training Loss:  14.94284033770407 Training Accuracy:  9.997619047619047 Test Loss:  14.956295046467758 Testing Accuracy:  10.005555555555556


0,1
tr_accuracy,▁▄█▄▄▄
tr_loss,▁▄▃▇█▆
val_accuracy,▁▄█▄▄▄
val_loss,▁▃▃▆█▆

0,1
tr_accuracy,9.99762
tr_loss,14.94284
val_accuracy,10.00556
val_loss,14.9563


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: y93963oq with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: nag
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  nag
Learning rate (initial):  0.001
Batch size:  32
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  32  ; activation function:  ReLU
Weight matrix dimention (32, 784) Bias vector dimention (32, 1)
----------------
Layer:  2  ; number of neurons:  32  ; activation function:  ReLU
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  3  ; number of neurons:  32  ; activation function:  ReLU
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  4  ; number of neurons:  32  ; activation function:  ReLU
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  5  ; number of neurons:  32  ; activation function:  ReLU
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  6  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 32

100%|██████████| 1312/1312 [00:04<00:00, 293.53it/s]


Training Loss:  2.308753920803279 Training Accuracy:  10.073809523809524 Test Loss:  2.309202719276001 Testing Accuracy:  9.827777777777778
Epoch:  2


100%|██████████| 1312/1312 [00:03<00:00, 336.32it/s]


Training Loss:  2.308753920803279 Training Accuracy:  10.073809523809524 Test Loss:  2.309202719276001 Testing Accuracy:  9.827777777777778
Epoch:  3


100%|██████████| 1312/1312 [00:02<00:00, 455.59it/s]


Training Loss:  2.308753920803279 Training Accuracy:  10.073809523809524 Test Loss:  2.309202719276001 Testing Accuracy:  9.827777777777778
Epoch:  4


100%|██████████| 1312/1312 [00:02<00:00, 455.90it/s]


Training Loss:  2.308753920803279 Training Accuracy:  10.073809523809524 Test Loss:  2.309202719276001 Testing Accuracy:  9.827777777777778
Epoch:  5


100%|██████████| 1312/1312 [00:03<00:00, 338.01it/s]


Training Loss:  2.308753920803279 Training Accuracy:  10.073809523809524 Test Loss:  2.309202719276001 Testing Accuracy:  9.827777777777778
Epoch:  6


100%|██████████| 1312/1312 [00:04<00:00, 272.08it/s]


Training Loss:  2.308753920803279 Training Accuracy:  10.073809523809524 Test Loss:  2.309202719276001 Testing Accuracy:  9.827777777777778
Epoch:  7


100%|██████████| 1312/1312 [00:02<00:00, 473.88it/s]


Training Loss:  2.308753920803279 Training Accuracy:  10.073809523809524 Test Loss:  2.309202719276001 Testing Accuracy:  9.827777777777778
Epoch:  8


100%|██████████| 1312/1312 [00:02<00:00, 473.71it/s]


Training Loss:  2.308753920803279 Training Accuracy:  10.073809523809524 Test Loss:  2.309202719276001 Testing Accuracy:  9.827777777777778
Epoch:  9


100%|██████████| 1312/1312 [00:02<00:00, 447.43it/s]


Training Loss:  2.308753920803279 Training Accuracy:  10.073809523809524 Test Loss:  2.309202719276001 Testing Accuracy:  9.827777777777778
Epoch:  10


100%|██████████| 1312/1312 [00:04<00:00, 269.30it/s]


Training Loss:  2.308753920803279 Training Accuracy:  10.073809523809524 Test Loss:  2.309202719276001 Testing Accuracy:  9.827777777777778


0,1
tr_accuracy,▁██████████
tr_loss,█▁▁▁▁▁▁▁▁▁▁
val_accuracy,▁██████████
val_loss,█▁▁▁▁▁▁▁▁▁▁

0,1
tr_accuracy,10.07381
tr_loss,2.30875
val_accuracy,9.82778
val_loss,2.3092


[34m[1mwandb[0m: Agent Starting Run: 2osn458x with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: rmsprop
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: random


Hyper parameters: 

Weight initialization type :  random
Optimizer :  rmsprop
Learning rate (initial):  0.001
Batch size:  16
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  128  ; activation function:  sigmoid
Weight matrix dimention (128, 784) Bias vector dimention (128, 1)
----------------
Layer:  2  ; number of neurons:  128  ; activation function:  sigmoid
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  3  ; number of neurons:  128  ; activation function:  sigmoid
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  4  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 128) Bias vector dimention (10, 1)
----------------
2625.0

 Start of training
Training Loss:  13.817352888868756 Training Accuracy:  12.745238095238095 Test Loss:  13.787953288258054 Testing Accuracy:  13.127777777777778
Epoch:  1


100%|██████████| 2625/2625 [00:17<00:00, 149.15it/s]


Training Loss:  0.5825620818476744 Training Accuracy:  80.28571428571429 Test Loss:  0.6200154836453164 Testing Accuracy:  79.49444444444444
Epoch:  2


100%|██████████| 2625/2625 [00:17<00:00, 153.21it/s]


Training Loss:  0.5269534356089783 Training Accuracy:  82.56666666666666 Test Loss:  0.5743758133895295 Testing Accuracy:  81.64444444444445
Epoch:  3


100%|██████████| 2625/2625 [00:17<00:00, 148.08it/s]


Training Loss:  0.4932578607935042 Training Accuracy:  83.56428571428572 Test Loss:  0.5436933028680013 Testing Accuracy:  82.58888888888889
Epoch:  4


100%|██████████| 2625/2625 [00:17<00:00, 150.90it/s]


Training Loss:  0.4457151077884624 Training Accuracy:  85.28571428571429 Test Loss:  0.5056559208822026 Testing Accuracy:  83.81666666666666
Epoch:  5


100%|██████████| 2625/2625 [00:17<00:00, 152.80it/s]


Training Loss:  0.44349149442923624 Training Accuracy:  85.55238095238096 Test Loss:  0.5030925790176011 Testing Accuracy:  84.08888888888889


0,1
tr_accuracy,▁▇████
tr_loss,█▁▁▁▁▁
val_accuracy,▁█████
val_loss,█▁▁▁▁▁

0,1
tr_accuracy,85.55238
tr_loss,0.44349
val_accuracy,84.08889
val_loss,0.50309


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: eut6cz4u with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  adam
Learning rate (initial):  0.0001
Batch size:  16
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 784) Bias vector dimention (64, 1)
----------------
Layer:  2  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  3  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  4  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  5  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 64) Bias vector dimention (10, 1)
----------------
2625.0

 Start of training
Training Loss:  2.306098030547771 Training Accuracy:  17.83809523809

100%|██████████| 2625/2625 [00:12<00:00, 208.24it/s]


Training Loss:  0.5244092824664208 Training Accuracy:  81.83095238095238 Test Loss:  0.5312159926454466 Testing Accuracy:  81.43333333333334
Epoch:  2


100%|██████████| 2625/2625 [00:17<00:00, 153.59it/s]


Training Loss:  0.4565985105255492 Training Accuracy:  84.20238095238095 Test Loss:  0.46541079135359914 Testing Accuracy:  84.09444444444445
Epoch:  3


100%|██████████| 2625/2625 [00:19<00:00, 132.96it/s]


Training Loss:  0.4223283152619741 Training Accuracy:  85.21904761904761 Test Loss:  0.43410886074587796 Testing Accuracy:  85.18888888888888
Epoch:  4


100%|██████████| 2625/2625 [00:20<00:00, 130.99it/s]


Training Loss:  0.4005395902877695 Training Accuracy:  85.94761904761904 Test Loss:  0.4166556908041471 Testing Accuracy:  85.71111111111111
Epoch:  5


100%|██████████| 2625/2625 [00:18<00:00, 138.42it/s]


Training Loss:  0.38585967601949644 Training Accuracy:  86.51666666666667 Test Loss:  0.4062929887067124 Testing Accuracy:  86.2


0,1
tr_accuracy,▁█████
tr_loss,█▂▁▁▁▁
val_accuracy,▁█████
val_loss,█▁▁▁▁▁

0,1
tr_accuracy,86.51667
tr_loss,0.38586
val_accuracy,86.2
val_loss,0.40629


[34m[1mwandb[0m: Agent Starting Run: bucsegwq with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: rmsprop
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: random


Hyper parameters: 

Weight initialization type :  random
Optimizer :  rmsprop
Learning rate (initial):  0.001
Batch size:  16
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 784) Bias vector dimention (64, 1)
----------------
Layer:  2  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  3  ; number of neurons:  64  ; activation function:  ReLU
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  4  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 64) Bias vector dimention (10, 1)
----------------
2625.0

 Start of training
Training Loss:  nan Training Accuracy:  10.092857142857143 Test Loss:  nan Testing Accuracy:  9.866666666666667
Epoch:  1


100%|██████████| 2625/2625 [00:07<00:00, 355.69it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  2


100%|██████████| 2625/2625 [00:09<00:00, 284.06it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  3


100%|██████████| 2625/2625 [00:07<00:00, 330.77it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  4


100%|██████████| 2625/2625 [00:08<00:00, 321.36it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  5


100%|██████████| 2625/2625 [00:08<00:00, 324.61it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778


0,1
tr_accuracy,█▁▁▁▁▁
val_accuracy,█▁▁▁▁▁

0,1
tr_accuracy,10.07381
tr_loss,
val_accuracy,9.82778
val_loss,


[34m[1mwandb[0m: Agent Starting Run: q2g32oq2 with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: momentum
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  momentum
Learning rate (initial):  0.001
Batch size:  16
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  32  ; activation function:  tanh
Weight matrix dimention (32, 784) Bias vector dimention (32, 1)
----------------
Layer:  2  ; number of neurons:  32  ; activation function:  tanh
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  3  ; number of neurons:  32  ; activation function:  tanh
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  4  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 32) Bias vector dimention (10, 1)
----------------
2625.0

 Start of training
Training Loss:  2.4161988597968818 Training Accuracy:  8.102380952380953 Test Loss:  2.421076230939501 Testing Accuracy:  7.966666666666667
Epoch:  1


100%|██████████| 2625/2625 [00:04<00:00, 597.17it/s]


Training Loss:  0.8679149489777117 Training Accuracy:  72.02857142857142 Test Loss:  0.8749235237121115 Testing Accuracy:  71.62222222222222
Epoch:  2


100%|██████████| 2625/2625 [00:05<00:00, 459.72it/s]


Training Loss:  0.6765451410310122 Training Accuracy:  77.9095238095238 Test Loss:  0.6846817676423578 Testing Accuracy:  77.59444444444445
Epoch:  3


100%|██████████| 2625/2625 [00:04<00:00, 640.45it/s]


Training Loss:  0.5882198646000343 Training Accuracy:  80.40714285714286 Test Loss:  0.5958191044658309 Testing Accuracy:  80.18888888888888
Epoch:  4


100%|██████████| 2625/2625 [00:04<00:00, 533.03it/s]


Training Loss:  0.5370422368463599 Training Accuracy:  81.72619047619048 Test Loss:  0.5442573798922328 Testing Accuracy:  81.45
Epoch:  5


100%|██████████| 2625/2625 [00:05<00:00, 455.77it/s]


Training Loss:  0.5037754488431361 Training Accuracy:  82.52380952380952 Test Loss:  0.5110859771569372 Testing Accuracy:  82.37222222222222


0,1
tr_accuracy,▁▇████
tr_loss,█▂▂▁▁▁
val_accuracy,▁▇████
val_loss,█▂▂▁▁▁

0,1
tr_accuracy,82.52381
tr_loss,0.50378
val_accuracy,82.37222
val_loss,0.51109


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: rth1k032 with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: momentum
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: random


Hyper parameters: 

Weight initialization type :  random
Optimizer :  momentum
Learning rate (initial):  0.001
Batch size:  16
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  128  ; activation function:  ReLU
Weight matrix dimention (128, 784) Bias vector dimention (128, 1)
----------------
Layer:  2  ; number of neurons:  128  ; activation function:  ReLU
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  3  ; number of neurons:  128  ; activation function:  ReLU
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  4  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 128) Bias vector dimention (10, 1)
----------------
2625.0

 Start of training
Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.838888888888889
Epoch:  1


100%|██████████| 2625/2625 [00:14<00:00, 186.73it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  2


100%|██████████| 2625/2625 [00:13<00:00, 187.66it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  3


100%|██████████| 2625/2625 [00:12<00:00, 214.14it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  4


100%|██████████| 2625/2625 [00:11<00:00, 238.35it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  5


100%|██████████| 2625/2625 [00:12<00:00, 217.31it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778


0,1
tr_accuracy,▁▁▁▁▁▁
val_accuracy,█▁▁▁▁▁

0,1
tr_accuracy,10.07381
tr_loss,
val_accuracy,9.82778
val_loss,


[34m[1mwandb[0m: Agent Starting Run: 43u9rocu with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: random


Hyper parameters: 

Weight initialization type :  random
Optimizer :  nadam
Learning rate (initial):  0.0001
Batch size:  16
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 784) Bias vector dimention (64, 1)
----------------
Layer:  2  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  3  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  4  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  5  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  6  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10,

100%|██████████| 2625/2625 [00:18<00:00, 142.23it/s]


Training Loss:  7.655198933835742 Training Accuracy:  18.25 Test Loss:  7.733573692086285 Testing Accuracy:  17.41111111111111
Epoch:  2


100%|██████████| 2625/2625 [00:17<00:00, 146.95it/s]


Training Loss:  5.672724239049893 Training Accuracy:  26.188095238095237 Test Loss:  5.762096789113317 Testing Accuracy:  25.583333333333332
Epoch:  3


100%|██████████| 2625/2625 [00:17<00:00, 147.57it/s]


Training Loss:  4.602004454092167 Training Accuracy:  31.326190476190476 Test Loss:  4.697671931209165 Testing Accuracy:  30.622222222222224
Epoch:  4


100%|██████████| 2625/2625 [00:18<00:00, 144.21it/s]


Training Loss:  3.8475932722770176 Training Accuracy:  35.58571428571429 Test Loss:  3.984170852039444 Testing Accuracy:  34.80555555555556
Epoch:  5


100%|██████████| 2625/2625 [00:17<00:00, 149.79it/s]


Training Loss:  3.4092300177473653 Training Accuracy:  38.4547619047619 Test Loss:  3.539478742184892 Testing Accuracy:  37.077777777777776


0,1
tr_accuracy,▁▃▅▆▇█
tr_loss,█▄▃▂▁▁
val_accuracy,▁▃▅▆▇█
val_loss,█▄▃▂▁▁

0,1
tr_accuracy,38.45476
tr_loss,3.40923
val_accuracy,37.07778
val_loss,3.53948


[34m[1mwandb[0m: Agent Starting Run: 53adde2h with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: momentum
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  momentum
Learning rate (initial):  0.001
Batch size:  32
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  32  ; activation function:  sigmoid
Weight matrix dimention (32, 784) Bias vector dimention (32, 1)
----------------
Layer:  2  ; number of neurons:  32  ; activation function:  sigmoid
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  3  ; number of neurons:  32  ; activation function:  sigmoid
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  4  ; number of neurons:  32  ; activation function:  sigmoid
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  5  ; number of neurons:  32  ; activation function:  sigmoid
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  6  ; number of neurons:  10  ; activation function:  softmax
Weight matr

100%|██████████| 1312/1312 [00:03<00:00, 373.11it/s]


Training Loss:  2.3041078985600176 Training Accuracy:  9.969047619047618 Test Loss:  2.3042964821349234 Testing Accuracy:  10.072222222222223
Epoch:  2


100%|██████████| 1312/1312 [00:03<00:00, 377.07it/s]


Training Loss:  2.3026229864401797 Training Accuracy:  10.073809523809524 Test Loss:  2.3026847109726325 Testing Accuracy:  9.827777777777778
Epoch:  3


100%|██████████| 1312/1312 [00:04<00:00, 316.09it/s]


Training Loss:  2.302610792044912 Training Accuracy:  10.040476190476191 Test Loss:  2.3026653582564696 Testing Accuracy:  9.905555555555555
Epoch:  4


100%|██████████| 1312/1312 [00:04<00:00, 276.32it/s]


Training Loss:  2.302603948118649 Training Accuracy:  10.040476190476191 Test Loss:  2.3026577899796306 Testing Accuracy:  9.905555555555555
Epoch:  5


100%|██████████| 1312/1312 [00:03<00:00, 378.83it/s]


Training Loss:  2.302598978236162 Training Accuracy:  10.040476190476191 Test Loss:  2.3026524429198756 Testing Accuracy:  9.905555555555555
Epoch:  6


100%|██████████| 1312/1312 [00:03<00:00, 412.82it/s]


Training Loss:  2.3025955332713797 Training Accuracy:  10.040476190476191 Test Loss:  2.302648674114492 Testing Accuracy:  9.905555555555555
Epoch:  7


100%|██████████| 1312/1312 [00:04<00:00, 285.63it/s]


Training Loss:  2.302593175294966 Training Accuracy:  10.040476190476191 Test Loss:  2.302646028268638 Testing Accuracy:  9.905555555555555
Epoch:  8


100%|██████████| 1312/1312 [00:03<00:00, 372.76it/s]


Training Loss:  2.3025915809359008 Training Accuracy:  10.040476190476191 Test Loss:  2.3026441767973247 Testing Accuracy:  9.905555555555555
Epoch:  9


100%|██████████| 1312/1312 [00:03<00:00, 401.66it/s]


Training Loss:  2.302590521200875 Training Accuracy:  10.040476190476191 Test Loss:  2.3026428865814195 Testing Accuracy:  9.905555555555555
Epoch:  10


100%|██████████| 1312/1312 [00:03<00:00, 417.58it/s]


Training Loss:  2.30258983468385 Training Accuracy:  10.040476190476191 Test Loss:  2.302641992766714 Testing Accuracy:  9.905555555555555


0,1
tr_accuracy,▁▁█▆▆▆▆▆▆▆▆
tr_loss,█▁▁▁▁▁▁▁▁▁▁
val_accuracy,██▁▃▃▃▃▃▃▃▃
val_loss,█▁▁▁▁▁▁▁▁▁▁

0,1
tr_accuracy,10.04048
tr_loss,2.30259
val_accuracy,9.90556
val_loss,2.30264


[34m[1mwandb[0m: Agent Starting Run: rv9dpxnn with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: momentum
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  momentum
Learning rate (initial):  0.001
Batch size:  16
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 784) Bias vector dimention (64, 1)
----------------
Layer:  2  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  3  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  4  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 64) Bias vector dimention (10, 1)
----------------
2625.0

 Start of training
Training Loss:  2.4216335682813344 Training Accuracy:  10.778571428571428 Test Loss:  2.414443932519491 Testing Accuracy:  10.988888888888889
Epoch:  1


100%|██████████| 2625/2625 [00:08<00:00, 301.77it/s]


Training Loss:  0.8244239302755075 Training Accuracy:  74.96666666666667 Test Loss:  0.8296748521143522 Testing Accuracy:  74.42777777777778
Epoch:  2


100%|██████████| 2625/2625 [00:07<00:00, 335.44it/s]


Training Loss:  0.7266277773280055 Training Accuracy:  77.82619047619048 Test Loss:  0.7326131910262764 Testing Accuracy:  77.36111111111111
Epoch:  3


100%|██████████| 2625/2625 [00:10<00:00, 256.23it/s]


Training Loss:  0.6921065195119889 Training Accuracy:  78.96904761904761 Test Loss:  0.6980848347755381 Testing Accuracy:  78.58333333333333
Epoch:  4


100%|██████████| 2625/2625 [00:07<00:00, 351.31it/s]


Training Loss:  0.6762133643578683 Training Accuracy:  79.56904761904762 Test Loss:  0.6819886748490428 Testing Accuracy:  79.2611111111111
Epoch:  5


100%|██████████| 2625/2625 [00:08<00:00, 292.79it/s]


Training Loss:  0.6672654295814305 Training Accuracy:  79.92142857142858 Test Loss:  0.6728196050952086 Testing Accuracy:  79.63888888888889
Epoch:  6


100%|██████████| 2625/2625 [00:08<00:00, 316.59it/s]


Training Loss:  0.6611993727570221 Training Accuracy:  80.08095238095238 Test Loss:  0.6665607063534768 Testing Accuracy:  79.83888888888889
Epoch:  7


100%|██████████| 2625/2625 [00:07<00:00, 355.28it/s]


Training Loss:  0.656501743749638 Training Accuracy:  80.16666666666667 Test Loss:  0.6617033536001199 Testing Accuracy:  79.96111111111111
Epoch:  8


100%|██████████| 2625/2625 [00:09<00:00, 269.67it/s]


Training Loss:  0.6526014905999669 Training Accuracy:  80.23095238095237 Test Loss:  0.6576722958448799 Testing Accuracy:  80.1
Epoch:  9


100%|██████████| 2625/2625 [00:07<00:00, 348.16it/s]


Training Loss:  0.6492726501489353 Training Accuracy:  80.23095238095237 Test Loss:  0.6542372869656129 Testing Accuracy:  80.14444444444445
Epoch:  10


100%|██████████| 2625/2625 [00:08<00:00, 313.70it/s]


Training Loss:  0.6464085263416577 Training Accuracy:  80.31428571428572 Test Loss:  0.6512874695352036 Testing Accuracy:  80.25


0,1
tr_accuracy,▁▇█████████
tr_loss,█▂▁▁▁▁▁▁▁▁▁
val_accuracy,▁▇█████████
val_loss,█▂▁▁▁▁▁▁▁▁▁

0,1
tr_accuracy,80.31429
tr_loss,0.64641
val_accuracy,80.25
val_loss,0.65129


[34m[1mwandb[0m: Agent Starting Run: n4p965gd with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: random


Hyper parameters: 

Weight initialization type :  random
Optimizer :  sgd
Learning rate (initial):  0.001
Batch size:  64
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  128  ; activation function:  ReLU
Weight matrix dimention (128, 784) Bias vector dimention (128, 1)
----------------
Layer:  2  ; number of neurons:  128  ; activation function:  ReLU
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  3  ; number of neurons:  128  ; activation function:  ReLU
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  4  ; number of neurons:  128  ; activation function:  ReLU
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  5  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 128) Bias vector dimention (10, 1)
----------------
656.0

 Start of training
Training Loss:  nan Training Accuracy:  10.109523809523

100%|██████████| 656/656 [00:04<00:00, 141.72it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  2


100%|██████████| 656/656 [00:04<00:00, 132.98it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  3


100%|██████████| 656/656 [00:05<00:00, 116.32it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  4


100%|██████████| 656/656 [00:04<00:00, 145.88it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  5


100%|██████████| 656/656 [00:05<00:00, 115.54it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778


0,1
tr_accuracy,█▁▁▁▁▁
val_accuracy,█▁▁▁▁▁

0,1
tr_accuracy,10.07381
tr_loss,
val_accuracy,9.82778
val_loss,


[34m[1mwandb[0m: Agent Starting Run: yweui6vv with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: nag
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  nag
Learning rate (initial):  0.0001
Batch size:  32
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  128  ; activation function:  sigmoid
Weight matrix dimention (128, 784) Bias vector dimention (128, 1)
----------------
Layer:  2  ; number of neurons:  128  ; activation function:  sigmoid
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  3  ; number of neurons:  128  ; activation function:  sigmoid
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  4  ; number of neurons:  128  ; activation function:  sigmoid
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  5  ; number of neurons:  128  ; activation function:  sigmoid
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  6  ; number of neurons:  10  ; activation function:  soft

100%|██████████| 1312/1312 [00:14<00:00, 87.64it/s] 


Training Loss:  2.3090367188448084 Training Accuracy:  10.073809523809524 Test Loss:  2.3094867791519254 Testing Accuracy:  9.827777777777778
Epoch:  2


100%|██████████| 1312/1312 [00:13<00:00, 94.65it/s] 


Training Loss:  2.308860780885954 Training Accuracy:  10.073809523809524 Test Loss:  2.309310480045571 Testing Accuracy:  9.827777777777778
Epoch:  3


100%|██████████| 1312/1312 [00:13<00:00, 95.65it/s] 


Training Loss:  2.3087964412760043 Training Accuracy:  10.073809523809524 Test Loss:  2.309245900003488 Testing Accuracy:  9.827777777777778
Epoch:  4


100%|██████████| 1312/1312 [00:13<00:00, 96.52it/s] 


Training Loss:  2.308746966430757 Training Accuracy:  10.073809523809524 Test Loss:  2.309196173934312 Testing Accuracy:  9.827777777777778
Epoch:  5


100%|██████████| 1312/1312 [00:14<00:00, 92.56it/s] 


Training Loss:  2.3086667555797042 Training Accuracy:  10.073809523809524 Test Loss:  2.3091147887496555 Testing Accuracy:  9.827777777777778


0,1
tr_accuracy,▁█████
tr_loss,█▁▁▁▁▁
val_accuracy,█▁▁▁▁▁
val_loss,█▁▁▁▁▁

0,1
tr_accuracy,10.07381
tr_loss,2.30867
val_accuracy,9.82778
val_loss,2.30911


[34m[1mwandb[0m: Agent Starting Run: 2tjo9ibq with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  adam
Learning rate (initial):  0.0001
Batch size:  32
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  128  ; activation function:  ReLU
Weight matrix dimention (128, 784) Bias vector dimention (128, 1)
----------------
Layer:  2  ; number of neurons:  128  ; activation function:  ReLU
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  3  ; number of neurons:  128  ; activation function:  ReLU
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  4  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 128) Bias vector dimention (10, 1)
----------------
1312.0

 Start of training
Training Loss:  2.3135847425456992 Training Accuracy:  8.68095238095238 Test Loss:  2.312263569634511 Testing Accuracy:  8.805555555555555
Epoch:  1


100%|██████████| 1312/1312 [00:11<00:00, 115.40it/s]


Training Loss:  0.5701365727204966 Training Accuracy:  81.36190476190477 Test Loss:  0.57560413349003 Testing Accuracy:  81.34444444444445
Epoch:  2


100%|██████████| 1312/1312 [00:10<00:00, 123.92it/s]


Training Loss:  0.5191587265794155 Training Accuracy:  83.00238095238095 Test Loss:  0.5246164393882781 Testing Accuracy:  82.79444444444445
Epoch:  3


100%|██████████| 1312/1312 [00:12<00:00, 108.63it/s]


Training Loss:  0.5030335688624114 Training Accuracy:  83.34285714285714 Test Loss:  0.5088470446449618 Testing Accuracy:  83.25555555555556
Epoch:  4


100%|██████████| 1312/1312 [00:12<00:00, 104.41it/s]


Training Loss:  0.4953565555034239 Training Accuracy:  83.57380952380953 Test Loss:  0.5015586009559507 Testing Accuracy:  83.41666666666667
Epoch:  5


100%|██████████| 1312/1312 [00:11<00:00, 116.60it/s]


Training Loss:  0.4910735758327343 Training Accuracy:  83.6547619047619 Test Loss:  0.49754435848848283 Testing Accuracy:  83.46666666666667
Epoch:  6


100%|██████████| 1312/1312 [00:09<00:00, 132.10it/s]


Training Loss:  0.4881772457433936 Training Accuracy:  83.74285714285715 Test Loss:  0.49493121148913555 Testing Accuracy:  83.47222222222223
Epoch:  7


100%|██████████| 1312/1312 [00:11<00:00, 113.17it/s]


Training Loss:  0.48604922093096337 Training Accuracy:  83.8047619047619 Test Loss:  0.4930311927274889 Testing Accuracy:  83.49444444444444
Epoch:  8


100%|██████████| 1312/1312 [00:12<00:00, 107.00it/s]


Training Loss:  0.48430961462210415 Training Accuracy:  83.82857142857142 Test Loss:  0.4915424867631789 Testing Accuracy:  83.56111111111112
Epoch:  9


100%|██████████| 1312/1312 [00:12<00:00, 105.42it/s]


Training Loss:  0.48299642948894256 Training Accuracy:  83.88571428571429 Test Loss:  0.49040659101578965 Testing Accuracy:  83.59444444444445
Epoch:  10


100%|██████████| 1312/1312 [00:11<00:00, 117.20it/s]


Training Loss:  0.481404648756106 Training Accuracy:  83.92857142857143 Test Loss:  0.4889118701791643 Testing Accuracy:  83.60555555555555


0,1
tr_accuracy,▁██████████
tr_loss,█▁▁▁▁▁▁▁▁▁▁
val_accuracy,▁██████████
val_loss,█▁▁▁▁▁▁▁▁▁▁

0,1
tr_accuracy,83.92857
tr_loss,0.4814
val_accuracy,83.60556
val_loss,0.48891


[34m[1mwandb[0m: Agent Starting Run: 0dx372fs with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: momentum
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  momentum
Learning rate (initial):  0.0001
Batch size:  16
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  32  ; activation function:  tanh
Weight matrix dimention (32, 784) Bias vector dimention (32, 1)
----------------
Layer:  2  ; number of neurons:  32  ; activation function:  tanh
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  3  ; number of neurons:  32  ; activation function:  tanh
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  4  ; number of neurons:  32  ; activation function:  tanh
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  5  ; number of neurons:  32  ; activation function:  tanh
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  6  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (

100%|██████████| 2625/2625 [00:08<00:00, 321.53it/s]


Training Loss:  1.9017573458166306 Training Accuracy:  37.86666666666667 Test Loss:  1.9043055327329361 Testing Accuracy:  37.727777777777774
Epoch:  2


100%|██████████| 2625/2625 [00:05<00:00, 439.23it/s]


Training Loss:  1.6905298893659977 Training Accuracy:  56.53333333333333 Test Loss:  1.6939787604122851 Testing Accuracy:  56.21111111111111
Epoch:  3


100%|██████████| 2625/2625 [00:07<00:00, 334.64it/s]


Training Loss:  1.5505979557589475 Training Accuracy:  61.67857142857143 Test Loss:  1.5544505443158239 Testing Accuracy:  61.62777777777778
Epoch:  4


100%|██████████| 2625/2625 [00:06<00:00, 431.76it/s]


Training Loss:  1.433576772906851 Training Accuracy:  64.63809523809523 Test Loss:  1.4374875417394632 Testing Accuracy:  64.7611111111111
Epoch:  5


100%|██████████| 2625/2625 [00:08<00:00, 326.49it/s]


Training Loss:  1.3319295648512253 Training Accuracy:  66.63571428571429 Test Loss:  1.335745924655241 Testing Accuracy:  66.72222222222223
Epoch:  6


100%|██████████| 2625/2625 [00:06<00:00, 428.40it/s]


Training Loss:  1.2438977322058697 Training Accuracy:  68.08095238095238 Test Loss:  1.2476212094653332 Testing Accuracy:  67.9
Epoch:  7


100%|██████████| 2625/2625 [00:08<00:00, 293.70it/s]


Training Loss:  1.1669528904408304 Training Accuracy:  69.1952380952381 Test Loss:  1.170642335254479 Testing Accuracy:  68.95555555555555
Epoch:  8


100%|██████████| 2625/2625 [00:06<00:00, 412.63it/s]


Training Loss:  1.0989858419565315 Training Accuracy:  70.41428571428571 Test Loss:  1.1027043384215949 Testing Accuracy:  70.14444444444445
Epoch:  9


100%|██████████| 2625/2625 [00:08<00:00, 307.58it/s]


Training Loss:  1.0386335220837901 Training Accuracy:  71.52142857142857 Test Loss:  1.0424227540367652 Testing Accuracy:  71.30555555555556
Epoch:  10


100%|██████████| 2625/2625 [00:06<00:00, 427.05it/s]


Training Loss:  0.9848912380978863 Training Accuracy:  72.54047619047618 Test Loss:  0.9887710067039782 Testing Accuracy:  72.1


0,1
tr_accuracy,▁▄▆▇▇▇▇████
tr_loss,█▆▅▄▃▃▂▂▂▁▁
val_accuracy,▁▄▆▇▇▇█████
val_loss,█▆▅▄▃▃▂▂▂▁▁

0,1
tr_accuracy,72.54048
tr_loss,0.98489
val_accuracy,72.1
val_loss,0.98877


[34m[1mwandb[0m: Agent Starting Run: cklzdzy2 with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: nag
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: random


Hyper parameters: 

Weight initialization type :  random
Optimizer :  nag
Learning rate (initial):  0.001
Batch size:  32
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  128  ; activation function:  ReLU
Weight matrix dimention (128, 784) Bias vector dimention (128, 1)
----------------
Layer:  2  ; number of neurons:  128  ; activation function:  ReLU
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  3  ; number of neurons:  128  ; activation function:  ReLU
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  4  ; number of neurons:  128  ; activation function:  ReLU
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  5  ; number of neurons:  128  ; activation function:  ReLU
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  6  ; number of neurons:  10  ; activation function:  softmax
Weight matri

100%|██████████| 1312/1312 [00:11<00:00, 117.03it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  2


100%|██████████| 1312/1312 [00:10<00:00, 119.51it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  3


100%|██████████| 1312/1312 [00:10<00:00, 119.56it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  4


100%|██████████| 1312/1312 [00:12<00:00, 106.02it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  5


100%|██████████| 1312/1312 [00:12<00:00, 101.99it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778


0,1
tr_accuracy,▁█████
val_accuracy,▁█████

0,1
tr_accuracy,10.07381
tr_loss,
val_accuracy,9.82778
val_loss,


[34m[1mwandb[0m: Agent Starting Run: ar7m9nxr with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: momentum
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  momentum
Learning rate (initial):  0.001
Batch size:  32
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  128  ; activation function:  tanh
Weight matrix dimention (128, 784) Bias vector dimention (128, 1)
----------------
Layer:  2  ; number of neurons:  128  ; activation function:  tanh
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  3  ; number of neurons:  128  ; activation function:  tanh
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  4  ; number of neurons:  128  ; activation function:  tanh
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  5  ; number of neurons:  128  ; activation function:  tanh
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  6  ; number of neurons:  10  ; activation function:  softmax
Weight 

100%|██████████| 1312/1312 [00:19<00:00, 68.23it/s]


Training Loss:  0.8226019345704514 Training Accuracy:  73.07380952380953 Test Loss:  0.8263948384158842 Testing Accuracy:  72.75
Epoch:  2


100%|██████████| 1312/1312 [00:19<00:00, 68.14it/s]


Training Loss:  0.6776096758739253 Training Accuracy:  77.70238095238095 Test Loss:  0.6817560451777855 Testing Accuracy:  77.42777777777778
Epoch:  3


100%|██████████| 1312/1312 [00:18<00:00, 72.13it/s]


Training Loss:  0.6124799054248241 Training Accuracy:  79.85952380952381 Test Loss:  0.6161636186412374 Testing Accuracy:  79.75555555555556
Epoch:  4


100%|██████████| 1312/1312 [00:17<00:00, 76.15it/s]


Training Loss:  0.5752120775633961 Training Accuracy:  80.96190476190476 Test Loss:  0.5786128485283583 Testing Accuracy:  80.83888888888889
Epoch:  5


100%|██████████| 1312/1312 [00:18<00:00, 71.64it/s]


Training Loss:  0.5519647250006494 Training Accuracy:  81.66428571428571 Test Loss:  0.5553253595557037 Testing Accuracy:  81.58333333333333
Epoch:  6


100%|██████████| 1312/1312 [00:17<00:00, 73.24it/s]


Training Loss:  0.5366915660746303 Training Accuracy:  82.13095238095238 Test Loss:  0.5401451310608674 Testing Accuracy:  82.00555555555556
Epoch:  7


100%|██████████| 1312/1312 [00:18<00:00, 71.20it/s]


Training Loss:  0.5261590238769565 Training Accuracy:  82.46428571428571 Test Loss:  0.5297649793441568 Testing Accuracy:  82.30555555555556
Epoch:  8


100%|██████████| 1312/1312 [00:18<00:00, 72.42it/s]


Training Loss:  0.5185634059125728 Training Accuracy:  82.79047619047618 Test Loss:  0.522345723058736 Testing Accuracy:  82.45555555555555
Epoch:  9


100%|██████████| 1312/1312 [00:18<00:00, 71.57it/s]


Training Loss:  0.5128715140477498 Training Accuracy:  83.04285714285714 Test Loss:  0.5168372776935735 Testing Accuracy:  82.58333333333333
Epoch:  10


100%|██████████| 1312/1312 [00:17<00:00, 73.41it/s]


Training Loss:  0.5084657106527313 Training Accuracy:  83.25714285714285 Test Loss:  0.512613634855288 Testing Accuracy:  82.79444444444445


0,1
tr_accuracy,▁▇▇████████
tr_loss,█▂▂▁▁▁▁▁▁▁▁
val_accuracy,▁▇▇████████
val_loss,█▂▂▁▁▁▁▁▁▁▁

0,1
tr_accuracy,83.25714
tr_loss,0.50847
val_accuracy,82.79444
val_loss,0.51261


[34m[1mwandb[0m: Agent Starting Run: v1bknp23 with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: momentum
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: random


Hyper parameters: 

Weight initialization type :  random
Optimizer :  momentum
Learning rate (initial):  0.0001
Batch size:  16
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  64  ; activation function:  sigmoid
Weight matrix dimention (64, 784) Bias vector dimention (64, 1)
----------------
Layer:  2  ; number of neurons:  64  ; activation function:  sigmoid
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  3  ; number of neurons:  64  ; activation function:  sigmoid
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  4  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 64) Bias vector dimention (10, 1)
----------------
2625.0

 Start of training
Training Loss:  5.876158749986841 Training Accuracy:  6.095238095238095 Test Loss:  5.879335930286852 Testing Accuracy:  6.261111111111111
Epoch:  1


100%|██████████| 2625/2625 [00:08<00:00, 292.14it/s]


Training Loss:  3.204434226161751 Training Accuracy:  11.197619047619048 Test Loss:  3.199621090109706 Testing Accuracy:  11.583333333333334
Epoch:  2


100%|██████████| 2625/2625 [00:07<00:00, 354.31it/s]


Training Loss:  2.705625126796811 Training Accuracy:  15.461904761904762 Test Loss:  2.699324047295184 Testing Accuracy:  16.0
Epoch:  3


100%|██████████| 2625/2625 [00:08<00:00, 292.61it/s]


Training Loss:  2.4513781231775766 Training Accuracy:  18.935714285714287 Test Loss:  2.445852257518362 Testing Accuracy:  19.477777777777778
Epoch:  4


100%|██████████| 2625/2625 [00:06<00:00, 395.62it/s]


Training Loss:  2.2637243664110174 Training Accuracy:  22.273809523809526 Test Loss:  2.259001699282617 Testing Accuracy:  22.738888888888887
Epoch:  5


100%|██████████| 2625/2625 [00:09<00:00, 266.08it/s]


Training Loss:  2.1160105977263335 Training Accuracy:  25.438095238095237 Test Loss:  2.1120269080284984 Testing Accuracy:  25.61111111111111


0,1
tr_accuracy,▁▃▄▆▇█
tr_loss,█▃▂▂▁▁
val_accuracy,▁▃▅▆▇█
val_loss,█▃▂▂▁▁

0,1
tr_accuracy,25.4381
tr_loss,2.11601
val_accuracy,25.61111
val_loss,2.11203


[34m[1mwandb[0m: Agent Starting Run: bpgkexur with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: random


Hyper parameters: 

Weight initialization type :  random
Optimizer :  adam
Learning rate (initial):  0.001
Batch size:  32
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  32  ; activation function:  tanh
Weight matrix dimention (32, 784) Bias vector dimention (32, 1)
----------------
Layer:  2  ; number of neurons:  32  ; activation function:  tanh
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  3  ; number of neurons:  32  ; activation function:  tanh
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  4  ; number of neurons:  32  ; activation function:  tanh
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  5  ; number of neurons:  32  ; activation function:  tanh
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  6  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 3

100%|██████████| 1312/1312 [00:06<00:00, 198.83it/s]


Training Loss:  1.5380576702374016 Training Accuracy:  49.30714285714286 Test Loss:  1.5453195124413 Testing Accuracy:  48.94444444444444
Epoch:  2


100%|██████████| 1312/1312 [00:05<00:00, 218.90it/s]


Training Loss:  1.1262958676546568 Training Accuracy:  60.21904761904762 Test Loss:  1.1604739849279242 Testing Accuracy:  59.266666666666666
Epoch:  3


100%|██████████| 1312/1312 [00:07<00:00, 175.72it/s]


Training Loss:  0.949906810730049 Training Accuracy:  65.31428571428572 Test Loss:  0.9817997169805143 Testing Accuracy:  64.3
Epoch:  4


100%|██████████| 1312/1312 [00:06<00:00, 214.59it/s]


Training Loss:  0.863203756669793 Training Accuracy:  68.6 Test Loss:  0.8852213769521848 Testing Accuracy:  68.07777777777778
Epoch:  5


100%|██████████| 1312/1312 [00:07<00:00, 183.77it/s]


Training Loss:  0.8009252235106314 Training Accuracy:  71.03809523809524 Test Loss:  0.8140342641040247 Testing Accuracy:  70.45555555555555


0,1
tr_accuracy,▁▆▇▇██
tr_loss,█▂▁▁▁▁
val_accuracy,▁▆▇▇██
val_loss,█▂▁▁▁▁

0,1
tr_accuracy,71.0381
tr_loss,0.80093
val_accuracy,70.45556
val_loss,0.81403


[34m[1mwandb[0m: Agent Starting Run: fsuxji17 with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: rmsprop
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: random


Hyper parameters: 

Weight initialization type :  random
Optimizer :  rmsprop
Learning rate (initial):  0.001
Batch size:  64
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  128  ; activation function:  ReLU
Weight matrix dimention (128, 784) Bias vector dimention (128, 1)
----------------
Layer:  2  ; number of neurons:  128  ; activation function:  ReLU
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  3  ; number of neurons:  128  ; activation function:  ReLU
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  4  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 128) Bias vector dimention (10, 1)
----------------
656.0

 Start of training
Training Loss:  nan Training Accuracy:  8.352380952380953 Test Loss:  nan Testing Accuracy:  8.36111111111111
Epoch:  1


100%|██████████| 656/656 [00:06<00:00, 108.95it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  2


100%|██████████| 656/656 [00:08<00:00, 78.98it/s] 


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  3


100%|██████████| 656/656 [00:05<00:00, 123.41it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  4


100%|██████████| 656/656 [00:07<00:00, 84.37it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778
Epoch:  5


100%|██████████| 656/656 [00:05<00:00, 112.50it/s]


Training Loss:  nan Training Accuracy:  10.073809523809524 Test Loss:  nan Testing Accuracy:  9.827777777777778


0,1
tr_accuracy,▁█████
val_accuracy,▁█████

0,1
tr_accuracy,10.07381
tr_loss,
val_accuracy,9.82778
val_loss,


[34m[1mwandb[0m: Agent Starting Run: dhclntrs with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: random


Hyper parameters: 

Weight initialization type :  random
Optimizer :  adam
Learning rate (initial):  0.001
Batch size:  16
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 784) Bias vector dimention (64, 1)
----------------
Layer:  2  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  3  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  4  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 64) Bias vector dimention (10, 1)
----------------
2625.0

 Start of training
Training Loss:  11.364976241674698 Training Accuracy:  10.902380952380952 Test Loss:  11.31980422338628 Testing Accuracy:  11.044444444444444
Epoch:  1


100%|██████████| 2625/2625 [00:12<00:00, 205.21it/s]


Training Loss:  1.2264357568873516 Training Accuracy:  64.53571428571429 Test Loss:  1.2609920976952913 Testing Accuracy:  63.922222222222224
Epoch:  2


100%|██████████| 2625/2625 [00:12<00:00, 204.96it/s]


Training Loss:  0.8907355030058243 Training Accuracy:  71.1595238095238 Test Loss:  0.9280666045592536 Testing Accuracy:  69.97777777777777
Epoch:  3


100%|██████████| 2625/2625 [00:12<00:00, 205.79it/s]


Training Loss:  0.7892977268269298 Training Accuracy:  74.68333333333334 Test Loss:  0.8166459923319038 Testing Accuracy:  74.15555555555555
Epoch:  4


100%|██████████| 2625/2625 [00:12<00:00, 204.42it/s]


Training Loss:  0.7184730603489191 Training Accuracy:  76.54047619047618 Test Loss:  0.7291812407137415 Testing Accuracy:  76.36666666666666
Epoch:  5


100%|██████████| 2625/2625 [00:14<00:00, 187.06it/s]


Training Loss:  0.6660964265714593 Training Accuracy:  78.66428571428571 Test Loss:  0.6951769538676701 Testing Accuracy:  78.03333333333333
Epoch:  6


100%|██████████| 2625/2625 [00:14<00:00, 182.65it/s]


Training Loss:  0.6584598335812198 Training Accuracy:  79.12619047619047 Test Loss:  0.6793256989624374 Testing Accuracy:  78.8
Epoch:  7


100%|██████████| 2625/2625 [00:14<00:00, 179.42it/s]


Training Loss:  0.6085861429423913 Training Accuracy:  80.82619047619048 Test Loss:  0.6352186603950201 Testing Accuracy:  80.09444444444445
Epoch:  8


100%|██████████| 2625/2625 [00:14<00:00, 181.43it/s]


Training Loss:  0.5748407852653108 Training Accuracy:  81.75714285714285 Test Loss:  0.6022291224288923 Testing Accuracy:  81.08333333333333
Epoch:  9


100%|██████████| 2625/2625 [00:14<00:00, 179.60it/s]


Training Loss:  0.5540484930355762 Training Accuracy:  81.86666666666666 Test Loss:  0.5860756688143558 Testing Accuracy:  81.27222222222223
Epoch:  10


100%|██████████| 2625/2625 [00:14<00:00, 184.74it/s]


Training Loss:  0.5236151417941648 Training Accuracy:  83.1952380952381 Test Loss:  0.5608251624746585 Testing Accuracy:  82.27777777777777


0,1
tr_accuracy,▁▆▇▇▇██████
tr_loss,█▁▁▁▁▁▁▁▁▁▁
val_accuracy,▁▆▇▇▇██████
val_loss,█▁▁▁▁▁▁▁▁▁▁

0,1
tr_accuracy,83.19524
tr_loss,0.52362
val_accuracy,82.27778
val_loss,0.56083


[34m[1mwandb[0m: Agent Starting Run: a30r1pcz with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  adam
Learning rate (initial):  0.0001
Batch size:  32
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  64  ; activation function:  sigmoid
Weight matrix dimention (64, 784) Bias vector dimention (64, 1)
----------------
Layer:  2  ; number of neurons:  64  ; activation function:  sigmoid
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  3  ; number of neurons:  64  ; activation function:  sigmoid
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  4  ; number of neurons:  64  ; activation function:  sigmoid
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  5  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 64) Bias vector dimention (10, 1)
----------------
1312.0

 Start of training
Training Loss:  2.568553534604648 Training Accuracy:  10

100%|██████████| 1312/1312 [00:09<00:00, 140.83it/s]


Training Loss:  1.7862769543490098 Training Accuracy:  28.46904761904762 Test Loss:  1.7875848210052623 Testing Accuracy:  28.727777777777778
Epoch:  2


100%|██████████| 1312/1312 [00:07<00:00, 179.70it/s]


Training Loss:  1.3924782828100826 Training Accuracy:  43.97142857142857 Test Loss:  1.3930699340696855 Testing Accuracy:  44.03888888888889
Epoch:  3


100%|██████████| 1312/1312 [00:08<00:00, 149.15it/s]


Training Loss:  1.1944620014342175 Training Accuracy:  53.49523809523809 Test Loss:  1.1944116563457738 Testing Accuracy:  53.083333333333336
Epoch:  4


100%|██████████| 1312/1312 [00:08<00:00, 162.81it/s]


Training Loss:  1.023905945322132 Training Accuracy:  58.24523809523809 Test Loss:  1.0258537334955282 Testing Accuracy:  58.18333333333333
Epoch:  5


100%|██████████| 1312/1312 [00:07<00:00, 166.26it/s]


Training Loss:  0.9218152961319154 Training Accuracy:  63.28333333333333 Test Loss:  0.9240890433081752 Testing Accuracy:  63.016666666666666
Epoch:  6


100%|██████████| 1312/1312 [00:08<00:00, 149.14it/s]


Training Loss:  0.8390029008851829 Training Accuracy:  68.4952380952381 Test Loss:  0.8417837022259466 Testing Accuracy:  68.16111111111111
Epoch:  7


100%|██████████| 1312/1312 [00:07<00:00, 181.47it/s]


Training Loss:  0.7786412366403439 Training Accuracy:  71.05238095238096 Test Loss:  0.7817807409460373 Testing Accuracy:  70.7
Epoch:  8


100%|██████████| 1312/1312 [00:09<00:00, 138.16it/s]


Training Loss:  0.7317997974434098 Training Accuracy:  73.36428571428571 Test Loss:  0.7354311280601825 Testing Accuracy:  73.06666666666666
Epoch:  9


100%|██████████| 1312/1312 [00:07<00:00, 187.28it/s]


Training Loss:  0.685554213424651 Training Accuracy:  75.78571428571429 Test Loss:  0.6902646662837562 Testing Accuracy:  75.37777777777778
Epoch:  10


100%|██████████| 1312/1312 [00:09<00:00, 133.33it/s]


Training Loss:  0.635578265714601 Training Accuracy:  77.93095238095238 Test Loss:  0.6410393247890231 Testing Accuracy:  77.46666666666667


0,1
tr_accuracy,▁▃▄▅▆▆▇▇███
tr_loss,█▅▄▃▂▂▂▂▁▁▁
val_accuracy,▁▃▅▅▆▇▇▇███
val_loss,█▅▄▃▂▂▂▂▁▁▁

0,1
tr_accuracy,77.93095
tr_loss,0.63558
val_accuracy,77.46667
val_loss,0.64104


[34m[1mwandb[0m: Agent Starting Run: n7aqz69x with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	weight_init: random


Hyper parameters: 

Weight initialization type :  random
Optimizer :  adam
Learning rate (initial):  0.001
Batch size:  32
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  128  ; activation function:  tanh
Weight matrix dimention (128, 784) Bias vector dimention (128, 1)
----------------
Layer:  2  ; number of neurons:  128  ; activation function:  tanh
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  3  ; number of neurons:  128  ; activation function:  tanh
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  4  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 128) Bias vector dimention (10, 1)
----------------
1312.0

 Start of training
Training Loss:  17.52340242667046 Training Accuracy:  8.704761904761904 Test Loss:  17.510005121220374 Testing Accuracy:  8.8
Epoch:  1


100%|██████████| 1312/1312 [00:16<00:00, 78.05it/s]


Training Loss:  1.2035320886965155 Training Accuracy:  65.74285714285715 Test Loss:  1.2601439728710426 Testing Accuracy:  64.7611111111111
Epoch:  2


100%|██████████| 1312/1312 [00:17<00:00, 76.04it/s]


Training Loss:  0.5834088301412274 Training Accuracy:  79.41190476190476 Test Loss:  0.5897104592385846 Testing Accuracy:  79.25555555555556
Epoch:  3


100%|██████████| 1312/1312 [00:17<00:00, 76.00it/s]


Training Loss:  0.5600781614132055 Training Accuracy:  81.08095238095238 Test Loss:  0.5623020425378236 Testing Accuracy:  81.20555555555555
Epoch:  4


100%|██████████| 1312/1312 [00:17<00:00, 75.74it/s]


Training Loss:  0.5520875554013639 Training Accuracy:  81.4095238095238 Test Loss:  0.5565108571244763 Testing Accuracy:  81.47222222222223
Epoch:  5


100%|██████████| 1312/1312 [00:17<00:00, 76.51it/s]


Training Loss:  0.5509734782864425 Training Accuracy:  81.41904761904762 Test Loss:  0.5556754167078727 Testing Accuracy:  81.46111111111111
Epoch:  6


100%|██████████| 1312/1312 [00:17<00:00, 75.44it/s]


Training Loss:  0.5502383936935521 Training Accuracy:  81.48333333333333 Test Loss:  0.5549575100320182 Testing Accuracy:  81.40555555555555
Epoch:  7


100%|██████████| 1312/1312 [00:16<00:00, 79.24it/s]


Training Loss:  0.5512911471826393 Training Accuracy:  81.42857142857143 Test Loss:  0.555971171756043 Testing Accuracy:  81.37777777777778
Epoch:  8


100%|██████████| 1312/1312 [00:17<00:00, 76.63it/s]


Training Loss:  0.5516464221903559 Training Accuracy:  81.47142857142858 Test Loss:  0.5564862349235113 Testing Accuracy:  81.36666666666666
Epoch:  9


100%|██████████| 1312/1312 [00:16<00:00, 78.53it/s]


Training Loss:  0.5525493908623639 Training Accuracy:  81.37142857142857 Test Loss:  0.5573932884273956 Testing Accuracy:  81.21666666666667
Epoch:  10


100%|██████████| 1312/1312 [00:17<00:00, 75.25it/s]


Training Loss:  0.5521030633657713 Training Accuracy:  81.42142857142858 Test Loss:  0.5570997191316187 Testing Accuracy:  81.24444444444444


0,1
tr_accuracy,▁▆█████████
tr_loss,█▁▁▁▁▁▁▁▁▁▁
val_accuracy,▁▆█████████
val_loss,█▁▁▁▁▁▁▁▁▁▁

0,1
tr_accuracy,81.42143
tr_loss,0.5521
val_accuracy,81.24444
val_loss,0.5571


[34m[1mwandb[0m: Agent Starting Run: nc7ivll7 with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  sgd
Learning rate (initial):  0.001
Batch size:  16
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  32  ; activation function:  ReLU
Weight matrix dimention (32, 784) Bias vector dimention (32, 1)
----------------
Layer:  2  ; number of neurons:  32  ; activation function:  ReLU
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  3  ; number of neurons:  32  ; activation function:  ReLU
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  4  ; number of neurons:  32  ; activation function:  ReLU
Weight matrix dimention (32, 32) Bias vector dimention (32, 1)
----------------
Layer:  5  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 32) Bias vector dimention (10, 1)
----------------
2625.0

 Start of training
Training Loss:  2.3125594199053388 Training Accuracy:  17.288095238095

100%|██████████| 2625/2625 [00:03<00:00, 656.67it/s]


Training Loss:  1.342383868754382 Training Accuracy:  54.82142857142857 Test Loss:  1.3434414264118892 Testing Accuracy:  55.46666666666667
Epoch:  2


100%|██████████| 2625/2625 [00:06<00:00, 396.94it/s]


Training Loss:  0.8788371452159108 Training Accuracy:  68.78095238095239 Test Loss:  0.8819502587469142 Testing Accuracy:  68.61666666666666
Epoch:  3


100%|██████████| 2625/2625 [00:03<00:00, 690.62it/s]


Training Loss:  0.7249406069625501 Training Accuracy:  74.14047619047619 Test Loss:  0.7314626147024312 Testing Accuracy:  74.02777777777777
Epoch:  4


100%|██████████| 2625/2625 [00:03<00:00, 658.18it/s]


Training Loss:  0.6533870964484045 Training Accuracy:  77.15 Test Loss:  0.6614864584429888 Testing Accuracy:  76.82777777777778
Epoch:  5


100%|██████████| 2625/2625 [00:05<00:00, 447.51it/s]


Training Loss:  0.6069897215722067 Training Accuracy:  78.9047619047619 Test Loss:  0.6158497560658518 Testing Accuracy:  78.60555555555555
Epoch:  6


100%|██████████| 2625/2625 [00:04<00:00, 549.63it/s]


Training Loss:  0.5740852583979017 Training Accuracy:  79.93333333333334 Test Loss:  0.5835994891795281 Testing Accuracy:  79.8
Epoch:  7


100%|██████████| 2625/2625 [00:03<00:00, 715.01it/s]


Training Loss:  0.5491428559473457 Training Accuracy:  80.80238095238096 Test Loss:  0.5593303899837951 Testing Accuracy:  80.59444444444445
Epoch:  8


100%|██████████| 2625/2625 [00:05<00:00, 512.19it/s]


Training Loss:  0.5285703258546374 Training Accuracy:  81.47380952380952 Test Loss:  0.539635523767501 Testing Accuracy:  81.49444444444444
Epoch:  9


100%|██████████| 2625/2625 [00:05<00:00, 515.64it/s]


Training Loss:  0.5110559903447147 Training Accuracy:  82.1 Test Loss:  0.5229689176895497 Testing Accuracy:  82.05
Epoch:  10


100%|██████████| 2625/2625 [00:03<00:00, 657.44it/s]


Training Loss:  0.49601330608031025 Training Accuracy:  82.62142857142857 Test Loss:  0.5087257328221073 Testing Accuracy:  82.58888888888889


0,1
tr_accuracy,▁▅▇▇▇██████
tr_loss,█▄▂▂▂▁▁▁▁▁▁
val_accuracy,▁▅▇▇▇██████
val_loss,█▄▂▂▂▁▁▁▁▁▁

0,1
tr_accuracy,82.62143
tr_loss,0.49601
val_accuracy,82.58889
val_loss,0.50873


[34m[1mwandb[0m: Agent Starting Run: 4r312rki with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: random


Hyper parameters: 

Weight initialization type :  random
Optimizer :  sgd
Learning rate (initial):  0.001
Batch size:  64
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 784) Bias vector dimention (64, 1)
----------------
Layer:  2  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  3  ; number of neurons:  64  ; activation function:  tanh
Weight matrix dimention (64, 64) Bias vector dimention (64, 1)
----------------
Layer:  4  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 64) Bias vector dimention (10, 1)
----------------
656.0

 Start of training
Training Loss:  12.88799326840419 Training Accuracy:  6.642857142857143 Test Loss:  12.956228169136923 Testing Accuracy:  6.611111111111111
Epoch:  1


100%|██████████| 656/656 [00:04<00:00, 160.69it/s]


Training Loss:  7.164555968123772 Training Accuracy:  17.238095238095237 Test Loss:  7.284384599028975 Testing Accuracy:  16.93888888888889
Epoch:  2


100%|██████████| 656/656 [00:05<00:00, 125.28it/s]


Training Loss:  5.4832346073848335 Training Accuracy:  26.154761904761905 Test Loss:  5.541674314850145 Testing Accuracy:  25.805555555555557
Epoch:  3


100%|██████████| 656/656 [00:03<00:00, 169.03it/s]


Training Loss:  4.606509114027385 Training Accuracy:  33.03809523809524 Test Loss:  4.723319670949541 Testing Accuracy:  32.26111111111111
Epoch:  4


100%|██████████| 656/656 [00:03<00:00, 168.75it/s]


Training Loss:  4.081577302222958 Training Accuracy:  37.76190476190476 Test Loss:  4.215022412071683 Testing Accuracy:  36.61666666666667
Epoch:  5


100%|██████████| 656/656 [00:05<00:00, 121.50it/s]


Training Loss:  3.7048441828867356 Training Accuracy:  41.09047619047619 Test Loss:  3.8303295980290106 Testing Accuracy:  39.95
Epoch:  6


100%|██████████| 656/656 [00:03<00:00, 174.62it/s]


Training Loss:  3.3995863301057923 Training Accuracy:  43.62619047619047 Test Loss:  3.5339543570474174 Testing Accuracy:  42.25555555555555
Epoch:  7


100%|██████████| 656/656 [00:04<00:00, 157.11it/s]


Training Loss:  3.145238513975745 Training Accuracy:  45.19285714285714 Test Loss:  3.2820564008213964 Testing Accuracy:  44.19444444444444
Epoch:  8


100%|██████████| 656/656 [00:04<00:00, 141.50it/s]


Training Loss:  2.939875986395154 Training Accuracy:  46.726190476190474 Test Loss:  3.0701084611103133 Testing Accuracy:  45.68333333333333
Epoch:  9


100%|██████████| 656/656 [00:03<00:00, 171.48it/s]


Training Loss:  2.778717020564313 Training Accuracy:  48.076190476190476 Test Loss:  2.9154390480458217 Testing Accuracy:  46.894444444444446
Epoch:  10


100%|██████████| 656/656 [00:04<00:00, 134.99it/s]


Training Loss:  2.6455774378576513 Training Accuracy:  49.19761904761905 Test Loss:  2.782450880681761 Testing Accuracy:  47.91111111111111


0,1
tr_accuracy,▁▃▄▅▆▇▇▇███
tr_loss,█▄▃▂▂▂▂▁▁▁▁
val_accuracy,▁▃▄▅▆▇▇▇███
val_loss,█▄▃▂▂▂▂▁▁▁▁

0,1
tr_accuracy,49.19762
tr_loss,2.64558
val_accuracy,47.91111
val_loss,2.78245


[34m[1mwandb[0m: Agent Starting Run: kxp0voce with config:
[34m[1mwandb[0m: 	activation: sigmoid
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: random


Hyper parameters: 

Weight initialization type :  random
Optimizer :  sgd
Learning rate (initial):  0.0001
Batch size:  16
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  128  ; activation function:  sigmoid
Weight matrix dimention (128, 784) Bias vector dimention (128, 1)
----------------
Layer:  2  ; number of neurons:  128  ; activation function:  sigmoid
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  3  ; number of neurons:  128  ; activation function:  sigmoid
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  4  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 128) Bias vector dimention (10, 1)
----------------
2625.0

 Start of training
Training Loss:  13.683287577455642 Training Accuracy:  10.16904761904762 Test Loss:  13.650882837561623 Testing Accuracy:  9.627777777777778
Epoch:  1


100%|██████████| 2625/2625 [00:13<00:00, 190.71it/s]


Training Loss:  7.328878132122278 Training Accuracy:  9.154761904761905 Test Loss:  7.28973645731155 Testing Accuracy:  8.955555555555556
Epoch:  2


100%|██████████| 2625/2625 [00:13<00:00, 189.19it/s]


Training Loss:  5.129618174904484 Training Accuracy:  9.742857142857142 Test Loss:  5.088765715708287 Testing Accuracy:  9.666666666666666
Epoch:  3


100%|██████████| 2625/2625 [00:12<00:00, 204.36it/s]


Training Loss:  4.053067786434212 Training Accuracy:  11.304761904761905 Test Loss:  4.0098180937635055 Testing Accuracy:  11.261111111111111
Epoch:  4


100%|██████████| 2625/2625 [00:12<00:00, 211.04it/s]


Training Loss:  3.509517417712524 Training Accuracy:  13.452380952380953 Test Loss:  3.4667622221677252 Testing Accuracy:  13.555555555555555
Epoch:  5


100%|██████████| 2625/2625 [00:13<00:00, 188.61it/s]


Training Loss:  3.161086601099011 Training Accuracy:  16.438095238095237 Test Loss:  3.121736141379396 Testing Accuracy:  16.944444444444443
Epoch:  6


100%|██████████| 2625/2625 [00:14<00:00, 184.11it/s]


Training Loss:  2.8945110387921757 Training Accuracy:  19.533333333333335 Test Loss:  2.8591811217312997 Testing Accuracy:  20.305555555555557
Epoch:  7


100%|██████████| 2625/2625 [00:12<00:00, 204.51it/s]


Training Loss:  2.683521561586245 Training Accuracy:  22.435714285714287 Test Loss:  2.651857485480431 Testing Accuracy:  23.16111111111111
Epoch:  8


100%|██████████| 2625/2625 [00:12<00:00, 212.15it/s]


Training Loss:  2.5139121000702866 Training Accuracy:  24.97142857142857 Test Loss:  2.485659435723276 Testing Accuracy:  25.905555555555555
Epoch:  9


100%|██████████| 2625/2625 [00:12<00:00, 213.63it/s]


Training Loss:  2.3753244621510197 Training Accuracy:  27.583333333333332 Test Loss:  2.3503153404007886 Testing Accuracy:  28.483333333333334
Epoch:  10


100%|██████████| 2625/2625 [00:12<00:00, 205.31it/s]


Training Loss:  2.2603532054315885 Training Accuracy:  29.885714285714286 Test Loss:  2.2382151782994284 Testing Accuracy:  30.677777777777777


0,1
tr_accuracy,▁▁▁▂▂▃▅▅▆▇█
tr_loss,█▄▃▂▂▂▁▁▁▁▁
val_accuracy,▁▁▁▂▂▄▅▆▆▇█
val_loss,█▄▃▂▂▂▁▁▁▁▁

0,1
tr_accuracy,29.88571
tr_loss,2.26035
val_accuracy,30.67778
val_loss,2.23822


[34m[1mwandb[0m: Agent Starting Run: 1m591a5c with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	weight_init: random


Hyper parameters: 

Weight initialization type :  random
Optimizer :  nadam
Learning rate (initial):  0.0001
Batch size:  32
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  128  ; activation function:  tanh
Weight matrix dimention (128, 784) Bias vector dimention (128, 1)
----------------
Layer:  2  ; number of neurons:  128  ; activation function:  tanh
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  3  ; number of neurons:  128  ; activation function:  tanh
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  4  ; number of neurons:  128  ; activation function:  tanh
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  5  ; number of neurons:  128  ; activation function:  tanh
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  6  ; number of neurons:  10  ; activation function:  softmax
Weight ma

100%|██████████| 1312/1312 [00:24<00:00, 53.11it/s]


Training Loss:  11.506880574767195 Training Accuracy:  19.19047619047619 Test Loss:  11.737911828889228 Testing Accuracy:  18.727777777777778
Epoch:  2


100%|██████████| 1312/1312 [00:23<00:00, 54.69it/s]


Training Loss:  9.38935136926415 Training Accuracy:  25.55 Test Loss:  9.519134081693737 Testing Accuracy:  25.622222222222224
Epoch:  3


100%|██████████| 1312/1312 [00:23<00:00, 56.54it/s]


Training Loss:  7.946456634159856 Training Accuracy:  31.090476190476192 Test Loss:  8.226846430725432 Testing Accuracy:  30.194444444444443
Epoch:  4


100%|██████████| 1312/1312 [00:25<00:00, 51.88it/s]


Training Loss:  7.085071394494339 Training Accuracy:  34.92857142857143 Test Loss:  7.329385009089714 Testing Accuracy:  33.861111111111114
Epoch:  5


100%|██████████| 1312/1312 [00:25<00:00, 51.28it/s]


Training Loss:  6.413775152170178 Training Accuracy:  37.66428571428571 Test Loss:  6.814313491078311 Testing Accuracy:  36.06666666666667
Epoch:  6


100%|██████████| 1312/1312 [00:26<00:00, 49.44it/s]


Training Loss:  5.962051647415354 Training Accuracy:  40.09761904761905 Test Loss:  6.222192223953702 Testing Accuracy:  39.05
Epoch:  7


100%|██████████| 1312/1312 [00:23<00:00, 54.76it/s]


Training Loss:  5.601440535201201 Training Accuracy:  42.00714285714286 Test Loss:  5.837951113417774 Testing Accuracy:  41.24444444444445
Epoch:  8


100%|██████████| 1312/1312 [00:24<00:00, 52.88it/s]


Training Loss:  5.219423761203196 Training Accuracy:  44.05238095238095 Test Loss:  5.451105285416694 Testing Accuracy:  43.03888888888889
Epoch:  9


100%|██████████| 1312/1312 [00:25<00:00, 51.55it/s]


Training Loss:  5.006987047785274 Training Accuracy:  45.147619047619045 Test Loss:  5.216871116949472 Testing Accuracy:  43.98888888888889
Epoch:  10


100%|██████████| 1312/1312 [00:24<00:00, 52.81it/s]


Training Loss:  4.66670005504201 Training Accuracy:  46.9 Test Loss:  4.984692945805736 Testing Accuracy:  45.394444444444446


0,1
tr_accuracy,▁▃▄▅▆▆▇▇▇██
tr_loss,█▅▄▃▃▂▂▂▁▁▁
val_accuracy,▁▃▄▅▆▆▇▇███
val_loss,█▅▄▃▃▂▂▂▁▁▁

0,1
tr_accuracy,46.9
tr_loss,4.6667
val_accuracy,45.39444
val_loss,4.98469


[34m[1mwandb[0m: Agent Starting Run: 580y9poi with config:
[34m[1mwandb[0m: 	activation: ReLU
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	weight_decay: 0.0005
[34m[1mwandb[0m: 	weight_init: Xavier


Hyper parameters: 

Weight initialization type :  Xavier
Optimizer :  adam
Learning rate (initial):  0.001
Batch size:  64
-------------------
Architecture Description:

Layer:  1  ; number of neurons:  128  ; activation function:  ReLU
Weight matrix dimention (128, 784) Bias vector dimention (128, 1)
----------------
Layer:  2  ; number of neurons:  128  ; activation function:  ReLU
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  3  ; number of neurons:  128  ; activation function:  ReLU
Weight matrix dimention (128, 128) Bias vector dimention (128, 1)
----------------
Layer:  4  ; number of neurons:  10  ; activation function:  softmax
Weight matrix dimention (10, 128) Bias vector dimention (10, 1)
----------------
656.0

 Start of training
Training Loss:  2.320280754227305 Training Accuracy:  13.145238095238096 Test Loss:  2.318420903143096 Testing Accuracy:  13.63888888888889
Epoch:  1


100%|██████████| 656/656 [00:07<00:00, 92.25it/s] 


Training Loss:  0.38018989759103444 Training Accuracy:  86.3452380952381 Test Loss:  0.39273884794147557 Testing Accuracy:  85.86666666666666
Epoch:  2


100%|██████████| 656/656 [00:07<00:00, 84.65it/s]


Training Loss:  0.34685387019001784 Training Accuracy:  87.74285714285715 Test Loss:  0.3787306088172713 Testing Accuracy:  86.95555555555555
Epoch:  3


100%|██████████| 656/656 [00:07<00:00, 89.46it/s]


Training Loss:  0.3522178941845759 Training Accuracy:  88.06666666666666 Test Loss:  0.40205355607856624 Testing Accuracy:  87.12777777777778
Epoch:  4


100%|██████████| 656/656 [00:16<00:00, 39.36it/s]


Training Loss:  0.33687454965225955 Training Accuracy:  88.58333333333333 Test Loss:  0.40419474088051793 Testing Accuracy:  87.35
Epoch:  5


100%|██████████| 656/656 [00:19<00:00, 33.37it/s]


Training Loss:  0.36024016537757847 Training Accuracy:  88.45476190476191 Test Loss:  0.44725306619263183 Testing Accuracy:  87.08333333333333
Epoch:  6


100%|██████████| 656/656 [00:21<00:00, 30.91it/s]


Training Loss:  0.3689184530134282 Training Accuracy:  88.53333333333333 Test Loss:  0.46323996447566446 Testing Accuracy:  87.07222222222222
Epoch:  7


100%|██████████| 656/656 [00:23<00:00, 27.45it/s]


Training Loss:  0.34238236838337627 Training Accuracy:  89.02857142857142 Test Loss:  0.4489991048010819 Testing Accuracy:  87.56666666666666
Epoch:  8


100%|██████████| 656/656 [00:27<00:00, 24.17it/s]


Training Loss:  0.351444299763602 Training Accuracy:  89.3952380952381 Test Loss:  0.47387908723899275 Testing Accuracy:  87.74444444444444
Epoch:  9


100%|██████████| 656/656 [00:30<00:00, 21.18it/s]


Training Loss:  0.3429732450306472 Training Accuracy:  89.08809523809524 Test Loss:  0.45653488817097637 Testing Accuracy:  87.28888888888889
Epoch:  10


100%|██████████| 656/656 [00:36<00:00, 17.98it/s]


Training Loss:  0.35470441016192633 Training Accuracy:  89.52857142857142 Test Loss:  0.49695524725381385 Testing Accuracy:  87.45


0,1
tr_accuracy,▁██████████
tr_loss,█▁▁▁▁▁▁▁▁▁▁
val_accuracy,▁██████████
val_loss,█▁▁▁▁▁▁▁▁▁▁

0,1
tr_accuracy,89.52857
tr_loss,0.3547
val_accuracy,87.45
val_loss,0.49696
