In [2]:
#!/usr/bin/env python3
import os
import numpy
from numpy import random

# import scipy
import matplotlib
import pickle

# matplotlib.use("agg")
from matplotlib import pyplot as plt

import torch
import torchvision

# you may wish to import other things like nn
from torch.utils.data import DataLoader
import torch.nn as nn

# hyperparameter settings and other constants
batch_size = 100
num_classes = 10
epochs = 10
mnist_input_shape = (28, 28, 1)
d1 = 1024
d2 = 256
alpha = 0.1
beta = 0.9
alpha_adam = 0.001
rho1 = 0.99
rho2 = 0.999
# end hyperparameter settings


# load the MNIST dataset using TensorFlow/Keras
def load_MNIST_dataset():
    train_dataset = torchvision.datasets.MNIST(
        root="./data",
        train=True,
        transform=torchvision.transforms.ToTensor(),
        download=True,
    )
    test_dataset = torchvision.datasets.MNIST(
        root="./data",
        train=False,
        transform=torchvision.transforms.ToTensor(),
        download=False,
    )
    return (train_dataset, test_dataset)


# construct dataloaders for the MNIST dataset
#
# train_dataset        input train dataset (output of load_MNIST_dataset)
# test_dataset         input test dataset (output of load_MNIST_dataset)
# batch_size           batch size for training
# shuffle_train        boolean: whether to shuffle the training dataset
#
# returns              tuple of (train_dataloader, test_dataloader)
#     each component of the tuple should be a torch.utils.data.DataLoader object
#     for the corresponding training set;
#     use the specified batch_size and shuffle_train values for the training DataLoader;
#     use a batch size of 100 and no shuffling for the test data loader


def construct_dataloaders(train_dataset, test_dataset, batch_size, shuffle_train=True):
    train_dataloader = DataLoader(
        train_dataset, batch_size=batch_size, shuffle=shuffle_train
    )
    test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
    return train_dataloader, test_dataloader


# evaluate a trained model on MNIST data
#
# dataloader    dataloader of examples to evaluate on
# model         trained PyTorch model
# loss_fn       loss function (e.g. nn.CrossEntropyLoss)
#
# returns       tuple of (loss, accuracy), both python floats
@torch.no_grad()
def evaluate_model(dataloader, model, loss_fn):
    model.eval()
    loss = correct = total = 0
    for inputs, labels in dataloader:
        output = model(inputs)
        loss += len(labels) * loss_fn(output, labels).item()
        correct += sum(torch.argmax(output, dim=1) == labels)
        total += output.shape[0]
    avg_loss = loss / total
    acc = correct / total
    return avg_loss, acc


def make_fully_connected_model_part1_1():
    return nn.Sequential(
        nn.Flatten(),
        nn.LazyLinear(out_features=1024),
        nn.ReLU(),
        nn.Linear(in_features=1024, out_features=256),
        nn.ReLU(),
        nn.Linear(in_features=256, out_features=10),
    )

    # build a fully connected two-hidden-layer neural network with Batch Norm, as in Part 1.4
    # use the default initialization for the parameters provided in PyTorch
    # returns   a new model of type nn.Sequential


def make_fully_connected_model_part1_4():
    return nn.Sequential(
        nn.Flatten(),
        nn.LazyLinear(out_features=1024),
        nn.BatchNorm1d(1024),
        nn.ReLU(),
        nn.Linear(in_features=1024, out_features=256),
        nn.BatchNorm1d(256),
        nn.ReLU(),
        nn.Linear(in_features=256, out_features=10),
    )


# build a convolutional neural network, as in Part 3.1
# use the default initialization for the parameters provided in PyTorch
#
# returns   a new model of type nn.Sequential


def make_cnn_model_part3_1():
    return nn.Sequential(
        nn.LazyConv2d(16, (3, 3), 1, 0),
        nn.LazyBatchNorm2d(),
        nn.ReLU(),
        nn.LazyConv2d(16, (3, 3), 1, 0),
        nn.LazyBatchNorm2d(),
        nn.ReLU(),
        nn.MaxPool2d((2, 2)),
        nn.LazyConv2d(32, (3, 3), 1, 0),
        nn.LazyBatchNorm2d(),
        nn.ReLU(),
        nn.MaxPool2d((2, 2)),
        nn.Flatten(),
        nn.LazyLinear(out_features=128),
        nn.ReLU(),
        nn.Linear(128, 10),
    )

    # train a neural network on MNIST data
    #     be sure to call model.train() before training and model.eval() before evaluating!
    #
    # train_dataloader   training dataloader
    # test_dataloader    test dataloader
    # model              dnn model to be trained (training should mutate this)
    # loss_fn            loss function
    # optimizer          an optimizer that inherits from torch.optim.Optimizer
    # epochs             number of epochs to run
    # eval_train_stats   boolean; whether to evaluate statistics on training set each epoch
    # eval_test_stats    boolean; whether to evaluate statistics on test set each epoch
    #
    # returns   a tuple of
    #   train_loss       an array of length `epochs` containing the training loss after each epoch, or [] if eval_train_stats == False
    #   train_acc        an array of length `epochs` containing the training accuracy after each epoch, or [] if eval_train_stats == False
    #   test_loss        an array of length `epochs` containing the test loss after each epoch, or [] if eval_test_stats == False
    #   test_acc         an array of length `epochs` containing the test accuracy after each epoch, or [] if eval_test_stats == False
    #   approx_tr_loss   an array of length `epochs` containing the average training loss of examples processed in this epoch
    #   approx_tr_acc    an array of length `epochs` containing the average training accuracy of examples processed in this epoch

from tqdm import tqdm
def train(
    train_dataloader,
    test_dataloader,
    model,
    loss_fn,
    optimizer,
    epochs,
    eval_train_stats=True,
    eval_test_stats=True,
):
    train_loss, train_acc, test_loss, test_acc, approx_tr_loss, approx_tr_acc = (
        [],
        [],
        [],
        [],
        [],
        [],
    )
    for _ in tqdm(range(epochs)):
        model.train()
        total_loss = total_correct = total_examples = 0
        for batch_input, batch_labels in train_dataloader:
            optimizer.zero_grad()
            preds = model(batch_input)
            loss = loss_fn(preds, batch_labels)
            loss.backward()
            optimizer.step()
            total_loss += len(batch_labels) * loss.item()
            total_correct += sum(torch.argmax(preds, dim=1) == batch_labels)
            total_examples += len(batch_labels)
        approx_tr_loss.append(total_loss / total_examples)
        approx_tr_acc.append(total_correct / total_examples)
        if eval_train_stats:
            loss, acc = evaluate_model(train_dataloader, model, loss_fn)
            train_loss.append(loss)
            train_acc.append(acc)
        if eval_test_stats:
            loss, acc = evaluate_model(test_dataloader, model, loss_fn)
            test_loss.append(loss)
            test_acc.append(acc)
    return (train_loss, train_acc, test_loss, test_acc, approx_tr_loss, approx_tr_acc)



In [3]:
(train_dataset, test_dataset) = load_MNIST_dataset()
train_dataloader, test_dataloader = construct_dataloaders(
    train_dataset, test_dataset, 100
)


Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz to ./data/MNIST/raw/train-images-idx3-ubyte.gz


100%|██████████| 9912422/9912422 [00:00<00:00, 139331592.61it/s]

Extracting ./data/MNIST/raw/train-images-idx3-ubyte.gz to ./data/MNIST/raw






Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz to ./data/MNIST/raw/train-labels-idx1-ubyte.gz


100%|██████████| 28881/28881 [00:00<00:00, 108156869.49it/s]


Extracting ./data/MNIST/raw/train-labels-idx1-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz to ./data/MNIST/raw/t10k-images-idx3-ubyte.gz


100%|██████████| 1648877/1648877 [00:00<00:00, 35698976.90it/s]

Extracting ./data/MNIST/raw/t10k-images-idx3-ubyte.gz to ./data/MNIST/raw






Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz to ./data/MNIST/raw/t10k-labels-idx1-ubyte.gz


100%|██████████| 4542/4542 [00:00<00:00, 18057373.24it/s]


Extracting ./data/MNIST/raw/t10k-labels-idx1-ubyte.gz to ./data/MNIST/raw



In [None]:
# SGD with Momentum
epochs = 10
lrs = [1.0,0.3,0.1,0.03,0.01,0.003,0.001]
for lr in lrs:
  model_1_2 = make_fully_connected_model_part1_1()
  cross_entropy = nn.CrossEntropyLoss()
  momentum_opt = torch.optim.SGD(model_1_2.parameters(), lr=lr, momentum=0.9)
  train_loss, train_acc, test_loss, test_acc, approx_tr_loss, approx_tr_acc = train(
      train_dataloader, test_dataloader, model_1_2, cross_entropy, momentum_opt, epochs,
      eval_train_stats=False,
    eval_test_stats=False,
  )
  test_loss, test_acc = evaluate_model(test_dataloader, model_1_2, cross_entropy)
  print(f'Learning Rate: {lr}')
  print(f'Val Loss: {test_loss}, Val Accuracy: {test_acc}')
  print(f'Moving on to next one...')



100%|██████████| 10/10 [04:45<00:00, 28.54s/it]


Learning Rate: 1.0
Val Loss: 2.3252865505218505, Val Accuracy: 0.10279999673366547
Moving on to next one...


100%|██████████| 10/10 [04:10<00:00, 25.00s/it]


Learning Rate: 0.3
Val Loss: 0.14741767410472675, Val Accuracy: 0.9682999849319458
Moving on to next one...


100%|██████████| 10/10 [03:32<00:00, 21.21s/it]


Learning Rate: 0.1
Val Loss: 0.06442476138630354, Val Accuracy: 0.9836999773979187
Moving on to next one...


100%|██████████| 10/10 [03:17<00:00, 19.76s/it]


Learning Rate: 0.03
Val Loss: 0.0655407006772748, Val Accuracy: 0.9818000197410583
Moving on to next one...


100%|██████████| 10/10 [03:16<00:00, 19.68s/it]


Learning Rate: 0.01
Val Loss: 0.06534248367184774, Val Accuracy: 0.9801999926567078
Moving on to next one...


100%|██████████| 10/10 [03:13<00:00, 19.34s/it]


Learning Rate: 0.003
Val Loss: 0.1296159452246502, Val Accuracy: 0.961899995803833
Moving on to next one...


100%|██████████| 10/10 [03:07<00:00, 18.70s/it]


Learning Rate: 0.001
Val Loss: 0.2455887557566166, Val Accuracy: 0.9305999875068665
Moving on to next one...


In [5]:
def batch_norm(width = 1024, narrow_width = 256):
    return torch.nn.Sequential(
        torch.nn.Flatten(),
        torch.nn.LazyLinear(out_features=width),
        torch.nn.BatchNorm1d(width),
        torch.nn.ReLU(),
        torch.nn.Linear(in_features=width, out_features=narrow_width),
        torch.nn.BatchNorm1d(narrow_width),
        torch.nn.ReLU(),
        torch.nn.Linear(in_features=narrow_width, out_features=10),
		torch.nn.BatchNorm1d(10)
    )


In [9]:
# Perform random search
best_score = float('-inf')
best_hyperparameters = None
num_trials = 10
lr_bound = [0,0.1]
beta_bound = [0.8,1]
width_bound = [512, 2048]
narrow_bound = [128, 512]
from time import time
start = time()

for _ in range(num_trials):
    learning_rate = random.uniform(lr_bound[0],lr_bound[1])
    beta = random.uniform(beta_bound[0],beta_bound[1])
    width = random.randint(width_bound[0],width_bound[1])
    narrow = random.randint(narrow_bound[0],narrow_bound[1])
    model_1_4 = batch_norm(width=width, narrow_width=narrow)
    cross_entropy = torch.nn.CrossEntropyLoss()
    momentum_opt = torch.optim.SGD(
        model_1_4.parameters(), lr=learning_rate, momentum=beta)
    train_loss, train_acc, test_loss, test_acc, approx_tr_loss, approx_tr_acc = train(
        train_dataloader, test_dataloader, model_1_4, cross_entropy, momentum_opt, epochs,
          eval_train_stats=False,
      eval_test_stats=False,
    )
    test_loss, test_acc = evaluate_model(test_dataloader, model_1_4, cross_entropy)
    print(f'lr: {learning_rate}, beta: {beta}, width: {width}, narrow: {narrow}')
    print(f'Val Loss: {test_loss}, Val Accuracy: {test_acc}')
    print(f'Moving on to next one...')

    if test_acc > best_score:
        best_score = test_acc
        best_hyperparameters = (learning_rate, beta, width, narrow)
final = time()
difference = final - start
print(time)
print("Best hyperparameters:")
print("Learning Rate:", best_hyperparameters[0])
print("Beta:", best_hyperparameters[1])
print("Width:", best_hyperparameters[2])
print("Narrow:", best_hyperparameters[3])


100%|██████████| 10/10 [02:12<00:00, 13.23s/it]


lr: 0.01674814374014366, beta: 0.9834734024541815, width: 1293, narrow: 170
Val Loss: 0.05530764216899115, Val Accuracy: 0.9857000112533569
Moving on to next one...


100%|██████████| 10/10 [02:50<00:00, 17.04s/it]


lr: 0.042571575457360025, beta: 0.81840533445735, width: 1547, narrow: 348
Val Loss: 0.051586740053026006, Val Accuracy: 0.9855999946594238
Moving on to next one...


100%|██████████| 10/10 [01:56<00:00, 11.64s/it]


lr: 0.06093236661340029, beta: 0.8386568249043684, width: 692, narrow: 355
Val Loss: 0.049467295781651044, Val Accuracy: 0.986299991607666
Moving on to next one...


100%|██████████| 10/10 [02:37<00:00, 15.79s/it]


lr: 0.07341698833516962, beta: 0.8410073829111946, width: 1616, narrow: 164
Val Loss: 0.048631352997472274, Val Accuracy: 0.9854000210762024
Moving on to next one...


100%|██████████| 10/10 [01:56<00:00, 11.69s/it]


lr: 0.09507807065112249, beta: 0.803889418856422, width: 588, narrow: 392
Val Loss: 0.054211344709183325, Val Accuracy: 0.9843000173568726
Moving on to next one...


100%|██████████| 10/10 [02:20<00:00, 14.09s/it]


lr: 0.027602412978517434, beta: 0.8708177121189624, width: 1086, narrow: 334
Val Loss: 0.049394558702479115, Val Accuracy: 0.9848999977111816
Moving on to next one...


100%|██████████| 10/10 [02:08<00:00, 12.86s/it]


lr: 0.015113245287413491, beta: 0.8508672785548237, width: 850, narrow: 348
Val Loss: 0.04992826288915239, Val Accuracy: 0.9855999946594238
Moving on to next one...


100%|██████████| 10/10 [02:46<00:00, 16.65s/it]


lr: 0.06414799715802426, beta: 0.972671549222937, width: 1650, narrow: 260
Val Loss: 0.05369049780962087, Val Accuracy: 0.984499990940094
Moving on to next one...


100%|██████████| 10/10 [02:04<00:00, 12.40s/it]


lr: 0.023120842423648636, beta: 0.912110449445107, width: 843, narrow: 323
Val Loss: 0.04360672566312132, Val Accuracy: 0.9866999983787537
Moving on to next one...


100%|██████████| 10/10 [01:45<00:00, 10.58s/it]


lr: 0.0017033288965179084, beta: 0.9359535177919398, width: 527, narrow: 355
Val Loss: 0.062222722796723244, Val Accuracy: 0.9829000234603882
Moving on to next one...
<built-in function time>
Best hyperparameters:
Learning Rate: 0.023120842423648636
Beta: 0.912110449445107
Width: 843
Narrow: 323


In [13]:
print(f'Time: {difference}')

Time: 1374.897782087326


In [None]:
betas = [0.95,0.9,0.85, 0.75]
widths = [4096, 2048, 1024, 512]
narrow_widths = [512, 256, 128, 64]
# batch norm
epochs = 10
for beta in betas:
  for width in widths:
    for narrow in narrow_widths:
      model_1_4 = batch_norm(width=width, narrow_width=narrow)
      cross_entropy = torch.nn.CrossEntropyLoss()
      momentum_opt = torch.optim.SGD(
          model_1_4.parameters(), lr=.001, momentum=beta)
      train_loss, train_acc, test_loss, test_acc, approx_tr_loss, approx_tr_acc = train(
          train_dataloader, test_dataloader, model_1_4, cross_entropy, momentum_opt, epochs,
            eval_train_stats=False,
        eval_test_stats=False,
      )
      test_loss, test_acc = evaluate_model(test_dataloader, model_1_4, cross_entropy)
      print(f'beta: {beta}, width: {width}, narrow: {narrow}')
      print(f'Val Loss: {test_loss}, Val Accuracy: {test_acc}')
      print(f'Moving on to next one...')




100%|██████████| 10/10 [10:08<00:00, 60.86s/it]


beta: 0.95, width: 4096, narrow: 512
Val Loss: 0.06053978884592652, Val Accuracy: 0.9832000136375427
Moving on to next one...


100%|██████████| 10/10 [07:36<00:00, 45.66s/it]


beta: 0.95, width: 4096, narrow: 256
Val Loss: 0.059152940274216234, Val Accuracy: 0.9843999743461609
Moving on to next one...


100%|██████████| 10/10 [06:48<00:00, 40.86s/it]


beta: 0.95, width: 4096, narrow: 128
Val Loss: 0.05705431597307324, Val Accuracy: 0.984499990940094
Moving on to next one...


100%|██████████| 10/10 [06:35<00:00, 39.55s/it]


beta: 0.95, width: 4096, narrow: 64
Val Loss: 0.06343436273280531, Val Accuracy: 0.9836999773979187
Moving on to next one...


100%|██████████| 10/10 [05:27<00:00, 32.80s/it]


beta: 0.95, width: 2048, narrow: 512
Val Loss: 0.060285219098441305, Val Accuracy: 0.9851999878883362
Moving on to next one...


100%|██████████| 10/10 [04:38<00:00, 27.81s/it]


beta: 0.95, width: 2048, narrow: 256
Val Loss: 0.06125199325848371, Val Accuracy: 0.984499990940094
Moving on to next one...


100%|██████████| 10/10 [04:21<00:00, 26.13s/it]


beta: 0.95, width: 2048, narrow: 128
Val Loss: 0.06061399092897773, Val Accuracy: 0.9847999811172485
Moving on to next one...


100%|██████████| 10/10 [04:10<00:00, 25.09s/it]


beta: 0.95, width: 2048, narrow: 64
Val Loss: 0.06357972434721887, Val Accuracy: 0.9822999835014343
Moving on to next one...


100%|██████████| 10/10 [03:41<00:00, 22.16s/it]


beta: 0.95, width: 1024, narrow: 512
Val Loss: 0.06386022735852748, Val Accuracy: 0.9825999736785889
Moving on to next one...


100%|██████████| 10/10 [03:12<00:00, 19.26s/it]


beta: 0.95, width: 1024, narrow: 256
Val Loss: 0.06453188241925091, Val Accuracy: 0.9832000136375427
Moving on to next one...


100%|██████████| 10/10 [03:04<00:00, 18.47s/it]


beta: 0.95, width: 1024, narrow: 128
Val Loss: 0.06495889911893755, Val Accuracy: 0.983299970626831
Moving on to next one...


100%|██████████| 10/10 [03:03<00:00, 18.30s/it]


beta: 0.95, width: 1024, narrow: 64
Val Loss: 0.06675260317046196, Val Accuracy: 0.9815999865531921
Moving on to next one...


100%|██████████| 10/10 [02:51<00:00, 17.14s/it]


beta: 0.95, width: 512, narrow: 512
Val Loss: 0.06244891137350351, Val Accuracy: 0.9839000105857849
Moving on to next one...


100%|██████████| 10/10 [02:35<00:00, 15.53s/it]


beta: 0.95, width: 512, narrow: 256
Val Loss: 0.061620734822936354, Val Accuracy: 0.9847999811172485
Moving on to next one...


100%|██████████| 10/10 [02:27<00:00, 14.74s/it]


beta: 0.95, width: 512, narrow: 128
Val Loss: 0.06846960379742086, Val Accuracy: 0.982699990272522
Moving on to next one...


100%|██████████| 10/10 [02:24<00:00, 14.49s/it]


beta: 0.95, width: 512, narrow: 64
Val Loss: 0.06637632611673325, Val Accuracy: 0.9821000099182129
Moving on to next one...


100%|██████████| 10/10 [09:12<00:00, 55.26s/it]


beta: 0.9, width: 4096, narrow: 512
Val Loss: 0.07413634128868579, Val Accuracy: 0.984000027179718
Moving on to next one...


100%|██████████| 10/10 [07:48<00:00, 46.83s/it]


beta: 0.9, width: 4096, narrow: 256
Val Loss: 0.07466410583816468, Val Accuracy: 0.9829000234603882
Moving on to next one...


100%|██████████| 10/10 [06:54<00:00, 41.42s/it]


beta: 0.9, width: 4096, narrow: 128
Val Loss: 0.07545837223529815, Val Accuracy: 0.9821000099182129
Moving on to next one...


100%|██████████| 10/10 [06:33<00:00, 39.32s/it]


beta: 0.9, width: 4096, narrow: 64
Val Loss: 0.07614178190007806, Val Accuracy: 0.9815999865531921
Moving on to next one...


100%|██████████| 10/10 [05:30<00:00, 33.07s/it]


beta: 0.9, width: 2048, narrow: 512
Val Loss: 0.07857122071087361, Val Accuracy: 0.9828000068664551
Moving on to next one...


100%|██████████| 10/10 [04:40<00:00, 28.08s/it]


beta: 0.9, width: 2048, narrow: 256
Val Loss: 0.07811137652955949, Val Accuracy: 0.9804999828338623
Moving on to next one...


100%|██████████| 10/10 [04:20<00:00, 26.03s/it]


beta: 0.9, width: 2048, narrow: 128
Val Loss: 0.07987643690779805, Val Accuracy: 0.9818000197410583
Moving on to next one...


100%|██████████| 10/10 [04:19<00:00, 25.93s/it]


beta: 0.9, width: 2048, narrow: 64
Val Loss: 0.08065251032821834, Val Accuracy: 0.9807999730110168
Moving on to next one...


100%|██████████| 10/10 [03:50<00:00, 23.03s/it]


beta: 0.9, width: 1024, narrow: 512
Val Loss: 0.07792651490308344, Val Accuracy: 0.9835000038146973
Moving on to next one...


100%|██████████| 10/10 [03:20<00:00, 20.03s/it]


beta: 0.9, width: 1024, narrow: 256
Val Loss: 0.07979533201083541, Val Accuracy: 0.9812999963760376
Moving on to next one...


100%|██████████| 10/10 [03:03<00:00, 18.31s/it]


beta: 0.9, width: 1024, narrow: 128
Val Loss: 0.07965950782410801, Val Accuracy: 0.9824000000953674
Moving on to next one...


100%|██████████| 10/10 [02:57<00:00, 17.74s/it]


beta: 0.9, width: 1024, narrow: 64
Val Loss: 0.0826896638981998, Val Accuracy: 0.980400025844574
Moving on to next one...


100%|██████████| 10/10 [02:48<00:00, 16.86s/it]


beta: 0.9, width: 512, narrow: 512
Val Loss: 0.08080509747378528, Val Accuracy: 0.9815999865531921
Moving on to next one...


100%|██████████| 10/10 [02:34<00:00, 15.47s/it]


beta: 0.9, width: 512, narrow: 256
Val Loss: 0.08322364912368357, Val Accuracy: 0.9805999994277954
Moving on to next one...


100%|██████████| 10/10 [02:28<00:00, 14.81s/it]


beta: 0.9, width: 512, narrow: 128
Val Loss: 0.08598860112950206, Val Accuracy: 0.9799000024795532
Moving on to next one...


100%|██████████| 10/10 [02:24<00:00, 14.42s/it]


beta: 0.9, width: 512, narrow: 64
Val Loss: 0.08403270945884288, Val Accuracy: 0.9812999963760376
Moving on to next one...


100%|██████████| 10/10 [09:24<00:00, 56.47s/it]


beta: 0.85, width: 4096, narrow: 512
Val Loss: 0.09009105350822211, Val Accuracy: 0.9825999736785889
Moving on to next one...


100%|██████████| 10/10 [08:02<00:00, 48.23s/it]


beta: 0.85, width: 4096, narrow: 256
Val Loss: 0.08748898923397064, Val Accuracy: 0.9836000204086304
Moving on to next one...


100%|██████████| 10/10 [07:09<00:00, 42.98s/it]


beta: 0.85, width: 4096, narrow: 128
Val Loss: 0.09037665404379368, Val Accuracy: 0.9818999767303467
Moving on to next one...


100%|██████████| 10/10 [06:45<00:00, 40.58s/it]


beta: 0.85, width: 4096, narrow: 64
Val Loss: 0.09163711076602339, Val Accuracy: 0.980400025844574
Moving on to next one...


 10%|█         | 1/10 [00:34<05:07, 34.15s/it]