In [1]:
import torch
import models
import loader
import time
import torchsummary
from importlib import reload
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# @TODO: dataset.download()
!wget -nc http://cs231n.stanford.edu/tiny-imagenet-200.zip
    
# run this script only once
!unzip tiny-imagenet-200.zip

File ‘tiny-imagenet-200.zip’ already there; not retrieving.



In [3]:
# Load the training split and report the wall-clock time it took.
# `dataset` is a notebook-level global that later cells pass to train().
before = time.time()
dataset = loader.TinyImageNet200()
# NOTE(review): prepare() is defined in loader.py (not visible here); presumably
# it reads the images of the requested split into memory — confirm.
dataset.prepare(split=loader.TRAIN)
print('train dataset has loaded in', time.time() - before, 's')

loaded in 12.486455202102661 s


In [5]:
# Load the validation split and report the wall-clock time it took.
# `valid` is a notebook-level global that later cells pass to train().
before = time.time()
valid = loader.TinyImageNet200()
# NOTE(review): prepare() is defined in loader.py (not visible here); presumably
# it reads the images of the requested split into memory — confirm.
valid.prepare(split=loader.VALID)
print('validation dataset has loaded in', time.time() - before, 's')

loaded in 1.4639499187469482 s


In [21]:
import torch

# Smoke test: push one small batch through a network and compute its loss.
#
# No earlier cell defines `model`, so on a fresh kernel this cell failed with
# NameError. Build it here with the same configuration the single-run example
# further down passes to train().
# NOTE(review): confirm these are the constructor arguments you want to demo.
model = models.ResidualNet(residual=True, pool='avg', batch_norm=False, pool_stride=3,
                           conv_kernel_size=3, pool_kernel_size=3, linear_output=128)

# an example batch is processed here
x, y = next(iter(dataset.dataloader(batch_size=12)))
print('input shape', x.shape, 'target shape', y.shape)
# The batch is never moved off the CPU, so the model is run on the CPU as well.
pred = model.cpu()(x)
print('preds shape', pred.shape)
# nll_loss expects log-probabilities; the network output is passed through .log(),
# so it is presumably already a probability distribution — TODO confirm in models.py.
loss = torch.nn.functional.nll_loss(pred.log(), y)
print('example targets', y.detach().numpy().tolist())
print('example preds', pred.argmax(1).detach().numpy().tolist())
print('loss score', loss.item())

torch.Size([12, 3, 64, 64]) torch.Size([12])
torch.Size([12, 200])
torch.Size([12]) torch.Size([12, 200]) 5.699825286865234
tensor([ 99,  29, 113, 185, 100, 187, 115, 166,  88,  50,  67,  64])
tensor([ 92,  71,  24, 155, 172,  92,  71, 172, 172,  71, 172, 172])


In [73]:
import torch
from tqdm import tqdm
from torchmetrics.functional import accuracy, f1_score
import models
import wandb

# Authenticate with Weights & Biases before train() creates any run.
wandb.login()
# Re-import models.py so edits made on disk are picked up without a kernel restart.
reload(models)


def train(model_params, train_data, valid_data, num_epochs, num_epochs_per_validation, verbose=True, **config):
    """Train a ``models.ResidualNet`` and log its metrics to Weights & Biases.

    Args:
        model_params: keyword arguments forwarded to ``models.ResidualNet``.
        train_data: object exposing ``dataloader(batch_size=...)`` for training batches.
        valid_data: object exposing ``dataloader()`` for validation batches.
        num_epochs: number of passes over the training dataloader.
        num_epochs_per_validation: a validation pass runs after every epoch whose
            number is a multiple of this value.
        verbose: unused; kept so existing callers keep working.
        **config: ``batch_size`` and ``learning_rate`` are required. Optional:
            ``weight_decay`` (defaults to 0) and ``plateau_monitor`` (``None``,
            ``'loss'``, ``'accuracy'`` or ``'f1_score'``), the training metric
            watched by the ReduceLROnPlateau scheduler.

    Returns:
        The mean validation accuracy over all validation passes, or ``nan`` when
        no validation pass produced a value (for example when ``num_epochs`` is
        smaller than ``num_epochs_per_validation``).

    Raises:
        KeyError: if ``batch_size`` or ``learning_rate`` is missing from ``config``.
    """
    # create a model with given parameters; it is still on the CPU here, which
    # matches the device='cpu' summary printed below
    model = models.ResidualNet(**model_params)
    torchsummary.summary(model, input_size=(3, 64, 64), device='cpu', batch_size=config['batch_size'])

    # initialize loss function and optimizer
    # NOTE(review): the network output goes through .log() before the loss, so it
    # is presumably already a probability distribution (the example cell feeds the
    # same expression to nll_loss) — confirm in models.py.
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=config['learning_rate'],
                                 weight_decay=config.setdefault('weight_decay', 0))

    # if plateau_monitor is set to a value then monitor it in scheduler.
    # The scheduler has to know the direction of "better": the loss should go
    # down, accuracy / f1_score should go up. With the default mode='min' a rising
    # accuracy would be treated as a plateau and the learning rate would be cut.
    monitor = config.get('plateau_monitor')
    if monitor is not None:
        plateau_mode = 'min' if monitor == 'loss' else 'max'
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode=plateau_mode)
    else:
        scheduler = None

    # initialize a wandb session
    wandb.init(project='imagenet', entity='adnanhd',
               name='avg-pool', config={**config, **model_params})

    # return value for further comparison: one mean accuracy per validation pass
    valid_accuracy = []
    try:
        wandb.watch(model)

        # initialize dataloaders from Datasets defined in loader.py
        train_dataloader = train_data.dataloader(batch_size=config['batch_size'])
        valid_dataloader = valid_data.dataloader()
        # guard against an empty dataloader so the per-epoch averages cannot divide by zero
        num_of_train_batchs = max(len(train_dataloader), 1)

        pbar = tqdm(range(1, num_epochs + 1))
        for n in pbar:
            pbar.set_description(f'Epoch {n} of {num_epochs}')

            # Train on the selected device (not a hard-coded .cuda(), which crashes
            # on CPU-only machines). The move is repeated every epoch because a
            # validation pass leaves the model on the CPU.
            model.to(device)
            # batch-norm layers must use batch statistics while training
            model.train()

            log = {'loss': 0, 'accuracy': 0, 'f1_score': 0}
            for x, y in train_dataloader:
                x = x.to(device=device)
                y = y.to(device=device)
                pred = model(x)
                loss = criterion(pred.log(), y)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                log['loss'] += loss.item()
                log['accuracy'] += accuracy(preds=pred, target=y, num_classes=200, average='macro').item()
                log['f1_score'] += f1_score(preds=pred, target=y).item()
                del x, y, pred
            log['loss'] /= num_of_train_batchs
            log['accuracy'] /= num_of_train_batchs
            log['f1_score'] /= num_of_train_batchs
            pbar.set_postfix(log)
            wandb.log(log, step=n)
            if scheduler is not None:
                scheduler.step(log[monitor])

            if n % num_epochs_per_validation == 0:
                # Validation batches are never moved off the CPU, so the model is
                # evaluated there, as before.
                model.cpu()
                # batch-norm layers must use their running statistics while evaluating
                model.eval()
                totals = {'val_loss': 0.0, 'val_accuracy': 0.0, 'val_f1_score': 0.0}
                num_of_valid_batchs = 0
                with torch.no_grad():
                    for x, y in valid_dataloader:
                        pred = model(x)
                        totals['val_loss'] += criterion(pred.log(), y).item()
                        # NOTE(review): unlike the training metric this call uses the
                        # library's default averaging — confirm that is intended.
                        totals['val_accuracy'] += accuracy(preds=pred, target=y).item()
                        totals['val_f1_score'] += f1_score(preds=pred, target=y).item()
                        num_of_valid_batchs += 1
                if num_of_valid_batchs:
                    # Log the mean over all validation batches once per pass; logging
                    # every batch at the same step kept only the last batch's values.
                    valid_log = {key: total / num_of_valid_batchs for key, total in totals.items()}
                    valid_accuracy.append(valid_log['val_accuracy'])
                    wandb.log(valid_log, step=n)
    finally:
        # close the run even when training is interrupted, so the next call
        # starts from a clean wandb session
        wandb.finish()

    if not valid_accuracy:
        # no validation pass ran: report "unknown" instead of dividing by zero
        return float('nan')
    return sum(valid_accuracy) / len(valid_accuracy)

In [78]:
# One hand-picked configuration trained on its own, outside the hyperparameter search.
train(
    {
        'residual': True,
        'pool': 'avg',
        'batch_norm': False,
        'pool_stride': 3,
        'conv_kernel_size': 3,
        'pool_kernel_size': 3,
        'linear_output': 128,
    },
    dataset,
    valid,
    num_epochs=100,
    num_epochs_per_validation=10,
    batch_size=1000,
    learning_rate=4e-6,
)

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1         [1000, 32, 32, 32]             896
              ReLU-2         [1000, 32, 32, 32]               0
         AvgPool2d-3         [1000, 32, 10, 10]               0
            Conv2d-4        [1000, 128, 10, 10]          36,992
              ReLU-5        [1000, 128, 10, 10]               0
            Conv2d-6         [1000, 32, 10, 10]          36,896
              ReLU-7         [1000, 32, 10, 10]               0
            Conv2d-8         [1000, 64, 10, 10]          18,496
     ResidualBlock-9         [1000, 64, 10, 10]               0
           Conv2d-10        [1000, 256, 10, 10]         147,712
             ReLU-11        [1000, 256, 10, 10]               0
           Conv2d-12         [1000, 64, 10, 10]         147,520
             ReLU-13         [1000, 64, 10, 10]               0
           Conv2d-14        [1000, 128,

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁
f1_score,▁
loss,▁

0,1
accuracy,0.005
f1_score,0.005
loss,5.29869


Epoch 2 of 100:   1%|▌                                                            | 1/100 [00:21<35:32, 21.54s/it, loss=5.3, accuracy=0.00504, f1_score=0.005]


KeyboardInterrupt: 

In [75]:
from hyperopt import hp, fmin, tpe
from collections import OrderedDict
import math

In [81]:
# Hard-coded lookup table for the model's `linear_output` argument (the size in
# front of the last fully connected layer), keyed by
# (conv_kernel_size, pool_kernel_size, pool_stride).
linear_output_dict = {}
def get_linear_output_dict(conv_kernel_size=3, pool_kernel_size=3, pool_stride=3, **kwargs):
    """Return the `linear_output` registered for one kernel/stride combination.

    Any additional keyword arguments are accepted and ignored.

    Raises:
        KeyError: if the combination has not been registered.
    """
    lookup_key = (conv_kernel_size, pool_kernel_size, pool_stride)
    return linear_output_dict[lookup_key]

def set_linear_output_dict(linear_output, conv_kernel_size=3, pool_kernel_size=3, pool_stride=3):
    """Register (or overwrite) the `linear_output` value for one kernel/stride combination."""
    lookup_key = (conv_kernel_size, pool_kernel_size, pool_stride)
    linear_output_dict.update({lookup_key: linear_output})


# Hard-coded `linear_output` values, one row per
# (conv_kernel_size, pool_kernel_size, pool_stride, linear_output).
for _conv_k, _pool_k, _stride, _linear_output in (
    (2, 2, 2, 2048),
    (2, 2, 3, 512),
    (2, 3, 2, 1152),
    (2, 3, 3, 512),

    (3, 2, 2, 2048),
    (3, 2, 3, 512),
    (3, 3, 2, 1152),
    (3, 3, 3, 128),
):
    set_linear_output_dict(conv_kernel_size=_conv_k, pool_kernel_size=_pool_k,
                           pool_stride=_stride, linear_output=_linear_output)

In [69]:
# Search space handed to hyperopt: every hyperparameter with the range or the
# list of options it may take.

# Categorical hyperparameters as (label, options); the label doubles as the key.
_CHOICE_OPTIONS = (
    ('residual', (True, False)),                                   # residual network
    ('batch_norm', (True, False)),                                 # batch normalization
    ('weight_decay', [0, 1e-2, 1e-4, 4e-5]),                       # regularization
    ('pool', ('avg', 'max')),                                      # pooling
    ('conv_kernel_size', range(2, 4, 1)),                          # kernel size
    ('pool_kernel_size', range(2, 4, 1)),
    ('pool_stride', range(2, 4, 1)),                               # stride
    ('plateau_monitor', [None, 'f1_score', 'accuracy', 'loss']),   # lr plateau
    ('batch_size', [1000, 2000, 5000, 10000]),                     # batch_size
)

# The learning rate is the only continuous dimension; it is sampled log-uniformly.
EXPERIMENT_SPACE = OrderedDict()
EXPERIMENT_SPACE['learning_rate'] = hp.loguniform('learning_rate', math.log(1e-7), math.log(1e-5))
for _label, _options in _CHOICE_OPTIONS:
    EXPERIMENT_SPACE[_label] = hp.choice(_label, _options)

In [82]:
# Objective evaluated by the bayesian search: trains one sampled configuration
# and turns its validation accuracy into a loss for hyperopt to minimize.
def objective(params):
    """Train the configuration in ``params`` and report its score to hyperopt.

    Args:
        params: one sample of the search space; must contain the keys
            ``learning_rate``, ``weight_decay``, ``plateau_monitor``,
            ``batch_size``, ``residual``, ``batch_norm``, ``pool``,
            ``pool_stride``, ``conv_kernel_size`` and ``pool_kernel_size``.

    Returns:
        A dict with ``loss`` (negative log of the validation accuracy),
        ``accuracy`` (the value returned by ``train``) and ``status``.

    Raises:
        KeyError: if a required key is missing from ``params`` or the
            kernel/stride combination has no registered ``linear_output``.
    """
    # hyperopt reads result['status'] from every dict result; without it the
    # search fails after the first (fully trained) trial.
    from hyperopt import STATUS_OK

    config = {key: params[key] for key in ('learning_rate', 'weight_decay', 'plateau_monitor', 'batch_size')}
    model_params = {key: params[key] for key in ('residual', 'batch_norm', 'pool', 'pool_stride',
                                                 'conv_kernel_size', 'pool_kernel_size')}
    print('model_params', model_params)
    linear_output_kwargs = {key: params[key] for key in ('conv_kernel_size', 'pool_kernel_size', 'pool_stride')}
    model_params['linear_output'] = get_linear_output_dict(**linear_output_kwargs)
    valid_accuracy = train(model_params, dataset, valid, num_epochs=200, **config, num_epochs_per_validation=10)
    print('accuracy is', valid_accuracy)
    # hyperopt minimizes 'loss', so use the negative logarithm of the accuracy.
    # The accuracy is clamped away from zero because math.log(0) raises ValueError,
    # which would abort the whole search on a model that got nothing right.
    loss = -math.log(max(valid_accuracy, 1e-12))
    return {'loss': loss, 'accuracy': valid_accuracy, 'status': STATUS_OK}

In [None]:
# Bayesian (TPE) optimization over EXPERIMENT_SPACE to find the best hyperparameters.
from hyperopt import space_eval

# Evaluation budget. Without max_evals the search has no stopping point, so the
# print below is never reached; every evaluation is a full 200-epoch training run.
MAX_EVALS = 20

best_hparams = fmin(objective, EXPERIMENT_SPACE, algo=tpe.suggest, max_evals=MAX_EVALS)
# fmin reports hp.choice parameters as the *index* of the chosen option;
# space_eval maps the result back to the actual hyperparameter values.
print(space_eval(EXPERIMENT_SPACE, best_hparams))

model_params                                                                                                                                                  
{'residual': True, 'batch_norm': True, 'pool': 'avg', 'pool_stride': 3, 'conv_kernel_size': 2, 'pool_kernel_size': 3}                                         
----------------------------------------------------------------                                                                                              
        Layer (type)               Output Shape         Param #                                                                                               
            Conv2d-1         [5000, 32, 32, 32]             896                                                                                               
       BatchNorm2d-2         [5000, 32, 32, 32]              64                                                                                               
              ReLU-3         [5000, 32, 32, 32

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,0.00504
f1_score,0.005
loss,5.29973



  0%|          | 0/200 [00:00<?, ?it/s][A
Epoch 1 of 200:   0%|          | 0/200 [00:00<?, ?it/s][A
Epoch 1 of 200:   0%|          | 0/200 [00:14<?, ?it/s, loss=5.33, accuracy=0.00471, f1_score=0.00469][A
Epoch 1 of 200:   0%|          | 1/200 [00:14<48:05, 14.50s/it, loss=5.33, accuracy=0.00471, f1_score=0.00469][A
Epoch 2 of 200:   0%|          | 1/200 [00:14<48:05, 14.50s/it, loss=5.33, accuracy=0.00471, f1_score=0.00469][A
Epoch 2 of 200:   0%|          | 1/200 [00:29<48:05, 14.50s/it, loss=5.33, accuracy=0.0049, f1_score=0.00483] [A
Epoch 2 of 200:   1%|1         | 2/200 [00:29<47:51, 14.50s/it, loss=5.33, accuracy=0.0049, f1_score=0.00483][A
Epoch 3 of 200:   1%|1         | 2/200 [00:29<47:51, 14.50s/it, loss=5.33, accuracy=0.0049, f1_score=0.00483][A
Epoch 3 of 200:   1%|1         | 2/200 [00:43<47:51, 14.50s/it, loss=5.33, accuracy=0.00473, f1_score=0.00473][A
Epoch 3 of 200:   2%|1         | 3/200 [00:43<47:38, 14.51s/it, loss=5.33, accuracy=0.00473, f1_score=0.00473

Epoch 24 of 200:  12%|#2        | 24/200 [05:56<43:19, 14.77s/it, loss=5.33, accuracy=0.00538, f1_score=0.0053][A
Epoch 25 of 200:  12%|#2        | 24/200 [05:56<43:19, 14.77s/it, loss=5.33, accuracy=0.00538, f1_score=0.0053][A
Epoch 25 of 200:  12%|#2        | 24/200 [06:11<43:19, 14.77s/it, loss=5.33, accuracy=0.00527, f1_score=0.00531][A
Epoch 25 of 200:  12%|#2        | 25/200 [06:11<42:49, 14.68s/it, loss=5.33, accuracy=0.00527, f1_score=0.00531][A
Epoch 26 of 200:  12%|#2        | 25/200 [06:11<42:49, 14.68s/it, loss=5.33, accuracy=0.00527, f1_score=0.00531][A
Epoch 26 of 200:  12%|#2        | 25/200 [06:25<42:49, 14.68s/it, loss=5.33, accuracy=0.00529, f1_score=0.00527][A
Epoch 26 of 200:  13%|#3        | 26/200 [06:25<42:19, 14.59s/it, loss=5.33, accuracy=0.00529, f1_score=0.00527][A
Epoch 27 of 200:  13%|#3        | 26/200 [06:25<42:19, 14.59s/it, loss=5.33, accuracy=0.00529, f1_score=0.00527][A
Epoch 27 of 200:  13%|#3        | 26/200 [06:40<42:19, 14.59s/it, loss=5.3

Epoch 48 of 200:  24%|##3       | 47/200 [11:38<37:09, 14.57s/it, loss=5.33, accuracy=0.00536, f1_score=0.00531][A
Epoch 48 of 200:  24%|##3       | 47/200 [11:53<37:09, 14.57s/it, loss=5.33, accuracy=0.00525, f1_score=0.0053] [A
Epoch 48 of 200:  24%|##4       | 48/200 [11:53<36:45, 14.51s/it, loss=5.33, accuracy=0.00525, f1_score=0.0053][A
Epoch 49 of 200:  24%|##4       | 48/200 [11:53<36:45, 14.51s/it, loss=5.33, accuracy=0.00525, f1_score=0.0053][A
Epoch 49 of 200:  24%|##4       | 48/200 [12:07<36:45, 14.51s/it, loss=5.33, accuracy=0.00531, f1_score=0.00533][A
Epoch 49 of 200:  24%|##4       | 49/200 [12:07<36:29, 14.50s/it, loss=5.33, accuracy=0.00531, f1_score=0.00533][A
Epoch 50 of 200:  24%|##4       | 49/200 [12:07<36:29, 14.50s/it, loss=5.33, accuracy=0.00531, f1_score=0.00533][A
Epoch 50 of 200:  24%|##4       | 49/200 [12:22<36:29, 14.50s/it, loss=5.33, accuracy=0.00534, f1_score=0.00535][A
Epoch 50 of 200:  25%|##5       | 50/200 [12:26<39:47, 15.91s/it, loss=5.3

Epoch 71 of 200:  35%|###5      | 70/200 [17:40<34:38, 15.99s/it, loss=5.32, accuracy=0.0052, f1_score=0.00528][A
Epoch 71 of 200:  36%|###5      | 71/200 [17:40<33:21, 15.52s/it, loss=5.32, accuracy=0.0052, f1_score=0.00528][A
Epoch 72 of 200:  36%|###5      | 71/200 [17:40<33:21, 15.52s/it, loss=5.32, accuracy=0.0052, f1_score=0.00528][A