In [1]:
import os

import optuna
from optuna.trial import TrialState
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data
from torchvision import datasets
from torchvision import transforms

In [2]:
DEVICE = torch.device("cuda:1")
CLASSES = 10
DIR = os.getcwd()
EPOCHS = 10
DATASET = 'FashionMNIST'

In [3]:
train_data = getattr(datasets, DATASET)(DIR, train=True, download=True, transform=transforms.ToTensor())
test_data = getattr(datasets, DATASET)(DIR, train=False, download=True, transform=transforms.ToTensor())

len(train_data), len(test_data)

(60000, 10000)

In [4]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import numpy as np

class SizeEstimator(object):

    def __init__(self, model, input_size=(1,1,32,32), bits=32):
        '''
        Estimates the size of PyTorch models in memory
        for a given input size
        '''
        self.model = model
        self.input_size = input_size
        self.bits = bits
        return

    def get_parameter_sizes(self):
        '''Get sizes of all parameters in `model`'''
        mods = list(self.model.modules())
        sizes = []
        
        for i in range(1,len(mods)):
            m = mods[i]
            p = list(m.parameters())
            for j in range(len(p)):
                sizes.append(np.array(p[j].size()))

        self.param_sizes = sizes
        return

    def get_output_sizes(self):
        '''Run sample input through each layer to get output sizes'''
        input_ = Variable(torch.FloatTensor(*self.input_size), volatile=True)
        mods = list(self.model.modules())
        out_sizes = []
        for i in range(1, len(mods)):
            m = mods[i]
            out = m(input_)
            out_sizes.append(np.array(out.size()))
            input_ = out

        self.out_sizes = out_sizes
        return

    def calc_param_bits(self):
        '''Calculate total number of bits to store `model` parameters'''
        total_bits = 0
        for i in range(len(self.param_sizes)):
            s = self.param_sizes[i]
            bits = np.prod(np.array(s))*self.bits
            total_bits += bits
        self.param_bits = total_bits
        return

    def calc_forward_backward_bits(self):
        '''Calculate bits to store forward and backward pass'''
        total_bits = 0
        for i in range(len(self.out_sizes)):
            s = self.out_sizes[i]
            bits = np.prod(np.array(s))*self.bits
            total_bits += bits
        # multiply by 2 for both forward AND backward
        self.forward_backward_bits = (total_bits*2)
        return

    def calc_input_bits(self):
        '''Calculate bits to store input'''
        self.input_bits = np.prod(np.array(self.input_size))*self.bits
        return

    def estimate_size(self):
        '''Estimate model size in memory in megabytes and bits'''
        self.get_parameter_sizes()
        self.get_output_sizes()
        self.calc_param_bits()
        self.calc_forward_backward_bits()
        self.calc_input_bits()
        total = self.param_bits + self.forward_backward_bits + self.input_bits

        total_megabytes = (total/8)/(1024**2)
        return total_megabytes, total

In [5]:
def dimensions_shape(x, k=3):
    return int(x - (k-1) - 1) + 1

In [6]:
def get_model_size(model):
    param_size = 0
    for param in model.parameters():
        param_size += param.nelement() * param.element_size()
    buffer_size = 0
    for buffer in model.buffers():
        buffer_size += buffer.nelement() * buffer.element_size()

    size_all_mb = (param_size + buffer_size) / 1024**2
    return size_all_mb

In [7]:
def define_model(trial):
    
    # We optimize the number of layers, hidden units and dropout ratio in each layer.
    in_channels = 1
    dim_shape = 28
    
    in_features = in_channels * dim_shape * dim_shape
    
    p = trial.suggest_float("dropout_l", 0.1, 0.5)
    
    conv_filter_1 = trial.suggest_int("conv_filter_1", 16, 96)
    conv_filter_2 = trial.suggest_int("conv_filter_2", 3, 16)
    
    n_layers_conv1 = trial.suggest_int("n_layers_conv1", 1, 3)
    n_layers_conv2 = trial.suggest_int("n_layers_conv2", 0, 3)
    n_layers_fc = trial.suggest_int("n_layers_fc", 1, 3)
    
    layers = [
        nn.BatchNorm2d(in_channels)
    ]
    
    

    for i in range(n_layers_conv1):
        
        layers.append(nn.Conv2d(in_channels, conv_filter_1, 3, bias=False))
        layers.append(nn.BatchNorm2d(conv_filter_1))
        layers.append(nn.ReLU())
        layers.append(nn.Dropout2d(p))
        
        dim_shape = dimensions_shape(dim_shape)

        in_channels = conv_filter_1
        
    

    for i in range(n_layers_conv2):
        
        layers.append(nn.Conv2d(in_channels, conv_filter_2, 3, bias=False))
        layers.append(nn.BatchNorm2d(conv_filter_2))
        layers.append(nn.ReLU())
        layers.append(nn.Dropout2d(p))
        
        dim_shape = dimensions_shape(dim_shape)

        in_channels = conv_filter_2
        
    layers.append(nn.Flatten())
    in_features = in_channels * dim_shape * dim_shape

    for i in range(n_layers_fc):
        out_features = trial.suggest_int("n_units_l{}".format(i), 50, 400)
        
        layers.append(nn.Linear(in_features, out_features, bias=False))
        layers.append(nn.BatchNorm1d(out_features))
        layers.append(nn.ReLU())
        layers.append(nn.Dropout(p))

        in_features = out_features

        
    layers.append(nn.Linear(in_features, 40, bias=False))
    layers.append(nn.BatchNorm1d(40))
    layers.append(nn.ReLU())
    layers.append(nn.Linear(40, CLASSES))

    return nn.Sequential(*layers)


In [8]:
def get_data(trial):
    
    BATCHSIZE = trial.suggest_int('batchsize', 32, 128)
    
    N_TRAIN_EXAMPLES = 20_000 // BATCHSIZE
    N_VALID_EXAMPLES = 2_500 // BATCHSIZE
    
    # Load FashionMNIST dataset.
    train_loader = torch.utils.data.DataLoader(train_data,batch_size=BATCHSIZE,shuffle=True)
    valid_loader = torch.utils.data.DataLoader(test_data,batch_size=BATCHSIZE,shuffle=True)

    return train_loader, valid_loader, BATCHSIZE, N_TRAIN_EXAMPLES, N_VALID_EXAMPLES

In [9]:
def objective(trial):
    # Generate the model.
    model = define_model(trial).to(DEVICE)
    
    # Generate the optimizers.
    loss_function = nn.CrossEntropyLoss()
    optimizer_name = trial.suggest_categorical("optimizer", ["Adam", "SGD"])
    lr = trial.suggest_float("lr", 1e-5, 1e-1, log=True)
    optimizer = getattr(optim, optimizer_name)(model.parameters(), lr=lr)

    # Get the FashionMNIST dataset.
    train_loader, valid_loader, BATCHSIZE, N_TRAIN_EXAMPLES, N_VALID_EXAMPLES = get_data(trial)

    estimated_size = get_model_size(model)
    
    # Training of the model.
    for epoch in range(EPOCHS):
        model.train()
        for batch_idx, (data, target) in enumerate(train_loader):
            # Limiting training data for faster epochs.
            if batch_idx >= N_TRAIN_EXAMPLES:
                break

            data, target = data.to(DEVICE), target.to(DEVICE)

            optimizer.zero_grad()
            output = model(data)
            loss = loss_function(output, target)
            loss.backward()
            optimizer.step()

        # Validation of the model.
        model.eval()
        correct = 0
        with torch.no_grad():
            for batch_idx, (data, target) in enumerate(valid_loader):
                # Limiting validation data.
                if batch_idx >= N_VALID_EXAMPLES:
                    break
                data, target = data.to(DEVICE), target.to(DEVICE)
                output = model(data)
                # Get the index of the max log-probability.
                pred = output.argmax(dim=1)
                correct += (pred == target).sum().item()

        accuracy = correct / (N_VALID_EXAMPLES * BATCHSIZE)

#         trial.report(epoch, accuracy, estimated_size)

        # Handle pruning based on the intermediate value.
        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()

    return accuracy

In [10]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=200, timeout=60 * 60 * 2)

pruned_trials = study.get_trials(deepcopy=False, states=[TrialState.PRUNED])
complete_trials = study.get_trials(deepcopy=False, states=[TrialState.COMPLETE])


# print(f"Number of trials on the Pareto front: {len(study.best_trials)}")

# trial_with_highest_accuracy = max(study.best_trials, key=lambda t: t.values[1])
# print(f"Trial with highest accuracy: ")
# print(f"\tnumber: {trial_with_highest_accuracy.number}")
# print(f"\tparams: {trial_with_highest_accuracy.params}")
# print(f"\tvalues: {trial_with_highest_accuracy.values}")

print("Study statistics: ")
print("  Number of finished trials: ", len(study.trials))
print("  Number of pruned trials: ", len(pruned_trials))
print("  Number of complete trials: ", len(complete_trials))

print("Best trial:")
trial = study.best_trial

print("  Value: ", trial.value)

print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

[32m[I 2023-04-09 23:15:39,610][0m A new study created in memory with name: no-name-ef081e92-141a-4ee0-91af-825637bf430e[0m
[32m[I 2023-04-09 23:17:23,648][0m Trial 0 finished with value: 0.8781074578989575 and parameters: {'dropout_l': 0.1819027312202544, 'conv_filter_1': 30, 'conv_filter_2': 10, 'n_layers_conv1': 3, 'n_layers_conv2': 3, 'n_layers_fc': 2, 'n_units_l0': 96, 'n_units_l1': 56, 'optimizer': 'SGD', 'lr': 0.07888620486750411, 'batchsize': 43}. Best is trial 0 with value: 0.8781074578989575.[0m
[32m[I 2023-04-09 23:18:20,849][0m Trial 1 finished with value: 0.9024096385542169 and parameters: {'dropout_l': 0.16874804249654002, 'conv_filter_1': 75, 'conv_filter_2': 15, 'n_layers_conv1': 1, 'n_layers_conv2': 2, 'n_layers_fc': 2, 'n_units_l0': 86, 'n_units_l1': 187, 'optimizer': 'Adam', 'lr': 0.009345865811785183, 'batchsize': 83}. Best is trial 1 with value: 0.9024096385542169.[0m
[32m[I 2023-04-09 23:19:24,794][0m Trial 2 finished with value: 0.8933658933658933 and 

[32m[I 2023-04-09 23:38:03,165][0m Trial 21 finished with value: 0.8975903614457831 and parameters: {'dropout_l': 0.16167457206953484, 'conv_filter_1': 77, 'conv_filter_2': 15, 'n_layers_conv1': 1, 'n_layers_conv2': 2, 'n_layers_fc': 2, 'n_units_l0': 94, 'n_units_l1': 184, 'optimizer': 'Adam', 'lr': 0.006217838372381532, 'batchsize': 83}. Best is trial 13 with value: 0.91.[0m
[32m[I 2023-04-09 23:38:59,883][0m Trial 22 finished with value: 0.8873009391588403 and parameters: {'dropout_l': 0.17169395715276933, 'conv_filter_1': 64, 'conv_filter_2': 7, 'n_layers_conv1': 1, 'n_layers_conv2': 2, 'n_layers_fc': 2, 'n_units_l0': 59, 'n_units_l1': 262, 'optimizer': 'Adam', 'lr': 0.0024723833981829028, 'batchsize': 79}. Best is trial 13 with value: 0.91.[0m
[32m[I 2023-04-09 23:39:57,955][0m Trial 23 finished with value: 0.903814935064935 and parameters: {'dropout_l': 0.23496577671350977, 'conv_filter_1': 45, 'conv_filter_2': 14, 'n_layers_conv1': 1, 'n_layers_conv2': 2, 'n_layers_fc': 2

[32m[I 2023-04-09 23:57:38,553][0m Trial 43 finished with value: 0.9033333333333333 and parameters: {'dropout_l': 0.12497165178973958, 'conv_filter_1': 50, 'conv_filter_2': 3, 'n_layers_conv1': 2, 'n_layers_conv2': 0, 'n_layers_fc': 2, 'n_units_l0': 338, 'n_units_l1': 306, 'optimizer': 'Adam', 'lr': 0.0005942367418734197, 'batchsize': 120}. Best is trial 41 with value: 0.9198347107438016.[0m
[32m[I 2023-04-09 23:58:27,448][0m Trial 44 finished with value: 0.8936688311688312 and parameters: {'dropout_l': 0.38174207812314986, 'conv_filter_1': 58, 'conv_filter_2': 5, 'n_layers_conv1': 2, 'n_layers_conv2': 0, 'n_layers_fc': 2, 'n_units_l0': 312, 'n_units_l1': 273, 'optimizer': 'Adam', 'lr': 0.0017909967275524287, 'batchsize': 112}. Best is trial 41 with value: 0.9198347107438016.[0m
[32m[I 2023-04-09 23:59:15,554][0m Trial 45 finished with value: 0.9085365853658537 and parameters: {'dropout_l': 0.1415893497413615, 'conv_filter_1': 22, 'conv_filter_2': 4, 'n_layers_conv1': 2, 'n_lay

[32m[I 2023-04-10 00:15:36,629][0m Trial 64 finished with value: 0.9122308004876066 and parameters: {'dropout_l': 0.1782498670029277, 'conv_filter_1': 59, 'conv_filter_2': 5, 'n_layers_conv1': 2, 'n_layers_conv2': 0, 'n_layers_fc': 2, 'n_units_l0': 278, 'n_units_l1': 400, 'optimizer': 'Adam', 'lr': 0.0003771173827856945, 'batchsize': 107}. Best is trial 61 with value: 0.9231692677070829.[0m
[32m[I 2023-04-10 00:16:24,582][0m Trial 65 finished with value: 0.8987603305785123 and parameters: {'dropout_l': 0.47941573441841295, 'conv_filter_1': 53, 'conv_filter_2': 6, 'n_layers_conv1': 2, 'n_layers_conv2': 0, 'n_layers_fc': 2, 'n_units_l0': 229, 'n_units_l1': 318, 'optimizer': 'Adam', 'lr': 0.0006832615523337892, 'batchsize': 110}. Best is trial 61 with value: 0.9231692677070829.[0m
[32m[I 2023-04-10 00:17:16,300][0m Trial 66 finished with value: 0.9175 and parameters: {'dropout_l': 0.15474916722849452, 'conv_filter_1': 63, 'conv_filter_2': 3, 'n_layers_conv1': 2, 'n_layers_conv2': 

[32m[I 2023-04-10 00:33:42,203][0m Trial 85 finished with value: 0.9074818986323411 and parameters: {'dropout_l': 0.34569089454244306, 'conv_filter_1': 57, 'conv_filter_2': 3, 'n_layers_conv1': 2, 'n_layers_conv2': 0, 'n_layers_fc': 2, 'n_units_l0': 236, 'n_units_l1': 359, 'optimizer': 'Adam', 'lr': 0.0010513538797849564, 'batchsize': 113}. Best is trial 61 with value: 0.9231692677070829.[0m
[32m[I 2023-04-10 00:34:31,402][0m Trial 86 finished with value: 0.815 and parameters: {'dropout_l': 0.13156351604441624, 'conv_filter_1': 54, 'conv_filter_2': 4, 'n_layers_conv1': 2, 'n_layers_conv2': 0, 'n_layers_fc': 2, 'n_units_l0': 327, 'n_units_l1': 400, 'optimizer': 'SGD', 'lr': 0.00045868415654457266, 'batchsize': 120}. Best is trial 61 with value: 0.9231692677070829.[0m
[32m[I 2023-04-10 00:35:22,332][0m Trial 87 finished with value: 0.8701668701668702 and parameters: {'dropout_l': 0.45832016432545597, 'conv_filter_1': 76, 'conv_filter_2': 7, 'n_layers_conv1': 2, 'n_layers_conv2': 

[32m[I 2023-04-10 00:51:24,133][0m Trial 106 finished with value: 0.9002403846153846 and parameters: {'dropout_l': 0.11914417656135325, 'conv_filter_1': 70, 'conv_filter_2': 4, 'n_layers_conv1': 2, 'n_layers_conv2': 0, 'n_layers_fc': 2, 'n_units_l0': 331, 'n_units_l1': 392, 'optimizer': 'Adam', 'lr': 0.009728106905526118, 'batchsize': 96}. Best is trial 61 with value: 0.9231692677070829.[0m
[32m[I 2023-04-10 00:52:14,282][0m Trial 107 finished with value: 0.9064182194616978 and parameters: {'dropout_l': 0.12668624458560246, 'conv_filter_1': 66, 'conv_filter_2': 4, 'n_layers_conv1': 2, 'n_layers_conv2': 0, 'n_layers_fc': 2, 'n_units_l0': 308, 'n_units_l1': 374, 'optimizer': 'Adam', 'lr': 0.0008246673050952902, 'batchsize': 105}. Best is trial 61 with value: 0.9231692677070829.[0m
[32m[I 2023-04-10 00:53:09,408][0m Trial 108 finished with value: 0.8992279561154003 and parameters: {'dropout_l': 0.10064116498352968, 'conv_filter_1': 63, 'conv_filter_2': 3, 'n_layers_conv1': 2, 'n_l

[32m[I 2023-04-10 01:09:44,910][0m Trial 127 finished with value: 0.9122596153846154 and parameters: {'dropout_l': 0.11659508472037824, 'conv_filter_1': 65, 'conv_filter_2': 9, 'n_layers_conv1': 2, 'n_layers_conv2': 0, 'n_layers_fc': 2, 'n_units_l0': 334, 'n_units_l1': 382, 'optimizer': 'Adam', 'lr': 0.002779920773287505, 'batchsize': 64}. Best is trial 61 with value: 0.9231692677070829.[0m
[32m[I 2023-04-10 01:10:36,462][0m Trial 128 finished with value: 0.911504424778761 and parameters: {'dropout_l': 0.15633515477284304, 'conv_filter_1': 80, 'conv_filter_2': 3, 'n_layers_conv1': 2, 'n_layers_conv2': 0, 'n_layers_fc': 2, 'n_units_l0': 377, 'n_units_l1': 345, 'optimizer': 'Adam', 'lr': 0.0006676821230749441, 'batchsize': 113}. Best is trial 61 with value: 0.9231692677070829.[0m
[32m[I 2023-04-10 01:11:22,871][0m Trial 129 finished with value: 0.9105090311986864 and parameters: {'dropout_l': 0.17836554455376893, 'conv_filter_1': 59, 'conv_filter_2': 6, 'n_layers_conv1': 2, 'n_la

Study statistics: 
  Number of finished trials:  136
  Number of pruned trials:  0
  Number of complete trials:  136
Best trial:
  Value:  0.9231692677070829
  Params: 
    dropout_l: 0.17578166518664656
    conv_filter_1: 61
    conv_filter_2: 4
    n_layers_conv1: 2
    n_layers_conv2: 0
    n_layers_fc: 2
    n_units_l0: 254
    n_units_l1: 399
    optimizer: Adam
    lr: 0.0004919153512392123
    batchsize: 119


In [14]:
# df = study.trials_dataframe().sort_values(['values_0', 'values_1'], ascending=False)
df = study.trials_dataframe().sort_values('value', ascending=False)
df.to_csv(DIR + f'/optuna-{DATASET.lower()}.csv', index=False)

In [15]:
df.head(7)

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_batchsize,params_conv_filter_1,params_conv_filter_2,params_dropout_l,params_lr,params_n_layers_conv1,params_n_layers_conv2,params_n_layers_fc,params_n_units_l0,params_n_units_l1,params_n_units_l2,params_optimizer,state
61,61,0.923169,2023-04-10 00:12:11.539183,2023-04-10 00:13:03.877949,0 days 00:00:52.338766,119,61,4,0.175782,0.000492,2,0,2,254,399.0,,Adam,COMPLETE
94,94,0.922518,2023-04-10 00:40:28.453459,2023-04-10 00:41:19.725864,0 days 00:00:51.272405,118,62,4,0.100024,0.000554,2,0,2,292,386.0,,Adam,COMPLETE
47,47,0.922477,2023-04-10 00:00:02.325247,2023-04-10 00:00:53.527314,0 days 00:00:51.202067,106,69,4,0.12159,0.004333,3,0,2,321,228.0,,Adam,COMPLETE
104,104,0.922067,2023-04-10 00:48:46.936118,2023-04-10 00:49:36.095549,0 days 00:00:49.159431,106,67,4,0.100279,0.004527,2,0,2,341,380.0,,Adam,COMPLETE
114,114,0.922018,2023-04-10 00:57:30.505754,2023-04-10 00:58:28.711100,0 days 00:00:58.205346,109,84,5,0.187478,0.002953,2,0,2,337,392.0,,Adam,COMPLETE
103,103,0.921561,2023-04-10 00:47:57.295340,2023-04-10 00:48:46.931672,0 days 00:00:49.636332,113,66,4,0.203782,0.000756,2,0,2,341,364.0,,Adam,COMPLETE
109,109,0.920756,2023-04-10 00:53:09.410687,2023-04-10 00:54:01.594921,0 days 00:00:52.184234,113,73,5,0.174035,0.001067,2,0,2,343,379.0,,Adam,COMPLETE


In [13]:
# Trial 17 finished with values: [0.905562224889956, 3.3138427734375] and parameters: {'dropout_l': 0.1709405615825048, 'conv_filter_1': 45, 'conv_filter_2': 8, 'n_layers_conv1': 2, 'n_layers_conv2': 1, 'n_layers_fc': 1, 'n_units_l0': 216, 'optimizer': 'Adam', 'lr': 0.0010785994045218131, 'batchsize': 51}.