## import modules

In [1]:
!pip install torch==1.12.1
!pip install torchvision==0.13.1
!pip install matplotlib==3.5.2
!pip install optuna==2.10.1



[0m

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

import matplotlib.pyplot as plt
import optuna

torch.use_deterministic_algorithms(True)
device = torch.device("cpu")

## define model architecture

In [3]:
class ConvNet(nn.Module):
    def __init__(self, trial):
        super(ConvNet, self).__init__()
        num_conv_layers = trial.suggest_int("num_conv_layers", 1, 4)
        num_fc_layers = trial.suggest_int("num_fc_layers", 1, 2)

        self.layers = []
        input_depth = 1 # grayscale image
        for i in range(num_conv_layers):
            output_depth = trial.suggest_int(f"conv_depth_{i}", 16, 64)
            self.layers.append(nn.Conv2d(input_depth, output_depth, 3, 1))
            self.layers.append(nn.ReLU())
            input_depth = output_depth
        self.layers.append(nn.MaxPool2d(2))
        p = trial.suggest_float(f"conv_dropout_{i}", 0.1, 0.4)
        self.layers.append(nn.Dropout(p))
        self.layers.append(nn.Flatten())

        input_feat = self._get_flatten_shape()
        for i in range(num_fc_layers):
            output_feat = trial.suggest_int(f"fc_output_feat_{i}", 16, 64)
            self.layers.append(nn.Linear(input_feat, output_feat))
            self.layers.append(nn.ReLU())
            p = trial.suggest_float(f"fc_dropout_{i}", 0.1, 0.4)
            self.layers.append(nn.Dropout(p))
            input_feat = output_feat
        self.layers.append(nn.Linear(input_feat, 10))
        self.layers.append(nn.LogSoftmax(dim=1))
        
        self.model = nn.Sequential(*self.layers)
    
    def _get_flatten_shape(self):
        conv_model = nn.Sequential(*self.layers)
        op_feat = conv_model(torch.rand(1, 1, 28, 28))
        n_size = op_feat.data.view(1, -1).size(1)
        return n_size
 
    def forward(self, x):
        return self.model(x)

## create data loaders

In [4]:
# The mean and standard deviation values are calculated as the mean of all pixel values of all images in the training dataset
train_ds = datasets.MNIST('../data', train=True, download=True,
                   transform=transforms.Compose([
                       transforms.ToTensor(),
                       transforms.Normalize((0.1302,), (0.3069,))]))
test_ds = datasets.MNIST('../data', train=False, 
                   transform=transforms.Compose([
                       transforms.ToTensor(),
                       transforms.Normalize((0.1302,), (0.3069,))]))

train_dataloader = torch.utils.data.DataLoader(train_ds, batch_size=32, shuffle=True)
test_dataloader = torch.utils.data.DataLoader(test_ds, batch_size=500, shuffle=True)

## define training and inference routines

In [5]:
def train(model, device, train_dataloader, optim, epoch):
    model.train()
    for b_i, (X, y) in enumerate(train_dataloader):
        X, y = X.to(device), y.to(device)
        optim.zero_grad()
        pred_prob = model(X)
        loss = F.nll_loss(pred_prob, y) # nll is the negative likelihood loss
        loss.backward()
        optim.step()
        if b_i % 500 == 0:
            print('epoch: {} [{}/{} ({:.0f}%)]\t training loss: {:.6f}'.format(
                epoch, b_i * len(X), len(train_dataloader.dataset),
                100. * b_i / len(train_dataloader), loss.item()))

In [6]:
def test(model, device, test_dataloader):
    model.eval()
    loss = 0
    success = 0
    with torch.no_grad():
        for X, y in test_dataloader:
            X, y = X.to(device), y.to(device)
            pred_prob = model(X)
            loss += F.nll_loss(pred_prob, y, reduction='sum').item()  # loss summed across the batch
            pred = pred_prob.argmax(dim=1, keepdim=True)  # use argmax to get the most likely prediction
            success += pred.eq(y.view_as(pred)).sum().item()

    loss /= len(test_dataloader.dataset)
    
    accuracy = 100. * success / len(test_dataloader.dataset)

    print('\nTest dataset: Overall Loss: {:.4f}, Overall Accuracy: {}/{} ({:.0f}%)\n'.format(
        loss, success, len(test_dataloader.dataset), accuracy))
    
    return accuracy

## define optimizer and model training routine

In [7]:
def objective(trial):
    
    model = ConvNet(trial)
    opt_name = trial.suggest_categorical("optimizer", ["Adam", "Adadelta", "RMSprop", "SGD"])
    lr = trial.suggest_float("lr", 1e-1, 5e-1, log=True)
    optimizer = getattr(optim, opt_name)(model.parameters(), lr=lr)
    
    for epoch in range(1, 3):
        train(model, device, train_dataloader, optimizer, epoch)
        accuracy = test(model, device, test_dataloader)
        trial.report(accuracy, epoch)
        
        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()

    return accuracy

## run the hyperparameter search

In [8]:
study = optuna.create_study(study_name="mastering_pytorch", direction="maximize")
study.optimize(objective, n_trials=100, timeout=2000)

pruned_trials = [t for t in study.trials if t.state == optuna.trial.TrialState.PRUNED]
complete_trials = [t for t in study.trials if t.state == optuna.trial.TrialState.COMPLETE]

print("results: ")
print("num_trials_conducted: ", len(study.trials))
print("num_trials_pruned: ", len(pruned_trials))
print("num_trials_completed: ", len(complete_trials))

print("results from best trial:")
trial = study.best_trial

print("accuracy: ", trial.value)
print("hyperparameters: ")
for key, value in trial.params.items():
    print("{}: {}".format(key, value))

[32m[I 2022-10-30 19:24:56,015][0m A new study created in memory with name: mastering_pytorch[0m



Test dataset: Overall Loss: 2.3457, Overall Accuracy: 1009/10000 (10%)



[32m[I 2022-10-30 19:29:08,508][0m Trial 0 finished with value: 11.35 and parameters: {'num_conv_layers': 2, 'num_fc_layers': 2, 'conv_depth_0': 53, 'conv_depth_1': 33, 'conv_dropout_1': 0.17143248024909297, 'fc_output_feat_0': 55, 'fc_dropout_0': 0.2541003269326993, 'fc_output_feat_1': 26, 'fc_dropout_1': 0.3321122467173282, 'optimizer': 'Adam', 'lr': 0.34442962131036203}. Best is trial 0 with value: 11.35.[0m



Test dataset: Overall Loss: 2.3166, Overall Accuracy: 1135/10000 (11%)


Test dataset: Overall Loss: 0.1789, Overall Accuracy: 9506/10000 (95%)



[32m[I 2022-10-30 19:30:25,372][0m Trial 1 finished with value: 96.54 and parameters: {'num_conv_layers': 1, 'num_fc_layers': 2, 'conv_depth_0': 32, 'conv_dropout_0': 0.210147972090233, 'fc_output_feat_0': 38, 'fc_dropout_0': 0.23776899365181517, 'fc_output_feat_1': 25, 'fc_dropout_1': 0.3023416454325315, 'optimizer': 'Adadelta', 'lr': 0.14875299308435333}. Best is trial 1 with value: 96.54.[0m



Test dataset: Overall Loss: 0.1189, Overall Accuracy: 9654/10000 (97%)


Test dataset: Overall Loss: 2.3416, Overall Accuracy: 1032/10000 (10%)



[32m[I 2022-10-30 19:37:43,731][0m Trial 2 finished with value: 9.74 and parameters: {'num_conv_layers': 3, 'num_fc_layers': 2, 'conv_depth_0': 36, 'conv_depth_1': 61, 'conv_depth_2': 62, 'conv_dropout_2': 0.3908412995474969, 'fc_output_feat_0': 23, 'fc_dropout_0': 0.2988917276039279, 'fc_output_feat_1': 63, 'fc_dropout_1': 0.38302968688018324, 'optimizer': 'RMSprop', 'lr': 0.31021398410582246}. Best is trial 1 with value: 96.54.[0m



Test dataset: Overall Loss: 2.3680, Overall Accuracy: 974/10000 (10%)


Test dataset: Overall Loss: 2.3033, Overall Accuracy: 1135/10000 (11%)



[32m[I 2022-10-30 19:42:59,904][0m Trial 3 finished with value: 9.58 and parameters: {'num_conv_layers': 4, 'num_fc_layers': 1, 'conv_depth_0': 34, 'conv_depth_1': 39, 'conv_depth_2': 26, 'conv_depth_3': 49, 'conv_dropout_3': 0.17891855958588482, 'fc_output_feat_0': 31, 'fc_dropout_0': 0.3085510868976604, 'optimizer': 'SGD', 'lr': 0.4142508421154321}. Best is trial 1 with value: 96.54.[0m



Test dataset: Overall Loss: 2.3052, Overall Accuracy: 958/10000 (10%)


Test dataset: Overall Loss: 0.1119, Overall Accuracy: 9656/10000 (97%)



[32m[I 2022-10-30 19:44:01,530][0m Trial 4 finished with value: 97.36 and parameters: {'num_conv_layers': 1, 'num_fc_layers': 1, 'conv_depth_0': 40, 'conv_dropout_0': 0.20849005332387924, 'fc_output_feat_0': 36, 'fc_dropout_0': 0.1884916748930443, 'optimizer': 'SGD', 'lr': 0.31003454731062713}. Best is trial 4 with value: 97.36.[0m



Test dataset: Overall Loss: 0.0866, Overall Accuracy: 9736/10000 (97%)


Test dataset: Overall Loss: 0.0528, Overall Accuracy: 9819/10000 (98%)



[32m[I 2022-10-30 19:46:11,006][0m Trial 5 finished with value: 98.61 and parameters: {'num_conv_layers': 2, 'num_fc_layers': 1, 'conv_depth_0': 30, 'conv_depth_1': 40, 'conv_dropout_1': 0.1420256489315686, 'fc_output_feat_0': 60, 'fc_dropout_0': 0.19463202740431523, 'optimizer': 'Adadelta', 'lr': 0.14308833731292162}. Best is trial 5 with value: 98.61.[0m



Test dataset: Overall Loss: 0.0424, Overall Accuracy: 9861/10000 (99%)



[32m[I 2022-10-30 19:48:44,003][0m Trial 6 pruned. [0m



Test dataset: Overall Loss: 2.3623, Overall Accuracy: 1028/10000 (10%)



[32m[I 2022-10-30 19:50:48,121][0m Trial 7 pruned. [0m



Test dataset: Overall Loss: 2.3051, Overall Accuracy: 1028/10000 (10%)


Test dataset: Overall Loss: 0.0621, Overall Accuracy: 9803/10000 (98%)



[32m[I 2022-10-30 19:56:53,612][0m Trial 8 finished with value: 98.48 and parameters: {'num_conv_layers': 3, 'num_fc_layers': 1, 'conv_depth_0': 42, 'conv_depth_1': 61, 'conv_depth_2': 59, 'conv_dropout_2': 0.13829291201008012, 'fc_output_feat_0': 32, 'fc_dropout_0': 0.1867735709629219, 'optimizer': 'Adadelta', 'lr': 0.10970838415947894}. Best is trial 5 with value: 98.61.[0m



Test dataset: Overall Loss: 0.0445, Overall Accuracy: 9848/10000 (98%)



[32m[I 2022-10-30 19:58:52,389][0m Trial 9 pruned. [0m



Test dataset: Overall Loss: 2.3092, Overall Accuracy: 974/10000 (10%)

results: 
num_trials_conducted:  10
num_trials_pruned:  3
num_trials_completed:  7
results from best trial:
accuracy:  98.61
hyperparameters: 
num_conv_layers: 2
num_fc_layers: 1
conv_depth_0: 30
conv_depth_1: 40
conv_dropout_1: 0.1420256489315686
fc_output_feat_0: 60
fc_dropout_0: 0.19463202740431523
optimizer: Adadelta
lr: 0.14308833731292162
