## import modules

In [1]:
!pip install torch==2.2
!pip install torchvision==0.17.0
!pip install matplotlib==3.5.2
!pip install optuna==2.10.1

Collecting optuna==2.10.1
  Downloading optuna-2.10.1-py3-none-any.whl (308 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m308.2/308.2 kB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
Collecting cliff (from optuna==2.10.1)
  Downloading cliff-4.5.0-py3-none-any.whl.metadata (2.0 kB)
Collecting cmaes>=0.8.2 (from optuna==2.10.1)
  Downloading cmaes-0.10.0-py3-none-any.whl.metadata (19 kB)
Collecting autopage>=0.4.0 (from cliff->optuna==2.10.1)
  Downloading autopage-0.5.2-py3-none-any.whl.metadata (7.9 kB)
Collecting cmd2>=1.0.0 (from cliff->optuna==2.10.1)
  Downloading cmd2-2.4.3-py3-none-any.whl (147 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m147.2/147.2 kB[0m [31m21.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting stevedore>=2.0.1 (from cliff->optuna==2.10.1)
  Downloading stevedore-5.1.0-py3-none-any.whl.metadata (2.2 kB)
Collecting pyperclip>=1.6 (from cmd2>=1.0.0->cliff->optuna==2.10.1)
  Downloading pyperclip-1.8.2.tar.gz (20 kB)
  Prepa

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

import matplotlib.pyplot as plt
import optuna

torch.use_deterministic_algorithms(True)
device = torch.device("cpu")

  _torch_pytree._register_pytree_node(


## define model architecture

In [3]:
class ConvNet(nn.Module):
    def __init__(self, trial):
        super(ConvNet, self).__init__()
        num_conv_layers = trial.suggest_int("num_conv_layers", 1, 4)
        num_fc_layers = trial.suggest_int("num_fc_layers", 1, 2)

        self.layers = []
        input_depth = 1 # grayscale image
        for i in range(num_conv_layers):
            output_depth = trial.suggest_int(f"conv_depth_{i}", 16, 64)
            self.layers.append(nn.Conv2d(input_depth, output_depth, 3, 1))
            self.layers.append(nn.ReLU())
            input_depth = output_depth
        self.layers.append(nn.MaxPool2d(2))
        p = trial.suggest_float(f"conv_dropout_{i}", 0.1, 0.4)
        self.layers.append(nn.Dropout(p))
        self.layers.append(nn.Flatten())

        input_feat = self._get_flatten_shape()
        for i in range(num_fc_layers):
            output_feat = trial.suggest_int(f"fc_output_feat_{i}", 16, 64)
            self.layers.append(nn.Linear(input_feat, output_feat))
            self.layers.append(nn.ReLU())
            p = trial.suggest_float(f"fc_dropout_{i}", 0.1, 0.4)
            self.layers.append(nn.Dropout(p))
            input_feat = output_feat
        self.layers.append(nn.Linear(input_feat, 10))
        self.layers.append(nn.LogSoftmax(dim=1))
        
        self.model = nn.Sequential(*self.layers)
    
    def _get_flatten_shape(self):
        conv_model = nn.Sequential(*self.layers)
        op_feat = conv_model(torch.rand(1, 1, 28, 28))
        n_size = op_feat.data.view(1, -1).size(1)
        return n_size
 
    def forward(self, x):
        return self.model(x)

## create data loaders

In [4]:
# The mean and standard deviation values are calculated as the mean of all pixel values of all images in the training dataset
train_ds = datasets.MNIST('../data', train=True, download=True,
                   transform=transforms.Compose([
                       transforms.ToTensor(),
                       transforms.Normalize((0.1302,), (0.3069,))]))
test_ds = datasets.MNIST('../data', train=False, 
                   transform=transforms.Compose([
                       transforms.ToTensor(),
                       transforms.Normalize((0.1302,), (0.3069,))]))

train_dataloader = torch.utils.data.DataLoader(train_ds, batch_size=32, shuffle=True)
test_dataloader = torch.utils.data.DataLoader(test_ds, batch_size=500, shuffle=True)

## define training and inference routines

In [5]:
def train(model, device, train_dataloader, optim, epoch):
    model.train()
    for b_i, (X, y) in enumerate(train_dataloader):
        X, y = X.to(device), y.to(device)
        optim.zero_grad()
        pred_prob = model(X)
        loss = F.nll_loss(pred_prob, y) # nll is the negative likelihood loss
        loss.backward()
        optim.step()
        if b_i % 500 == 0:
            print('epoch: {} [{}/{} ({:.0f}%)]\t training loss: {:.6f}'.format(
                epoch, b_i * len(X), len(train_dataloader.dataset),
                100. * b_i / len(train_dataloader), loss.item()))

In [6]:
def test(model, device, test_dataloader):
    model.eval()
    loss = 0
    success = 0
    with torch.no_grad():
        for X, y in test_dataloader:
            X, y = X.to(device), y.to(device)
            pred_prob = model(X)
            loss += F.nll_loss(pred_prob, y, reduction='sum').item()  # loss summed across the batch
            pred = pred_prob.argmax(dim=1, keepdim=True)  # use argmax to get the most likely prediction
            success += pred.eq(y.view_as(pred)).sum().item()

    loss /= len(test_dataloader.dataset)
    
    accuracy = 100. * success / len(test_dataloader.dataset)

    print('\nTest dataset: Overall Loss: {:.4f}, Overall Accuracy: {}/{} ({:.0f}%)\n'.format(
        loss, success, len(test_dataloader.dataset), accuracy))
    
    return accuracy

## define optimizer and model training routine

In [7]:
def objective(trial):
    
    model = ConvNet(trial)
    opt_name = trial.suggest_categorical("optimizer", ["Adam", "Adadelta", "RMSprop", "SGD"])
    lr = trial.suggest_float("lr", 1e-1, 5e-1, log=True)
    optimizer = getattr(optim, opt_name)(model.parameters(), lr=lr)
    
    for epoch in range(1, 3):
        train(model, device, train_dataloader, optimizer, epoch)
        accuracy = test(model, device, test_dataloader)
        trial.report(accuracy, epoch)
        
        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()

    return accuracy

## run the hyperparameter search

In [8]:
study = optuna.create_study(study_name="mastering_pytorch", direction="maximize")
study.optimize(objective, n_trials=100, timeout=2000)

pruned_trials = [t for t in study.trials if t.state == optuna.trial.TrialState.PRUNED]
complete_trials = [t for t in study.trials if t.state == optuna.trial.TrialState.COMPLETE]

print("results: ")
print("num_trials_conducted: ", len(study.trials))
print("num_trials_pruned: ", len(pruned_trials))
print("num_trials_completed: ", len(complete_trials))

print("results from best trial:")
trial = study.best_trial

print("accuracy: ", trial.value)
print("hyperparameters: ")
for key, value in trial.params.items():
    print("{}: {}".format(key, value))

[32m[I 2024-02-16 02:15:30,029][0m A new study created in memory with name: mastering_pytorch[0m



Test dataset: Overall Loss: 2.3178, Overall Accuracy: 1135/10000 (11%)



[32m[I 2024-02-16 02:16:58,997][0m Trial 0 finished with value: 9.58 and parameters: {'num_conv_layers': 2, 'num_fc_layers': 1, 'conv_depth_0': 34, 'conv_depth_1': 41, 'conv_dropout_1': 0.15383558720767948, 'fc_output_feat_0': 62, 'fc_dropout_0': 0.15506861406996786, 'optimizer': 'Adam', 'lr': 0.20023572685997063}. Best is trial 0 with value: 9.58.[0m



Test dataset: Overall Loss: 2.3296, Overall Accuracy: 958/10000 (10%)


Test dataset: Overall Loss: 2.4947, Overall Accuracy: 1009/10000 (10%)



[32m[I 2024-02-16 02:19:10,390][0m Trial 1 finished with value: 9.8 and parameters: {'num_conv_layers': 3, 'num_fc_layers': 2, 'conv_depth_0': 29, 'conv_depth_1': 49, 'conv_depth_2': 47, 'conv_dropout_2': 0.278335463941041, 'fc_output_feat_0': 51, 'fc_dropout_0': 0.31004103413830064, 'fc_output_feat_1': 58, 'fc_dropout_1': 0.15485667529511574, 'optimizer': 'RMSprop', 'lr': 0.4780748567930119}. Best is trial 1 with value: 9.8.[0m



Test dataset: Overall Loss: 2.3665, Overall Accuracy: 980/10000 (10%)


Test dataset: Overall Loss: 2.3327, Overall Accuracy: 958/10000 (10%)



[32m[I 2024-02-16 02:41:10,990][0m Trial 2 finished with value: 9.82 and parameters: {'num_conv_layers': 4, 'num_fc_layers': 2, 'conv_depth_0': 34, 'conv_depth_1': 51, 'conv_depth_2': 28, 'conv_depth_3': 55, 'conv_dropout_3': 0.14208996846752817, 'fc_output_feat_0': 35, 'fc_dropout_0': 0.2660624056161214, 'fc_output_feat_1': 36, 'fc_dropout_1': 0.15326719427526703, 'optimizer': 'Adam', 'lr': 0.14119025099148208}. Best is trial 2 with value: 9.82.[0m



Test dataset: Overall Loss: 2.3158, Overall Accuracy: 982/10000 (10%)


Test dataset: Overall Loss: 2.3727, Overall Accuracy: 1010/10000 (10%)



[32m[I 2024-02-16 02:45:38,374][0m Trial 3 finished with value: 8.92 and parameters: {'num_conv_layers': 3, 'num_fc_layers': 2, 'conv_depth_0': 17, 'conv_depth_1': 29, 'conv_depth_2': 58, 'conv_dropout_2': 0.33864705908649817, 'fc_output_feat_0': 39, 'fc_dropout_0': 0.23750831217409174, 'fc_output_feat_1': 40, 'fc_dropout_1': 0.14041408480339376, 'optimizer': 'Adam', 'lr': 0.310858350845543}. Best is trial 2 with value: 9.82.[0m



Test dataset: Overall Loss: 2.3431, Overall Accuracy: 892/10000 (9%)


Test dataset: Overall Loss: 2.3197, Overall Accuracy: 1135/10000 (11%)



[32m[I 2024-02-16 02:49:20,214][0m Trial 4 finished with value: 10.32 and parameters: {'num_conv_layers': 2, 'num_fc_layers': 1, 'conv_depth_0': 26, 'conv_depth_1': 35, 'conv_dropout_1': 0.19941309294192455, 'fc_output_feat_0': 59, 'fc_dropout_0': 0.2574163114292745, 'optimizer': 'RMSprop', 'lr': 0.14980606145925923}. Best is trial 4 with value: 10.32.[0m



Test dataset: Overall Loss: 2.3201, Overall Accuracy: 1032/10000 (10%)

results: 
num_trials_conducted:  5
num_trials_pruned:  0
num_trials_completed:  5
results from best trial:
accuracy:  10.32
hyperparameters: 
num_conv_layers: 2
num_fc_layers: 1
conv_depth_0: 26
conv_depth_1: 35
conv_dropout_1: 0.19941309294192455
fc_output_feat_0: 59
fc_dropout_0: 0.2574163114292745
optimizer: RMSprop
lr: 0.14980606145925923
