In [1]:
from collections import OrderedDict
from dataclasses import dataclass
import os.path as path

import torch as t
import torch.nn.functional as F
import torchvision as tv

import numpy as np

from haikunator import Haikunator

import mlflow
import mlflow.pytorch

from ax.service.managed_loop import optimize

In [2]:
# Notebook-wide settings: MLflow experiment label, dataset cache root, compute device.
EXPERIMENT_NAME = "FashionAutoTune"
DATAROOT = path.expanduser("~/mldata/pytorch")
# Use CUDA when PyTorch reports it as available, otherwise fall back to the CPU.
DEVICE = t.device("cuda") if t.cuda.is_available() else t.device("cpu")
# Bare expression so the cell displays which device was selected.
DEVICE

device(type='cuda')

In [3]:
# Send MLflow tracking data to a local "mlruns" directory under the user's home
# directory ("~" is expanded here rather than left for MLflow to interpret).
mlruns_dir = path.expanduser("~/mlruns")
mlflow.set_tracking_uri(mlruns_dir)

In [4]:
# Per-image preprocessing: convert to a tensor, then normalize the single
# channel with mean 0.5 and std 0.5.
xform = tv.transforms.Compose([
    tv.transforms.ToTensor(),
    tv.transforms.Normalize((0.5,), (0.5,))
])
datapath = path.join(DATAROOT, "fashion-mnist")

# The official training split is divided 80/20 into train and validation subsets.
train_val_set = tv.datasets.FashionMNIST(datapath, download=True, train=True, transform=xform)
train_size = int(len(train_val_set) * 0.8)
val_size = len(train_val_set) - train_size

# Seed the split with a dedicated generator so the validation subset contains
# the same samples after every kernel restart; otherwise validation accuracies
# from different sessions are measured on different data and are not comparable.
# A private generator leaves the global RNG (weight init, shuffling) untouched.
SPLIT_SEED = 0
trainset, valset = t.utils.data.random_split(
    train_val_set,
    [train_size, val_size],
    generator=t.Generator().manual_seed(SPLIT_SEED),
)

# Held-out official test split (not used by the cells visible in this notebook section).
testset = tv.datasets.FashionMNIST(datapath, download=True, train=False, transform=xform)

In [5]:
def create_model():
    """Build a freshly initialized fully connected classifier.

    The network flattens its input, applies three Linear+ReLU stages
    (784 -> 128 -> 64 -> 32) and ends with a 10-way linear layer named
    "logits". No softmax is applied.

    Returns:
        A ``torch.nn.Sequential`` whose children are named ``flatten``,
        ``fc1``, ``relu1``, ``fc2``, ``relu2``, ``fc3``, ``relu3``, ``logits``.
    """
    net = t.nn.Sequential()
    # Layers are registered one by one, in the order they are applied.
    net.add_module("flatten", t.nn.Flatten())
    net.add_module("fc1", t.nn.Linear(784, 128))
    net.add_module("relu1", t.nn.ReLU())
    net.add_module("fc2", t.nn.Linear(128, 64))
    net.add_module("relu2", t.nn.ReLU())
    net.add_module("fc3", t.nn.Linear(64, 32))
    net.add_module("relu3", t.nn.ReLU())
    net.add_module("logits", t.nn.Linear(32, 10))
    return net

In [6]:
def accuracy(outputs, targets):
    """Return the fraction of rows whose highest-scoring column equals the target.

    Args:
        outputs: tensor of per-class scores; the arg-max is taken along dim 1.
        targets: tensor of expected class indices, one per row of ``outputs``.

    Returns:
        A Python float: number of matching predictions divided by the number
        of targets.

    Raises:
        AssertionError: if the two tensors differ in their first dimension.
    """
    n_outputs, n_targets = outputs.shape[0], targets.shape[0]
    assert n_outputs == n_targets
    hits = (outputs.argmax(dim=1) == targets).sum().item()
    return hits / n_targets

In [7]:
@dataclass
class Hyperparams:
    """Tunable settings for a single training run.

    Attributes:
        batch_size: number of samples per training mini-batch.
        epochs: number of passes over the training set.
        learning_rate: step size handed to the optimizer.
    """
    batch_size: int = 10
    epochs: int = 10
    learning_rate: float = 0.0001

    def to_dict(self):
        """Return the hyperparameters as a plain dict suitable for logging.

        The learning rate is reported exactly, as a built-in float. Rounding
        it to three decimals would turn every value below 0.0005 (including
        the default 0.0001) into 0.0 and make logged runs indistinguishable.
        """
        return {
            "batch_size": self.batch_size,
            "epochs": self.epochs,
            "learning_rate": float(self.learning_rate),
        }

In [8]:
def train(model, optim, loss_fn, epochs, trainloader, valloader, hparams):
    """Train ``model`` inside one MLflow run and return its final validation accuracy.

    Each epoch runs one optimization pass over ``trainloader`` and then
    evaluates on ``valloader``; the validation accuracy (rounded to two
    decimals) is logged to MLflow as ``val_acc`` with the epoch index as step.

    Args:
        model: the network to train; it is moved to ``DEVICE``.
        optim: optimizer already bound to ``model``'s parameters.
        loss_fn: callable taking ``(outputs, targets)`` and returning the loss.
        epochs: number of training epochs; must be at least 1.
        trainloader: iterable of ``(images, targets)`` training batches.
        valloader: iterable of ``(images, targets)`` validation batches.
        hparams: object whose ``to_dict()`` result is logged as the run's params.

    Returns:
        The unrounded validation accuracy measured after the last epoch.

    Raises:
        ValueError: if ``epochs`` is less than 1, since no validation
            accuracy would exist to return.
    """
    # Fail before opening an MLflow run: with zero epochs the loop body never
    # executes and there is no accuracy to report.
    if epochs < 1:
        raise ValueError(f"epochs must be at least 1, got {epochs}")

    run_name = Haikunator().haikunate()
    print(f"Starting run {run_name}")
    model = model.to(DEVICE)

    with mlflow.start_run(run_name=run_name):
        mlflow.log_params(hparams.to_dict())
        for epoch in range(epochs):
            # Process the training set
            model.train()
            with t.enable_grad():
                for images, targets in trainloader:
                    images = images.to(DEVICE)
                    targets = targets.to(DEVICE)

                    optim.zero_grad()
                    # Call the module rather than .forward() so registered hooks run.
                    outputs = model(images)
                    loss = loss_fn(outputs, targets)
                    loss.backward()
                    optim.step()

            # Calculate the validation metrics for this epoch. Batches are
            # collected in lists and concatenated once, which avoids re-copying
            # the accumulated tensor on every batch and does not assume a
            # particular number of output classes.
            batch_outputs = []
            batch_targets = []
            model.eval()
            with t.no_grad():
                for images, targets in valloader:
                    batch_outputs.append(model(images.to(DEVICE)))
                    batch_targets.append(targets.to(DEVICE))
            val_acc = accuracy(t.cat(batch_outputs), t.cat(batch_targets))
            mlflow.log_metric("val_acc", np.around(val_acc, 2), step=epoch)
    return val_acc  # this is the final val_acc of the last epoch

In [9]:
def train_evaluate(hparams):
    """Train a fresh model with the given hyperparameters and report its accuracy.

    Args:
        hparams: mapping with the keyword arguments of ``Hyperparams``
            (``batch_size``, ``epochs``, ``learning_rate``).

    Returns:
        ``{"accuracy": (val_acc, 0.0)}`` where ``val_acc`` is the value
        returned by ``train`` for the last epoch.
    """
    config = Hyperparams(**hparams)
    net = create_model()
    loaders = {
        "train": t.utils.data.DataLoader(trainset, batch_size=config.batch_size, shuffle=True),
        # Validation batches are large and unshuffled; only the overall accuracy is used.
        "val": t.utils.data.DataLoader(valset, batch_size=5000),
    }
    final_val_acc = train(
        net,
        t.optim.SGD(net.parameters(), lr=config.learning_rate),
        t.nn.CrossEntropyLoss(),
        config.epochs,
        loaders["train"],
        loaders["val"],
        config,
    )
    # NOTE(review): the second tuple element is presumably read by Ax as the
    # standard error of the measurement; 0.0 would claim a noise-free result
    # even though training is stochastic -- confirm this is intended.
    return {"accuracy": (final_val_acc, 0.0)}

In [10]:
# All runs started by train() from here on are recorded under this MLflow experiment.
mlflow.set_experiment(EXPERIMENT_NAME)

# Search space handed to Ax. The names match the fields of Hyperparams because
# train_evaluate() unpacks each sampled point with Hyperparams(**hparams).
hparams = [
    # One of three fixed mini-batch sizes.
    {"name": "batch_size", "type": "choice", "value_type": "int", "values": [16, 32, 64]},
    # Integer number of epochs within the given bounds.
    {"name": "epochs", "type": "range", "value_type": "int", "bounds": [7, 13]},
    # Learning rate searched on a log scale because the bounds span several orders of magnitude.
    {"name": "learning_rate", "type": "range", "bounds": [1e-6, 0.4], "log_scale": True}
]

# Every trial calls train_evaluate(), i.e. trains a model from scratch, so this
# cell is slow (the recorded log below shows a few minutes per trial).
# NOTE(review): no `minimize` argument is passed; presumably Ax maximizes the
# objective by default -- confirm against the installed Ax version.
best_params, values, experiment, model = optimize(
    hparams, 
    evaluation_function=train_evaluate, 
    objective_name="accuracy",
    total_trials=5  # default is 20
)

[INFO 03-20 19:07:13] ax.modelbridge.dispatch_utils: Using Sobol generation strategy.
[INFO 03-20 19:07:13] ax.service.managed_loop: Started full optimization with 5 steps.
[INFO 03-20 19:07:13] ax.service.managed_loop: Running optimization trial 1...


Starting run plain-night-9958


[INFO 03-20 19:09:53] ax.service.managed_loop: Running optimization trial 2...


Starting run green-silence-3849


[INFO 03-20 19:12:56] ax.service.managed_loop: Running optimization trial 3...


Starting run still-wind-1821


[INFO 03-20 19:15:20] ax.service.managed_loop: Running optimization trial 4...


Starting run fragrant-bird-8937


[INFO 03-20 19:16:59] ax.service.managed_loop: Running optimization trial 5...


Starting run quiet-river-9063


In [11]:
# Display the hyperparameter assignment that optimize() returned as best.
best_params

{'epochs': 12, 'learning_rate': 0.0020909231008650105, 'batch_size': 32}

In [12]:
# Display the objective values optimize() returned alongside best_params
# (in the recorded output: a pair of dicts keyed by the "accuracy" objective).
values

({'accuracy': 0.83475}, {'accuracy': {'accuracy': 0.0}})