In [27]:
import os
import logging
import math
from filelock import FileLock

# __import_tune_begin__
from pytorch_lightning.loggers import TensorBoardLogger
from ray import tune
from ray.tune import CLIReporter, JupyterNotebookReporter
from ray.tune.schedulers import PopulationBasedTraining
from ray.tune.integration.pytorch_lightning import TuneReportCallback, TuneReportCheckpointCallback
# __import_tune_end__


from LightningMNISTClassifier import LightningMNISTClassifier

In [6]:
# Set the root logging level to INFO, then grab the notebook's named logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("App")

In [8]:
def train_mnist_tune_checkpoint(config,
                                checkpoint_dir=None,
                                num_epochs=10,
                                num_gpus=0):
    """Ray Tune trainable: fit a ``LightningMNISTClassifier`` for one trial.

    A ``TuneReportCheckpointCallback`` is attached to the trainer; it is
    configured to fire on ``validation_end``, to report ``ptl/val_loss`` as
    ``loss`` and ``ptl/val_accuracy`` as ``mean_accuracy``, and to write a
    checkpoint file named ``"checkpoint"``.

    Args:
        config: Trial hyperparameters. Forwarded to the model and must contain
            the key ``"progress_bar_refresh_rate"``, which is read for the trainer.
        checkpoint_dir: Directory that contains a file named ``"checkpoint"``
            to restore the model from. When falsy, a fresh model is created.
        num_epochs: Passed to the trainer as ``max_epochs``.
        num_gpus: GPU count for the trainer; a fractional value is rounded up.

    Returns:
        None. Results are surfaced through the Tune callback, not a return value.
    """
    # The notebook's import cell never binds `pl` (only
    # `pytorch_lightning.loggers` is imported there), so on a fresh kernel
    # `pl.Trainer` raised NameError. Import it here, where it is used.
    import pytorch_lightning as pl

    data_dir = os.path.expanduser("~/data")

    report_and_checkpoint = TuneReportCheckpointCallback(
        metrics={
            "loss": "ptl/val_loss",
            "mean_accuracy": "ptl/val_accuracy"
        },
        filename="checkpoint",
        on="validation_end"
    )

    trainer = pl.Trainer(
        max_epochs=num_epochs,
        # If fractional GPUs passed in, convert to int.
        gpus=math.ceil(num_gpus),
        logger=TensorBoardLogger(save_dir=tune.get_trial_dir(), name="", version="."),
        progress_bar_refresh_rate=config["progress_bar_refresh_rate"],
        num_sanity_val_steps=0,
        callbacks=[report_and_checkpoint]
    )

    if checkpoint_dir:
        # Restore from the file written by the callback above (same filename).
        model = LightningMNISTClassifier.load_from_checkpoint(
            os.path.join(checkpoint_dir, "checkpoint"),
            config=config,
            data_dir=data_dir)
        logger.info('Lightning loaded from checkpoint')
    else:
        model = LightningMNISTClassifier(config=config, data_dir=data_dir)
        logger.info('Lightning initialized')

    trainer.fit(model)

In [30]:
def tune_mnist_pbt(num_samples=20, num_epochs=10, gpus_per_trial=0):
    """Run a Population Based Training search over the MNIST classifier.

    Args:
        num_samples: Number of trials, passed to ``tune.run`` as ``num_samples``.
        num_epochs: Forwarded to ``train_mnist_tune_checkpoint`` as ``num_epochs``.
        gpus_per_trial: GPU share requested per trial (may be fractional). Used
            both as the trial's ``"gpu"`` resource and as the trainable's
            ``num_gpus``.

    Returns:
        The analysis object returned by ``tune.run``. Its ``best_config`` is
        also printed before returning.
    """
    # One shared option list for layer widths and batch size.
    size_options = [32, 64, 128, 256, 512, 1024]

    def explore(config):
        """Custom PBT explore hook: log the config, then add 10 to ``batch_size``."""
        logger.info("======================================= EXPLORE =========================================")
        logger.info(config)
        config['batch_size'] = config['batch_size'] + 10
        return config

    scheduler = PopulationBasedTraining(
        time_attr="training_iteration",
        # Trials are considered for perturbation every `perturbation_interval`
        # units of `time_attr`, i.e. after every training iteration here.
        perturbation_interval=1,
        hyperparam_mutations={
            "batch_size": tune.choice(size_options),
        },
        custom_explore_fn=explore,
        log_config=True
    )

    # overwrite=False is passed through to the notebook reporter as in the
    # original call; the column lists select what the status table shows.
    reporter_jupyter = JupyterNotebookReporter(
        overwrite=False,
        parameter_columns=["layer_1_size", "layer_2_size", "lr", "batch_size"],
        metric_columns=["loss", "mean_accuracy", "training_iteration"]
    )

    analysis = tune.run(
        tune.with_parameters(
            train_mnist_tune_checkpoint,
            num_epochs=num_epochs,
            num_gpus=gpus_per_trial),
        resources_per_trial={
            "cpu": 1,
            "gpu": gpus_per_trial
        },
        metric="loss",
        mode="min",
        config={
            "progress_bar_refresh_rate": 0,
            "layer_1_size": tune.choice(size_options),
            "layer_2_size": tune.choice(size_options),
            "lr": tune.choice([1e-2, 1e-3, 1e-4, 1e-5, 1e-6]),
            "batch_size": tune.choice(size_options),
        },
        num_samples=num_samples,
        scheduler=scheduler,
        progress_reporter=reporter_jupyter,
        verbose=1,
        name="tune_mnist_pbt",
        stop={  # Stop a single trial if one of the conditions are met
            "mean_accuracy": 0.98,
            "training_iteration": 15},
        local_dir="./data",
    )

    print("Best hyperparameters found were: ", analysis.best_config)
    return analysis

In [31]:
# Launch a small PBT experiment: 5 trials, at most 5 epochs per fit,
# and 0.2 GPU requested per trial.
analysis = tune_mnist_pbt(num_samples=5, num_epochs=5, gpus_per_trial=0.2)

# Dump the per-trial result dicts held by the returned analysis object.
print(analysis.results)

[2m[36m(pid=63246)[0m GPU available: True, used: True
[2m[36m(pid=63246)[0m TPU available: False, using: 0 TPU cores
[2m[36m(pid=63246)[0m IPU available: False, using: 0 IPUs
[2m[36m(pid=63246)[0m   return torch.from_numpy(parsed.astype(m[2], copy=False)).view(*s)
[2m[36m(pid=63246)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
[2m[36m(pid=63248)[0m GPU available: True, used: True
[2m[36m(pid=63248)[0m TPU available: False, using: 0 TPU cores
[2m[36m(pid=63248)[0m IPU available: False, using: 0 IPUs
[2m[36m(pid=63248)[0m   return torch.from_numpy(parsed.astype(m[2], copy=False)).view(*s)
[2m[36m(pid=63248)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
[2m[36m(pid=63334)[0m GPU available: True, used: True
[2m[36m(pid=63334)[0m TPU available: False, using: 0 TPU cores
[2m[36m(pid=63334)[0m IPU available: False, using: 0 IPUs
[2m[36m(pid=63334)[0m   return torch.from_numpy(parsed.astype(m[2], copy=False)).view(*s)
[2m[36m(pid=63334)[0m LOCAL_RANK

[2m[36m(pid=63246)[0m   rank_zero_deprecation(
[2m[36m(pid=64607)[0m   rank_zero_deprecation(
[2m[36m(pid=63248)[0m   rank_zero_deprecation(
[2m[36m(pid=63334)[0m   rank_zero_deprecation(


2021-09-26 18:59:03,689	INFO pbt.py:540 -- [exploit] transferring weights from trial train_mnist_tune_checkpoint_9e4dd_00000 (score -0.1713031381368637) -> train_mnist_tune_checkpoint_9e4dd_00001 (score -1.9344497919082642)
INFO:App:{'progress_bar_refresh_rate': 0, 'layer_1_size': 64, 'layer_2_size': 1024, 'lr': 0.001, 'batch_size': 128}
2021-09-26 18:59:03,694	INFO pbt.py:557 -- [explore] perturbed config from {'batch_size': 256} -> {'batch_size': 138}
[2m[36m(pid=63246)[0m 2021-09-26 18:59:03,706	INFO trainable.py:382 -- Restored on 192.168.1.23 from checkpoint: /home/akaver/!Dev/pbt-demo-mnist/data/tune_mnist_pbt/train_mnist_tune_checkpoint_9e4dd_00001_1_batch_size=256,layer_1_size=64,layer_2_size=64,lr=1e-05_2021-09-26_18-58-41/checkpoint_tmpa3b2ab/./
[2m[36m(pid=63246)[0m 2021-09-26 18:59:03,706	INFO trainable.py:390 -- Current state after restoring: {'_iteration': 1, '_timesteps_total': None, '_time_total': 13.041486740112305, '_episodes_total': None}
[2m[36m(pid=63246)[

[2m[36m(pid=64606)[0m   rank_zero_deprecation(
2021-09-26 18:59:14,040	INFO pbt.py:540 -- [exploit] transferring weights from trial train_mnist_tune_checkpoint_9e4dd_00000 (score -0.10354693233966827) -> train_mnist_tune_checkpoint_9e4dd_00003 (score -1.6637017726898193)
INFO:App:{'progress_bar_refresh_rate': 0, 'layer_1_size': 64, 'layer_2_size': 1024, 'lr': 0.001, 'batch_size': 204}
2021-09-26 18:59:14,046	INFO pbt.py:557 -- [explore] perturbed config from {'batch_size': 256} -> {'batch_size': 214}
[2m[36m(pid=64606)[0m 2021-09-26 18:59:14,069	INFO trainable.py:382 -- Restored on 192.168.1.23 from checkpoint: /home/akaver/!Dev/pbt-demo-mnist/data/tune_mnist_pbt/train_mnist_tune_checkpoint_9e4dd_00003_3_batch_size=32,layer_1_size=1024,layer_2_size=1024,lr=1e-06_2021-09-26_18-58-41/checkpoint_tmpd3dda0/./
[2m[36m(pid=64606)[0m 2021-09-26 18:59:14,069	INFO trainable.py:390 -- Current state after restoring: {'_iteration': 3, '_timesteps_total': None, '_time_total': 29.9422636032

[2m[36m(pid=64607)[0m 2021-09-26 18:59:19,901	INFO trainable.py:382 -- Restored on 192.168.1.23 from checkpoint: /home/akaver/!Dev/pbt-demo-mnist/data/tune_mnist_pbt/train_mnist_tune_checkpoint_9e4dd_00004_4_batch_size=512,layer_1_size=512,layer_2_size=128,lr=1e-05_2021-09-26_18-58-41/checkpoint_tmp2e8848/./
[2m[36m(pid=64607)[0m 2021-09-26 18:59:19,901	INFO trainable.py:390 -- Current state after restoring: {'_iteration': 3, '_timesteps_total': None, '_time_total': 29.94226360321045, '_episodes_total': None}
[2m[36m(pid=64607)[0m GPU available: True, used: True
[2m[36m(pid=64607)[0m TPU available: False, using: 0 TPU cores
[2m[36m(pid=64607)[0m IPU available: False, using: 0 IPUs
[2m[36m(pid=64607)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
[2m[36m(pid=64607)[0m 
[2m[36m(pid=64607)[0m   | Name    | Type   | Params
[2m[36m(pid=64607)[0m -----------------------------------
[2m[36m(pid=64607)[0m 0 | layer_1 | Linear | 50.2 K
[2m[36m(pid=64607)[0m 1 | la

2021-09-26 18:59:31,134	INFO pbt.py:540 -- [exploit] transferring weights from trial train_mnist_tune_checkpoint_9e4dd_00004 (score -0.07100281864404678) -> train_mnist_tune_checkpoint_9e4dd_00002 (score -0.17781594395637512)
INFO:App:{'progress_bar_refresh_rate': 0, 'layer_1_size': 64, 'layer_2_size': 1024, 'lr': 0.001, 'batch_size': 256}
2021-09-26 18:59:31,137	INFO pbt.py:557 -- [explore] perturbed config from {'batch_size': 214} -> {'batch_size': 266}
[2m[36m(pid=63248)[0m 2021-09-26 18:59:31,146	INFO trainable.py:382 -- Restored on 192.168.1.23 from checkpoint: /home/akaver/!Dev/pbt-demo-mnist/data/tune_mnist_pbt/train_mnist_tune_checkpoint_9e4dd_00002_2_batch_size=256,layer_1_size=64,layer_2_size=256,lr=0.01_2021-09-26_18-58-41/checkpoint_tmp1d56ca/./
[2m[36m(pid=63248)[0m 2021-09-26 18:59:31,146	INFO trainable.py:390 -- Current state after restoring: {'_iteration': 4, '_timesteps_total': None, '_time_total': 39.39965867996216, '_episodes_total': None}
[2m[36m(pid=63248)

2021-09-26 18:59:39,893	INFO pbt.py:540 -- [exploit] transferring weights from trial train_mnist_tune_checkpoint_9e4dd_00002 (score -0.0741184800863266) -> train_mnist_tune_checkpoint_9e4dd_00003 (score -0.1259935200214386)
INFO:App:{'progress_bar_refresh_rate': 0, 'layer_1_size': 64, 'layer_2_size': 1024, 'lr': 0.001, 'batch_size': 319}
2021-09-26 18:59:39,895	INFO pbt.py:557 -- [explore] perturbed config from {'batch_size': 266} -> {'batch_size': 329}
[2m[36m(pid=64606)[0m 2021-09-26 18:59:39,904	INFO trainable.py:382 -- Restored on 192.168.1.23 from checkpoint: /home/akaver/!Dev/pbt-demo-mnist/data/tune_mnist_pbt/train_mnist_tune_checkpoint_9e4dd_00003_3_batch_size=32,layer_1_size=1024,layer_2_size=1024,lr=1e-06_2021-09-26_18-58-41/checkpoint_tmpbdf497/./
[2m[36m(pid=64606)[0m 2021-09-26 18:59:39,904	INFO trainable.py:390 -- Current state after restoring: {'_iteration': 5, '_timesteps_total': None, '_time_total': 46.93543744087219, '_episodes_total': None}
[2m[36m(pid=64606)

2021-09-26 18:59:48,077	INFO pbt.py:540 -- [exploit] transferring weights from trial train_mnist_tune_checkpoint_9e4dd_00002 (score -0.07930481433868408) -> train_mnist_tune_checkpoint_9e4dd_00001 (score -0.11694955825805664)
INFO:App:{'progress_bar_refresh_rate': 0, 'layer_1_size': 64, 'layer_2_size': 1024, 'lr': 0.001, 'batch_size': 319}
2021-09-26 18:59:48,078	INFO pbt.py:557 -- [explore] perturbed config from {'batch_size': 266} -> {'batch_size': 329}
[2m[36m(pid=63246)[0m 2021-09-26 18:59:48,084	INFO trainable.py:382 -- Restored on 192.168.1.23 from checkpoint: /home/akaver/!Dev/pbt-demo-mnist/data/tune_mnist_pbt/train_mnist_tune_checkpoint_9e4dd_00001_1_batch_size=256,layer_1_size=64,layer_2_size=64,lr=1e-05_2021-09-26_18-58-41/checkpoint_tmpa6ce6d/./
[2m[36m(pid=63246)[0m 2021-09-26 18:59:48,084	INFO trainable.py:390 -- Current state after restoring: {'_iteration': 6, '_timesteps_total': None, '_time_total': 54.66638445854187, '_episodes_total': None}
[2m[36m(pid=63246)

2021-09-26 18:59:56,641	INFO pbt.py:489 -- [pbt]: no checkpoint for trial. Skip exploit for Trial train_mnist_tune_checkpoint_9e4dd_00004


2021-09-26 19:00:01,777	INFO tune.py:550 -- Total run time: 80.89 seconds (80.76 seconds for the tuning loop).


Best hyperparameters found were:  {'progress_bar_refresh_rate': 0, 'layer_1_size': 64, 'layer_2_size': 1024, 'lr': 0.001, 'batch_size': 329}
{'9e4dd_00000': {'loss': 0.0987594947218895, 'mean_accuracy': 0.9701861143112183, 'time_this_iter_s': 9.433017492294312, 'should_checkpoint': True, 'done': True, 'timesteps_total': None, 'episodes_total': None, 'training_iteration': 5, 'experiment_id': 'eb2908eb5f734574af91336783829850', 'date': '2021-09-26_18-59-31', 'timestamp': 1632671971, 'time_total_s': 48.29545521736145, 'pid': 63334, 'hostname': 'ml-linux', 'node_ip': '192.168.1.23', 'config': {'progress_bar_refresh_rate': 0, 'layer_1_size': 64, 'layer_2_size': 1024, 'lr': 0.001, 'batch_size': 256}, 'time_since_restore': 48.29545521736145, 'timesteps_since_restore': 0, 'iterations_since_restore': 5, 'trial_id': '9e4dd_00000', 'experiment_tag': '0_batch_size=256,layer_1_size=64,layer_2_size=1024,lr=0.001'}, '9e4dd_00001': {'loss': 0.04850537329912186, 'mean_accuracy': 0.982487678527832, 'tim