In [1]:
import os
import logging
import math
from filelock import FileLock
import random

# __import_lightning_begin__
import torch
import pytorch_lightning as pl
from torch.utils.data import DataLoader, random_split
from torch.nn import functional as F
from torchvision.datasets import FashionMNIST
from torchvision import transforms
# __import_lightning_end__

# __import_tune_begin__
from pytorch_lightning.loggers import TensorBoardLogger
from ray import tune
from ray.tune import CLIReporter, JupyterNotebookReporter
from ray.tune.schedulers import PopulationBasedTraining
from ray.tune.integration.pytorch_lightning import TuneReportCallback, TuneReportCheckpointCallback
# __import_tune_end__

from FashionMNISTLightningDataModule import FashionMNISTLightningDataModule
from augmentation.augmentation import TRANSFORM_NAMES
from FashionMNISTLightningModule import FashionMNISTLightningModule


In [2]:
# Set the root logger to INFO first, then create the named logger that the
# training and tuning functions in this notebook write to.
logging.basicConfig(level=logging.INFO)
log = logging.getLogger('App')

In [3]:
def train_mnist_tune_checkpoint(config,
                                checkpoint_dir=None,
                                num_epochs=999,
                                num_gpus=0):
    """Trainable for Ray Tune: fit a FashionMNIST Lightning model for one trial.

    A ``TuneReportCheckpointCallback`` fires on ``validation_end``; it reports
    ``ptl/val_loss`` as ``loss`` and ``ptl/val_acc`` as ``mean_accuracy`` and
    saves a checkpoint named ``checkpoint``.

    Args:
        config: Trial configuration dict. This function reads ``"data_dir"``
            and ``"progress_bar_refresh_rate"``; the whole dict is also passed
            as ``conf`` to the Lightning module and data module.
        checkpoint_dir: Directory holding a file named ``checkpoint`` to load
            the model from. When falsy, a new model is built instead.
        num_epochs: Upper bound handed to ``pl.Trainer(max_epochs=...)``. The
            default is deliberately huge; the intent is that Tune, which gets a
            report after every validation pass, decides when a trial stops.
        num_gpus: GPU share for the trial; rounded up with ``math.ceil``
            because the Trainer is given an integer GPU count.
    """
    # Not used further in this function, but a config without "data_dir"
    # fails right here with a KeyError.
    data_dir = config["data_dir"]

    # Trainer arguments, built in the same order the Trainer call lists them.
    gpu_count = math.ceil(num_gpus)
    tb_logger = TensorBoardLogger(save_dir=tune.get_trial_dir(), name="", version=".")
    refresh_rate = config["progress_bar_refresh_rate"]
    report_and_checkpoint = TuneReportCheckpointCallback(
        metrics={
            "loss": "ptl/val_loss",
            "mean_accuracy": "ptl/val_acc"
        },
        filename="checkpoint",
        on="validation_end"
    )

    trainer = pl.Trainer(
        max_epochs=num_epochs,
        gpus=gpu_count,
        logger=tb_logger,
        progress_bar_refresh_rate=refresh_rate,
        num_sanity_val_steps=0,
        callbacks=[report_and_checkpoint]
    )

    if not checkpoint_dir:
        model = FashionMNISTLightningModule(conf=config)
        log.info('Lightning initialized')
    else:
        checkpoint_path = os.path.join(checkpoint_dir, "checkpoint")
        model = FashionMNISTLightningModule.load_from_checkpoint(checkpoint_path, conf=config)
        log.info('Lightning loaded from checkpoint')

    datamodule = FashionMNISTLightningDataModule(conf=config)

    trainer.fit(model, datamodule)

In [4]:
def tune_mnist_pbt(num_samples=15, training_iteration=15, gpus_per_trial=0):
    """Run a Population Based Training search over augmentation magnitudes.

    The model/optimizer settings in the config are fixed; the only value that
    changes between perturbations is ``config["augmentations"]``, a list of
    ``(transform name, magnitude)`` tuples with one entry per name in
    ``TRANSFORM_NAMES``.

    Args:
        num_samples: Forwarded to ``tune.run(num_samples=...)``.
        training_iteration: A trial is stopped once it has reported this many
            training iterations (or reaches 0.95 ``mean_accuracy``).
        gpus_per_trial: GPU share requested per trial; also forwarded to the
            trainable as ``num_gpus``.

    Returns:
        The analysis object returned by ``tune.run``.
    """
    def sample_augmentations():
        """Draw a new magnitude from ``random.random()`` for every transform."""
        return [(tfn_name, random.random()) for tfn_name in TRANSFORM_NAMES]

    def explore(config):
        """PBT explore hook: replace the augmentation magnitudes with new draws."""
        log.info("======================================= EXPLORE =========================================")
        config["augmentations"] = sample_augmentations()
        log.info(config)
        return config

    scheduler = PopulationBasedTraining(
        time_attr="training_iteration",
        # Trials are considered for perturbation every `perturbation_interval`
        # units of time_attr, i.e. here after every training iteration.
        perturbation_interval=1,
        custom_explore_fn=explore,
        log_config=True
    )

    reporter_jupyter = JupyterNotebookReporter(
        overwrite=True,
        parameter_columns=["augmentations"],
        metric_columns=["loss", "mean_accuracy", "training_iteration"]
    )

    # Search space kept for reference from an earlier setup (not active):
    #     "layer_1_size": tune.choice([32, 64, 128, 256, 512, 1024]),
    #     "layer_2_size": tune.choice([32, 64, 128, 256, 512, 1024]),
    #     "lr": tune.choice([1e-2, 1e-3, 1e-4, 1e-5, 1e-6]),
    #     "batch_size": tune.choice([32, 64, 128, 256, 512, 1024, 2048]),

    # NOTE(review): the augmentations are drawn once, so every trial starts
    # from the same magnitudes and only diverges through `explore` — confirm
    # that an identical initial population is intended.
    config = {
        # https://docs.ray.io/en/master/tune/api_docs/search_space.html?highlight=tune.choice#
        "progress_bar_refresh_rate": 0,
        "layer_1_size": 512,
        "layer_2_size": 512,
        "lr": 0.00005,
        "batch_size": 1024,
        "data_dir": "./data",
        "data_mean": 0.28604063391685486,
        "data_std": 0.35302430391311646,
        "augmentations": sample_augmentations(),
    }

    analysis = tune.run(
        tune.with_parameters(
            train_mnist_tune_checkpoint,
            num_gpus=gpus_per_trial),
        resources_per_trial={
            "cpu": 1 / 16,
            "gpu": gpus_per_trial
        },
        metric="loss",
        mode="min",
        config=config,
        num_samples=num_samples,
        scheduler=scheduler,
        progress_reporter=reporter_jupyter,
        verbose=1,
        name="FashionMNIST-pbt",
        stop={  # Stop a single trial as soon as either condition is met
            "mean_accuracy": 0.95,
            "training_iteration": training_iteration},
        local_dir="./data",
    )

    print("Best hyperparameters found were: ", analysis.best_config)
    return analysis

In [5]:
# Launch the PBT search: 30 trials, stopped after at most 15 training
# iterations each, with every trial requesting 1/8 of a GPU.
analysis = tune_mnist_pbt(
    num_samples=30,
    training_iteration=15,
    gpus_per_trial=1 / 8,
)
analysis.best_config
# Only the last expression of the cell is rendered as its output.
analysis.results

Trial name,status,loc,augmentations
train_mnist_tune_checkpoint_525f7_00000,PENDING,,"[('blur', 0.8293034766867767), ('rotate_left', 0.023870989116744457), ('rotate_right', 0.10642285066768153)]"
train_mnist_tune_checkpoint_525f7_00001,PENDING,,"[('blur', 0.8293034766867767), ('rotate_left', 0.023870989116744457), ('rotate_right', 0.10642285066768153)]"
train_mnist_tune_checkpoint_525f7_00002,PENDING,,"[('blur', 0.8293034766867767), ('rotate_left', 0.023870989116744457), ('rotate_right', 0.10642285066768153)]"
train_mnist_tune_checkpoint_525f7_00003,PENDING,,"[('blur', 0.8293034766867767), ('rotate_left', 0.023870989116744457), ('rotate_right', 0.10642285066768153)]"
train_mnist_tune_checkpoint_525f7_00004,PENDING,,"[('blur', 0.8293034766867767), ('rotate_left', 0.023870989116744457), ('rotate_right', 0.10642285066768153)]"
train_mnist_tune_checkpoint_525f7_00005,PENDING,,"[('blur', 0.8293034766867767), ('rotate_left', 0.023870989116744457), ('rotate_right', 0.10642285066768153)]"
train_mnist_tune_checkpoint_525f7_00006,PENDING,,"[('blur', 0.8293034766867767), ('rotate_left', 0.023870989116744457), ('rotate_right', 0.10642285066768153)]"
train_mnist_tune_checkpoint_525f7_00007,PENDING,,"[('blur', 0.8293034766867767), ('rotate_left', 0.023870989116744457), ('rotate_right', 0.10642285066768153)]"
train_mnist_tune_checkpoint_525f7_00008,PENDING,,"[('blur', 0.8293034766867767), ('rotate_left', 0.023870989116744457), ('rotate_right', 0.10642285066768153)]"
train_mnist_tune_checkpoint_525f7_00009,PENDING,,"[('blur', 0.8293034766867767), ('rotate_left', 0.023870989116744457), ('rotate_right', 0.10642285066768153)]"


2021-12-11 23:30:47,088	ERROR tune.py:626 -- Trials did not complete: [train_mnist_tune_checkpoint_525f7_00000, train_mnist_tune_checkpoint_525f7_00001, train_mnist_tune_checkpoint_525f7_00002, train_mnist_tune_checkpoint_525f7_00003, train_mnist_tune_checkpoint_525f7_00004, train_mnist_tune_checkpoint_525f7_00005, train_mnist_tune_checkpoint_525f7_00006, train_mnist_tune_checkpoint_525f7_00007, train_mnist_tune_checkpoint_525f7_00008, train_mnist_tune_checkpoint_525f7_00009, train_mnist_tune_checkpoint_525f7_00010, train_mnist_tune_checkpoint_525f7_00011, train_mnist_tune_checkpoint_525f7_00012, train_mnist_tune_checkpoint_525f7_00013, train_mnist_tune_checkpoint_525f7_00014, train_mnist_tune_checkpoint_525f7_00015]
2021-12-11 23:30:47,089	INFO tune.py:630 -- Total run time: 121.24 seconds (120.99 seconds for the tuning loop).


Best hyperparameters found were:  None


{'525f7_00000': {'trial_id': '525f7_00000'},
 '525f7_00001': {'trial_id': '525f7_00001'},
 '525f7_00002': {'trial_id': '525f7_00002'},
 '525f7_00003': {'trial_id': '525f7_00003'},
 '525f7_00004': {'trial_id': '525f7_00004'},
 '525f7_00005': {'trial_id': '525f7_00005'},
 '525f7_00006': {'trial_id': '525f7_00006'},
 '525f7_00007': {'trial_id': '525f7_00007'},
 '525f7_00008': {'trial_id': '525f7_00008'},
 '525f7_00009': {'trial_id': '525f7_00009'},
 '525f7_00010': {'trial_id': '525f7_00010'},
 '525f7_00011': {'trial_id': '525f7_00011'},
 '525f7_00012': {'trial_id': '525f7_00012'},
 '525f7_00013': {'trial_id': '525f7_00013'},
 '525f7_00014': {'trial_id': '525f7_00014'},
 '525f7_00015': {'trial_id': '525f7_00015'}}

In [5]:
from time import sleep, perf_counter as pc

# Benchmark: time a full tuning run for several GPU shares per trial.
# Each pass requests 1/N of a GPU per trial for N = 2, 4, ..., 16 and records
# the wall-clock duration as (loop index, elapsed seconds).
results = []
for models in range(8):
    experiments_to_run_in_parallel = (models + 1) * 2
    print(f"Starting {experiments_to_run_in_parallel} experiments in parallel")
    start_time = pc()

    # tune_mnist_pbt takes no `num_epochs` argument; the per-trial budget is
    # its `training_iteration` stop condition.
    analysis = tune_mnist_pbt(
        num_samples=16,
        training_iteration=5,
        gpus_per_trial=1 / experiments_to_run_in_parallel,
    )

    elapsed_time = pc() - start_time
    results.append((models, elapsed_time))

results

Trial name,# failures,error file
train_mnist_tune_checkpoint_575ec_00000,1,"/home/akaver/!Dev/pbt-demo-mnist/data/FashionMNIST/train_mnist_tune_checkpoint_575ec_00000_0_batch_size=256,layer_1_size=1024,layer_2_size=128,lr=0.0001_2021-09-28_01-29-13/error.txt"
train_mnist_tune_checkpoint_575ec_00001,1,"/home/akaver/!Dev/pbt-demo-mnist/data/FashionMNIST/train_mnist_tune_checkpoint_575ec_00001_1_batch_size=256,layer_1_size=512,layer_2_size=512,lr=1e-05_2021-09-28_01-29-13/error.txt"
train_mnist_tune_checkpoint_575ec_00002,1,"/home/akaver/!Dev/pbt-demo-mnist/data/FashionMNIST/train_mnist_tune_checkpoint_575ec_00002_2_batch_size=32,layer_1_size=512,layer_2_size=64,lr=0.01_2021-09-28_01-29-13/error.txt"
train_mnist_tune_checkpoint_575ec_00003,1,"/home/akaver/!Dev/pbt-demo-mnist/data/FashionMNIST/train_mnist_tune_checkpoint_575ec_00003_3_batch_size=64,layer_1_size=512,layer_2_size=1024,lr=1e-05_2021-09-28_01-29-13/error.txt"
train_mnist_tune_checkpoint_575ec_00004,1,"/home/akaver/!Dev/pbt-demo-mnist/data/FashionMNIST/train_mnist_tune_checkpoint_575ec_00004_4_batch_size=1024,layer_1_size=1024,layer_2_size=64,lr=0.01_2021-09-28_01-29-14/error.txt"
train_mnist_tune_checkpoint_575ec_00005,1,"/home/akaver/!Dev/pbt-demo-mnist/data/FashionMNIST/train_mnist_tune_checkpoint_575ec_00005_5_batch_size=32,layer_1_size=1024,layer_2_size=64,lr=0.0001_2021-09-28_01-29-14/error.txt"
train_mnist_tune_checkpoint_575ec_00006,1,"/home/akaver/!Dev/pbt-demo-mnist/data/FashionMNIST/train_mnist_tune_checkpoint_575ec_00006_6_batch_size=512,layer_1_size=64,layer_2_size=1024,lr=0.01_2021-09-28_01-29-14/error.txt"
train_mnist_tune_checkpoint_575ec_00007,1,"/home/akaver/!Dev/pbt-demo-mnist/data/FashionMNIST/train_mnist_tune_checkpoint_575ec_00007_7_batch_size=256,layer_1_size=1024,layer_2_size=64,lr=0.0001_2021-09-28_01-29-14/error.txt"
train_mnist_tune_checkpoint_575ec_00008,1,"/home/akaver/!Dev/pbt-demo-mnist/data/FashionMNIST/train_mnist_tune_checkpoint_575ec_00008_8_batch_size=512,layer_1_size=512,layer_2_size=256,lr=1e-06_2021-09-28_01-29-14/error.txt"
train_mnist_tune_checkpoint_575ec_00009,1,"/home/akaver/!Dev/pbt-demo-mnist/data/FashionMNIST/train_mnist_tune_checkpoint_575ec_00009_9_batch_size=2048,layer_1_size=32,layer_2_size=128,lr=1e-06_2021-09-28_01-29-14/error.txt"


TuneError: ('Trials did not complete', [train_mnist_tune_checkpoint_575ec_00000, train_mnist_tune_checkpoint_575ec_00001, train_mnist_tune_checkpoint_575ec_00002, train_mnist_tune_checkpoint_575ec_00003, train_mnist_tune_checkpoint_575ec_00004, train_mnist_tune_checkpoint_575ec_00005, train_mnist_tune_checkpoint_575ec_00006, train_mnist_tune_checkpoint_575ec_00007, train_mnist_tune_checkpoint_575ec_00008, train_mnist_tune_checkpoint_575ec_00009, train_mnist_tune_checkpoint_575ec_00010, train_mnist_tune_checkpoint_575ec_00011, train_mnist_tune_checkpoint_575ec_00012, train_mnist_tune_checkpoint_575ec_00013, train_mnist_tune_checkpoint_575ec_00014, train_mnist_tune_checkpoint_575ec_00015])

In [6]:
# Show the timings collected by the benchmark loop: one
# (loop index, elapsed seconds) tuple per completed tuning run.
results

[(0, 1250.4137557320064),
 (1, 1054.7564893119998),
 (2, 978.0088943799929),
 (3, 1215.7183314590075),
 (4, 1026.1449859720015),
 (5, 1067.1325380739872),
 (6, 1058.9811587199947)]