## Set up an IPyParallel cluster

In [None]:
import ipcluster_magics

In [None]:
# Identity and size of the job that hosts the IPyParallel cluster.
job_name = "isc_ihpc_mnist"
nodes, engines = 1, 1

# Software environment handed to the cluster launcher.
module = "python/3.6-anaconda-4.4"
conda_env = "/global/cscratch1/sd/sfarrell/conda/isc-ihpc"

In [None]:
# Start the cluster with the %ipcluster line magic, forwarding the module,
# conda environment, node count and job name variables, plus a fixed
# 01:00:00 value for -t.
# NOTE(review): presumably the magic is registered by `import ipcluster_magics`
# and -t is a wall-time limit — confirm against that module.
# NOTE(review): `engines` is not passed here — confirm how the engine count is chosen.
%ipcluster -m $module -e $conda_env -N $nodes -J $job_name -t 01:00:00

## Connect a client to the running IPP cluster

In [None]:
# Cluster id used when connecting the IPP client and when building the
# ParamSpanWidget. With None, the client is created as ipp.Client() with no
# cluster_id argument.
cluster_id = None

In [None]:
# Connect to the IPP controller, then wait for the engines to register.
#
# Reads `cluster_id` and `engines` from earlier cells. On success this cell
# defines `c` (the client), `lv` (load-balanced view) and `dv` (direct view).
import time
import ipyparallel as ipp

# Phase 1: try to create a client, retrying a few times while the controller
# is still starting up.
c = None
wait_time = 5
retries = 3
while retries > 0:
    print("checking ipcontroller...")
    try:
        if cluster_id is not None:
            c = ipp.Client(cluster_id=cluster_id)
        else:
            c = ipp.Client()
    except Exception as e:
        # Deliberately broad: any failure to connect is treated as
        # "controller not up yet" and retried.
        print(e.args)
        print("ipcontroller is not running yet, waiting {} seconds before retry...".format(wait_time))
        time.sleep(wait_time)
        retries -= 1
    else:
        print("ipcontroller is running")
        break

# Phase 2: wait until at least `engines` engines have registered.
# Every wait consumes a retry, so the loop always terminates even when only
# some of the engines ever show up (previously the partial-registration
# branch never decremented `retries` and could spin forever).
wait_time = 10
retries = 3
while c is not None and retries > 0:
    n_registered = len(c.ids)
    if n_registered == 0:
        print("engines are not registered yet with controller, waiting {} seconds before retry...".format(wait_time))
    elif n_registered < engines:
        print("not all engines have registered, waiting {} seconds...".format(wait_time))
    else:
        break
    time.sleep(wait_time)
    retries -= 1

if c is not None:
    n_registered = len(c.ids)
    if n_registered < engines:
        # Best effort: continue with whatever registered, but say so.
        print("warning: only {} of {} engines registered".format(n_registered, engines))
    lv = c.load_balanced_view()
    dv = c.direct_view()
    print(c.ids)
else:
    # Make the failure visible instead of ending the cell silently.
    print("could not connect to ipcontroller; no views were created")

## Interactively run multiple parameter sets

In [None]:
import mnist

# Load the train/test arrays and report the shape of each one.
x_train, y_train, x_test, y_test = mnist.load_data()

for label, array in (
    ('x_train shape:', x_train),
    ('x_test shape:', x_test),
    ('y_train shape:', y_train),
    ('y_test shape:', y_test),
):
    print(label, array.shape)

In [None]:
# Training configuration: settings that are identical for every trial.
import os

n_samples = 1000  # number of leading training examples to keep
checkpoint_dir = '/global/cscratch1/sd/$USER/cori-interactive-dl/mnist-hpo'

fixed_params = dict(
    verbose=0,
    batch_size=128,
    nthreads=32,
    n_epochs=32,
    valid_frac=0.17,
    # Disabled for now:
    # checkpoint_file=os.path.join(os.path.expandvars(checkpoint_dir), 'model_single.h5'),
    x_train=x_train[:n_samples],
    y_train=y_train[:n_samples],
)

In [None]:
import numpy as np

n_hpo_trials = 4

# Candidate values for each sampled hyperparameter.
narrow_sizes = [4, 8, 16, 32, 64]
wide_sizes = [8, 16, 32, 64, 128]
optimizer_names = ['Adadelta', 'Adam', 'Nadam']

# Draw one random value per trial for every hyperparameter. The draws are
# made in a fixed order: h1, h2, h3, dropout, optimizer.
grid_h1 = np.random.choice(narrow_sizes, size=n_hpo_trials)
grid_h2 = np.random.choice(narrow_sizes, size=n_hpo_trials)
grid_h3 = np.random.choice(wide_sizes, size=n_hpo_trials)
grid_dropout = np.random.rand(n_hpo_trials)
grid_optimizer = np.random.choice(optimizer_names, size=n_hpo_trials)

In [None]:
import functools as ft
from mlextras import build_and_train
from hpo_widgets import ModelPlot, ParamSpanWidget

# Training callable with the constant settings already bound.
run_training = ft.partial(build_and_train, **fixed_params)

# Plot factory with its series and axis settings already bound.
plot_options = {
    'y': ['loss', 'acc', 'val_loss', 'val_acc'],
    'xlim': [0, fixed_params["n_epochs"]],
    'xlabel': 'epochs',
    'ylabel': 'training metrics',
}
plot_metrics = ft.partial(ModelPlot, **plot_options)

# Per-trial hyperparameter values drawn earlier.
hpo_params = {
    'h1': grid_h1,
    'h2': grid_h2,
    'h3': grid_h3,
    'dropout': grid_dropout,
    'optimizer': grid_optimizer,
}

# Build the widget, submit the computations, then show the widget as the
# cell output.
psw = ParamSpanWidget(run_training, plot_metrics, hpo_params, ipp_cluster_id=cluster_id)
psw.submit_computations()

psw

In [None]:
# Evaluate the widget's `debug` attribute as the cell's last expression so the
# notebook displays it.
# NOTE(review): presumably a debug-output view of the widget — confirm in hpo_widgets.
psw.debug

In [None]:
from IPython.display import display

# Show the `debug` attribute of each model plot held by the widget.
for model_plot in psw.model_plots:
    display(model_plot.debug)

## Look at additional model details

In [None]:
import pprint

# Pretty-print the metadata of every model run, in submission-list order.
# Iterates the runs directly instead of indexing through range(len(...)).
for model_run in psw.model_runs:
    pprint.pprint(model_run.metadata)

## Release job resources

#### Grab the job id for connecting to this cluster

In [None]:
%%bash -s "{job_name}" --out job_id
# Capture the job id into the notebook variable `job_id` (via --out).
# $1 is the job name passed with -s. squeue lists this user's jobs with that
# name; awk skips the first (header) line and prints the first column of each
# remaining line with no separator or trailing newline.
# NOTE(review): the first column is presumably the job id — confirm against the squeue output format in use.
# NOTE(review): if more than one job matches, the values are concatenated into one string — confirm the name is unique.
squeue -u $USER -n $1 | awk '{if (NR!=1) {printf "%s", $1}}'

#### Cancel the current job

In [None]:
%%bash -s "{job_id}"
# $1 is the value of the notebook variable `job_id`, passed with -s; hand it to scancel.
scancel $1