In [4]:
! pip uninstall kubeflow-katib

Collecting kubeflow-katib
  Using cached kubeflow_katib-0.15.0-py3-none-any.whl (107 kB)
Collecting grpcio==1.41.1
  Using cached grpcio-1.41.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.9 MB)
Collecting protobuf==3.19.5
  Using cached protobuf-3.19.5-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
Installing collected packages: protobuf, grpcio, kubeflow-katib
  Attempting uninstall: protobuf
    Found existing installation: protobuf 3.19.6
    Uninstalling protobuf-3.19.6:
      Successfully uninstalled protobuf-3.19.6
  Attempting uninstall: grpcio
    Found existing installation: grpcio 1.54.2
    Uninstalling grpcio-1.54.2:
      Successfully uninstalled grpcio-1.54.2
Successfully installed grpcio-1.41.1 kubeflow-katib-0.15.0 protobuf-3.19.5


In [104]:

def train_mnist_model_pytorch(parameters):

    
    import logging
    import os
    from torchvision import datasets, transforms
    import torch
    import torch.distributed as dist
    import torch.nn as nn
    import torch.nn.functional as F
    import torch.optim as optim
    
    logging.basicConfig(
        format="%(asctime)s %(levelname)-8s %(message)s",
        datefmt="%Y-%m-%dT%H:%M:%SZ",
        level=logging.INFO,
    )    
    logging.info("--------------------------------------------------------------------------------------")
    logging.info(f"Input Parameters: {parameters}")
    logging.info("--------------------------------------------------------------------------------------\n\n")


    WORLD_SIZE = int(os.environ.get("WORLD_SIZE", 1))
    test_batch_size= 1000
    no_cuda = False
    seed = 1 
    log_interval = 10
    log_path = ""
    save_model = False
    
    # Get HyperParameters from the input params dict.
    batch_size = int(parameters["batch_size"])
    epochs = int(parameters["num_epoch"])
    momentum = float(parameters["momentum"])
    lr = float(parameters["lr"])
    


    class Net(nn.Module):
        def __init__(self):
            super(Net, self).__init__()
            self.conv1 = nn.Conv2d(1, 20, 5, 1)
            self.conv2 = nn.Conv2d(20, 50, 5, 1)
            self.fc1 = nn.Linear(4*4*50, 500)
            self.fc2 = nn.Linear(500, 10)

        def forward(self, x):
            x = F.relu(self.conv1(x))
            x = F.max_pool2d(x, 2, 2)
            x = F.relu(self.conv2(x))
            x = F.max_pool2d(x, 2, 2)
            x = x.view(-1, 4*4*50)
            x = F.relu(self.fc1(x))
            x = self.fc2(x)
            return F.log_softmax(x, dim=1)


    def train(model, device, train_loader, optimizer, epoch):
        model.train()
        for batch_idx, (data, target) in enumerate(train_loader):
            data, target = data.to(device), target.to(device)
            optimizer.zero_grad()
            output = model(data)
            loss = F.nll_loss(output, target)
            loss.backward()
            optimizer.step()
            log_interval = 10
            if batch_idx % log_interval == 0:
                msg = "Train Epoch: {} [{}/{} ({:.0f}%)]\tloss={:.4f}".format(
                    epoch, batch_idx * len(data), len(train_loader.dataset),
                    100. * batch_idx / len(train_loader), loss.item())
                #logging.info(msg)
                niter = epoch * len(train_loader) + batch_idx


    def test(model, device, test_loader, epoch):
        model.eval()
        test_loss = 0
        correct = 0
        with torch.no_grad():
            for data, target in test_loader:
                data, target = data.to(device), target.to(device)
                output = model(data)
                test_loss += F.nll_loss(output, target, reduction="sum").item()  # sum up batch loss
                pred = output.max(1, keepdim=True)[1]  # get the index of the max log-probability
                correct += pred.eq(target.view_as(pred)).sum().item()

        test_loss /= len(test_loader.dataset)
        test_accuracy = float(correct) / len(test_loader.dataset)
        logging.info(
                "Epoch {}/{}. accuracy={:.4f} - loss={:.4f}".format(
                    epoch+1, epochs, test_accuracy, test_loss
                )
            )
        


    def should_distribute():
        return dist.is_available() and WORLD_SIZE > 1

    def is_distributed():
        return dist.is_available() and dist.is_initialized()

    # Training settings
    if dist.is_available():
        backend = dist.Backend.GLOO


    use_cuda = not no_cuda and torch.cuda.is_available()
    
    if use_cuda:
        print("Using CUDA")
        
    torch.manual_seed(seed)
    device = torch.device("cuda" if use_cuda else "cpu")

    if should_distribute():
        print("Using distributed PyTorch with {} backend".format(backend))
        dist.init_process_group(backend=backend)

    kwargs = {"num_workers": 1, "pin_memory": True} if use_cuda else {}

    

    train_loader = torch.utils.data.DataLoader(
        datasets.FashionMNIST("./data",
                              train=True,
                              download=True,
                              transform=transforms.Compose([
                                  transforms.ToTensor()
                              ])),
        batch_size= batch_size, shuffle=True, **kwargs)

    test_loader = torch.utils.data.DataLoader(
        datasets.FashionMNIST("./data",
                              train=False,
                              transform=transforms.Compose([
                                  transforms.ToTensor()
                              ])),
        batch_size=test_batch_size, shuffle=False, **kwargs)

    model = Net().to(device)

    if is_distributed():
        logging.info("Distributed Training")
        Distributor = nn.parallel.DistributedDataParallel if use_cuda \
            else nn.parallel.DistributedDataParallelCPU
        model = Distributor(model)
    optimizer = optim.SGD(model.parameters(), lr=lr, momentum=momentum)
    
    for epoch in range(1, epochs + 1):
        train(model, device, train_loader, optimizer, epoch)
        test(model, device, test_loader, epoch)
    #torch.save(model.state_dict(), "mnist_cnn.pt")

In [72]:
# Set Parameters for Local Training.
parameters = {
    "lr": 0.01,
    "num_epoch": 2,
    "momentum": 0.01,
    "is_dist": False,
    "batch_size": 4,
    "num_workers": 1
}

# Train Model locally in the Notebook.
train_mnist_model(parameters)

--------------------------------------------------------------------------------------
Input Parameters: {'lr': 0.01, 'num_epoch': 2, 'momentum': 0.01, 'is_dist': False, 'batch_size': 4, 'num_workers': 1}
--------------------------------------------------------------------------------------


Single-worker MultiWorkerMirroredStrategy with local_devices = ('/device:CPU:0',), communication = CommunicationImplementation.RING
Running Single Worker Training
--------------------------------------------------------------------------------------


Epoch 1/2. accuracy=0.6167 - loss=1.7098
Epoch 2/2. accuracy=0.8520 - loss=0.6748


In [105]:
import kubeflow.katib as katib

# Set parameters with their distribution for HyperParameter Tuning with Katib.
parameters = {
    "lr": katib.search.double(min=0.1, max=0.2),
    "num_epoch": katib.search.int(min=10, max=15),
    "momentum": katib.search.double(min=0.01, max=0.5),
    "batch_size": katib.search.int(min=4, max=128),
    "is_dist": False,
    "num_workers": 1
}

# Start the Katib Experiment.
exp_name = "tune-mnist-pytorch"
katib_client = katib.KatibClient()

katib_client.tune(
    name=exp_name,
    objective=train_mnist_model, # Objective function.
    parameters=parameters, # HyperParameters to tune.
    algorithm_name="cmaes", # Alorithm to use.
    objective_metric_name="accuracy", # Katib is going to optimize "accuracy".
    additional_metric_names=["loss"], # Katib is going to collect these metrics in addition to the objective metric.
    max_trial_count=12, # Trial Threshold.
    parallel_trial_count=4,
)

Experiment kubeflow-user-asabet/tune-mnist-pytorch has been created


In [106]:
status = katib_client.is_experiment_succeeded(exp_name)
print(f"Katib Experiment is Succeeded: {status}\n")

best_hps = katib_client.get_optimal_hyperparameters(exp_name)

if best_hps != None:
    print("Current Optimal Trial\n")
    print(best_hps)
    
    for hp in best_hps.parameter_assignments:
        if hp.name == "lr":
            best_lr = hp.value
        elif hp.name == "momentum":
            best_momentum = hp.value
        elif hp.name == "num_epoch":
            best_num_epoch = hp.value
        elif hp.name == "batch_size":
            best_batch_size = hp.value

Katib Experiment is Succeeded: True

Current Optimal Trial

{'best_trial_name': 'tune-mnist-pytorch-ssp8h5xw',
 'observation': {'metrics': [{'latest': '0.9690',
                              'max': '0.9690',
                              'min': '0.7433',
                              'name': 'accuracy'},
                             {'latest': '0.1069',
                              'max': '0.8389',
                              'min': '0.1069',
                              'name': 'loss'}]},
 'parameter_assignments': [{'name': 'batch_size', 'value': '66'},
                           {'name': 'lr', 'value': '0.1452465916581278'},
                           {'name': 'num_epoch', 'value': '13'},
                           {'name': 'momentum',
                            'value': '0.26270278642319494'}]}


In [107]:
from kubeflow.training import TrainingClient
# Set Parameters for Distributed Training with TFJob.

parameters = {
    "lr": best_lr,
    "num_epoch": best_num_epoch,
    "momentum": best_momentum,
    "is_dist": False,
    "batch_size": best_batch_size,
    "num_workers": 1
}

# Start TFJob Training.
pytorchjob_name = "train-mnist"
pytorchjob_client = TrainingClient()
# methods = dir(tfjob_client)
# for method in methods:
#     print(method)
pytorchjob_client.create_pytorchjob_from_func(
    name=pytorchjob_name,
    func=train_mnist_model_pytorch,
    parameters=parameters, # Input parameters for the train function.
    num_worker_replicas=5, # How many TFJob Workers will be run.
)

PyTorchJob kubeflow-user-asabet/train-mnist has been created


In [95]:
print(f"TFJob status: {pytorchjob_client.get_pytorchjob(tfjob_name)}")

TFJob status: {'api_version': 'kubeflow.org/v1',
 'kind': 'PyTorchJob',
 'metadata': {'annotations': None,
              'creation_timestamp': datetime.datetime(2023, 5, 31, 12, 56, 46, tzinfo=tzlocal()),
              'deletion_grace_period_seconds': None,
              'deletion_timestamp': None,
              'finalizers': None,
              'generate_name': None,
              'generation': 1,
              'labels': None,
              'managed_fields': [{'api_version': 'kubeflow.org/v1',
                                  'fields_type': 'FieldsV1',
                                  'fields_v1': {'f:spec': {'.': {},
                                                           'f:pytorchReplicaSpecs': {'.': {},
                                                                                     'f:Master': {'.': {},
                                                                                                  'f:replicas': {},
                                                      

In [93]:
methods = dir(tfjob_client)
for method in methods:
    print(method)

__class__
__delattr__
__dict__
__dir__
__doc__
__eq__
__format__
__ge__
__getattribute__
__gt__
__hash__
__init__
__init_subclass__
__le__
__lt__
__module__
__ne__
__new__
__reduce__
__reduce_ex__
__repr__
__setattr__
__sizeof__
__str__
__subclasshook__
__weakref__
api_client
core_api
create_mpijob
create_mpijob_from_func
create_mxjob
create_mxjob_from_func
create_paddlejob
create_paddlejob_from_func
create_pytorchjob
create_pytorchjob_from_func
create_tfjob
create_tfjob_from_func
create_xgboostjob
create_xgboostjob_from_func
custom_api
delete_mpijob
delete_mxjob
delete_paddlejob
delete_pytorchjob
delete_tfjob
delete_xgboostjob
get_job_conditions
get_job_logs
get_job_pod_names
get_mpijob
get_mxjob
get_paddlejob
get_pytorchjob
get_tfjob
get_xgboostjob
is_job_created
is_job_failed
is_job_restarting
is_job_running
is_job_succeeded
list_mpijobs
list_mxjobs
list_paddlejobs
list_pytorchjobs
list_tfjobs
list_xgboostjobs
patch_mpijob
patch_mxjob
patch_paddlejob
patch_pytorchjob
patch_tfjob
pat

In [103]:
pytorchjob_client.delete_pytorchjob("train-mnist")

PyTorchJob kubeflow-user-asabet/train-mnist has been deleted
