diff --git a/operators/OWNERS b/operators/OWNERS new file mode 100644 index 00000000000..b8e0eb60cd2 --- /dev/null +++ b/operators/OWNERS @@ -0,0 +1,3 @@ +approvers: + - knkski + - rfmvasconcelos diff --git a/operators/README.md b/operators/README.md new file mode 100644 index 00000000000..10d7409e561 --- /dev/null +++ b/operators/README.md @@ -0,0 +1,33 @@ +## Katib Operators + +### Overview +This bundle encompasses the Kubernetes Python operators (a.k.a. charms) for Katib +(see [CharmHub](https://charmhub.io/?q=katib)). + +The Katib operators are Python scripts that wrap the latest released [Katib manifests][manifests], +providing lifecycle management for each application, handling events (install, upgrade, +integrate, remove). + +[manifests]: https://github.com/kubeflow/katib/tree/master/manifests + +## Install + +### Install applications + +To install Katib, run: + + juju deploy katib + +You can also install each application individually, like this: + + juju deploy <application> + +where `<application>` is one of `katib-controller`, `katib-ui`, or `katib-db-manager`. + +**Note**: By default, when you `juju deploy` an application or the full Katib +bundle, you will deploy the latest pushed commit of Katib, even if unreleased updates are +already available in the Kubeflow manifests. 
If you would like to try the latest +available charm run: + + + juju deploy foo --channel=edge diff --git a/operators/bundle.yaml b/operators/bundle.yaml new file mode 100644 index 00000000000..a3be92988c3 --- /dev/null +++ b/operators/bundle.yaml @@ -0,0 +1,8 @@ +bundle: kubernetes +applications: + katib-controller: { charm: katib-controller, scale: 1, annotations: { gui-x: '0', gui-y: '0' } } + katib-db: { charm: cs:~charmed-osm/mariadb-k8s, scale: 1, annotations: { gui-x: '0', gui-y: '300' }, options: { database: katib } } + katib-db-manager: { charm: katib-db-manager, scale: 1, annotations: { gui-x: '300', gui-y: '0' } } + katib-ui: { charm: katib-ui, scale: 1, annotations: { gui-x: '300', gui-y: '300' } } +relations: +- [katib-db-manager, katib-db] diff --git a/operators/katib-controller/config.yaml b/operators/katib-controller/config.yaml new file mode 100644 index 00000000000..f771f5e9722 --- /dev/null +++ b/operators/katib-controller/config.yaml @@ -0,0 +1,9 @@ +options: + webhook-port: + type: int + default: 443 + description: Webhook port + metrics-port: + type: int + default: 8080 + description: Metrics port diff --git a/operators/katib-controller/files/crds.yaml b/operators/katib-controller/files/crds.yaml new file mode 100644 index 00000000000..01b704aa52d --- /dev/null +++ b/operators/katib-controller/files/crds.yaml @@ -0,0 +1,95 @@ +--- +apiVersion: apiextensions.k8s.io/v1beta1 +kind: CustomResourceDefinition +metadata: + name: experiments.kubeflow.org +spec: + additionalPrinterColumns: + - JSONPath: .status.conditions[-1:].type + name: Type + type: string + - JSONPath: .status.conditions[-1:].status + name: Status + type: string + - JSONPath: .metadata.creationTimestamp + name: Age + type: date + group: kubeflow.org + version: v1beta1 + scope: Namespaced + subresources: + status: {} + names: + kind: Experiment + singular: experiment + plural: experiments + categories: + - all + - kubeflow + - katib + +--- +apiVersion: apiextensions.k8s.io/v1beta1 
+kind: CustomResourceDefinition +metadata: + name: suggestions.kubeflow.org +spec: + additionalPrinterColumns: + - JSONPath: .status.conditions[-1:].type + name: Type + type: string + - JSONPath: .status.conditions[-1:].status + name: Status + type: string + - JSONPath: .spec.requests + name: Requested + type: string + - JSONPath: .status.suggestionCount + name: Assigned + type: string + - JSONPath: .metadata.creationTimestamp + name: Age + type: date + group: kubeflow.org + version: v1beta1 + scope: Namespaced + subresources: + status: {} + names: + kind: Suggestion + singular: suggestion + plural: suggestions + categories: + - all + - kubeflow + - katib + +--- +apiVersion: apiextensions.k8s.io/v1beta1 +kind: CustomResourceDefinition +metadata: + name: trials.kubeflow.org +spec: + additionalPrinterColumns: + - JSONPath: .status.conditions[-1:].type + name: Type + type: string + - JSONPath: .status.conditions[-1:].status + name: Status + type: string + - JSONPath: .metadata.creationTimestamp + name: Age + type: date + group: kubeflow.org + version: v1beta1 + scope: Namespaced + subresources: + status: {} + names: + kind: Trial + singular: trial + plural: trials + categories: + - all + - kubeflow + - katib diff --git a/operators/katib-controller/files/defaultTrialTemplate.yaml b/operators/katib-controller/files/defaultTrialTemplate.yaml new file mode 100644 index 00000000000..00223ea5eb3 --- /dev/null +++ b/operators/katib-controller/files/defaultTrialTemplate.yaml @@ -0,0 +1,16 @@ +apiVersion: batch/v1 +kind: Job +spec: + template: + spec: + containers: + - name: training-container + image: docker.io/kubeflowkatib/mxnet-mnist:v1beta1-e294a90 + command: + - "python3" + - "/opt/mxnet-mnist/mnist.py" + - "--batch-size=64" + - "--lr=${trialParameters.learningRate}" + - "--num-layers=${trialParameters.numberLayers}" + - "--optimizer=${trialParameters.optimizer}" + restartPolicy: Never diff --git a/operators/katib-controller/files/early-stopping.json 
b/operators/katib-controller/files/early-stopping.json new file mode 100644 index 00000000000..4e1cf486998 --- /dev/null +++ b/operators/katib-controller/files/early-stopping.json @@ -0,0 +1,6 @@ +{ + "medianstop": { + "image": "docker.io/kubeflowkatib/earlystopping-medianstop", + "imagePullPolicy": "Always" + } +} diff --git a/operators/katib-controller/files/enasCPUTemplate.yaml b/operators/katib-controller/files/enasCPUTemplate.yaml new file mode 100644 index 00000000000..588f4e98694 --- /dev/null +++ b/operators/katib-controller/files/enasCPUTemplate.yaml @@ -0,0 +1,16 @@ +apiVersion: batch/v1 +kind: Job +spec: + template: + spec: + containers: + - name: training-container + image: docker.io/kubeflowkatib/enas-cnn-cifar10-cpu:v1beta1-e294a90 + command: + - python3 + - -u + - RunTrial.py + - --num_epochs=1 + - "--architecture=\"${trialParameters.neuralNetworkArchitecture}\"" + - "--nn_config=\"${trialParameters.neuralNetworkConfig}\"" + restartPolicy: Never diff --git a/operators/katib-controller/files/metrics-collector-sidecar.json b/operators/katib-controller/files/metrics-collector-sidecar.json new file mode 100644 index 00000000000..016c8f2a8ad --- /dev/null +++ b/operators/katib-controller/files/metrics-collector-sidecar.json @@ -0,0 +1,16 @@ +{ + "StdOut": { + "image": "docker.io/kubeflowkatib/file-metrics-collector" + }, + "File": { + "image": "docker.io/kubeflowkatib/file-metrics-collector" + }, + "TensorFlowEvent": { + "image": "docker.io/kubeflowkatib/tfevent-metrics-collector", + "resources": { + "limits": { + "memory": "1Gi" + } + } + } +} diff --git a/operators/katib-controller/files/pytorchJobTemplate.yaml b/operators/katib-controller/files/pytorchJobTemplate.yaml new file mode 100644 index 00000000000..dda8b1d1ab6 --- /dev/null +++ b/operators/katib-controller/files/pytorchJobTemplate.yaml @@ -0,0 +1,32 @@ +apiVersion: "kubeflow.org/v1" +kind: PyTorchJob +spec: + pytorchReplicaSpecs: + Master: + replicas: 1 + restartPolicy: OnFailure + template: + 
spec: + containers: + - name: pytorch + image: gcr.io/kubeflow-ci/pytorch-dist-mnist-test:v1.0 + imagePullPolicy: Always + command: + - "python" + - "/var/mnist.py" + - "--lr=${trialParameters.learningRate}" + - "--momentum=${trialParameters.momentum}" + Worker: + replicas: 2 + restartPolicy: OnFailure + template: + spec: + containers: + - name: pytorch + image: gcr.io/kubeflow-ci/pytorch-dist-mnist-test:v1.0 + imagePullPolicy: Always + command: + - "python" + - "/var/mnist.py" + - "--lr=${trialParameters.learningRate}" + - "--momentum=${trialParameters.momentum}" diff --git a/operators/katib-controller/files/suggestion.json b/operators/katib-controller/files/suggestion.json new file mode 100644 index 00000000000..e9492975fea --- /dev/null +++ b/operators/katib-controller/files/suggestion.json @@ -0,0 +1,32 @@ +{ + "random": { + "image": "docker.io/kubeflowkatib/suggestion-hyperopt" + }, + "grid": { + "image": "docker.io/kubeflowkatib/suggestion-chocolate" + }, + "hyperband": { + "image": "docker.io/kubeflowkatib/suggestion-hyperband" + }, + "bayesianoptimization": { + "image": "docker.io/kubeflowkatib/suggestion-skopt" + }, + "tpe": { + "image": "docker.io/kubeflowkatib/suggestion-hyperopt" + }, + "enas": { + "image": "docker.io/kubeflowkatib/suggestion-enas", + "imagePullPolicy": "Always", + "resources": { + "limits": { + "memory": "200Mi" + } + } + }, + "cmaes": { + "image": "docker.io/kubeflowkatib/suggestion-goptuna" + }, + "darts": { + "image": "docker.io/kubeflowkatib/suggestion-darts" + } +} diff --git a/operators/katib-controller/layer.yaml b/operators/katib-controller/layer.yaml new file mode 100644 index 00000000000..36be00576d9 --- /dev/null +++ b/operators/katib-controller/layer.yaml @@ -0,0 +1,6 @@ +repo: https://github.com/juju-solutions/bundle-kubeflow.git +includes: + - "layer:caas-base" + - "layer:status" + - "layer:docker-resource" + - "interface:http" diff --git a/operators/katib-controller/metadata.yaml 
b/operators/katib-controller/metadata.yaml new file mode 100755 index 00000000000..d2072edfee2 --- /dev/null +++ b/operators/katib-controller/metadata.yaml @@ -0,0 +1,22 @@ +name: katib-controller +display-name: Katib Controller +summary: A Kubernetes-native project for automated machine learning (AutoML) +description: | + Katib supports Hyperparameter Tuning, Early Stopping and Neural Architecture Search + + Katib is the project which is agnostic to machine learning (ML) frameworks. It can tune + hyperparameters of applications written in any language of the users’ choice and natively + supports many ML frameworks, such as TensorFlow, MXNet, PyTorch, XGBoost, and others. +tags: [ai, bigdata, katib, kubeflow, machine-learning, hyperparameter] +maintainers: [Kenneth Koski ] +series: [kubernetes] +resources: + oci-image: + type: oci-image + description: Backing OCI image + auto-fetch: true + upstream-source: docker.io/kubeflowkatib/katib-controller:v1beta1-a96ff59 +provides: + katib-controller: + interface: http +min-juju-version: 2.8.6 diff --git a/operators/katib-controller/reactive/katib_controller.py b/operators/katib-controller/reactive/katib_controller.py new file mode 100644 index 00000000000..aa6ec7f1335 --- /dev/null +++ b/operators/katib-controller/reactive/katib_controller.py @@ -0,0 +1,392 @@ +import json +import os +from base64 import b64encode +from pathlib import Path +from subprocess import check_call + +import yaml + +from charmhelpers.core import hookenv +from charms import layer +from charms.reactive import clear_flag, hook, set_flag, when, when_not + + +@hook("upgrade-charm") +def upgrade_charm(): + clear_flag("charm.started") + + +@when("charm.started") +def charm_ready(): + layer.status.active("") + + +@when("layer.docker-resource.oci-image.changed") +def update_image(): + clear_flag("charm.started") + + +def gen_certs(namespace, service_name): + if Path("/run/cert.pem").exists(): + hookenv.log("Found existing cert.pem, not generating new 
cert.") + return + + Path("/run/ssl.conf").write_text( + f"""[ req ] +default_bits = 2048 +prompt = no +default_md = sha256 +req_extensions = req_ext +distinguished_name = dn +[ dn ] +C = GB +ST = Canonical +L = Canonical +O = Canonical +OU = Canonical +CN = 127.0.0.1 +[ req_ext ] +subjectAltName = @alt_names +[ alt_names ] +DNS.1 = {service_name} +DNS.2 = {service_name}.{namespace} +DNS.3 = {service_name}.{namespace}.svc +DNS.4 = {service_name}.{namespace}.svc.cluster +DNS.5 = {service_name}.{namespace}.svc.cluster.local +IP.1 = 127.0.0.1 +[ v3_ext ] +authorityKeyIdentifier=keyid,issuer:always +basicConstraints=CA:FALSE +keyUsage=keyEncipherment,dataEncipherment,digitalSignature +extendedKeyUsage=serverAuth,clientAuth +subjectAltName=@alt_names""" + ) + + check_call(["openssl", "genrsa", "-out", "/run/ca.key", "2048"]) + check_call(["openssl", "genrsa", "-out", "/run/server.key", "2048"]) + check_call( + [ + "openssl", + "req", + "-x509", + "-new", + "-sha256", + "-nodes", + "-days", + "3650", + "-key", + "/run/ca.key", + "-subj", + "/CN=127.0.0.1", + "-out", + "/run/ca.crt", + ] + ) + check_call( + [ + "openssl", + "req", + "-new", + "-sha256", + "-key", + "/run/server.key", + "-out", + "/run/server.csr", + "-config", + "/run/ssl.conf", + ] + ) + check_call( + [ + "openssl", + "x509", + "-req", + "-sha256", + "-in", + "/run/server.csr", + "-CA", + "/run/ca.crt", + "-CAkey", + "/run/ca.key", + "-CAcreateserial", + "-out", + "/run/cert.pem", + "-days", + "365", + "-extensions", + "v3_ext", + "-extfile", + "/run/ssl.conf", + ] + ) + + +@when("layer.docker-resource.oci-image.available") +@when_not("charm.started") +def start_charm(): + if not hookenv.is_leader(): + hookenv.log("This unit is not a leader.") + return False + + layer.status.maintenance("configuring container") + + image_info = layer.docker_resource.get_info("oci-image") + namespace = os.environ["JUJU_MODEL_NAME"] + config = dict(hookenv.config()) + + gen_certs(namespace, hookenv.service_name()) + 
ca_bundle = b64encode(Path("/run/cert.pem").read_bytes()).decode("utf-8") + + layer.caas_base.pod_spec_set( + { + "version": 3, + "serviceAccount": { + "roles": [ + { + "rules": [ + { + "apiGroups": [""], + "resources": [ + "configmaps", + "serviceaccounts", + "services", + "secrets", + "events", + "namespaces", + "persistentvolumes", + "persistentvolumeclaims", + ], + "verbs": ["*"], + }, + { + "apiGroups": [""], + "resources": [ + "pods", + "pods/log", + "pods/status", + ], + "verbs": ["*"], + }, + { + "apiGroups": ["apps"], + "resources": ["deployments"], + "verbs": ["*"], + }, + { + "apiGroups": ["batch"], + "resources": ["jobs", "cronjobs"], + "verbs": ["*"], + }, + { + "apiGroups": ["apiextensions.k8s.io"], + "resources": ["customresourcedefinitions"], + "verbs": ["create", "get"], + }, + { + "apiGroups": ["admissionregistration.k8s.io"], + "resources": [ + "validatingwebhookconfigurations", + "mutatingwebhookconfigurations", + ], + "verbs": ["*"], + }, + { + "apiGroups": ["kubeflow.org"], + "resources": [ + "experiments", + "experiments/status", + "experiments/finalizers", + "trials", + "trials/status", + "trials/finalizers", + "suggestions", + "suggestions/status", + "suggestions/finalizers", + ], + "verbs": ["*"], + }, + { + "apiGroups": ["kubeflow.org"], + "resources": [ + "tfjobs", + "pytorchjobs", + "mpijobs", + ], + "verbs": ["*"], + }, + { + "apiGroups": ["tekton.dev"], + "resources": [ + "pipelineruns", + "taskruns", + ], + "verbs": ["*"], + }, + { + "apiGroups": ["rbac.authorization.k8s.io"], + "resources": [ + "roles", + "rolebindings", + ], + "verbs": ["*"], + }, + ] + } + ] + }, + "containers": [ + { + "name": "katib-controller", + "command": ["./katib-controller"], + "args": [ + "--webhook-port", + str(config["webhook-port"]), + "--trial-resources=Job.v1.batch", + "--trial-resources=TFJob.v1.kubeflow.org", + "--trial-resources=PyTorchJob.v1.kubeflow.org", + "--trial-resources=MPIJob.v1.kubeflow.org", + 
"--trial-resources=PipelineRun.v1beta1.tekton.dev", + ], + "imageDetails": { + "imagePath": image_info.registry_path, + "username": image_info.username, + "password": image_info.password, + }, + "ports": [ + {"name": "webhook", "containerPort": config["webhook-port"]}, + {"name": "metrics", "containerPort": config["metrics-port"]}, + ], + "envConfig": { + "KATIB_CORE_NAMESPACE": os.environ["JUJU_MODEL_NAME"] + }, + "volumeConfig": [ + { + "name": "cert", + "mountPath": "/tmp/cert", + "files": [ + { + "path": "cert.pem", + "content": Path("/run/cert.pem").read_text(), + }, + { + "path": "key.pem", + "content": Path("/run/server.key").read_text(), + }, + ], + } + ], + "kubernetes": {"securityContext": {"runAsUser": 0}}, + } + ], + }, + k8s_resources={ + "kubernetesResources": { + "customResourceDefinitions": [ + {"name": crd["metadata"]["name"], "spec": crd["spec"]} + for crd in yaml.safe_load_all(Path("files/crds.yaml").read_text()) + ], + "mutatingWebhookConfigurations": [ + { + "name": "katib-mutating-webhook-config", + "webhooks": [ + { + "name": "mutating.experiment.katib.kubeflow.org", + "rules": [ + { + "apiGroups": ["kubeflow.org"], + "apiVersions": ["v1beta1"], + "operations": ["CREATE", "UPDATE"], + "resources": ["experiments"], + "scope": "*", + } + ], + "failurePolicy": "Fail", + "clientConfig": { + "service": { + "name": hookenv.service_name(), + "namespace": namespace, + "path": "/mutate-experiments", + "port": config["webhook-port"], + }, + "caBundle": ca_bundle, + }, + }, + { + "name": "mutating.pod.katib.kubeflow.org", + "rules": [ + { + "apiGroups": [""], + "apiVersions": ["v1"], + "operations": ["CREATE"], + "resources": ["pods"], + "scope": "*", + } + ], + "failurePolicy": "Ignore", + "clientConfig": { + "service": { + "name": hookenv.service_name(), + "namespace": namespace, + "path": "/mutate-pods", + "port": config["webhook-port"], + }, + "caBundle": ca_bundle, + }, + }, + ], + } + ], + "validatingWebhookConfigurations": [ + { + "name": 
"katib-validating-webhook-config", + "webhooks": [ + { + "name": "validating.experiment.katib.kubeflow.org", + "rules": [ + { + "apiGroups": ["kubeflow.org"], + "apiVersions": ["v1beta1"], + "operations": ["CREATE", "UPDATE"], + "resources": ["experiments"], + "scope": "*", + } + ], + "failurePolicy": "Fail", + "sideEffects": "Unknown", + "clientConfig": { + "service": { + "name": hookenv.service_name(), + "namespace": namespace, + "path": "/validate-experiments", + "port": config["webhook-port"], + }, + "caBundle": ca_bundle, + }, + } + ], + } + ], + }, + "configMaps": { + "katib-config": { + f: Path(f"files/{f}.json").read_text() + for f in ( + "metrics-collector-sidecar", + "suggestion", + "early-stopping", + ) + }, + "trial-template": { + f + suffix: Path(f"files/{f}.yaml").read_text() + for f, suffix in ( + ("defaultTrialTemplate", ".yaml"), + ("enasCPUTemplate", ""), + ("pytorchJobTemplate", ""), + ) + }, + }, + }, + ) + + layer.status.maintenance("creating container") + set_flag("charm.started") diff --git a/operators/katib-db-manager/config.yaml b/operators/katib-db-manager/config.yaml new file mode 100644 index 00000000000..bc274e18747 --- /dev/null +++ b/operators/katib-db-manager/config.yaml @@ -0,0 +1,5 @@ +options: + port: + type: int + default: 6789 + description: API port diff --git a/operators/katib-db-manager/layer.yaml b/operators/katib-db-manager/layer.yaml new file mode 100644 index 00000000000..e1f47d59ed2 --- /dev/null +++ b/operators/katib-db-manager/layer.yaml @@ -0,0 +1,7 @@ +repo: https://github.com/juju-solutions/bundle-kubeflow.git +includes: + - "layer:caas-base" + - "layer:status" + - "layer:docker-resource" + - "interface:mysql" + - "interface:http" diff --git a/operators/katib-db-manager/metadata.yaml b/operators/katib-db-manager/metadata.yaml new file mode 100755 index 00000000000..767b462d1d6 --- /dev/null +++ b/operators/katib-db-manager/metadata.yaml @@ -0,0 +1,25 @@ +name: katib-db-manager +display-name: Katib DB Manager 
+summary: A Kubernetes-native project for automated machine learning (AutoML) +description: | + Katib supports Hyperparameter Tuning, Early Stopping and Neural Architecture Search + + Katib is the project which is agnostic to machine learning (ML) frameworks. It can tune + hyperparameters of applications written in any language of the users’ choice and natively + supports many ML frameworks, such as TensorFlow, MXNet, PyTorch, XGBoost, and others. +tags: [ai, bigdata, katib, kubeflow, machine-learning, hyperparameter] +maintainers: [Kenneth Koski ] +series: [kubernetes] +resources: + oci-image: + type: oci-image + description: Backing OCI image + auto-fetch: true + upstream-source: docker.io/kubeflowkatib/katib-db-manager:v1beta1-a96ff59 +requires: + mysql: + interface: mysql +provides: + katib-db-manager: + interface: http +min-juju-version: 2.8.6 diff --git a/operators/katib-db-manager/reactive/katib_db_manager.py b/operators/katib-db-manager/reactive/katib_db_manager.py new file mode 100644 index 00000000000..deff7e47ac8 --- /dev/null +++ b/operators/katib-db-manager/reactive/katib_db_manager.py @@ -0,0 +1,88 @@ +from charmhelpers.core import hookenv +from charms import layer +from charms.reactive import ( + hook, + set_flag, + clear_flag, + when, + when_any, + when_not, + endpoint_from_name, +) + + +@hook("upgrade-charm") +def upgrade_charm(): + clear_flag("charm.started") + + +@when("charm.started") +def charm_ready(): + layer.status.active("") + + +@when_any("layer.docker-resource.oci-image.changed", "config.changed", "mysql.changed") +def update_image(): + clear_flag("charm.started") + + +@when("layer.docker-resource.oci-image.available", "mysql.available") +@when_not("charm.started") +def start_charm(): + if not hookenv.is_leader(): + hookenv.log("This unit is not a leader.") + return False + + layer.status.maintenance("configuring container") + + image_info = layer.docker_resource.get_info("oci-image") + + mysql = endpoint_from_name("mysql") + + port = 
hookenv.config("port") + + layer.caas_base.pod_spec_set( + { + "version": 3, + "containers": [ + { + "name": "katib-db-manager", + "command": ["./katib-db-manager"], + "imageDetails": { + "imagePath": image_info.registry_path, + "username": image_info.username, + "password": image_info.password, + }, + "ports": [{"name": "api", "containerPort": port}], + "envConfig": { + "DB_NAME": "mysql", + "DB_USER": "root", + "DB_PASSWORD": mysql.root_password(), + "KATIB_MYSQL_DB_HOST": mysql.host(), + "KATIB_MYSQL_DB_PORT": mysql.port(), + "KATIB_MYSQL_DB_DATABASE": "katib", + }, + "kubernetes": { + "readinessProbe": { + "exec": { + "command": ["/bin/grpc_health_probe", f"-addr=:{port}"] + }, + "initialDelaySeconds": 5, + }, + "livenessProbe": { + "exec": { + "command": ["/bin/grpc_health_probe", f"-addr=:{port}"] + }, + "initialDelaySeconds": 10, + "periodSeconds": 60, + "failureThreshold": 5, + }, + }, + } + ], + }, + ) + + layer.status.maintenance("creating container") + clear_flag("mysql.changed") + set_flag("charm.started") diff --git a/operators/katib-ui/config.yaml b/operators/katib-ui/config.yaml new file mode 100644 index 00000000000..ec3eb24005c --- /dev/null +++ b/operators/katib-ui/config.yaml @@ -0,0 +1,5 @@ +options: + port: + type: int + default: 8080 + description: HTTP port diff --git a/operators/katib-ui/layer.yaml b/operators/katib-ui/layer.yaml new file mode 100644 index 00000000000..1095b71ff1e --- /dev/null +++ b/operators/katib-ui/layer.yaml @@ -0,0 +1,6 @@ +repo: https://github.com/juju-solutions/bundle-kubeflow.git +includes: + - "layer:caas-base" + - "layer:status" + - "layer:docker-resource" + - "interface:service-mesh" diff --git a/operators/katib-ui/metadata.yaml b/operators/katib-ui/metadata.yaml new file mode 100755 index 00000000000..005a8f6a586 --- /dev/null +++ b/operators/katib-ui/metadata.yaml @@ -0,0 +1,25 @@ +name: katib-ui +display-name: Katib UI +summary: A Kubernetes-native project for automated machine learning (AutoML) +description: 
| + Katib supports Hyperparameter Tuning, Early Stopping and Neural Architecture Search + + Katib is the project which is agnostic to machine learning (ML) frameworks. It can tune + hyperparameters of applications written in any language of the users’ choice and natively + supports many ML frameworks, such as TensorFlow, MXNet, PyTorch, XGBoost, and others. +tags: [ai, bigdata, katib, kubeflow, machine-learning, hyperparameter] +maintainers: [Kenneth Koski ] +series: [kubernetes] +resources: + oci-image: + type: oci-image + description: Backing OCI image + auto-fetch: true + upstream-source: docker.io/kubeflowkatib/katib-ui:v1beta1-a96ff59 +requires: + service-mesh: + interface: service-mesh +provides: + katib-ui: + interface: http +min-juju-version: 2.8.6 diff --git a/operators/katib-ui/reactive/katib_ui.py b/operators/katib-ui/reactive/katib_ui.py new file mode 100644 index 00000000000..1f7cf490a05 --- /dev/null +++ b/operators/katib-ui/reactive/katib_ui.py @@ -0,0 +1,94 @@ +import os + +from charmhelpers.core import hookenv +from charms import layer +from charms.reactive import ( + clear_flag, + endpoint_from_name, + hook, + set_flag, + when, + when_not, +) + + +@hook("upgrade-charm") +def upgrade_charm(): + clear_flag("charm.started") + + +@when("charm.started") +def charm_ready(): + layer.status.active("") + + +@when("layer.docker-resource.oci-image.changed") +def update_image(): + clear_flag("charm.started") + + +@when("endpoint.service-mesh.joined") +def configure_mesh(): + endpoint_from_name("service-mesh").add_route( + prefix="/katib/", service=hookenv.service_name(), port=hookenv.config("port") + ) + + +@when("layer.docker-resource.oci-image.available") +@when_not("charm.started") +def start_charm(): + if not hookenv.is_leader(): + hookenv.log("This unit is not a leader.") + return False + + layer.status.maintenance("configuring container") + + image_info = layer.docker_resource.get_info("oci-image") + + port = hookenv.config("port") + + 
layer.caas_base.pod_spec_set( + { + "version": 2, + "serviceAccount": { + "rules": [ + { + "apiGroups": [""], + "resources": [ + "configmaps", + "namespaces", + ], + "verbs": ["*"], + }, + { + "apiGroups": ["kubeflow.org"], + "resources": [ + "experiments", + "trials", + "suggestions", + ], + "verbs": ["*"], + }, + ] + }, + "containers": [ + { + "name": "katib-ui", + "command": ["./katib-ui"], + "args": [f"--port={port}"], + "imageDetails": { + "imagePath": image_info.registry_path, + "username": image_info.username, + "password": image_info.password, + }, + "ports": [{"name": "http", "containerPort": port}], + "config": { + "KATIB_CORE_NAMESPACE": os.environ["JUJU_MODEL_NAME"], + }, + } + ], + } + ) + + layer.status.maintenance("creating container") + set_flag("charm.started")