In [1]:
!pip show klops

[0m

In [2]:
import os
import sys

# Make the local klops checkout importable. The location can be overridden
# with the KLOPS_SRC environment variable; the default is the original
# development path.
KLOPS_SRC = os.environ.get(
    "KLOPS_SRC", "/home/jupyter/asrul_workspace/development/klops/"
)

# Guard against piling up duplicate entries when this cell is re-run.
if KLOPS_SRC not in sys.path:
    sys.path.append(KLOPS_SRC)

In [3]:
from klops.experiment import Experiment
import mlflow
from sklearn.datasets import load_iris
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

In [4]:
# End the currently active MLflow run, if any, before the experiment is
# created below — presumably to clear a run left open by an earlier execution
# of this notebook (TODO confirm).
mlflow.end_run()

In [5]:
# Experiment name and tracking server used for every run in this notebook.
EXPERIMENT_NAME = "cakrul-test-25-8-22-final"
TRACKING_URI = "http://34.128.91.55:5000/"

experiment = Experiment(name=EXPERIMENT_NAME, tracking_uri=TRACKING_URI)

In [6]:
# Load the Iris dataset and split it into the feature matrix and label vector.
iris = load_iris()
X, y = iris.data, iris.target
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

## Experiment Without Any Tuner

In [7]:
# Fixed hyper-parameters for the baseline run (no tuner). `random_state` pins
# the forest's randomness so re-running the notebook reproduces the same model.
hyper_params = {
    'max_depth': 3,
    'min_samples_split': 2,
    'random_state': 42,
}
experiment.start(RandomForestClassifier(**hyper_params), x_train_data=X, y_train_data=y)

<klops.experiment.experiment.Experiment at 0x7fbb1e2e5580>

## Experiment Using GridSearchCV

In [8]:
# Candidate values handed to the "gridsearch" tuner.
param_grid = {
    'max_depth': [3, 5, 10],
    'min_samples_split': [2, 5, 10],
}
# Seed the estimator so the search results are reproducible across re-runs.
classifier = RandomForestClassifier(random_state=42)
experiment.start(classifier, x_train_data=X, y_train_data=y, tuner="gridsearch", tuner_args=param_grid)

<klops.experiment.experiment.Experiment at 0x7fbb1e2e5580>

## Experiment Using Hyperopt

In [9]:
from hyperopt import hp
from hyperopt.pyll import scope

def _int_quniform(label, low, high):
    """Return an integer-cast quantized-uniform dimension over [low, high], step 1."""
    return scope.int(hp.quniform(label, low, high, 1))


# Search space handed to the "hyperopt" tuner.
SEARCH_SPACE = {
    "n_estimators": _int_quniform("n_estimators", 10, 50),
    "max_depth": _int_quniform("max_depth", 10, 30),
}

classifier = RandomForestClassifier()

# `start` hands back the Experiment (see the outputs of the earlier cells),
# so the name is rebound to the returned object here.
experiment = experiment.start(
    classifier,
    x_train_data=X,
    y_train_data=y,
    tuner="hyperopt",
    tuner_args=SEARCH_SPACE,
)

TPE is being used as the default algorithm.


  0%|          | 0/20 [00:00<?, ?trial/s, best loss=?]




  5%|▌         | 1/20 [00:03<01:02,  3.31s/trial, best loss: 0.2581988897471611]




 10%|█         | 2/20 [00:06<00:56,  3.12s/trial, best loss: 0.2581988897471611]




 15%|█▌        | 3/20 [00:09<00:55,  3.29s/trial, best loss: 0.2581988897471611]




 20%|██        | 4/20 [00:12<00:50,  3.19s/trial, best loss: 0.2581988897471611]




 25%|██▌       | 5/20 [00:15<00:47,  3.15s/trial, best loss: 0.2581988897471611]




 30%|███       | 6/20 [00:19<00:46,  3.29s/trial, best loss: 0.2581988897471611]




 35%|███▌      | 7/20 [00:22<00:43,  3.31s/trial, best loss: 0.2581988897471611]




 40%|████      | 8/20 [00:26<00:40,  3.35s/trial, best loss: 0.2581988897471611]




 45%|████▌     | 9/20 [00:29<00:37,  3.44s/trial, best loss: 0.2581988897471611]




 50%|█████     | 10/20 [00:33<00:34,  3.44s/trial, best loss: 0.2581988897471611]




 55%|█████▌    | 11/20 [00:36<00:29,  3.28s/trial, best loss: 0.2581988897471611]




 60%|██████    | 12/20 [00:39<00:25,  3.23s/trial, best loss: 0.2581988897471611]




 65%|██████▌   | 13/20 [00:43<00:24,  3.47s/trial, best loss: 0.2581988897471611]




 70%|███████   | 14/20 [00:46<00:20,  3.41s/trial, best loss: 0.2581988897471611]




 75%|███████▌  | 15/20 [00:50<00:17,  3.42s/trial, best loss: 0.2581988897471611]




 80%|████████  | 16/20 [00:53<00:13,  3.44s/trial, best loss: 0.2581988897471611]




 85%|████████▌ | 17/20 [00:57<00:10,  3.57s/trial, best loss: 0.2581988897471611]




 90%|█████████ | 18/20 [01:00<00:06,  3.49s/trial, best loss: 0.2581988897471611]




 95%|█████████▌| 19/20 [01:04<00:03,  3.68s/trial, best loss: 0.2581988897471611]




100%|██████████| 20/20 [01:07<00:00,  3.39s/trial, best loss: 0.2581988897471611]


# Deployment  


## Define The Deployment Auth Target
In this example, we use GKE authentication, since we already have a project cluster there.

In [10]:
from klops.deployment.auth import GKEAuthentication
from klops.deployment import Deployment

# Coordinates of the GKE cluster used as the deployment target.
GKE_CLUSTER = {
    "project_id": "koinworks-data-staging",
    "zone": "asia-southeast2",
    "cluster_id": "seldon-system-dev",
}

gke = GKEAuthentication(**GKE_CLUSTER)

### Deploy Using the Auto-Deployment Built into the Experiment Module

Our Experiment module has a built-in `deploy` method to invoke the deployment. All we need to do is define the `artifact_uri` (the experiment result URI), the `deployment_name`, the `model_name`, the cluster's `namespace`, and the cluster authentication instance. As mentioned before, we use GKE authentication since we already have a cluster there.

First, we need to find the experiment result. The easiest way is to open the MLflow Tracking UI, select the experiment by its name, and click the desired model on the experiment page. You will then see a page like the one below; copy the experiment artifact address marked by the red rectangle in the example.  

![Example Deployment](./resources/images/experiment_result.png)

In [11]:
# Deploy the logged model through the Experiment's built-in `deploy` method.
# TODO: use the model registry as the model source, and make sure the model
#       is already registered there (e.g. model-v2.1).
deploy_settings = {
    "artifact_uri": "gs://koinworks-mlflow-dev/mlruns/18/431638e6fa68449a8c5be36023fc4316/artifacts/model",
    "deployment_name": "mlflow-test",
    "model_name": "iris-mlflow",
    "authentication": gke,
    "namespace": "seldon",
}

experiment.deploy(**deploy_settings)

Klops path: /home/jupyter/asrul_workspace/development/klops/klops


{'apiVersion': 'machinelearning.seldon.io/v1alpha2',
 'kind': 'SeldonDeployment',
 'metadata': {'creationTimestamp': '2022-08-28T23:47:47Z',
  'generation': 1,
  'managedFields': [{'apiVersion': 'machinelearning.seldon.io/v1alpha2',
    'fieldsType': 'FieldsV1',
    'fieldsV1': {'f:spec': {'.': {}, 'f:name': {}}},
    'manager': 'OpenAPI-Generator',
    'operation': 'Update',
    'time': '2022-08-28T23:47:47Z'},
   {'apiVersion': 'machinelearning.seldon.io/v1',
    'fieldsType': 'FieldsV1',
    'fieldsV1': {'f:status': {'.': {},
      'f:address': {'.': {}, 'f:url': {}},
      'f:conditions': {},
      'f:deploymentStatus': {'.': {},
       'f:mlflow-test-default-0-classifier': {'.': {}, 'f:replicas': {}}},
      'f:description': {},
      'f:replicas': {},
      'f:state': {}}},
    'manager': 'manager',
    'operation': 'Update',
    'subresource': 'status',
    'time': '2022-08-28T23:57:48Z'}],
  'name': 'mlflow-test',
  'namespace': 'seldon',
  'resourceVersion': '33454724',
  'uid

### Deploy Manually Using Seldon Deployment Class  
The Deployment class can be instantiated directly when we already have the model weights. To do so, instantiate the Deployment class, then invoke its `deploy` method with the deployment configuration as the argument.

In [12]:

# Namespace to deploy into and the JSON file holding the deployment spec.
DEPLOYMENT_NAMESPACE = "seldon"
DEPLOYMENT_CONFIG_PATH = "notebooks/iris.json"

deployment = Deployment(gke, DEPLOYMENT_NAMESPACE)

# Load the deployment configuration from disk, then submit it.
config = deployment.load_deployment_configuration(DEPLOYMENT_CONFIG_PATH)
deployment.deploy(config)

{'apiVersion': 'machinelearning.seldon.io/v1alpha2',
 'kind': 'SeldonDeployment',
 'metadata': {'creationTimestamp': '2022-08-29T08:43:29Z',
  'generation': 1,
  'managedFields': [{'apiVersion': 'machinelearning.seldon.io/v1alpha2',
    'fieldsType': 'FieldsV1',
    'fieldsV1': {'f:spec': {'.': {},
      'f:name': {},
      'f:predictors': {},
      'f:protocol': {}}},
    'manager': 'OpenAPI-Generator',
    'operation': 'Update',
    'time': '2022-08-29T08:43:29Z'}],
  'name': 'sklearn',
  'namespace': 'seldon',
  'resourceVersion': '33765347',
  'uid': '7b6975ca-b2a4-41b0-8908-d86d319d1958'},
 'spec': {'name': 'iris-predict',
  'predictors': [{'graph': {'children': [],
     'implementation': 'SKLEARN_SERVER',
     'modelUri': 'gs://seldon-models/v1.15.0-dev/sklearn/iris',
     'name': 'classifier',
     'parameters': [{'name': 'method', 'type': 'STRING', 'value': 'predict'}]},
    'name': 'default'}],
  'protocol': 'v2'}}

### Remove The deployment  
When we have a model update, we should remove the current deployment. To do so, the Deployment class has a method named `delete`. Passing the `deployment_name` as an argument is required.

In [13]:
# Delete the deployment named "mlflow-test", i.e. the one created by
# `experiment.deploy` earlier in this notebook. The recorded run returned True.
deployment.delete(deployment_name="mlflow-test")

True

## Versioning Our Data  
Versioning our data is required in every machine learning project. Here are some reasons why data versioning is needed.  
- **Ensure better training data**: ML comprises rapid experimentation, iteration, and training models on data. Thus, training on incorrect data can have disastrous results for the outcomes of an ML project.  
- **Track data schema**: Enterprise data is usually obtained in batches, and often minor changes in the ML schema are applied throughout a project. With proper versioning, you can easily track and evolve the data schema over time. You can also understand whether these changes are backward and forward compatible.  
- **Continual model training**: In production environments, data is refreshed periodically and may trigger a new run of the model training pipeline. When such automated retraining occurs, it is essential to have data versioned for tracking a model’s efficacy.  
- **Enhance traceability and reproducibility**: Data scientists must be able to track, identify the provenance of data, and point out which version of a dataset reinforces the outcomes of their experiments. They should re-run the entire ML pipeline and reproduce the exact results each time as it is a critical input for the modeling process. Thus, the original training data must always be available. Hence, from a reproducibility/traceability perspective, proper versioning is critical.  
- **Auditing**: Proper versioning ensures that the integrity of data-based activities is upheld by identifying when modifications are made. By monitoring and analyzing the actions of both users and models, auditors can identify intentional and accidental lapses in user behavior. Data science auditors can thus examine the effect of data changes on model accuracy and determine best ML practices for the enterprise.  
This can be done using Klops Versioning, a version-control wrapper built on DVC. The wrapper aims to make every DVC command available from code. 

In [14]:
from klops.versioning import Versioning
# Handle for the Klops DVC wrapper; the cells below issue DVC commands through it.
version = Versioning()

In [15]:
# Runs `dvc init` (per the logged command in the output below).
# NOTE(review): the recorded run failed because `.dvc` is ignored by the SCM
# tool; per the error message, add '!.dvc' to .gitignore before running this,
# otherwise every later versioning cell fails with "not inside a DVC repo".
version.init()

ERROR: failed to initiate DVC - /home/jupyter/asrul_workspace/development/klops/.dvc is ignored by your SCM tool. 
Make sure that it's tracked, for example, by adding '!.dvc' to .gitignore.
2022-08-29 15:43:31,033 ERROR    Command 'dvc init' returned non-zero exit status 1.


In [16]:
# Register the storage location below as a DVC remote.
DVC_REMOTE_NAME = "iris"
DVC_REMOTE_URL = "gs://dvc-storage-ds-jkt/iris"

version.add_remote(name=DVC_REMOTE_NAME, remote_url=DVC_REMOTE_URL)

ERROR: configuration error - config file error: Not inside a DVC repo
2022-08-29 15:43:31,628 ERROR    Command 'dvc remote add -d iris gs://dvc-storage-ds-jkt/iris' returned non-zero exit status 251.


Setting 'iris' as a default remote.


In [17]:
# Track the notebooks/ directory with DVC; the logged command is
# `dvc add notebooks/`. Requires an initialised DVC repository — the recorded
# run failed because `dvc init` had not succeeded.
version.add("notebooks/")

ERROR: you are not inside of a DVC repository (checked up to mount point '/home/jupyter')
2022-08-29 15:43:32,416 ERROR    Command 'dvc add notebooks/' returned non-zero exit status 253.


In [18]:
# Push the tracked revision; the logged command is `dvc push`.
# Requires an initialised DVC repository — the recorded run failed because
# `dvc init` had not succeeded.

version.push()

ERROR: you are not inside of a DVC repository (checked up to mount point '/home/jupyter')
2022-08-29 15:43:33,131 ERROR    Command 'dvc push' returned non-zero exit status 253.
