# sklearn IRIS

In [5]:
# Reload the autoreload extension and re-import changed modules before each cell runs.
%reload_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the cell output.
%matplotlib inline

# Put the active environment's bin directory on PATH so that `!shell` commands
# used later in this notebook (`python`, `mlflow`, `locust`) resolve to the
# executables installed alongside this kernel.
import sys, os

# sys.prefix points at the active virtualenv (sys.base_prefix would be the
# interpreter the venv was created from, which does not hold venv-installed tools).
env_bin = os.path.join(sys.prefix, 'bin')
# Compare whole PATH entries rather than substrings, and tolerate an unset PATH.
path_entries = [entry for entry in os.environ.get('PATH', '').split(os.pathsep) if entry]
if env_bin not in path_entries:
    os.environ['PATH'] = os.pathsep.join([env_bin] + path_entries)

In [21]:
# Base name shared by the generated loader module (`{NAME}.py`) and the
# locust benchmark file (`benchmark_{NAME}.py`) written further down.
NAME = "sklearn_iris_mlflow"

# Load data & train model

In [6]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn import datasets

# Load Iris and keep only the last two feature columns (index 2 onward).
iris = datasets.load_iris()
x = iris.data[:, 2:]
y = iris.target
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=7)

# add parameters for tuning
num_estimators = 100
# Seed the forest as well as the split; without it every run prints
# different predictions and a different MSE.
random_seed = 7

# train the model
# NOTE(review): `y` holds class labels, so the regressor treats them as
# numeric values and can emit fractional predictions -- confirm this is intended.
rf = RandomForestRegressor(n_estimators=num_estimators, random_state=random_seed)
rf.fit(X_train, y_train)
predictions = rf.predict(X_test)
print('predictions: ', predictions)

# log model performance
mse = mean_squared_error(y_test, predictions)
print("  mse: %f" % mse)

predictions:  [2.   1.   0.   1.   1.58 0.   1.19 1.   0.   1.   2.   1.   0.   2.
 0.   1.82 2.   2.   0.   0.   1.   2.   1.   1.25 1.53 1.82 1.   1.
 2.   2.  ]
  mse: 0.090023


In [8]:
import pickle
# Persist the trained estimator so the generated loader module can unpickle it.
tmpdir = 'mlflow_tmp'
data_path = os.path.join(tmpdir, 'skmodel.pkl')
# open(..., 'wb') does not create missing parent directories, so make sure the
# working directory exists before writing (no-op when it is already there).
os.makedirs(tmpdir, exist_ok=True)
with open(data_path, 'wb') as model_file:
    pickle.dump(rf, model_file)

In [28]:
%%writefile {NAME}.py
from __future__ import print_function

import os
import pickle

import pandas as pd
import numpy as np
import pytest
import six

import tensorflow as tf

import mlflow
import mlflow.pyfunc
import mlflow.pyfunc.model
from mlflow.models import Model


def _load_pyfunc(path):
    with open(path, 'rb') as of:
        data_model = pickle.load(of)
    class Model:
        def predict(self, inputs):
            inputs = inputs.to_numpy()
            outputs = data_model.predict(inputs)
            return outputs
    return Model()
        


if __name__ == '__main__':
    # Export the pickled estimator as an MLflow pyfunc model, registering this
    # very file as both the loader module and the bundled code.
    artifact_dir = 'mlflow_tmp'
    pickled_estimator = os.path.join(artifact_dir, 'skmodel.pkl')
    export_dir = os.path.join(artifact_dir, "model")

    # Module name = this file's basename with the trailing ".py" stripped.
    this_module = os.path.basename(__file__)[:-3]

    mlflow.pyfunc.save_model(
        path=export_dir,
        data_path=pickled_estimator,
        loader_module=this_module,
        code_path=[__file__],
        mlflow_model=Model(run_id="test"),
    )


Overwriting sklearn_iris_mlflow.py


In [55]:
# Clear any previous export before re-exporting; -f keeps this quiet on a
# fresh run when the directory does not exist yet.
!rm -rf {tmpdir}/model
# Run the generated module to write the pyfunc model under {tmpdir}/model.
!python {NAME}.py
# NOTE: this serves in the foreground and blocks the cell until interrupted;
# run the request/benchmark cells from another kernel or terminal meanwhile.
!mlflow models serve -m {tmpdir}/model

  import imp
2020/03/19 12:56:47 INFO mlflow.models.cli: Selected backend for flavor 'python_function'
2020/03/19 12:56:47 INFO mlflow.pyfunc.backend: === Running command 'gunicorn --timeout=60 -b 127.0.0.1:5000 -w 1 ${GUNICORN_CMD_ARGS} -- mlflow.pyfunc.scoring_server.wsgi:app'
[2020-03-19 12:56:47 +0800] [1555253] [INFO] Starting gunicorn 20.0.4
[2020-03-19 12:56:47 +0800] [1555253] [INFO] Listening at: http://127.0.0.1:5000 (1555253)
[2020-03-19 12:56:47 +0800] [1555253] [INFO] Using worker: sync
[2020-03-19 12:56:47 +0800] [1555267] [INFO] Booting worker with pid: 1555267
  import imp
^C

Aborted!
[2020-03-19 12:57:39 +0800] [1555253] [INFO] Handling signal: int
[2020-03-19 12:57:40 +0800] [1555267] [INFO] Worker exiting (pid: 1555267)


# Test with requests

In [None]:
import json
import requests
import pandas as pd

from sklearn import datasets
from sklearn.model_selection import train_test_split

# Rebuild the same held-out split as the training cell (same slicing and seed).
iris = datasets.load_iris()
x = iris.data[:, 2:]
y = iris.target
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=7)

headers = {"content-type": "application/json"}
raw_data = X_test
# Column labels are the stringified positional indices: "0", "1", ...
column_names = [str(i) for i in range(raw_data.shape[1])]
data = pd.DataFrame(raw_data, columns=column_names).to_json(orient='split')

# Bound the wait so an unreachable or stalled server raises instead of
# hanging the kernel forever.
json_response = requests.post('http://127.0.0.1:5000/invocations',
                              data=data, headers=headers, timeout=10)
print(json_response)
print(json_response.json())

# Benchmark with locust

In [54]:
%%writefile benchmark_{NAME}.py
from locust import HttpLocust, TaskSet, task, constant
from functools import lru_cache

import pandas as pd
import json


@lru_cache(maxsize=1)
def data_producer():
    """Build (once) a zero-argument callable that returns the request to send.

    The Iris test split is created with the same slicing and seed as in the
    notebook, and serialized to split-oriented JSON a single time.

    Returns:
        A callable returning ``(headers, data)`` where ``headers`` is the
        content-type header dict and ``data`` is the JSON request body.
    """
    from sklearn import datasets
    from sklearn.model_selection import train_test_split

    iris = datasets.load_iris()
    features = iris.data[:, 2:]
    labels = iris.target
    _, X_test, _, _ = train_test_split(features, labels, test_size=0.2, random_state=7)

    headers = {"content-type": "application/json"}
    # Serialize up front: the payload never changes, and rebuilding the
    # DataFrame/JSON on every simulated request would add client-side work
    # to the load generator.
    payload = pd.DataFrame(X_test).to_json(orient='split')

    def _gen_data():
        return headers, payload

    return _gen_data


class WebsiteTasks(TaskSet):
    """Task set with a single task that POSTs the prepared payload to ``/invocations``."""

    @task
    def index(self):
        """Send one scoring request using the cached headers and body."""
        request_headers, request_body = data_producer()()
        self.client.post("/invocations", data=request_body, headers=request_headers)

class WebsiteUser(HttpLocust):
    """Simulated user that runs ``WebsiteTasks`` with a constant wait of 1 between tasks."""

    # Fixed wait time passed to locust's `constant` helper.
    wait_time = constant(1)
    # Tasks this user executes.
    task_set = WebsiteTasks

Overwriting benchmark_sklearn_iris_mlflow.py


In [None]:
# Launch locust with the generated benchmark file against the local model server.
!locust --locustfile benchmark_{NAME}.py --host http://127.0.0.1:5000