# sklearn IRIS

In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

# Make the interpreter's bin/ directory visible to the `!shell` commands in this notebook.
# NOTE(review): sys.base_prefix is the base installation; inside a virtualenv the
# environment's own directory is sys.prefix -- confirm which one is intended.
import sys, os

_bin_dir = os.path.join(sys.base_prefix, 'bin')
_current_path = os.environ.get('PATH', '')
# Compare against whole PATH entries: a plain substring test wrongly treats a prefix
# such as "/usr" as already present whenever PATH contains "/usr/local/bin".
if _bin_dir not in _current_path.split(os.pathsep):
    # Prepend so that this interpreter's executables win; tolerate an unset/empty PATH.
    os.environ['PATH'] = f"{_bin_dir}{os.pathsep}{_current_path}" if _current_path else _bin_dir

In [2]:
# Base name shared by the files this notebook generates: the model script `{NAME}.py`,
# the locust file `benchmark_{NAME}.py`, and the `--csv {NAME}` prefix of the locust run.
NAME = 'sklearn_iris_mlflow'

# Load data & train model

In [3]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn import datasets

# Keep only the feature columns from index 2 onward.
iris = datasets.load_iris()
x = iris.data[:, 2:]
y = iris.target
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=7)

# add parameters for tuning
num_estimators = 100

# train the model
# Seed the forest as well as the split, so that "Restart Kernel -> Run All"
# reproduces the same model, predictions and MSE.
# NOTE(review): a regressor is fitted on `iris.target`; if those are class labels a
# classifier may be what is intended -- confirm.
rf = RandomForestRegressor(n_estimators=num_estimators, random_state=7)
rf.fit(X_train, y_train)
predictions = rf.predict(X_test)
print('predictions: ', predictions)

# log model performance
mse = mean_squared_error(y_test, predictions)
print("  mse: %f" % mse)

predictions:  [2.   1.   0.   1.   1.62 0.   1.16 1.   0.   1.   2.   1.   0.   2.
 0.   1.91 2.   2.   0.   0.   1.   2.   1.   1.28 1.48 1.91 1.   1.
 2.   2.  ]
  mse: 0.093167


In [5]:
import pickle

tmpdir = 'mlflow_tmp'
# Create the scratch directory in pure Python: unlike `!mkdir`, this stays silent
# when the directory already exists on a re-run and does not depend on a POSIX shell.
os.makedirs(tmpdir, exist_ok=True)

# Pickle the trained forest to the location the generated model script reads from.
data_path = os.path.join(tmpdir, 'skmodel.pkl')
with open(data_path, 'wb') as model_file:
    pickle.dump(rf, model_file)

In [6]:
%%writefile {NAME}.py
from __future__ import print_function

import os
import pickle

import pandas as pd
import numpy as np
import pytest
import six

import tensorflow as tf

import mlflow
import mlflow.pyfunc
import mlflow.pyfunc.model
from mlflow.models import Model


def _load_pyfunc(path):
    with open(path, 'rb') as of:
        data_model = pickle.load(of)
    class Model:
        def predict(self, inputs):
            inputs = inputs.to_numpy()
            outputs = data_model.predict(inputs)
            return outputs
    return Model()
        


if __name__ == '__main__':
    # Same directory and pickle file name that the notebook cell used when dumping the model.
    tmpdir = 'mlflow_tmp'
    data_path = os.path.join(tmpdir, 'skmodel.pkl')
    model_path = os.path.join(tmpdir, "model")

    # This file doubles as the loader module. Derive its module name with splitext,
    # which strips whatever extension is present instead of blindly dropping the
    # last three characters of the file name.
    loader_module = os.path.splitext(os.path.basename(__file__))[0]

    model_config = Model(run_id="test")
    mlflow.pyfunc.save_model(path=model_path,
                             data_path=data_path,
                             loader_module=loader_module,
                             code_path=[__file__],
                             mlflow_model=model_config)


Writing sklearn_iris_mlflow.py


In [7]:
from bentoml.utils import detect_free_port
PORT = detect_free_port()
server_url = f'http://127.0.0.1:{PORT}/invocations'
print(server_url)

!rm -r {tmpdir}/model
!python {NAME}.py

print("Run this command to launch mlflow:")
print(f"mlflow models serve -m {tmpdir}/model --port {PORT}")
# !mlflow models serve -m {tmpdir}/model --port {PORT}

http://127.0.0.1:50737/invocations
rm: cannot remove ‘mlflow_tmp/model’: No such file or directory
Run this command to launch mlflow:
mlflow models serve -m mlflow_tmp/model --port 50737


# Test with requests

In [8]:
import json
import requests
import pandas as pd

from sklearn import datasets
from sklearn.model_selection import train_test_split

# Recreate the split with the same test_size and seed as the training cell.
iris = datasets.load_iris()
x = iris.data[:, 2:]
y = iris.target
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=7)

headers = {"content-type": "application/json"}
raw_data = X_test[:1]
# Serialize a single test row as "split"-oriented JSON with string column names.
# A concrete list is passed for the columns instead of a one-shot `map` iterator.
column_names = [str(i) for i in range(raw_data.shape[1])]
data = pd.DataFrame(raw_data, columns=column_names).to_json(orient='split')

print(data)
# Bound the wait so the cell fails fast instead of hanging on an unresponsive server.
json_response = requests.post(server_url, data=data, headers=headers, timeout=10)
print(json_response)
print(json_response.json())

{"columns":["0","1"],"index":[0],"data":[[5.1,1.8]]}
<Response [200]>
[2.0]


# Benchmark

In [9]:
import pandas as pd
import json
import copy
import random


def get_request_producer():
    """Build a zero-argument callable that produces one benchmark request.

    Each call of the returned function yields ``(url, method, headers, data)``:
    the module-level ``server_url`` captured when the producer was built, the
    literal ``"POST"``, a JSON content-type header dict, and a "split"-oriented
    JSON body holding the first held-out iris row with a small random offset
    (below 1e-4) added to each of its two values.
    """
    from sklearn import datasets
    from sklearn.model_selection import train_test_split

    iris = datasets.load_iris()
    features = iris.data[:, 2:]
    labels = iris.target

    endpoint = server_url
    verb = "POST"
    request_headers = {"content-type": "application/json"}
    _, held_out, _, _ = train_test_split(features, labels, test_size=0.1, random_state=7)
    template = held_out[:1].tolist()

    def _gen_data():
        # Work on a fresh copy so the template row is never modified between calls.
        row = list(template[0])
        row[0] += random.random() / 10000
        row[1] += random.random() / 10000
        payload = pd.DataFrame([row]).to_json(orient='split')
        return endpoint, verb, request_headers, payload

    return _gen_data

# Build a producer and display one generated (url, method, headers, data) tuple.
get_request_producer()()

('http://127.0.0.1:50737/invocations',
 'POST',
 {'content-type': 'application/json'},
 '{"columns":[0,1],"index":[0],"data":[[5.1000379003,1.8000748634]]}')

In [10]:
from bentoml.utils.benchmark import BenchmarkClient
# Drive the endpoint with BentoML's benchmark client, fed by a fresh request producer.
# NOTE(review): the meaning of `lambda: 1` and of the positional start_session
# arguments is not visible in this notebook; the recorded output reports
# "200 users spawned", which matches the second argument -- confirm the rest
# against bentoml.utils.benchmark.
b = BenchmarkClient(get_request_producer(), lambda: 1, timeout=10)
b.start_session(60, 200, 600)


╒═════════════╤═════════════╤══════════╤═════════════════╤═══════════════════╕
│ Reqs/Fail   │   Failure % │   Reqs/s │ Avg Resp Time   │   Client Health % │
╞═════════════╪═════════════╪══════════╪═════════════════╪═══════════════════╡
│ 0/0         │           0 │        0 │ None            │               100 │
╘═════════════╧═════════════╧══════════╧═════════════════╧═══════════════════╛
------ 200 users spawned ------

╒═════════════╤═════════════╤══════════╤═════════════════╤═══════════════════╕
│ Reqs/Fail   │   Failure % │   Reqs/s │   Avg Resp Time │   Client Health % │
╞═════════════╪═════════════╪══════════╪═════════════════╪═══════════════════╡
│ 238/0       │           0 │    117.5 │        0.605616 │               100 │
╘═════════════╧═════════════╧══════════╧═════════════════╧═══════════════════╛

╒═════════════╤═════════════╤══════════╤═════════════════╤═══════════════════╕
│ Reqs/Fail   │   Failure % │   Reqs/s │   Avg Resp Time │   Client Health % │
╞═════════════╪══

In [35]:
b.killall()  # Cancel the benchmark session started with b.start_session(...)

# Benchmark with locust

In [29]:
%%writefile benchmark_{NAME}.py
from locust import HttpLocust, TaskSet, task, constant
from functools import lru_cache

import pandas as pd
import json


@lru_cache(maxsize=1)
def data_producer():
    """Return a zero-argument callable that yields ``(headers, data)`` for one request.

    ``headers`` is a JSON content-type header dict and ``data`` is the held-out
    iris split serialized as "split"-oriented JSON. Because of ``lru_cache`` the
    dataset is loaded and split only once per process.
    """
    from sklearn import datasets
    from sklearn.model_selection import train_test_split

    iris = datasets.load_iris()
    x = iris.data[:, 2:]
    y = iris.target
    _, X_test, _, _ = train_test_split(x, y, test_size=0.2, random_state=7)

    headers = {"content-type": "application/json"}
    # The payload never changes, so serialize it once here rather than on every
    # request; per-request DataFrame/JSON work would only add client-side
    # overhead to the load test.
    data = pd.DataFrame(X_test).to_json(orient='split')

    def _gen_data():
        return headers, data

    return _gen_data


class WebsiteTasks(TaskSet):
    """Task set with a single task that posts the prepared payload."""

    @task
    def index(self):
        # The empty path means the request goes to the host URL itself.
        make_request = data_producer()
        request_headers, payload = make_request()
        self.client.post('', data=payload, headers=request_headers)

class WebsiteUser(HttpLocust):
    """Locust user class whose behaviour is defined by ``WebsiteTasks``."""
    task_set = WebsiteTasks
    # NOTE(review): presumably a fixed 1-second pause between a user's tasks -- confirm against the locust docs.
    wait_time = constant(1)

Overwriting benchmark_sklearn_iris_mlflow.py


In [30]:
!locust -f benchmark_{NAME}.py -H {server_url} --no-web -t 60s --csv {NAME} -c 900 -r 90

[2020-04-02 03:16:37,262] ip-172-31-15-25/INFO/locust.runners: Hatching and swarming 900 users at the rate 90 users/s (0 users already running)...
[2020-04-02 03:16:38,262] ip-172-31-15-25/INFO/locust.main: Run time limit set to 60 seconds
[2020-04-02 03:16:38,262] ip-172-31-15-25/INFO/locust.main: Starting Locust 0.14.5
 Name                                                          # reqs      # fails     Avg     Min     Max  |  Median   req/s failures/s
--------------------------------------------------------------------------------------------------------------------------------------------
 POST /invocations                                                 65     0(0.00%)       8       7      17  |       8    0.00    0.00
--------------------------------------------------------------------------------------------------------------------------------------------
 Aggregated                                                        65     0(0.00%)       8       7      17  |       8    0.0