[Refactor] fork server processes earlier #1476

Merged (10 commits) on Mar 10, 2021

Changes from 7 commits
55 changes: 24 additions & 31 deletions bentoml/cli/bento_service.py
@@ -1,34 +1,32 @@
import argparse
import click
import sys

import json
import re
import sys

import click
import psutil

from bentoml import __version__
from bentoml.configuration import BENTOML_CONFIG
from bentoml.configuration.containers import BentoMLConfiguration, BentoMLContainer
from bentoml.utils.lazy_loader import LazyLoader
from bentoml.server import start_dev_server, start_prod_server
from bentoml.server.open_api import get_open_api_spec_json
from bentoml.utils import (
    ProtoMessageToDict,
    resolve_bundle_path,
)
from bentoml.cli.click_utils import (
    CLI_COLOR_SUCCESS,
    _echo,
    BentoMLCommandGroup,
    _echo,
    conditional_argument,
)
from bentoml.cli.utils import Spinner
from bentoml.configuration import BENTOML_CONFIG
from bentoml.configuration.containers import BentoMLConfiguration, BentoMLContainer
from bentoml.entrypoint import start_prod_server
from bentoml.saved_bundle import (
    load_from_dir,
    load_bento_service_api,
    load_bento_service_metadata,
    load_from_dir,
)
from bentoml.server import start_dev_server
from bentoml.server.open_api import get_open_api_spec_json
from bentoml.utils import ProtoMessageToDict, resolve_bundle_path
from bentoml.utils.docker_utils import validate_tag
from bentoml.utils.lazy_loader import LazyLoader
from bentoml.yatai.client import get_yatai_client

try:
@@ -310,23 +308,18 @@ def serve_gunicorn(
        bento, pip_installed_bundle_path, yatai_url
    )

    container = BentoMLContainer()
    config = BentoMLConfiguration(override_config_file=config)
    config.override(["api_server", "port"], port)
    config.override(["api_server", "workers"], workers)
    config.override(["api_server", "timeout"], timeout)
    config.override(["api_server", "enable_microbatch"], enable_microbatch)
    config.override(["api_server", "enable_swagger"], enable_swagger)
    config.override(["marshal_server", "max_batch_size"], mb_max_batch_size)
    config.override(["marshal_server", "max_latency"], mb_max_latency)
    config.override(["marshal_server", "workers"], microbatch_workers)
    container.config.from_dict(config.as_dict())

    from bentoml import marshal, server

    container.wire(packages=[marshal, server])

    start_prod_server(saved_bundle_path)
    start_prod_server(
        saved_bundle_path,
        port=port,
        workers=workers,
        timeout=timeout,
        enable_microbatch=enable_microbatch,
        enable_swagger=enable_swagger,
        mb_max_batch_size=mb_max_batch_size,
        mb_max_latency=mb_max_latency,
        microbatch_workers=microbatch_workers,
        config_file=config,
    )

@bentoml_cli.command(
help="Install shell command completion",
3 changes: 2 additions & 1 deletion bentoml/configuration/containers.py
@@ -18,7 +18,7 @@

from deepmerge import always_merger
from dependency_injector import containers, providers
from schema import Schema, SchemaError, And, Or
from schema import And, Or, Schema, SchemaError

from bentoml.configuration import config
from bentoml.exceptions import BentoMLConfigException
@@ -48,6 +48,7 @@
"yatai": {"url": Or(str, None)},
"tracing": {"zipkin_api_url": Or(str, None)},
"instrument": {"namespace": str},
"logging": {"level": str},
}
)

3 changes: 3 additions & 0 deletions bentoml/configuration/default_bentoml.yml
@@ -40,3 +40,6 @@ tracing:

instrument:
  namespace: BENTOML

logging:
  level: INFO
133 changes: 133 additions & 0 deletions bentoml/entrypoint/__init__.py
@@ -0,0 +1,133 @@
# Copyright 2019 Atalaya Tech, Inc.
Collaborator: What do you envision will be added under the /entrypoint package? What do we gain by moving it to a new package?

Member Author: The previous location was /bentoml/server/__init__.py. I don't think that is a proper place to wire packages when the wiring includes bentoml.server itself.

Collaborator: It should be fine to wire one's own package. Taking a step back, we should think about how we'd like to expose our public APIs. Having a module like BentoMLController is one choice. What are some of the best practices in Python?

Member: @bojiang @ssheng I think what @bojiang suggested last time, having an /api/ module for exposing public APIs, is probably the convention among Python-based DS/ML tools:

/api
├── server.py
├── __init__.py
├── yatai.py
├── bundle.py

Collaborator: Discussed offline, we will keep these in the original location under server/__init__.py.


# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at

# http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
import multiprocessing
from typing import Optional

from bentoml.configuration.containers import BentoMLConfiguration, BentoMLContainer
from bentoml.utils import reserve_free_port

logger = logging.getLogger(__name__)


def start_prod_server(
    saved_bundle_path: str,
    port: Optional[int] = None,
    workers: Optional[int] = None,
    timeout: Optional[int] = None,
    enable_microbatch: Optional[bool] = None,
    enable_swagger: Optional[bool] = None,
    mb_max_batch_size: Optional[int] = None,
    mb_max_latency: Optional[int] = None,
    microbatch_workers: Optional[int] = None,
    config_file: Optional[str] = None,
):
    import psutil

    assert (
        psutil.POSIX
    ), "BentoML API Server production mode only supports POSIX platforms"

    config = BentoMLConfiguration(override_config_file=config_file)
    config.override(["api_server", "port"], port)
    config.override(["api_server", "workers"], workers)
    config.override(["api_server", "timeout"], timeout)
    config.override(["api_server", "enable_microbatch"], enable_microbatch)
    config.override(["api_server", "enable_swagger"], enable_swagger)
    config.override(["marshal_server", "max_batch_size"], mb_max_batch_size)
    config.override(["marshal_server", "max_latency"], mb_max_latency)
    config.override(["marshal_server", "workers"], microbatch_workers)

    if config.config['api_server'].get('enable_microbatch'):
        prometheus_lock = multiprocessing.Lock()
        with reserve_free_port() as api_server_port:
            pass
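        # reserve_free_port binds a socket just long enough to pick an unused port
        # number and releases it on exit (based on how the helper is used elsewhere
        # in BentoML); only the number is handed to the forked processes below,
        # which bind it themselves.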

        model_server_job = multiprocessing.Process(
            target=_start_prod_server,
            kwargs=dict(
                saved_bundle_path=saved_bundle_path,
                port=api_server_port,
                config=config,
                prometheus_lock=prometheus_lock,
            ),
            daemon=True,
        )
        model_server_job.start()

        try:
            _start_prod_batching_server(
                saved_bundle_path=saved_bundle_path,
                config=config,
                api_server_port=api_server_port,
                prometheus_lock=prometheus_lock,
            )
        finally:
            model_server_job.terminate()
    else:
        _start_prod_server(saved_bundle_path=saved_bundle_path, config=config)


def _start_prod_server(
    saved_bundle_path: str,
    config: BentoMLConfiguration,
    port: Optional[int] = None,
Collaborator: Why do we need to explicitly have port here instead of taking it from config?

Member Author (@bojiang, Mar 5, 2021): I do not want to change the value of config.api_server.port. Seeing the batching app and the model app together as the whole API server, it makes sense to keep config.api_server rather than overriding it with the randomly picked port.

Collaborator (@ssheng, Mar 6, 2021): I understand the concern. This is why we should restructure the config keys and terminology. Before we do that, I think there is an opportunity here to make the intermediate port injectable as well. First, we can introduce an intermediate port (for lack of a better name):

api_server:
    marshal:
        intermediate_port: Null # or an actual port e.g. 6000

In the container, we can introduce a provider that first checks whether an intermediate port is defined and, if not, reserves a random free port.

intermediate_port = providers.Callable(
    lambda port: port if port else reserve_free_port(),
    config.marshal.intermediate_port,
)

Basically, a lot of the logic here can be refactored as a provider in containers.py.

Member Author (@bojiang, Mar 6, 2021): It's great to have a provider for the intermediate_port. But we have to wire after creating the new process. In your solution, reserve_free_port would be called twice and give intermediate_port a different value in each process.

Collaborator: Good call that reserve_free_port might be called twice, but it should be a solvable problem. We can use a singleton provider for creating the intermediate port, so all users of the provider get the same port. We will need to move the container creation out, however. Happy to discuss with you over a Zoom call.
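
A minimal sketch of that idea (not part of this PR), assuming dependency_injector's providers.Singleton and the reserve_free_port helper already used above; ExampleContainer and _pick_intermediate_port are illustrative names only:

from dependency_injector import containers, providers

from bentoml.utils import reserve_free_port


def _pick_intermediate_port(configured_port):
    # Use the configured port when given; otherwise reserve a free one.
    if configured_port:
        return configured_port
    with reserve_free_port() as port:
        return port


class ExampleContainer(containers.DeclarativeContainer):
    config = providers.Configuration()

    # Singleton: every consumer in this process resolves to the same port.
    intermediate_port = providers.Singleton(
        _pick_intermediate_port, config.marshal.intermediate_port,
    )

Since a Singleton is only shared within one process, the container creation would indeed have to move out before the fork for both processes to agree on the port, as noted above.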

Collaborator: UDS is superior, but we might have to leave the TCP option open to make remote marshaling a possibility. We can structure the config meaningfully to reflect this.

Member Author (quoting the above): "to leave the TCP option open to make remote marshaling a possibility. We can structure the config meaningfully to reflect this."

api_server:
  port: 5000
  enable_microbatch: False
  run_with_ngrok: False
  enable_swagger: True
  enable_metrics: True
  enable_feedback: True
  max_request_size: 20971520
  workers: 1
  timeout: 60

model_server:
  port: Null  # default: api_server.port when enable_marshal=False

marshal_server:
  port: Null  # default: api_server.port when enable_marshal=True


Like this?

Collaborator (@ssheng, Mar 8, 2021): I'm thinking of something like the following, where the user can choose the connector type and provide the related configs. Basically, the schema for connector is a union of the UDS and TCP connector schemas.

If UDS is chosen,

api_server:
  marshal:
    connector:
      type: UDS
      uds_related_key1: value1
      uds_related_key2: value2

Or, if TCP is chosen,

api_server:
  marshal:
    connector:
      type: TCP
      port: None # or some configured port
      tcp_related_key1: value1
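
A rough sketch of how that union could be expressed with the schema package already used in bentoml/configuration/containers.py; the key names below just mirror the placeholders above and are not real config keys:

from schema import Or, Schema

UDS_CONNECTOR = Schema({"type": "UDS", "path": str})
TCP_CONNECTOR = Schema({"type": "TCP", "port": Or(int, None)})

# The connector entry accepts either shape.
CONNECTOR_SCHEMA = Or(UDS_CONNECTOR, TCP_CONNECTOR)

CONNECTOR_SCHEMA.validate({"type": "UDS", "path": "/tmp/bentoml-marshal.sock"})
CONNECTOR_SCHEMA.validate({"type": "TCP", "port": 6000})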

Member Author: For UDS support, I'd like to have a simple host field in URI format, just like gunicorn and Nginx do:

  • 127.0.0.1:5000
  • unix:/tmp/gunicorn.sock

Your schema is fancy but looks too powerful for me.
I think you can draft a new PR to demonstrate your idea.
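
A hypothetical helper (not part of this PR) showing how such a single host field could be mapped onto the aiohttp connectors that marshal.py already branches between; the function name and parsing are illustrative only:

import aiohttp


def connector_from_host(host: str) -> aiohttp.BaseConnector:
    # "unix:/tmp/gunicorn.sock" -> Unix domain socket connector
    if host.startswith("unix:"):
        return aiohttp.UnixConnector(path=host[len("unix:"):])
    # "127.0.0.1:5000" -> plain TCP; the host:port part stays in the request URL
    return aiohttp.TCPConnector(limit=30)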

Member Author: After all, the config structure reflects our system design. We can continue the discussion in the new PR/issue.

    prometheus_lock: Optional[multiprocessing.Lock] = None,
):

logger.info("Starting BentoML API server in production mode..")

container = BentoMLContainer()
container.config.from_dict(config.as_dict())

from bentoml import server

container.wire(packages=[server])

if port is None:
gunicorn_app = server.gunicorn_server.GunicornBentoServer(
saved_bundle_path, prometheus_lock=prometheus_lock,
)
else:
gunicorn_app = server.gunicorn_server.GunicornBentoServer(
saved_bundle_path, port=port, prometheus_lock=prometheus_lock,
)
gunicorn_app.run()


def _start_prod_batching_server(
    saved_bundle_path: str,
    api_server_port: int,
Collaborator: Same question here. Why do we need to explicitly have api_server_port here instead of taking it from config?

Member Author: Same as above.

    config: BentoMLConfiguration,
    prometheus_lock: Optional[multiprocessing.Lock] = None,
):

logger.info("Starting BentoML Batching server in production mode..")

container = BentoMLContainer()
container.config.from_dict(config.as_dict())

from bentoml import marshal, server

container.wire(packages=[server, marshal])

# avoid load model before gunicorn fork
marshal_server = server.marshal_server.GunicornMarshalServer(
bundle_path=saved_bundle_path,
prometheus_lock=prometheus_lock,
outbound_host="localhost",
outbound_port=api_server_port,
)
marshal_server.run()
32 changes: 26 additions & 6 deletions bentoml/marshal/marshal.py
@@ -150,7 +150,9 @@ def __init__(
            BentoMLContainer.config.api_server.max_request_size
        ],
        zipkin_api_url: str = Provide[BentoMLContainer.config.tracing.zipkin_api_url],
        outbound_unix_socket: str = None,
    ):
        self.outbound_unix_socket = outbound_unix_socket
        self.outbound_host = outbound_host
        self.outbound_port = outbound_port
        self.outbound_workers = outbound_workers
@@ -178,6 +180,7 @@ def __init__(
"or launch more microbatch instances to accept more concurrent connection.",
self.CONNECTION_LIMIT,
)
self._client = None
Collaborator: We probably don't need to lazily initialize the client here. The client is pretty much always needed, correct?

Member Author (@bojiang, Mar 6, 2021), quoting "Client is pretty much always needed, correct?": Yeah. But IMO we'd best only do value assignments in __init__. In addition, in this case, an aiohttp client session should be initialized with a running asyncio event loop. See aio-libs/aiohttp#3331.
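
A minimal illustration of that point (not from this PR; the URL is made up): the ClientSession is constructed inside a coroutine, so a running event loop exists when it is created.

import asyncio

import aiohttp


async def main():
    # Constructed while the event loop is running, as aiohttp expects.
    async with aiohttp.ClientSession() as session:
        async with session.get("http://127.0.0.1:5000/healthz") as resp:
            print(resp.status)


asyncio.run(main())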


    def set_outbound_port(self, outbound_port):
        self.outbound_port = outbound_port
@@ -187,6 +190,22 @@ def fetch_sema(self):
        self._outbound_sema = NonBlockSema(self.outbound_workers)
        return self._outbound_sema

    def get_client(self):
        if self._client is None:
            jar = aiohttp.DummyCookieJar()
            if self.outbound_unix_socket:
                conn = aiohttp.UnixConnector(path=self.outbound_unix_socket,)
Collaborator: +1 for UDS.

            else:
                conn = aiohttp.TCPConnector(limit=30)
            self._client = aiohttp.ClientSession(
                connector=conn, auto_decompress=False, cookie_jar=jar,
            )
        return self._client

    def __del__(self):
        if self._client is not None and not self._client.closed:
            self._client.close()

    def add_batch_handler(self, api_route, max_latency, max_batch_size):
        '''
        Params:
@@ -268,11 +287,14 @@ async def relay_handler(self, request):
span_name=f"[2]{url.path} relay",
) as trace_ctx:
headers.update(make_http_headers(trace_ctx))
async with aiohttp.ClientSession(auto_decompress=False) as client:
try:
client = self.get_client()
async with client.request(
request.method, url, data=data, headers=request.headers
) as resp:
body = await resp.read()
except aiohttp.client_exceptions.ClientConnectionError:
return aiohttp.web.Response(status=503, body=b"Service Unavailable")
return aiohttp.web.Response(
status=resp.status, body=body, headers=resp.headers,
)
@@ -298,11 +320,9 @@ async def _batch_handler_template(self, requests, api_route):
headers.update(make_http_headers(trace_ctx))
reqs_s = DataLoader.merge_requests(requests)
try:
async with aiohttp.ClientSession(auto_decompress=False) as client:
async with client.post(
api_url, data=reqs_s, headers=headers
) as resp:
raw = await resp.read()
client = self.get_client()
async with client.post(api_url, data=reqs_s, headers=headers) as resp:
raw = await resp.read()
except aiohttp.client_exceptions.ClientConnectionError as e:
raise RemoteException(
e, payload=HTTPResponse(status=503, body=b"Service Unavailable")