#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# pytype: skip-file
# mypy: check-untyped-defs
import atexit
import copy
import functools
import itertools
import logging
import threading
import time
from typing import TYPE_CHECKING
from typing import Any
from typing import Dict
from typing import Iterator
from typing import Optional
from typing import Tuple
import grpc
from apache_beam.metrics import metric
from apache_beam.metrics.execution import MetricResult
from apache_beam.options.pipeline_options import DebugOptions
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import PortableOptions
from apache_beam.options.pipeline_options import StandardOptions
from apache_beam.options.value_provider import ValueProvider
from apache_beam.portability import common_urns
from apache_beam.portability import python_urns
from apache_beam.portability.api import beam_artifact_api_pb2_grpc
from apache_beam.portability.api import beam_job_api_pb2
from apache_beam.portability.api import beam_runner_api_pb2
from apache_beam.runners import runner
from apache_beam.runners.job import utils as job_utils
from apache_beam.runners.portability import artifact_service
from apache_beam.runners.portability import job_server
from apache_beam.runners.portability import portable_metrics
from apache_beam.runners.portability.fn_api_runner.fn_runner import translations
from apache_beam.runners.worker import sdk_worker_main
from apache_beam.runners.worker import worker_pool_main
from apache_beam.transforms import environments

if TYPE_CHECKING:
  from google.protobuf import struct_pb2  # pylint: disable=ungrouped-imports
  from apache_beam.pipeline import Pipeline

__all__ = ['PortableRunner']

MESSAGE_LOG_LEVELS = {
    beam_job_api_pb2.JobMessage.MESSAGE_IMPORTANCE_UNSPECIFIED: logging.INFO,
    beam_job_api_pb2.JobMessage.JOB_MESSAGE_DEBUG: logging.DEBUG,
    beam_job_api_pb2.JobMessage.JOB_MESSAGE_DETAILED: logging.DEBUG,
    beam_job_api_pb2.JobMessage.JOB_MESSAGE_BASIC: logging.INFO,
    beam_job_api_pb2.JobMessage.JOB_MESSAGE_WARNING: logging.WARNING,
    beam_job_api_pb2.JobMessage.JOB_MESSAGE_ERROR: logging.ERROR,
}

TERMINAL_STATES = [
    beam_job_api_pb2.JobState.DONE,
    beam_job_api_pb2.JobState.DRAINED,
    beam_job_api_pb2.JobState.FAILED,
    beam_job_api_pb2.JobState.CANCELLED,
]

_LOGGER = logging.getLogger(__name__)


class JobServiceHandle(object):
  """
  Encapsulates the interactions necessary to submit a pipeline to a job
  service.

  The base set of interactions consists of three steps:
  - prepare
  - stage
  - run
  """

  def __init__(self, job_service, options, retain_unknown_options=False):
    self.job_service = job_service
    self.options = options
    self.timeout = options.view_as(PortableOptions).job_server_timeout
    self.artifact_endpoint = options.view_as(PortableOptions).artifact_endpoint
    self._retain_unknown_options = retain_unknown_options

  def submit(self, proto_pipeline):
    # type: (beam_runner_api_pb2.Pipeline) -> Tuple[str, Iterator[beam_job_api_pb2.JobMessagesResponse], Iterator[beam_job_api_pb2.JobStateEvent]]

    """
    Submit and run the pipeline defined by `proto_pipeline`.
    """
    prepare_response = self.prepare(proto_pipeline)
    artifact_endpoint = (
        self.artifact_endpoint or
        prepare_response.artifact_staging_endpoint.url)
    self.stage(
        proto_pipeline,
        artifact_endpoint,
        prepare_response.staging_session_token)
    return self.run(prepare_response.preparation_id)

  def get_pipeline_options(self):
    # type: () -> struct_pb2.Struct

    """
    Get `self.options` as a protobuf Struct.
    """

    # Fetch runner options from the job service, retrying in case the
    # channel is not yet ready.
    def send_options_request(max_retries=5):
      num_retries = 0
      while True:
        try:
          # The channel may report READY even though connections can still
          # fail; this seems to be an issue only on Mac with port forwarding.
          return self.job_service.DescribePipelineOptions(
              beam_job_api_pb2.DescribePipelineOptionsRequest(),
              timeout=self.timeout)
        except grpc.FutureTimeoutError:
          # No retry for timeout errors.
          raise
        except grpc.RpcError as e:
          num_retries += 1
          if num_retries > max_retries:
            raise e
          time.sleep(1)

    options_response = send_options_request()

    def add_runner_options(parser):
      for option in options_response.options:
        try:
          # No default values here - we don't want runner options added
          # unless they were specified by the user.
          add_arg_args = {'action': 'store', 'help': option.description}
          if option.type == beam_job_api_pb2.PipelineOptionType.BOOLEAN:
            add_arg_args['action'] = (
                'store_true'
                if option.default_value != 'true' else 'store_false')
          elif option.type == beam_job_api_pb2.PipelineOptionType.INTEGER:
            add_arg_args['type'] = int
          elif option.type == beam_job_api_pb2.PipelineOptionType.ARRAY:
            add_arg_args['action'] = 'append'
          parser.add_argument("--%s" % option.name, **add_arg_args)
        except Exception as e:
          # Ignore runner options that are already present; only in this
          # case is a duplicate not treated as an error.
          if 'conflicting option string' not in str(e):
            raise
          _LOGGER.debug("Runner option '%s' was already added", option.name)

    all_options = self.options.get_all_options(
        add_extra_args_fn=add_runner_options,
        retain_unknown_options=self._retain_unknown_options)
    return self.encode_pipeline_options(all_options)

  @staticmethod
  def encode_pipeline_options(
      all_options: Dict[str, Any]) -> 'struct_pb2.Struct':
    def convert_pipeline_option_value(v):
      # Convert int values to strings: BEAM-5509.
      if type(v) == int:
        return str(v)
      elif isinstance(v, ValueProvider):
        return convert_pipeline_option_value(
            v.get()) if v.is_accessible() else None
      return v

    # TODO: Define URNs for options.
    p_options = {
        'beam:option:' + k + ':v1': convert_pipeline_option_value(v)
        for k, v in all_options.items() if v is not None
    }
    return job_utils.dict_to_struct(p_options)
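  # For example (a sketch of the encoding above; the option names are
  # arbitrary):
  #
  #   JobServiceHandle.encode_pipeline_options(
  #       {'job_name': 'wordcount', 'parallelism': 4})
  #   # -> Struct mapping 'beam:option:job_name:v1' to 'wordcount' and
  #   #    'beam:option:parallelism:v1' to '4' (ints become strings,
  #   #    per BEAM-5509).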

  def prepare(self, proto_pipeline):
    # type: (beam_runner_api_pb2.Pipeline) -> beam_job_api_pb2.PrepareJobResponse

    """Prepare the job on the job service."""
    return self.job_service.Prepare(
        beam_job_api_pb2.PrepareJobRequest(
            job_name='job',
            pipeline=proto_pipeline,
            pipeline_options=self.get_pipeline_options()),
        timeout=self.timeout)

  def stage(
      self,
      proto_pipeline,  # type: beam_runner_api_pb2.Pipeline
      artifact_staging_endpoint,
      staging_session_token):
    # type: (...) -> None

    """Stage artifacts."""
    if artifact_staging_endpoint:
      artifact_service.offer_artifacts(
          beam_artifact_api_pb2_grpc.ArtifactStagingServiceStub(
              channel=grpc.insecure_channel(artifact_staging_endpoint)),
          artifact_service.ArtifactRetrievalService(
              artifact_service.BeamFilesystemHandler(None).file_reader),
          staging_session_token)

  def run(self, preparation_id):
    # type: (str) -> Tuple[str, Iterator[beam_job_api_pb2.JobMessagesResponse], Iterator[beam_job_api_pb2.JobStateEvent]]

    """Run the job."""
    try:
      state_stream = self.job_service.GetStateStream(
          beam_job_api_pb2.GetJobStateRequest(job_id=preparation_id),
          timeout=self.timeout)
      # If there's an error, we don't always get it until we try to read.
      # Fortunately, there's always an immediate current state published.
      state_stream = itertools.chain([next(state_stream)], state_stream)
      message_stream = self.job_service.GetMessageStream(
          beam_job_api_pb2.JobMessagesRequest(job_id=preparation_id),
          timeout=self.timeout)
    except Exception:
      # TODO(https://github.com/apache/beam/issues/19284): Unify
      # preparation_id and job_id for all runners.
      state_stream = message_stream = None

    # Run the job and wait for a result. We don't set a timeout here because
    # a job may take a long time to complete, and streaming jobs currently
    # never return a response.
    run_response = self.job_service.Run(
        beam_job_api_pb2.RunJobRequest(preparation_id=preparation_id))

    if state_stream is None:
      state_stream = self.job_service.GetStateStream(
          beam_job_api_pb2.GetJobStateRequest(job_id=run_response.job_id))
      message_stream = self.job_service.GetMessageStream(
          beam_job_api_pb2.JobMessagesRequest(job_id=run_response.job_id))

    return run_response.job_id, message_stream, state_stream
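  # Consuming the returned streams (a sketch; `handle` and `preparation_id`
  # are assumed to come from earlier prepare/stage calls):
  #
  #   job_id, messages, states = handle.run(preparation_id)
  #   for state_event in states:
  #     if state_event.state in TERMINAL_STATES:
  #       break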


class PortableRunner(runner.PipelineRunner):
  """
  Experimental: No backward compatibility guaranteed.

  A BeamRunner that executes Python pipelines via the Beam Job API.

  This runner does not execute the job itself; it schedules the job on a
  job service, and the responsibility for running and managing the job lies
  with that job service.
  """
  def __init__(self):
    self._dockerized_job_server = None  # type: Optional[job_server.JobServer]

  @staticmethod
  def _create_environment(options):
    # type: (PipelineOptions) -> environments.Environment
    return environments.Environment.from_options(
        options.view_as(PortableOptions))

  def default_job_server(self, options):
    raise NotImplementedError(
        'You must specify a --job_endpoint when using --runner=PortableRunner. '
        'Alternatively, you may specify which portable runner you intend to '
        'use, such as --runner=FlinkRunner or --runner=SparkRunner.')

  def create_job_service_handle(self, job_service, options):
    # type: (...) -> JobServiceHandle
    return JobServiceHandle(job_service, options)

  def create_job_service(self, options):
    # type: (PipelineOptions) -> JobServiceHandle

    """
    Start the job service and return a `JobServiceHandle`.
    """
    job_endpoint = options.view_as(PortableOptions).job_endpoint
    if job_endpoint:
      if job_endpoint == 'embed':
        server = job_server.EmbeddedJobServer()  # type: job_server.JobServer
      else:
        job_server_timeout = options.view_as(
            PortableOptions).job_server_timeout
        server = job_server.ExternalJobServer(job_endpoint, job_server_timeout)
    else:
      server = self.default_job_server(options)
    return self.create_job_service_handle(server.start(), options)
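  # For example, pointing the runner at an already-running job service
  # (a sketch; the endpoint is a placeholder):
  #
  #   options = PipelineOptions([
  #       '--runner=PortableRunner',
  #       '--job_endpoint=localhost:8099',
  #       '--environment_type=LOOPBACK',
  #   ])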

  @staticmethod
  def get_proto_pipeline(pipeline, options):
    # type: (Pipeline, PipelineOptions) -> beam_runner_api_pb2.Pipeline
    proto_pipeline = pipeline.to_runner_api(
        default_environment=environments.Environment.from_options(
            options.view_as(PortableOptions)))
    return PortableRunner._optimize_pipeline(proto_pipeline, options)

  @staticmethod
  def _optimize_pipeline(
      proto_pipeline: beam_runner_api_pb2.Pipeline,
      options: PipelineOptions) -> beam_runner_api_pb2.Pipeline:
    # TODO: https://github.com/apache/beam/issues/19493
    # Eventually remove the 'pre_optimize' option altogether and only perform
    # the equivalent of the 'default' case below (minus the 'lift_combiners'
    # part).
    pre_optimize = options.view_as(DebugOptions).lookup_experiment(
        'pre_optimize', 'default').lower()
    if (not options.view_as(StandardOptions).streaming and
        pre_optimize != 'none'):
      if pre_optimize == 'default':
        phases = [
            # TODO: https://github.com/apache/beam/issues/18584
            # https://github.com/apache/beam/issues/18586
            # Eventually remove the 'lift_combiners' phase from 'default'.
            translations.pack_combiners,
            translations.lift_combiners,
            translations.sort_stages
        ]
        partial = True
      elif pre_optimize == 'all':
        phases = translations.standard_optimize_phases()
        partial = False
      elif pre_optimize == 'all_except_fusion':
        # TODO(https://github.com/apache/beam/issues/19422): Delete this
        # branch after PortableRunner supports
        # beam:runner:executable_stage:v1.
        phases = translations.standard_optimize_phases()
        phases.remove(translations.greedily_fuse)
        partial = True
      else:
        phases = []
        for phase_name in pre_optimize.split(','):
          # For now, these are all we allow.
          if phase_name in ('pack_combiners', 'lift_combiners'):
            phases.append(getattr(translations, phase_name))
          else:
            raise ValueError(
                'Unknown or inapplicable phase for pre_optimize: %s' %
                phase_name)
        phases.append(translations.sort_stages)
        partial = True

      # All (known) portable runners (i.e., Flink and Spark) support these
      # URNs.
      known_urns = frozenset([
          common_urns.composites.RESHUFFLE.urn,
          common_urns.primitives.IMPULSE.urn,
          common_urns.primitives.FLATTEN.urn,
          common_urns.primitives.GROUP_BY_KEY.urn
      ])
      proto_pipeline = translations.optimize_pipeline(
          proto_pipeline,
          phases=phases,
          known_runner_urns=known_urns,
          partial=partial)
    return proto_pipeline
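  # The phase list can be selected through the 'pre_optimize' experiment,
  # e.g. (a sketch; flag plumbing assumed from DebugOptions):
  #
  #   --experiments=pre_optimize=pack_combiners,lift_combiners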

  def run_portable_pipeline(
      self,
      pipeline: beam_runner_api_pb2.Pipeline,
      options: PipelineOptions) -> runner.PipelineResult:
    portable_options = options.view_as(PortableOptions)

    # Do not set a Runner. Otherwise this can cause problems in Java's
    # PipelineOptions, i.e. ClassNotFoundException, if the corresponding
    # Runner does not exist in the Java SDK. In portability, the entry point
    # is clearly defined via the JobService.
    portable_options.view_as(StandardOptions).runner = None

    cleanup_callbacks = self.start_and_replace_loopback_environments(
        pipeline, options)
    optimized_pipeline = self._optimize_pipeline(pipeline, options)

    job_service_handle = self.create_job_service(options)
    job_id, message_stream, state_stream = job_service_handle.submit(
        optimized_pipeline)

    result = PipelineResult(
        job_service_handle.job_service,
        job_id,
        message_stream,
        state_stream,
        cleanup_callbacks)
    if cleanup_callbacks:
      # Register an exit handler to ensure cleanup on exit.
      atexit.register(functools.partial(result._cleanup, on_exit=True))
      _LOGGER.info(
          'Environment "%s" has started a component necessary for the '
          'execution. Be sure to run the pipeline using\n'
          '  with Pipeline() as p:\n'
          '    p.apply(..)\n'
          'This ensures that the pipeline finishes before this program '
          'exits.',
          portable_options.environment_type)
    return result

  @staticmethod
  def start_and_replace_loopback_environments(pipeline, options):
    portable_options = copy.deepcopy(options.view_as(PortableOptions))
    experiments = options.view_as(DebugOptions).experiments or []
    cleanup_callbacks = []
    for env in pipeline.components.environments.values():
      if env.urn == python_urns.EMBEDDED_PYTHON_LOOPBACK:
        # Start a worker and change the environment to point to that worker.
        use_loopback_process_worker = options.view_as(
            DebugOptions).lookup_experiment(
                'use_loopback_process_worker', False)
        portable_options.environment_type = 'EXTERNAL'
        portable_options.environment_config, server = (
            worker_pool_main.BeamFnExternalWorkerPoolServicer.start(
                state_cache_size=sdk_worker_main._get_state_cache_size_bytes(
                    options=options),
                data_buffer_time_limit_ms=sdk_worker_main.
                _get_data_buffer_time_limit_ms(experiments),
                use_process=use_loopback_process_worker))
        external_env = environments.ExternalEnvironment.from_options(
            portable_options).to_runner_api(None)  # type: ignore
        env.urn = external_env.urn
        env.payload = external_env.payload
        cleanup_callbacks.append(functools.partial(server.stop, 1))
    return cleanup_callbacks
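  # Loopback execution is requested through the pipeline options, e.g.
  # (a sketch):
  #
  #   --runner=PortableRunner --job_endpoint=embed --environment_type=LOOPBACK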


class PortableMetrics(metric.MetricResults):
  def __init__(self, job_metrics_response):
    metrics = job_metrics_response.metrics
    self.attempted = portable_metrics.from_monitoring_infos(metrics.attempted)
    self.committed = portable_metrics.from_monitoring_infos(metrics.committed)

  @staticmethod
  def _combine(committed, attempted, filter):
    all_keys = set(committed.keys()) | set(attempted.keys())
    return [
        MetricResult(key, committed.get(key), attempted.get(key))
        for key in all_keys if metric.MetricResults.matches(filter, key)
    ]

  def query(self, filter=None):
    counters, distributions, gauges = [
        self._combine(x, y, filter)
        for x, y in zip(self.committed, self.attempted)
    ]
    return {
        self.COUNTERS: counters,
        self.DISTRIBUTIONS: distributions,
        self.GAUGES: gauges
    }
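  # Querying metrics from a finished job (a sketch; `result` is assumed to
  # be the PipelineResult defined below):
  #
  #   from apache_beam.metrics.metric import MetricsFilter
  #   counters = result.metrics().query(
  #       MetricsFilter().with_name('my_counter'))[
  #           metric.MetricResults.COUNTERS]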


class PipelineResult(runner.PipelineResult):
  def __init__(
      self,
      job_service,
      job_id,
      message_stream,
      state_stream,
      cleanup_callbacks=()):
    super().__init__(beam_job_api_pb2.JobState.UNSPECIFIED)
    self._job_service = job_service
    self._job_id = job_id
    self._messages = []
    self._message_stream = message_stream
    self._state_stream = state_stream
    self._cleanup_callbacks = cleanup_callbacks
    self._metrics = None
    self._runtime_exception = None

  def cancel(self):
    # type: () -> None
    try:
      self._job_service.Cancel(
          beam_job_api_pb2.CancelJobRequest(job_id=self._job_id))
    finally:
      self._cleanup()

  @property
  def state(self):
    runner_api_state = self._job_service.GetState(
        beam_job_api_pb2.GetJobStateRequest(job_id=self._job_id)).state
    self._state = self.runner_api_state_to_pipeline_state(runner_api_state)
    return self._state

  @staticmethod
  def runner_api_state_to_pipeline_state(runner_api_state):
    return getattr(
        runner.PipelineState,
        beam_job_api_pb2.JobState.Enum.Name(runner_api_state))

  @staticmethod
  def pipeline_state_to_runner_api_state(pipeline_state):
    if pipeline_state == runner.PipelineState.PENDING:
      return beam_job_api_pb2.JobState.STARTING
    else:
      try:
        return beam_job_api_pb2.JobState.Enum.Value(pipeline_state)
      except ValueError:
        return beam_job_api_pb2.JobState.UNSPECIFIED

  def metrics(self):
    if not self._metrics:
      job_metrics_response = self._job_service.GetJobMetrics(
          beam_job_api_pb2.GetJobMetricsRequest(job_id=self._job_id))
      self._metrics = PortableMetrics(job_metrics_response)
    return self._metrics

  def _last_error_message(self):
    # type: () -> str
    # Filter to messages that carry a message_response, then keep only the
    # error messages.
    messages = [
        m.message_response for m in self._messages
        if m.HasField('message_response')
    ]
    error_messages = [
        m for m in messages
        if m.importance == beam_job_api_pb2.JobMessage.JOB_MESSAGE_ERROR
    ]
    if error_messages:
      return error_messages[-1].message_text
    else:
      return 'unknown error'

  def wait_until_finish(self, duration=None):
    """
    :param duration: The maximum time in milliseconds to wait for the result
      of the execution. If None or zero, will wait until the pipeline
      finishes.
    :return: The final state of the pipeline.
    """
    def read_messages():
      # type: () -> None
      previous_state = -1
      for message in self._message_stream:
        if message.HasField('message_response'):
          logging.log(
              MESSAGE_LOG_LEVELS[message.message_response.importance],
              "%s",
              message.message_response.message_text)
        else:
          current_state = message.state_response.state
          if current_state != previous_state:
            _LOGGER.info(
                "Job state changed to %s",
                self.runner_api_state_to_pipeline_state(current_state))
            previous_state = current_state
        self._messages.append(message)

    message_thread = threading.Thread(
        target=read_messages, name='wait_until_finish_read')
    message_thread.daemon = True
    message_thread.start()

    if duration:
      state_thread = threading.Thread(
          target=functools.partial(self._observe_state, message_thread),
          name='wait_until_finish_state_observer')
      state_thread.daemon = True
      state_thread.start()
      start_time = time.time()
      duration_secs = duration / 1000
      while (time.time() - start_time < duration_secs and
             state_thread.is_alive()):
        time.sleep(1)
    else:
      self._observe_state(message_thread)

    if self._runtime_exception:
      raise self._runtime_exception

    return self._state
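  # Typical use (a sketch; `pipeline.run()` is assumed to return this
  # PipelineResult when running against a portable runner):
  #
  #   result = pipeline.run()
  #   result.wait_until_finish(duration=60_000)  # wait up to 60 seconds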

  def _observe_state(self, message_thread):
    try:
      for state_response in self._state_stream:
        self._state = self.runner_api_state_to_pipeline_state(
            state_response.state)
        if state_response.state in TERMINAL_STATES:
          # Wait for any last messages.
          message_thread.join(10)
          break
      if self._state != runner.PipelineState.DONE:
        self._runtime_exception = RuntimeError(
            'Pipeline %s failed in state %s: %s' %
            (self._job_id, self._state, self._last_error_message()))
    except Exception as e:
      self._runtime_exception = e
    finally:
      self._cleanup()

  def _cleanup(self, on_exit=False):
    # type: (bool) -> None
    if on_exit and self._cleanup_callbacks:
      _LOGGER.info(
          'Running cleanup on exit. If your pipeline should continue '
          'running, be sure to use the following syntax:\n'
          '  with Pipeline() as p:\n'
          '    p.apply(..)\n'
          'This ensures that the pipeline finishes before this program '
          'exits.')
    callback_exceptions = []
    for callback in self._cleanup_callbacks:
      try:
        callback()
      except Exception as e:
        callback_exceptions.append(e)
    self._cleanup_callbacks = ()
    if callback_exceptions:
      formatted_exceptions = ''.join(
          [f"\n\t{repr(e)}" for e in callback_exceptions])
      raise RuntimeError('Errors: {}'.format(formatted_exceptions))