diff --git a/.github/workflows/gae.yml b/.github/workflows/gae.yml index 39c8bbd89464..67d28879ba13 100644 --- a/.github/workflows/gae.yml +++ b/.github/workflows/gae.yml @@ -69,9 +69,11 @@ jobs: RUN_JAVA_TESTS: ON run: | # default install to "/opt/graphscope" - make gae ENABLE_JAVA_SDK=ON BUILD_TEST=ON + make gae ENABLE_JAVA_SDK=ON BUILD_TEST=ON NUMPROC=1 + sudo make gae-install # also make coordinator and client for python test - make client && make coordinator + make client + make coordinator - name: Run Cpp Test env: diff --git a/.github/workflows/gss.yml b/.github/workflows/gss.yml index 723c5585de76..052327550f25 100644 --- a/.github/workflows/gss.yml +++ b/.github/workflows/gss.yml @@ -81,7 +81,7 @@ jobs: export RUSTC_WRAPPER=/usr/local/bin/sccache sccache --start-server cd ${GITHUB_WORKSPACE}/interactive_engine - mvn clean install -P groot,groot-assembly -Drust.compile.mode=debug -DskipTests -Dgroot.compile.feature="maxgraph-ffi/column_filter_push_down" --quiet + mvn clean install -P groot,groot-assembly -Drust.compile.mode=debug -DskipTests -Dgroot.compile.feature="column_filter_push_down" --quiet sccache --show-stats diff --git a/.github/workflows/networkx-forward-algo-nightly.yml b/.github/workflows/networkx-forward-algo-nightly.yml index 99c31e11d2c3..cb840c379e8a 100644 --- a/.github/workflows/networkx-forward-algo-nightly.yml +++ b/.github/workflows/networkx-forward-algo-nightly.yml @@ -26,7 +26,7 @@ jobs: - name: Build GAE and coordinator run: | pushd ${GITHUB_WORKSPACE} - make gae ENABLE_JAVA_SDK=OFF BUILD_TEST=OFF + make gae ENABLE_JAVA_SDK=OFF BUILD_TEST=OFF NUMPROC=1 # also make coordinator and client for python test make coordinator && make client popd diff --git a/Makefile b/Makefile index ed89e0c09a0e..566a8e194328 100644 --- a/Makefile +++ b/Makefile @@ -1,178 +1,186 @@ - -MKFILE_PATH := $(abspath $(lastword $(MAKEFILE_LIST))) -WORKING_DIR := $(dir $(MKFILE_PATH)) - -VERSION ?= 0.1.0 -INSTALL_PREFIX ?= /opt/graphscope - -BUILD_TYPE ?= release +MKFILE_PATH := $(abspath $(lastword $(MAKEFILE_LIST))) +WORKING_DIR := $(dir $(MKFILE_PATH)) +GAE_DIR := $(WORKING_DIR)/analytical_engine +GIE_DIR := $(WORKING_DIR)/interactive_engine +GLE_DIR := $(WORKING_DIR)/learning_engine/graph-learn +GAE_BUILD_DIR := $(GAE_DIR)/build +GLE_BUILD_DIR := $(GLE_DIR)/cmake-build +CLIENT_DIR := $(WORKING_DIR)/python +COORDINATOR_DIR := $(WORKING_DIR)/coordinator +K8S_DIR := $(WORKING_DIR)/k8s +DOCS_DIR := $(WORKING_DIR)/docs + +VERSION ?= 0.18.0 + +BUILD_TYPE ?= release # GAE build options -NETWORKX ?= ON +NETWORKX ?= ON # testing build option -BUILD_TEST ?= OFF +BUILD_TEST ?= OFF # build java sdk option -ENABLE_JAVA_SDK ?= ON +ENABLE_JAVA_SDK ?= ON -.PHONY: all -all: graphscope +# PREFIX is environment variable, but if it is not set, then set default value +ifeq ($(INSTALL_PREFIX),) + INSTALL_PREFIX := /opt/graphscope +endif -.PHONY: graphscope -graphscope: install +UNAME := $(shell uname) +ifeq ($(UNAME),Linux) + NUMPROC := $(shell grep -c ^processor /proc/cpuinfo) + SUFFIX := so +endif +ifeq ($(UNAME),Darwin) + NUMPROC := $(shell sysctl hw.ncpu | awk '{print $2}') + SUFFIX := dylib +endif -.PHONY: gsruntime-image -gsruntime-image: - $(MAKE) -C $(WORKING_DIR)/k8s/ gsruntime-image VERSION=$(VERSION) -.PHONY: gsvineyard-image -gsvineyard-image: - $(MAKE) -C $(WORKING_DIR)/k8s/ gsvineyard-image VERSION=$(VERSION) +## Common +.PHONY: all graphscope install clean -.PHONY: graphscope-image -graphscope-image: - $(MAKE) -C $(WORKING_DIR)/k8s/ graphscope-image VERSION=$(VERSION) +# all: graphscope +# 
graphscope: gle client coordinator gae gie +all: gle client coordinator gae gie +graphscope: all -.PHONY: jupyter-image -jupyter-image: - $(MAKE) -C $(WORKING_DIR)/k8s/ jupyter-image VERSION=$(VERSION) +install: gae-install gie-install gle-install client coordinator + # client + pip3 install --user --editable $(CLIENT_DIR) + rm -rf $(CLIENT_DIR)/*.egg-info + # coordinator + pip3 install --user --editable $(COORDINATOR_DIR) + rm -rf $(COORDINATOR_DIR)/*.egg-info -.PHONY: dataset-image -dataset-image: - $(MAKE) -C $(WORKING_DIR)/k8s/ dataset-image VERSION=$(VERSION) + echo "Run the following command to correctly set environment variable" + echo "export GRAPHSCOPE_HOME=$(INSTALL_PREFIX)" -# bulld graphscope image from source code without wheel package -.PHONY: graphscope-dev-image -graphscope-dev-image: - $(MAKE) -C $(WORKING_DIR)/k8s/ graphscope-dev-image VERSION=$(VERSION) +clean: + rm -rf $(GAE_BUILD_DIR) $(GAE_DIR)/proto + cd $(GAE_DIR)/java && mvn clean -.PHONY: graphscope-store-image -graphscope-store-image: - $(MAKE) -C $(WORKING_DIR)/k8s/ graphscope-store-image VERSION=$(VERSION) + cd $(GIE_DIR) && mvn clean + # TODO: use maven clean to clean ir target + rm -rf $(GIE_DIR)/executor/ir/target -.PHONY: push -push: - $(MAKE) -C $(WORKING_DIR)/k8s/ push + rm -rf $(GLE_BUILD_DIR) $(GLE_DIR)/proto/*.h $(GLE_DIR)/proto/*.cc -.PHONY: install -install: gle client gae gie coordinator + cd $(CLIENT_DIR) && python3 setup.py clean --all + + cd $(COORDINATOR_DIR) && python3 setup.py clean --all + +## Modules +.PHONY: client coordinator gae gie gle -.PHONY: client client: gle - cd $(WORKING_DIR)/python && \ + cd $(CLIENT_DIR) && \ pip3 install -r requirements.txt -r requirements-dev.txt --user && \ python3 setup.py build_ext --inplace --user - pip3 install --user --editable $(WORKING_DIR)/python -.PHONY: coordinator coordinator: client - cd $(WORKING_DIR)/coordinator && \ + cd $(COORDINATOR_DIR) && \ pip3 install -r requirements.txt -r requirements-dev.txt --user && \ python3 setup.py build_builtin - if [ ! -d "/var/log/graphscope" ]; then \ - sudo mkdir /var/log/graphscope; \ - fi - sudo chown -R `id -u`:`id -g` /var/log/graphscope - -.PHONY: gae -gae: - mkdir -p $(WORKING_DIR)/analytical_engine/build - cd $(WORKING_DIR)/analytical_engine/build && \ - cmake -DCMAKE_INSTALL_PREFIX=$(INSTALL_PREFIX) -DNETWORKX=$(NETWORKX) -DBUILD_TESTS=${BUILD_TEST} -DENABLE_JAVA_SDK=${ENABLE_JAVA_SDK} .. 
&& \ - make -j1 && \ - sudo make install && \ - sudo cp -r $(WORKING_DIR)/k8s/kube_ssh $(INSTALL_PREFIX)/bin/ -ifneq ($(INSTALL_PREFIX), /usr/local) - sudo rm -fr /usr/local/include/graphscope && \ - sudo ln -sf $(INSTALL_PREFIX)/bin/* /usr/local/bin/ && \ - sudo ln -sfn $(INSTALL_PREFIX)/include/graphscope /usr/local/include/graphscope && \ - sudo ln -sf ${INSTALL_PREFIX}/lib/*so* /usr/local/lib && \ - sudo ln -sf ${INSTALL_PREFIX}/lib/*dylib* /usr/local/lib && \ - if [ -d "${INSTALL_PREFIX}/lib64/cmake/graphscope-analytical" ]; then \ - sudo rm -fr /usr/local/lib64/cmake/graphscope-analytical; \ - sudo ln -sfn ${INSTALL_PREFIX}/lib64/cmake/graphscope-analytical /usr/local/lib64/cmake/graphscope-analytical; \ - sudo mkdir -p ${INSTALL_PREFIX}/lib/cmake; \ - sudo cp -r ${INSTALL_PREFIX}/lib64/cmake/* ${INSTALL_PREFIX}/lib/cmake/; \ - else \ - sudo ln -sfn ${INSTALL_PREFIX}/lib/cmake/graphscope-analytical /usr/local/lib/cmake/graphscope-analytical; \ - fi -endif -.PHONY: gie -gie: - # frontend/executor - cd $(WORKING_DIR)/interactive_engine && \ - mvn clean package -DskipTests -Drust.compile.mode=$(BUILD_TYPE) -P graphscope,graphscope-assembly --quiet - # install - mkdir -p $(WORKING_DIR)/.install_prefix && \ - tar -xf $(WORKING_DIR)/interactive_engine/assembly/target/graphscope.tar.gz --strip-components 1 -C $(WORKING_DIR)/.install_prefix && \ - sudo cp -r $(WORKING_DIR)/.install_prefix/* $(INSTALL_PREFIX) && \ - rm -fr $(WORKING_DIR)/.install_prefix - -.PHONY: gle -gle: - cd ${WORKING_DIR} && \ - git submodule update --init && \ - cd $(WORKING_DIR)/learning_engine/graph-learn && \ - git submodule update --init third_party/pybind11 && \ - mkdir -p cmake-build && cd cmake-build && \ - cmake -DCMAKE_INSTALL_PREFIX=$(INSTALL_PREFIX) -DWITH_VINEYARD=ON -DTESTING=${BUILD_TEST} .. && \ - make -j`nproc` && \ - sudo make install -ifneq ($(INSTALL_PREFIX), /usr/local) - sudo ln -sf ${INSTALL_PREFIX}/lib/*so* /usr/local/lib && \ - sudo ln -sf ${INSTALL_PREFIX}/lib/*dylib* /usr/local/lib -endif +.PHONY: gae-install gie-install gle-install + +gae-install: gae + $(MAKE) -C $(GAE_BUILD_DIR) install + install $(K8S_DIR)/kube_ssh $(INSTALL_PREFIX)/bin/ + install -d $(INSTALL_PREFIX)/lib/cmake/graphscope-analytical/cmake + install $(INSTALL_PREFIX)/lib64/cmake/graphscope-analytical/*.cmake $(INSTALL_PREFIX)/lib/cmake/graphscope-analytical + install $(INSTALL_PREFIX)/lib64/cmake/graphscope-analytical/cmake/* $(INSTALL_PREFIX)/lib/cmake/graphscope-analytical/cmake + +gae: $(GAE_BUILD_DIR)/grape_engine + +$(GAE_BUILD_DIR)/grape_engine: + mkdir -p $(GAE_BUILD_DIR) && \ + cd $(GAE_BUILD_DIR) && \ + cmake -DCMAKE_INSTALL_PREFIX=$(INSTALL_PREFIX) \ + -DNETWORKX=$(NETWORKX) \ + -DBUILD_TESTS=${BUILD_TEST} \ + -DENABLE_JAVA_SDK=${ENABLE_JAVA_SDK} .. 
&& \ + $(MAKE) -j$(NUMPROC) + +gie-install: gie + tar -xf $(GIE_DIR)/assembly/target/graphscope.tar.gz --strip-components 1 -C $(INSTALL_PREFIX) +gie: $(GIE_DIR)/assembly/target/graphscope.tar.gz + +$(GIE_DIR)/assembly/target/graphscope.tar.gz: + # frontend/executor + cd $(GIE_DIR) && \ + mvn package -DskipTests -Drust.compile.mode=$(BUILD_TYPE) -P graphscope,graphscope-assembly --quiet + +gle-install: gle + $(MAKE) -C $(GLE_BUILD_DIR) install +gle: $(GLE_DIR)/built/lib/libgraphlearn_shared.$(SUFFIX) + +$(GLE_DIR)/built/lib/libgraphlearn_shared.$(SUFFIX): + git submodule update --init + cd $(GLE_DIR) && git submodule update --init third_party/pybind11 + mkdir -p $(GLE_BUILD_DIR) + cd $(GLE_BUILD_DIR) && \ + cmake -DCMAKE_INSTALL_PREFIX=$(INSTALL_PREFIX) \ + -DWITH_VINEYARD=ON \ + -DTESTING=${BUILD_TEST} .. && \ + $(MAKE) -j$(NUMPROC) + +## wheels +.PHONY: graphscope-py3-package graphscope-client-py3-package prepare-client graphscope-docs -# wheels -.PHONY: graphscope-py3-package graphscope-py3-package: - $(MAKE) -C $(WORKING_DIR)/k8s/ graphscope-py3-package + $(MAKE) -C $(K8S_DIR) graphscope-py3-package -.PHONY: graphscope-client-py3-package graphscope-client-py3-package: - $(MAKE) -C $(WORKING_DIR)/k8s/ graphscope-client-py3-package + $(MAKE) -C $(K8S_DIR) graphscope-client-py3-package -.PHONY: prepare-client prepare-client: - cd $(WORKING_DIR)/python && \ + cd $(CLIENT_DIR) && \ pip3 install -r requirements.txt --user && \ pip3 install -r requirements-dev.txt --user && \ python3 setup.py build_proto -.PHONY: graphscope-docs graphscope-docs: prepare-client - $(MAKE) -C $(WORKING_DIR)/docs/ html + $(MAKE) -C $(DOCS_DIR)/ html + + +## Images +.PHONY: graphscope-image jupyter-image dataset-image graphscope-store-image push + +graphscope-image: + $(MAKE) -C $(K8S_DIR) graphscope-image VERSION=$(VERSION) + +jupyter-image: + $(MAKE) -C $(K8S_DIR) jupyter-image VERSION=$(VERSION) + +dataset-image: + $(MAKE) -C $(K8S_DIR) dataset-image VERSION=$(VERSION) + +graphscope-store-image: + $(MAKE) -C $(K8S_DIR) graphscope-store-image VERSION=$(VERSION) + +push: + $(MAKE) -C $(K8S_DIR) push + + +## Tests +.PHONY: test unittest minitest k8stest -.PHONY: test test: unittest minitest k8stest -.PHONY: unittest unittest: - cd $(WORKING_DIR)/python && \ + cd $(CLIENT_DIR) && \ python3 -m pytest --cov=graphscope --cov-config=.coveragerc --cov-report=xml --cov-report=term -s -v ./graphscope/tests/unittest -.PHONY: minitest minitest: - cd $(WORKING_DIR)/python && \ - pip3 install tensorflow==2.5.2 "pandas<1.5.0" && \ + pip3 install tensorflow==2.5.2 "pandas<1.5.0" + cd $(CLIENT_DIR) && \ python3 -m pytest --cov=graphscope --cov-config=.coveragerc --cov-report=xml --cov-report=term -s -v ./graphscope/tests/minitest -.PHONY: k8stest k8stest: - cd $(WORKING_DIR)/python && \ - pip3 install tensorflow==2.5.2 "pandas<1.5.0" && \ + pip3 install tensorflow==2.5.2 "pandas<1.5.0" + cd $(CLIENT_DIR) && \ python3 -m pytest --cov=graphscope --cov-config=.coveragerc --cov-report=xml --cov-report=term -s -v ./graphscope/tests/kubernetes - -.PHONY: clean -clean: - rm -fr $(WORKING_DIR)/analytical_engine/build/ || true && \ - rm -fr $(WORKING_DIR)/analytical_engine/proto/ || true && \ - rm -fr $(WORKING_DIR)/learning_engine/graph-learn/cmake-build/ || true && \ - rm -fr $(WORKING_DIR)/learning_engine/graph-learn/proto/*.h || true && \ - rm -fr $(WORKING_DIR)/learning_engine/graph-learn/proto/*.cc || true && \ - rm -fr $(WORKING_DIR)/interactive_engine/executor/target || true && \ - rm -fr 
$(WORKING_DIR)/interactive_engine/assembly/target || true && \ - cd $(WORKING_DIR)/python && python3 setup.py clean --all && \ - cd $(WORKING_DIR)/coordinator && python3 setup.py clean --all diff --git a/README-zh.md b/README-zh.md index 314c60c0faf3..3a1b17f7cbf8 100644 --- a/README-zh.md +++ b/README-zh.md @@ -306,7 +306,7 @@ sess.close() ```bash # 编译所有组件,包括 Python 包和 引擎可执行文件 -make graphscope +sudo make install # 或者只编译指定的引擎 # make gie diff --git a/README.md b/README.md index 68180480a60d..9574464234d4 100644 --- a/README.md +++ b/README.md @@ -339,7 +339,7 @@ Then you can build GraphScope with pre-configured `make` commands. ```bash # to make graphscope whole package, including python package + engine binaries. -make graphscope +sudo make install # or make the engine components # make gie diff --git a/analytical_engine/CMakeLists.txt b/analytical_engine/CMakeLists.txt index f7a673c10b6f..3bae900a0213 100644 --- a/analytical_engine/CMakeLists.txt +++ b/analytical_engine/CMakeLists.txt @@ -486,7 +486,8 @@ endmacro() install_gsa_binary(grape_engine) install_gsa_binary(gs_proto) install_gsa_binary(gs_util) -if(ENABLE_JAVA_SDK) + +if (ENABLE_JAVA_SDK) install_gsa_binary(graphx_runner) endif() diff --git a/analytical_engine/core/grape_instance.cc b/analytical_engine/core/grape_instance.cc index fb8417c6d15c..e686ce099ea8 100644 --- a/analytical_engine/core/grape_instance.cc +++ b/analytical_engine/core/grape_instance.cc @@ -1205,7 +1205,7 @@ bl::result GrapeInstance::registerGraphType(const rpc::GSParams& params) { VLOG(1) << "Registering Graph, graph type: " << rpc::graph::GraphTypePb_Name(graph_type) - << ", Type sigature: " << type_sig << ", lib path: " << lib_path; + << ", Type signature: " << type_sig << ", lib path: " << lib_path; if (object_manager_.HasObject(type_sig)) { VLOG(1) << "Graph already registered, signature is: " << type_sig; diff --git a/analytical_engine/core/loader/arrow_fragment_loader.h b/analytical_engine/core/loader/arrow_fragment_loader.h index 6a79008c2f88..8bded7fe5a0b 100644 --- a/analytical_engine/core/loader/arrow_fragment_loader.h +++ b/analytical_engine/core/loader/arrow_fragment_loader.h @@ -158,7 +158,7 @@ class ArrowFragmentLoader { labels << graph_info_->vertices[i]->label; } - if (!graph_info_->vertices.empty()) { + if (!graph_info_->edges.empty()) { labels << " and "; } for (size_t i = 0; i < graph_info_->edges.size(); ++i) { diff --git a/analytical_engine/core/server/dispatcher.cc b/analytical_engine/core/server/dispatcher.cc index 9b2b0283ee8e..73ec0750f64d 100644 --- a/analytical_engine/core/server/dispatcher.cc +++ b/analytical_engine/core/server/dispatcher.cc @@ -136,7 +136,8 @@ std::shared_ptr Dispatcher::processCmd( }); #endif - if (!r->message().empty()) { + if (!r->message().empty() && + comm_spec_.worker_id() == grape::kCoordinatorRank) { LOG(ERROR) << "Worker " + std::to_string(r->worker_id()) + ": " + r->message(); } diff --git a/coordinator/gscoordinator/coordinator.py b/coordinator/gscoordinator/coordinator.py index f596ad5a7a87..407362f615d6 100644 --- a/coordinator/gscoordinator/coordinator.py +++ b/coordinator/gscoordinator/coordinator.py @@ -20,7 +20,6 @@ import argparse import atexit -import datetime import functools import json import logging @@ -34,11 +33,7 @@ import sys import threading import traceback -import urllib.parse -import urllib.request -import zipfile from concurrent import futures -from io import BytesIO import grpc from packaging import version @@ -46,69 +41,34 @@ from gscoordinator.io_utils import StdStreamWrapper # 
capture system stdout +from gscoordinator.launcher import AbstractLauncher +from gscoordinator.local_launcher import LocalLauncher + sys.stdout = StdStreamWrapper(sys.stdout) sys.stderr = StdStreamWrapper(sys.stderr) -from graphscope.client.utils import GRPCUtils -from graphscope.framework import utils -from graphscope.framework.dag_utils import create_graph -from graphscope.framework.dag_utils import create_loader -from graphscope.framework.errors import AnalyticalEngineInternalError -from graphscope.framework.graph_utils import normalize_parameter_edges -from graphscope.framework.graph_utils import normalize_parameter_vertices -from graphscope.framework.loader import Loader from graphscope.framework.utils import PipeMerger -from graphscope.framework.utils import find_java -from graphscope.framework.utils import get_tempdir -from graphscope.framework.utils import normalize_data_type_str -from graphscope.proto import attr_value_pb2 +from graphscope.framework.utils import i_to_attr +from graphscope.framework.utils import s_to_attr from graphscope.proto import coordinator_service_pb2_grpc -from graphscope.proto import engine_service_pb2_grpc from graphscope.proto import error_codes_pb2 -from graphscope.proto import graph_def_pb2 from graphscope.proto import message_pb2 -from graphscope.proto import op_def_pb2 from graphscope.proto import types_pb2 -from gscoordinator.cluster import KubernetesClusterLauncher from gscoordinator.dag_manager import DAGManager from gscoordinator.dag_manager import GSEngine -from gscoordinator.dag_manager import split_op_result -from gscoordinator.launcher import LocalLauncher +from gscoordinator.kubernetes_launcher import KubernetesClusterLauncher from gscoordinator.monitor import Monitor -from gscoordinator.object_manager import GraphMeta -from gscoordinator.object_manager import GremlinResultSet from gscoordinator.object_manager import InteractiveQueryManager from gscoordinator.object_manager import LearningInstanceManager -from gscoordinator.object_manager import LibMeta from gscoordinator.object_manager import ObjectManager -from gscoordinator.utils import ANALYTICAL_ENGINE_JAVA_INIT_CLASS_PATH -from gscoordinator.utils import ANALYTICAL_ENGINE_JAVA_JVM_OPTS -from gscoordinator.utils import GRAPHSCOPE_HOME -from gscoordinator.utils import INTERACTIVE_ENGINE_THREADS_PER_WORKER -from gscoordinator.utils import RESOURCE_DIR_NAME -from gscoordinator.utils import WORKSPACE +from gscoordinator.op_executor import OperationExecutor +from gscoordinator.utils import GS_GRPC_MAX_MESSAGE_LENGTH from gscoordinator.utils import check_gremlin_server_ready -from gscoordinator.utils import compile_app -from gscoordinator.utils import compile_graph_frame from gscoordinator.utils import create_single_op_dag -from gscoordinator.utils import dump_string -from gscoordinator.utils import get_app_sha256 -from gscoordinator.utils import get_graph_sha256 -from gscoordinator.utils import get_lib_path -from gscoordinator.utils import op_pre_process from gscoordinator.utils import str2bool -from gscoordinator.utils import to_maxgraph_schema from gscoordinator.version import __version__ -# endpoint of prelaunch analytical engine -GS_DEBUG_ENDPOINT = os.environ.get("GS_DEBUG_ENDPOINT", "") - -# 2 GB -GS_GRPC_MAX_MESSAGE_LENGTH = 2 * 1024 * 1024 * 1024 - 1 - -logger = logging.getLogger("graphscope") - def catch_unknown_errors(response_on_error=None, using_yield=False): """A catcher that catches all (unknown) exceptions in gRPC handlers to ensure @@ -127,7 +87,7 @@ def 
handler_execution(self, request, context): except Exception as exc: error_message = repr(exc) error_traceback = traceback.format_exc() - context.set_code(error_codes_pb2.COORDINATOR_INTERNAL_ERROR) + context.set_code(grpc.StatusCode.ABORTED) context.set_details( 'Error occurs in handler: "%s", with traceback: ' % error_message + error_traceback @@ -140,470 +100,159 @@ def handler_execution(self, request, context): return catch_exceptions +def config_logging(log_level): + """Set log level basic on config. + Args: + log_level (str): Log level of stdout handler + """ + logging.basicConfig(level=logging.CRITICAL) + + if log_level: + log_level = log_level.upper() + + logger = logging.getLogger("graphscope") + logger.setLevel(log_level) + + stdout_handler = logging.StreamHandler(sys.stdout) + stdout_handler.setLevel(log_level) + stdout_handler.addFilter(lambda record: record.levelno <= logging.INFO) + stderr_handler = logging.StreamHandler(sys.stderr) + stderr_handler.setLevel(logging.WARNING) + + formatter = logging.Formatter( + "%(asctime)s [%(levelname)s][%(module)s:%(lineno)d]: %(message)s" + ) + stdout_handler.setFormatter(formatter) + stderr_handler.setFormatter(formatter) + + logger.addHandler(stdout_handler) + logger.addHandler(stderr_handler) + + +logger = logging.getLogger("graphscope") + + class CoordinatorServiceServicer( coordinator_service_pb2_grpc.CoordinatorServiceServicer ): """Provides methods that implement functionality of master service server. Holding: - 1. process: the grape-engine process. + 1. launcher: the engine launcher. 2. session_id: the handle for a particular session to engine - 3. vineyard_ipc_socket: returned by grape-engine - 4. vineyard_rpc_socket: returned by grape-engine - 5. engine_endpoint: the endpoint of grape-engine - 6. engine_servicer: grpc connection to grape-engine - + 3. object_manager: the object manager for the session + 4. operation_executor: the operation executor for the session """ - def __init__(self, launcher, dangling_timeout_seconds, log_level="INFO"): - self._launcher = launcher + def __init__( + self, launcher: AbstractLauncher, dangling_timeout_seconds, log_level="INFO" + ): + config_logging(log_level) - self._request = None self._object_manager = ObjectManager() - self._grpc_utils = GRPCUtils() - self._dangling_detecting_timer = None - self._config_logging(log_level) # only one connection is allowed at the same time - # generate session id when a client connection is established + # session id will be generated when connection from client is established self._session_id = None + self._connected = False - # launch engines - if len(GS_DEBUG_ENDPOINT) > 0: - logger.info( - "Coordinator will connect to engine with endpoint: " + GS_DEBUG_ENDPOINT - ) - self._launcher._analytical_engine_endpoint = GS_DEBUG_ENDPOINT - else: - if not self._launcher.start(): - raise RuntimeError("Coordinator Launching failed.") - - self._launcher_type = self._launcher.type() - self._instance_id = self._launcher.instance_id - # string of a list of hosts, comma separated - self._engine_hosts = self._launcher.hosts - self._k8s_namespace = "" - if self._launcher_type == types_pb2.K8S: - self._k8s_namespace = self._launcher.get_namespace() - - # analytical engine - self._analytical_engine_stub = self._create_grpc_stub() - self._analytical_engine_config = None - self._analytical_engine_endpoint = None - - self._builtin_workspace = os.path.join(WORKSPACE, "builtin") - # udf app workspace should be bound to a specific session when client connect. 
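# NOTE (illustration): standalone sketch of the stream split that
# config_logging() above configures. Records at INFO and below are routed
# to stdout, WARNING and above to stderr; only the standard logging module
# is used, and the names below are local to this example.
import logging
import sys

demo_logger = logging.getLogger("graphscope.demo")
demo_logger.setLevel(logging.INFO)

out_handler = logging.StreamHandler(sys.stdout)
out_handler.addFilter(lambda record: record.levelno <= logging.INFO)
err_handler = logging.StreamHandler(sys.stderr)
err_handler.setLevel(logging.WARNING)

demo_logger.addHandler(out_handler)
demo_logger.addHandler(err_handler)

demo_logger.info("shows up on stdout only")     # accepted by the stdout filter
demo_logger.warning("shows up on stderr only")  # rejected by the stdout filter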
- self._udf_app_workspace = None - # java class path should contains - # 1) java runtime path - # 2) add resources, the recents added resource will be placed first. - self._java_class_path = ANALYTICAL_ENGINE_JAVA_INIT_CLASS_PATH - logger.info("Java initial class path set to: {}".format(self._java_class_path)) - self._jvm_opts = ANALYTICAL_ENGINE_JAVA_JVM_OPTS + self._launcher = launcher # control log fetching - self._streaming_logs = True + self._streaming_logs = False self._pipe_merged = PipeMerger(sys.stdout, sys.stderr) # dangling check self._dangling_timeout_seconds = dangling_timeout_seconds - if self._dangling_timeout_seconds >= 0: - self._dangling_detecting_timer = threading.Timer( - interval=self._dangling_timeout_seconds, - function=self._cleanup, - args=( - True, - True, - ), - ) - self._dangling_detecting_timer.start() + self._dangling_detecting_timer = None + self._cleanup_instance = False + self._set_dangling_timer(cleanup_instance=True) + + self._operation_executor: OperationExecutor = None # a lock that protects the coordinator self._lock = threading.Lock() - - atexit.register(self._cleanup) + atexit.register(self.cleanup) def __del__(self): - self._cleanup() - - def _generate_session_id(self): - return "session_" + "".join( - [random.choice(string.ascii_lowercase) for _ in range(8)] - ) - - def _config_logging(self, log_level): - """Set log level basic on config. - Args: - log_level (str): Log level of stdout handler - """ - logging.basicConfig(level=logging.CRITICAL) - - if log_level: - log_level = log_level.upper() - - logger = logging.getLogger("graphscope") - logger.setLevel(log_level) - - stdout_handler = logging.StreamHandler(sys.stdout) - stdout_handler.setLevel(log_level) - stdout_handler.addFilter(lambda record: record.levelno <= logging.INFO) - stderr_handler = logging.StreamHandler(sys.stderr) - stderr_handler.setLevel(logging.WARNING) - - formatter = logging.Formatter( - "%(asctime)s [%(levelname)s][%(module)s:%(lineno)d]: %(message)s" - ) - stdout_handler.setFormatter(formatter) - stderr_handler.setFormatter(formatter) - - logger.addHandler(stdout_handler) - logger.addHandler(stderr_handler) - - def ConnectSession(self, request, context): - for result in self.ConnectSessionWrapped(request, context): - return result + self.cleanup() @Monitor.connectSession - def _ConnectSession(self, request, context): + def ConnectSession(self, request, context): # A session is already connected. - if self._request: + if self._connected: if getattr(request, "reconnect", False): return message_pb2.ConnectSessionResponse( session_id=self._session_id, cluster_type=self._launcher.type(), num_workers=self._launcher.num_workers, - engine_config=json.dumps(self._analytical_engine_config), - pod_name_list=self._engine_hosts.split(","), - namespace=self._k8s_namespace, + namespace=self._launcher.get_namespace(), ) - # connect failed, more than one connection at the same time. - context.set_code(grpc.StatusCode.ALREADY_EXISTS) - context.set_details( - "Cannot setup more than one connection at the same time." - ) + else: + # connect failed, more than one connection at the same time. + context.set_code(grpc.StatusCode.ALREADY_EXISTS) + context.set_details( + "Cannot setup more than one connection at the same time." 
+ ) + return message_pb2.ConnectSessionResponse() + # check version compatibility from client + sv = version.parse(__version__) + cv = version.parse(request.version) + if sv.major != cv.major or sv.minor != cv.minor: + error_msg = f"Version between client and server is inconsistent: {request.version} vs {__version__}" + logger.warning(error_msg) + context.set_code(grpc.StatusCode.FAILED_PRECONDITION) + context.set_details(error_msg) return message_pb2.ConnectSessionResponse() + # Connect to serving coordinator. - self._key_to_op = {} - # dict of op_def_pb2.OpResult - self._op_result_pool = {} - self._request = request - try: - self._analytical_engine_config = self._get_engine_config() - except grpc.RpcError as e: - logger.error( - "Get engine config failed, code: %s, details: %s", - e.code().name, - e.details(), - ) - context.set_code(e.code()) - context.set_details(e.details()) - return message_pb2.ConnectSessionResponse() - # Generate session id + self._connected = True self._session_id = self._generate_session_id() - self._udf_app_workspace = os.path.join( - WORKSPACE, self._instance_id, self._session_id - ) - self._resource_dir = os.path.join( - WORKSPACE, self._instance_id, self._session_id, RESOURCE_DIR_NAME - ) self._launcher.set_session_workspace(self._session_id) + self._operation_executor = OperationExecutor( + self._session_id, self._launcher, self._object_manager + ) + + # Cleanup after timeout seconds + self._dangling_timeout_seconds = request.dangling_timeout_seconds + # If true, also delete graphscope instance (such as pods) in closing process + self._cleanup_instance = request.cleanup_instance + # Session connected, fetch logs via gRPC. self._streaming_logs = True sys.stdout.drop(False) - - # check version compatibility from client - sv = version.parse(__version__) - cv = version.parse(self._request.version) - if sv.major != cv.major or sv.minor != cv.minor: - error_msg = f"Version between client and server is inconsistent: {self._request.version} vs {__version__}" - logger.warning(error_msg) - context.set_code(error_codes_pb2.CONNECTION_ERROR) - context.set_details(error_msg) - return message_pb2.ConnectSessionResponse() - return message_pb2.ConnectSessionResponse( session_id=self._session_id, cluster_type=self._launcher.type(), num_workers=self._launcher.num_workers, - engine_config=json.dumps(self._analytical_engine_config), - pod_name_list=self._engine_hosts.split(","), - namespace=self._k8s_namespace, + namespace=self._launcher.get_namespace(), ) - ConnectSessionWrapped = catch_unknown_errors(message_pb2.ConnectSessionResponse())( - _ConnectSession - ) + @Monitor.closeSession + def CloseSession(self, request, context): + """ + Disconnect session, note that it won't clean up any resources if self._cleanup_instance is False. 
+ """ + if not self._check_session_consistency(request, context): + return message_pb2.CloseSessionResponse() - def HeartBeat(self, request, context): - for result in self.HeartBeatWrapped(request, context): - return result + self._connected = False + self._session_id = None - def _HeartBeat(self, request, context): - if self._request and self._request.dangling_timeout_seconds >= 0: - # Reset dangling detect timer - if self._dangling_detecting_timer: - self._dangling_detecting_timer.cancel() + self.cleanup(cleanup_instance=self._cleanup_instance, is_dangling=False) + self._operation_executor = None - self._dangling_detecting_timer = threading.Timer( - interval=self._request.dangling_timeout_seconds, - function=self._cleanup, - args=( - self._request.cleanup_instance, - True, - ), - ) - self._dangling_detecting_timer.start() + # Session closed, stop streaming logs + sys.stdout.drop(True) + self._streaming_logs = False + return message_pb2.CloseSessionResponse() + def HeartBeat(self, request, context): + self._reset_dangling_timer(self._connected, self._cleanup_instance) # analytical engine - request = message_pb2.HeartBeatRequest() - if self._analytical_engine_stub is None: - raise RuntimeError( - "Analytical engine is not launched or has already been terminated." - ) - return self._analytical_engine_stub.HeartBeat(request) - - HeartBeatWrapped = catch_unknown_errors(message_pb2.HeartBeatResponse())(_HeartBeat) - - @Monitor.runOnAnalyticalEngine - def run_on_analytical_engine( # noqa: C901 - self, - dag_def: op_def_pb2.DagDef, - dag_bodies, - loader_op_bodies: dict, - ): - def _generate_runstep_request(session_id, dag_def, dag_bodies): - runstep_requests = [] - # head - runstep_requests.append( - message_pb2.RunStepRequest( - head=message_pb2.RunStepRequestHead( - session_id=session_id, dag_def=dag_def - ) - ) - ) - runstep_requests.extend(dag_bodies) - for item in runstep_requests: - yield item - - # preprocess of op before run on analytical engine - for op in dag_def.op: - self._key_to_op[op.key] = op - op_pre_process( - op, - self._op_result_pool, - self._key_to_op, - engine_hosts=self._engine_hosts, - engine_config=self._analytical_engine_config, - engine_java_class_path=self._java_class_path, # may be needed in CREATE_GRAPH or RUN_APP - engine_jvm_opts=self._jvm_opts, - ) - - # Handle op that depends on loader (data source) - if op.op == types_pb2.CREATE_GRAPH or op.op == types_pb2.ADD_LABELS: - for key_of_parent_op in op.parents: - parent_op = self._key_to_op[key_of_parent_op] - if parent_op.op == types_pb2.DATA_SOURCE: - # handle bodies of loader op - if parent_op.key in loader_op_bodies: - dag_bodies.extend(loader_op_bodies[parent_op.key]) - - # Compile app or not. - if op.op == types_pb2.BIND_APP: - op, app_sig, app_lib_path = self._maybe_compile_app(op) - - # Compile graph or not - # arrow property graph and project graph need to compile - # If engine crashed, we will get a SocketClosed grpc Exception. - # In that case, we should notify client the engine is dead. 
- if ( - ( - op.op == types_pb2.CREATE_GRAPH - and op.attr[types_pb2.GRAPH_TYPE].i == graph_def_pb2.ARROW_PROPERTY - ) - or op.op == types_pb2.TRANSFORM_GRAPH - or op.op == types_pb2.PROJECT_TO_SIMPLE - or op.op == types_pb2.ADD_LABELS - ): - op = self._maybe_register_graph(op, self._session_id) - # generate runstep requests, and run on analytical engine - requests = _generate_runstep_request(self._session_id, dag_def, dag_bodies) - # response - response_head = None - response_bodies = [] - try: - responses = self._analytical_engine_stub.RunStep(requests) - for response in responses: - if response.HasField("head"): - response_head = response - else: - response_bodies.append(response) - except grpc.RpcError as e: - logger.error( - "Engine RunStep failed, code: %s, details: %s", - e.code().name, - e.details(), - ) - if e.code() == grpc.StatusCode.INTERNAL: - # TODO: make the stacktrace seperated from normal error messages - # Too verbose. - if len(e.details()) > 3072: # 3k bytes - msg = f"{e.details()[:3072]} ... [truncated]" - else: - msg = e.details() - raise AnalyticalEngineInternalError(msg) - else: - raise - - # handle result from response stream - if response_head is None: - raise AnalyticalEngineInternalError( - "Missing head from the response stream." - ) - for op_result in response_head.head.results: - # record result in coordinator, which doesn't contains large data - self._op_result_pool[op_result.key] = op_result - # get the op corresponding to the result - op = self._key_to_op[op_result.key] - # register graph and dump graph schema - if op.op in ( - types_pb2.CREATE_GRAPH, - types_pb2.PROJECT_GRAPH, - types_pb2.PROJECT_TO_SIMPLE, - types_pb2.TRANSFORM_GRAPH, - types_pb2.ADD_LABELS, - types_pb2.ADD_COLUMN, - ): - schema_path = os.path.join( - get_tempdir(), op_result.graph_def.key + ".json" - ) - vy_info = graph_def_pb2.VineyardInfoPb() - op_result.graph_def.extension.Unpack(vy_info) - self._object_manager.put( - op_result.graph_def.key, - GraphMeta( - op_result.graph_def.key, - vy_info.vineyard_id, - op_result.graph_def, - schema_path, - ), - ) - if op_result.graph_def.graph_type == graph_def_pb2.ARROW_PROPERTY: - dump_string( - to_maxgraph_schema(vy_info.property_schema_json), - schema_path, - ) - vy_info.schema_path = schema_path - op_result.graph_def.extension.Pack(vy_info) - # register app - elif op.op == types_pb2.BIND_APP: - self._object_manager.put( - app_sig, - LibMeta(op_result.result.decode("utf-8"), "app", app_lib_path), - ) - # unregister graph - elif op.op == types_pb2.UNLOAD_GRAPH: - self._object_manager.pop(op.attr[types_pb2.GRAPH_NAME].s.decode()) - # unregister app - elif op.op == types_pb2.UNLOAD_APP: - self._object_manager.pop(op.attr[types_pb2.APP_NAME].s.decode()) - return response_head, response_bodies - - @Monitor.runOnInteractiveEngine - def run_on_interactive_engine(self, dag_def: op_def_pb2.DagDef): - response_head = message_pb2.RunStepResponse( - head=message_pb2.RunStepResponseHead() - ) - response_bodies = [] - for op in dag_def.op: - self._key_to_op[op.key] = op - op_pre_process( - op, - self._op_result_pool, - self._key_to_op, - engine_hosts=self._engine_hosts, - engine_config=self._analytical_engine_config, - ) - if op.op == types_pb2.CREATE_INTERACTIVE_QUERY: - op_result = self._create_interactive_instance(op) - elif op.op == types_pb2.GREMLIN_QUERY: - op_result = self._execute_gremlin_query(op) - elif op.op == types_pb2.FETCH_GREMLIN_RESULT: - op_result = self._fetch_gremlin_result(op) - elif op.op == types_pb2.CLOSE_INTERACTIVE_QUERY: - 
op_result = self._close_interactive_instance(op) - elif op.op == types_pb2.SUBGRAPH: - op_result = self._gremlin_to_subgraph(op) - else: - raise RuntimeError("Unsupport op type: " + str(op.op)) - splited_result = split_op_result(op_result) - response_head.head.results.append(op_result) - for i, chunk in enumerate(splited_result): - has_next = True - if i + 1 == len(splited_result): - has_next = False - response_bodies.append( - message_pb2.RunStepResponse( - body=message_pb2.RunStepResponseBody( - chunk=chunk, has_next=has_next - ) - ) - ) - # record op result - self._op_result_pool[op.key] = op_result - return response_head, response_bodies - - def run_on_learning_engine(self, dag_def: op_def_pb2.DagDef): - response_head = message_pb2.RunStepResponse( - head=message_pb2.RunStepResponseHead() - ) - response_bodies = [] - for op in dag_def.op: - self._key_to_op[op.key] = op - op_pre_process( - op, - self._op_result_pool, - self._key_to_op, - engine_hosts=self._engine_hosts, - engine_config=self._analytical_engine_config, - ) - if op.op == types_pb2.CREATE_LEARNING_INSTANCE: - op_result = self._create_learning_instance(op) - elif op.op == types_pb2.CLOSE_LEARNING_INSTANCE: - op_result = self._close_learning_instance(op) - else: - raise RuntimeError("Unsupport op type: " + str(op.op)) - response_head.head.results.append(op_result) - self._op_result_pool[op.key] = op_result - return response_head, response_bodies - - def run_on_coordinator( - self, - dag_def: op_def_pb2.DagDef, - dag_bodies, - loader_op_bodies: dict, - ): - response_head = message_pb2.RunStepResponse( - head=message_pb2.RunStepResponseHead() - ) - response_bodies = [] - for op in dag_def.op: - self._key_to_op[op.key] = op - op_pre_process( - op, - self._op_result_pool, - self._key_to_op, - engine_hosts=self._engine_hosts, - engine_config=self._analytical_engine_config, - ) - if op.op == types_pb2.DATA_SOURCE: - op_result = self._process_data_source(op, dag_bodies, loader_op_bodies) - elif op.op == types_pb2.DATA_SINK: - op_result = self._process_data_sink(op) - else: - raise RuntimeError("Unsupport op type: " + str(op.op)) - response_head.head.results.append(op_result) - self._op_result_pool[op.key] = op_result - return response_head, response_bodies + # if self._operation_executor is not None: + # return self._operation_executor.heart_beat(request) + return message_pb2.HeartBeatResponse() def RunStep(self, request_iterator, context): with self._lock: @@ -616,38 +265,37 @@ def _RunStep(self, request_iterator, context): loader_op_bodies = {} # response list for stream - responses = [] - # head - responses.append( + responses = [ message_pb2.RunStepResponse(head=message_pb2.RunStepResponseHead()) - ) + ] while not dag_manager.empty(): run_dag_on, dag, dag_bodies = dag_manager.next_dag() error_code = error_codes_pb2.COORDINATOR_INTERNAL_ERROR - head = None - bodies = None - + head, bodies = None, None + # logger.info('dag: %s', dag) try: # run on analytical engine if run_dag_on == GSEngine.analytical_engine: # need dag_bodies to load graph from pandas/numpy error_code = error_codes_pb2.ANALYTICAL_ENGINE_INTERNAL_ERROR - head, bodies = self.run_on_analytical_engine( + head, bodies = self._operation_executor.run_on_analytical_engine( dag, dag_bodies, loader_op_bodies ) # run on interactive engine elif run_dag_on == GSEngine.interactive_engine: error_code = error_codes_pb2.INTERACTIVE_ENGINE_INTERNAL_ERROR - head, bodies = self.run_on_interactive_engine(dag) + head, bodies = self._operation_executor.run_on_interactive_engine( + dag 
+ ) # run on learning engine elif run_dag_on == GSEngine.learning_engine: error_code = error_codes_pb2.LEARNING_ENGINE_INTERNAL_ERROR - head, bodies = self.run_on_learning_engine(dag) + head, bodies = self._operation_executor.run_on_learning_engine(dag) # run on coordinator elif run_dag_on == GSEngine.coordinator: error_code = error_codes_pb2.COORDINATOR_INTERNAL_ERROR - head, bodies = self.run_on_coordinator( + head, bodies = self._operation_executor.run_on_coordinator( dag, dag_bodies, loader_op_bodies ) # merge the responses @@ -662,11 +310,7 @@ def _RunStep(self, request_iterator, context): except Exception as exc: response_head = responses[0] response_head.head.code = error_code - response_head.head.error_msg = ( - "Error occurred during preprocessing, The traceback is: {0}".format( - traceback.format_exc() - ) - ) + response_head.head.error_msg = f"Error occurred during RunStep, The traceback is: {traceback.format_exc()}" response_head.head.full_exception = pickle.dumps(exc) for response in responses: yield response @@ -678,92 +322,6 @@ def _RunStep(self, request_iterator, context): message_pb2.RunStepResponse(head=message_pb2.RunStepResponseHead()), True )(_RunStep) - def _maybe_compile_app(self, op): - app_sig = get_app_sha256(op.attr, self._java_class_path) - # try to get compiled file from GRAPHSCOPE_HOME/precompiled - space = os.path.join(GRAPHSCOPE_HOME, "precompiled", "builtin") - app_lib_path = get_lib_path(os.path.join(space, app_sig), app_sig) - if not os.path.isfile(app_lib_path): - space = self._builtin_workspace - if ( - (types_pb2.GAR in op.attr) - or (op.attr[types_pb2.APP_ALGO].s.decode("utf-8").startswith("giraph:")) - or op.attr[types_pb2.APP_ALGO].s.decode("utf-8").startswith("java_pie:") - ): - space = self._udf_app_workspace - # try to get compiled file from workspace - app_lib_path = get_lib_path(os.path.join(space, app_sig), app_sig) - if not os.path.isfile(app_lib_path): - # compile and distribute - compiled_path = self._compile_lib_and_distribute( - compile_app, app_sig, op - ) - if app_lib_path != compiled_path: - raise RuntimeError( - f"Computed application library path not equal to compiled path, {app_lib_path} versus {compiled_path}" - ) - op.attr[types_pb2.APP_LIBRARY_PATH].CopyFrom( - attr_value_pb2.AttrValue(s=app_lib_path.encode("utf-8")) - ) - return op, app_sig, app_lib_path - - def _maybe_register_graph(self, op, session_id): - graph_sig = get_graph_sha256(op.attr) - # try to get compiled file from GRAPHSCOPE_HOME/precompiled - space = os.path.join(GRAPHSCOPE_HOME, "precompiled", "builtin") - graph_lib_path = get_lib_path(os.path.join(space, graph_sig), graph_sig) - if not os.path.isfile(graph_lib_path): - space = self._builtin_workspace - # try to get compiled file from workspace - graph_lib_path = get_lib_path(os.path.join(space, graph_sig), graph_sig) - if not os.path.isfile(graph_lib_path): - # compile and distribute - compiled_path = self._compile_lib_and_distribute( - compile_graph_frame, graph_sig, op - ) - if graph_lib_path != compiled_path: - raise RuntimeError( - f"Computed graph library path not equal to compiled path, {graph_lib_path} versus {compiled_path}" - ) - if graph_sig not in self._object_manager: - # register graph - op_def = op_def_pb2.OpDef(op=types_pb2.REGISTER_GRAPH_TYPE) - op_def.attr[types_pb2.GRAPH_LIBRARY_PATH].CopyFrom( - attr_value_pb2.AttrValue(s=graph_lib_path.encode("utf-8")) - ) - op_def.attr[types_pb2.TYPE_SIGNATURE].CopyFrom( - attr_value_pb2.AttrValue(s=graph_sig.encode("utf-8")) - ) - 
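# NOTE (illustration): the RunStep error path above stores the raised
# exception in head.full_exception via pickle.dumps(exc). The snippet below
# only demonstrates that serialize/deserialize round trip; how the client
# consumes the field is not part of this diff.
import pickle

try:
    raise RuntimeError("engine failed")
except Exception as exc:
    payload = pickle.dumps(exc)   # what the coordinator places in full_exception

restored = pickle.loads(payload)  # what a receiver could do with the bytes
assert isinstance(restored, RuntimeError)
assert str(restored) == "engine failed"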
op_def.attr[types_pb2.GRAPH_TYPE].CopyFrom( - attr_value_pb2.AttrValue(i=op.attr[types_pb2.GRAPH_TYPE].i) - ) - dag_def = op_def_pb2.DagDef() - dag_def.op.extend([op_def]) - try: - response_head, _ = self.run_on_analytical_engine(dag_def, [], {}) - except grpc.RpcError as e: - logger.error( - "Register graph failed, code: %s, details: %s", - e.code().name, - e.details(), - ) - if e.code() == grpc.StatusCode.INTERNAL: - raise AnalyticalEngineInternalError(e.details()) - else: - raise - self._object_manager.put( - graph_sig, - LibMeta( - response_head.head.results[0].result, - "graph_frame", - graph_lib_path, - ), - ) - op.attr[types_pb2.TYPE_SIGNATURE].CopyFrom( - attr_value_pb2.AttrValue(s=graph_sig.encode("utf-8")) - ) - return op - def FetchLogs(self, request, context): while self._streaming_logs: try: @@ -771,7 +329,7 @@ def FetchLogs(self, request, context): except queue.Empty: info_message, error_message = "", "" except Exception as e: - info_message, error_message = "WARNING: failed to read log: %s" % e, "" + info_message, error_message = f"WARNING: failed to read log: {e}", "" if info_message or error_message: if self._streaming_logs: @@ -780,70 +338,32 @@ def FetchLogs(self, request, context): ) def AddLib(self, request, context): - for result in self.AddLibWrapped(request, context): - return result - - def _AddLib(self, request, context): - if request.session_id != self._session_id: - context.set_code(grpc.StatusCode.INVALID_ARGUMENT) - context.set_details( - f"Session handle not matched, {request.session_id} versus {self._session_id}" - ) - os.makedirs(self._resource_dir, exist_ok=True) - gar = request.gar - fp = BytesIO(gar) - filename = None - with zipfile.ZipFile(fp, "r") as zip_ref: - zip_ref.extractall(self._resource_dir) - logger.info( - "Coordinator recieved add lib request contains file {}".format( - zip_ref.namelist() - ) - ) - if len(zip_ref.namelist()) != 1: - raise RuntimeError("Expect only one resource in one gar") - filename = zip_ref.namelist()[0] - full_filename = os.path.join(self._resource_dir, filename) - self._launcher.distribute_file(full_filename) - logger.info("Successfully distributed {}".format(full_filename)) - if full_filename.endswith(".jar"): - logger.info("adding lib to java class path since it ends with .jar") - self._java_class_path = full_filename + ":" + self._java_class_path - logger.info("current java class path: {}".format(self._java_class_path)) + try: + self._operation_executor.add_lib(request) + except Exception as e: + context.abort(grpc.StatusCode.ABORTED, str(e)) return message_pb2.AddLibResponse() - AddLibWrapped = catch_unknown_errors(message_pb2.AddLibResponse())(_AddLib) - - def CloseSession(self, request, context): - for result in self.CloseSessionWrapped(request, context): - return result - - @Monitor.closeSession - def _CloseSession(self, request, context): - """ - Disconnect session, note that it doesn't clean up any resources. 
- """ - if request.session_id != self._session_id: - context.set_code(grpc.StatusCode.INVALID_ARGUMENT) - context.set_details( - f"Session handle not matched, {request.session_id} versus {self._session_id}" - ) - - self._cleanup( - cleanup_instance=self._request.cleanup_instance, is_dangling=False + def CreateAnalyticalInstance(self, request, context): + try: + self._launcher.start() + # create GAE rpc service + self._launcher.create_analytical_instance() + engine_config = self._operation_executor.get_analytical_engine_config() + engine_config.update(self._launcher.get_engine_config()) + except grpc.RpcError as e: + context.set_code(e.code()) + context.set_details("Get engine config failed: " + e.details()) + return message_pb2.CreateAnalyticalInstanceResponse() + except Exception as e: + context.abort(grpc.StatusCode.ABORTED, str(e)) + return message_pb2.CreateAnalyticalInstanceResponse() + return message_pb2.CreateAnalyticalInstanceResponse( + engine_config=json.dumps(engine_config), + host_names=self._launcher.hosts.split(","), ) - self._request = None - - # Session closed, stop streaming logs - sys.stdout.drop(True) - self._streaming_logs = False - return message_pb2.CloseSessionResponse() - CloseSessionWrapped = catch_unknown_errors(message_pb2.CloseSessionResponse())( - _CloseSession - ) - - def _create_interactive_instance(self, op: op_def_pb2.OpDef): + def CreateInteractiveInstance(self, request, context): def _match_frontend_endpoint(pattern, lines): for line in lines.split("\n"): rlt = re.findall(pattern, line) @@ -851,492 +371,119 @@ def _match_frontend_endpoint(pattern, lines): return rlt[0].strip() return "" - # vineyard object id of graph - object_id = op.attr[types_pb2.VINEYARD_ID].i # maxgraph endpoint pattern FRONTEND_PATTERN = re.compile("(?<=FRONTEND_ENDPOINT:).*$") + # maxgraph external endpoint, for clients that are outside of cluster to connect + # only available in kubernetes mode, exposed by NodePort or LoadBalancer FRONTEND_EXTERNAL_PATTERN = re.compile("(?<=FRONTEND_EXTERNAL_ENDPOINT:).*$") - # maxgraph endpoint - maxgraph_endpoint = None - # maxgraph external endpoint, for client and gremlin function test - maxgraph_external_endpoint = None + # create instance - proc = self._launcher.create_interactive_instance(op.attr) + object_id = request.object_id + schema_path = request.schema_path try: + proc = self._launcher.create_interactive_instance(object_id, schema_path) + gie_manager = InteractiveQueryManager(object_id) + # Put it to object_manager to ensure it could be killed during coordinator cleanup + # If coordinator is shutdown by force when creating interactive instance + self._object_manager.put(object_id, gie_manager) # 60 seconds is enough, see also GH#1024; try 120 # already add errs to outs - outs, _ = proc.communicate(timeout=120) + outs, _ = proc.communicate(timeout=120) # throws TimeoutError return_code = proc.poll() - if return_code == 0: - # match maxgraph endpoint and check for ready - maxgraph_endpoint = _match_frontend_endpoint(FRONTEND_PATTERN, outs) - if check_gremlin_server_ready(maxgraph_endpoint): - logger.info( - "build maxgraph frontend %s for graph %ld", - maxgraph_endpoint, - object_id, - ) - maxgraph_external_endpoint = _match_frontend_endpoint( - FRONTEND_EXTERNAL_PATTERN, outs - ) - - self._object_manager.put( - op.key, - InteractiveQueryManager(op.key, maxgraph_endpoint, object_id), - ) - endpoint = maxgraph_external_endpoint or maxgraph_endpoint - result = {"endpoint": endpoint, "object_id": object_id} - return op_def_pb2.OpResult( - 
code=error_codes_pb2.OK, - key=op.key, - result=json.dumps(result).encode("utf-8"), + if return_code != 0: + raise RuntimeError(f"Error code: {return_code}, message {outs}") + # match maxgraph endpoint and check for ready + endpoint = _match_frontend_endpoint(FRONTEND_PATTERN, outs) + # coordinator use internal endpoint + gie_manager.set_endpoint(endpoint) + if check_gremlin_server_ready(endpoint): # throws TimeoutError + logger.info( + "Built interactive frontend %s for graph %ld", endpoint, object_id ) - raise RuntimeError("Error code: {0}, message {1}".format(return_code, outs)) except Exception as e: - proc.kill() + context.set_code(grpc.StatusCode.ABORTED) + context.set_details("Create interactive instance failed: " + str(e)) self._launcher.close_interactive_instance(object_id) - raise RuntimeError("Create interactive instance failed.") from e - - def _execute_gremlin_query(self, op: op_def_pb2.OpDef): - message = op.attr[types_pb2.GIE_GREMLIN_QUERY_MESSAGE].s.decode() - request_options = None - if types_pb2.GIE_GREMLIN_REQUEST_OPTIONS in op.attr: - request_options = json.loads( - op.attr[types_pb2.GIE_GREMLIN_REQUEST_OPTIONS].s.decode() - ) - key_of_parent_op = op.parents[0] - - gremlin_client = self._object_manager.get(key_of_parent_op) - try: - rlt = gremlin_client.submit(message, request_options=request_options) - except Exception as e: - raise RuntimeError("Gremlin query failed.") from e - self._object_manager.put(op.key, GremlinResultSet(op.key, rlt)) - return op_def_pb2.OpResult(code=error_codes_pb2.OK, key=op.key) - - def _fetch_gremlin_result(self, op: op_def_pb2.OpDef): - fetch_result_type = op.attr[types_pb2.GIE_GREMLIN_FETCH_RESULT_TYPE].s.decode() - key_of_parent_op = op.parents[0] - result_set = self._object_manager.get(key_of_parent_op).result_set - try: - if fetch_result_type == "one": - rlt = result_set.one() - elif fetch_result_type == "all": - rlt = result_set.all().result() - except Exception as e: - raise RuntimeError("Fetch gremlin result failed") from e - meta = op_def_pb2.OpResult.Meta(has_large_result=True) - return op_def_pb2.OpResult( - code=error_codes_pb2.OK, - key=op.key, - meta=meta, - result=pickle.dumps(rlt), - ) - - def _process_data_sink(self, op: op_def_pb2.OpDef): - import vineyard - import vineyard.io - - storage_options = json.loads(op.attr[types_pb2.STORAGE_OPTIONS].s.decode()) - fd = op.attr[types_pb2.FD].s.decode() - df = op.attr[types_pb2.VINEYARD_ID].s.decode() - engine_config = self._analytical_engine_config - vineyard_endpoint = engine_config["vineyard_rpc_endpoint"] - vineyard_ipc_socket = engine_config["vineyard_socket"] - deployment, hosts = self._launcher.get_vineyard_stream_info() - dfstream = vineyard.io.open( - "vineyard://" + str(df), - mode="r", - vineyard_ipc_socket=vineyard_ipc_socket, - vineyard_endpoint=vineyard_endpoint, - deployment=deployment, - hosts=hosts, - ) - vineyard.io.open( - fd, - dfstream, - mode="w", - vineyard_ipc_socket=vineyard_ipc_socket, - vineyard_endpoint=vineyard_endpoint, - storage_options=storage_options, - deployment=deployment, - hosts=hosts, + self._object_manager.pop(object_id) + return message_pb2.CreateInteractiveInstanceResponse() + external_endpoint = _match_frontend_endpoint(FRONTEND_EXTERNAL_PATTERN, outs) + # client use external endpoint (k8s mode), or internal endpoint (standalone mode) + endpoint = external_endpoint or endpoint + return message_pb2.CreateInteractiveInstanceResponse( + gremlin_endpoint=endpoint, object_id=object_id ) - return op_def_pb2.OpResult(code=error_codes_pb2.OK, 
key=op.key) - def _process_data_source( - self, op: op_def_pb2.OpDef, dag_bodies, loader_op_bodies: dict - ): - def _spawn_vineyard_io_stream(source, storage_options, read_options): - import vineyard - import vineyard.io - - engine_config = self._analytical_engine_config - vineyard_endpoint = engine_config["vineyard_rpc_endpoint"] - vineyard_ipc_socket = engine_config["vineyard_socket"] - deployment, hosts = self._launcher.get_vineyard_stream_info() - num_workers = self._launcher.num_workers - stream_id = repr( - vineyard.io.open( - source, - mode="r", - vineyard_endpoint=vineyard_endpoint, - vineyard_ipc_socket=vineyard_ipc_socket, - hosts=hosts, - num_workers=num_workers, - deployment=deployment, - read_options=read_options, - storage_options=storage_options, - ) - ) - return "vineyard", stream_id - - def _process_loader_func(loader): - # loader is type of attr_value_pb2.Chunk - protocol = loader.attr[types_pb2.PROTOCOL].s.decode() - if protocol in ("hdfs", "hive", "oss", "s3"): - source = loader.attr[types_pb2.SOURCE].s.decode() - storage_options = json.loads( - loader.attr[types_pb2.STORAGE_OPTIONS].s.decode() - ) - read_options = json.loads( - loader.attr[types_pb2.READ_OPTIONS].s.decode() - ) - new_protocol, new_source = _spawn_vineyard_io_stream( - source, storage_options, read_options - ) - loader.attr[types_pb2.PROTOCOL].CopyFrom(utils.s_to_attr(new_protocol)) - loader.attr[types_pb2.SOURCE].CopyFrom(utils.s_to_attr(new_source)) - - for loader in op.large_attr.chunk_meta_list.items: - # handle vertex or edge loader - if loader.attr[types_pb2.CHUNK_TYPE].s.decode() == "loader": - # set op bodies, this is for loading graph from numpy/pandas - op_bodies = [] - for bodies in dag_bodies: - if bodies.body.op_key == op.key: - op_bodies.append(bodies) - loader_op_bodies[op.key] = op_bodies - _process_loader_func(loader) - - return op_def_pb2.OpResult(code=error_codes_pb2.OK, key=op.key) - - def _close_interactive_instance(self, op: op_def_pb2.OpDef): + def CreateLearningInstance(self, request, context): + object_id = request.object_id + logger.info("Create learning instance with object id %ld", object_id) + handle, config = request.handle, request.config try: - key_of_parent_op = op.parents[0] - gremlin_client = self._object_manager.get(key_of_parent_op) - object_id = gremlin_client.object_id - proc = self._launcher.close_interactive_instance(object_id) - # 60s is enough - proc.wait(timeout=60) - gremlin_client.close() - except Exception as e: - raise RuntimeError( - f"Failed to close interactive instance {object_id}" - ) from e - return op_def_pb2.OpResult( - code=error_codes_pb2.OK, - key=op.key, - ) - - def _gremlin_to_subgraph(self, op: op_def_pb2.OpDef): - gremlin_script = op.attr[types_pb2.GIE_GREMLIN_QUERY_MESSAGE].s.decode() - oid_type = op.attr[types_pb2.OID_TYPE].s.decode() - request_options = None - if types_pb2.GIE_GREMLIN_REQUEST_OPTIONS in op.attr: - request_options = json.loads( - op.attr[types_pb2.GIE_GREMLIN_REQUEST_OPTIONS].s.decode() + endpoints = self._launcher.create_learning_instance( + object_id, handle, config ) - key_of_parent_op = op.parents[0] - gremlin_client = self._object_manager.get(key_of_parent_op) - - def create_global_graph_builder(graph_name, num_workers, threads_per_executor): - import vineyard - - vineyard_client = vineyard.connect( - *self._analytical_engine_config["vineyard_rpc_endpoint"].split(":") - ) - - instances = [key for key in vineyard_client.meta] - - # duplicate each instances for each thread per worker. 
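# NOTE (illustration): CreateLearningInstance / CloseLearningInstance and the
# interactive counterparts above treat the session's ObjectManager as a
# registry keyed by vineyard object id. The stand-in below only shows the
# put / "in" / pop protocol those handlers rely on; the real class lives in
# gscoordinator.object_manager and is richer than this.
class TinyObjectManager:
    def __init__(self):
        self._objects = {}

    def put(self, key, obj):
        self._objects[key] = obj

    def get(self, key):
        return self._objects[key]

    def pop(self, key):
        return self._objects.pop(key, None)

    def __contains__(self, key):
        return key in self._objects

registry = TinyObjectManager()
registry.put(42, "learning instance manager for graph 42")
assert 42 in registry
registry.pop(42)  # mirrors the pop done in CloseLearningInstance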
- chunk_instances = [ - key for key in instances for _ in range(threads_per_executor) - ] - - # build the vineyard::GlobalPGStream - metadata = vineyard.ObjectMeta() - metadata.set_global(True) - metadata["typename"] = "vineyard::htap::GlobalPGStream" - metadata["local_stream_chunks"] = threads_per_executor - metadata["total_stream_chunks"] = len(chunk_instances) - - # build the parallel stream for edge - edge_metadata = vineyard.ObjectMeta() - edge_metadata.set_global(True) - edge_metadata["typename"] = "vineyard::ParallelStream" - edge_metadata["__streams_-size"] = len(chunk_instances) - - # build the parallel stream for vertex - vertex_metadata = vineyard.ObjectMeta() - vertex_metadata.set_global(True) - vertex_metadata["typename"] = "vineyard::ParallelStream" - vertex_metadata["__streams_-size"] = len(chunk_instances) - - # NB: we don't respect `num_workers`, instead, we create a substream - # on each vineyard instance. - # - # Such a choice is to handle cases where thet etcd instance still contains - # information about dead instances. - # - # It should be ok, as each engine work will get its own local stream. But, - # generally it should be equal to `num_workers`. - for worker, instance_id in enumerate(chunk_instances): - edge_stream = vineyard.ObjectMeta() - edge_stream["typename"] = "vineyard::RecordBatchStream" - edge_stream["nbytes"] = 0 - edge_stream["params_"] = json.dumps( - { - "graph_name": graph_name, - "kind": "edge", - } - ) - edge = vineyard_client.create_metadata(edge_stream, instance_id) - vineyard_client.persist(edge.id) - edge_metadata.add_member("__streams_-%d" % worker, edge) - - vertex_stream = vineyard.ObjectMeta() - vertex_stream["typename"] = "vineyard::RecordBatchStream" - vertex_stream["nbytes"] = 0 - vertex_stream["params_"] = json.dumps( - { - "graph_name": graph_name, - "kind": "vertex", - } - ) - vertex = vineyard_client.create_metadata(vertex_stream, instance_id) - vineyard_client.persist(vertex.id) - vertex_metadata.add_member("__streams_-%d" % worker, vertex) - - chunk_stream = vineyard.ObjectMeta() - chunk_stream["typename"] = "vineyard::htap::PropertyGraphOutStream" - chunk_stream["graph_name"] = graph_name - chunk_stream["graph_schema"] = "{}" - chunk_stream["nbytes"] = 0 - chunk_stream["stream_index"] = worker - chunk_stream.add_member("edge_stream", edge) - chunk_stream.add_member("vertex_stream", vertex) - chunk = vineyard_client.create_metadata(chunk_stream, instance_id) - vineyard_client.persist(chunk.id) - metadata.add_member("stream_chunk_%d" % worker, chunk) - - # build the vineyard::GlobalPGStream - graph = vineyard_client.create_metadata(metadata) - vineyard_client.persist(graph.id) - vineyard_client.put_name(graph.id, graph_name) - - # build the parallel stream for edge - edge = vineyard_client.create_metadata(edge_metadata) - vineyard_client.persist(edge.id) - vineyard_client.put_name(edge.id, "__%s_edge_stream" % graph_name) - - # build the parallel stream for vertex - vertex = vineyard_client.create_metadata(vertex_metadata) - vineyard_client.persist(vertex.id) - vineyard_client.put_name(vertex.id, "__%s_vertex_stream" % graph_name) - - return repr(graph.id), repr(edge.id), repr(vertex.id) - - def load_subgraph( - graph_name, - total_builder_chunks, - oid_type, - edge_stream_id, - vertex_stream_id, - ): - import vineyard - - # wait all flags been created, see also - # - # `PropertyGraphOutStream::Initialize(Schema schema)` - vineyard_client = vineyard.connect( - *self._analytical_engine_config["vineyard_rpc_endpoint"].split(":") - ) - 
- # wait for all stream been created by GAIA executor in FFI - for worker in range(total_builder_chunks): - name = "__%s_%d_streamed" % (graph_name, worker) - vineyard_client.get_name(name, wait=True) - - vertices = [Loader(vineyard.ObjectID(vertex_stream_id))] - edges = [Loader(vineyard.ObjectID(edge_stream_id))] - oid_type = normalize_data_type_str(oid_type) - v_labels = normalize_parameter_vertices(vertices, oid_type) - e_labels = normalize_parameter_edges(edges, oid_type) - loader_op = create_loader(v_labels + e_labels) - config = { - types_pb2.DIRECTED: utils.b_to_attr(True), - types_pb2.OID_TYPE: utils.s_to_attr(oid_type), - types_pb2.GENERATE_EID: utils.b_to_attr(False), - types_pb2.VID_TYPE: utils.s_to_attr("uint64_t"), - types_pb2.IS_FROM_VINEYARD_ID: utils.b_to_attr(False), - } - new_op = create_graph( - self._session_id, - graph_def_pb2.ARROW_PROPERTY, - inputs=[loader_op], - attrs=config, - ) - # spawn a vineyard stream loader on coordinator - loader_op_def = loader_op.as_op_def() - coordinator_dag = op_def_pb2.DagDef() - coordinator_dag.op.extend([loader_op_def]) - # set the same key from subgraph to new op - new_op_def = new_op.as_op_def() - new_op_def.key = op.key - dag = op_def_pb2.DagDef() - dag.op.extend([new_op_def]) - self.run_on_coordinator(coordinator_dag, [], {}) - response_head, _ = self.run_on_analytical_engine(dag, [], {}) - logger.info("subgraph has been loaded") - return response_head.head.results[-1] - - # generate a random graph name - now_time = datetime.datetime.now().strftime("%Y%m%d%H%M%S") - random_num = random.randint(0, 10000000) - graph_name = "subgraph-%s-%s" % (str(now_time), str(random_num)) - - threads_per_worker = int( - os.environ.get("THREADS_PER_WORKER", INTERACTIVE_ENGINE_THREADS_PER_WORKER) - ) - - if self._launcher_type == types_pb2.HOSTS: - # only 1 GIE executor on local cluster - executor_workers_num = 1 - threads_per_executor = self._launcher.num_workers * threads_per_worker - else: - executor_workers_num = self._launcher.num_workers - threads_per_executor = threads_per_worker - total_builder_chunks = executor_workers_num * threads_per_executor - - ( - _graph_builder_id, - edge_stream_id, - vertex_stream_id, - ) = create_global_graph_builder( - graph_name, executor_workers_num, threads_per_executor - ) - - # start a thread to launch the graph - pool = futures.ThreadPoolExecutor() - subgraph_task = pool.submit( - load_subgraph, - graph_name, - total_builder_chunks, - oid_type, - edge_stream_id, - vertex_stream_id, - ) - - # add subgraph vertices and edges - subgraph_script = "{0}.subgraph('{1}')".format( - gremlin_script, - graph_name, + self._object_manager.put(object_id, LearningInstanceManager(object_id)) + except Exception as e: + context.set_code(grpc.StatusCode.ABORTED) + context.set_details("Create learning instance failed: " + str(e)) + self._launcher.close_learning_instance(object_id) + self._object_manager.pop(object_id) + return message_pb2.CreateLearningInstanceResponse() + return message_pb2.CreateLearningInstanceResponse( + object_id=object_id, handle=handle, config=config, endpoints=endpoints ) - gremlin_client.submit( - subgraph_script, request_options=request_options - ).all().result() - - return subgraph_task.result() - def _create_learning_instance(self, op: op_def_pb2.OpDef): - object_id = op.attr[types_pb2.VINEYARD_ID].i - logger.info( - "Coordinator create learning instance with object id %ld", - object_id, - ) - handle = op.attr[types_pb2.GLE_HANDLE].s.decode("utf-8") - config = 
op.attr[types_pb2.GLE_CONFIG].s.decode("utf-8") - endpoints = self._launcher.create_learning_instance(object_id, handle, config) - self._object_manager.put(op.key, LearningInstanceManager(op.key, object_id)) - result = { - "handle": handle, - "config": config, - "endpoints": endpoints, - "object_id": object_id, - } - return op_def_pb2.OpResult( - code=error_codes_pb2.OK, - key=op.key, - result=json.dumps(result).encode("utf-8"), - ) + def CloseAnalyticalInstance(self, request, context): + context.set_code(grpc.StatusCode.UNIMPLEMENTED) + context.set_details("CloseAnalyticalInstance is not implemented") + return message_pb2.CloseAnalyticalInstanceResponse() - def _close_learning_instance(self, op: op_def_pb2.OpDef): - key_of_parent_op = op.parents[0] - learning_instance_manager = self._object_manager.get(key_of_parent_op) - object_id = learning_instance_manager.object_id - logger.info( - "Coordinator close learning instance with object id %ld", - object_id, - ) - self._launcher.close_learning_instance(object_id) - learning_instance_manager.closed = True - return op_def_pb2.OpResult( - code=error_codes_pb2.OK, - key=op.key, - ) + def CloseInteractiveInstance(self, request, context): + object_id = request.object_id + if object_id in self._object_manager: + self._object_manager.pop(object_id) + try: + self._launcher.close_interactive_instance(object_id) + except Exception as e: + context.set_code(grpc.StatusCode.ABORTED) + context.set_details("Close interactive instance failed: " + str(e)) + return message_pb2.CloseInteractiveInstanceResponse() + + def CloseLearningInstance(self, request, context): + object_id = request.object_id + if object_id in self._object_manager: + self._object_manager.pop(object_id) + logger.info("Close learning instance with object id %ld", object_id) + try: + self._launcher.close_learning_instance(object_id) + except Exception as e: + context.set_code(grpc.StatusCode.ABORTED) + context.set_details("Close learning instance failed: " + str(e)) + return message_pb2.CloseLearningInstanceResponse() @Monitor.cleanup - def _cleanup(self, cleanup_instance=True, is_dangling=False): + def cleanup(self, cleanup_instance=True, is_dangling=False): # clean up session resources. 
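The rewritten cleanup body that follows maps every object tracked by the object manager to either an unload op (apps and graphs) or a direct launcher call (interactive and learning instances), and runs the unload ops through the operation executor instead of a hand-built gRPC RunStep request. Because that logic is interleaved with the removed code in this hunk, here is a minimal standalone sketch of the dispatch; the helper name unload_op_for is illustrative only, and the graphscope.framework.utils import path for s_to_attr/i_to_attr is an assumption (the patch calls them unqualified):

from graphscope.framework.utils import i_to_attr, s_to_attr  # assumed import path
from graphscope.proto import types_pb2


def unload_op_for(obj):
    # Illustrative only: pick the unload op and config for one managed object.
    config = {}
    if obj.type == "app":
        config[types_pb2.APP_NAME] = s_to_attr(obj.key)
        return types_pb2.UNLOAD_APP, config
    if obj.type == "graph":
        config[types_pb2.GRAPH_NAME] = s_to_attr(obj.key)
        # a dynamic graph doesn't carry a vineyard object id
        if obj.object_id != -1:
            config[types_pb2.VINEYARD_ID] = i_to_attr(obj.object_id)
        return types_pb2.UNLOAD_GRAPH, config
    # gie_manager / gle_manager objects are closed via the launcher instead
    return None, config

In the patch the same branching happens inline; whenever an op type is produced it is wrapped with create_single_op_dag(op_type, config) and executed through self._operation_executor.run_step(dag_def, []).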
- for key in self._object_manager.keys(): - obj = self._object_manager.get(key) - obj_type = obj.type - unload_type, config = None, None - - if obj_type == "app": - unload_type = types_pb2.UNLOAD_APP - config = { - types_pb2.APP_NAME: attr_value_pb2.AttrValue( - s=obj.key.encode("utf-8") - ) - } - elif obj_type == "graph": - unload_type = types_pb2.UNLOAD_GRAPH - config = { - types_pb2.GRAPH_NAME: attr_value_pb2.AttrValue( - s=obj.key.encode("utf-8") - ) - } - # dynamic graph doesn't have a vineyard id - if obj.vineyard_id != -1: - config[types_pb2.VINEYARD_ID] = attr_value_pb2.AttrValue( - i=obj.vineyard_id - ) - elif obj_type == "gie_manager": - if not obj.closed: - self._close_interactive_instance( - op=op_def_pb2.OpDef( - op=types_pb2.CLOSE_INTERACTIVE_QUERY, parents=[key] - ) - ) - - elif obj_type == "gle_manager": - if not obj.closed: - self._close_learning_instance( - op=op_def_pb2.OpDef( - op=types_pb2.CLOSE_LEARNING_INSTANCE, - parents=[key], - ) - ) - - if unload_type: - dag_def = create_single_op_dag(unload_type, config) - request = self._grpc_utils.generate_runstep_requests( - session_id=self._session_id, dag_def=dag_def - ) + logger.info("Cleaning up resources in coordinator") + for _, obj in self._object_manager.items(): + op_type, config = None, {} + if obj.type == "app": + op_type = types_pb2.UNLOAD_APP + config[types_pb2.APP_NAME] = s_to_attr(obj.key) + elif obj.type == "graph": + op_type = types_pb2.UNLOAD_GRAPH + config[types_pb2.GRAPH_NAME] = s_to_attr(obj.key) + # dynamic graph doesn't have a object id + if obj.object_id != -1: + config[types_pb2.VINEYARD_ID] = i_to_attr(obj.object_id) + elif obj.type == "gie_manager": + self._launcher.close_interactive_instance(obj.object_id) + elif obj.type == "gle_manager": + self._launcher.close_learning_instance(obj.object_id) + + if op_type is not None: + dag_def = create_single_op_dag(op_type, config) try: - self._analytical_engine_stub.RunStep(request) + self._operation_executor.run_step(dag_def, []) except grpc.RpcError as e: logger.error( "Cleanup failed, code: %s, details: %s", @@ -1345,82 +492,48 @@ def _cleanup(self, cleanup_instance=True, is_dangling=False): ) self._object_manager.clear() + self._cancel_dangling_timer() - self._request = None - - # cancel dangling detect timer - if self._dangling_detecting_timer: - self._dangling_detecting_timer.cancel() - self._dangling_detecting_timer = None - - # close engines if cleanup_instance: - self._analytical_engine_stub = None - self._analytical_engine_endpoint = None self._launcher.stop(is_dangling=is_dangling) - self._session_id = None + @staticmethod + def _generate_session_id(): + return "session_" + "".join( + [random.choice(string.ascii_lowercase) for _ in range(8)] + ) - def _create_grpc_stub(self): - options = [ - ("grpc.max_send_message_length", GS_GRPC_MAX_MESSAGE_LENGTH), - ("grpc.max_receive_message_length", GS_GRPC_MAX_MESSAGE_LENGTH), - ("grpc.max_metadata_size", GS_GRPC_MAX_MESSAGE_LENGTH), - ] + def _set_dangling_timer(self, cleanup_instance: bool): + if self._dangling_timeout_seconds > 0: + self._dangling_detecting_timer = threading.Timer( + interval=self._dangling_timeout_seconds, + function=self.cleanup, + args=( + cleanup_instance, + True, + ), + ) + self._dangling_detecting_timer.start() - channel = grpc.insecure_channel( - self._launcher.analytical_engine_endpoint, options=options - ) - return engine_service_pb2_grpc.EngineServiceStub(channel) + def _cancel_dangling_timer(self): + if self._dangling_detecting_timer is not None: + 
self._dangling_detecting_timer.cancel() + self._dangling_detecting_timer = None - def _get_engine_config(self): - dag_def = create_single_op_dag(types_pb2.GET_ENGINE_CONFIG) - try: - response_head, _ = self.run_on_analytical_engine(dag_def, [], {}) - except grpc.RpcError as e: - logger.error( - "Get engine config failed, code: %s, details: %s", - e.code().name, - e.details(), + def _reset_dangling_timer(self, reset: bool, cleanup_instance: bool): + if reset: + self._cancel_dangling_timer() + self._set_dangling_timer(cleanup_instance) + + def _check_session_consistency(self, request, context): + if request.session_id != self._session_id: + context.set_code(grpc.StatusCode.INVALID_ARGUMENT) + context.set_details( + f"Session handle not matched, {request.session_id} versus {self._session_id}" ) - if e.code() == grpc.StatusCode.INTERNAL: - raise AnalyticalEngineInternalError(e.details()) - else: - raise - config = json.loads(response_head.head.results[0].result.decode("utf-8")) - config.update(self._launcher.get_engine_config()) - # Disable ENABLE_JAVA_SDK when java is not installed on coordinator - if config["enable_java_sdk"] == "ON": - try: - _ = find_java() - except RuntimeError: - logger.warning( - "Disable java sdk support since java is not installed on coordinator" - ) - config["enable_java_sdk"] = "OFF" - return config - - def _compile_lib_and_distribute(self, compile_func, lib_name, op): - space = self._builtin_workspace - if ( - (types_pb2.GAR in op.attr) - or (op.attr[types_pb2.APP_ALGO].s.decode("utf-8").startswith("giraph:")) - or (op.attr[types_pb2.APP_ALGO].s.decode("utf-8").startswith("java_pie:")) - ): - space = self._udf_app_workspace - app_lib_path, java_jar_path, java_ffi_path, app_type = compile_func( - space, - lib_name, - op.attr, - self._analytical_engine_config, - self._java_class_path, - ) - # for java app compilation, we need to distribute the jar and ffi generated - if app_type == "java_pie": - self._launcher.distribute_file(java_jar_path) - self._launcher.distribute_file(java_ffi_path) - self._launcher.distribute_file(app_lib_path) - return app_lib_path + return False + else: + return True def parse_sys_args(): @@ -1523,7 +636,7 @@ def parse_sys_args(): "--k8s_image_pull_secrets", type=str, default="graphscope", - help="A list of comma sparated secrets to pull image.", + help="A list of comma separated secrets to pull image.", ) parser.add_argument( "--k8s_vineyard_daemonset", @@ -1535,7 +648,7 @@ def parse_sys_args(): "--k8s_vineyard_cpu", type=float, default=1.0, - help="CPU cores of vinayard container.", + help="CPU cores of vineyard container.", ) parser.add_argument( "--k8s_vineyard_mem", @@ -1645,7 +758,7 @@ def parse_sys_args(): "--k8s_volumes", type=str, default="{}", - help="A json string spcifies the kubernetes volumes to mount.", + help="A json string specifies the kubernetes volumes to mount.", ) parser.add_argument( "--timeout_seconds", @@ -1707,7 +820,11 @@ def parse_sys_args(): def launch_graphscope(): args = parse_sys_args() logger.info("Launching with args %s", args) + launcher = get_launcher(args) + start_server(launcher, args) + +def get_launcher(args): if args.cluster_type == "k8s": launcher = KubernetesClusterLauncher( namespace=args.k8s_namespace, @@ -1763,7 +880,10 @@ def launch_graphscope(): ) else: raise RuntimeError("Expect hosts or k8s of cluster_type parameter") + return launcher + +def start_server(launcher, args): coordinator_service_servicer = CoordinatorServiceServicer( launcher=launcher, 
dangling_timeout_seconds=args.dangling_timeout_seconds, @@ -1795,22 +915,20 @@ def launch_graphscope(): ) except Exception as e: logger.error( - "Failed to start monitor server 0.0.0.0:{0} : {1}".format( - args.monitor_port, e - ) + "Failed to start monitor server 0.0.0.0:%d : %s", args.monitor_port, e ) # handle SIGTERM signal def terminate(signum, frame): - coordinator_service_servicer._cleanup() + coordinator_service_servicer.cleanup() signal.signal(signal.SIGTERM, terminate) try: - # Grpc has handled SIGINT + # GRPC has handled SIGINT server.wait_for_termination() except KeyboardInterrupt: - coordinator_service_servicer._cleanup() + coordinator_service_servicer.cleanup() if __name__ == "__main__": diff --git a/coordinator/gscoordinator/dag_manager.py b/coordinator/gscoordinator/dag_manager.py index fd739949cc43..e17b652d4ad5 100644 --- a/coordinator/gscoordinator/dag_manager.py +++ b/coordinator/gscoordinator/dag_manager.py @@ -62,17 +62,12 @@ class DAGManager(object): ] _interactive_engine_split_op = [ - types_pb2.CREATE_INTERACTIVE_QUERY, types_pb2.SUBGRAPH, types_pb2.GREMLIN_QUERY, types_pb2.FETCH_GREMLIN_RESULT, - types_pb2.CLOSE_INTERACTIVE_QUERY, ] - _learning_engine_split_op = [ - types_pb2.CREATE_LEARNING_INSTANCE, - types_pb2.CLOSE_LEARNING_INSTANCE, - ] + _learning_engine_split_op = [] _coordinator_split_op = [ types_pb2.DATA_SOURCE, # spawn an io stream to read/write data from/to vineyard diff --git a/coordinator/gscoordinator/hook/prestop/__main__.py b/coordinator/gscoordinator/hook/prestop/__main__.py index 542e69662bcd..78a51604d916 100644 --- a/coordinator/gscoordinator/hook/prestop/__main__.py +++ b/coordinator/gscoordinator/hook/prestop/__main__.py @@ -49,7 +49,6 @@ def cleanup(self): if __name__ == "__main__": - path = DEFAULT_PATH resources = KubernetesResources() resources.load_json_file(path) diff --git a/coordinator/gscoordinator/cluster.py b/coordinator/gscoordinator/kubernetes_launcher.py similarity index 94% rename from coordinator/gscoordinator/cluster.py rename to coordinator/gscoordinator/kubernetes_launcher.py index b8d9d3176702..0f990e534aa0 100644 --- a/coordinator/gscoordinator/cluster.py +++ b/coordinator/gscoordinator/kubernetes_launcher.py @@ -57,10 +57,9 @@ from graphscope.deploy.kubernetes.utils import resolve_api_client from graphscope.framework.utils import PipeWatcher from graphscope.framework.utils import get_tempdir -from graphscope.framework.utils import is_free_port from graphscope.proto import types_pb2 -from gscoordinator.launcher import Launcher +from gscoordinator.launcher import AbstractLauncher from gscoordinator.utils import ANALYTICAL_ENGINE_PATH from gscoordinator.utils import GRAPHSCOPE_HOME from gscoordinator.utils import INTERACTIVE_ENGINE_SCRIPT @@ -132,7 +131,7 @@ def dump_with_extra_resource(self, resource): json.dump(rlt, f) -class KubernetesClusterLauncher(Launcher): +class KubernetesClusterLauncher(AbstractLauncher): _gs_etcd_builder_cls = GSEtcdBuilder _gs_engine_builder_cls = GSEngineBuilder _gs_mars_scheduler_builder_cls = GSEngineBuilder @@ -198,7 +197,7 @@ def __init__( timeout_seconds=None, waiting_for_delete=None, delete_namespace=None, - **kwargs + **kwargs, ): super().__init__() @@ -226,7 +225,7 @@ def __init__( self._coordinator_name = coordinator_name self._coordinator_service_name = coordinator_service_name - self._resource_object = ResourceManager(self._api_client) + self._resource_object: ResourceManager = ResourceManager(self._api_client) # etcd pod info self._etcd_num_pods = max(1, 
self._saved_locals["etcd_num_pods"]) @@ -252,7 +251,7 @@ def __init__( self._engine_pod_node_selector = dict() self._host0 = None - self._pod_name_list = None + self._pod_name_list = [] self._pod_ip_list = None self._pod_host_ip_list = None @@ -277,24 +276,21 @@ def __init__( self._learning_instance_processes = {} # workspace - self._instance_workspace = os.path.join( - WORKSPACE, self._saved_locals["instance_id"] - ) + instance_id = self._saved_locals["instance_id"] + self._instance_workspace = os.path.join(WORKSPACE, instance_id) os.makedirs(self._instance_workspace, exist_ok=True) self._session_workspace = None # component service name if self._exists_vineyard_daemonset(self._saved_locals["vineyard_daemonset"]): self._vineyard_service_name = ( - self._saved_locals["vineyard_daemonset"] + "-rpc" + f"{self._saved_locals['vineyard_daemonset']}-rpc" ) else: self._vineyard_service_name = ( - self._vineyard_service_name_prefix + self._saved_locals["instance_id"] + f"{self._vineyard_service_name_prefix}{instance_id}" ) - self._mars_service_name = ( - self._mars_service_name_prefix + self._saved_locals["instance_id"] - ) + self._mars_service_name = f"{self._mars_service_name_prefix}{instance_id}" def __del__(self): self.stop() @@ -311,9 +307,6 @@ def get_vineyard_rpc_endpoint(self): def get_mars_scheduler_endpoint(self): return self._mars_service_endpoint - def get_pods_list(self): - return self._pod_name_list - def waiting_for_delete(self): return self._saved_locals["waiting_for_delete"] @@ -367,15 +360,28 @@ def distribute_file(self, path): ] ) - def create_interactive_instance(self, config: dict): + def close_analytical_instance(self): + pass + + def launch_vineyard(self): + """Launch vineyardd in k8s cluster.""" + # TODO: vineyard is launched by engine by now. + pass + + def close_etcd(self): + # TODO: Delete etcd pods and service. 
+ pass + + def close_vineyard(self): + pass + + def create_interactive_instance(self, object_id: int, schema_path: str): """ Args: config (dict): dict of op_def_pb2.OpDef.attr """ - object_id = config[types_pb2.VINEYARD_ID].i - schema_path = config[types_pb2.SCHEMA_PATH].s.decode() env = os.environ.copy() - env.update({"GRAPHSCOPE_HOME": GRAPHSCOPE_HOME}) + env["GRAPHSCOPE_HOME"] = GRAPHSCOPE_HOME cmd = [ INTERACTIVE_ENGINE_SCRIPT, "create_gremlin_instance_on_k8s", @@ -390,7 +396,7 @@ def create_interactive_instance(self, config: dict): self._coordinator_name, ] self._interactive_port += 3 - logger.info("Create GIE instance with command: {0}".format(" ".join(cmd))) + logger.info("Create GIE instance with command: %s", " ".join(cmd)) process = subprocess.Popen( cmd, start_new_session=True, @@ -408,7 +414,7 @@ def create_interactive_instance(self, config: dict): def close_interactive_instance(self, object_id): env = os.environ.copy() - env.update({"GRAPHSCOPE_HOME": GRAPHSCOPE_HOME}) + env["GRAPHSCOPE_HOME"] = GRAPHSCOPE_HOME cmd = [ INTERACTIVE_ENGINE_SCRIPT, "close_gremlin_instance_on_k8s", @@ -417,7 +423,7 @@ def close_interactive_instance(self, object_id): self.hosts, self._engine_container_name, ] - logger.info("Close GIE instance with command: {0}".format(" ".join(cmd))) + logger.info("Close GIE instance with command: %s", " ".join(cmd)) process = subprocess.Popen( cmd, start_new_session=True, @@ -705,7 +711,7 @@ def _create_engine_replicaset(self): ) ) - def _create_etcd(self): + def launch_etcd(self): logger.info("Launching etcd ...") labels = { @@ -785,13 +791,14 @@ def _create_vineyard_service(self): ) def _get_vineyard_service_endpoint(self): - # Always len(endpoints) >= 1 + # len(endpoints) >= 1 endpoints = get_service_endpoints( api_client=self._api_client, namespace=self._saved_locals["namespace"], name=self._vineyard_service_name, service_type=self._saved_locals["service_type"], ) + assert len(endpoints) >= 1 return endpoints[0] def _get_mars_scheduler_service_endpoint(self): @@ -865,17 +872,15 @@ def get_engine_config(self): } return config - def _create_interactive_engine_service(self): - pass - - def _config_etcd_endpoint(self): + def configure_etcd_endpoint(self): if self._etcd_addrs is None: - self._create_etcd() + self.launch_etcd() self._etcd_endpoint = self._get_etcd_service_endpoint() - logger.info("Etcd created, endpoint is %s", self._etcd_endpoint) + logger.info("etcd cluster created") else: self._etcd_endpoint = self._etcd_addrs - logger.info("External Etcd endpoint is %s", self._etcd_endpoint) + logger.info("Using external etcd cluster") + logger.info("etcd endpoint is %s", self._etcd_endpoint) def _get_etcd_endpoints(self): etcd_addrs = [] @@ -890,11 +895,7 @@ def _get_etcd_endpoints(self): return etcd_endpoints def _create_services(self): - self._config_etcd_endpoint() - - # create interactive engine service - logger.info("Creating interactive engine service...") - self._create_interactive_engine_service() + self.configure_etcd_endpoint() if self._saved_locals["with_mars"]: # scheduler used by mars @@ -931,8 +932,8 @@ def _waiting_for_services_ready(self): # check container status selector = "" for k, v in rs.spec.selector.match_labels.items(): - selector += k + "=" + v + "," - selector = selector[:-1] + selector += f"{k}={v}," + selector = selector[:-1] # remove last comma engine_pod_selector = selector pods = self._core_api.list_namespaced_pod( @@ -950,7 +951,7 @@ def _waiting_for_services_ready(self): timeout_seconds=1, ) for event in stream: - msg = "[{}]: 
{}".format(pod_name, event["object"].message) + msg = f"[{pod_name}]: {event['object'].message}" if msg not in event_messages: event_messages.append(msg) logger.info(msg) @@ -985,13 +986,17 @@ def _waiting_for_services_ready(self): # get vineyard service endpoint self._vineyard_service_endpoint = self._get_vineyard_service_endpoint() - logger.debug("vineyard rpc runs on %s", self._vineyard_service_endpoint) if self._saved_locals["with_mars"]: self._mars_service_endpoint = ( "http://" + self._get_mars_scheduler_service_endpoint() ) - logger.debug("mars scheduler runs on %s", self._mars_service_endpoint) logger.info("GraphScope engines pod is ready.") + logger.info("Engines pod name list: %s", self._pod_name_list) + logger.info("Engines pod ip list: %s", self._pod_ip_list) + logger.info("Engines pod host ip list: %s", self._pod_host_ip_list) + logger.info("Vineyard service endpoint: %s", self._vineyard_service_endpoint) + if self._saved_locals["with_mars"]: + logger.info("Mars service endpoint: %s", self._mars_service_endpoint) def _dump_resource_object(self): resource = {} @@ -1014,7 +1019,7 @@ def _get_etcd_service_endpoint(self): ) return endpoints[0] - def _launch_analytical_engine_locally(self): + def create_analytical_instance(self): logger.info( "Starting GAE rpc service on {} ...".format( str(self._analytical_engine_endpoint) @@ -1085,6 +1090,7 @@ def _launch_analytical_engine_locally(self): ) setattr(self._analytical_engine_process, "stdout_watcher", stdout_watcher) setattr(self._analytical_engine_process, "stderr_watcher", stderr_watcher) + time.sleep(2) # TODO: monitor engine process instead of sleep def _delete_dangling_coordinator(self): # delete service @@ -1130,25 +1136,13 @@ def _exists_vineyard_daemonset(self, release): ) except K8SApiException: return False - else: - return True + return True def start(self): try: self._create_services() self._waiting_for_services_ready() self._dump_resource_object() - logger.info("Engines pod name list: {}".format(self._pod_name_list)) - logger.info("Engines pod ip list: {}".format(self._pod_ip_list)) - logger.info("Engines pod host ip list: {}".format(self._pod_host_ip_list)) - logger.info( - "Vineyard service endpoint: {}".format(self._vineyard_service_endpoint) - ) - if self._saved_locals["with_mars"]: - logger.info( - "Mars service endpoint: {}".format(self._mars_service_endpoint) - ) - self._launch_analytical_engine_locally() except Exception as e: time.sleep(1) logger.error( @@ -1207,11 +1201,6 @@ def stop(self, is_dangling=False): self._delete_dangling_coordinator() self._closed = True - def poll(self): - if self._analytical_engine_process: - return self._analytical_engine_process.poll() - return -1 - def create_learning_instance(self, object_id, handle, config): # allocate service for ports self._create_graphlearn_service( diff --git a/coordinator/gscoordinator/launcher.py b/coordinator/gscoordinator/launcher.py index 98bdcdd6580c..aa51ab119956 100644 --- a/coordinator/gscoordinator/launcher.py +++ b/coordinator/gscoordinator/launcher.py @@ -16,651 +16,144 @@ # limitations under the License. 
# -import base64 -import json import logging import os -import shutil -import socket -import subprocess -import sys -import time from abc import ABCMeta from abc import abstractmethod -from graphscope.framework.utils import PipeWatcher -from graphscope.framework.utils import get_free_port -from graphscope.framework.utils import get_java_version -from graphscope.framework.utils import get_tempdir -from graphscope.framework.utils import is_free_port -from graphscope.proto import types_pb2 - -from gscoordinator.utils import ANALYTICAL_ENGINE_PATH from gscoordinator.utils import GRAPHSCOPE_HOME -from gscoordinator.utils import INTERACTIVE_ENGINE_SCRIPT -from gscoordinator.utils import INTERACTIVE_ENGINE_THREADS_PER_WORKER -from gscoordinator.utils import WORKSPACE -from gscoordinator.utils import ResolveMPICmdPrefix -from gscoordinator.utils import get_timestamp -from gscoordinator.utils import parse_as_glog_level logger = logging.getLogger("graphscope") -class Launcher(metaclass=ABCMeta): +def configure_environ(): + # add `${GRAPHSCOPE_HOME}/bin` to ${PATH} + os.environ["PATH"] += os.pathsep + os.path.join(GRAPHSCOPE_HOME, "bin") + # OPAL_PREFIX for openmpi + if os.path.isdir(os.path.join(GRAPHSCOPE_HOME, "openmpi")): + os.environ["OPAL_PREFIX"] = os.path.join(GRAPHSCOPE_HOME, "openmpi") + # Darwin is open-mpi + if os.path.isdir(os.path.join(GRAPHSCOPE_HOME, "open-mpi")): + os.environ["OPAL_PREFIX"] = os.path.join(GRAPHSCOPE_HOME, "open-mpi") + # add '${GRAPHSCOPE_HOME}/lib' to ${LD_LIBRARY_PATH} to find libvineyard_internal_registry.so(dylib) + if "LD_LIBRARY_PATH" in os.environ: + os.environ["LD_LIBRARY_PATH"] = ( + os.path.join(GRAPHSCOPE_HOME, "lib") + + os.pathsep + + os.environ["LD_LIBRARY_PATH"] + ) + else: + os.environ["LD_LIBRARY_PATH"] = os.path.join(GRAPHSCOPE_HOME, "lib") + if "DYLD_LIBRARY_PATH" in os.environ: + os.environ["DYLD_LIBRARY_PATH"] = ( + os.path.join(GRAPHSCOPE_HOME, "lib") + + os.pathsep + + os.environ["DYLD_LIBRARY_PATH"] + ) + else: + os.environ["DYLD_LIBRARY_PATH"] = os.path.join(GRAPHSCOPE_HOME, "lib") + + +class AbstractLauncher(metaclass=ABCMeta): def __init__(self): self._instance_id = None self._num_workers = None + self._hosts = "" self._analytical_engine_endpoint = None - - # add `${GRAPHSCOPE_HOME}/bin` to ${PATH} - os.environ["PATH"] += os.pathsep + os.path.join(GRAPHSCOPE_HOME, "bin") - # OPAL_PREFIX for openmpi - if os.path.isdir(os.path.join(GRAPHSCOPE_HOME, "openmpi")): - os.environ["OPAL_PREFIX"] = os.path.join(GRAPHSCOPE_HOME, "openmpi") - # Darwin is open-mpi - if os.path.isdir(os.path.join(GRAPHSCOPE_HOME, "open-mpi")): - os.environ["OPAL_PREFIX"] = os.path.join(GRAPHSCOPE_HOME, "open-mpi") - # add '${GRAPHSCOPE_HOME}/lib' to ${LD_LIBRARY_PATH} to find libvineyard_internal_registry.so(dylib) - if "LD_LIBRARY_PATH" in os.environ: - os.environ["LD_LIBRARY_PATH"] = ( - os.path.join(GRAPHSCOPE_HOME, "lib") - + os.pathsep - + os.environ["LD_LIBRARY_PATH"] - ) - else: - os.environ["LD_LIBRARY_PATH"] = os.path.join(GRAPHSCOPE_HOME, "lib") - if "DYLD_LIBRARY_PATH" in os.environ: - os.environ["DYLD_LIBRARY_PATH"] = ( - os.path.join(GRAPHSCOPE_HOME, "lib") - + os.pathsep - + os.environ["DYLD_LIBRARY_PATH"] - ) - else: - os.environ["DYLD_LIBRARY_PATH"] = os.path.join(GRAPHSCOPE_HOME, "lib") - - @property - def analytical_engine_endpoint(self): - if self._analytical_engine_endpoint is None: - raise RuntimeError("Get None value of analytical engine endpoint.") - return str(self._analytical_engine_endpoint) - - @property - def num_workers(self): - if 
self._num_workers is None: - raise RuntimeError("Get None value of workers number.") - return int(self._num_workers) - - @property - def instance_id(self): - if self._instance_id is None: - raise RuntimeError("Get None value of instance id.") - return self._instance_id + self._session_workspace = None + configure_environ() @abstractmethod - def type(self): + def create_analytical_instance(self): pass @abstractmethod - def start(self): + def create_interactive_instance(self, object_id: int, schema_path: str): pass @abstractmethod - def stop(self, is_dangling=False): + def create_learning_instance(self, object_id: int, handle: str, config: str): pass @abstractmethod - def poll(self): + def close_analytical_instance(self): pass + @abstractmethod + def close_interactive_instance(self, object_id: int): + pass -class LocalLauncher(Launcher): - """ - Launch engine localy with serveral hosts. - """ - - def __init__( - self, - num_workers, - hosts, - etcd_addrs, - etcd_listening_client_port, - etcd_listening_peer_port, - vineyard_socket, - shared_mem, - log_level, - instance_id, - timeout_seconds, - ): - super().__init__() - self._num_workers = num_workers - self._hosts = hosts - self._etcd_addrs = etcd_addrs - self._etcd_listening_client_port = etcd_listening_client_port - self._etcd_listening_peer_port = etcd_listening_peer_port - self._vineyard_socket = vineyard_socket - self._shared_mem = shared_mem - self._glog_level = parse_as_glog_level(log_level) - self._instance_id = instance_id - self._timeout_seconds = timeout_seconds - - self._vineyard_socket_prefix = os.path.join(get_tempdir(), "vineyard.sock.") - - # A graphsope instance may has multiple session by reconnecting to coordinator - self._instance_workspace = os.path.join(WORKSPACE, self._instance_id) - os.makedirs(self._instance_workspace, exist_ok=True) - # setting during client connect to coordinator - self._session_workspace = None - - # etcd - self._etcd_peer_port = None - self._etcd_client_port = None - self._etcd_process = None - # vineyardd - self._vineyard_rpc_port = None - self._vineyardd_process = None - # analytical engine - self._analytical_engine_process = None + @abstractmethod + def close_learning_instance(self, object_id: int): + pass - # interactive engine - # executor inter-processing port - # executor rpc port - # frontend port - self._interactive_port = 8233 - while not is_free_port(self._interactive_port): - self._interactive_port += 10 + @abstractmethod + def launch_etcd(self): + pass - # learning instance processes - self._learning_instance_processes = {} + @abstractmethod + def launch_vineyard(self): + pass - self._closed = True + @abstractmethod + def close_etcd(self): + pass - def type(self): - return types_pb2.HOSTS + @abstractmethod + def close_vineyard(self): + pass - def start(self): - try: - self._closed = False - self._create_services() - except Exception as e: - logger.error("Error when launching GraphScope locally: %s", str(e)) - self.stop() - return False - return True + @abstractmethod + def configure_etcd_endpoint(self): + pass - def stop(self, is_dangling=False): - if not self._closed: - self._stop_interactive_engine_service() - self._stop_analytical_engine() - self._stop_vineyard() - self._stop_etcd() - self._closed = True + @abstractmethod + def get_engine_config(self): + pass - def set_session_workspace(self, session_id): - self._session_workspace = os.path.join(self._instance_workspace, session_id) - os.makedirs(self._session_workspace, exist_ok=True) + @abstractmethod + def 
get_vineyard_stream_info(self): + pass + @abstractmethod def distribute_file(self, path): - d = os.path.dirname(path) - for host in self._hosts.split(","): - if host not in ("localhost", "127.0.0.1"): - subprocess.check_call( - [shutil.which("ssh"), host, "mkdir -p {}".format(d)] - ) - subprocess.check_call( - [shutil.which("scp"), "-r", path, "{}:{}".format(host, path)] - ) - - def poll(self): - if self._analytical_engine_process: - return self._analytical_engine_process.poll() - return -1 + pass @property - def hosts(self): - return self._hosts + def analytical_engine_endpoint(self): + if self._analytical_engine_endpoint is None: + raise RuntimeError("Analytical engine endpoint not set.") + return self._analytical_engine_endpoint @property - def vineyard_socket(self): - return self._vineyard_socket + def num_workers(self): + if self._num_workers is None: + raise RuntimeError("Get None value of workers number.") + return int(self._num_workers) @property - def etcd_port(self): - return self._etcd_client_port - - def get_engine_config(self): - config = { - "engine_hosts": self.hosts, - "mars_endpoint": None, - } - return config - - def get_vineyard_stream_info(self): - return "ssh", self._hosts.split(",") - - def create_interactive_instance(self, config: dict): - """ - Args: - config (dict): dict of op_def_pb2.OpDef.attr. - """ - # check java version - java_version = get_java_version() - logger.info("Java version: %s", java_version) - - object_id = config[types_pb2.VINEYARD_ID].i - schema_path = config[types_pb2.SCHEMA_PATH].s.decode() - env = os.environ.copy() - if ".install_prefix" in INTERACTIVE_ENGINE_SCRIPT: - env.update( - { - "GRAPHSCOPE_HOME": os.path.dirname( - os.path.dirname(INTERACTIVE_ENGINE_SCRIPT) - ) - } - ) - else: - env.update({"GRAPHSCOPE_HOME": GRAPHSCOPE_HOME}) - - # only one GIE/GAIA executor will be launched locally, even there are - # multiple GAE engines - threads_per_worker = int( - os.environ.get("THREADS_PER_WORKER", INTERACTIVE_ENGINE_THREADS_PER_WORKER) - ) - env["THREADS_PER_WORKER"] = str(threads_per_worker * self._num_workers) - - cmd = [ - INTERACTIVE_ENGINE_SCRIPT, - "create_gremlin_instance_on_local", - self._session_workspace, - str(object_id), - schema_path, - "0", # server id - str(self._interactive_port), # executor port - str(self._interactive_port + 1), # executor rpc port - str(self._interactive_port + 2), # frontend port - self.vineyard_socket, - ] - logger.info("Create GIE instance with command: %s", " ".join(cmd)) - self._interactive_port += 3 - process = subprocess.Popen( - cmd, - start_new_session=True, - cwd=os.getcwd(), - env=env, - encoding="utf-8", - errors="replace", - stdin=subprocess.DEVNULL, - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - universal_newlines=True, - bufsize=1, - ) - return process - - def close_interactive_instance(self, object_id): - env = os.environ.copy() - if ".install_prefix" in INTERACTIVE_ENGINE_SCRIPT: - env.update( - { - "GRAPHSCOPE_HOME": os.path.dirname( - os.path.dirname(INTERACTIVE_ENGINE_SCRIPT) - ) - } - ) - else: - env.update({"GRAPHSCOPE_HOME": GRAPHSCOPE_HOME}) - cmd = [ - INTERACTIVE_ENGINE_SCRIPT, - "close_gremlin_instance_on_local", - self._session_workspace, - str(object_id), - ] - logger.info("Close GIE instance with command: %s", " ".join(cmd)) - process = subprocess.Popen( - cmd, - start_new_session=True, - cwd=os.getcwd(), - env=env, - encoding="utf-8", - errors="replace", - stdin=subprocess.DEVNULL, - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - universal_newlines=True, - 
bufsize=1, - ) - return process - - def _find_etcd(self): - etcd = shutil.which("etcd") - if not etcd: - etcd = [sys.executable, "-m", "etcd_distro.etcd"] - if not isinstance(etcd, list): - etcd = [etcd] - return etcd - - def _config_etcd(self): - if self._etcd_addrs is None: - self._launch_etcd() - else: - self._etcd_endpoint = "http://" + self._etcd_addrs - logger.info("External Etcd endpoint is %s", self._etcd_endpoint) - - def _launch_etcd(self): - etcd_exec = self._find_etcd() - if is_free_port(self._etcd_listening_client_port): - self._etcd_client_port = self._etcd_listening_client_port - else: - self._etcd_client_port = get_free_port() - if is_free_port(self._etcd_listening_peer_port): - self._etcd_peer_port = self._etcd_listening_peer_port - else: - self._etcd_peer_port = get_free_port() - if len(self._hosts) > 1: - try: - local_hostname = socket.gethostname() - socket.gethostbyname( - local_hostname - ) # make sure the hostname is dns-resolvable - except Exception: - local_hostname = "127.0.0.1" # fallback to a must-correct hostname - self._etcd_endpoint = "http://{0}:{1}".format( - local_hostname, str(self._etcd_client_port) - ) - else: - self._etcd_endpoint = "http://127.0.0.1:{0}".format( - str(self._etcd_client_port) - ) - - env = os.environ.copy() - env.update({"ETCD_MAX_TXN_OPS": "102400"}) - - cmd = etcd_exec + [ - "--data-dir", - str(self._instance_workspace), - "--listen-peer-urls", - "http://0.0.0.0:{0}".format(str(self._etcd_peer_port)), - "--listen-client-urls", - "http://0.0.0.0:{0}".format(str(self._etcd_client_port)), - "--advertise-client-urls", - self._etcd_endpoint, - "--initial-cluster", - "default=http://127.0.0.1:{0}".format(str(self._etcd_peer_port)), - "--initial-advertise-peer-urls", - "http://127.0.0.1:{0}".format(str(self._etcd_peer_port)), - ] - logger.info("Launch etcd with command: %s", " ".join(cmd)) - - process = subprocess.Popen( - cmd, - start_new_session=True, - cwd=os.getcwd(), - env=env, - encoding="utf-8", - errors="replace", - stdin=subprocess.DEVNULL, - stdout=subprocess.DEVNULL, - stderr=subprocess.DEVNULL, - universal_newlines=True, - bufsize=1, - ) - - logger.info("Server is initializing etcd.") - self._etcd_process = process - - start_time = time.time() - - while is_free_port(self._etcd_client_port): - time.sleep(1) - if ( - self._timeout_seconds - and self._timeout_seconds + start_time < time.time() - ): - raise RuntimeError("Launch etcd service failed due to timeout.") - logger.info( - "Etcd is ready, endpoint is localhost:{0}".format(self._etcd_client_port) - ) - - def _find_vineyardd(self): - vineyardd = "" - if "VINEYARD_HOME" in os.environ: - vineyardd = os.path.expandvars("$VINEYARD_HOME/vineyardd") - if not vineyardd: - vineyardd = shutil.which("vineyardd") - if not vineyardd: - vineyardd = [sys.executable, "-m", "vineyard"] - if not isinstance(vineyardd, list): - vineyardd = [vineyardd] - return vineyardd - - def _create_vineyard(self): - if self._vineyard_socket is not None: - return - - multiple_hosts = [] - for host in self._hosts.split(","): - if ":" in host: - multiple_hosts.append(host + ":1") - else: - multiple_hosts.append(host.split(":")[0] + ":1") - - if len(multiple_hosts) > 1: - rmcp = ResolveMPICmdPrefix() - cmd, mpi_env = rmcp.resolve(len(multiple_hosts), ",".join(multiple_hosts)) - else: - cmd, mpi_env = [], {} - - ts = get_timestamp() - vineyard_socket = f"{self._vineyard_socket_prefix}{ts}" - self._vineyard_rpc_port = 9600 if is_free_port(9600) else get_free_port() - - cmd.extend(self._find_vineyardd()) - 
cmd.extend(["--socket", vineyard_socket]) - cmd.extend(["--rpc_socket_port", str(self._vineyard_rpc_port)]) - cmd.extend(["--size", self._shared_mem]) - cmd.extend(["-etcd_endpoint", self._etcd_endpoint]) - cmd.extend(["-etcd_prefix", f"vineyard.gsa.{ts}"]) - env = os.environ.copy() - env["GLOG_v"] = str(self._glog_level) - env.update(mpi_env) - - logger.info("Launch vineyardd with command: %s", " ".join(cmd)) - - process = subprocess.Popen( - cmd, - start_new_session=True, - cwd=os.getcwd(), - env=env, - encoding="utf-8", - errors="replace", - stdin=subprocess.DEVNULL, - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - universal_newlines=True, - bufsize=1, - ) - - logger.info("Server is initializing vineyardd.") - stdout_watcher = PipeWatcher( - process.stdout, - sys.stdout, - suppressed=(not logger.isEnabledFor(logging.DEBUG)), - ) - setattr(process, "stdout_watcher", stdout_watcher) - - self._vineyard_socket = vineyard_socket - self._vineyardd_process = process - - start_time = time.time() - if len(multiple_hosts) > 1: - time.sleep(5) # should be OK - else: - while not os.path.exists(self._vineyard_socket): - time.sleep(1) - if ( - self._timeout_seconds - and self._timeout_seconds + start_time < time.time() - ): - raise RuntimeError("Launch vineyardd failed due to timeout.") - logger.info( - "Vineyardd is ready, ipc socket is {0}".format(self._vineyard_socket) - ) - - def _start_analytical_engine(self): - rmcp = ResolveMPICmdPrefix() - cmd, mpi_env = rmcp.resolve(self._num_workers, self._hosts) - - master = self._hosts.split(",")[0] - rpc_port = get_free_port(master) - self._analytical_engine_endpoint = f"{master}:{rpc_port}" - - cmd.append(ANALYTICAL_ENGINE_PATH) - cmd.extend(["--host", "0.0.0.0"]) - cmd.extend(["--port", str(rpc_port)]) - cmd.extend(["--vineyard_shared_mem", self._shared_mem]) - - if rmcp.openmpi(): - cmd.extend(["-v", str(self._glog_level)]) - else: - mpi_env["GLOG_v"] = str(self._glog_level) - - if self._vineyard_socket: - cmd.extend(["--vineyard_socket", self._vineyard_socket]) - - env = os.environ.copy() - env.update(mpi_env) - - logger.info("Launch analytical engine with command: %s", " ".join(cmd)) - - process = subprocess.Popen( - cmd, - start_new_session=True, - cwd=os.getcwd(), - env=env, - encoding="utf-8", - errors="replace", - stdin=subprocess.DEVNULL, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - universal_newlines=True, - bufsize=1, - ) - - logger.info("Server is initializing analytical engine.") - stdout_watcher = PipeWatcher(process.stdout, sys.stdout) - stderr_watcher = PipeWatcher(process.stderr, sys.stderr) - setattr(process, "stdout_watcher", stdout_watcher) - setattr(process, "stderr_watcher", stderr_watcher) - - self._analytical_engine_process = process - - start_time = time.time() - - while is_free_port(rpc_port): - time.sleep(1) - if ( - self._timeout_seconds - and self._timeout_seconds + start_time < time.time() - ): - raise RuntimeError("Launch analytical engine failed due to timeout.") - logger.info( - "Analytical engine is ready, endpoint is {0}".format( - self._analytical_engine_endpoint - ) - ) - - def _create_services(self): - # create etcd - self._config_etcd() - # create vineyard - self._create_vineyard() - # create GAE rpc service - self._start_analytical_engine() - if self.poll() is not None and self.poll() != 0: - raise RuntimeError("Initializing analytical engine failed.") - - def create_learning_instance(self, object_id, handle, config): - # prepare argument - handle = 
json.loads(base64.b64decode(handle.encode("utf-8")).decode("utf-8")) - - server_list = [] - for i in range(self._num_workers): - server_list.append(f"localhost:{str(get_free_port('localhost'))}") - hosts = ",".join(server_list) - handle["server"] = hosts - handle = base64.b64encode(json.dumps(handle).encode("utf-8")).decode("utf-8") - - # launch the server - env = os.environ.copy() - # set coordinator dir to PYTHONPATH - if "PYTHONPATH" in env: - env["PYTHONPATH"] = ( - os.path.join(os.path.dirname(__file__), "..") - + os.pathsep - + env["PYTHONPATH"] - ) - else: - env["PYTHONPATH"] = os.path.join(os.path.dirname(__file__), "..") - - self._learning_instance_processes[object_id] = [] - for index in range(self._num_workers): - cmd = [ - sys.executable, - "-m", - "gscoordinator.learning", - handle, - config, - str(index), - ] - logger.debug("launching learning server: %s", " ".join(cmd)) - - proc = subprocess.Popen( - cmd, - env=env, - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - encoding="utf-8", - errors="replace", - universal_newlines=True, - bufsize=1, - ) - stdout_watcher = PipeWatcher( - proc.stdout, - sys.stdout, - suppressed=(not logger.isEnabledFor(logging.DEBUG)), - ) - setattr(proc, "stdout_watcher", stdout_watcher) - self._learning_instance_processes[object_id].append(proc) - - return server_list - - def close_learning_instance(self, object_id): - if object_id not in self._learning_instance_processes: - return + def instance_id(self): + return self._instance_id - # terminate the process - for proc in self._learning_instance_processes[object_id]: - self._stop_subprocess(proc, kill=True) - self._learning_instance_processes.clear() + @property + def hosts(self): + return self._hosts - def _stop_etcd(self): - self._stop_subprocess(self._etcd_process) + @abstractmethod + def type(self): + pass - def _stop_vineyard(self): - self._stop_subprocess(self._vineyardd_process, kill=True) + @abstractmethod + def start(self): + pass - def _stop_interactive_engine_service(self): + @abstractmethod + def stop(self, is_dangling=False): pass - def _stop_analytical_engine(self): - self._stop_subprocess(self._analytical_engine_process, kill=True) - self._analytical_engine_endpoint = None + @abstractmethod + def set_session_workspace(self, session_id): + pass - def _stop_subprocess(self, proc, kill=False): - if proc: - if kill: - proc.kill() - else: - proc.terminate() - proc.wait() - proc = None + def get_namespace(self): + pass diff --git a/coordinator/gscoordinator/learning.py b/coordinator/gscoordinator/learning.py index 4d10a19567c0..f565bb4bfc96 100644 --- a/coordinator/gscoordinator/learning.py +++ b/coordinator/gscoordinator/learning.py @@ -19,7 +19,6 @@ import base64 import json import logging -import os import sys import graphscope.learning.graphlearn as gl diff --git a/coordinator/gscoordinator/local_launcher.py b/coordinator/gscoordinator/local_launcher.py new file mode 100644 index 000000000000..e6863a2e1560 --- /dev/null +++ b/coordinator/gscoordinator/local_launcher.py @@ -0,0 +1,527 @@ +import base64 +import json +import logging +import os +import shutil +import socket +import subprocess +import sys +import time + +from graphscope.framework.utils import PipeWatcher +from graphscope.framework.utils import get_free_port +from graphscope.framework.utils import get_java_version +from graphscope.framework.utils import get_tempdir +from graphscope.framework.utils import is_free_port +from graphscope.proto import types_pb2 + +from gscoordinator.launcher import AbstractLauncher +from 
gscoordinator.utils import ANALYTICAL_ENGINE_PATH +from gscoordinator.utils import GRAPHSCOPE_HOME +from gscoordinator.utils import INTERACTIVE_ENGINE_SCRIPT +from gscoordinator.utils import INTERACTIVE_ENGINE_THREADS_PER_WORKER +from gscoordinator.utils import WORKSPACE +from gscoordinator.utils import ResolveMPICmdPrefix +from gscoordinator.utils import get_timestamp +from gscoordinator.utils import parse_as_glog_level + +logger = logging.getLogger("graphscope") + + +class LocalLauncher(AbstractLauncher): + def __init__( + self, + num_workers: int, + hosts: str, + etcd_addrs: str, + etcd_listening_client_port: int, + etcd_listening_peer_port: int, + vineyard_socket: str, + shared_mem: str, + log_level: str, + instance_id: str, + timeout_seconds: int, + ): + super().__init__() + self._num_workers = num_workers + self._hosts = hosts + + self._external_etcd_addr = etcd_addrs + self._etcd_listening_client_port = etcd_listening_client_port + self._etcd_listening_peer_port = etcd_listening_peer_port + self._vineyard_socket = vineyard_socket + self._shared_mem = shared_mem + + self._glog_level = parse_as_glog_level(log_level) + self._instance_id = instance_id + self._timeout_seconds = timeout_seconds + + self._vineyard_socket_prefix = os.path.join(get_tempdir(), "vineyard.sock.") + + # A graphscope instance may have multiple session by reconnecting to coordinator + self._instance_workspace = os.path.join(WORKSPACE, self._instance_id) + os.makedirs(self._instance_workspace, exist_ok=True) + # setting during client connect to coordinator + self._session_workspace = None + + # etcd + self._etcd_peer_port = None + self._etcd_client_port = None + self._etcd_process = None + self._etcd_endpoint = None + # vineyardd + self._vineyard_rpc_port = None + self._vineyardd_process = None + # analytical engine + self._analytical_engine_process = None + + # interactive engine + # executor inter-processing port + # executor rpc port + # frontend port + self._interactive_port = 8233 + while not is_free_port(self._interactive_port): + self._interactive_port += 10 + + # learning instance processes + self._learning_instance_processes = {} + + def type(self): + return types_pb2.HOSTS + + def stop(self, is_dangling=False): + self.close_analytical_instance() + self.close_vineyard() + self.close_etcd() + + def set_session_workspace(self, session_id): + self._session_workspace = os.path.join(self._instance_workspace, session_id) + os.makedirs(self._session_workspace, exist_ok=True) + + def get_namespace(self): + return "" + + @property + def hosts(self): + return self._hosts + + @property + def vineyard_socket(self): + return self._vineyard_socket + + @property + def etcd_port(self): + return self._etcd_client_port + + def create_analytical_instance(self): + mpi_resolver = ResolveMPICmdPrefix() + cmd, mpi_env = mpi_resolver.resolve(self._num_workers, self._hosts) + + master = self.hosts.split(",")[0] + rpc_port = get_free_port(master) + self._analytical_engine_endpoint = f"{master}:{rpc_port}" + + cmd.append(ANALYTICAL_ENGINE_PATH) + cmd.extend(["--host", "0.0.0.0"]) + cmd.extend(["--port", str(rpc_port)]) + cmd.extend(["--vineyard_shared_mem", self._shared_mem]) + + if mpi_resolver.openmpi(): + cmd.extend(["-v", str(self._glog_level)]) + else: + mpi_env["GLOG_v"] = str(self._glog_level) + + if self.vineyard_socket is not None: + cmd.extend(["--vineyard_socket", self.vineyard_socket]) + + env = os.environ.copy() + env.update(mpi_env) + + logger.info("Launch analytical engine with command: %s", " ".join(cmd)) + + process 
= subprocess.Popen( + cmd, + start_new_session=True, + cwd=os.getcwd(), + env=env, + encoding="utf-8", + errors="replace", + stdin=subprocess.DEVNULL, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + universal_newlines=True, + bufsize=1, + ) + + logger.info("Server is initializing analytical engine.") + stdout_watcher = PipeWatcher(process.stdout, sys.stdout) + stderr_watcher = PipeWatcher(process.stderr, sys.stderr) + setattr(process, "stdout_watcher", stdout_watcher) + setattr(process, "stderr_watcher", stderr_watcher) + + self._analytical_engine_process = process + + start_time = time.time() + while is_free_port(rpc_port): + time.sleep(1) + if self._timeout_seconds + start_time < time.time(): + self._analytical_engine_process.kill() + raise RuntimeError("Launch analytical engine failed due to timeout.") + logger.info( + "Analytical engine is listening on %s", self._analytical_engine_endpoint + ) + + def create_interactive_instance(self, object_id: int, schema_path: str): + # check java version + java_version = get_java_version() + logger.info("Java version: %s", java_version) + + env = os.environ.copy() + env["GRAPHSCOPE_HOME"] = GRAPHSCOPE_HOME + if ".install_prefix" in INTERACTIVE_ENGINE_SCRIPT: + env["GRAPHSCOPE_HOME"] = os.path.dirname( + os.path.dirname(INTERACTIVE_ENGINE_SCRIPT) + ) + + # only one GIE/GAIA executor will be launched locally, even there are + # multiple GAE engines + threads_per_worker = int( + os.environ.get("THREADS_PER_WORKER", INTERACTIVE_ENGINE_THREADS_PER_WORKER) + ) + env["THREADS_PER_WORKER"] = str(threads_per_worker * self._num_workers) + + cmd = [ + INTERACTIVE_ENGINE_SCRIPT, + "create_gremlin_instance_on_local", + self._session_workspace, + str(object_id), + schema_path, + "0", # server id + str(self._interactive_port), # executor port + str(self._interactive_port + 1), # executor rpc port + str(self._interactive_port + 2), # frontend port + self.vineyard_socket, + ] + logger.info("Create GIE instance with command: %s", " ".join(cmd)) + self._interactive_port += 3 + process = subprocess.Popen( + cmd, + start_new_session=True, + cwd=os.getcwd(), + env=env, + encoding="utf-8", + errors="replace", + stdin=subprocess.DEVNULL, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + universal_newlines=True, + bufsize=1, + ) + return process + + def create_learning_instance(self, object_id, handle, config): + # prepare argument + handle = json.loads(base64.b64decode(handle.encode("utf-8")).decode("utf-8")) + + server_list = [ + f"localhost:{get_free_port('localhost')}" for _ in range(self.num_workers) + ] + hosts = ",".join(server_list) + handle["server"] = hosts + handle = base64.b64encode(json.dumps(handle).encode("utf-8")).decode("utf-8") + + # launch the server + env = os.environ.copy() + # set coordinator dir to PYTHONPATH + python_path = ( + env.get("PYTHONPATH", "") + + os.pathsep + + os.path.dirname(os.path.dirname(__file__)) + ) + env["PYTHONPATH"] = python_path + + self._learning_instance_processes[object_id] = [] + for index in range(self._num_workers): + cmd = [ + sys.executable, + "-m", + "gscoordinator.learning", + handle, + config, + str(index), + ] + logger.debug("launching learning server: %s", " ".join(cmd)) + + proc = subprocess.Popen( + cmd, + env=env, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + encoding="utf-8", + errors="replace", + universal_newlines=True, + bufsize=1, + ) + stdout_watcher = PipeWatcher( + proc.stdout, + sys.stdout, + suppressed=(not logger.isEnabledFor(logging.DEBUG)), + ) + setattr(proc, 
"stdout_watcher", stdout_watcher) + self._learning_instance_processes[object_id].append(proc) + return server_list + + def close_analytical_instance(self): + self._stop_subprocess(self._analytical_engine_process, kill=True) + self._analytical_engine_endpoint = None + + def close_interactive_instance(self, object_id): + env = os.environ.copy() + env["GRAPHSCOPE_HOME"] = GRAPHSCOPE_HOME + if ".install_prefix" in INTERACTIVE_ENGINE_SCRIPT: + env["GRAPHSCOPE_HOME"] = os.path.dirname( + os.path.dirname(INTERACTIVE_ENGINE_SCRIPT) + ) + cmd = [ + INTERACTIVE_ENGINE_SCRIPT, + "close_gremlin_instance_on_local", + self._session_workspace, + str(object_id), + ] + logger.info("Close GIE instance with command: %s", " ".join(cmd)) + process = subprocess.Popen( + cmd, + start_new_session=True, + cwd=os.getcwd(), + env=env, + encoding="utf-8", + errors="replace", + universal_newlines=True, + bufsize=1, + ) + # 60 seconds is enough + process.wait(timeout=60) + return process + + def close_learning_instance(self, object_id): + if object_id not in self._learning_instance_processes: + return + + # terminate the process + for proc in self._learning_instance_processes[object_id]: + self._stop_subprocess(proc, kill=True) + self._learning_instance_processes.clear() + + def launch_etcd(self): + if is_free_port(self._etcd_listening_client_port): + self._etcd_client_port = self._etcd_listening_client_port + else: + self._etcd_client_port = get_free_port() + if is_free_port(self._etcd_listening_peer_port): + self._etcd_peer_port = self._etcd_listening_peer_port + else: + self._etcd_peer_port = get_free_port() + + local_hostname = "127.0.0.1" + if len(self._hosts) > 1: + try: + local_hostname = socket.gethostname() + socket.gethostbyname( + local_hostname + ) # make sure the hostname is dns-resolvable + except: # noqa: E722 + local_hostname = "127.0.0.1" # fallback to a must-correct hostname + + self._etcd_endpoint = f"http://{local_hostname}:{self._etcd_client_port}" + + env = os.environ.copy() + env.update({"ETCD_MAX_TXN_OPS": "102400"}) + etcd_exec = self.find_etcd() + cmd = etcd_exec + [ + "--data-dir", + str(self._instance_workspace), + "--listen-peer-urls", + f"http://0.0.0.0:{self._etcd_peer_port}", + "--listen-client-urls", + f"http://0.0.0.0:{self._etcd_client_port}", + "--advertise-client-urls", + self._etcd_endpoint, + "--initial-cluster", + f"default=http://127.0.0.1:{self._etcd_peer_port}", + "--initial-advertise-peer-urls", + f"http://127.0.0.1:{self._etcd_peer_port}", + ] + logger.info("Launch etcd with command: %s", " ".join(cmd)) + logger.info("Server is initializing etcd.") + + self._etcd_process = subprocess.Popen( + cmd, + start_new_session=True, + cwd=os.getcwd(), + env=env, + encoding="utf-8", + errors="replace", + stdin=subprocess.DEVNULL, + stdout=subprocess.DEVNULL, + stderr=subprocess.PIPE, + universal_newlines=True, + bufsize=1, + ) + + start_time = time.time() + while is_free_port(self._etcd_client_port): + time.sleep(1) + if self._timeout_seconds + start_time < time.time(): + self._etcd_process.kill() + _, errs = self._etcd_process.communicate() + logger.error("Start etcd timeout, %s", errs) + raise RuntimeError("Launch etcd service failed due to timeout.") + logger.info("Etcd is ready, endpoint is %s", self._etcd_endpoint) + + def launch_vineyard(self): + if self.vineyard_socket is not None: + logger.info("Found existing vineyard socket: %s", self.vineyard_socket) + return + + hosts = [f"{host.split(':')[0]}:1" for host in self._hosts.split(",")] + + if len(hosts) > 1: # Use MPI to start 
multiple process + mpi_resolver = ResolveMPICmdPrefix() + cmd, mpi_env = mpi_resolver.resolve(len(hosts), ",".join(hosts)) + else: # Start single process without MPI + cmd, mpi_env = [], {} + + ts = get_timestamp() + self._vineyard_socket = f"{self._vineyard_socket_prefix}{ts}" + self._vineyard_rpc_port = 9600 if is_free_port(9600) else get_free_port() + + cmd.extend(self.find_vineyardd()) + cmd.extend(["--socket", self.vineyard_socket]) + cmd.extend(["--rpc_socket_port", str(self._vineyard_rpc_port)]) + cmd.extend(["--size", self._shared_mem]) + cmd.extend(["-etcd_endpoint", self._etcd_endpoint]) + cmd.extend(["-etcd_prefix", f"vineyard.gsa.{ts}"]) + env = os.environ.copy() + env["GLOG_v"] = str(self._glog_level) + env.update(mpi_env) + + logger.info("Launch vineyardd with command: %s", " ".join(cmd)) + logger.info("Server is initializing vineyardd.") + + process = subprocess.Popen( + cmd, + start_new_session=True, + cwd=os.getcwd(), + env=env, + encoding="utf-8", + errors="replace", + stdin=subprocess.DEVNULL, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + universal_newlines=True, + bufsize=1, + ) + + stdout_watcher = PipeWatcher( + process.stdout, + sys.stdout, + suppressed=(not logger.isEnabledFor(logging.DEBUG)), + ) + setattr(process, "stdout_watcher", stdout_watcher) + + self._vineyardd_process = process + + start_time = time.time() + if len(hosts) > 1: + time.sleep(5) # should be OK + else: + while not os.path.exists(self._vineyard_socket): + time.sleep(1) + if self._vineyardd_process.poll() is not None: + msg = "Launch vineyardd failed." + msg += "\nRerun with `graphscope.set_option(log_level='debug')`," + msg += " to get verbosed vineyardd logs." + raise RuntimeError(msg) + if self._timeout_seconds + start_time < time.time(): + self._vineyardd_process.kill() + # outs, _ = self._vineyardd_process.communicate() + # logger.error("Start vineyardd timeout, %s", outs) + raise RuntimeError("Launch vineyardd failed due to timeout.") + logger.info( + "Vineyardd is ready, ipc socket is {0}".format(self._vineyard_socket) + ) + + def close_etcd(self): + self._stop_subprocess(self._etcd_process) + + def close_vineyard(self): + self._stop_subprocess(self._vineyardd_process, kill=True) + + @staticmethod + def _stop_subprocess(proc, kill=False) -> None: + if proc is not None: + if kill: + proc.kill() + else: + proc.terminate() + + def distribute_file(self, path) -> None: + d = os.path.dirname(path) + for host in self.hosts.split(","): + if host not in ("localhost", "127.0.0.1"): + # TODO: handle failure, The error message is in CallProcessError.output as bytes + subprocess.check_output( + [shutil.which("ssh"), host, "mkdir -p {}".format(d)], + stderr=subprocess.STDOUT, + ) + subprocess.check_output( + [shutil.which("scp"), "-r", path, "{}:{}".format(host, path)], + stderr=subprocess.STDOUT, + ) + + @staticmethod + def find_etcd() -> [str]: + etcd = shutil.which("etcd") + if etcd is None: + etcd = [sys.executable, "-m", "etcd_distro.etcd"] + else: + etcd = [etcd] + return etcd + + @staticmethod + def find_vineyardd() -> [str]: + vineyardd = None + if "VINEYARD_HOME" in os.environ: + vineyardd = os.path.expandvars("$VINEYARD_HOME/vineyardd") + if vineyardd is None: + vineyardd = shutil.which("vineyardd") + if vineyardd is None: + vineyardd = [sys.executable, "-m", "vineyard"] + else: + vineyardd = [vineyardd] + return vineyardd + + def configure_etcd_endpoint(self): + if self._external_etcd_addr is None: + self.launch_etcd() + logger.info("etcd cluster created") + else: + 
self._etcd_endpoint = f"http://{self._external_etcd_addr}" + logger.info("Using etcd cluster") + logger.info("etcd endpoint is %s", self._etcd_endpoint) + + def start(self): + # create etcd + self.configure_etcd_endpoint() + # create vineyard + self.launch_vineyard() + + def get_engine_config(self) -> dict: + config = { + "engine_hosts": self._hosts, + "mars_endpoint": None, + } + return config + + def get_vineyard_stream_info(self): + return "ssh", self.hosts.split(",") diff --git a/coordinator/gscoordinator/monitor.py b/coordinator/gscoordinator/monitor.py index ed3ddd954af3..22ec4c80a81e 100644 --- a/coordinator/gscoordinator/monitor.py +++ b/coordinator/gscoordinator/monitor.py @@ -54,13 +54,9 @@ 21: "VIEW_GRAPH", 22: "INDUCE_SUBGRAPH", 23: "UNLOAD_CONTEXT", - 31: "CREATE_INTERACTIVE_QUERY", 32: "SUBGRAPH", 33: "GREMLIN_QUERY", 34: "FETCH_GREMLIN_RESULT", - 35: "CLOSE_INTERACTIVE_QUERY", - 41: "CREATE_LEARNING_INSTANCE", - 42: "CLOSE_LEARNING_INSTANCE", 46: "DATA_SOURCE", 47: "DATA_SINK", 50: "CONTEXT_TO_NUMPY", @@ -84,8 +80,8 @@ prometheus_client.REGISTRY.unregister(prometheus_client.GC_COLLECTOR) -class TemGuage(object): - """A temporary Gague. +class TemGauge(object): + """A temporary Gauge. It will clear the old metrics once they are collected. """ @@ -124,31 +120,32 @@ class Monitor: data_pat = re.compile(r"^.+Finished\s+(.+val.*),\s+time:\s+(.+)\s+.+$") sessionState = Gauge( - "session_state", "The session's state: 1 contected or 0 closed" + "session_state", + "The session's state: 1 stands for connected or 0 stands for closed", ) analyticalRequestCounter = Counter( "analytical_request", "Count requests of analytical requests" ) # analyticalRequestGauge = Gauge("analytical_request_time", "The analytical opration task time", ["op_name"]) - analyticalRequestGauge = TemGuage( - "analytical_request_time", "The analytical opration task time", ["op_name"] + analyticalRequestGauge = TemGauge( + "analytical_request_time", "The analytical operation task time", ["op_name"] ) interactiveRequestCounter = Counter( "interactive_request", "Count requests of interactive requests" ) interactiveRequestGauge = Gauge( - "interactive_request_time", "The interactive opration task time", ["op_name"] + "interactive_request_time", "The interactive operation task time", ["op_name"] ) - analyticalPerformace = TemGuage( + analyticalPerformance = TemGauge( "analytical_performance", - "The analytical opration task time of each round", + "The analytical operation task time of each round", ["app", "graph", "round"], ) - prometheus_client.REGISTRY.register(analyticalPerformace) + prometheus_client.REGISTRY.register(analyticalPerformance) prometheus_client.REGISTRY.register(analyticalRequestGauge) @classmethod diff --git a/coordinator/gscoordinator/object_manager.py b/coordinator/gscoordinator/object_manager.py index 28a1103a07f2..65d4b4750e8f 100644 --- a/coordinator/gscoordinator/object_manager.py +++ b/coordinator/gscoordinator/object_manager.py @@ -27,34 +27,40 @@ def __init__(self, key, lib_type, lib_path): class GraphMeta(object): - def __init__(self, key, vineyard_id, graph_def, schema_path=None): + def __init__(self, key, object_id, graph_def, schema_path=None): self.key = key self.type = "graph" - self.vineyard_id = vineyard_id + self.object_id = object_id self.graph_def = graph_def self.schema_path = schema_path class InteractiveQueryManager(object): - def __init__(self, key, frontend_endpoint, object_id): - self.key = key + def __init__(self, object_id, endpoint=None): self.type = "gie_manager" # graph 
object id in vineyard self.object_id = object_id - self.graph_url = f"ws://{frontend_endpoint}/gremlin" - self.client = Client(self.graph_url, "g") - self.closed = False + self.endpoint = endpoint + self.client = None + + def set_endpoint(self, endpoint): + self.endpoint = endpoint + + def __del__(self): + if self.client is not None: + try: + self.client.close() + except Exception: + # TODO(siyuan): throws no event loop exception with tornado 5.1.1 + pass def submit(self, message, bindings=None, request_options=None): + if self.client is None: + if self.endpoint is None: + raise RuntimeError("InteractiveQueryManager's endpoint cannot be None") + self.client = Client(f"ws://{self.endpoint}/gremlin", "g") return self.client.submit(message, bindings, request_options) - def close(self): - try: - self.client.close() - except Exception: - pass - self.closed = True - class GremlinResultSet(object): def __init__(self, key, result_set): @@ -64,11 +70,9 @@ def __init__(self, key, result_set): class LearningInstanceManager(object): - def __init__(self, key, object_id): - self.key = key + def __init__(self, object_id): self.type = "gle_manager" self.object_id = object_id - self.closed = False class ObjectManager(object): @@ -89,6 +93,9 @@ def pop(self, key): def keys(self): return self._objects.keys() + def items(self): + return self._objects.items() + def clear(self): self._objects.clear() diff --git a/coordinator/gscoordinator/op_executor.py b/coordinator/gscoordinator/op_executor.py new file mode 100644 index 000000000000..51cbaea848d9 --- /dev/null +++ b/coordinator/gscoordinator/op_executor.py @@ -0,0 +1,790 @@ +import datetime +import json +import logging +import os +import pickle +import random +import zipfile +from concurrent import futures +from io import BytesIO + +import grpc +from graphscope.framework import utils +from graphscope.framework.dag_utils import create_graph +from graphscope.framework.dag_utils import create_loader +from graphscope.framework.errors import AnalyticalEngineInternalError +from graphscope.framework.graph_utils import normalize_parameter_edges +from graphscope.framework.graph_utils import normalize_parameter_vertices +from graphscope.framework.loader import Loader +from graphscope.framework.utils import find_java +from graphscope.framework.utils import get_tempdir +from graphscope.framework.utils import normalize_data_type_str +from graphscope.proto import attr_value_pb2 +from graphscope.proto import engine_service_pb2_grpc +from graphscope.proto import graph_def_pb2 +from graphscope.proto import message_pb2 +from graphscope.proto import op_def_pb2 +from graphscope.proto import types_pb2 +from graphscope.proto.error_codes_pb2 import OK + +from gscoordinator.monitor import Monitor +from gscoordinator.object_manager import GraphMeta +from gscoordinator.object_manager import GremlinResultSet +from gscoordinator.object_manager import LibMeta +from gscoordinator.utils import ANALYTICAL_BUILTIN_SPACE +from gscoordinator.utils import ANALYTICAL_ENGINE_JAVA_INIT_CLASS_PATH +from gscoordinator.utils import ANALYTICAL_ENGINE_JAVA_JVM_OPTS +from gscoordinator.utils import GS_GRPC_MAX_MESSAGE_LENGTH +from gscoordinator.utils import INTERACTIVE_ENGINE_THREADS_PER_WORKER +from gscoordinator.utils import RESOURCE_DIR_NAME +from gscoordinator.utils import WORKSPACE +from gscoordinator.utils import compile_app +from gscoordinator.utils import compile_graph_frame +from gscoordinator.utils import create_single_op_dag +from gscoordinator.utils import dump_string +from 
gscoordinator.utils import get_app_sha256 +from gscoordinator.utils import get_graph_sha256 +from gscoordinator.utils import get_lib_path +from gscoordinator.utils import op_pre_process +from gscoordinator.utils import to_maxgraph_schema + +logger = logging.getLogger("graphscope") + + +class OperationExecutor: + def __init__(self, session_id: str, launcher, object_manager): + self._session_id = session_id + self._launcher = launcher + + self._object_manager = object_manager + + self._key_to_op = {} + # dict of op_def_pb2.OpResult + self._op_result_pool = {} + + # Analytical engine attributes + # ============================ + self._analytical_grpc_stub = None + # java class path should contain + # 1) java runtime path + # 2) uploaded resources, the recent uploaded resource will be placed first. + self._java_class_path = ANALYTICAL_ENGINE_JAVA_INIT_CLASS_PATH + self._jvm_opts = ANALYTICAL_ENGINE_JAVA_JVM_OPTS + # runtime workspace, consisting of some libraries, logs, etc. + self._builtin_workspace = os.path.join(WORKSPACE, "builtin") + # udf app workspace and resource directory should be bound to a specific session when client connect. + self._udf_app_workspace = os.path.join( + WORKSPACE, launcher.instance_id, session_id + ) + self._resource_dir = os.path.join( + WORKSPACE, launcher.instance_id, session_id, RESOURCE_DIR_NAME + ) + + def run_step(self, dag_def, dag_bodies): + def _generate_runstep_request(session_id, dag_def, dag_bodies): + runstep_requests = [ + message_pb2.RunStepRequest( + head=message_pb2.RunStepRequestHead( + session_id=session_id, dag_def=dag_def + ) + ) + ] + # head + runstep_requests.extend(dag_bodies) + for item in runstep_requests: + yield item + + requests = _generate_runstep_request(self._session_id, dag_def, dag_bodies) + # response + response_head, response_bodies = None, [] + try: + responses = self.analytical_grpc_stub.RunStep(requests) + for response in responses: + if response.HasField("head"): + response_head = response + else: + response_bodies.append(response) + return response_head, response_bodies + except grpc.RpcError as e: + if e.code() == grpc.StatusCode.INTERNAL: + # TODO: make the stacktrace separated from normal error messages + # Too verbose. + if len(e.details()) > 3072: # 3k bytes + msg = f"{e.details()[:30]} ... [truncated]" + else: + msg = e.details() + raise AnalyticalEngineInternalError(msg) + else: + raise + + def pre_process(self, dag_def, dag_bodies, loader_op_bodies): + for op in dag_def.op: + self._key_to_op[op.key] = op + op_pre_process( + op, + self._op_result_pool, + self._key_to_op, + engine_hosts=self._launcher.hosts, + engine_java_class_path=self._java_class_path, # may be needed in CREATE_GRAPH or RUN_APP + engine_jvm_opts=self._jvm_opts, + ) + + # Handle op that depends on loader (data source) + if op.op == types_pb2.CREATE_GRAPH or op.op == types_pb2.ADD_LABELS: + for key_of_parent_op in op.parents: + parent_op = self._key_to_op[key_of_parent_op] + if parent_op.op == types_pb2.DATA_SOURCE: + # handle bodies of loader op + if parent_op.key in loader_op_bodies: + dag_bodies.extend(loader_op_bodies[parent_op.key]) + + # Compile app or not. + if op.op == types_pb2.BIND_APP: + op, _, _ = self._maybe_compile_app(op) + + # Compile graph or not + # arrow property graph and project graph need to compile + # If engine crashed, we will get a SocketClosed grpc Exception. + # In that case, we should notify client the engine is dead. 
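# A minimal standalone sketch of the head/bodies framing used by run_step()
# above: one "head" message carries the session id and dag_def, and any large
# attributes follow as separate "body" chunks on the same request stream.
# Plain dicts stand in for the RunStepRequest protobufs; the names below are
# illustrative only and not part of this patch.
def _sketch_generate_requests(session_id, dag, bodies):
    yield {"head": {"session_id": session_id, "dag_def": dag}}
    for body in bodies:  # e.g. chunked loader payloads
        yield {"body": body}

_reqs = list(_sketch_generate_requests("sess-1", ["op_a", "op_b"], [b"chunk-0"]))
assert "head" in _reqs[0] and len(_reqs) == 2
# The `if` that follows continues pre_process(): only ARROW_PROPERTY graph
# creation and the transform/project/add-labels ops need a pre-registered
# graph frame before they can run.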
+ if ( + ( + op.op == types_pb2.CREATE_GRAPH + and op.attr[types_pb2.GRAPH_TYPE].i == graph_def_pb2.ARROW_PROPERTY + ) + or op.op == types_pb2.TRANSFORM_GRAPH + or op.op == types_pb2.PROJECT_TO_SIMPLE + or op.op == types_pb2.ADD_LABELS + ): + op = self._maybe_register_graph(op) + return dag_def, dag_bodies + + @Monitor.runOnAnalyticalEngine + def run_on_analytical_engine( + self, dag_def, dag_bodies, loader_op_bodies + ): # noqa: C901 + # preprocess of op before run on analytical engine + dag_def, dag_bodies = self.pre_process(dag_def, dag_bodies, loader_op_bodies) + # generate runstep requests, and run on analytical engine + response_head, response_bodies = self.run_step(dag_def, dag_bodies) + response_head, response_bodies = self.post_process( + response_head, response_bodies + ) + return response_head, response_bodies + + def post_process(self, response_head, response_bodies): + # handle result from response stream + if response_head is None: + raise AnalyticalEngineInternalError( + "Missing head from the response stream." + ) + for op_result in response_head.head.results: + # record result in coordinator, which doesn't contain large data + self._op_result_pool[op_result.key] = op_result + # get the op corresponding to the result + op = self._key_to_op[op_result.key] + # register graph and dump graph schema + if op.op in ( + types_pb2.CREATE_GRAPH, + types_pb2.PROJECT_GRAPH, + types_pb2.PROJECT_TO_SIMPLE, + types_pb2.TRANSFORM_GRAPH, + types_pb2.ADD_LABELS, + types_pb2.ADD_COLUMN, + ): + schema_path = os.path.join( + get_tempdir(), op_result.graph_def.key + ".json" + ) + vy_info = graph_def_pb2.VineyardInfoPb() + op_result.graph_def.extension.Unpack(vy_info) + self._object_manager.put( + op_result.graph_def.key, + GraphMeta( + op_result.graph_def.key, + vy_info.vineyard_id, + op_result.graph_def, + schema_path, + ), + ) + if op_result.graph_def.graph_type == graph_def_pb2.ARROW_PROPERTY: + dump_string( + to_maxgraph_schema(vy_info.property_schema_json), + schema_path, + ) + vy_info.schema_path = schema_path + op_result.graph_def.extension.Pack(vy_info) + # register app + elif op.op == types_pb2.BIND_APP: + _, app_sig, app_lib_path = self._maybe_compile_app(op) + self._object_manager.put( + app_sig, + LibMeta(op_result.result.decode("utf-8"), "app", app_lib_path), + ) + # unregister graph + elif op.op == types_pb2.UNLOAD_GRAPH: + self._object_manager.pop(op.attr[types_pb2.GRAPH_NAME].s.decode()) + # unregister app + elif op.op == types_pb2.UNLOAD_APP: + self._object_manager.pop(op.attr[types_pb2.APP_NAME].s.decode()) + return response_head, response_bodies + + # Analytical engine related operations + # ==================================== + def _maybe_compile_app(self, op): + app_sig = get_app_sha256(op.attr, self._java_class_path) + # try to get compiled file from GRAPHSCOPE_HOME/precompiled + app_lib_path = get_lib_path( + os.path.join(ANALYTICAL_BUILTIN_SPACE, app_sig), app_sig + ) + if not os.path.isfile(app_lib_path): + algo_name = op.attr[types_pb2.APP_ALGO].s.decode("utf-8") + if ( + types_pb2.GAR in op.attr + or algo_name.startswith("giraph:") + or algo_name.startswith("java_pie:") + ): + space = self._udf_app_workspace + else: + space = self._builtin_workspace + # try to get compiled file from workspace + app_lib_path = get_lib_path(os.path.join(space, app_sig), app_sig) + if not os.path.isfile(app_lib_path): + # compile and distribute + compiled_path = self._compile_lib_and_distribute( + compile_app, app_sig, op + ) + if app_lib_path != compiled_path: + msg = f"Computed app 
library path != compiled path, {app_lib_path} versus {compiled_path}" + raise RuntimeError(msg) + op.attr[types_pb2.APP_LIBRARY_PATH].CopyFrom( + attr_value_pb2.AttrValue(s=app_lib_path.encode("utf-8")) + ) + return op, app_sig, app_lib_path + + def _maybe_register_graph(self, op): + graph_sig = get_graph_sha256(op.attr) + # try to get compiled file from GRAPHSCOPE_HOME/precompiled/builtin + graph_lib_path = get_lib_path( + os.path.join(ANALYTICAL_BUILTIN_SPACE, graph_sig), graph_sig + ) + if not os.path.isfile(graph_lib_path): + space = self._builtin_workspace + # try to get compiled file from workspace + graph_lib_path = get_lib_path(os.path.join(space, graph_sig), graph_sig) + if not os.path.isfile(graph_lib_path): + # compile and distribute + compiled_path = self._compile_lib_and_distribute( + compile_graph_frame, graph_sig, op + ) + if graph_lib_path != compiled_path: + raise RuntimeError( + f"Computed graph library path not equal to compiled path, {graph_lib_path} versus {compiled_path}" + ) + if graph_sig not in self._object_manager: + dag_def = create_single_op_dag( + types_pb2.REGISTER_GRAPH_TYPE, + config={ + types_pb2.GRAPH_LIBRARY_PATH: attr_value_pb2.AttrValue( + s=graph_lib_path.encode("utf-8") + ), + types_pb2.TYPE_SIGNATURE: attr_value_pb2.AttrValue( + s=graph_sig.encode("utf-8") + ), + types_pb2.GRAPH_TYPE: attr_value_pb2.AttrValue( + i=op.attr[types_pb2.GRAPH_TYPE].i + ), + }, + ) + try: + response_head, _ = self.run_on_analytical_engine(dag_def, [], {}) + except grpc.RpcError as e: + logger.error( + "Register graph failed, code: %s, details: %s", + e.code().name, + e.details(), + ) + if e.code() == grpc.StatusCode.INTERNAL: + raise AnalyticalEngineInternalError(e.details()) + else: + raise + self._object_manager.put( + graph_sig, + LibMeta( + response_head.head.results[0].result, + "graph_frame", + graph_lib_path, + ), + ) + op.attr[types_pb2.TYPE_SIGNATURE].CopyFrom( + attr_value_pb2.AttrValue(s=graph_sig.encode("utf-8")) + ) + return op + + def _create_analytical_grpc_stub(self): + options = [ + ("grpc.max_send_message_length", GS_GRPC_MAX_MESSAGE_LENGTH), + ("grpc.max_receive_message_length", GS_GRPC_MAX_MESSAGE_LENGTH), + ("grpc.max_metadata_size", GS_GRPC_MAX_MESSAGE_LENGTH), + ] + channel = grpc.insecure_channel( + self._launcher.analytical_engine_endpoint, options=options + ) + return engine_service_pb2_grpc.EngineServiceStub(channel) + + @property + def analytical_grpc_stub(self): + if self._launcher.analytical_engine_endpoint is None: + raise RuntimeError("Analytical engine endpoint not set.") + if self._analytical_grpc_stub is None: + self._analytical_grpc_stub = self._create_analytical_grpc_stub() + return self._analytical_grpc_stub + + def get_analytical_engine_config(self) -> {}: + dag_def = create_single_op_dag(types_pb2.GET_ENGINE_CONFIG) + response_head, _ = self.run_on_analytical_engine(dag_def, [], {}) + config = json.loads(response_head.head.results[0].result.decode("utf-8")) + config["engine_hosts"] = self._launcher.hosts + # Disable ENABLE_JAVA_SDK when java is not installed on coordinator + if config["enable_java_sdk"] == "ON": + try: + find_java() + except RuntimeError: + logger.warning( + "Disable java sdk support since java is not installed on coordinator" + ) + config["enable_java_sdk"] = "OFF" + return config + + def _compile_lib_and_distribute(self, compile_func, lib_name, op): + algo_name = op.attr[types_pb2.APP_ALGO].s.decode("utf-8") + if ( + types_pb2.GAR in op.attr + or algo_name.startswith("giraph:") + or 
algo_name.startswith("java_pie:") + ): + space = self._udf_app_workspace + else: + space = self._builtin_workspace + app_lib_path, java_jar_path, java_ffi_path, app_type = compile_func( + space, + lib_name, + op.attr, + self.get_analytical_engine_config(), + self._java_class_path, + ) + # for java app compilation, we need to distribute the jar and ffi generated + if app_type == "java_pie": + self._launcher.distribute_file(java_jar_path) + self._launcher.distribute_file(java_ffi_path) + self._launcher.distribute_file(app_lib_path) + return app_lib_path + + def heart_beat(self, request): + return self.analytical_grpc_stub.HeartBeat(request) + + def add_lib(self, request): + os.makedirs(self._resource_dir, exist_ok=True) + fp = BytesIO(request.gar) + with zipfile.ZipFile(fp, "r") as zip_ref: + zip_ref.extractall(self._resource_dir) + logger.info( + "Coordinator received add lib request with file: %s", zip_ref.namelist() + ) + if len(zip_ref.namelist()) != 1: + raise RuntimeError("Expect only one resource in one gar") + filename = zip_ref.namelist()[0] + filename = os.path.join(self._resource_dir, filename) + self._launcher.distribute_file(filename) + logger.info("Successfully distributed %s", filename) + if filename.endswith(".jar"): + logger.info("adding lib to java class path since it ends with .jar") + self._java_class_path = filename + ":" + self._java_class_path + logger.info("current java class path: %s", self._java_class_path) + + # Interactive engine related operations + # ===================================== + @Monitor.runOnInteractiveEngine + def run_on_interactive_engine(self, dag_def: op_def_pb2.DagDef): + response_head = message_pb2.RunStepResponseHead() + for op in dag_def.op: + self._key_to_op[op.key] = op + op_pre_process(op, self._op_result_pool, self._key_to_op) + if op.op == types_pb2.GREMLIN_QUERY: + op_result = self._execute_gremlin_query(op) + elif op.op == types_pb2.FETCH_GREMLIN_RESULT: + op_result = self._fetch_gremlin_result(op) + elif op.op == types_pb2.SUBGRAPH: + op_result = self._gremlin_to_subgraph(op) + else: + raise RuntimeError("Unsupported op type: " + str(op.op)) + response_head.results.append(op_result) + # record op result + self._op_result_pool[op.key] = op_result + return message_pb2.RunStepResponse(head=response_head), [] + + def _execute_gremlin_query(self, op: op_def_pb2.OpDef): + logger.info("execute gremlin query") + message = op.attr[types_pb2.GIE_GREMLIN_QUERY_MESSAGE].s.decode() + request_options = None + if types_pb2.GIE_GREMLIN_REQUEST_OPTIONS in op.attr: + request_options = json.loads( + op.attr[types_pb2.GIE_GREMLIN_REQUEST_OPTIONS].s.decode() + ) + object_id = op.attr[types_pb2.VINEYARD_ID].i + gremlin_client = self._object_manager.get(object_id) + rlt = gremlin_client.submit(message, request_options=request_options) + logger.info("put %s, client %s", op.key, gremlin_client) + self._object_manager.put(op.key, GremlinResultSet(op.key, rlt)) + return op_def_pb2.OpResult(code=OK, key=op.key) + + def _fetch_gremlin_result(self, op: op_def_pb2.OpDef): + fetch_result_type = op.attr[types_pb2.GIE_GREMLIN_FETCH_RESULT_TYPE].s.decode() + key_of_parent_op = op.parents[0] + result_set = self._object_manager.get(key_of_parent_op).result_set + if fetch_result_type == "one": + rlt = result_set.one() + elif fetch_result_type == "all": + rlt = result_set.all().result() + else: + raise RuntimeError("Not supported fetch result type: " + fetch_result_type) + # Large data should be fetched use gremlin pagination + # meta = 
op_def_pb2.OpResult.Meta(has_large_result=True) + return op_def_pb2.OpResult( + code=OK, + key=op.key, + result=pickle.dumps(rlt), + ) + + def _gremlin_to_subgraph(self, op: op_def_pb2.OpDef): + gremlin_script = op.attr[types_pb2.GIE_GREMLIN_QUERY_MESSAGE].s.decode() + oid_type = op.attr[types_pb2.OID_TYPE].s.decode() + request_options = None + if types_pb2.GIE_GREMLIN_REQUEST_OPTIONS in op.attr: + request_options = json.loads( + op.attr[types_pb2.GIE_GREMLIN_REQUEST_OPTIONS].s.decode() + ) + object_id = op.attr[types_pb2.VINEYARD_ID].i + gremlin_client = self._object_manager.get(object_id) + + def create_global_graph_builder( + graph_name, num_workers, threads_per_executor, vineyard_rpc_endpoint + ): + import vineyard + + vineyard_client = vineyard.connect(*vineyard_rpc_endpoint.split(":")) + + instances = [key for key in vineyard_client.meta] + + # duplicate each instances for each thread per worker. + chunk_instances = [ + key for key in instances for _ in range(threads_per_executor) + ] + + # build the vineyard::GlobalPGStream + metadata = vineyard.ObjectMeta() + metadata.set_global(True) + metadata["typename"] = "vineyard::htap::GlobalPGStream" + metadata["local_stream_chunks"] = threads_per_executor + metadata["total_stream_chunks"] = len(chunk_instances) + + # build the parallel stream for edge + edge_metadata = vineyard.ObjectMeta() + edge_metadata.set_global(True) + edge_metadata["typename"] = "vineyard::ParallelStream" + edge_metadata["__streams_-size"] = len(chunk_instances) + + # build the parallel stream for vertex + vertex_metadata = vineyard.ObjectMeta() + vertex_metadata.set_global(True) + vertex_metadata["typename"] = "vineyard::ParallelStream" + vertex_metadata["__streams_-size"] = len(chunk_instances) + + # NB: we don't respect `num_workers`, instead, we create a substream + # on each vineyard instance. + # + # Such a choice is to handle cases where that etcd instance still contains + # information about dead instances. + # + # It should be ok, as each engine work will get its own local stream. But, + # generally it should be equal to `num_workers`. 
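# To make the chunk layout above concrete: each vineyard instance id is
# repeated threads_per_executor times, so with two (hypothetical) instances
# and three threads per executor, workers 0..5 land on [a, a, a, b, b, b].
# A plain-Python sketch, independent of the vineyard client:
_instances = ["instance-a", "instance-b"]  # hypothetical instance ids
_threads_per_executor = 3
_chunk_instances = [key for key in _instances for _ in range(_threads_per_executor)]
assert _chunk_instances == ["instance-a"] * 3 + ["instance-b"] * 3
# The loop that follows then creates, for every (worker, instance) pair, one
# edge RecordBatchStream, one vertex RecordBatchStream and one
# PropertyGraphOutStream chunk on that instance.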
+ for worker, instance_id in enumerate(chunk_instances): + edge_stream = vineyard.ObjectMeta() + edge_stream["typename"] = "vineyard::RecordBatchStream" + edge_stream["nbytes"] = 0 + edge_stream["params_"] = json.dumps( + { + "graph_name": graph_name, + "kind": "edge", + } + ) + edge = vineyard_client.create_metadata(edge_stream, instance_id) + vineyard_client.persist(edge.id) + edge_metadata.add_member("__streams_-%d" % worker, edge) + + vertex_stream = vineyard.ObjectMeta() + vertex_stream["typename"] = "vineyard::RecordBatchStream" + vertex_stream["nbytes"] = 0 + vertex_stream["params_"] = json.dumps( + { + "graph_name": graph_name, + "kind": "vertex", + } + ) + vertex = vineyard_client.create_metadata(vertex_stream, instance_id) + vineyard_client.persist(vertex.id) + vertex_metadata.add_member("__streams_-%d" % worker, vertex) + + chunk_stream = vineyard.ObjectMeta() + chunk_stream["typename"] = "vineyard::htap::PropertyGraphOutStream" + chunk_stream["graph_name"] = graph_name + chunk_stream["graph_schema"] = "{}" + chunk_stream["nbytes"] = 0 + chunk_stream["stream_index"] = worker + chunk_stream.add_member("edge_stream", edge) + chunk_stream.add_member("vertex_stream", vertex) + chunk = vineyard_client.create_metadata(chunk_stream, instance_id) + vineyard_client.persist(chunk.id) + metadata.add_member("stream_chunk_%d" % worker, chunk) + + # build the vineyard::GlobalPGStream + graph = vineyard_client.create_metadata(metadata) + vineyard_client.persist(graph.id) + vineyard_client.put_name(graph.id, graph_name) + + # build the parallel stream for edge + edge = vineyard_client.create_metadata(edge_metadata) + vineyard_client.persist(edge.id) + vineyard_client.put_name(edge.id, "__%s_edge_stream" % graph_name) + + # build the parallel stream for vertex + vertex = vineyard_client.create_metadata(vertex_metadata) + vineyard_client.persist(vertex.id) + vineyard_client.put_name(vertex.id, "__%s_vertex_stream" % graph_name) + + return repr(graph.id), repr(edge.id), repr(vertex.id) + + def load_subgraph( + graph_name, + total_builder_chunks, + oid_type, + edge_stream_id, + vertex_stream_id, + vineyard_rpc_endpoint, + ): + import vineyard + + # wait all flags been created, see also + # + # `PropertyGraphOutStream::Initialize(Schema schema)` + vineyard_client = vineyard.connect(*vineyard_rpc_endpoint.split(":")) + + # wait for all stream been created by GAIA executor in FFI + for worker in range(total_builder_chunks): + name = "__%s_%d_streamed" % (graph_name, worker) + vineyard_client.get_name(name, wait=True) + + vertices = [Loader(vineyard.ObjectID(vertex_stream_id))] + edges = [Loader(vineyard.ObjectID(edge_stream_id))] + oid_type = normalize_data_type_str(oid_type) + v_labels = normalize_parameter_vertices(vertices, oid_type) + e_labels = normalize_parameter_edges(edges, oid_type) + loader_op = create_loader(v_labels + e_labels) + config = { + types_pb2.DIRECTED: utils.b_to_attr(True), + types_pb2.OID_TYPE: utils.s_to_attr(oid_type), + types_pb2.GENERATE_EID: utils.b_to_attr(False), + types_pb2.VID_TYPE: utils.s_to_attr("uint64_t"), + types_pb2.IS_FROM_VINEYARD_ID: utils.b_to_attr(False), + } + new_op = create_graph( + self._session_id, + graph_def_pb2.ARROW_PROPERTY, + inputs=[loader_op], + attrs=config, + ) + # spawn a vineyard stream loader on coordinator + loader_op_def = loader_op.as_op_def() + coordinator_dag = op_def_pb2.DagDef() + coordinator_dag.op.extend([loader_op_def]) + # set the same key from subgraph to new op + new_op_def = new_op.as_op_def() + new_op_def.key = op.key + dag 
= op_def_pb2.DagDef() + dag.op.extend([new_op_def]) + self.run_on_coordinator(coordinator_dag, [], {}) + response_head, _ = self.run_on_analytical_engine(dag, [], {}) + logger.info("subgraph has been loaded") + return response_head.head.results[-1] + + # generate a random graph name + now_time = datetime.datetime.now().strftime("%Y%m%d%H%M%S") + random_num = random.randint(0, 10000000) + graph_name = "subgraph-%s-%s" % (str(now_time), str(random_num)) + + threads_per_worker = int( + os.environ.get("THREADS_PER_WORKER", INTERACTIVE_ENGINE_THREADS_PER_WORKER) + ) + + if self._launcher.type() == types_pb2.HOSTS: + # only 1 GIE executor on local cluster + executor_workers_num = 1 + threads_per_executor = self._launcher.num_workers * threads_per_worker + else: + executor_workers_num = self._launcher.num_workers + threads_per_executor = threads_per_worker + total_builder_chunks = executor_workers_num * threads_per_executor + engine_config = self.get_analytical_engine_config() + vineyard_rpc_endpoint = engine_config["vineyard_rpc_endpoint"] + + ( + _graph_builder_id, + edge_stream_id, + vertex_stream_id, + ) = create_global_graph_builder( + graph_name, + executor_workers_num, + threads_per_executor, + vineyard_rpc_endpoint, + ) + # start a thread to launch the graph + pool = futures.ThreadPoolExecutor() + subgraph_task = pool.submit( + load_subgraph, + graph_name, + total_builder_chunks, + oid_type, + edge_stream_id, + vertex_stream_id, + vineyard_rpc_endpoint, + ) + + # add subgraph vertices and edges + subgraph_script = "{0}.subgraph('{1}')".format( + gremlin_script, + graph_name, + ) + gremlin_client.submit( + subgraph_script, request_options=request_options + ).all().result() + + return subgraph_task.result() + + # Learning engine related operations + # ================================== + def run_on_learning_engine(self, dag_def: op_def_pb2.DagDef): + raise NotImplementedError("Learning engine is not implemented yet") + + # Coordinator related operations + # ============================== + def run_on_coordinator(self, dag_def, dag_bodies, loader_op_bodies): + response_head = message_pb2.RunStepResponseHead() + for op in dag_def.op: + self._key_to_op[op.key] = op + op_pre_process(op, self._op_result_pool, self._key_to_op) + if op.op == types_pb2.DATA_SOURCE: + op_result = self._process_data_source(op, dag_bodies, loader_op_bodies) + elif op.op == types_pb2.DATA_SINK: + op_result = self._process_data_sink(op) + else: + raise RuntimeError("Unsupported op type: " + str(op.op)) + response_head.results.append(op_result) + self._op_result_pool[op.key] = op_result + return message_pb2.RunStepResponse(head=response_head), [] + + def _process_data_sink(self, op: op_def_pb2.OpDef): + import vineyard + import vineyard.io + + storage_options = json.loads(op.attr[types_pb2.STORAGE_OPTIONS].s.decode()) + fd = op.attr[types_pb2.FD].s.decode() + df = op.attr[types_pb2.VINEYARD_ID].s.decode() + engine_config = self.get_analytical_engine_config() + vineyard_endpoint = engine_config["vineyard_rpc_endpoint"] + vineyard_ipc_socket = engine_config["vineyard_socket"] + deployment, hosts = self._launcher.get_vineyard_stream_info() + dfstream = vineyard.io.open( + "vineyard://" + str(df), + mode="r", + vineyard_ipc_socket=vineyard_ipc_socket, + vineyard_endpoint=vineyard_endpoint, + deployment=deployment, + hosts=hosts, + ) + vineyard.io.open( + fd, + dfstream, + mode="w", + vineyard_ipc_socket=vineyard_ipc_socket, + vineyard_endpoint=vineyard_endpoint, + storage_options=storage_options, + deployment=deployment, + 
hosts=hosts, + ) + return op_def_pb2.OpResult(code=OK, key=op.key) + + def _process_data_source( + self, op: op_def_pb2.OpDef, dag_bodies, loader_op_bodies: dict + ): + def _spawn_vineyard_io_stream( + source, + storage_options, + read_options, + vineyard_endpoint, + vineyard_ipc_socket, + ): + import vineyard + import vineyard.io + + deployment, hosts = self._launcher.get_vineyard_stream_info() + num_workers = self._launcher.num_workers + stream_id = repr( + vineyard.io.open( + source, + mode="r", + vineyard_endpoint=vineyard_endpoint, + vineyard_ipc_socket=vineyard_ipc_socket, + hosts=hosts, + num_workers=num_workers, + deployment=deployment, + read_options=read_options, + storage_options=storage_options, + ) + ) + return "vineyard", stream_id + + def _process_loader_func(loader, vineyard_endpoint, vineyard_ipc_socket): + # loader is type of attr_value_pb2.Chunk + protocol = loader.attr[types_pb2.PROTOCOL].s.decode() + if protocol in ("hdfs", "hive", "oss", "s3"): + source = loader.attr[types_pb2.SOURCE].s.decode() + storage_options = json.loads( + loader.attr[types_pb2.STORAGE_OPTIONS].s.decode() + ) + read_options = json.loads( + loader.attr[types_pb2.READ_OPTIONS].s.decode() + ) + new_protocol, new_source = _spawn_vineyard_io_stream( + source, + storage_options, + read_options, + vineyard_endpoint, + vineyard_ipc_socket, + ) + loader.attr[types_pb2.PROTOCOL].CopyFrom(utils.s_to_attr(new_protocol)) + loader.attr[types_pb2.SOURCE].CopyFrom(utils.s_to_attr(new_source)) + + engine_config = self.get_analytical_engine_config() + vineyard_endpoint = engine_config["vineyard_rpc_endpoint"] + vineyard_ipc_socket = engine_config["vineyard_socket"] + + for loader in op.large_attr.chunk_meta_list.items: + # handle vertex or edge loader + if loader.attr[types_pb2.CHUNK_TYPE].s.decode() == "loader": + # set op bodies, this is for loading graph from numpy/pandas + op_bodies = [] + for bodies in dag_bodies: + if bodies.body.op_key == op.key: + op_bodies.append(bodies) + loader_op_bodies[op.key] = op_bodies + _process_loader_func(loader, vineyard_endpoint, vineyard_ipc_socket) + + return op_def_pb2.OpResult(code=OK, key=op.key) diff --git a/coordinator/gscoordinator/utils.py b/coordinator/gscoordinator/utils.py index fcd03175cb2e..ecdb0a356c3e 100644 --- a/coordinator/gscoordinator/utils.py +++ b/coordinator/gscoordinator/utils.py @@ -17,7 +17,6 @@ # -import base64 import copy import datetime import glob @@ -25,21 +24,15 @@ import inspect import json import logging -import numbers import os -import pickle import shutil -import socket import subprocess import sys -import threading import time import uuid import zipfile from concurrent.futures import ThreadPoolExecutor from io import BytesIO -from pathlib import Path -from queue import Empty as EmptyQueue from queue import Queue from string import Template @@ -89,7 +82,7 @@ ) # default config file in gar resource DEFAULT_GS_CONFIG_FILE = ".gs_conf.yaml" -DEFAULT_GRAPHSCOPE_HOME = "/opt/graphscope" +DEFAULT_GRAPHSCOPE_HOME = "/usr/local" # GRAPHSCOPE_HOME # 1) get from environment variable `GRAPHSCOPE_HOME`, if not exist, @@ -115,11 +108,12 @@ ANALYTICAL_ENGINE_HOME = os.path.join(GRAPHSCOPE_HOME) ANALYTICAL_ENGINE_PATH = os.path.join(ANALYTICAL_ENGINE_HOME, "bin", "grape_engine") if not os.path.isfile(ANALYTICAL_ENGINE_PATH): - # try get analytical engine from build dir + # try to get analytical engine from build dir ANALYTICAL_ENGINE_HOME = os.path.join(GRAPHSCOPE_HOME, "analytical_engine") ANALYTICAL_ENGINE_PATH = os.path.join( ANALYTICAL_ENGINE_HOME, 
"build", "grape_engine" ) +ANALYTICAL_BUILTIN_SPACE = os.path.join(GRAPHSCOPE_HOME, "precompiled", "builtin") # ANALYTICAL_ENGINE_JAVA_HOME ANALYTICAL_ENGINE_JAVA_HOME = ANALYTICAL_ENGINE_HOME @@ -140,7 +134,7 @@ # INTERACTIVE_ENGINE_SCRIPT -INTERAVTIVE_INSTANCE_TIMEOUT_SECONDS = 600 # 10 mins +INTERACTIVE_INSTANCE_TIMEOUT_SECONDS = 120 # 2 mins INTERACTIVE_ENGINE_SCRIPT = os.path.join(GRAPHSCOPE_HOME, "bin", "giectl") if not os.path.isfile(INTERACTIVE_ENGINE_SCRIPT): INTERACTIVE_ENGINE_SCRIPT = os.path.join( @@ -150,10 +144,6 @@ INTERACTIVE_ENGINE_SCRIPT = os.path.join( GRAPHSCOPE_HOME, "interactive_engine", "bin", "giectl" ) -if not os.path.isfile(INTERACTIVE_ENGINE_SCRIPT): - INTERACTIVE_ENGINE_SCRIPT = os.path.join( - GRAPHSCOPE_HOME, "interactive_engine", "assembly", "bin", "giectl" - ) # default threads per worker configuration for GIE/GAIA INTERACTIVE_ENGINE_THREADS_PER_WORKER = 2 @@ -162,45 +152,44 @@ LLVM4JNI_HOME = os.environ.get("LLVM4JNI_HOME", None) LLVM4JNI_USER_OUT_DIR_BASE = "user-llvm4jni-output" PROCESSOR_MAIN_CLASS = "com.alibaba.graphscope.annotation.Main" -JAVA_CODEGNE_OUTPUT_PREFIX = "gs-ffi" +JAVA_CODEGEN_OUTPUT_PREFIX = "gs-ffi" GRAPE_PROCESSOR_JAR = os.path.join( GRAPHSCOPE_HOME, "lib", "grape-runtime-{}-shaded.jar".format(__version__) ) -GIRAPH_DIRVER_CLASS = "com.alibaba.graphscope.app.GiraphComputationAdaptor" +GIRAPH_DRIVER_CLASS = "com.alibaba.graphscope.app.GiraphComputationAdaptor" +# 2 GB +GS_GRPC_MAX_MESSAGE_LENGTH = 2 * 1024 * 1024 * 1024 - 1 -def get_timestamp(): - now = datetime.datetime.now() - return datetime.datetime.timestamp(now) +def get_timestamp() -> float: + return datetime.datetime.timestamp(datetime.datetime.now()) -def get_lib_path(app_dir, app_name): - lib_path = "" + +def get_lib_path(app_dir: str, app_name: str) -> str: if sys.platform == "linux" or sys.platform == "linux2": - lib_path = os.path.join(app_dir, "lib%s.so" % app_name) + return os.path.join(app_dir, "lib%s.so" % app_name) elif sys.platform == "darwin": - lib_path = os.path.join(app_dir, "lib%s.dylib" % app_name) + return os.path.join(app_dir, "lib%s.dylib" % app_name) else: raise RuntimeError(f"Unsupported platform {sys.platform}") - return lib_path def get_app_sha256(attr, java_class_path: str): ( app_type, - app_header, + _, app_class, vd_type, - md_type, - pregel_combine, + _, + _, java_jar_path, java_app_class, ) = _codegen_app_info(attr, DEFAULT_GS_CONFIG_FILE, java_class_path) graph_header, graph_type, _ = _codegen_graph_info(attr) logger.info("Codegened graph type: %s, Graph header: %s", graph_type, graph_header) - app_sha256 = "" if app_type == "cpp_pie": app_sha256 = hashlib.sha256( f"{app_type}.{app_class}.{graph_type}".encode("utf-8") @@ -351,7 +340,7 @@ def compile_app( # if the fragment & vd type is same. java_codegen_out_dir = os.path.join( - workspace, "{}-{}".format(JAVA_CODEGNE_OUTPUT_PREFIX, library_name) + workspace, "{}-{}".format(JAVA_CODEGEN_OUTPUT_PREFIX, library_name) ) if os.path.isdir(java_codegen_out_dir): logger.info( @@ -463,6 +452,7 @@ def compile_app( raise CompilationError( f"Failed to compile app {app_class} on platform {get_platform_info()}" ) + # TODO(siyuan): Append cmake/make logs to error message when failed. return lib_path, java_jar_path, java_codegen_out_dir, app_type @@ -567,6 +557,7 @@ def compile_graph_frame( raise CompilationError( f"Failed to compile graph {graph_class} on platform {get_platform_info()}" ) + # TODO(siyuan): Append cmake/make logs to error message when failed. 
return lib_path, None, None, None @@ -587,8 +578,6 @@ def _type_param_consistent(graph_actucal_type_param, java_app_type_param): def op_pre_process(op, op_result_pool, key_to_op, **kwargs): # noqa: C901 - if op.op == types_pb2.REPORT_GRAPH: - return if op.op == types_pb2.CREATE_GRAPH: _pre_process_for_create_graph_op(op, op_result_pool, key_to_op, **kwargs) if op.op == types_pb2.ADD_LABELS: @@ -619,28 +608,6 @@ def op_pre_process(op, op_result_pool, key_to_op, **kwargs): # noqa: C901 _pre_process_for_unload_app_op(op, op_result_pool, key_to_op, **kwargs) if op.op == types_pb2.UNLOAD_CONTEXT: _pre_process_for_unload_context_op(op, op_result_pool, key_to_op, **kwargs) - if op.op == types_pb2.CREATE_INTERACTIVE_QUERY: - _pre_process_for_create_interactive_query_op( - op, op_result_pool, key_to_op, **kwargs - ) - if op.op == types_pb2.GREMLIN_QUERY: - _pre_process_for_gremlin_query_op(op, op_result_pool, key_to_op, **kwargs) - if op.op == types_pb2.FETCH_GREMLIN_RESULT: - _pre_process_for_fetch_gremlin_result(op, op_result_pool, key_to_op, **kwargs) - if op.op == types_pb2.CLOSE_INTERACTIVE_QUERY: - _pre_process_for_close_interactive_query_op( - op, op_result_pool, key_to_op, **kwargs - ) - if op.op == types_pb2.SUBGRAPH: - _pre_process_for_gremlin_to_subgraph_op(op, op_result_pool, key_to_op, **kwargs) - if op.op == types_pb2.CREATE_LEARNING_INSTANCE: - _pre_process_for_create_learning_graph_op( - op, op_result_pool, key_to_op, **kwargs - ) - if op.op == types_pb2.CLOSE_LEARNING_INSTANCE: - _pre_process_for_close_learning_instance_op( - op, op_result_pool, key_to_op, **kwargs - ) if op.op == types_pb2.DATA_SINK: _pre_process_for_data_sink_op(op, op_result_pool, key_to_op, **kwargs) if op.op in (types_pb2.TO_DIRECTED, types_pb2.TO_UNDIRECTED): @@ -682,81 +649,12 @@ def _pre_process_for_add_labels_op(op, op_result_pool, key_to_op, **kwargs): def _pre_process_for_transform_op(op, op_result_pool, key_to_op, **kwargs): assert len(op.parents) == 1 result = op_result_pool[op.parents[0]] - # To compatible with eager evaluation cases where it will has the key. + # To compatible with eager evaluation cases where it has the key. 
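# In lazy mode the client leaves graph references unresolved, and the
# pre-process helpers above patch them in from the parent op's result.  A
# schematic, dict-based sketch of the backfill performed just below (names
# here are illustrative only):
def _sketch_backfill_graph_name(op_attrs, parent_result):
    if "graph_name" not in op_attrs:
        op_attrs["graph_name"] = parent_result["graph_key"]
    return op_attrs

assert _sketch_backfill_graph_name({}, {"graph_key": "graph_42"}) == {"graph_name": "graph_42"}
# The real check follows, using the types_pb2.GRAPH_NAME protobuf attribute.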
if types_pb2.GRAPH_NAME not in op.attr: op.attr[types_pb2.GRAPH_NAME].CopyFrom(utils.s_to_attr(result.graph_def.key)) -def _pre_process_for_close_interactive_query_op( - op, op_result_pool, key_to_op, **kwargs -): - assert len(op.parents) == 1 - assert op.parents[0] in op_result_pool - - -def _pre_process_for_gremlin_to_subgraph_op(op, op_result_pool, key_to_op, **kwargs): - assert len(op.parents) == 1 - assert op.parents[0] in op_result_pool - - -def _pre_process_for_gremlin_query_op(op, op_result_pool, key_to_op, **kwargs): - assert len(op.parents) == 1 - assert op.parents[0] in op_result_pool - - -def _pre_process_for_fetch_gremlin_result(op, op_result_pool, key_to_op, **kwargs): - assert len(op.parents) == 1 - assert op.parents[0] in op_result_pool - - -def _pre_process_for_create_interactive_query_op( - op, op_result_pool, key_to_op, **kwargs -): - assert len(op.parents) == 1 - key_of_parent_op = op.parents[0] - result = op_result_pool[key_of_parent_op] - assert result.graph_def.extension.Is(graph_def_pb2.VineyardInfoPb.DESCRIPTOR) - vy_info = graph_def_pb2.VineyardInfoPb() - result.graph_def.extension.Unpack(vy_info) - op.attr[types_pb2.VINEYARD_ID].CopyFrom(utils.i_to_attr(vy_info.vineyard_id)) - op.attr[types_pb2.SCHEMA_PATH].CopyFrom(utils.s_to_attr(vy_info.schema_path)) - - -def _pre_process_for_close_learning_instance_op( - op, op_result_pool, key_to_op, **kwargs -): - assert len(op.parents) == 1 - assert op.parents[0] in op_result_pool - - -def _pre_process_for_create_learning_graph_op(op, op_result_pool, key_to_op, **kwargs): - from graphscope.learning.graph import Graph as LearningGraph - - nodes = pickle.loads(op.attr[types_pb2.NODES].s) - edges = pickle.loads(op.attr[types_pb2.EDGES].s) - gen_labels = pickle.loads(op.attr[types_pb2.GLE_GEN_LABELS].s) - # get graph schema - key_of_parent_op = op.parents[0] - result = op_result_pool[key_of_parent_op] - assert result.graph_def.extension.Is(graph_def_pb2.VineyardInfoPb.DESCRIPTOR) - schema = GraphSchema() - schema.from_graph_def(result.graph_def) - # get graph vineyard id - vy_info = graph_def_pb2.VineyardInfoPb() - result.graph_def.extension.Unpack(vy_info) - vineyard_id = vy_info.vineyard_id - # gle handle - engine_hosts = kwargs.pop("engine_hosts") - engine_config = kwargs.pop("engine_config") - handle = get_gl_handle(schema, vineyard_id, engine_hosts, engine_config) - config = LearningGraph.preprocess_args(handle, nodes, edges, gen_labels) - config = base64.b64encode(json.dumps(config).encode("utf-8")).decode("utf-8") - op.attr[types_pb2.VINEYARD_ID].CopyFrom(utils.i_to_attr(vineyard_id)) - op.attr[types_pb2.GLE_HANDLE].CopyFrom(utils.s_to_attr(handle)) - op.attr[types_pb2.GLE_CONFIG].CopyFrom(utils.s_to_attr(config)) - - -# get `bind_app` runtime informarion in lazy mode +# get `bind_app` runtime information in lazy mode def _pre_process_for_bind_app_op(op, op_result_pool, key_to_op, **kwargs): for key_of_parent_op in op.parents: parent_op = key_to_op[key_of_parent_op] @@ -813,7 +711,7 @@ def _pre_process_for_bind_app_op(op, op_result_pool, key_to_op, **kwargs): ) -# get `run_app` runtime informarion in lazy mode +# get `run_app` runtime information in lazy mode def _pre_process_for_run_app_op(op, op_result_pool, key_to_op, **kwargs): # run_app op has only one parent assert len(op.parents) == 1 @@ -866,7 +764,7 @@ def _pre_process_for_run_app_op(op, op_result_pool, key_to_op, **kwargs): # for giraph app, we need to add args into orginal query_args, which is a json string # first one should be user params, second should be 
lib_path if app_type.startswith("giraph:"): - user_params["app_class"] = GIRAPH_DIRVER_CLASS + user_params["app_class"] = GIRAPH_DRIVER_CLASS user_params["user_app_class"] = app_type[7:] else: user_params["app_class"] = app_type.split(":")[-1] @@ -930,7 +828,7 @@ def _pre_process_for_add_column_op(op, op_result_pool, key_to_op, **kwargs): parent_op_result = json.loads(r.result.decode("utf-8")) context_key = parent_op_result["context_key"] context_type = parent_op_result["context_type"] - selector = _tranform_dataframe_selector(context_type, schema, selector) + selector = _transform_dataframe_selector(context_type, schema, selector) op.attr[types_pb2.GRAPH_NAME].CopyFrom(utils.s_to_attr(graph_name)) op.attr[types_pb2.GRAPH_TYPE].CopyFrom(utils.graph_type_to_attr(graph_type)) op.attr[types_pb2.CONTEXT_KEY].CopyFrom(utils.s_to_attr(context_key)) @@ -982,10 +880,10 @@ def __backtrack_key_of_graph_op(key): types_pb2.TO_VINEYARD_DATAFRAME, types_pb2.OUTPUT, ): - selector = _tranform_dataframe_selector(context_type, schema, selector) + selector = _transform_dataframe_selector(context_type, schema, selector) else: # to numpy - selector = _tranform_numpy_selector(context_type, schema, selector) + selector = _transform_numpy_selector(context_type, schema, selector) if selector is not None: op.attr[types_pb2.SELECTOR].CopyFrom( attr_value_pb2.AttrValue(s=selector.encode("utf-8")) @@ -1015,12 +913,14 @@ def _pre_process_for_output_graph_op(op, op_result_pool, key_to_op, **kwargs): graph_name = r.graph_def.key selector = op.attr[types_pb2.SELECTOR].s.decode("utf-8") if op.op == types_pb2.GRAPH_TO_DATAFRAME: - selector = _tranform_dataframe_selector( + selector = _transform_dataframe_selector( "labeled_vertex_property", schema, selector ) else: # to numpy - selector = _tranform_numpy_selector("labeled_vertex_property", schema, selector) + selector = _transform_numpy_selector( + "labeled_vertex_property", schema, selector + ) if selector is not None: op.attr[types_pb2.SELECTOR].CopyFrom( attr_value_pb2.AttrValue(s=selector.encode("utf-8")) @@ -1411,11 +1311,11 @@ def transform_labeled_vertex_property_data_selector(schema, selector): } -def _tranform_numpy_selector(context_type, schema, selector): +def _transform_numpy_selector(context_type, schema, selector): return _transform_selector_func_map[context_type](schema, selector) -def _tranform_dataframe_selector(context_type, schema, selector): +def _transform_dataframe_selector(context_type, schema, selector): selector = json.loads(selector) transform_func = _transform_selector_func_map[context_type] selector = {key: transform_func(schema, value) for key, value in selector.items()} @@ -1529,7 +1429,7 @@ def _probe_for_java_app(attr, java_class_path, real_algo): def _codegen_app_info(attr, meta_file: str, java_class_path: str): - """Codegen application by instanize the template specialization. + """Codegen application by instantiate the template specialization. Args: workspace (str): Working directory @@ -1600,7 +1500,7 @@ def _codegen_app_info(attr, meta_file: str, java_class_path: str): raise KeyError("Algorithm does not exist in the gar resource.") -# a mapping for classname to header file. +# a mapping for class name to header file. 
GRAPH_HEADER_MAP = { graph_def_pb2.IMMUTABLE_EDGECUT: ( "grape::ImmutableEdgecutFragment", @@ -1691,8 +1591,12 @@ def dump_as_json(schema, path): items = [] idx = 0 for i, vertex_label in enumerate(schema.vertex_labels): - vertex = {"id": idx, "label": vertex_label, "type": "VERTEX"} - vertex["propertyDefList"] = [] + vertex = { + "id": idx, + "label": vertex_label, + "type": "VERTEX", + "propertyDefList": [], + } for j, value in enumerate(schema.vertex_property_names[i].s): names = schema.vertex_property_names[i] types = schema.vertex_property_types[i] @@ -1705,8 +1609,7 @@ def dump_as_json(schema, path): idx += 1 for i, edge_label in enumerate(schema.edge_labels): - edge = {"id": idx, "label": edge_label, "type": "EDGE"} - edge["propertyDefList"] = [] + edge = {"id": idx, "label": edge_label, "type": "EDGE", "propertyDefList": []} for j, value in enumerate(schema.edge_property_names[i].s): names = schema.edge_property_names[i] types = schema.edge_property_types[i] @@ -1773,7 +1676,7 @@ class ResolveMPICmdPrefix(object): >>> # openmpi found >>> rmcp = ResolveMPICmdPrefix() - >>> (cmd, env) = rmcp.resolve(4, 'h1, h2, h3') + >>> (cmd, env) = rmcp.resolve(4, 'h1,h2,h3') >>> cmd ['mpirun', '--allow-run-as-root', '-n', '4', '-host', 'h1:2,h2:1,h3:1'] @@ -1785,7 +1688,7 @@ class ResolveMPICmdPrefix(object): >>> # if openmpi not found, change to mpich >>> rmcp = ResolveMPICmdPrefix() - >>> (cmd, env) = rmcp.resolve(4, 'h1, h2, h3') + >>> (cmd, env) = rmcp.resolve(4, 'h1,h2,h3') >>> cmd ['mpirun', '-n', '4', '-host', 'h1:2,h2:1,h3:1'] >>> env @@ -1848,16 +1751,16 @@ def alloc(num_workers, hosts): @staticmethod def find_mpi(): - mpi = "" + mpi = None if ResolveMPICmdPrefix.openmpi(): if "OPAL_PREFIX" in os.environ: mpi = os.path.expandvars("$OPAL_PREFIX/bin/mpirun") - if not mpi: + if mpi is None: if "OPAL_BINDIR" in os.environ: mpi = os.path.expandvars("$OPAL_BINDIR/mpirun") - if not mpi: + if mpi is None: mpi = shutil.which("mpirun") - if not mpi: + if mpi is None: raise RuntimeError("mpirun command not found.") return mpi @@ -1901,123 +1804,6 @@ def resolve(self, num_workers, hosts): return cmd, env -def get_gl_handle(schema, vineyard_id, engine_hosts, engine_config): - """Dump a handler for GraphLearn for interaction. - - Fields in :code:`schema` are: - - + the name of node type or edge type - + whether the graph is weighted graph - + whether the graph is labeled graph - + the number of int attributes - + the number of float attributes - + the number of string attributes - - An example of the graph handle: - - .. code:: python - - { - "server": "127.0.0.1:8888,127.0.0.1:8889", - "client_count": 1, - "vineyard_socket": "/var/run/vineyard.sock", - "vineyard_id": 13278328736, - "node_schema": [ - "user:false:false:10:0:0", - "item:true:false:0:0:5" - ], - "edge_schema": [ - "user:click:item:true:false:0:0:0", - "user:buy:item:true:true:0:0:0", - "item:similar:item:false:false:10:0:0" - ], - "node_attribute_types": { - "person": { - "age": "i", - "name": "s", - }, - }, - "edge_attribute_types": { - "knows": { - "weight": "f", - }, - }, - } - - The handle can be decoded using: - - .. code:: python - - base64.b64decode(handle.encode('ascii')).decode('ascii') - - Note that the ports are selected from a range :code:`(8000, 9000)`. - - Args: - schema: The graph schema. - vineyard_id: The object id of graph stored in vineyard. - engine_hosts: A list of hosts for GraphScope engine workers. - engine_config: dict of config for GAE engine. 
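# The ResolveMPICmdPrefix examples above ('h1,h2,h3' with 4 workers giving
# '-host h1:2,h2:1,h3:1') follow from dealing workers out round-robin over the
# host list.  A standalone sketch of that allocation, written only to match the
# documented behaviour; it is not a drop-in replacement for alloc():
def _sketch_alloc(num_workers, hosts):
    host_list = hosts.split(",")
    slots = {h: 0 for h in host_list}
    for i in range(num_workers):
        slots[host_list[i % len(host_list)]] += 1
    return ",".join(f"{h}:{n}" for h, n in slots.items() if n > 0)

assert _sketch_alloc(4, "h1,h2,h3") == "h1:2,h2:1,h3:1"
# (The removal of get_gl_handle and its handle-format docstring continues below.)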
- - Returns: - str: Base64 encoded handle - - """ - - def group_property_types(props): - weighted, labeled, i, f, s, attr_types = "false", "false", 0, 0, 0, {} - for prop in props: - if prop.type in [graph_def_pb2.STRING]: - s += 1 - attr_types[prop.name] = "s" - elif prop.type in (graph_def_pb2.FLOAT, graph_def_pb2.DOUBLE): - f += 1 - attr_types[prop.name] = "f" - else: - i += 1 - attr_types[prop.name] = "i" - if prop.name == "weight": - weighted = "true" - elif prop.name == "label": - labeled = "true" - return weighted, labeled, i, f, s, attr_types - - node_schema, node_attribute_types = [], dict() - for label in schema.vertex_labels: - weighted, labeled, i, f, s, attr_types = group_property_types( - schema.get_vertex_properties(label) - ) - node_schema.append( - "{}:{}:{}:{}:{}:{}".format(label, weighted, labeled, i, f, s) - ) - node_attribute_types[label] = attr_types - - edge_schema, edge_attribute_types = [], dict() - for label in schema.edge_labels: - weighted, labeled, i, f, s, attr_types = group_property_types( - schema.get_edge_properties(label) - ) - for rel in schema.get_relationships(label): - edge_schema.append( - "{}:{}:{}:{}:{}:{}:{}:{}".format( - rel[0], label, rel[1], weighted, labeled, i, f, s - ) - ) - edge_attribute_types[label] = attr_types - - handle = { - "hosts": engine_hosts, - "client_count": 1, - "vineyard_id": vineyard_id, - "vineyard_socket": engine_config["vineyard_socket"], - "node_schema": node_schema, - "edge_schema": edge_schema, - "node_attribute_types": node_attribute_types, - "edge_attribute_types": edge_attribute_types, - } - handle_json_string = json.dumps(handle) - return base64.b64encode(handle_json_string.encode("utf-8")).decode("utf-8") - - # In Analytical engine, assume label ids of vertex entries are continuous # from zero, and property ids of each label is also continuous from zero. 
# When transform schema to Maxgraph style, we gather all property names and @@ -2065,40 +1851,35 @@ def _check_task(endpoint): if "MY_POD_NAME" in os.environ: # inner kubernetes env if endpoint == "localhost" or endpoint == "127.0.0.1": - # now, used in mac os with docker-desktop kubernetes cluster, + # now, used in macOS with docker-desktop kubernetes cluster, # which external ip is 'localhost' when service type is 'LoadBalancer' return True try: client = Client(f"ws://{endpoint}/gremlin", "g") + # May throw client.submit("g.V().limit(1)").all().result() + finally: try: client.close() except: # noqa: E722 pass - except Exception as e: - try: - client.close() - except: # noqa: E722 - pass - raise RuntimeError(str(e)) - return True executor = ThreadPoolExecutor(max_workers=20) begin_time = time.time() - error_message = "" while True: t = executor.submit(_check_task, endpoint) try: - rlt = t.result(timeout=30) + _ = t.result(timeout=30) except Exception as e: t.cancel() error_message = str(e) else: - return rlt + executor.shutdown(wait=False) + return True time.sleep(3) - if time.time() - begin_time > INTERAVTIVE_INSTANCE_TIMEOUT_SECONDS: + if time.time() - begin_time > INTERACTIVE_INSTANCE_TIMEOUT_SECONDS: executor.shutdown(wait=False) raise TimeoutError(f"Gremlin check query failed: {error_message}") diff --git a/coordinator/requirements.txt b/coordinator/requirements.txt index 65cec51c5e98..e4c4727b04e8 100644 --- a/coordinator/requirements.txt +++ b/coordinator/requirements.txt @@ -3,9 +3,13 @@ etcd-distro>=3.5.1 graphscope-client>=0.11.0 grpcio<=1.43.0,>=1.40.0 grpcio-tools<=1.43.0,>=1.40.0 -kubernetes +kubernetes~=12.0.1 protobuf>=3.15.0,<=3.18.1 -PyYAML +PyYAML~=5.4.1 vineyard>=0.5.2; sys_platform != "win32" vineyard-io>=0.5.2; sys_platform != "win32" prometheus-client>=0.14.1 + +setuptools~=51.1.2 +packaging~=20.8 +tqdm~=4.62.3 \ No newline at end of file diff --git a/docs/deployment.rst b/docs/deployment.rst index ef708e6f7da0..4fe8609dc118 100644 --- a/docs/deployment.rst +++ b/docs/deployment.rst @@ -128,4 +128,4 @@ Ubuntu 20.04+ or MacOS. .. code:: shell source ~/.graphscope_env - make graphscope + sudo make install diff --git a/docs/developer_guide.rst b/docs/developer_guide.rst index 44fd5a5475e7..6fd2a98cec27 100644 --- a/docs/developer_guide.rst +++ b/docs/developer_guide.rst @@ -138,7 +138,7 @@ Then you can build GraphScope with pre-configured `make` commands. ```bash # to make graphscope whole package, including python package + engine binaries. -make graphscope +make install # or make the engine components # make gie diff --git a/docs/reference/operation.rst b/docs/reference/operation.rst index de63e5089f64..25a06b109464 100644 --- a/docs/reference/operation.rst +++ b/docs/reference/operation.rst @@ -60,10 +60,6 @@ BuiltIn operations add_column graph_to_numpy graph_to_dataframe - create_interactive_query - create_learning_instance - close_interactive_query - close_learning_instance gremlin_query gremlin_to_subgraph fetch_gremlin_result diff --git a/docs/zh/deployment.rst b/docs/zh/deployment.rst index 71742ad32e41..deebdade01c0 100644 --- a/docs/zh/deployment.rst +++ b/docs/zh/deployment.rst @@ -117,4 +117,4 @@ Coordinator 作为 GraphScope 后端服务的入口,通过 grpc 接收来自 P .. 
code:: shell source ~/.graphscope_env - make graphscope + sudo make install diff --git a/interactive_engine/assembly/graphscope.xml b/interactive_engine/assembly/graphscope.xml index 9abd3785effb..1f2469b5bee1 100644 --- a/interactive_engine/assembly/graphscope.xml +++ b/interactive_engine/assembly/graphscope.xml @@ -34,11 +34,12 @@ ${project.parent.basedir}/executor/ir/target/release lib - libir_core* + libir_core.so + libir_core.dylib - ${project.parent.basedir}/executor/assembly/target/debug + ${project.parent.basedir}/executor/assembly/v6d/target/debug gaia_executor @@ -46,7 +47,7 @@ bin - ${project.parent.basedir}/executor/assembly/target/release + ${project.parent.basedir}/executor/assembly/v6d/target/release gaia_executor diff --git a/interactive_engine/assembly/groot.xml b/interactive_engine/assembly/groot.xml index edf4d07f2f9a..bca2fc2bdc33 100644 --- a/interactive_engine/assembly/groot.xml +++ b/interactive_engine/assembly/groot.xml @@ -8,7 +8,7 @@ - ${project.parent.basedir}/executor/assembly/target/debug + ${project.parent.basedir}/executor/assembly/groot/target/debug libmaxgraph_ffi.so libmaxgraph_ffi.dylib @@ -16,7 +16,7 @@ native - ${project.parent.basedir}/executor/assembly/target/release + ${project.parent.basedir}/executor/assembly/groot/target/release libmaxgraph_ffi.so libmaxgraph_ffi.dylib diff --git a/interactive_engine/executor/assembly/Cargo.toml b/interactive_engine/executor/assembly/Cargo.toml deleted file mode 100644 index 24ec073463a5..000000000000 --- a/interactive_engine/executor/assembly/Cargo.toml +++ /dev/null @@ -1,26 +0,0 @@ -[workspace] - -members = [ - "groot", - "v6d" -] - -[profile.release] -opt-level = 3 -debug = true -rpath = false -lto = true -debug-assertions = false -codegen-units=1 -# Don't change to "abort", since runtime rely on this to catch unexpected errors in worker threads. -panic = "unwind" - -[profile.dev] -opt-level = 0 -debug = true -rpath = false -lto = false -debug-assertions = true -codegen-units=1 -# Don't change to "abort", since runtime rely on this to catch unexpected errors in worker threads. -panic = "unwind" diff --git a/interactive_engine/executor/build.sh b/interactive_engine/executor/build.sh index ffb2e28592df..6ae55da616b3 100755 --- a/interactive_engine/executor/build.sh +++ b/interactive_engine/executor/build.sh @@ -3,12 +3,20 @@ set -x MODE=$1 SKIP=$2 -FEATURE=$3 +TARGET=$3 +FEATURE=$4 if [ "$SKIP" = "true" ]; then exit 0 fi +if [ "$TARGET" = "v6d" ] || [ "$TARGET" = "groot" ]; then + echo "Build target $TARGET" +else + echo "Invalid target, choose from v6d or groot." + exit 1 +fi + if [ "$(uname -s)" = "Darwin" ]; then SUFFIX="dylib" STRIP_OPTION="-u" @@ -17,17 +25,17 @@ else STRIP_OPTION="" fi -cd assembly; +cd assembly/$TARGET; if [ "$MODE" = "debug" ]; then - ../exec.sh cargo build --workspace --features="$FEATURE" + cargo build --features="$FEATURE" elif [ "$MODE" = "release" ]; then - ../exec.sh cargo build --workspace --release --features="$FEATURE" + cargo build --release --features="$FEATURE" else + echo "Invalid mode, choose from debug or release." 
exit 1 fi -rm -rf $(pwd)/target/${MODE}/build -rm -rf $(pwd)/target/${MODE}/deps - -strip ${STRIP_OPTION} $(pwd)/target/${MODE}/libmaxgraph_ffi.${SUFFIX} -ln -sf $(pwd)/target/${MODE}/libmaxgraph_ffi.${SUFFIX} ./target/libmaxgraph_ffi.${SUFFIX} +if [ "$TARGET" = "groot" ]; then + strip ${STRIP_OPTION} $(pwd)/target/${MODE}/libmaxgraph_ffi.${SUFFIX} + ln -sf $(pwd)/target/${MODE}/libmaxgraph_ffi.${SUFFIX} $(pwd)/target/libmaxgraph_ffi.${SUFFIX} +fi diff --git a/interactive_engine/executor/exec.sh b/interactive_engine/executor/exec.sh deleted file mode 100755 index 484989adc68c..000000000000 --- a/interactive_engine/executor/exec.sh +++ /dev/null @@ -1,20 +0,0 @@ -#!/bin/bash -# Copyright 2020 Alibaba Group Holding Limited. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -echo "VINEYARD_ROOT_DIR = $VINEYARD_ROOT_DIR" - -export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${JAVA_HOME}/jre/lib/amd64/server - -$@ diff --git a/interactive_engine/executor/ir/integrated/config/distributed/server_0/server_config.toml b/interactive_engine/executor/ir/integrated/config/distributed/server_0/server_config.toml index 79a214c9b8e9..1c32c5ff1358 100644 --- a/interactive_engine/executor/ir/integrated/config/distributed/server_0/server_config.toml +++ b/interactive_engine/executor/ir/integrated/config/distributed/server_0/server_config.toml @@ -42,7 +42,7 @@ servers_size = 2 # Set addresses of your servers; # If the cluster is standalone, the size of addresses should be equal to [server_size] set above, and the addresses -# should be in order, the fisrt address would be server 0. +# should be in order, the first address would be server 0. [[network.servers]] hostname = '127.0.0.1' port = 11234 diff --git a/interactive_engine/executor/ir/integrated/config/distributed/server_1/server_config.toml b/interactive_engine/executor/ir/integrated/config/distributed/server_1/server_config.toml index b8a0de5fc8de..03678fd17a8f 100644 --- a/interactive_engine/executor/ir/integrated/config/distributed/server_1/server_config.toml +++ b/interactive_engine/executor/ir/integrated/config/distributed/server_1/server_config.toml @@ -42,7 +42,7 @@ servers_size = 2 # Set addresses of your servers; # If the cluster is standalone, the size of addresses should be equal to [server_size] set above, and the addresses -# should be in order, the fisrt address would be server 0. +# should be in order, the first address would be server 0. 
[[network.servers]] hostname = '127.0.0.1' port = 11234 diff --git a/interactive_engine/executor/pom.xml b/interactive_engine/executor/pom.xml index 6dd04bcd648e..db5789f2949c 100644 --- a/interactive_engine/executor/pom.xml +++ b/interactive_engine/executor/pom.xml @@ -34,11 +34,12 @@ exec - sh + bash build.sh ${rust.compile.mode} ${rust.compile.skip} + ${rust.compile.target} ${groot.compile.feature} diff --git a/interactive_engine/executor/store/global_query/Cargo.toml b/interactive_engine/executor/store/global_query/Cargo.toml index c3416528be9d..f65a74fde3fb 100644 --- a/interactive_engine/executor/store/global_query/Cargo.toml +++ b/interactive_engine/executor/store/global_query/Cargo.toml @@ -20,4 +20,5 @@ cmake = "0.1" [features] default = [] -with_v6d = [] \ No newline at end of file +with_v6d = [] +groot = [] \ No newline at end of file diff --git a/interactive_engine/executor/store/global_query/build.rs b/interactive_engine/executor/store/global_query/build.rs index a101394454b4..7ecba09ac849 100644 --- a/interactive_engine/executor/store/global_query/build.rs +++ b/interactive_engine/executor/store/global_query/build.rs @@ -31,13 +31,6 @@ fn codegen_inplace() -> Result<(), Box> { println!("cargo:rustc-link-search=/usr/local/lib"); println!("cargo:rustc-link-search=/usr/local/lib64"); println!("cargo:rustc-link-search=/opt/homebrew/lib"); - match std::env::var("VINEYARD_ROOT_DIR") { - Ok(val) => { - println!("cargo:rustc-link-search={}/lib", val); - println!("cargo:rustc-link-search={}/lib64", val); - } - Err(_) => (), - } println!("cargo:rustc-link-search={}/build", dst.display()); println!("cargo:rustc-link-lib=v6d_native_store"); println!("cargo:rustc-link-lib=vineyard_graph"); diff --git a/interactive_engine/groot-server/pom.xml b/interactive_engine/groot-server/pom.xml index fd2c2a2c1a14..4385e4f31f75 100644 --- a/interactive_engine/groot-server/pom.xml +++ b/interactive_engine/groot-server/pom.xml @@ -206,7 +206,7 @@ **/GremlinStandardTest.java - -Djna.library.path=${project.parent.basedir}/executor/assembly/target:/usr/local/lib:${project.parent.basedir}/lgraph/build:${project.parent.basedir}/executor/ir/target/release + -Djna.library.path=${project.parent.basedir}/executor/assembly/groot/target:/usr/local/lib:${project.parent.basedir}/lgraph/build:${project.parent.basedir}/executor/ir/target/release 1 @@ -235,7 +235,7 @@ **/FfiTest.java - -Djna.library.path=${project.parent.basedir}/executor/assembly/target:/usr/local/lib:${project.parent.basedir}/lgraph/build:${project.parent.basedir}/executor/ir/target/release + -Djna.library.path=${project.parent.basedir}/executor/assembly/groot/target:/usr/local/lib:${project.parent.basedir}/lgraph/build:${project.parent.basedir}/executor/ir/target/release 1 diff --git a/interactive_engine/lgraph/CMakeLists.txt b/interactive_engine/lgraph/CMakeLists.txt index 0d751d47fd52..7206a5bf6457 100644 --- a/interactive_engine/lgraph/CMakeLists.txt +++ b/interactive_engine/lgraph/CMakeLists.txt @@ -73,7 +73,7 @@ else () set(DYLIB_SUFFIX "so") endif () -set(Maxgraph_FFI_LIB_DIR ${Lgraph_SOURCE_DIR}/../executor/assembly/target) +set(Maxgraph_FFI_LIB_DIR ${Lgraph_SOURCE_DIR}/../executor/assembly/groot/target) set(Maxgraph_FFI_LIB ${Maxgraph_FFI_LIB_DIR}/libmaxgraph_ffi.${DYLIB_SUFFIX}) # Add library diff --git a/interactive_engine/pom.xml b/interactive_engine/pom.xml index af31aa04f89c..f9cbb553eb98 100644 --- a/interactive_engine/pom.xml +++ b/interactive_engine/pom.xml @@ -12,6 +12,9 @@ graphscope + + v6d + assembly common @@ -24,6 +27,9 @@ 
groot + + groot + assembly common @@ -64,6 +70,7 @@ false debug + v6d true UTF-8 [2.17.1,) diff --git a/k8s/Makefile b/k8s/Makefile index fcb24877432b..0abd52d3f89f 100644 --- a/k8s/Makefile +++ b/k8s/Makefile @@ -107,28 +107,29 @@ graphscope-darwin-py3: mkdir -p build && cd build && \ cmake .. -DCMAKE_INSTALL_PREFIX=/usr/local && \ sudo make install -j`nproc` && \ - sudo rm -fr /opt/vineyard/lib - sudo rm -fr /tmp/v6d /tmp/libgrape-lite + sudo rm -rf /opt/vineyard/lib + sudo rm -rf /tmp/v6d /tmp/libgrape-lite # build graphscope cd $(WORKING_DIR)/../ && \ python3 -m pip install --upgrade setuptools && \ - make install && \ + make && \ + sudo make install && \ python3 $(WORKING_DIR)/precompile.py # build and delocate wheel cd $(WORKING_DIR)/../coordinator && \ export WITH_EXTRA_DATA=ON && \ - rm -fr build dist/*.whl || true && \ + rm -rf build dist/*.whl || true && \ sudo strip -s $(WORKING_DIR)/../analytical_engine/exported_symbols_osx.lds /opt/graphscope/bin/grape_engine || true && \ sudo strip /opt/graphscope/bin/gaia_executor && \ export DYLD_LIBRARY_PATH=/usr/local/lib:$$DYLD_LIBRARY_PATH && \ package_name=gs-include python3 setup.py bdist_wheel --plat=macosx_10_9_x86_64 && \ - rm -fr build && \ + rm -rf build && \ package_name=gs-apps python3 setup.py bdist_wheel --plat=macosx_10_9_x86_64 && \ - rm -fr build && \ + rm -rf build && \ package_name=gs-engine python3 setup.py bdist_wheel --plat=macosx_10_9_x86_64 && \ - rm -fr build && \ + rm -rf build && \ package_name=gs-coordinator python3 setup.py bdist_wheel --plat=macosx_10_9_x86_64 && \ - rm -fr build && \ + rm -rf build && \ package_name=graphscope python3 setup.py bdist_wheel --plat=macosx_10_9_x86_64 && \ pip3 install delocate && \ for wheel in `ls dist/*.whl`; do \ @@ -138,23 +139,24 @@ graphscope-darwin-py3: graphscope-manylinux2014-py3-nodocker: cd $(WORKING_DIR)/../ && \ - make install && \ + make && \ + sudo make install && \ python3 $(WORKING_DIR)/precompile.py && \ export WITH_EXTRA_DATA=ON && \ cd $(WORKING_DIR)/../coordinator && \ - rm -fr build dist/*.whl && \ + rm -rf build dist/*.whl && \ sudo strip /opt/graphscope/bin/grape_engine && \ sudo strip /opt/graphscope/bin/gaia_executor && \ sudo strip /opt/graphscope/lib/*.so && \ strip /tmp/gs/builtin/*/*.so && \ package_name=gs-include python3 setup.py bdist_wheel && \ - rm -fr build && \ + rm -rf build && \ package_name=gs-apps python3 setup.py bdist_wheel && \ - rm -fr build && \ + rm -rf build && \ package_name=gs-engine python3 setup.py bdist_wheel && \ - rm -fr build && \ + rm -rf build && \ package_name=gs-coordinator python3 setup.py bdist_wheel && \ - rm -fr build && \ + rm -rf build && \ package_name=graphscope python3 setup.py bdist_wheel && \ cd dist && \ for wheel in `ls ./*.whl`; do \ @@ -202,7 +204,7 @@ graphscope-client-darwin-py3: cd $(WORKING_DIR)/../python && \ pip3 install -U pip && \ pip3 install "numpy==1.18.5" "pandas<1.5.0" "grpcio<=1.43.0,>=1.40.0" "grpcio-tools<=1.43.0,>=1.40.0" delocate wheel && \ - rm -fr build dist/*.whl || true && \ + rm -rf build dist/*.whl || true && \ python3 setup.py bdist_wheel --plat=macosx_10_9_x86_64 && \ for wheel in `ls dist/*.whl`; do \ delocate-wheel -w dist/wheelhouse -v $$wheel && rm $$wheel; \ diff --git a/k8s/graphscope-dev.Dockerfile b/k8s/graphscope-dev.Dockerfile index 0d82ff4e7ebf..444c1e0bdba7 100644 --- a/k8s/graphscope-dev.Dockerfile +++ b/k8s/graphscope-dev.Dockerfile @@ -24,7 +24,7 @@ RUN sudo mkdir -p /opt/graphscope && \ cd ${HOME}/gs && make gle # build analytical engine -RUN cd ${HOME}/gs && make 
gae +RUN cd ${HOME}/gs && make gae-install # build python bdist_wheel RUN export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/graphscope/lib:/opt/graphscope/lib64 && \ diff --git a/k8s/graphscope-store.Dockerfile b/k8s/graphscope-store.Dockerfile index 80b59023a28e..66acbcd22fbf 100644 --- a/k8s/graphscope-store.Dockerfile +++ b/k8s/graphscope-store.Dockerfile @@ -29,15 +29,24 @@ RUN sudo chown -R $(id -u):$(id -g) /home/graphscope/gs /home/graphscope/.m2 && && mv /home/graphscope/gs/interactive_engine/assembly/target/groot.tar.gz /home/graphscope/gs/groot.tar.gz; \ fi -FROM registry.cn-hongkong.aliyuncs.com/graphscope/graphscope-runtime:latest +FROM centos:7.9.2009 -COPY --from=builder /opt/vineyard/ /usr/local/ +COPY --from=builder /home/graphscope/gs/groot.tar.gz /tmp/groot.tar.gz COPY ./k8s/ready_probe.sh /tmp/ready_probe.sh -COPY --from=builder /home/graphscope/gs/groot.tar.gz /tmp/groot.tar.gz -RUN sudo tar -zxf /tmp/groot.tar.gz -C /usr/local + +RUN tar -zxf /tmp/groot.tar.gz -C /usr/local RUN rm /tmp/groot.tar.gz +RUN yum install -y sudo java-1.8.0-openjdk-devel bind-utils \ + && yum clean all \ + && rm -rf /var/cache/yum + +RUN useradd -m graphscope -u 1001 \ + && echo 'graphscope ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers +USER graphscope +WORKDIR /home/graphscope + # init log directory RUN sudo mkdir /var/log/graphscope \ && sudo chown -R $(id -u):$(id -g) /var/log/graphscope diff --git a/k8s/manylinux2014.Dockerfile b/k8s/manylinux2014.Dockerfile index 2471233811bd..af41ef9fe6d8 100644 --- a/k8s/manylinux2014.Dockerfile +++ b/k8s/manylinux2014.Dockerfile @@ -1,4 +1,4 @@ -# the graphscope-manylinux2010 image is based on manylinux2010, including all necessary +# the graphscope-manylinux2014 image is based on manylinux2014, including all necessary # dependencies for graphscope's wheel package. FROM registry.cn-hongkong.aliyuncs.com/graphscope/manylinux2014:2022-08-16-53df7cb diff --git a/k8s/standalone/Makefile b/k8s/standalone/Makefile index 0a4a93197736..69f0532a44e5 100644 --- a/k8s/standalone/Makefile +++ b/k8s/standalone/Makefile @@ -1,12 +1,12 @@ MKFILE_PATH := $(abspath $(lastword $(MAKEFILE_LIST))) -WORKING_DIR := $(dir $(MKFILE_PATH)) +WORKING_DIR := $(dir $(MKFILE_PATH)) ifeq ("$(DOCKER_ORG)","") $(warning WARNING: No docker user found, using graphscope) DOCKER_ORG = graphscope endif -.PHONY: gsruntime gsvineyard graphscope push clean +.PHONY: image-vineyard-runtime image-coordinator image-analytical image-analytical-java image-interactive-frontend image-interactive-executor image-interactive-experimental image-learning push clean ifeq ($(REGISTRY),) REGISTRY := registry.cn-hongkong.aliyuncs.com @@ -15,13 +15,45 @@ endif VERSION ?= latest PROFILE ?= release -# build gie with experimental storage -gie-exp-runtime: - cd $(WORKING_DIR)/../.. && docker build --target gie-exp -t ${REGISTRY}/$(DOCKER_ORG)/gie-exp-runtime:${VERSION} -f $(WORKING_DIR)/gie-exp.Dockerfile . +image-vineyard-runtime: + cd $(WORKING_DIR)/../.. && docker build -t ${REGISTRY}/$(DOCKER_ORG)/vineyard-runtime:${VERSION} \ + -f $(WORKING_DIR)/runtime/vineyard.Dockerfile . + +image-coordinator: + cd $(WORKING_DIR)/../.. && docker build --target coordinator -t ${REGISTRY}/$(DOCKER_ORG)/coordinator:${VERSION} -f $(WORKING_DIR)/coordinator.Dockerfile . + +image-analytical: + cd $(WORKING_DIR)/../.. && docker build --target analytical -t ${REGISTRY}/$(DOCKER_ORG)/analytical:${VERSION} -f $(WORKING_DIR)/analytical.Dockerfile . + +image-analytical-java: + cd $(WORKING_DIR)/../.. 
&& docker build --target analytical-java -t ${REGISTRY}/$(DOCKER_ORG)/analytical-java:${VERSION} -f $(WORKING_DIR)/analytical.Dockerfile . + +image-interactive-frontend: + cd $(WORKING_DIR)/../.. && docker build --build-arg profile=$(PROFILE) --target frontend -t ${REGISTRY}/$(DOCKER_ORG)/interactive-frontend:${VERSION} -f $(WORKING_DIR)/interactive.Dockerfile . + +image-interactive-executor: + cd $(WORKING_DIR)/../.. && docker build --build-arg profile=$(PROFILE) --target executor -t ${REGISTRY}/$(DOCKER_ORG)/interactive-executor:${VERSION} -f $(WORKING_DIR)/interactive.Dockerfile . + +# gie with experimental storage +image-interactive-experimental: + cd $(WORKING_DIR)/../.. && docker build --target experimental -t ${REGISTRY}/$(DOCKER_ORG)/interactive-experimental:${VERSION} -f $(WORKING_DIR)/interactive-experimental.Dockerfile . + +image-learning: + cd $(WORKING_DIR)/../.. && docker build --target learning -t ${REGISTRY}/$(DOCKER_ORG)/learning:${VERSION} -f $(WORKING_DIR)/learning.Dockerfile . + + push: - docker push ${REGISTRY}/$(DOCKER_ORG)/gie-exp-runtime:${VERSION} + docker push ${REGISTRY}/$(DOCKER_ORG)/vineyard-runtime:${VERSION} + docker push ${REGISTRY}/$(DOCKER_ORG)/coordinator:${VERSION} + docker push ${REGISTRY}/$(DOCKER_ORG)/analytical:${VERSION} + docker push ${REGISTRY}/$(DOCKER_ORG)/analytical-java:${VERSION} + docker push ${REGISTRY}/$(DOCKER_ORG)/interactive-frontend:${VERSION} + docker push ${REGISTRY}/$(DOCKER_ORG)/interactive-executor:${VERSION} + docker push ${REGISTRY}/$(DOCKER_ORG)/interactive-experimental:${VERSION} + docker push ${REGISTRY}/$(DOCKER_ORG)/learning:${VERSION} clean: docker ps -qa | xargs $(XARGS_EMPTY_FLAG) docker rm -f docker images -f "dangling=true" -q | xargs $(XARGS_EMPTY_FLAG) docker rmi -f + diff --git a/k8s/standalone/analytical.Dockerfile b/k8s/standalone/analytical.Dockerfile new file mode 100644 index 000000000000..0d6a9ae1f2c8 --- /dev/null +++ b/k8s/standalone/analytical.Dockerfile @@ -0,0 +1,62 @@ +# Analytical engine + +ARG BASE_VERSION=v0.9.0 +FROM registry.cn-hongkong.aliyuncs.com/graphscope/graphscope-vineyard:$BASE_VERSION AS builder + +ARG profile=release +ENV profile=$profile +ADD . 
/home/graphscope/GraphScope + +RUN sudo chown -R graphscope:graphscope /home/graphscope/GraphScope +RUN cd /home/graphscope/GraphScope/ \ + && mkdir /home/graphscope/install \ + && make gae-install ENABLE_JAVA_SDK=OFF INSTALL_PREFIX=/home/graphscope/install \ + && mkdir /home/graphscope/install-with-java \ + && make gae-install ENABLE_JAVA_SDK=ON INSTALL_PREFIX=/home/graphscope/install-with-java + +############### RUNTIME: GAE ####################### +FROM registry.cn-hongkong.aliyuncs.com/graphscope/vineyard-runtime:$BASE_VERSION AS analytical + +COPY --from=builder /home/graphscope/install /opt/graphscope/ + +USER graphscope +WORKDIR /home/graphscope + +############### RUNTIME: GAE-JAVA ####################### +FROM registry.cn-hongkong.aliyuncs.com/graphscope/vineyard-runtime:$BASE_VERSION AS analytical-java + +COPY --from=builder /home/graphscope/install-with-java /opt/graphscope/ + +# Installed size: 200M +RUN yum install -y java-1.8.0-openjdk-devel \ + && yum clean all \ + && rm -rf /var/cache/yum + +# install clang-11 with gold optimizer plugin, depends on header include/plugin-api.h +# Installed size: 1.5G +# TODO: Don't compile from scratch +RUN cd /tmp && \ + mkdir -p binutils/include && \ + cd binutils/include && \ + wget -q https://raw.githubusercontent.com/bminor/binutils-gdb/binutils-2_37-branch/include/plugin-api.h && \ + cd /tmp && \ + wget -q https://github.com/llvm/llvm-project/archive/refs/tags/llvmorg-11.1.0.tar.gz && \ + tar zxf /tmp/llvmorg-11.1.0.tar.gz -C /tmp/ && \ + cd llvm-project-llvmorg-11.1.0/ && \ + cmake -G "Unix Makefiles" -DLLVM_ENABLE_PROJECTS='clang;lld' \ + -DCMAKE_INSTALL_PREFIX=/opt/llvm11 \ + -DCMAKE_BUILD_TYPE=Release \ + -DLLVM_TARGETS_TO_BUILD=X86 \ + -DLLVM_BINUTILS_INCDIR=/tmp/binutils/include \ + ./llvm && \ + make install -j`nproc` && \ + rm -rf /tmp/llvm-project-llvmorg-11.1.0 /tmp/llvmorg-11.1.0.tar.gz /tmp/binutils + +ENV JAVA_HOME /usr/lib/jvm/java + +ENV LLVM11_HOME=/opt/llvm11 +ENV LIBCLANG_PATH=/opt/llvm11/lib +ENV LLVM_CONFIG_PATH=/opt/llvm11/bin/llvm-config + +USER graphscope +WORKDIR /home/graphscope diff --git a/k8s/standalone/coordinator.Dockerfile b/k8s/standalone/coordinator.Dockerfile new file mode 100644 index 000000000000..6ec037ed7d3c --- /dev/null +++ b/k8s/standalone/coordinator.Dockerfile @@ -0,0 +1,72 @@ +# Coordinator of graphscope engines + +ARG BASE_VERSION=v0.9.0 +FROM registry.cn-hongkong.aliyuncs.com/graphscope/graphscope-vineyard:$BASE_VERSION AS builder + +ADD . /home/graphscope/GraphScope + +RUN sudo chown -R graphscope:graphscope /home/graphscope/GraphScope +RUN cd /home/graphscope/GraphScope \ + && git submodule update --init \ + && cd learning_engine/graph-learn \ + && git submodule update --init third_party/pybind11 \ + && rm -rf cmake-build \ + && mkdir -p cmake-build \ + && cd cmake-build \ + && cmake -DWITH_VINEYARD=ON .. 
\ + && make graphlearn_shared -j`nproc` \ + && export LD_LIBRARY_PATH=`pwd`/built/lib:$LD_LIBRARY_PATH \ + && cd /home/graphscope/GraphScope/python \ + && export PATH=/opt/python/cp38-cp38/bin:$PATH \ + && python3 -m pip install -U pip \ + && python3 -m pip install "numpy==1.18.5" "pandas<1.5.0" "grpcio<=1.43.0,>=1.40.0" "grpcio-tools<=1.43.0,>=1.40.0" wheel \ + && python3 setup.py bdist_wheel \ + && cd /home/graphscope/GraphScope/coordinator \ + && package_name=gs-coordinator python3 setup.py bdist_wheel + +############### RUNTIME: Coordinator ####################### + +FROM centos:7.9.2009 AS coordinator + +COPY --from=builder /home/graphscope/GraphScope/coordinator/dist/ /opt/graphscope/dist/ +COPY --from=builder /home/graphscope/GraphScope/python/dist/* /opt/graphscope/dist/ + +RUN yum install -y centos-release-scl-rh sudo && \ + INSTALL_PKGS="devtoolset-10-gcc-c++ rh-python38-python-pip" && \ + yum install -y --setopt=tsflags=nodocs $INSTALL_PKGS && \ + rpm -V $INSTALL_PKGS && \ + yum -y clean all --enablerepo='*' && \ + rm -rf /var/cache/yum + +SHELL [ "/usr/bin/scl", "enable", "devtoolset-10", "rh-python38" ] + +RUN python3 -m pip install /opt/graphscope/dist/graphscope_client*.whl +RUN python3 -m pip install /opt/graphscope/dist/gs_coordinator*.whl + +RUN rm -rf /opt/graphscope/dist + +# Enable rh-python, devtoolsets-10 +COPY ./k8s/standalone/entrypoint.sh /usr/bin/entrypoint.sh +RUN chmod +x /usr/bin/entrypoint.sh + +COPY ./k8s/kube_ssh /usr/local/bin/kube_ssh + +# kubectl v1.19.2 +RUN curl -L -o /usr/local/bin/kubectl https://storage.googleapis.com/kubernetes-release/release/v1.19.2/bin/linux/amd64/kubectl + +# shanghai zoneinfo +ENV TZ=Asia/Shanghai +RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && \ + echo '$TZ' > /etc/timezone + +# for programming output +RUN localedef -c -f UTF-8 -i en_US en_US.UTF-8 +ENV LANG='en_US.UTF-8' LANGUAGE='en_US:en' LC_ALL='en_US.UTF-8' + +RUN useradd -m graphscope -u 1001 \ + && echo 'graphscope ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers + +USER graphscope +WORKDIR /home/graphscope + +ENTRYPOINT [ "/usr/bin/entrypoint.sh" ] diff --git a/k8s/standalone/entrypoint.sh b/k8s/standalone/entrypoint.sh new file mode 100644 index 000000000000..d14e911267f6 --- /dev/null +++ b/k8s/standalone/entrypoint.sh @@ -0,0 +1,3 @@ +#!/bin/bash +source scl_source enable rh-python38 devtoolset-10 +exec "$@" diff --git a/k8s/standalone/gie-exp.Dockerfile b/k8s/standalone/interactive-experimental.Dockerfile similarity index 88% rename from k8s/standalone/gie-exp.Dockerfile rename to k8s/standalone/interactive-experimental.Dockerfile index 7ffb06008e25..683e4d7fd35b 100644 --- a/k8s/standalone/gie-exp.Dockerfile +++ b/k8s/standalone/interactive-experimental.Dockerfile @@ -1,17 +1,20 @@ -ARG BASE_VERSION=v0.8.5 +# Interactive engine which uses experimental storage + +ARG BASE_VERSION=v0.9.0 FROM registry.cn-hongkong.aliyuncs.com/graphscope/graphscope-vineyard:$BASE_VERSION AS builder ARG profile=release ENV profile=$profile ADD . 
/home/graphscope/GraphScope +ENV PATH="/home/graphscope/.cargo/bin:$PATH" + RUN sudo chown -R graphscope:graphscope /home/graphscope/GraphScope -RUN source $HOME/.bashrc \ - && cd /home/graphscope/GraphScope/interactive_engine/compiler \ +RUN cd /home/graphscope/GraphScope/interactive_engine/compiler \ && make build rpc.target=start_rpc_server_k8s ############### RUNTIME: frontend && executor ####################### -FROM centos:7.9.2009 AS gie-exp +FROM centos:7.9.2009 AS experimental COPY --from=builder /home/graphscope/GraphScope/interactive_engine/compiler/target/libs /opt/GraphScope/interactive_engine/compiler/target/libs COPY --from=builder /home/graphscope/GraphScope/interactive_engine/compiler/target/compiler-1.0-SNAPSHOT.jar /opt/GraphScope/interactive_engine/compiler/target/compiler-1.0-SNAPSHOT.jar diff --git a/k8s/standalone/interactive.Dockerfile b/k8s/standalone/interactive.Dockerfile new file mode 100644 index 000000000000..a2b28317f851 --- /dev/null +++ b/k8s/standalone/interactive.Dockerfile @@ -0,0 +1,46 @@ +# Interactive engine + +ARG BASE_VERSION=v0.9.0 +FROM registry.cn-hongkong.aliyuncs.com/graphscope/graphscope-vineyard:$BASE_VERSION AS builder + +ARG profile=release +ENV profile=$profile +ADD . /home/graphscope/GraphScope + +ENV PATH="/home/graphscope/.cargo/bin:$PATH" + +RUN sudo chown -R graphscope:graphscope /home/graphscope/GraphScope +RUN cd /home/graphscope/GraphScope/ \ + && mkdir /home/graphscope/install \ + && make gie-install BUILD_TYPE="$profile" INSTALL_PREFIX=/home/graphscope/install + +############### RUNTIME: frontend ####################### +FROM centos:7.9.2009 AS frontend + +COPY --from=builder /home/graphscope/install/bin/giectl /opt/graphscope/bin/giectl +# vineyard.frontend.properties, log configuration files +COPY --from=builder /home/graphscope/install/conf /opt/graphscope/conf +# jars, libir_core.so +COPY --from=builder /home/graphscope/install/lib /opt/graphscope/lib + +RUN yum install -y java-1.8.0-openjdk-devel \ + && yum clean all \ + && rm -rf /var/cache/yum + +RUN useradd -m graphscope -u 1001 \ + && echo 'graphscope ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers +USER graphscope +WORKDIR /home/graphscope + +############### RUNTIME: executor ####################### +FROM registry.cn-hongkong.aliyuncs.com/graphscope/vineyard-runtime:$BASE_VERSION AS executor + +# gaia_executor, giectl +COPY --from=builder /home/graphscope/install/bin /opt/graphscope/bin +# vineyard.executor.properties, log configuration files +COPY --from=builder /home/graphscope/install/conf /opt/graphscope/conf + +ENV RUST_BACKTRACE=1 + +USER graphscope +WORKDIR /home/graphscope diff --git a/k8s/standalone/learning.Dockerfile b/k8s/standalone/learning.Dockerfile new file mode 100644 index 000000000000..6023b2ece86d --- /dev/null +++ b/k8s/standalone/learning.Dockerfile @@ -0,0 +1,26 @@ +# Learning engine + +ARG BASE_VERSION=v0.9.0 +FROM registry.cn-hongkong.aliyuncs.com/graphscope/graphscope-vineyard:$BASE_VERSION AS builder + +ADD . 
/home/graphscope/GraphScope + +RUN sudo chown -R graphscope:graphscope /home/graphscope/GraphScope +RUN cd /home/graphscope/GraphScope/ \ + && mkdir /home/graphscope/install \ + && make gle INSTALL_PREFIX=/home/graphscope/install \ + && cd /home/graphscope/GraphScope/coordinator \ + && export PATH=/opt/python/cp38-cp38/bin:$PATH \ + && python3 setup.py bdist_wheel \ + && cp dist/*.whl /home/graphscope/install/ + +############### RUNTIME: GLE ####################### +FROM registry.cn-hongkong.aliyuncs.com/graphscope/vineyard-runtime:$BASE_VERSION AS learning + +COPY --from=builder /home/graphscope/install /opt/graphscope + +RUN python3 -m pip install /opt/graphscope/*.whl +RUN rm -f /opt/graphscope/*.whl + +USER graphscope +WORKDIR /home/graphscope diff --git a/k8s/standalone/vineyard.Dockerfile b/k8s/standalone/vineyard.Dockerfile new file mode 100644 index 000000000000..2fb9583180bb --- /dev/null +++ b/k8s/standalone/vineyard.Dockerfile @@ -0,0 +1,207 @@ +FROM centos:7.9.2009 + +RUN yum install -y centos-release-scl-rh perl which sudo wget libunwind-devel && \ + INSTALL_PKGS="devtoolset-10-gcc-c++ rh-python38-python-pip" && \ + yum install -y --setopt=tsflags=nodocs $INSTALL_PKGS && \ + rpm -V $INSTALL_PKGS && \ + yum -y clean all --enablerepo='*' && \ + rm -rf /var/cache/yum + +SHELL [ "/usr/bin/scl", "enable", "devtoolset-10", "rh-python38" ] + +RUN python3 -m pip install libclang etcd-distro + +ENV LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib:/usr/local/lib64 + +COPY ./download /tmp/ + +RUN cd /tmp && \ + wget -q https://github.com/Kitware/CMake/releases/download/v3.19.1/cmake-3.19.1-Linux-x86_64.sh && \ + bash cmake-3.19.1-Linux-x86_64.sh --prefix=/usr --skip-license && \ + rm -f /tmp/cmake-3.19.1-Linux-x86_64.sh + +# install openmpi v4.0.5 to /opt/openmpi and link to /usr/local +RUN cd /tmp && \ + wget -q https://download.open-mpi.org/release/open-mpi/v4.0/openmpi-4.0.5.tar.gz && \ + tar zxvf openmpi-4.0.5.tar.gz && \ + cd openmpi-4.0.5 && \ + ./configure --enable-mpi-cxx --disable-dlopen --prefix=/usr/local && \ + make -j`nproc` && \ + make install && \ + rm -rf /tmp/openmpi-4.0.5 /tmp/openmpi-4.0.5.tar.gz + +# GLOG +RUN cd /tmp && \ + wget -q https://github.com/google/glog/archive/v0.4.0.tar.gz && \ + tar zxvf v0.4.0.tar.gz && \ + cd glog-0.4.0 && \ + cmake . -DBUILD_SHARED_LIBS=ON && \ + make -j`nproc` && \ + make install && \ + rm -rf /tmp/v0.4.0.tar.gz /tmp/glog-0.4.0 + +# libgrape-lite, required by vineyard +RUN cd /tmp && \ + git clone -b master https://github.com/alibaba/libgrape-lite.git --depth=1 && \ + cd libgrape-lite && \ + cmake . 
&& \ + make -j`nproc` && \ + make install && \ + rm -rf /tmp/libgrape-lite + +# apache arrow v7.0.0 +RUN cd /tmp && \ + wget -q https://github.com/apache/arrow/archive/apache-arrow-7.0.0.tar.gz && \ + tar zxvf apache-arrow-7.0.0.tar.gz && \ + cd arrow-apache-arrow-7.0.0 && \ + cmake ./cpp \ + -DARROW_COMPUTE=ON \ + -DARROW_WITH_UTF8PROC=OFF \ + -DARROW_CSV=ON \ + -DARROW_CUDA=OFF \ + -DARROW_DATASET=OFF \ + -DARROW_FILESYSTEM=ON \ + -DARROW_FLIGHT=OFF \ + -DARROW_GANDIVA=OFF \ + -DARROW_GANDIVA_JAVA=OFF \ + -DARROW_HDFS=OFF \ + -DARROW_HIVESERVER2=OFF \ + -DARROW_JSON=OFF \ + -DARROW_ORC=OFF \ + -DARROW_PARQUET=OFF \ + -DARROW_PLASMA=OFF \ + -DARROW_PLASMA_JAVA_CLIENT=OFF \ + -DARROW_PYTHON=OFF \ + -DARROW_S3=OFF \ + -DARROW_WITH_BZ2=OFF \ + -DARROW_WITH_ZLIB=OFF \ + -DARROW_WITH_LZ4=OFF \ + -DARROW_WITH_SNAPPY=OFF \ + -DARROW_WITH_ZSTD=OFF \ + -DARROW_WITH_BROTLI=OFF \ + -DARROW_IPC=ON \ + -DARROW_BUILD_BENCHMARKS=OFF \ + -DARROW_BUILD_EXAMPLES=OFF \ + -DARROW_BUILD_INTEGRATION=OFF \ + -DARROW_BUILD_UTILITIES=OFF \ + -DARROW_BUILD_TESTS=OFF \ + -DARROW_ENABLE_TIMING_TESTS=OFF \ + -DARROW_FUZZING=OFF \ + -DARROW_USE_ASAN=OFF \ + -DARROW_USE_TSAN=OFF \ + -DARROW_USE_UBSAN=OFF \ + -DARROW_JEMALLOC=OFF \ + -DARROW_BUILD_SHARED=ON \ + -DARROW_BUILD_STATIC=OFF && \ + make -j`nproc` && \ + make install && \ + rm -rf /tmp/arrow-apache-arrow-7.0.0 /tmp/apache-arrow-7.0.0.tar.gz + + +# gflags v2.2.2 +RUN cd /tmp && \ + wget -q https://github.com/gflags/gflags/archive/v2.2.2.tar.gz && \ + tar zxvf v2.2.2.tar.gz && \ + cd gflags-2.2.2 && \ + cmake . -DBUILD_SHARED_LIBS=ON && \ + make -j`nproc` && \ + make install && \ + rm -rf /tmp/v2.2.2.tar.gz /tmp/gflags-2.2.2 + +# Boost 1.73.1, required by vineyard +RUN cd /tmp && \ + wget -q https://boostorg.jfrog.io/artifactory/main/release/1.73.0/source/boost_1_73_0.tar.gz && \ + tar zxf boost_1_73_0.tar.gz && \ + cd boost_1_73_0 && \ + ./bootstrap.sh --with-libraries=system,filesystem,context,program_options,regex,thread,random,chrono,atomic,date_time && \ + ./b2 install link=shared runtime-link=shared variant=release threading=multi && \ + rm -rf /tmp/boost_1_73_0 /tmp/boost_1_73_0.tar.gz + # bcp --boost=./download/boost_1_73_0 system filesystem context program_options regex thread random chrono atomic date_time boost + +# OpenSSL 1.1.1, required by vineyard +RUN cd /tmp && \ + wget -q https://github.com/openssl/openssl/archive/OpenSSL_1_1_1h.tar.gz && \ + tar zxvf OpenSSL_1_1_1h.tar.gz && \ + cd openssl-OpenSSL_1_1_1h && \ + ./config -fPIC -shared && \ + make -j`nproc` && \ + make install && \ + rm -rf /tmp/OpenSSL_1_1_1h.tar.gz /tmp/openssl-OpenSSL_1_1_1h + +# zlib v1.2.11, required by vineyard +RUN cd /tmp && \ + wget -q https://github.com/madler/zlib/archive/v1.2.11.tar.gz && \ + tar zxvf v1.2.11.tar.gz && \ + cd zlib-1.2.11 && \ + cmake . -DBUILD_SHARED_LIBS=ON && \ + make -j`nproc` && \ + make install && \ + rm -rf /tmp/v1.2.11.tar.gz /tmp/zlib-1.2.11 + +# protobuf v.3.13.0 +RUN cd /tmp && \ + wget -q https://github.com/protocolbuffers/protobuf/releases/download/v3.13.0/protobuf-all-3.13.0.tar.gz && \ + tar zxvf protobuf-all-3.13.0.tar.gz && \ + cd protobuf-3.13.0 && \ + ./configure --enable-shared --disable-static && \ + make -j`nproc` && \ + make install && \ + ldconfig && \ + rm -rf /tmp/protobuf-all-3.13.0.tar.gz /tmp/protobuf-3.13.0 + +# grpc v1.33.1 +RUN cd /tmp && \ + git clone --depth 1 --branch v1.33.1 https://github.com/grpc/grpc.git && \ + cd grpc && \ + git submodule update --init && \ + cmake . 
-DBUILD_SHARED_LIBS=ON \ + -DgRPC_INSTALL=ON \ + -DgRPC_BUILD_TESTS=OFF \ + -DgRPC_BUILD_CSHARP_EXT=OFF \ + -DgRPC_BUILD_GRPC_CSHARP_PLUGIN=OFF \ + -DgRPC_BUILD_GRPC_NODE_PLUGIN=OFF \ + -DgRPC_BUILD_GRPC_OBJECTIVE_C_PLUGIN=OFF \ + -DgRPC_BUILD_GRPC_PHP_PLUGIN=OFF \ + -DgRPC_BUILD_GRPC_PYTHON_PLUGIN=OFF \ + -DgRPC_BUILD_GRPC_RUBY_PLUGIN=OFF \ + -DgRPC_BACKWARDS_COMPATIBILITY_MODE=ON \ + -DgRPC_PROTOBUF_PROVIDER=package \ + -DgRPC_ZLIB_PROVIDER=package \ + -DgRPC_SSL_PROVIDER=package \ + -DOPENSSL_ROOT_DIR=/usr/local \ + -DCMAKE_CXX_FLAGS="-fpermissive" && \ + make -j`nproc` && \ + make install && \ + rm -rf /tmp/grpc + +# Vineyard +RUN cd /tmp && \ + git clone -b v0.9.0 https://github.com/v6d-io/v6d.git --depth=1 && \ + cd v6d && \ + git submodule update --init && \ + cmake . -DCMAKE_INSTALL_PREFIX=/usr/local \ + -DBUILD_VINEYARD_TESTS=OFF \ + -DBUILD_VINEYARD_PYTHON_BINDINGS=OFF && \ + make -j`nproc` && \ + make install && \ + rm -rf /tmp/v6d + +RUN strip /usr/local/bin/vineyardd /usr/local/lib/libvineyard* +RUN rm -rf /tmp/* + +RUN useradd -m graphscope -u 1001 \ + && echo 'graphscope ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers + +# shanghai zoneinfo +ENV TZ=Asia/Shanghai +RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && \ + echo '$TZ' > /etc/timezone + +# for programming output +RUN localedef -c -f UTF-8 -i en_US en_US.UTF-8 +ENV LANG='en_US.UTF-8' LANGUAGE='en_US:en' LC_ALL='en_US.UTF-8' + +# Enable rh-python, devtoolsets-10 binary +COPY entrypoint.sh /usr/bin/entrypoint.sh +RUN chmod +x /usr/bin/entrypoint.sh +ENTRYPOINT [ "/usr/bin/entrypoint.sh" ] diff --git a/python/graphscope/client/rpc.py b/python/graphscope/client/rpc.py index 849ba2a54daf..8702bfd03165 100644 --- a/python/graphscope/client/rpc.py +++ b/python/graphscope/client/rpc.py @@ -114,7 +114,7 @@ def close(self): if self._session_id: self._close_session_impl() self._session_id = None - if self._logs_fetching_thread: + if self._logs_fetching_thread is not None: self._logs_fetching_thread.join(timeout=5) @handle_grpc_error @@ -151,8 +151,6 @@ def _connect_session_impl(self, cleanup_instance=True, dangling_timeout_seconds= return ( response.session_id, response.cluster_type, - json.loads(response.engine_config), - response.pod_name_list, response.num_workers, response.namespace, ) @@ -161,11 +159,10 @@ def _connect_session_impl(self, cleanup_instance=True, dangling_timeout_seconds= def _fetch_logs_impl(self): request = message_pb2.FetchLogsRequest(session_id=self._session_id) responses = self._stub.FetchLogs(request) - for resp in responses: - info = resp.info_message.rstrip() + for res in responses: + info, error = res.info_message.rstrip(), res.error_message.rstrip() if info: logger.info(info, extra={"simple": True}) - error = resp.error_message.rstrip() if error: logger.error(error, extra={"simple": True}) @@ -194,3 +191,43 @@ def _run_step_impl(self, dag_def): if response.full_exception: raise pickle.loads(response.full_exception) return response + + def create_analytical_instance(self): + request = message_pb2.CreateAnalyticalInstanceRequest( + session_id=self._session_id + ) + response = self._stub.CreateAnalyticalInstance(request) + return json.loads(response.engine_config), response.host_names + + def create_interactive_instance(self, object_id, schema_path): + request = message_pb2.CreateInteractiveInstanceRequest( + session_id=self._session_id, object_id=object_id, schema_path=schema_path + ) + response = self._stub.CreateInteractiveInstance(request) + return response.gremlin_endpoint + + def 
create_learning_instance(self, object_id, handle, config): + request = message_pb2.CreateLearningInstanceRequest(session_id=self._session_id) + request.object_id = object_id + request.handle = handle + request.config = config + response = self._stub.CreateLearningInstance(request) + return response.handle, response.config, response.endpoints + + def close_analytical_instance(self): + request = message_pb2.CloseAnalyticalInstanceRequest( + session_id=self._session_id + ) + self._stub.CloseAnalyticalInstance(request) + + def close_interactive_instance(self, object_id): + request = message_pb2.CloseInteractiveInstanceRequest( + session_id=self._session_id, object_id=object_id + ) + self._stub.CloseInteractiveInstance(request) + + def close_learning_instance(self, object_id): + request = message_pb2.CloseLearningInstanceRequest( + session_id=self._session_id, object_id=object_id + ) + self._stub.CloseLearningInstance(request) diff --git a/python/graphscope/client/session.py b/python/graphscope/client/session.py index 0c3789ba367c..aef7def93d98 100755 --- a/python/graphscope/client/session.py +++ b/python/graphscope/client/session.py @@ -53,7 +53,6 @@ from graphscope.deploy.kubernetes.utils import resolve_api_client from graphscope.framework.dag import Dag from graphscope.framework.errors import FatalError -from graphscope.framework.errors import InteractiveEngineInternalError from graphscope.framework.errors import InvalidArgumentError from graphscope.framework.errors import K8sError from graphscope.framework.graph import Graph @@ -62,13 +61,10 @@ from graphscope.framework.utils import decode_dataframe from graphscope.framework.utils import decode_numpy from graphscope.interactive.query import InteractiveQuery -from graphscope.interactive.query import InteractiveQueryDAGNode -from graphscope.interactive.query import InteractiveQueryStatus from graphscope.proto import graph_def_pb2 from graphscope.proto import message_pb2 from graphscope.proto import op_def_pb2 from graphscope.proto import types_pb2 -from graphscope.version import __version__ DEFAULT_CONFIG_FILE = os.environ.get( "GS_CONFIG_PATH", os.path.expanduser("~/.graphscope/session.json") @@ -103,19 +99,17 @@ def __init__(self, dag, fetches): self._ops.append(fetch) # extract sub dag self._sub_dag = dag.extract_subdag_for(self._ops) - if "debug" in os.environ: + if "GRAPHSCOPE_DEBUG" in os.environ: logger.info("sub_dag: %s", self._sub_dag) @property def targets(self): return self._sub_dag - def _rebuild_graph(self, seq, op: Operation, op_result: op_def_pb2.OpResult): + def _rebuild_graph(self, seq, op_result: op_def_pb2.OpResult): if isinstance(self._fetches[seq], Operation): # for nx Graph return op_result.graph_def - # run_app op also can return graph result, so judge here - # get graph dag node as base graph_dag_node = self._fetches[seq] # construct graph @@ -124,40 +118,7 @@ def _rebuild_graph(self, seq, op: Operation, op_result: op_def_pb2.OpResult): g.update_from_graph_def(op_result.graph_def) return g - def _rebuild_learning_graph( - self, seq, op: Operation, op_result: op_def_pb2.OpResult - ): - from graphscope.learning.graph import Graph as LearningGraph - - result = op_result.result.decode("utf-8") - result = json.loads(result) - handle = result["handle"] - handle = json.loads(base64.b64decode(handle).decode("utf-8")) - handle["server"] = ",".join(result["endpoints"]) - handle["client_count"] = 1 - - graph_dag_node = self._fetches[seq] - # construct learning graph - g = LearningGraph(graph_dag_node, handle, result["config"], 
result["object_id"]) - return g - - def _rebuild_interactive_query( - self, seq, op: Operation, op_result: op_def_pb2.OpResult - ): - # get interactive query dag node as base - interactive_query_node = self._fetches[seq] - # construct interactive query - result = op_result.result.decode("utf-8") - result = json.loads(result) - interactive_query = InteractiveQuery( - interactive_query_node, - result["endpoint"], - result["object_id"], - ) - interactive_query.status = InteractiveQueryStatus.Running - return interactive_query - - def _rebuild_app(self, seq, op: Operation, op_result: op_def_pb2.OpResult): + def _rebuild_app(self, seq, op_result: op_def_pb2.OpResult): from graphscope.framework.app import App # get app dag node as base @@ -166,9 +127,8 @@ def _rebuild_app(self, seq, op: Operation, op_result: op_def_pb2.OpResult): app = App(app_dag_node, op_result.result.decode("utf-8")) return app - def _rebuild_context(self, seq, op: Operation, op_result: op_def_pb2.OpResult): + def _rebuild_context(self, seq, op_result: op_def_pb2.OpResult): from graphscope.framework.context import Context - from graphscope.framework.context import DynamicVertexDataContext # get context dag node as base context_dag_node = self._fetches[seq] @@ -176,12 +136,12 @@ def _rebuild_context(self, seq, op: Operation, op_result: op_def_pb2.OpResult): context_type = ret["context_type"] if context_type == "dynamic_vertex_data": # for nx + from graphscope.framework.context import DynamicVertexDataContext + return DynamicVertexDataContext(context_dag_node, ret["context_key"]) return Context(context_dag_node, ret["context_key"], ret["context_schema"]) - def _rebuild_gremlin_results( - self, seq, op: Operation, op_result: op_def_pb2.OpResult - ): + def _rebuild_gremlin_results(self, seq, op_result: op_def_pb2.OpResult): from graphscope.interactive.query import ResultSet # get result set node as base @@ -195,7 +155,7 @@ def wrap_results(self, response: message_pb2.RunStepResponse): # noqa: C901 if op.key == op_result.key: if op.output_types == types_pb2.RESULTS: if op.type == types_pb2.RUN_APP: - rets.append(self._rebuild_context(seq, op, op_result)) + rets.append(self._rebuild_context(seq, op_result)) elif op.type == types_pb2.FETCH_GREMLIN_RESULT: rets.append(pickle.loads(op_result.result)) elif op.type == types_pb2.REPORT_GRAPH: @@ -204,15 +164,13 @@ def wrap_results(self, response: message_pb2.RunStepResponse): # noqa: C901 # for nx Graph rets.append(op_result.result.decode("utf-8")) if op.output_types == types_pb2.GREMLIN_RESULTS: - rets.append(self._rebuild_gremlin_results(seq, op, op_result)) + rets.append(self._rebuild_gremlin_results(seq, op_result)) if op.output_types == types_pb2.GRAPH: - rets.append(self._rebuild_graph(seq, op, op_result)) - if op.output_types == types_pb2.LEARNING_GRAPH: - rets.append(self._rebuild_learning_graph(seq, op, op_result)) + rets.append(self._rebuild_graph(seq, op_result)) if op.output_types == types_pb2.APP: rets.append(None) if op.output_types == types_pb2.BOUND_APP: - rets.append(self._rebuild_app(seq, op, op_result)) + rets.append(self._rebuild_app(seq, op_result)) if op.output_types in ( types_pb2.VINEYARD_TENSOR, types_pb2.VINEYARD_DATAFRAME, @@ -231,8 +189,6 @@ def wrap_results(self, response: message_pb2.RunStepResponse): # noqa: C901 or op.type == types_pb2.GRAPH_TO_NUMPY ): rets.append(decode_numpy(op_result.result)) - if op.output_types == types_pb2.INTERACTIVE_QUERY: - rets.append(self._rebuild_interactive_query(seq, op, op_result)) if op.output_types == 
types_pb2.NULL_OUTPUT: rets.append(None) break @@ -561,7 +517,7 @@ def __init__( deterministic behavior would be better. If :code:`reconnect` is True, the existing session will be reused. It is the user's responsibility - to ensure there's no such an active client actually. + to ensure there's no such an active client. Defaults to :code:`False`. @@ -626,11 +582,11 @@ def __init__( self._config_params[param] = saved_locals[param] # parse config, which should be a path to config file, or dict - # config has highest priority + # config has the highest priority if isinstance(config, dict): self._config_params.update(config) elif isinstance(config, str): - self._load_config(config, slient=False) + self._load_config(config, silent=False) elif DEFAULT_CONFIG_FILE: self._load_config(DEFAULT_CONFIG_FILE) @@ -653,16 +609,12 @@ def __init__( for param in self._deprecated_params: if param in kw: warnings.warn( - "The `{0}` parameter has been deprecated and has no effect.".format( - param - ), + f"The `{param}` parameter has been deprecated and has no effect.", category=DeprecationWarning, ) if param == "show_log" or param == "log_level": warnings.warn( - "Please use `graphscope.set_option({0}={1})` instead".format( - param, kw.pop(param, None) - ), + f"Please use `graphscope.set_option({param}={kw.pop(param, None)})` instead", category=DeprecationWarning, ) if param == "k8s_vineyard_shared_mem": @@ -698,8 +650,8 @@ def __init__( self._launcher = None self._heartbeat_sending_thread = None - self._grpc_client = None - self._session_id = None # unique identifier across sessions + self._grpc_client: GRPCClient = None + self._session_id: str = None # unique identifier across sessions # engine config: # # { @@ -707,7 +659,7 @@ def __init__( # "vineyard_socket": "...", # "vineyard_rpc_endpoint": "..." 
# } - self._engine_config = None + self._engine_config: {} = None # interactive instance related graph map self._interactive_instance_dict = {} @@ -721,16 +673,16 @@ def __init__( with CaptureKeyboardInterrupt(self.close): self._connect() - self._disconnected = False + self._disconnected: bool = False # heartbeat - self._heartbeat_interval_seconds = 5 + self._heartbeat_interval_seconds: int = 5 self._heartbeat_sending_thread = threading.Thread( target=self._send_heartbeat, args=() ) self._heartbeat_sending_thread.daemon = True self._heartbeat_sending_thread.start() - self._heartbeat_maximum_failures = 3 + self._heartbeat_maximum_failures: int = 3 # networkx module self._nx = None @@ -742,21 +694,21 @@ def __str__(self): return repr(self) @property - def session_id(self): + def session_id(self) -> str: return self._session_id @property def dag(self): return self._dag - def _load_config(self, path, slient=True): + def _load_config(self, path, silent=True): config_path = os.path.expandvars(os.path.expanduser(path)) try: with open(config_path, "r") as f: data = json.load(f) self._config_params.update(data) except Exception as exp: # noqa - if not slient: + if not silent: raise exp def _parse_cluster_type(self): @@ -770,17 +722,17 @@ def _parse_cluster_type(self): elif self._config_params["cluster_type"] == "k8s": cluster_type = types_pb2.K8S else: - raise ValueError("Expect hosts or k8s of cluster_type parameter") + raise ValueError("Expect 'hosts' or 'k8s' for cluster_type parameter") return cluster_type @property def engine_config(self): - """Show the engine configration associated with session in json format.""" + """Show the engine configuration associated with session in json format.""" return self._engine_config @property def info(self): - """Show all resources info associated with session in json format.""" + """Show all resource info associated with session in json format.""" info = {} if self._closed: info["status"] = "closed" @@ -797,7 +749,7 @@ def info(self): info["type"] = "hosts" info["engine_hosts"] = self._engine_config["engine_hosts"] - info["cluster_type"] = str(self._cluster_type) + info["cluster_type"] = types_pb2.ClusterType.Name(self._cluster_type) info["session_id"] = self.session_id info["num_workers"] = self._config_params["num_workers"] info["coordinator_endpoint"] = self._coordinator_endpoint @@ -862,7 +814,7 @@ def _close(self): # noqa: C901 self._closed = True self._coordinator_endpoint = None - self._deregister_default() + self._unregister_default() if self._heartbeat_sending_thread: try: @@ -878,8 +830,7 @@ def _close(self): # noqa: C901 # close all interactive instances for instance in self._interactive_instance_dict.values(): try: - if instance is not None: - instance.close() + instance.close() except Exception: pass self._interactive_instance_dict.clear() @@ -887,8 +838,7 @@ def _close(self): # noqa: C901 # close all learning instances for instance in self._learning_instance_dict.values(): try: - if instance is not None: - instance.close() + instance.close() except Exception: pass self._learning_instance_dict.clear() @@ -911,14 +861,12 @@ def _close(self): # noqa: C901 self._pod_name_list = [] def _close_interactive_instance(self, instance): - """Close a interactive instance.""" - if self.eager(): - self._interactive_instance_dict[instance.object_id] = None + """Close an interactive instance.""" + self._grpc_client.close_interactive_instance(instance.object_id) def _close_learning_instance(self, instance): """Close a learning instance.""" - if self.eager(): - 
self._learning_instance_dict[instance.object_id] = None + self._grpc_client.close_learning_instance(instance.object_id) def __del__(self): # cleanly ignore all exceptions @@ -942,11 +890,11 @@ def __enter__(self): return self def __exit__(self, exc_type, exc_value, exc_tb): - """Deregister self from the default session, + """Unregister self from the default session, close the session and release the resources, ignore all exceptions in close(). """ try: - self._deregister_default() + self._unregister_default() self.close() except Exception: pass @@ -971,7 +919,7 @@ def as_default(self): self._default_session = default_session(self) self._default_session.__enter__() - def _deregister_default(self): + def _unregister_default(self): """Remove self from the default session stack.""" if self._default_session: self._default_session.__exit__(None, None, None) @@ -982,17 +930,17 @@ def _wrapper(self, dag_node): return self.run(dag_node) return dag_node - def run(self, fetches, debug=False): + def run(self, fetches): """Run operations of `fetches`. Args: - fetch: :class:`Operation` + fetches: :class:`Operation` Raises: RuntimeError: Client disconnect to the service. Or run on a closed session. ValueError: - If fetch is not a instance of :class:`Operation`. Or + If fetch is not an instance of :class:`Operation`. Or the fetch has been evaluated. InvalidArgumentError: @@ -1001,9 +949,9 @@ def run(self, fetches, debug=False): Returns: Different values for different output types of :class:`Operation` """ - return self.run_fetches(fetches, debug) + return self.run_fetches(fetches) - def run_fetches(self, fetches, debug=False): + def run_fetches(self, fetches): """Run operations of `fetches` without the session lock.""" if self._closed: raise RuntimeError("Attempted to use a closed Session.") @@ -1027,7 +975,7 @@ def run_fetches(self, fetches, debug=False): def _connect(self): if self._config_params["addr"] is not None: - # try connect to exist coordinator + # try to connect to exist coordinator self._coordinator_endpoint = self._config_params["addr"] elif self._cluster_type == types_pb2.K8S: if ( @@ -1060,7 +1008,7 @@ def _connect(self): and len(self._config_params["hosts"]) != 0 and self._config_params["num_workers"] > 0 ): - # lanuch coordinator with hosts + # launch coordinator with hosts self._launcher = HostsClusterLauncher( **self._config_params, ) @@ -1087,8 +1035,6 @@ def _connect(self): ( self._session_id, self._cluster_type, - self._engine_config, - self._pod_name_list, self._config_params["num_workers"], self._config_params["k8s_namespace"], ) = self._grpc_client.connect( @@ -1101,6 +1047,13 @@ def _connect(self): if self._config_params["addr"] or self._cluster_type == types_pb2.K8S: self._grpc_client.fetch_logs() _session_dict[self._session_id] = self + # Launch analytical engine right after session connected. + # This may be changed to on demand launching in the future. + ( + self._engine_config, + pod_name_list, + ) = self._grpc_client.create_analytical_instance() + self._pod_name_list = list(pod_name_list) except Exception: self.close() raise @@ -1125,41 +1078,29 @@ def _run_on_local(self): self._config_params["port"] = None self._config_params["vineyard_socket"] = "" - @set_defaults(gs_config) - def gremlin(self, graph, engine_params=None): - """Get a interactive engine handler to execute gremlin queries. + def gremlin(self, graph): + """Get an interactive engine handler to execute gremlin queries. 
- It will return a instance of :class:`graphscope.interactive.query.InteractiveQueryDAGNode`, - that will be evaluated by :method:`sess.run` in eager mode. - - Note that this method will be executed implicitly in eager mode when a property graph created - and cache a instance of InteractiveQuery in session if `initializing_interactive_engine` is True. - If you want to create a new instance under the same graph by different params, you should close - the instance first. + It will return an instance of :class:`graphscope.interactive.query.InteractiveQuery`, .. code:: python - >>> # close and recreate InteractiveQuery in eager mode. + >>> # close and recreate InteractiveQuery. >>> interactive_query = sess.gremlin(g) >>> interactive_query.close() - >>> interactive_query = sess.gremlin(g, engine_params={"xxx":"xxx"}) + >>> interactive_query = sess.gremlin(g) Args: graph (:class:`graphscope.framework.graph.GraphDAGNode`): The graph to create interactive instance. - engine_params (dict, optional): Configure startup parameters of interactive engine. - You can also configure this param by `graphscope.set_option(engine_params={})`. - See a list of configurable keys in - `interactive_engine/deploy/docker/dockerfile/executor.vineyard.properties` Raises: InvalidArgumentError: - :code:`graph` is not a property graph. - - :code:`graph` is unloaded in eager mode. Returns: - :class:`graphscope.interactive.query.InteractiveQueryDAGNode`: - InteractiveQuery to execute gremlin queries, evaluated in eager mode. + :class:`graphscope.interactive.query.InteractiveQuery`: + InteractiveQuery to execute gremlin queries. """ if self._session_id != graph.session_id: raise RuntimeError( @@ -1168,51 +1109,18 @@ def gremlin(self, graph, engine_params=None): ) ) - # Interactive query instance won't add to self._interactive_instance_dict in lazy mode. - # self._interactive_instance_dict[graph.vineyard_id] will be None if InteractiveQuery closed - if ( - self.eager() - and graph.vineyard_id in self._interactive_instance_dict - and self._interactive_instance_dict[graph.vineyard_id] is not None - ): - interactive_query = self._interactive_instance_dict[graph.vineyard_id] - if interactive_query.status == InteractiveQueryStatus.Running: - return interactive_query - if interactive_query.status == InteractiveQueryStatus.Failed: - raise InteractiveEngineInternalError(interactive_query.error_msg) - # Initializing. - # while True is ok, as the status is either running or failed eventually after timeout. 
- while True: - time.sleep(1) - if interactive_query.status == InteractiveQueryStatus.Failed: - raise InteractiveEngineInternalError(interactive_query.error_msg) - if interactive_query.status == InteractiveQueryStatus.Running: - return interactive_query - if not graph.graph_type == graph_def_pb2.ARROW_PROPERTY: raise InvalidArgumentError("The graph should be a property graph.") - if self.eager(): - if not graph.loaded(): - raise InvalidArgumentError("The graph has already been unloaded") - # cache the instance of interactive query in eager mode - interactive_query = InteractiveQuery() - self._interactive_instance_dict[graph.vineyard_id] = interactive_query + if not isinstance(graph, Graph): # Is a GraphDAGNode + graph = self.run(graph) - try: - _wrapper = self._wrapper( - InteractiveQueryDAGNode(self, graph, engine_params) - ) - except Exception as e: - if self.eager(): - interactive_query.status = InteractiveQueryStatus.Failed - interactive_query.error_msg = str(e) - raise InteractiveEngineInternalError(str(e)) from e - else: - if self.eager(): - interactive_query = _wrapper - graph._attach_interactive_instance(interactive_query) - return _wrapper + object_id = graph.vineyard_id + schema_path = graph.schema_path + endpoint = self._grpc_client.create_interactive_instance(object_id, schema_path) + interactive_query = InteractiveQuery(graph, endpoint) + self._interactive_instance_dict[object_id] = interactive_query + return interactive_query def learning(self, graph, nodes=None, edges=None, gen_labels=None): """Start a graph learning engine. @@ -1278,29 +1186,31 @@ def graphlearn(self, graph, nodes=None, edges=None, gen_labels=None): self._session_id, graph.session_id ) ) - if ( - self.eager() - and graph.vineyard_id in self._learning_instance_dict - and self._learning_instance_dict[graph.vineyard_id] is not None - ): - return self._learning_instance_dict[graph.vineyard_id] if not graph.graph_type == graph_def_pb2.ARROW_PROPERTY: raise InvalidArgumentError("The graph should be a property graph.") - if self.eager(): - if not graph.loaded(): - raise InvalidArgumentError("The graph has already been unloaded") + from graphscope.learning.graph import Graph as LearningGraph + from graphscope.learning.graph import get_gl_handle + + object_id = graph.vineyard_id + schema = graph.schema + engine_config = self._engine_config + handle = get_gl_handle(schema, object_id, self._pod_name_list, engine_config) + config = LearningGraph.preprocess_args(handle, nodes, edges, gen_labels) + config = base64.b64encode(json.dumps(config).encode("utf-8")).decode("utf-8") + handle, config, endpoints = self._grpc_client.create_learning_instance( + object_id, handle, config + ) - from graphscope.learning.graph import GraphDAGNode as LearningGraphDAGNode + handle = json.loads(base64.b64decode(handle).decode("utf-8")) + handle["server"] = ",".join(endpoints) + handle["client_count"] = 1 - _wrapper = self._wrapper( - LearningGraphDAGNode(self, graph, nodes, edges, gen_labels) - ) - if self.eager(): - self._learning_instance_dict[graph.vineyard_id] = _wrapper - graph._attach_learning_instance(_wrapper) - return _wrapper + # construct learning graph + g = LearningGraph(graph, handle, config, object_id) + self._learning_instance_dict[graph.vineyard_id] = g + return g def nx(self): if not self.eager(): @@ -1329,23 +1239,17 @@ def add_lib(self, resource_name): """ add the specified resource to the k8s cluster from client machine. 
""" - logger.info("client: adding lib {}".format(resource_name)) - if not os.path.exists(resource_name): - raise FileNotFoundError( - "resource file not found in {}.".format(resource_name) - ) + logger.info("client: adding lib %s", resource_name) if not os.path.isfile(resource_name): - raise RuntimeError( - "Provided resource {} can not be found".format(resource_name) - ) + raise RuntimeError("Resource {} can not be found".format(resource_name)) # pack into a gar file garfile = InMemoryZip() resource_reader = open(resource_name, "rb") bytes_ = resource_reader.read() if len(bytes_) <= 0: - raise KeyError("Expect a non-empty file.") + raise RuntimeError("Expect a non-empty file.") # the uploaded file may be placed in the same directory - garfile.append("{}".format(resource_name.split("/")[-1]), bytes_) + garfile.append(resource_name.split("/")[-1], bytes_) self._grpc_client.add_lib(garfile.read_bytes().getvalue()) @@ -1382,8 +1286,6 @@ def set_option(**kwargs): - with_mars - k8s_volumes - k8s_waiting_for_delete - - engine_params - - initializing_interactive_engine - timeout_seconds - dataset_download_retries @@ -1437,8 +1339,6 @@ def get_option(key): - with_mars - k8s_volumes - k8s_waiting_for_delete - - engine_params - - initializing_interactive_engine - timeout_seconds - dataset_download_retries @@ -1562,8 +1462,8 @@ def g(incoming_data=None, oid_type="int64", directed=True, generate_eid=True): return get_default_session().g(incoming_data, oid_type, directed, generate_eid) -def gremlin(graph, engine_params=None): - """Create a interactive engine and get the handler to execute the gremlin queries. +def gremlin(graph): + """Create an interactive engine and get the handler to execute the gremlin queries. See params detail in :meth:`graphscope.Session.gremlin` @@ -1581,7 +1481,7 @@ def gremlin(graph, engine_params=None): """ if _default_session_stack.is_cleared(): raise RuntimeError("No default session found.") - return get_default_session().gremlin(graph, engine_params) + return get_default_session().gremlin(graph) def graphlearn(graph, nodes=None, edges=None, gen_labels=None): diff --git a/python/graphscope/client/utils.py b/python/graphscope/client/utils.py index 1afab76cf6c6..194e0349c77b 100644 --- a/python/graphscope/client/utils.py +++ b/python/graphscope/client/utils.py @@ -265,7 +265,7 @@ def init(): ) stdout_handler.setFormatter(formatter) if gs_config.show_log: - stdout_handler.setLevel(gs_config.log_level) + stdout_handler.setLevel(gs_config.log_level.upper()) else: stdout_handler.setLevel(logging.ERROR) logger.addHandler(stdout_handler) @@ -277,7 +277,7 @@ def update(): log_level = gs_config.log_level else: log_level = logging.ERROR - logger.setLevel(log_level) + logger.setLevel(log_level.upper()) for handler in logger.handlers: handler.setLevel(log_level) diff --git a/python/graphscope/config.py b/python/graphscope/config.py index 3ff67ed640da..3c1f6abf1eff 100644 --- a/python/graphscope/config.py +++ b/python/graphscope/config.py @@ -22,12 +22,14 @@ from graphscope.version import __is_prerelease__ from graphscope.version import __version__ +registry = "registry.cn-hongkong.aliyuncs.com" + class GSConfig(object): - # the endpoint of a pre-launched GraphScope instance. + # the coordinator endpoint of a pre-launched GraphScope instance. 
addr = None - # "lazy" or "eager", defaults to "eager" + # "eager" or "lazy", defaults to "eager" mode = "eager" # "k8s" or "hosts" @@ -35,11 +37,18 @@ class GSConfig(object): k8s_namespace = None - # image + # etcd image k8s_etcd_image = "quay.io/coreos/etcd:v3.4.13" - k8s_gs_image = ( - f"registry.cn-hongkong.aliyuncs.com/graphscope/graphscope:{__version__}" - ) + + # All in one image + k8s_gs_image = f"{registry}/graphscope/graphscope:{__version__}" + + # Coordinator image + # Also could be used as a client image + k8s_coordinator_image = f"{registry}/graphscope/coordinator:{__version__}" + + # Dataset image + k8s_dataset_image = f"{registry}/graphscope/dataset:{__version__}" # image pull configuration k8s_image_pull_policy = "IfNotPresent" @@ -93,14 +102,6 @@ class GSConfig(object): show_log = False log_level = "INFO" - # GIE engine params - engine_params = None - - # GIE instance will be created automatically when a property graph loaded. - # Otherwise, you should create a GIE instance manually by `sess.gremlin` if - # `initializing_interactive_engine` is False - initializing_interactive_engine = False - timeout_seconds = 600 # kill GraphScope instance after seconds of client disconnect @@ -109,9 +110,6 @@ class GSConfig(object): # Demo dataset related mount_dataset = None - k8s_dataset_image = ( - f"registry.cn-hongkong.aliyuncs.com/graphscope/dataset:{__version__}" - ) # download_retries dataset_download_retries = 3 diff --git a/python/graphscope/deploy/hosts/cluster.py b/python/graphscope/deploy/hosts/cluster.py index b571719fd39b..86c6f5772a05 100644 --- a/python/graphscope/deploy/hosts/cluster.py +++ b/python/graphscope/deploy/hosts/cluster.py @@ -62,7 +62,7 @@ def __init__( vineyard_shared_mem=None, **kwargs ): - self._hosts = hosts + self._hosts: [str] = hosts self._port = port self._etcd_addrs = etcd_addrs self._etcd_listening_client_port = etcd_listening_client_port @@ -194,7 +194,6 @@ def start(self): """ try: self._launch_coordinator() - self._closed = False logger.info( "Coordinator service started successful, connecting to service..." 
) @@ -207,9 +206,7 @@ def start(self): def stop(self): """Stop GraphScope instance.""" # coordinator's GRPCServer.wait_for_termination works for SIGINT (Ctrl-C) - if not self._closed: - if self._proc is not None: - self._proc.send_signal(signal.SIGINT) - self._proc.wait(timeout=10) - self._proc = None - self._closed = True + if self._proc is not None: + self._proc.send_signal(signal.SIGINT) + self._proc.wait(timeout=10) + self._proc = None diff --git a/python/graphscope/deploy/kubernetes/cluster.py b/python/graphscope/deploy/kubernetes/cluster.py index 8d83b8cb0525..530f3ee4caa4 100644 --- a/python/graphscope/deploy/kubernetes/cluster.py +++ b/python/graphscope/deploy/kubernetes/cluster.py @@ -174,19 +174,15 @@ def type(self): def _get_free_namespace(self): while True: namespace = "gs-" + random_string(6) - try: - self._core_api.read_namespace(namespace) - except K8SApiException as e: - if e.status != 404: - raise RuntimeError(str(e)) + if not self._namespace_exist(namespace): return namespace def _namespace_exist(self, namespace): try: self._core_api.read_namespace(namespace) except K8SApiException as e: - if e.status != 404: - raise RuntimeError(str(e)) + if e.status != 404: # Not found + raise return False return True @@ -195,7 +191,7 @@ def _role_exist(self, namespace, role): self._rbac_api.read_namespaced_role(name=role, namespace=namespace) except K8SApiException as e: if e.status != 404: - raise RuntimeError(str(e)) + raise return False return True @@ -204,7 +200,7 @@ def _cluster_role_exist(self, cluster_role): self._rbac_api.read_cluster_role(name=cluster_role) except K8SApiException as e: if e.status != 404: - raise RuntimeError(str(e)) + raise return False return True @@ -215,7 +211,7 @@ def _role_binding_exist(self, namespace, role_binding): ) except K8SApiException as e: if e.status != 404: - raise RuntimeError(str(e)) + raise return False return True @@ -224,7 +220,7 @@ def _cluster_role_binding_exist(self, cluster_role_binding): self._rbac_api.read_cluster_role_binding(name=cluster_role_binding) except K8SApiException as e: if e.status != 404: - raise RuntimeError(str(e)) + raise return False return True diff --git a/python/graphscope/deploy/kubernetes/resource_builder.py b/python/graphscope/deploy/kubernetes/resource_builder.py index dcac457dd299..5783d0c4116f 100644 --- a/python/graphscope/deploy/kubernetes/resource_builder.py +++ b/python/graphscope/deploy/kubernetes/resource_builder.py @@ -1203,77 +1203,6 @@ def add_etcd_pod_node_selector(self, node_selector): self._node_selector[k] = v -class GSGraphManagerBuilder(DeploymentBuilder): - """Builder for graphscope interactive graph manager.""" - - _manager_requests_cpu = 0.2 - _manager_requests_mem = "256Mi" - - def __init__(self, name, labels, image_pull_policy, replicas=1): - self._name = name - self._labels = labels - self._replicas = replicas - self._image_pull_policy = image_pull_policy - super().__init__( - self._name, self._labels, self._replicas, self._image_pull_policy - ) - - def add_manager_container(self, name, image, cpu, mem, preemptive, **kwargs): - cmd = kwargs.pop("cmd", None) - args = kwargs.pop("args", None) - - resources_dict = { - "requests": ResourceBuilder( - self._manager_requests_cpu, self._manager_requests_mem - ).build() - if preemptive - else ResourceBuilder(cpu, mem).build(), - "limits": ResourceBuilder(cpu, mem).build(), - } - - volumeMounts = [] - for vol in self._volumes: - for vol_mount in vol.build_mount(): - volumeMounts.append(vol_mount) - - pre_stop_command = ["kill", "-TERM", "`lsof 
-i:8080 -t`"] - lifecycle_dict = _remove_nones( - { - "preStop": { - "exec": {"command": pre_stop_command}, - } - if pre_stop_command - else None, - } - ) - - ports = kwargs.pop("ports", None) - if ports is not None and not isinstance(ports, list): - ports = [ports] - - super().add_container( - _remove_nones( - { - "command": cmd, - "args": args, - "env": [env.build() for env in self._envs.values()] or None, - "image": image, - "name": name, - "imagePullPolicy": self._image_pull_policy, - "resources": dict((k, v) for k, v in resources_dict.items() if v) - or None, - "ports": [PortBuilder(port).build() for port in ports] - if ports - else None, - "volumeMounts": volumeMounts or None, - "livenessProbe": None, - "readinessProbe": None, - "lifecycle": lifecycle_dict or None, - } - ) - ) - - class GSCoordinatorBuilder(DeploymentBuilder): """Builder for graphscope coordinator.""" diff --git a/python/graphscope/deploy/kubernetes/utils.py b/python/graphscope/deploy/kubernetes/utils.py index 23f62f079a6b..1ca62611d045 100644 --- a/python/graphscope/deploy/kubernetes/utils.py +++ b/python/graphscope/deploy/kubernetes/utils.py @@ -34,7 +34,7 @@ def resolve_api_client(k8s_client_config={}): - """The order of resolves are as following. + """Get ApiClient from predefined locations. Args: k8s_client_config (dict): @@ -57,7 +57,7 @@ def resolve_api_client(k8s_client_config={}): 1. load from kubernetes config file or, 2. load from incluster configuration or, 3. set api address from env if `KUBE_API_ADDRESS` exist. - 4. RuntimeError will be raised if resolve failed. + RuntimeError will be raised if resolution failed. """ try: # load from kubernetes config file @@ -83,11 +83,9 @@ def parse_readable_memory(value): try: float(num) except ValueError as e: - raise ValueError( - "Argument cannot be interpreted as a number: %s" % value - ) from e + raise ValueError(f"Argument cannot be interpreted as a number: {value}") from e if suffix not in ["Ki", "Mi", "Gi"]: - raise ValueError("Memory suffix must be one of 'Ki', 'Mi' and 'Gi': %s" % value) + raise ValueError(f"Memory suffix must be one of 'Ki', 'Mi' and 'Gi': {value}") return value @@ -175,13 +173,13 @@ def _stream_event_impl(self, simple=False): pass else: for event in events.items: - msg = "{0}: {1}".format(self._pod_name, event.message) + msg = f"{self._pod_name}: {event.message}" if msg and msg not in event_messages: event_messages.append(msg) self._lines.put(msg) logger.info(msg, extra={"simple": simple}) if event.reason == "Failed": - raise K8sError("Kubernetes event error: {}".format(msg)) + raise K8sError(f"Kubernetes event error: {msg}") def _stream_log_impl(self, simple=False): log_messages = [] diff --git a/python/graphscope/framework/dag.py b/python/graphscope/framework/dag.py index eb64c55bb8cc..07f24d9151c3 100644 --- a/python/graphscope/framework/dag.py +++ b/python/graphscope/framework/dag.py @@ -84,7 +84,7 @@ def extract_subdag_for(self, ops): # assert op is not present in current dag for op in ops: assert op.key in self._ops_by_key, "%s is not in the dag" % op.key - assert not self._ops_by_key[op.key].evaluated, "%is is evaluated" % op.key + assert not self._ops_by_key[op.key].evaluated, "%s is evaluated" % op.key op_keys.append(op.key) op_keys_to_keep = self._bfs_for_reachable_ops(op_keys) op_keys_to_keep = sorted(op_keys_to_keep, key=lambda n: self._ops_seq_by_key[n]) diff --git a/python/graphscope/framework/dag_utils.py b/python/graphscope/framework/dag_utils.py index c7a4182c6ecc..faa12a8d6881 100644 --- 
a/python/graphscope/framework/dag_utils.py +++ b/python/graphscope/framework/dag_utils.py @@ -949,102 +949,6 @@ def graph_to_dataframe(graph, selector=None, vertex_range=None): return op -def create_interactive_query(graph, engine_params): - """Create a interactive engine that query on the :code:`graph` - - Args: - graph (:class:`graphscope.framework.graph.GraphDAGNode`): - Source property graph. - engine_params (dict, optional): - Configuration to startup the interactive engine. See detail in: - `interactive_engine/deploy/docker/dockerfile/executor.vineyard.properties` - - Returns: - An op to create a interactive engine based on a graph. - """ - config = {} - if engine_params is not None: - config[types_pb2.GIE_GREMLIN_ENGINE_PARAMS] = utils.s_to_attr( - json.dumps(engine_params) - ) - op = Operation( - graph.session_id, - types_pb2.CREATE_INTERACTIVE_QUERY, - config=config, - inputs=[graph.op], - output_types=types_pb2.INTERACTIVE_QUERY, - ) - return op - - -def create_learning_instance(graph, nodes=None, edges=None, gen_labels=None): - """Create an engine for graph learning. - - Args: - graph (:class:`graphscope.framework.graph.GraphDAGNode`): - Source property graph. - nodes (list): The node types that will be used for gnn training. - edges (list): The edge types that will be used for gnn training. - gen_labels (list): Extra node and edge labels on original graph for gnn training. - - Returns: - An op to create a learning engine based on a graph. - """ - config = {} - # pickle None is expected - config[types_pb2.NODES] = utils.bytes_to_attr(pickle.dumps(nodes)) - config[types_pb2.EDGES] = utils.bytes_to_attr(pickle.dumps(edges)) - config[types_pb2.GLE_GEN_LABELS] = utils.bytes_to_attr(pickle.dumps(gen_labels)) - op = Operation( - graph.session_id, - types_pb2.CREATE_LEARNING_INSTANCE, - config=config, - inputs=[graph.op], - output_types=types_pb2.LEARNING_GRAPH, - ) - return op - - -def close_interactive_query(interactive_query): - """Close the interactive instance. - - Args: - interactive_query (:class:`graphscope.interactive.query.InteractiveQueryDAGNode`): - The GIE instance holds the graph that gremlin query on. - Returns: - An op to close the instance. - """ - config = {} - op = Operation( - interactive_query.session_id, - types_pb2.CLOSE_INTERACTIVE_QUERY, - config=config, - inputs=[interactive_query.op], - output_types=types_pb2.NULL_OUTPUT, - ) - return op - - -def close_learning_instance(learning_instance): - """Close the learning instance. - - Args: - learning_instance (:class:`graphscope.learning.graph.GraphDAGNode`): - The learning instance. - Returns: - An op to close the instance. - """ - config = {} - op = Operation( - learning_instance.session_id, - types_pb2.CLOSE_LEARNING_INSTANCE, - config=config, - inputs=[learning_instance.op], - output_types=types_pb2.NULL_OUTPUT, - ) - return op - - def gremlin_query(interactive_query, query, request_options=None): """Execute a gremlin query. 
@@ -1063,6 +967,7 @@ def gremlin_query(interactive_query, query, request_options=None): """ config = {} config[types_pb2.GIE_GREMLIN_QUERY_MESSAGE] = utils.s_to_attr(query) + config[types_pb2.VINEYARD_ID] = utils.i_to_attr(interactive_query.object_id) if request_options: config[types_pb2.GIE_GREMLIN_REQUEST_OPTIONS] = utils.s_to_attr( json.dumps(request_options) @@ -1071,7 +976,6 @@ def gremlin_query(interactive_query, query, request_options=None): interactive_query.session_id, types_pb2.GREMLIN_QUERY, config=config, - inputs=[interactive_query.op], output_types=types_pb2.GREMLIN_RESULTS, ) return op @@ -1100,6 +1004,7 @@ def gremlin_to_subgraph( config = {} config[types_pb2.GIE_GREMLIN_QUERY_MESSAGE] = utils.s_to_attr(gremlin_script) config[types_pb2.OID_TYPE] = utils.s_to_attr(oid_type) + config[types_pb2.VINEYARD_ID] = utils.i_to_attr(interactive_query.object_id) if request_options: config[types_pb2.GIE_GREMLIN_REQUEST_OPTIONS] = utils.s_to_attr( json.dumps(request_options) @@ -1108,7 +1013,6 @@ def gremlin_to_subgraph( interactive_query.session_id, types_pb2.SUBGRAPH, config=config, - inputs=[interactive_query.op], output_types=types_pb2.GRAPH, ) return op diff --git a/python/graphscope/framework/graph.py b/python/graphscope/framework/graph.py index 2be6e95b0008..7b7b1396eceb 100644 --- a/python/graphscope/framework/graph.py +++ b/python/graphscope/framework/graph.py @@ -19,7 +19,6 @@ import hashlib import json import logging -import threading import warnings from abc import ABCMeta from abc import abstractmethod @@ -33,7 +32,6 @@ except ImportError: vineyard = None -from graphscope.config import GSConfig as gs_config from graphscope.framework import dag_utils from graphscope.framework import utils from graphscope.framework.dag import DAGNode @@ -312,10 +310,6 @@ def _project_to_simple(self, v_prop=None, e_prop=None): return graph_dag_node def _resolve_op(self, incoming_data): - # Don't import the :code:`NXGraph` in top-level statements to improve the - # performance of :code:`import graphscope`. - from graphscope import nx - if incoming_data is None: # create dag node of empty graph self._op = self._construct_op_of_empty_graph() @@ -323,8 +317,6 @@ def _resolve_op(self, incoming_data): self._op = incoming_data if self._op.type == types_pb2.PROJECT_TO_SIMPLE: self._graph_type = graph_def_pb2.ARROW_PROJECTED - elif isinstance(incoming_data, nx.classes.graph._GraphBase): - self._op = self._from_nx_graph(incoming_data) elif isinstance(incoming_data, Graph): self._op = dag_utils.copy_graph(incoming_data) self._graph_type = incoming_data.graph_type @@ -337,7 +329,14 @@ def _resolve_op(self, incoming_data): ): self._op = self._from_vineyard(incoming_data) else: - raise RuntimeError("Not supported incoming data.") + # Don't import the :code:`NXGraph` in top-level statements to improve the + # performance of :code:`import graphscope`. + from graphscope import nx + + if isinstance(incoming_data, nx.classes.graph._GraphBase): + self._op = self._from_nx_graph(incoming_data) + else: + raise RuntimeError("Not supported incoming data.") def to_numpy(self, selector, vertex_range=None): """Select some elements of the graph and output to numpy. @@ -711,30 +710,6 @@ def __init__( self._schema = GraphSchema() self._detached = False - self._interactive_instance_launching_thread = None - self._interactive_instance_list = [] - self._learning_instance_list = [] - - def _close_interactive_instances(self): - # Close related interactive instances when graph unloaded. 
- # Since the graph is gone, quering via interactive client is meaningless. - for instance in self._interactive_instance_list: - instance.close() - self._interactive_instance_list.clear() - - def _close_learning_instances(self): - for instance in self._learning_instance_list: - instance.close() - self._learning_instance_list.clear() - - def _launch_interactive_instance_impl(self): - try: - self._session.gremlin(self) - except: # noqa: E722 - # Record error msg in `InteractiveQuery` when launching failed. - # Unexpect and suppress all exceptions here. - pass - def update_from_graph_def(self, graph_def): if graph_def.graph_type == graph_def_pb2.ARROW_FLATTENED: self._graph_node._graph_type = graph_def_pb2.ARROW_FLATTENED @@ -760,12 +735,6 @@ def update_from_graph_def(self, graph_def): self._e_relationships = self._schema.edge_relationships # init saved_signature (must be after init schema) self._saved_signature = self.signature - # create gremlin server pod asynchronously - if self._session.eager() and gs_config.initializing_interactive_engine: - self._interactive_instance_launching_thread = threading.Thread( - target=self._launch_interactive_instance_impl, args=() - ) - self._interactive_instance_launching_thread.start() def __getattr__(self, name): if hasattr(self._graph_node, name): @@ -878,28 +847,6 @@ def _unload(self): """Unload this graph from graphscope engine.""" if self._session.info["status"] != "active" or self._key is None: return - - # close interactive instances first - try: - if ( - self._interactive_instance_launching_thread is not None - and self._interactive_instance_launching_thread.is_alive() - ): - # join raises a RuntimeError if an attempt is made to join the current thread. - # this exception occurs when a object collected by gc mechanism contains a running thread. - if ( - threading.current_thread() - != self._interactive_instance_launching_thread - ): - self._interactive_instance_launching_thread.join() - self._close_interactive_instances() - except Exception as e: - logger.error("Failed to close interactive instances: %s" % e) - try: - self._close_learning_instances() - except Exception as e: - logger.error("Failed to close learning instances: %s" % e) - rlt = None if not self._detached: rlt = self._session._wrapper(self._graph_node._unload()) self._key = None @@ -984,22 +931,6 @@ def _check_unmodified(self): self.signature == self._saved_signature, "Graph has been modified!" ) - def _attach_interactive_instance(self, instance): - """Store the instance when a new interactive instance is started. - - Args: - instance: interactive instance - """ - self._interactive_instance_list.append(instance) - - def _attach_learning_instance(self, instance): - """Store the instance when a new learning instance is created. - - Args: - instance: learning instance - """ - self._learning_instance_list.append(instance) - def save_to(self, path, **kwargs): """Serialize graph to a location. 
The meta and data of graph is dumped to specified location, diff --git a/python/graphscope/interactive/query.py b/python/graphscope/interactive/query.py index 2229f7d97b09..9dc0c9d60658 100644 --- a/python/graphscope/interactive/query.py +++ b/python/graphscope/interactive/query.py @@ -24,8 +24,6 @@ from gremlin_python.process.anonymous_traversal import traversal from graphscope.framework.dag import DAGNode -from graphscope.framework.dag_utils import close_interactive_query -from graphscope.framework.dag_utils import create_interactive_query from graphscope.framework.dag_utils import fetch_gremlin_result from graphscope.framework.dag_utils import gremlin_query from graphscope.framework.dag_utils import gremlin_to_subgraph @@ -34,7 +32,7 @@ class InteractiveQueryStatus(Enum): - """A enumeration class of current status of InteractiveQuery""" + """Enumeration class of current status of InteractiveQuery""" Initializing = 0 Running = 1 @@ -49,8 +47,8 @@ class ResultSetDAGNode(DAGNode): and you can get the result by :method:`one()` or :method:`all()`. """ - def __init__(self, dag_node, op): - self._session = dag_node.session + def __init__(self, interactive, op): + self._session = interactive.session self._op = op # add op to dag self._session.dag.add_op(self._op) @@ -91,47 +89,47 @@ def all(self): return self._session._wrapper(self._result_set_node.all()) -class InteractiveQueryDAGNode(DAGNode): - """A class represents an interactive query node in a DAG. - - The following example demonstrates its usage: - - .. code:: python - - >>> # lazy node - >>> import graphscope as gs - >>> sess = gs.session(mode="lazy") - >>> g = sess.g() # - >>> ineractive = sess.gremlin(g) - >>> print(ineractive) # - >>> rs = ineractive.execute("g.V()") - >>> print(rs) # - >>> r = rs.one() - >>> print(r) # - >>> print(sess.run(r)) - [2] - >>> subgraph = ineractive.subgraph("xxx") - >>> print(subgraph) # - >>> g2 = sess.run(subgraph) - >>> print(g2) # +class InteractiveQuery(object): + """`InteractiveQuery` class, is a simple wrapper around + `Gremlin-Python `_, + which implements Gremlin within the Python language. + It also can expose gremlin endpoint which can be used by + any other standard gremlin console, with the method `graph_url()`. + + It also has a method called `subgraph` which can extract some fragments + from origin graph, produce a new, smaller but concise graph stored in vineyard, + which lifetime is independently of the origin graph. + + User can either use `execute()` to submit a script, or use `traversal_source()` + to get a `GraphTraversalSource` for further traversal. """ - def __init__(self, session, graph, engine_params=None): - """ - Args: - session (:class:`Session`): instance of GraphScope session. - graph (:class:`graphscope.framework.graph.GraphDAGNode`): - A graph instance that the gremlin query on. - engine_params (dict, optional): - Configuration to startup the interactive engine. 
See detail in: - `interactive_engine/deploy/docker/dockerfile/executor.vineyard.properties` - """ - self._session = session + def __init__(self, graph, frontend_endpoint): + """Construct a :class:`InteractiveQuery` object.""" + self._conn = None + # graph object id stored in vineyard self._graph = graph - self._engine_params = engine_params - self._op = create_interactive_query(self._graph, self._engine_params) - # add op to dag - self._session.dag.add_op(self._op) + self._session = graph._session + frontend_endpoint = frontend_endpoint.split(",") + self._graph_url = [f"ws://{endpoint}/gremlin" for endpoint in frontend_endpoint] + self.closed = False + + @property + def graph_url(self): + """The gremlin graph url can be used with any standard gremlin console, e.g., tinkerpop.""" + return self._graph_url + + @property + def object_id(self): + return self._graph.vineyard_id + + @property + def session(self): + return self._session + + @property + def session_id(self): + return self._session.session_id def execute(self, query, request_options=None): """Execute gremlin querying scripts. @@ -148,7 +146,10 @@ def execute(self, query, request_options=None): A result holds the gremlin result, evaluated in eager mode. """ op = gremlin_query(self, query, request_options) - return ResultSetDAGNode(self, op) + return self._session._wrapper(ResultSetDAGNode(self, op)) + + def submit(self, query, request_options=None): + return self.execute(query, request_options) def subgraph(self, gremlin_script, request_options=None): """Create a subgraph, which input is the result of the execution of `gremlin_script`. @@ -175,105 +176,7 @@ def subgraph(self, gremlin_script, request_options=None): request_options=request_options, oid_type=self._graph._oid_type, ) - return GraphDAGNode(self._session, op) - - def close(self): - """Close interactive engine and release the resources. - - Returns: - :class:`graphscope.interactive.query.ClosedInteractiveQuery` - Evaluated in eager mode. - """ - op = close_interactive_query(self) - return ClosedInteractiveQuery(self._session, op) - - -class InteractiveQuery(object): - """`InteractiveQuery` class, is a simple wrapper around - `Gremlin-Python `_, - which implements Gremlin within the Python language. - It also can expose gremlin endpoint which can be used by - any other standard gremlin console, with the method `graph_url()`. - - It also has a method called `subgraph` which can extract some fragments - from origin graph, produce a new, smaller but concise graph stored in vineyard, - which lifetime is independent from the origin graph. - - User can either use `execute()` to submit a script, or use `traversal_source()` - to get a `GraphTraversalSource` for further traversal. 
- """ - - def __init__( - self, interactive_query_node=None, frontend_endpoint=None, object_id=None - ): - """Construct a :class:`InteractiveQuery` object.""" - - self._status = InteractiveQueryStatus.Initializing - self._graph_url = None - self._conn = None - # graph object id stored in vineyard - self._object_id = object_id - # interactive_query_node is None used for create a interative query - # implicitly in eager mode - if interactive_query_node is not None: - self._interactive_query_node = interactive_query_node - self._session = self._interactive_query_node.session - # copy and set op evaluated - self._interactive_query_node.op = deepcopy(self._interactive_query_node.op) - self._interactive_query_node.evaluated = True - self._session.dag.add_op(self._interactive_query_node.op) - if frontend_endpoint is not None: - frontend_endpoint = frontend_endpoint.split(",") - self._graph_url = [ - f"ws://{endpoint}/gremlin" for endpoint in frontend_endpoint - ] - - @property - def graph_url(self): - """The gremlin graph url can be used with any standard gremlin console, e.g., tinkerpop.""" - return self._graph_url - - @property - def status(self): - return self._status - - @property - def object_id(self): - return self._object_id - - @status.setter - def status(self, value): - self._status = value - - @property - def error_msg(self): - return self._error_msg - - @error_msg.setter - def error_msg(self, error_msg): - self._error_msg = error_msg - - def closed(self): - """Return if the current instance is closed.""" - return self._status == InteractiveQueryStatus.Closed - - def subgraph(self, gremlin_script, request_options=None): - if self._status != InteractiveQueryStatus.Running: - raise RuntimeError( - "Interactive query is unavailable with %s status.", str(self._status) - ) - return self._session._wrapper( - self._interactive_query_node.subgraph(gremlin_script, request_options) - ) - - def execute(self, query, request_options=None): - if self._status != InteractiveQueryStatus.Running: - raise RuntimeError( - "Interactive query is unavailable with %s status.", str(self._status) - ) - return self._session._wrapper( - self._interactive_query_node.execute(query, request_options) - ) + return self._session._wrapper(GraphDAGNode(self._session, op)) def traversal_source(self): """Create a GraphTraversalSource and return. 
@@ -297,35 +200,18 @@ def traversal_source(self): Returns: `GraphTraversalSource` """ - if self._status != InteractiveQueryStatus.Running: - raise RuntimeError( - "Interactive query is unavailable with %s status.", str(self._status) - ) if self._conn is None: self._conn = DriverRemoteConnection(self._graph_url[0], "g") return traversal().withRemote(self._conn) def close(self): + if self.closed: + return """Close interactive instance and release resources""" - if not self.closed(): - if self._conn is not None: - try: - self._conn.close() - except Exception: - pass # be silent when closing - self._conn = None - - if not self._session.closed: - self._session._wrapper(self._interactive_query_node.close()) - self._session._close_interactive_instance(self) - self._status = InteractiveQueryStatus.Closed - - -class ClosedInteractiveQuery(DAGNode): - """Closed interactive query node in a DAG.""" - - def __init__(self, session, op): - self._session = session - self._op = op - # add op to dag - self._session.dag.add_op(self._op) + if self._conn is not None: + try: + self._conn.close() + except: # noqa: E722 + pass + self._session._close_interactive_instance(self) + self.closed = True diff --git a/python/graphscope/learning/graph.py b/python/graphscope/learning/graph.py index c08e7c388011..480b9291ed2f 100644 --- a/python/graphscope/learning/graph.py +++ b/python/graphscope/learning/graph.py @@ -26,59 +26,16 @@ except ImportError: GLGraph = object -from graphscope.framework.dag import DAGNode -from graphscope.framework.dag_utils import close_learning_instance -from graphscope.framework.dag_utils import create_learning_instance from graphscope.framework.errors import InvalidArgumentError from graphscope.framework.errors import check_argument - - -class GraphDAGNode(DAGNode): - """A class represents a learning instance in a DAG. - - The following example demonstrates its usage: - - .. code:: python - - >>> # lazy mode - >>> import graphscope as gs - >>> sess = gs.session(mode="lazy") - >>> g = sess.g() # - >>> lg = sess.graphlearn(g) - >>> print(lg) # - >>> lg_graph = sess.run(lg) - >>> print(lg) # - """ - - def __init__(self, session, graph, nodes=None, edges=None, gen_labels=None): - """ - See params detail in :meth:`graphscope.Session.graphlearn` - """ - self._session = session - self._graph = graph - self._op = create_learning_instance(self._graph, nodes, edges, gen_labels) - # add op to dag - self._session.dag.add_op(self._op) - - def close(self): - """Close learning instance and release the resources. 
- - Returns: - :class:`graphscope.learning.graph.ClosedLearningInstance` - """ - op = close_learning_instance(self) - return ClosedLearningInstance(self._session, op) +from graphscope.proto import graph_def_pb2 class Graph(GLGraph): - def __init__(self, graph_node, handle, config=None, object_id=None): + def __init__(self, graph, handle, config=None, object_id=None): """Initialize a graph for the learning engine using a handle.""" - self.graph_node = graph_node - self.graphscope_session = self.graph_node.session - # copy and set op evaluated - self.graph_node.op = deepcopy(self.graph_node.op) - self.graph_node.evaluated = True - self.graphscope_session.dag.add_op(self.graph_node.op) + self.graph = graph + self.graphscope_session = self.graph._session handle = self.decode_arg(handle) config = self.decode_arg(config) @@ -132,9 +89,7 @@ def close(self): self.closed = True super(Graph, self).close() # close client first # close server instance - if self.graphscope_session is not None: - self.graphscope_session._wrapper(self.graph_node.close()) - self.graphscope_session._close_learning_instance(self) + self.graphscope_session._close_learning_instance(self) @staticmethod # noqa: C901 def preprocess_args(handle, nodes, edges, gen_labels): # noqa: C901 @@ -322,11 +277,119 @@ def E(self, edge_type, feed=None, reverse=False): return super(Graph, self).E(edge_type, feed, reverse) -class ClosedLearningInstance(DAGNode): - """Closed learning instance node in a DAG.""" +def get_gl_handle(schema, vineyard_id, engine_hosts, engine_config): + """Dump a handler for GraphLearn for interaction. + + Fields in :code:`schema` are: + + + the name of node type or edge type + + whether the graph is weighted graph + + whether the graph is labeled graph + + the number of int attributes + + the number of float attributes + + the number of string attributes - def __init__(self, session, op): - self._session = session - self._op = op - # add op to dag - self._session.dag.add_op(self._op) + An example of the graph handle: + + .. code:: python + + { + "server": "127.0.0.1:8888,127.0.0.1:8889", + "client_count": 1, + "vineyard_socket": "/var/run/vineyard.sock", + "vineyard_id": 13278328736, + "node_schema": [ + "user:false:false:10:0:0", + "item:true:false:0:0:5" + ], + "edge_schema": [ + "user:click:item:true:false:0:0:0", + "user:buy:item:true:true:0:0:0", + "item:similar:item:false:false:10:0:0" + ], + "node_attribute_types": { + "person": { + "age": "i", + "name": "s", + }, + }, + "edge_attribute_types": { + "knows": { + "weight": "f", + }, + }, + } + + The handle can be decoded using: + + .. code:: python + + base64.b64decode(handle.encode('ascii')).decode('ascii') + + Note that the ports are selected from a range :code:`(8000, 9000)`. + + Args: + schema: The graph schema. + vineyard_id: The object id of graph stored in vineyard. + engine_hosts: A list of hosts for GraphScope engine workers. + engine_config: dict of config for GAE engine. 
+ + Returns: + str: Base64 encoded handle + + """ + + def group_property_types(props): + weighted, labeled, i, f, s, attr_types = "false", "false", 0, 0, 0, {} + for prop in props: + if prop.type in [graph_def_pb2.STRING]: + s += 1 + attr_types[prop.name] = "s" + elif prop.type in (graph_def_pb2.FLOAT, graph_def_pb2.DOUBLE): + f += 1 + attr_types[prop.name] = "f" + else: + i += 1 + attr_types[prop.name] = "i" + if prop.name == "weight": + weighted = "true" + elif prop.name == "label": + labeled = "true" + return weighted, labeled, i, f, s, attr_types + + node_schema, node_attribute_types = [], dict() + for label in schema.vertex_labels: + weighted, labeled, i, f, s, attr_types = group_property_types( + schema.get_vertex_properties(label) + ) + node_schema.append( + "{}:{}:{}:{}:{}:{}".format(label, weighted, labeled, i, f, s) + ) + node_attribute_types[label] = attr_types + + edge_schema, edge_attribute_types = [], dict() + for label in schema.edge_labels: + weighted, labeled, i, f, s, attr_types = group_property_types( + schema.get_edge_properties(label) + ) + for rel in schema.get_relationships(label): + edge_schema.append( + "{}:{}:{}:{}:{}:{}:{}:{}".format( + rel[0], label, rel[1], weighted, labeled, i, f, s + ) + ) + edge_attribute_types[label] = attr_types + + engine_hosts = ",".join(engine_hosts) + handle = { + "hosts": engine_hosts, + "client_count": 1, + "vineyard_id": vineyard_id, + "vineyard_socket": engine_config["vineyard_socket"], + "node_schema": node_schema, + "edge_schema": edge_schema, + "node_attribute_types": node_attribute_types, + "edge_attribute_types": edge_attribute_types, + } + handle_json_string = json.dumps(handle) + return base64.b64encode(handle_json_string.encode("utf-8")).decode("utf-8") diff --git a/python/graphscope/nx/conftest.py b/python/graphscope/nx/conftest.py index d2f351055169..f7439fa26e38 100644 --- a/python/graphscope/nx/conftest.py +++ b/python/graphscope/nx/conftest.py @@ -26,8 +26,6 @@ @pytest.fixture(scope="module") def graphscope_session(): graphscope.set_option(show_log=True) - graphscope.set_option(initializing_interactive_engine=False) - if os.environ.get("DEPLOYMENT", None) == "standalone": sess = graphscope.session(cluster_type="hosts", num_workers=1) else: diff --git a/python/graphscope/proto/coordinator_service.proto b/python/graphscope/proto/coordinator_service.proto index 02f4ed4229f7..cfe917bed62c 100644 --- a/python/graphscope/proto/coordinator_service.proto +++ b/python/graphscope/proto/coordinator_service.proto @@ -36,4 +36,16 @@ service CoordinatorService { // Distribute the specified libary to servers rpc AddLib(AddLibRequest) returns (AddLibResponse); + + rpc CreateAnalyticalInstance (CreateAnalyticalInstanceRequest) returns (CreateAnalyticalInstanceResponse); + + rpc CreateInteractiveInstance (CreateInteractiveInstanceRequest) returns (CreateInteractiveInstanceResponse); + + rpc CreateLearningInstance (CreateLearningInstanceRequest) returns (CreateLearningInstanceResponse); + + rpc CloseAnalyticalInstance (CloseAnalyticalInstanceRequest) returns (CloseAnalyticalInstanceResponse); + + rpc CloseInteractiveInstance (CloseInteractiveInstanceRequest) returns (CloseInteractiveInstanceResponse); + + rpc CloseLearningInstance (CloseLearningInstanceRequest) returns (CloseLearningInstanceResponse); } diff --git a/python/graphscope/proto/message.proto b/python/graphscope/proto/message.proto index 00622ce1c3c9..b49f0e93515e 100644 --- a/python/graphscope/proto/message.proto +++ b/python/graphscope/proto/message.proto @@ -50,8 +50,6 @@ 
message ConnectSessionResponse { // session handle to close the session. string session_id = 2; ClusterType cluster_type = 3; - string engine_config = 4; - repeated string pod_name_list = 5; int32 num_workers = 6; string namespace = 7; } @@ -63,6 +61,7 @@ message ConnectSessionResponse { //////////////////////////////////////////////////////////////////////////////// message HeartBeatRequest { + string session_id = 1; } message HeartBeatResponse { @@ -166,4 +165,72 @@ message AddLibRequest{ } message AddLibResponse{ -} \ No newline at end of file +} + +message CreateAnalyticalInstanceRequest { + string session_id = 1; +}; + +message CreateAnalyticalInstanceResponse { + string instance_id = 1; + string engine_config = 2; + repeated string host_names = 5; +}; + +message CreateInteractiveInstanceRequest { + string session_id = 1; + int64 object_id = 2; + string schema_path = 3; +}; + +message CreateInteractiveInstanceResponse { + string gremlin_endpoint = 1; + int64 object_id = 2; +}; + +message CreateLearningInstanceRequest { + string session_id = 1; + int64 object_id = 2; + string handle = 3; + string config = 4; +}; + +message CreateLearningInstanceResponse { + int64 object_id = 1; + string handle = 2; + string config = 3; + repeated string endpoints = 4; +}; + +//////////////////////////////////////////////////////////////////////////////// +// +// Close Instance request/response protos. +// +//////////////////////////////////////////////////////////////////////////////// + +message CloseAnalyticalInstanceRequest { + string session_id = 1; + string instance_id = 2; +}; + +message CloseAnalyticalInstanceResponse { + +}; + +message CloseInteractiveInstanceRequest { + string session_id = 1; + int64 object_id = 2; +}; + +message CloseInteractiveInstanceResponse { + +}; + +message CloseLearningInstanceRequest { + string session_id = 1; + int64 object_id = 2; +}; + +message CloseLearningInstanceResponse { + +}; diff --git a/python/graphscope/proto/types.proto b/python/graphscope/proto/types.proto index 41f95e864a6d..2286405449b1 100644 --- a/python/graphscope/proto/types.proto +++ b/python/graphscope/proto/types.proto @@ -104,14 +104,10 @@ enum OperationType { INDUCE_SUBGRAPH = 22; // induce subgraph UNLOAD_CONTEXT = 23; // unload context - CREATE_INTERACTIVE_QUERY = 31; // interactive query SUBGRAPH = 32; // subgraph in interactive query GREMLIN_QUERY = 33; // queries on gremlin engine FETCH_GREMLIN_RESULT = 34; - CLOSE_INTERACTIVE_QUERY = 35; - CREATE_LEARNING_INSTANCE = 41; // learning graph - CLOSE_LEARNING_INSTANCE = 42; DATA_SOURCE = 46; // loader DATA_SINK = 47; @@ -176,7 +172,6 @@ enum ParamKey { DISTRIBUTED = 27; SCHEMA_PATH = 31; - GIE_GREMLIN_ENGINE_PARAMS = 34; GIE_GREMLIN_QUERY_MESSAGE = 35; GIE_GREMLIN_REQUEST_OPTIONS = 36; GIE_GREMLIN_FETCH_RESULT_TYPE = 37; diff --git a/python/graphscope/tests/conftest.py b/python/graphscope/tests/conftest.py index b0fd8cac3739..78f279df3500 100644 --- a/python/graphscope/tests/conftest.py +++ b/python/graphscope/tests/conftest.py @@ -34,7 +34,6 @@ @pytest.fixture(scope="module") def graphscope_session(): graphscope.set_option(show_log=True) - graphscope.set_option(initializing_interactive_engine=False) if os.environ.get("DEPLOYMENT", None) == "standalone": sess = graphscope.session(cluster_type="hosts", num_workers=1) else: diff --git a/python/graphscope/tests/minitest/test_min.py b/python/graphscope/tests/minitest/test_min.py index b6144cb1e8ca..1c2bd3af1c41 100644 --- a/python/graphscope/tests/minitest/test_min.py +++ 
b/python/graphscope/tests/minitest/test_min.py @@ -41,7 +41,6 @@ logger = logging.getLogger("graphscope") graphscope.set_option(show_log=True) -graphscope.set_option(initializing_interactive_engine=False) @pytest.fixture(scope="module") diff --git a/python/graphscope/tests/unittest/test_graph.py b/python/graphscope/tests/unittest/test_graph.py index 5bf742aa2fd7..25896ab25386 100644 --- a/python/graphscope/tests/unittest/test_graph.py +++ b/python/graphscope/tests/unittest/test_graph.py @@ -34,7 +34,6 @@ from graphscope.framework.errors import InvalidArgumentError from graphscope.framework.loader import Loader from graphscope.proto import graph_def_pb2 -from graphscope.proto import types_pb2 logger = logging.getLogger("graphscope") prefix = os.path.expandvars("${GS_TEST_DIR}") diff --git a/python/graphscope/tests/unittest/test_lazy.py b/python/graphscope/tests/unittest/test_lazy.py index 042e502c5ba2..6d1c1082b487 100644 --- a/python/graphscope/tests/unittest/test_lazy.py +++ b/python/graphscope/tests/unittest/test_lazy.py @@ -16,13 +16,7 @@ # limitations under the License. # -import importlib -import logging import os -import random -import string -import sys -import time import numpy as np import pytest @@ -32,10 +26,6 @@ graphscope.set_option(show_log=True) from graphscope.dataset import load_p2p_network -from graphscope.framework.app import AppAssets -from graphscope.framework.app import AppDAGNode -from graphscope.framework.errors import AnalyticalEngineInternalError -from graphscope.framework.errors import InvalidArgumentError from graphscope.framework.loader import Loader test_repo_dir = os.path.expandvars("${GS_TEST_DIR}") @@ -229,3 +219,11 @@ def test_simulate_eager(sess): c, {"id_col": "v.id", "data_col": "v.data", "result_col": "r"} ) g2 = sess.run(g2_node) + + +def test_across_engine(sess): + g_node = load_p2p_network(sess) + interactive = sess.gremlin(g_node) + res = interactive.execute("g.V().count()").all() + res = sess.run(res) + assert res[0] == 62586 diff --git a/python/graphscope/tests/unittest/test_scalability.py b/python/graphscope/tests/unittest/test_scalability.py index 2e2e3e42f838..f604223f8bf5 100644 --- a/python/graphscope/tests/unittest/test_scalability.py +++ b/python/graphscope/tests/unittest/test_scalability.py @@ -29,7 +29,6 @@ def p2p_property_graph(num_workers, directed=True): data_dir = os.path.expandvars("${GS_TEST_DIR}/property") graphscope.set_option(show_log=True) - graphscope.set_option(initializing_interactive_engine=False) sess = graphscope.session(num_workers=num_workers, cluster_type="hosts") graph = sess.g(directed=directed) graph = graph.add_vertices("{}/p2p-31_property_v_0".format(data_dir), "person") diff --git a/python/graphscope/tests/unittest/test_session.py b/python/graphscope/tests/unittest/test_session.py index a9fa5c999045..dcbafc469a03 100644 --- a/python/graphscope/tests/unittest/test_session.py +++ b/python/graphscope/tests/unittest/test_session.py @@ -31,7 +31,6 @@ def setUpModule(): graphscope.set_option(show_log=True) - graphscope.set_option(initializing_interactive_engine=False) @pytest.fixture diff --git a/scripts/install_deps.sh b/scripts/install_deps.sh index 28dbd2e55e17..86f44bc2bb07 100755 --- a/scripts/install_deps.sh +++ b/scripts/install_deps.sh @@ -957,7 +957,7 @@ install_deps_dev() { succ_msg="The script has installed all dependencies for builing GraphScope, use commands:\n $ source ${OUTPUT_ENV_FILE} - $ make graphscope\n + $ sudo make install\n to build and develop GraphScope." 
if [[ ${GRAPE_JDK} == true ]]; then install_fastFFI
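
A minimal end-to-end sketch of the reworked interactive flow introduced by this patch, shown in the RST style the docstrings above already use. It is illustrative only, not part of the patch: the lazy session settings, the `load_p2p_network` loader and the `62586` vertex count are taken from the `test_across_engine` unit test added above, while the local hosts deployment (`cluster_type="hosts"`, `num_workers=1`) is an assumption borrowed from the test fixtures; the RPC names in the comments refer to the new methods in `coordinator_service.proto`.

.. code:: python

    import graphscope
    from graphscope.dataset import load_p2p_network

    # Assumed local hosts deployment; lazy mode mirrors test_lazy.py.
    sess = graphscope.session(cluster_type="hosts", num_workers=1, mode="lazy")
    g = load_p2p_network(sess)

    # sess.gremlin() now asks the coordinator to start the GIE instance
    # (CreateInteractiveInstance) and returns an InteractiveQuery bound to
    # the graph's vineyard object id; there is no DAG node for it anymore.
    interactive = sess.gremlin(g)

    # Submit a script; in lazy mode the result is materialized with sess.run().
    res = interactive.execute("g.V().count()").all()
    print(sess.run(res)[0])  # expected 62586 for the p2p test graph

    # Or drive the exposed endpoint directly with gremlinpython.
    gt = interactive.traversal_source()
    print(gt.V().count().toList())

    # Release the GIE instance (CloseInteractiveInstance) and the session.
    interactive.close()
    sess.close()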